PRE-RELEASE v0.7.0
Self-heal device= → ifindex attribution and expose plugin meta counters in the scrape. ipng_stats_rescan_interval (default 60s, 0 to disable) runs a per-worker timer that re-resolves every binding via if_nametoindex, so interface teardown/recreate (e.g. GRE tunnel reprovision) picks up the new ifindex without requiring an nginx reload. nginx_ipng_ifindex_misses_total increments whenever a cmsg-reported ingress ifindex doesn't match any binding — making stale mappings observable. Also expose the existing zone_full_events and flushes_total shared-memory counters, which were tracked but never emitted. JSON output gains a top-level "meta" object; schema stays at 2 (additive change). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -183,6 +183,13 @@ typedef struct {
|
||||
* the plugin in the design doc). */
|
||||
ngx_atomic_uint_t zone_full_events;
|
||||
ngx_atomic_uint_t flushes_total;
|
||||
/* Requests whose connection carried an IP_PKTINFO ifindex that
|
||||
* didn't match any configured binding — a non-zero rate here
|
||||
* means traffic is falling through to the default source and
|
||||
* the `device=<ifname>` → ifindex mapping is stale (interface
|
||||
* recreated with a new ifindex) or the interface is genuinely
|
||||
* unconfigured. */
|
||||
ngx_atomic_uint_t ifindex_misses;
|
||||
} ngx_http_ipng_stats_shctx_t;
|
||||
|
||||
|
||||
@@ -224,6 +231,7 @@ typedef struct {
|
||||
ngx_str_t zone_name;
|
||||
size_t zone_size;
|
||||
ngx_msec_t flush_interval;
|
||||
ngx_msec_t rescan_interval; /* 0 disables periodic rescan */
|
||||
ngx_str_t default_source;
|
||||
ngx_uint_t nbuckets;
|
||||
ngx_uint_t *bucket_bounds_ms; /* len = nbuckets */
|
||||
@@ -267,6 +275,11 @@ typedef struct {
|
||||
u_char *logtail_end;
|
||||
ngx_event_t logtail_flush_ev;
|
||||
ngx_socket_t logtail_udp_fd; /* per-worker UDP socket, or -1 */
|
||||
|
||||
/* Periodic device= → ifindex rescan timer. Each worker runs its
|
||||
* own; there's no shared state since bindings live in the per-
|
||||
* process main conf (COW-forked from master). */
|
||||
ngx_event_t rescan_ev;
|
||||
} ngx_http_ipng_stats_worker_t;
|
||||
|
||||
|
||||
@@ -307,6 +320,7 @@ static ngx_int_t ngx_http_ipng_stats_init_zone(ngx_shm_zone_t *shm_zone,
|
||||
static ngx_int_t ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle);
|
||||
static ngx_int_t ngx_http_ipng_stats_init_worker(ngx_cycle_t *cycle);
|
||||
static void ngx_http_ipng_stats_exit_worker(ngx_cycle_t *cycle);
|
||||
static void ngx_http_ipng_stats_rescan_timer(ngx_event_t *ev);
|
||||
|
||||
static ngx_int_t ngx_http_ipng_stats_log_handler(ngx_http_request_t *r);
|
||||
static ngx_int_t ngx_http_ipng_stats_content_handler(ngx_http_request_t *r);
|
||||
@@ -367,6 +381,13 @@ static ngx_command_t ngx_http_ipng_stats_commands[] = {
|
||||
offsetof(ngx_http_ipng_stats_main_conf_t, flush_interval),
|
||||
NULL },
|
||||
|
||||
{ ngx_string("ipng_stats_rescan_interval"),
|
||||
NGX_HTTP_MAIN_CONF|NGX_CONF_TAKE1,
|
||||
ngx_conf_set_msec_slot,
|
||||
NGX_HTTP_MAIN_CONF_OFFSET,
|
||||
offsetof(ngx_http_ipng_stats_main_conf_t, rescan_interval),
|
||||
NULL },
|
||||
|
||||
{ ngx_string("ipng_stats_default_source"),
|
||||
NGX_HTTP_MAIN_CONF|NGX_CONF_TAKE1,
|
||||
ngx_conf_set_str_slot,
|
||||
@@ -652,6 +673,7 @@ ngx_http_ipng_stats_create_main_conf(ngx_conf_t *cf)
|
||||
}
|
||||
|
||||
imcf->flush_interval = NGX_CONF_UNSET_MSEC;
|
||||
imcf->rescan_interval = NGX_CONF_UNSET_MSEC;
|
||||
imcf->nbuckets = 0;
|
||||
imcf->bucket_bounds_ms = NULL;
|
||||
imcf->nbytebuckets = 0;
|
||||
@@ -676,6 +698,18 @@ ngx_http_ipng_stats_init_main_conf(ngx_conf_t *cf, void *conf)
|
||||
return NGX_CONF_ERROR;
|
||||
}
|
||||
|
||||
/* Default 60s. Set to 0 to disable the periodic rescan and rely
|
||||
* solely on `nginx -s reload` to refresh the device= → ifindex
|
||||
* table. A minimum of 1s keeps the timer from starving the
|
||||
* worker event loop on pathological values. */
|
||||
ngx_conf_init_msec_value(imcf->rescan_interval, 60000);
|
||||
if (imcf->rescan_interval > 0 && imcf->rescan_interval < 1000) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cf->log, 0,
|
||||
"ipng_stats_rescan_interval must be 0 (disabled) "
|
||||
"or at least 1s");
|
||||
return NGX_CONF_ERROR;
|
||||
}
|
||||
|
||||
if (imcf->default_source.len == 0) {
|
||||
ngx_str_set(&imcf->default_source, "direct");
|
||||
}
|
||||
@@ -1214,6 +1248,68 @@ ngx_http_ipng_stats_init_zone(ngx_shm_zone_t *shm_zone, void *data)
|
||||
/* init_module: enable IP_PKTINFO on listen sockets, resolve ifindexes */
|
||||
/* ----------------------------------------------------------------- */
|
||||
|
||||
/* Resolve each configured device= name to its current kernel ifindex.
|
||||
* Called from init_module (master, each cycle) and from the per-worker
|
||||
* rescan timer. `log_initial` controls whether an unchanged mapping is
|
||||
* logged at NOTICE: true from init_module (operator expects to see the
|
||||
* table at startup/reload), false from the rescan timer (only log on
|
||||
* change, keeping idle rescans silent). Returns the number of entries
|
||||
* whose ifindex actually changed. */
|
||||
static ngx_uint_t
|
||||
ngx_http_ipng_stats_resolve_bindings(ngx_log_t *log,
|
||||
ngx_http_ipng_stats_main_conf_t *imcf, ngx_uint_t log_initial)
|
||||
{
|
||||
ngx_http_ipng_stats_binding_t *bindings;
|
||||
ngx_uint_t i, changed = 0, prev, now;
|
||||
char devname[IFNAMSIZ];
|
||||
size_t dlen;
|
||||
|
||||
if (imcf == NULL || imcf->bindings == NULL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bindings = imcf->bindings->elts;
|
||||
for (i = 0; i < imcf->bindings->nelts; i++) {
|
||||
if (bindings[i].device.len == 0) continue;
|
||||
dlen = bindings[i].device.len < IFNAMSIZ - 1
|
||||
? bindings[i].device.len : IFNAMSIZ - 1;
|
||||
ngx_memcpy(devname, bindings[i].device.data, dlen);
|
||||
devname[dlen] = '\0';
|
||||
|
||||
prev = bindings[i].ifindex;
|
||||
now = if_nametoindex(devname);
|
||||
bindings[i].ifindex = now;
|
||||
|
||||
if (now == 0) {
|
||||
if (log_initial || prev != 0) {
|
||||
ngx_log_error(NGX_LOG_WARN, log, ngx_errno,
|
||||
"ipng_stats: if_nametoindex(\"%s\") failed — "
|
||||
"traffic via that interface will fall back "
|
||||
"to the default source", devname);
|
||||
}
|
||||
if (prev != 0) changed++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (prev != now) {
|
||||
changed++;
|
||||
ngx_log_error(NGX_LOG_NOTICE, log, 0,
|
||||
"ipng_stats: device \"%V\" -> ifindex %ui "
|
||||
"(source=\"%V\", previous=%ui)",
|
||||
&bindings[i].device, now,
|
||||
&bindings[i].source, prev);
|
||||
} else if (log_initial) {
|
||||
ngx_log_error(NGX_LOG_NOTICE, log, 0,
|
||||
"ipng_stats: device \"%V\" -> ifindex %ui "
|
||||
"(source=\"%V\")",
|
||||
&bindings[i].device, now, &bindings[i].source);
|
||||
}
|
||||
}
|
||||
|
||||
return changed;
|
||||
}
|
||||
|
||||
|
||||
/* init_module: enable IP_PKTINFO / IPV6_RECVPKTINFO on every HTTP
|
||||
* listening socket so that each accepted TCP connection carries an
|
||||
* IP_PKTINFO/IPV6_PKTINFO cmsg the log handler can retrieve via
|
||||
@@ -1230,12 +1326,9 @@ static ngx_int_t
|
||||
ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle)
|
||||
{
|
||||
ngx_http_ipng_stats_main_conf_t *imcf;
|
||||
ngx_http_ipng_stats_binding_t *bindings;
|
||||
ngx_listening_t *ls;
|
||||
ngx_uint_t i;
|
||||
int one = 1;
|
||||
char devname[IFNAMSIZ];
|
||||
size_t dlen;
|
||||
|
||||
imcf = ngx_http_cycle_get_module_main_conf(cycle,
|
||||
ngx_http_ipng_stats_module);
|
||||
@@ -1243,27 +1336,7 @@ ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle)
|
||||
return NGX_OK;
|
||||
}
|
||||
|
||||
bindings = imcf->bindings->elts;
|
||||
for (i = 0; i < imcf->bindings->nelts; i++) {
|
||||
if (bindings[i].device.len == 0) continue;
|
||||
dlen = bindings[i].device.len < IFNAMSIZ - 1
|
||||
? bindings[i].device.len : IFNAMSIZ - 1;
|
||||
ngx_memcpy(devname, bindings[i].device.data, dlen);
|
||||
devname[dlen] = '\0';
|
||||
bindings[i].ifindex = if_nametoindex(devname);
|
||||
if (bindings[i].ifindex == 0) {
|
||||
ngx_log_error(NGX_LOG_WARN, cycle->log, ngx_errno,
|
||||
"ipng_stats: if_nametoindex(\"%s\") failed — "
|
||||
"traffic via that interface will fall back to "
|
||||
"the default source", devname);
|
||||
continue;
|
||||
}
|
||||
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
|
||||
"ipng_stats: device \"%V\" -> ifindex %ui "
|
||||
"(source=\"%V\")",
|
||||
&bindings[i].device, bindings[i].ifindex,
|
||||
&bindings[i].source);
|
||||
}
|
||||
(void) ngx_http_ipng_stats_resolve_bindings(cycle->log, imcf, 1);
|
||||
|
||||
ls = cycle->listening.elts;
|
||||
for (i = 0; i < cycle->listening.nelts; i++) {
|
||||
@@ -1380,6 +1453,24 @@ ngx_http_ipng_stats_init_worker(ngx_cycle_t *cycle)
|
||||
ngx_add_timer(&w->logtail_flush_ev, imcf->logtail_flush);
|
||||
}
|
||||
|
||||
/* Periodic device= → ifindex rescan. Each worker re-resolves its
|
||||
* own copy of imcf->bindings on this timer, so interface recreations
|
||||
* (e.g. a GRE tunnel being torn down and re-added with a new
|
||||
* ifindex) self-heal without an nginx reload. Misses between
|
||||
* kernel change and next tick still increment ifindex_misses, so
|
||||
* the signal remains visible in the scrape. */
|
||||
if (imcf->rescan_interval > 0
|
||||
&& imcf->bindings != NULL
|
||||
&& imcf->bindings->nelts > 0)
|
||||
{
|
||||
ngx_memzero(&w->rescan_ev, sizeof(w->rescan_ev));
|
||||
w->rescan_ev.handler = ngx_http_ipng_stats_rescan_timer;
|
||||
w->rescan_ev.log = cycle->log;
|
||||
w->rescan_ev.data = w;
|
||||
w->rescan_ev.cancelable = 1;
|
||||
ngx_add_timer(&w->rescan_ev, imcf->rescan_interval);
|
||||
}
|
||||
|
||||
return NGX_OK;
|
||||
}
|
||||
|
||||
@@ -1407,6 +1498,29 @@ ngx_http_ipng_stats_exit_worker(ngx_cycle_t *cycle)
|
||||
close(w->logtail_udp_fd);
|
||||
w->logtail_udp_fd = (ngx_socket_t) -1;
|
||||
}
|
||||
if (w->rescan_ev.timer_set) {
|
||||
ngx_del_timer(&w->rescan_ev);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ngx_http_ipng_stats_rescan_timer(ngx_event_t *ev)
|
||||
{
|
||||
ngx_http_ipng_stats_worker_t *w = ev->data;
|
||||
ngx_http_ipng_stats_main_conf_t *imcf;
|
||||
|
||||
imcf = ngx_http_cycle_get_module_main_conf(ngx_cycle,
|
||||
ngx_http_ipng_stats_module);
|
||||
if (imcf == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
(void) ngx_http_ipng_stats_resolve_bindings(ev->log, imcf, 0);
|
||||
|
||||
if (!ngx_exiting && !ngx_quit && imcf->rescan_interval > 0) {
|
||||
ngx_add_timer(&w->rescan_ev, imcf->rescan_interval);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1689,6 +1803,15 @@ ngx_http_ipng_stats_resolve_source(ngx_http_request_t *r,
|
||||
return NGX_OK;
|
||||
}
|
||||
}
|
||||
|
||||
/* Kernel gave us an ifindex but no binding claims it. Either
|
||||
* the interface is genuinely unconfigured, or a configured
|
||||
* device has been recreated with a new ifindex since the last
|
||||
* rescan. Either way the operator wants to know. */
|
||||
if (imcf->shm_zone != NULL && imcf->shm_zone->data != NULL) {
|
||||
ngx_http_ipng_stats_shctx_t *sh = imcf->shm_zone->data;
|
||||
(void) ngx_atomic_fetch_add(&sh->ifindex_misses, 1);
|
||||
}
|
||||
}
|
||||
|
||||
*source_out = imcf->default_source;
|
||||
@@ -2583,7 +2706,7 @@ ngx_http_ipng_stats_render_prom(ngx_http_request_t *r,
|
||||
|
||||
slab = (ngx_slab_pool_t *) imcf->shm_zone->shm.addr;
|
||||
|
||||
cl = ngx_http_ipng_stats_chain_buf(r, 1536);
|
||||
cl = ngx_http_ipng_stats_chain_buf(r, 2048);
|
||||
if (cl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR;
|
||||
cl->buf->last = ngx_sprintf(cl->buf->last,
|
||||
"# nginx-ipng-stats-plugin %s (schema=%d)\n"
|
||||
@@ -2602,8 +2725,21 @@ ngx_http_ipng_stats_render_prom(ngx_http_request_t *r,
|
||||
"# HELP nginx_ipng_bytes_in Request size histogram in bytes.\n"
|
||||
"# TYPE nginx_ipng_bytes_in histogram\n"
|
||||
"# HELP nginx_ipng_bytes_out Request size histogram in bytes.\n"
|
||||
"# TYPE nginx_ipng_bytes_out histogram\n",
|
||||
NGX_HTTP_IPNG_STATS_VERSION, NGX_HTTP_IPNG_STATS_SCHEMA_VERSION);
|
||||
"# TYPE nginx_ipng_bytes_out histogram\n"
|
||||
"# HELP nginx_ipng_ifindex_misses_total Connections whose ingress "
|
||||
"ifindex did not match any configured device= binding.\n"
|
||||
"# TYPE nginx_ipng_ifindex_misses_total counter\n"
|
||||
"nginx_ipng_ifindex_misses_total %uA\n"
|
||||
"# HELP nginx_ipng_zone_full_events_total Shared-zone intern or "
|
||||
"slab allocation failures (zone full).\n"
|
||||
"# TYPE nginx_ipng_zone_full_events_total counter\n"
|
||||
"nginx_ipng_zone_full_events_total %uA\n"
|
||||
"# HELP nginx_ipng_flushes_total Per-worker flushes into the "
|
||||
"shared zone.\n"
|
||||
"# TYPE nginx_ipng_flushes_total counter\n"
|
||||
"nginx_ipng_flushes_total %uA\n",
|
||||
NGX_HTTP_IPNG_STATS_VERSION, NGX_HTTP_IPNG_STATS_SCHEMA_VERSION,
|
||||
sh->ifindex_misses, sh->zone_full_events, sh->flushes_total);
|
||||
if (ngx_http_ipng_stats_append(&last, cl) != NGX_OK) {
|
||||
return NGX_HTTP_INTERNAL_SERVER_ERROR;
|
||||
}
|
||||
@@ -2754,11 +2890,14 @@ ngx_http_ipng_stats_render_json(ngx_http_request_t *r,
|
||||
|
||||
slab = (ngx_slab_pool_t *) imcf->shm_zone->shm.addr;
|
||||
|
||||
cl = ngx_http_ipng_stats_chain_buf(r, 64);
|
||||
cl = ngx_http_ipng_stats_chain_buf(r, 256);
|
||||
if (cl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR;
|
||||
cl->buf->last = ngx_sprintf(cl->buf->last,
|
||||
"{\"schema\":%d,\"records\":[",
|
||||
NGX_HTTP_IPNG_STATS_SCHEMA_VERSION);
|
||||
"{\"schema\":%d,\"meta\":{\"ifindex_misses\":%uA,"
|
||||
"\"zone_full_events\":%uA,\"flushes_total\":%uA},"
|
||||
"\"records\":[",
|
||||
NGX_HTTP_IPNG_STATS_SCHEMA_VERSION,
|
||||
sh->ifindex_misses, sh->zone_full_events, sh->flushes_total);
|
||||
if (ngx_http_ipng_stats_append(&last, cl) != NGX_OK) {
|
||||
return NGX_HTTP_INTERNAL_SERVER_ERROR;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user