diff --git a/Makefile b/Makefile index d26414d..743f428 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ BUILD_DIR := $(CURDIR)/build # the package version from there directly. The C code picks up VERSION # via the generated src/version.h (written by the version-header target # below and depended on by the module build). -VERSION := 0.6.0 +VERSION := 0.7.0 NGINX_SRC ?= diff --git a/debian/changelog b/debian/changelog index ff744e7..4fbcbec 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,20 @@ +nginx-ipng-stats-plugin (0.7.0-1) unstable; urgency=medium + + * Pre-release v0.7.0. + - New ipng_stats_rescan_interval directive (default 60s, 0 to + disable). Per-worker timer that re-resolves every device= → + ifindex mapping, self-healing attribution after interface + teardown/recreate without requiring nginx -s reload. + - New nginx_ipng_ifindex_misses_total counter: connections whose + ingress ifindex didn't match any configured binding. Surfaces + stale mappings and genuinely-unconfigured interfaces. + - Expose the existing nginx_ipng_zone_full_events_total and + nginx_ipng_flushes_total meta counters in both the Prometheus + and JSON scrape outputs. JSON gains a top-level "meta" object + alongside "records" (schema unchanged, additive). + + -- Pim van Pelt <pim@ipng.ch>  Sat, 18 Apr 2026 19:36:36 +0200 + nginx-ipng-stats-plugin (0.6.0-1) unstable; urgency=medium * Pre-release v0.6.0. diff --git a/docs/config-guide.md b/docs/config-guide.md index 0a9fcd0..87a3053 100644 --- a/docs/config-guide.md +++ b/docs/config-guide.md @@ -81,6 +81,22 @@ is sized so that a scrape interval of 5–15 s sees effectively no lag. See FR-4.2, FR-5.2. +### `ipng_stats_rescan_interval <interval>` + +**Context:** `http`. + +**Value:** an nginx duration string (e.g. `30s`, `60s`, `5m`) or `0` to disable. + +**Default:** `60s`. + +**Minimum:** `1s` (when non-zero). + +**Effect:** sets the cadence of a per-worker timer that re-resolves every `device=` binding via `if_nametoindex(3)`. 
This +self-heals the attribution table when a configured interface is torn down and recreated (e.g. a GRE tunnel reprovision) — it gets a +fresh kernel ifindex, which the next rescan picks up. Between the kernel change and the next tick, arriving traffic falls through to +the default source and increments `nginx_ipng_ifindex_misses_total`; watch that counter to size this interval. Set to `0` to disable +and rely solely on `nginx -s reload` (which always re-runs `if_nametoindex` for every binding in the new cycle). + ### `ipng_stats_default_source ` **Context:** `http`. @@ -276,7 +292,8 @@ per-three-digit-code breakdown should enable `ipng_stats_logtail` and derive it | `nginx_ipng_zone_bytes_used` | gauge | — | Shared-memory zone bytes currently allocated. | | `nginx_ipng_zone_bytes_total` | gauge | — | Shared-memory zone capacity in bytes. | | `nginx_ipng_zone_full_events_total` | counter | — | Number of key insertions dropped because the zone was full. | -| `nginx_ipng_flushes_total` | counter | `worker` | Number of per-worker flush ticks executed. | +| `nginx_ipng_flushes_total` | counter | — | Per-worker flushes into the shared zone, summed across workers. | +| `nginx_ipng_ifindex_misses_total` | counter | — | Connections whose ingress ifindex did not match any configured `device=` binding. | | `nginx_ipng_flush_duration_seconds` | histogram | `worker` | Histogram of flush durations. | | `nginx_ipng_scrape_duration_seconds` | histogram | — | Histogram of scrape handler runtimes. | @@ -287,6 +304,11 @@ See FR-2.*, FR-3.7. ```json { "schema": 2, + "meta": { + "ifindex_misses": 0, + "zone_full_events": 0, + "flushes_total": 1234 + }, "records": [ { "source_tag": "mg1", @@ -321,6 +343,7 @@ See FR-3.6. 
| --- | --- | --- | --- | --- | | `ipng_stats_zone` | ✅ | — | — | — | | `ipng_stats_flush_interval` | ✅ | — | — | — | +| `ipng_stats_rescan_interval` | ✅ | — | — | — | | `ipng_stats_default_source` | ✅ | — | — | — | | `ipng_stats_buckets` | ✅ | — | — | — | | `ipng_stats_byte_buckets` | ✅ | — | — | — | diff --git a/docs/user-guide.md b/docs/user-guide.md index f07fa8d..4e71420 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -434,6 +434,13 @@ values in `listens.conf`, or the interfaces aren't up. Run `ip -br link` and con `ipng_stats_zone ipng:` (default 4 MB is enough for ~hundreds of VIPs — the code dimension is bucketed to six classes, so one 4 MB zone holds a very large deployment). +**`nginx_ipng_ifindex_misses_total` is climbing.** A connection arrived on an interface whose ifindex isn't in the binding table. +Two common causes: (a) a configured interface was torn down and recreated (e.g. a GRE tunnel reprovision) and now has a fresh +ifindex — the per-worker rescan timer (`ipng_stats_rescan_interval`, default `60s`) will pick it up on the next tick; (b) traffic +legitimately arrives on an interface that no `device=` binding claims — either add the binding or accept that it lands under the +default source. If the counter keeps rising between rescans, shorten `ipng_stats_rescan_interval` or trigger `nginx -s reload` to +re-resolve immediately. + **`curl http://127.0.0.1:9113/.well-known/ipng/statsz` returns "403 Forbidden".** The `allow`/`deny` ACL is blocking your source address. Either add yourself or scrape from a host already in the allow list. diff --git a/src/ngx_http_ipng_stats_module.c b/src/ngx_http_ipng_stats_module.c index 702f0af..71434c5 100644 --- a/src/ngx_http_ipng_stats_module.c +++ b/src/ngx_http_ipng_stats_module.c @@ -183,6 +183,13 @@ typedef struct { * the plugin in the design doc). 
*/ ngx_atomic_uint_t zone_full_events; ngx_atomic_uint_t flushes_total; + /* Requests whose connection carried an IP_PKTINFO ifindex that + * didn't match any configured binding — a non-zero rate here + * means traffic is falling through to the default source and + * the `device=` → ifindex mapping is stale (interface + * recreated with a new ifindex) or the interface is genuinely + * unconfigured. */ + ngx_atomic_uint_t ifindex_misses; } ngx_http_ipng_stats_shctx_t; @@ -224,6 +231,7 @@ typedef struct { ngx_str_t zone_name; size_t zone_size; ngx_msec_t flush_interval; + ngx_msec_t rescan_interval; /* 0 disables periodic rescan */ ngx_str_t default_source; ngx_uint_t nbuckets; ngx_uint_t *bucket_bounds_ms; /* len = nbuckets */ @@ -267,6 +275,11 @@ typedef struct { u_char *logtail_end; ngx_event_t logtail_flush_ev; ngx_socket_t logtail_udp_fd; /* per-worker UDP socket, or -1 */ + + /* Periodic device= → ifindex rescan timer. Each worker runs its + * own; there's no shared state since bindings live in the per- + * process main conf (COW-forked from master). 
*/ + ngx_event_t rescan_ev; } ngx_http_ipng_stats_worker_t; @@ -307,6 +320,7 @@ static ngx_int_t ngx_http_ipng_stats_init_zone(ngx_shm_zone_t *shm_zone, static ngx_int_t ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle); static ngx_int_t ngx_http_ipng_stats_init_worker(ngx_cycle_t *cycle); static void ngx_http_ipng_stats_exit_worker(ngx_cycle_t *cycle); +static void ngx_http_ipng_stats_rescan_timer(ngx_event_t *ev); static ngx_int_t ngx_http_ipng_stats_log_handler(ngx_http_request_t *r); static ngx_int_t ngx_http_ipng_stats_content_handler(ngx_http_request_t *r); @@ -367,6 +381,13 @@ static ngx_command_t ngx_http_ipng_stats_commands[] = { offsetof(ngx_http_ipng_stats_main_conf_t, flush_interval), NULL }, + { ngx_string("ipng_stats_rescan_interval"), + NGX_HTTP_MAIN_CONF|NGX_CONF_TAKE1, + ngx_conf_set_msec_slot, + NGX_HTTP_MAIN_CONF_OFFSET, + offsetof(ngx_http_ipng_stats_main_conf_t, rescan_interval), + NULL }, + { ngx_string("ipng_stats_default_source"), NGX_HTTP_MAIN_CONF|NGX_CONF_TAKE1, ngx_conf_set_str_slot, @@ -652,6 +673,7 @@ ngx_http_ipng_stats_create_main_conf(ngx_conf_t *cf) } imcf->flush_interval = NGX_CONF_UNSET_MSEC; + imcf->rescan_interval = NGX_CONF_UNSET_MSEC; imcf->nbuckets = 0; imcf->bucket_bounds_ms = NULL; imcf->nbytebuckets = 0; @@ -676,6 +698,18 @@ ngx_http_ipng_stats_init_main_conf(ngx_conf_t *cf, void *conf) return NGX_CONF_ERROR; } + /* Default 60s. Set to 0 to disable the periodic rescan and rely + * solely on `nginx -s reload` to refresh the device= → ifindex + * table. A minimum of 1s keeps the timer from starving the + * worker event loop on pathological values. 
*/ + ngx_conf_init_msec_value(imcf->rescan_interval, 60000); + if (imcf->rescan_interval > 0 && imcf->rescan_interval < 1000) { + ngx_log_error(NGX_LOG_EMERG, cf->log, 0, + "ipng_stats_rescan_interval must be 0 (disabled) " + "or at least 1s"); + return NGX_CONF_ERROR; + } + if (imcf->default_source.len == 0) { ngx_str_set(&imcf->default_source, "direct"); } @@ -1214,6 +1248,68 @@ ngx_http_ipng_stats_init_zone(ngx_shm_zone_t *shm_zone, void *data) /* init_module: enable IP_PKTINFO on listen sockets, resolve ifindexes */ /* ----------------------------------------------------------------- */ +/* Resolve each configured device= name to its current kernel ifindex. + * Called from init_module (master, each cycle) and from the per-worker + * rescan timer. `log_initial` controls whether an unchanged mapping is + * logged at NOTICE: true from init_module (operator expects to see the + * table at startup/reload), false from the rescan timer (only log on + * change, keeping idle rescans silent). Returns the number of entries + * whose ifindex actually changed. */ +static ngx_uint_t +ngx_http_ipng_stats_resolve_bindings(ngx_log_t *log, + ngx_http_ipng_stats_main_conf_t *imcf, ngx_uint_t log_initial) +{ + ngx_http_ipng_stats_binding_t *bindings; + ngx_uint_t i, changed = 0, prev, now; + char devname[IFNAMSIZ]; + size_t dlen; + + if (imcf == NULL || imcf->bindings == NULL) { + return 0; + } + + bindings = imcf->bindings->elts; + for (i = 0; i < imcf->bindings->nelts; i++) { + if (bindings[i].device.len == 0) continue; + dlen = bindings[i].device.len < IFNAMSIZ - 1 + ? 
bindings[i].device.len : IFNAMSIZ - 1; + ngx_memcpy(devname, bindings[i].device.data, dlen); + devname[dlen] = '\0'; + + prev = bindings[i].ifindex; + now = if_nametoindex(devname); + bindings[i].ifindex = now; + + if (now == 0) { + if (log_initial || prev != 0) { + ngx_log_error(NGX_LOG_WARN, log, ngx_errno, + "ipng_stats: if_nametoindex(\"%s\") failed — " + "traffic via that interface will fall back " + "to the default source", devname); + } + if (prev != 0) changed++; + continue; + } + + if (prev != now) { + changed++; + ngx_log_error(NGX_LOG_NOTICE, log, 0, + "ipng_stats: device \"%V\" -> ifindex %ui " + "(source=\"%V\", previous=%ui)", + &bindings[i].device, now, + &bindings[i].source, prev); + } else if (log_initial) { + ngx_log_error(NGX_LOG_NOTICE, log, 0, + "ipng_stats: device \"%V\" -> ifindex %ui " + "(source=\"%V\")", + &bindings[i].device, now, &bindings[i].source); + } + } + + return changed; +} + + /* init_module: enable IP_PKTINFO / IPV6_RECVPKTINFO on every HTTP * listening socket so that each accepted TCP connection carries an * IP_PKTINFO/IPV6_PKTINFO cmsg the log handler can retrieve via @@ -1230,12 +1326,9 @@ static ngx_int_t ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle) { ngx_http_ipng_stats_main_conf_t *imcf; - ngx_http_ipng_stats_binding_t *bindings; ngx_listening_t *ls; ngx_uint_t i; int one = 1; - char devname[IFNAMSIZ]; - size_t dlen; imcf = ngx_http_cycle_get_module_main_conf(cycle, ngx_http_ipng_stats_module); @@ -1243,27 +1336,7 @@ ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle) return NGX_OK; } - bindings = imcf->bindings->elts; - for (i = 0; i < imcf->bindings->nelts; i++) { - if (bindings[i].device.len == 0) continue; - dlen = bindings[i].device.len < IFNAMSIZ - 1 - ? 
bindings[i].device.len : IFNAMSIZ - 1; - ngx_memcpy(devname, bindings[i].device.data, dlen); - devname[dlen] = '\0'; - bindings[i].ifindex = if_nametoindex(devname); - if (bindings[i].ifindex == 0) { - ngx_log_error(NGX_LOG_WARN, cycle->log, ngx_errno, - "ipng_stats: if_nametoindex(\"%s\") failed — " - "traffic via that interface will fall back to " - "the default source", devname); - continue; - } - ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0, - "ipng_stats: device \"%V\" -> ifindex %ui " - "(source=\"%V\")", - &bindings[i].device, bindings[i].ifindex, - &bindings[i].source); - } + (void) ngx_http_ipng_stats_resolve_bindings(cycle->log, imcf, 1); ls = cycle->listening.elts; for (i = 0; i < cycle->listening.nelts; i++) { @@ -1380,6 +1453,24 @@ ngx_http_ipng_stats_init_worker(ngx_cycle_t *cycle) ngx_add_timer(&w->logtail_flush_ev, imcf->logtail_flush); } + /* Periodic device= → ifindex rescan. Each worker re-resolves its + * own copy of imcf->bindings on this timer, so interface recreations + * (e.g. a GRE tunnel being torn down and re-added with a new + * ifindex) self-heal without an nginx reload. Misses between + * kernel change and next tick still increment ifindex_misses, so + * the signal remains visible in the scrape. 
*/ + if (imcf->rescan_interval > 0 + && imcf->bindings != NULL + && imcf->bindings->nelts > 0) + { + ngx_memzero(&w->rescan_ev, sizeof(w->rescan_ev)); + w->rescan_ev.handler = ngx_http_ipng_stats_rescan_timer; + w->rescan_ev.log = cycle->log; + w->rescan_ev.data = w; + w->rescan_ev.cancelable = 1; + ngx_add_timer(&w->rescan_ev, imcf->rescan_interval); + } + return NGX_OK; } @@ -1407,6 +1498,29 @@ ngx_http_ipng_stats_exit_worker(ngx_cycle_t *cycle) close(w->logtail_udp_fd); w->logtail_udp_fd = (ngx_socket_t) -1; } + if (w->rescan_ev.timer_set) { + ngx_del_timer(&w->rescan_ev); + } +} + + +static void +ngx_http_ipng_stats_rescan_timer(ngx_event_t *ev) +{ + ngx_http_ipng_stats_worker_t *w = ev->data; + ngx_http_ipng_stats_main_conf_t *imcf; + + imcf = ngx_http_cycle_get_module_main_conf(ngx_cycle, + ngx_http_ipng_stats_module); + if (imcf == NULL) { + return; + } + + (void) ngx_http_ipng_stats_resolve_bindings(ev->log, imcf, 0); + + if (!ngx_exiting && !ngx_quit && imcf->rescan_interval > 0) { + ngx_add_timer(&w->rescan_ev, imcf->rescan_interval); + } } @@ -1689,6 +1803,15 @@ ngx_http_ipng_stats_resolve_source(ngx_http_request_t *r, return NGX_OK; } } + + /* Kernel gave us an ifindex but no binding claims it. Either + * the interface is genuinely unconfigured, or a configured + * device has been recreated with a new ifindex since the last + * rescan. Either way the operator wants to know. 
*/ + if (imcf->shm_zone != NULL && imcf->shm_zone->data != NULL) { + ngx_http_ipng_stats_shctx_t *sh = imcf->shm_zone->data; + (void) ngx_atomic_fetch_add(&sh->ifindex_misses, 1); + } } *source_out = imcf->default_source; @@ -2583,7 +2706,7 @@ ngx_http_ipng_stats_render_prom(ngx_http_request_t *r, slab = (ngx_slab_pool_t *) imcf->shm_zone->shm.addr; - cl = ngx_http_ipng_stats_chain_buf(r, 1536); + cl = ngx_http_ipng_stats_chain_buf(r, 2048); if (cl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; cl->buf->last = ngx_sprintf(cl->buf->last, "# nginx-ipng-stats-plugin %s (schema=%d)\n" @@ -2602,8 +2725,21 @@ ngx_http_ipng_stats_render_prom(ngx_http_request_t *r, "# HELP nginx_ipng_bytes_in Request size histogram in bytes.\n" "# TYPE nginx_ipng_bytes_in histogram\n" "# HELP nginx_ipng_bytes_out Request size histogram in bytes.\n" - "# TYPE nginx_ipng_bytes_out histogram\n", - NGX_HTTP_IPNG_STATS_VERSION, NGX_HTTP_IPNG_STATS_SCHEMA_VERSION); + "# TYPE nginx_ipng_bytes_out histogram\n" + "# HELP nginx_ipng_ifindex_misses_total Connections whose ingress " + "ifindex did not match any configured device= binding.\n" + "# TYPE nginx_ipng_ifindex_misses_total counter\n" + "nginx_ipng_ifindex_misses_total %uA\n" + "# HELP nginx_ipng_zone_full_events_total Shared-zone intern or " + "slab allocation failures (zone full).\n" + "# TYPE nginx_ipng_zone_full_events_total counter\n" + "nginx_ipng_zone_full_events_total %uA\n" + "# HELP nginx_ipng_flushes_total Per-worker flushes into the " + "shared zone.\n" + "# TYPE nginx_ipng_flushes_total counter\n" + "nginx_ipng_flushes_total %uA\n", + NGX_HTTP_IPNG_STATS_VERSION, NGX_HTTP_IPNG_STATS_SCHEMA_VERSION, + sh->ifindex_misses, sh->zone_full_events, sh->flushes_total); if (ngx_http_ipng_stats_append(&last, cl) != NGX_OK) { return NGX_HTTP_INTERNAL_SERVER_ERROR; } @@ -2754,11 +2890,14 @@ ngx_http_ipng_stats_render_json(ngx_http_request_t *r, slab = (ngx_slab_pool_t *) imcf->shm_zone->shm.addr; - cl = ngx_http_ipng_stats_chain_buf(r, 64); 
+ cl = ngx_http_ipng_stats_chain_buf(r, 256); if (cl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; cl->buf->last = ngx_sprintf(cl->buf->last, - "{\"schema\":%d,\"records\":[", - NGX_HTTP_IPNG_STATS_SCHEMA_VERSION); + "{\"schema\":%d,\"meta\":{\"ifindex_misses\":%uA," + "\"zone_full_events\":%uA,\"flushes_total\":%uA}," + "\"records\":[", + NGX_HTTP_IPNG_STATS_SCHEMA_VERSION, + sh->ifindex_misses, sh->zone_full_events, sh->flushes_total); if (ngx_http_ipng_stats_append(&last, cl) != NGX_OK) { return NGX_HTTP_INTERNAL_SERVER_ERROR; }