diff --git a/Makefile b/Makefile index f8f661f..08ff538 100644 --- a/Makefile +++ b/Makefile @@ -181,6 +181,9 @@ build-asan: version-header $(MAKE) -C "$$NGX_SRC" -f objs/Makefile install; \ install -d $$PREFIX/modules; \ install -m 0644 "$$NGX_SRC/objs/$(MODULE_NAME).so" $$PREFIX/modules/; \ + for d in client_body_temp fastcgi_temp proxy_temp scgi_temp uwsgi_temp; do \ + rm -rf "$$PREFIX/$$d"; \ + done; \ echo ""; \ echo "ASan build ready:"; \ echo " nginx: $$PREFIX/sbin/nginx"; \ diff --git a/debian/rules b/debian/rules index 5b294ee..88f221e 100755 --- a/debian/rules +++ b/debian/rules @@ -36,3 +36,11 @@ override_dh_auto_clean: # Makefile's `clean` target, which wipes build/ wholesale — and # that includes build/nginx-asan/ from a prior `make build-asan`. # Users who want a fresh reset run `make clean` at the top level. + +override_dh_clean: + # `dh_clean` recurses from the package root to remove junk files + # (editor backups, autom4te caches, etc.). `make build-asan` + # produces build/nginx-asan/{fastcgi,proxy,scgi,uwsgi}_temp owned + # by "nobody" with mode 0700, which the current user can't + # traverse — so we exclude anything under build/ from dh_clean. + dh_clean -X build/ diff --git a/docs/config-guide.md b/docs/config-guide.md index 6a465af..0a9fcd0 100644 --- a/docs/config-guide.md +++ b/docs/config-guide.md @@ -18,12 +18,13 @@ is invoked, so they compose with every standard `listen` parameter (`ssl`, `http **Default:** not set (plain listen). -**Effect:** the resulting listening socket has `SO_BINDTODEVICE` applied at init-module time, making the kernel accept only connections -whose ingress interface is ``. Combined with a wildcard listen address (`80`, `[::]:80`) this is the mechanism by which the -plugin attributes traffic to a specific ingress interface. +**Effect:** records a binding between the interface named by `device=` and the listen's source tag. 
At request time the log handler reads the ingress +ifindex for the connection (via `IP_PKTINFO` / `IPV6_PKTINFO` cmsg that the module enables on every HTTP listening socket at +init-module time) and attributes the request to whichever binding matches. The listening socket itself is a plain wildcard — no +`SO_BINDTODEVICE`, no extra sockets — which keeps outgoing packets on the default routing table and makes DSR / maglev +deployments work. -The `setsockopt(SO_BINDTODEVICE)` call runs in the nginx master process while it still holds its initial privileges — workers never -call it, and no additional Linux capability is required beyond what stock nginx already has (NFR-6.1). +No additional Linux capability is required beyond what stock nginx already has (NFR-6.1). See FR-1.1, FR-1.5, FR-1.6. diff --git a/docs/user-guide.md b/docs/user-guide.md index ce9b8d8..f07fa8d 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -42,8 +42,11 @@ nginx -V 2>&1 | grep -o ngx_http_ipng_stats_module ## 2. Set up interfaces for per-device attribution -The plugin attributes traffic by watching which interface the request came in on, using `SO_BINDTODEVICE` on per-interface listening -sockets. For this to work, each traffic source that should be tracked separately MUST arrive on its own interface. +The plugin attributes traffic by watching which interface the request came in on. It enables `IP_PKTINFO` / `IPV6_RECVPKTINFO` on +each listening socket and reads the ingress `ifindex` per accepted connection, so the listening sockets remain plain wildcards and +outgoing packets follow the normal routing table — this is what makes DSR / maglev deployments work, where the SYN arrives via a +GRE tunnel but the SYN-ACK must leave via the default route. For the attribution itself to work, each traffic source that should be +tracked separately MUST arrive on its own interface. This works with any kind of Linux interface — GRE tunnels, VLANs, VXLANs, bonded links, or plain ethernet. 
This guide uses GRE tunnels as the example, but the module does not care about the interface type. @@ -175,11 +178,10 @@ VIP is a `server_name` change; adding a new interface is an append to `listens.c ### All listens on a shared port must be device-tagged -If you use multiple `listen` directives on the same port (e.g. port 80), **every one of them must carry `device=`**. Mixing a -device-pinned listen with a plain `listen 80;` or with an address-specific `listen 192.0.2.1:80;` on the same port is **not -supported** and nginx will fail to start. This is a kernel-level limitation: a device-pinned socket sets `SO_BINDTODEVICE` before -`bind(2)`, while a plain wildcard socket sets no device filter — Linux refuses to hold both on the same `(addr, port)` tuple, so -the second bind fails with `EADDRINUSE` regardless of what the nginx config-level dedup might do. +If you use multiple `listen` directives on the same port (e.g. port 80), **every one of them must carry `device=`**. Mixing +a device-tagged listen with a plain `listen 80;` or with an address-specific `listen 192.0.2.1:80;` on the same port is **not +supported** — nginx's config-level dedup rejects same-sockaddr listens within a server block, and the module's wrapper only +exempts directives that carry `device=`. For "direct" traffic — clients hitting the host on a non-attributed interface — use a **separate port** on the direct interface (e.g. `listen 198.51.100.1:8081;`). That listen then has no `device=`, so it falls back to the tag set by diff --git a/src/ngx_http_ipng_stats_module.c b/src/ngx_http_ipng_stats_module.c index 9681d1a..9628d11 100644 --- a/src/ngx_http_ipng_stats_module.c +++ b/src/ngx_http_ipng_stats_module.c @@ -181,22 +181,21 @@ typedef struct { } ngx_http_ipng_stats_shctx_t; -/* Per-listen binding recorded by the listen wrapper at config parse - * time. Resolved to an ngx_listening_t* at init_module time. +/* Per-device binding recorded by the listen wrapper at config parse + * time. 
`ifindex` is resolved from `device` at init_module time and + * becomes the lookup key at request time: the log handler reads the + * ingress ifindex from the connection's IP_PKTINFO/IPV6_PKTINFO cmsg + * and matches against this table. * - * `needs_clone` marks a listen directive that shares a sockaddr with - * an earlier binding. Nginx's core listen handler rejects such - * duplicates with "a duplicate listen ..."; our wrapper therefore - * skips the core call for these, and init_module manufactures an - * ngx_listening_t for each by cloning the template it shares the - * sockaddr with. */ + * We deliberately don't use SO_BINDTODEVICE on the listening sockets — + * that option pins *egress* to the bound interface too, which breaks + * maglev / DSR deployments where the SYN arrives via a GRE tunnel and + * the SYN-ACK must leave via the default route. See docs/design.md. */ typedef struct { - ngx_str_t device; - ngx_str_t source; - ngx_sockaddr_t sockaddr; - socklen_t socklen; - ngx_listening_t *listening; /* filled at init_module */ - unsigned needs_clone:1; + ngx_str_t device; + ngx_str_t source; + ngx_uint_t ifindex; /* filled at init_module (0 until resolved) */ + sa_family_t family; /* AF_INET / AF_INET6 of the listen sockaddr */ } ngx_http_ipng_stats_binding_t; @@ -525,17 +524,15 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd, return ngx_http_core_listen_orig(cf, cmd, conf); } - /* Parse the listen address ourselves so we can dedup on sockaddr. - * We need this before adding `bind` so we can decide whether to - * call the core handler at all. */ + /* Parse the listen address ourselves so we can dedup per (cscf, + * sockaddr) — nginx's core handler rejects the same sockaddr + * appearing twice in the same server block. 
*/ ngx_memzero(&u, sizeof(ngx_url_t)); u.url = value[1]; u.listen = 1; u.default_port = 80; if (ngx_parse_url(cf->pool, &u) != NGX_OK || u.naddrs == 0) { - /* Restore the stripped args by just passing through; nginx - * will produce its own parse error. */ return ngx_http_core_listen_orig(cf, cmd, conf); } @@ -556,11 +553,12 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd, } } - /* Decide whether to call the core handler. nginx rejects a - * repeat of (sockaddr, cscf), but accepts the same sockaddr in a - * different server block and accepts different sockaddrs in the - * same server block. Track the (cscf, sockaddr) pairs we've - * already handed off. */ + /* Skip the core handler when this (cscf, sockaddr) pair was + * already processed — matches nginx's own "duplicate listen" + * check and lets a server block carry multiple device-tagged + * listens at the same port. Across different server blocks the + * same sockaddr re-appears and nginx merges the cscf via + * ngx_http_add_server. */ void *cscf = ngx_http_conf_get_module_srv_conf(cf, ngx_http_core_module); ngx_http_ipng_stats_seen_t *seen = imcf->listens_seen->elts; ngx_uint_t same_cscf_sockaddr = 0; @@ -578,37 +576,6 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd, } if (!same_cscf_sockaddr) { - /* First time this (cscf, sockaddr) pair is seen. If this - * sockaddr has never been bound at all, inject `bind` so - * nginx allocates a dedicated listening socket for it (and - * sets opt.set=1 / opt.bind=1). For subsequent cscfs that - * reference the same sockaddr, the address entry already - * exists with opt.set=1; passing `bind` again would trip - * nginx's "duplicate listen options" check, so we skip it - * and let nginx's cross-server-block merge attach this cscf - * via ngx_http_add_server. 
*/ - ngx_http_ipng_stats_binding_t *bs = imcf->bindings->elts; - ngx_uint_t sockaddr_ever_seen = 0; - for (i = 0; i < imcf->bindings->nelts; i++) { - if (bs[i].socklen == u.addrs[0].socklen - && ngx_cmp_sockaddr((struct sockaddr *) &bs[i].sockaddr, - bs[i].socklen, - u.addrs[0].sockaddr, - u.addrs[0].socklen, 1) == NGX_OK) - { - sockaddr_ever_seen = 1; - break; - } - } - - if (!sockaddr_ever_seen) { - ngx_str_t *bind_arg = ngx_array_push(cf->args); - if (bind_arg == NULL) { - return NGX_CONF_ERROR; - } - ngx_str_set(bind_arg, "bind"); - } - rv = ngx_http_core_listen_orig(cf, cmd, conf); if (rv != NGX_CONF_OK) { return rv; @@ -621,67 +588,44 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd, ngx_memcpy(&s->sockaddr, u.addrs[0].sockaddr, u.addrs[0].socklen); } - /* Dedup bindings on (sockaddr, device) globally: two server - * blocks that share the same `include` only produce one binding - * per (sockaddr, device) pair. init_module creates one socket per - * unique binding regardless of how many server blocks referenced - * it. */ - for (i = 0; i < u.naddrs; i++) { + /* Dedup bindings on (device, family) pair. Same device appearing + * under multiple server blocks (because they share a listen + * include) collapses to a single binding per family. The same + * device can still carry different source tags for IPv4 vs IPv6 + * — the resolver looks up on (ifindex, family). 
*/ + sa_family_t fam = u.addrs[0].sockaddr->sa_family; + if (device.len > 0) { ngx_http_ipng_stats_binding_t *existing = imcf->bindings->elts; - ngx_uint_t j; - ngx_uint_t dup_binding = 0; - ngx_uint_t sockaddr_reused = 0; - - for (j = 0; j < imcf->bindings->nelts; j++) { - if (existing[j].socklen == u.addrs[i].socklen - && ngx_cmp_sockaddr((struct sockaddr *) &existing[j].sockaddr, - existing[j].socklen, - u.addrs[i].sockaddr, - u.addrs[i].socklen, 1) == NGX_OK) + for (i = 0; i < imcf->bindings->nelts; i++) { + if (existing[i].family == fam + && existing[i].device.len == device.len + && ngx_memcmp(existing[i].device.data, device.data, + device.len) == 0) { - sockaddr_reused = 1; - if (existing[j].device.len == device.len - && (device.len == 0 - || ngx_memcmp(existing[j].device.data, device.data, - device.len) == 0)) - { - dup_binding = 1; - break; - } + return NGX_CONF_OK; } } + } - if (dup_binding) continue; + b = ngx_array_push(imcf->bindings); + if (b == NULL) return NGX_CONF_ERROR; + ngx_memzero(b, sizeof(*b)); + b->family = fam; - b = ngx_array_push(imcf->bindings); - if (b == NULL) { - return NGX_CONF_ERROR; - } - ngx_memzero(b, sizeof(*b)); - - if (device.len > 0) { - b->device.data = ngx_pnalloc(cf->pool, device.len); - if (b->device.data == NULL) { - return NGX_CONF_ERROR; - } - ngx_memcpy(b->device.data, device.data, device.len); - b->device.len = device.len; - } - if (source.len > 0) { - b->source.data = ngx_pnalloc(cf->pool, source.len); - if (b->source.data == NULL) { - return NGX_CONF_ERROR; - } - ngx_memcpy(b->source.data, source.data, source.len); - b->source.len = source.len; - } else if (device.len > 0) { - /* FR-1.4: default source = device name. */ - b->source = b->device; - } - - b->socklen = u.addrs[i].socklen; - ngx_memcpy(&b->sockaddr, u.addrs[i].sockaddr, u.addrs[i].socklen); - b->needs_clone = sockaddr_reused ? 
1 : 0; + if (device.len > 0) { + b->device.data = ngx_pnalloc(cf->pool, device.len); + if (b->device.data == NULL) return NGX_CONF_ERROR; + ngx_memcpy(b->device.data, device.data, device.len); + b->device.len = device.len; + } + if (source.len > 0) { + b->source.data = ngx_pnalloc(cf->pool, source.len); + if (b->source.data == NULL) return NGX_CONF_ERROR; + ngx_memcpy(b->source.data, source.data, source.len); + b->source.len = source.len; + } else if (device.len > 0) { + /* FR-1.4: default source = device name. */ + b->source = b->device; } return NGX_CONF_OK; @@ -1265,103 +1209,28 @@ ngx_http_ipng_stats_init_zone(ngx_shm_zone_t *shm_zone, void *data) -/* init_module: rebind listen sockets with SO_BINDTODEVICE */ +/* init_module: resolve ifindexes and enable ingress pktinfo */ /* ----------------------------------------------------------------- */ -/* Create a device-pinned listening socket: socket + SO_REUSEADDR + - * SO_REUSEPORT + SO_BINDTODEVICE (set *before* bind, which is what - * lets the kernel permit multiple sockets on the same wildcard - * addr+port) + bind + listen + nonblocking. +/* init_module: enable IP_PKTINFO / IPV6_RECVPKTINFO on every HTTP + * listening socket so that each accepted TCP connection carries an + * IP_PKTINFO/IPV6_PKTINFO cmsg the log handler can retrieve via + * getsockopt(IP_PKTOPTIONS). Resolve each configured device= name + * to an ifindex once, up front, since the log handler's attribution + * lookup is ifindex-keyed on the hot path. * - * SO_REUSEPORT is required so that on `nginx -s reload` the new - * master's rebind doesn't collide with the still-bound sockets held - * by the old workers during their graceful-drain window. The kernel - * still uses SO_BINDTODEVICE to filter per device, so traffic - * attribution stays correct. - * - * Returns the new fd, or -1 on failure with errno preserved. 
*/ -static ngx_socket_t -ngx_http_ipng_stats_open_dev_socket(ngx_cycle_t *cycle, - struct sockaddr *sa, socklen_t salen, int backlog, ngx_str_t *device) -{ - ngx_socket_t s; - int one = 1; - char devname[IFNAMSIZ]; - size_t dlen; - - s = socket(sa->sa_family, SOCK_STREAM, 0); - if (s == (ngx_socket_t) -1) { - ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno, - "ipng_stats: socket() failed"); - return (ngx_socket_t) -1; - } - if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) == -1) { - ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno, - "ipng_stats: setsockopt(SO_REUSEADDR) failed"); - close(s); - return (ngx_socket_t) -1; - } - if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) == -1) { - ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno, - "ipng_stats: setsockopt(SO_REUSEPORT) failed"); - close(s); - return (ngx_socket_t) -1; - } -#if (NGX_HAVE_INET6) - if (sa->sa_family == AF_INET6) { - /* Match nginx's default (ipv6only=on): keep the [::]:X listen - * strictly IPv6. Without this, Linux's bindv6only=0 default - * makes the socket claim the IPv4 wildcard too, which collides - * with sibling IPv4-specific listens on the same port (e.g. a - * mgmt-address listener). */ - if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)) == -1) { - ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno, - "ipng_stats: setsockopt(IPV6_V6ONLY) failed"); - close(s); - return (ngx_socket_t) -1; - } - } -#endif - dlen = device->len < IFNAMSIZ - 1 ? 
device->len : IFNAMSIZ - 1; - ngx_memcpy(devname, device->data, dlen); - devname[dlen] = '\0'; - if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, devname, - (socklen_t) (dlen + 1)) == -1) - { - ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno, - "ipng_stats: setsockopt(SO_BINDTODEVICE, \"%s\") failed", - devname); - close(s); - return (ngx_socket_t) -1; - } - if (bind(s, sa, salen) == -1) { - ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno, - "ipng_stats: bind() failed for device \"%s\"", devname); - close(s); - return (ngx_socket_t) -1; - } - if (listen(s, backlog) == -1) { - ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno, - "ipng_stats: listen() failed for device \"%s\"", devname); - close(s); - return (ngx_socket_t) -1; - } - if (ngx_nonblocking(s) == -1) { - ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno, - "ipng_stats: fcntl(O_NONBLOCK) failed"); - close(s); - return (ngx_socket_t) -1; - } - return s; -} - - + * Crucially the listening sockets themselves are left as plain + * wildcards — no SO_BINDTODEVICE, no extra sockets — so the kernel + * returns outgoing packets via the normal routing table. This is + * what makes DSR / maglev deployments work: SYN arrives through a + * GRE tunnel, SYN-ACK leaves via the default route. */ static ngx_int_t ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle) { ngx_http_ipng_stats_main_conf_t *imcf; ngx_http_ipng_stats_binding_t *bindings; - ngx_listening_t *ls, *tmpl, *dup; - ngx_uint_t i, j; - ngx_uint_t *target_idx; - u_char *claimed; + ngx_listening_t *ls; + ngx_uint_t i; + int one = 1; + char devname[IFNAMSIZ]; + size_t dlen; imcf = ngx_http_cycle_get_module_main_conf(cycle, ngx_http_ipng_stats_module); @@ -1370,116 +1239,52 @@ ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle) } bindings = imcf->bindings->elts; - - /* Phase 1: append a cloned ngx_listening_t for every duplicate - * binding. ngx_array_push may reallocate elts, so we store indices - * not pointers. 
`target_idx[j]` is the index into cycle->listening - * that binding j ends up owning. */ - target_idx = ngx_pcalloc(cycle->pool, - imcf->bindings->nelts * sizeof(ngx_uint_t)); - if (target_idx == NULL) { - return NGX_ERROR; - } - - for (j = 0; j < imcf->bindings->nelts; j++) { - if (!bindings[j].needs_clone) continue; - - ls = cycle->listening.elts; - tmpl = NULL; - for (i = 0; i < cycle->listening.nelts; i++) { - if (ls[i].socklen == bindings[j].socklen - && ngx_cmp_sockaddr(ls[i].sockaddr, ls[i].socklen, - (struct sockaddr *) &bindings[j].sockaddr, - bindings[j].socklen, 1) == NGX_OK) - { - tmpl = &ls[i]; - break; - } - } - if (tmpl == NULL) { - ngx_log_error(NGX_LOG_EMERG, cycle->log, 0, - "ipng_stats: init_module: no template listening " - "found for cloned binding (source=\"%V\")", - &bindings[j].source); - return NGX_ERROR; - } - - dup = ngx_array_push(&cycle->listening); - if (dup == NULL) return NGX_ERROR; - *dup = *tmpl; - dup->fd = (ngx_socket_t) -1; - dup->previous = NULL; - - target_idx[j] = cycle->listening.nelts - 1; - } - - /* Phase 2: map every non-clone binding to an existing listening - * entry (by sockaddr, first unclaimed wins). */ - claimed = ngx_pcalloc(cycle->pool, cycle->listening.nelts); - if (claimed == NULL) return NGX_ERROR; - - for (j = 0; j < imcf->bindings->nelts; j++) { - if (bindings[j].needs_clone) { - claimed[target_idx[j]] = 1; + for (i = 0; i < imcf->bindings->nelts; i++) { + if (bindings[i].device.len == 0) continue; + dlen = bindings[i].device.len < IFNAMSIZ - 1 + ? 
bindings[i].device.len : IFNAMSIZ - 1; + ngx_memcpy(devname, bindings[i].device.data, dlen); + devname[dlen] = '\0'; + bindings[i].ifindex = if_nametoindex(devname); + if (bindings[i].ifindex == 0) { + ngx_log_error(NGX_LOG_WARN, cycle->log, ngx_errno, + "ipng_stats: if_nametoindex(\"%s\") failed — " + "traffic via that interface will fall back to " + "the default source", devname); continue; } - ls = cycle->listening.elts; - for (i = 0; i < cycle->listening.nelts; i++) { - if (claimed[i]) continue; - if (ls[i].socklen != bindings[j].socklen) continue; - if (ngx_cmp_sockaddr(ls[i].sockaddr, ls[i].socklen, - (struct sockaddr *) &bindings[j].sockaddr, - bindings[j].socklen, 1) != NGX_OK) - { - continue; - } - target_idx[j] = i; - claimed[i] = 1; - goto found; - } - ngx_log_error(NGX_LOG_EMERG, cycle->log, 0, - "ipng_stats: init_module: no listening entry for " - "binding (source=\"%V\")", &bindings[j].source); - return NGX_ERROR; - found: - ; - } - - /* Phase 3: close every pre-bound fd on a target listening entry. - * nginx ran ngx_open_listening_sockets before init_module, so the - * first-seen listen at each sockaddr has a naked bind that would - * block subsequent device-pinned binds on the same addr. Free the - * ports before we rebind. */ - ls = cycle->listening.elts; - for (j = 0; j < imcf->bindings->nelts; j++) { - ngx_listening_t *t = &ls[target_idx[j]]; - if (t->fd != (ngx_socket_t) -1) { - close(t->fd); - t->fd = (ngx_socket_t) -1; - } - } - - /* Phase 4: rebind each target with SO_BINDTODEVICE set before - * bind(). `ls->inherited = 1` tells nginx not to touch the socket - * in any subsequent setup pass. 
*/ - for (j = 0; j < imcf->bindings->nelts; j++) { - ngx_listening_t *t = &ls[target_idx[j]]; - ngx_socket_t s; - - s = ngx_http_ipng_stats_open_dev_socket(cycle, - t->sockaddr, t->socklen, t->backlog, &bindings[j].device); - if (s == (ngx_socket_t) -1) { - return NGX_ERROR; - } - t->fd = s; - t->inherited = 1; - bindings[j].listening = t; - ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0, - "ipng_stats: listen %V bound to device \"%V\" " - "(source=\"%V\", fd=%d)", - &t->addr_text, &bindings[j].device, - &bindings[j].source, (int) s); + "ipng_stats: device \"%V\" -> ifindex %ui " + "(source=\"%V\")", + &bindings[i].device, bindings[i].ifindex, + &bindings[i].source); + } + + ls = cycle->listening.elts; + for (i = 0; i < cycle->listening.nelts; i++) { + if (ls[i].fd == (ngx_socket_t) -1) continue; + if (ls[i].sockaddr == NULL) continue; + if (ls[i].type != SOCK_STREAM) continue; + + if (ls[i].sockaddr->sa_family == AF_INET) { + if (setsockopt(ls[i].fd, IPPROTO_IP, IP_PKTINFO, + &one, sizeof(one)) == -1) + { + ngx_log_error(NGX_LOG_WARN, cycle->log, ngx_errno, + "ipng_stats: setsockopt(IP_PKTINFO) failed " + "on listen fd %d", (int) ls[i].fd); + } +#if (NGX_HAVE_INET6) + } else if (ls[i].sockaddr->sa_family == AF_INET6) { + if (setsockopt(ls[i].fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, + &one, sizeof(one)) == -1) + { + ngx_log_error(NGX_LOG_WARN, cycle->log, ngx_errno, + "ipng_stats: setsockopt(IPV6_RECVPKTINFO) " + "failed on listen fd %d", (int) ls[i].fd); + } +#endif + } } return NGX_OK; @@ -1810,17 +1615,71 @@ ngx_http_ipng_stats_bucket_index(ngx_uint_t ms, ngx_uint_t *bounds, } +/* Determine the source tag for a request by asking the kernel which + * interface the connection's SYN came in on. IP_PKTINFO / + * IPV6_RECVPKTINFO (enabled on the listening socket at init_module + * time) tells the kernel to stash an in(6)_pktinfo cmsg on each + * accepted connection; getsockopt(IP_PKTOPTIONS) returns that cmsg + * here. 
The ifindex lookup is cheap — a short linear scan, since a + * host has O(10) attributed interfaces — and avoids the + * SO_BINDTODEVICE egress-pinning problem. */ static ngx_int_t ngx_http_ipng_stats_resolve_source(ngx_http_request_t *r, ngx_http_ipng_stats_main_conf_t *imcf, ngx_str_t *source_out) { ngx_http_ipng_stats_binding_t *b; - ngx_uint_t i; + ngx_uint_t i, ifindex = 0; + u_char cbuf[256]; + socklen_t clen = sizeof(cbuf); + struct cmsghdr *cm; + struct msghdr mh; + int level, optname; - if (imcf->bindings != NULL) { + if (imcf->bindings == NULL || imcf->bindings->nelts == 0) { + *source_out = imcf->default_source; + return NGX_OK; + } + + if (r->connection->local_sockaddr + && r->connection->local_sockaddr->sa_family == AF_INET6) + { + level = IPPROTO_IPV6; + /* glibc's <netinet/in.h> only exposes the legacy name for + * this option; the numeric value (6) is the same as the + * unnamed "modern" IPV6_PKTOPTIONS. */ + optname = IPV6_2292PKTOPTIONS; + } else { + level = IPPROTO_IP; + optname = IP_PKTOPTIONS; + } + + if (getsockopt(r->connection->fd, level, optname, cbuf, &clen) == 0) { + ngx_memzero(&mh, sizeof(mh)); + mh.msg_control = cbuf; + mh.msg_controllen = clen; + for (cm = CMSG_FIRSTHDR(&mh); cm; cm = CMSG_NXTHDR(&mh, cm)) { + if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_PKTINFO) { + ifindex = ((struct in_pktinfo *) CMSG_DATA(cm))->ipi_ifindex; + break; + } +#if (NGX_HAVE_INET6) + if (cm->cmsg_level == IPPROTO_IPV6 + && cm->cmsg_type == IPV6_PKTINFO) + { + ifindex = ((struct in6_pktinfo *) CMSG_DATA(cm))->ipi6_ifindex; + break; + } +#endif + } + } + + if (ifindex > 0) { + sa_family_t fam = r->connection->local_sockaddr + ? 
r->connection->local_sockaddr->sa_family + : AF_INET; b = imcf->bindings->elts; for (i = 0; i < imcf->bindings->nelts; i++) { - if (b[i].listening == r->connection->listening) { + if (b[i].ifindex == ifindex && b[i].family == fam) { *source_out = b[i].source; return NGX_OK; } diff --git a/tests/01-module/01-e2e.robot b/tests/01-module/01-e2e.robot index 6a0b8f4..b0a91db 100644 --- a/tests/01-module/01-e2e.robot +++ b/tests/01-module/01-e2e.robot @@ -35,16 +35,17 @@ Shared-listen-include across multiple server blocks ... must start without "conflicting server name" or ... "duplicate listen options" warnings, and the ... module must end up with exactly one listening - ... socket per (device, family) pair — not one per - ... (server block × device × family), which would - ... exhaust the fd table on a real host. + ... socket per address family on port 8080 (one for + ... v4 wildcard, one for v6) — not one per (server + ... block × device × family), which would exhaust + ... the fd table on a real host. ${output} = Docker Exec ${SERVER} nginx -t 2>&1 Should Not Contain ${output} conflicting server name Should Not Contain ${output} duplicate listen ${listens} = Docker Exec ${SERVER} ss -tlnH ${count} = Get Regexp Matches ${listens} :8080\\s - Length Should Be ${count} 4 - ... Expected 4 listening sockets on port 8080 (v4+v6 × eth1+eth2); got ${count} + Length Should Be ${count} 2 + ... Expected 2 listening sockets on port 8080 (v4+v6 wildcards); got ${count} Prometheus scrape [Documentation] Scrape returns HELP/TYPE preamble.