Support multiple device-pinned listens sharing a single port

Nginx's config-level duplicate-listen check rejected the
documented pattern of `listen 80 device=X ipng_source_tag=A;
listen 80 device=Y ipng_source_tag=B;` with "a duplicate listen
0.0.0.0:80", and even when the dedup was bypassed the kernel
refused the second bind() because the first socket was already
holding the port without SO_BINDTODEVICE.

The listen wrapper now detects same-sockaddr duplicates before
the core handler sees them and records them with `needs_clone=1`.
In init_module, phase 1 clones an ngx_listening_t for each such
duplicate, phase 2 maps every remaining binding to its existing
listening entry, phase 3 closes the fd nginx already bound without
SO_BINDTODEVICE on every target entry, and phase 4 rebinds every
target with SO_REUSEADDR + SO_REUSEPORT + SO_BINDTODEVICE set
before bind(). SO_REUSEPORT keeps
`nginx -s reload` from colliding with the still-bound sockets
held by old workers during graceful drain; IPV6_V6ONLY matches
nginx's default so the IPv6 listen doesn't claim the IPv4
wildcard and collide with sibling IPv4-specific listens.

Restructure 01-module to cover the pattern end-to-end: four
device-pinned listens on port 8080 (eth1 shares tag `tag1`
across v4 and v6; eth2 splits into `tag2-v4` / `tag2-v6`),
clients and server both get IPv6 addresses, and a new
"Per-(device, family) request count accuracy" case proves that
10 requests on each of the four combinations yields tag1=20,
tag2-v4=10, tag2-v6=10. Mgmt/direct traffic moves to port 9180
so it no longer clashes with the shared-port wildcards.

Document the constraint in docs/user-guide.md: all listens on
a given port must carry `device=`, and direct traffic belongs
on a separate port.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-18 11:45:40 +02:00
parent fdef2a552b
commit df05bae8a3
7 changed files with 400 additions and 153 deletions

View File

@@ -182,13 +182,21 @@ typedef struct {
/* Per-listen binding recorded by the listen wrapper at config parse
* time. Resolved to an ngx_listening_t* at init_module time. */
* time. Resolved to an ngx_listening_t* at init_module time.
*
* `needs_clone` marks a listen directive that shares a sockaddr with
* an earlier binding. Nginx's core listen handler rejects such
* duplicates with "a duplicate listen ..."; our wrapper therefore
* skips the core call for these, and init_module manufactures an
* ngx_listening_t for each by cloning the template it shares the
* sockaddr with. */
typedef struct {
/* Interface name handed to SO_BINDTODEVICE when init_module reopens
 * the socket for this binding. Presumably the value of the `device=`
 * listen parameter -- the assignment is outside this view; confirm. */
ngx_str_t device;
/* Tag reported as source="..." in this module's log lines. Presumably
 * the value of `ipng_source_tag=` -- confirm against the wrapper. */
ngx_str_t source;
/* Address parsed from the listen directive (copied from the
 * ngx_parse_url result); compared against cycle->listening[] entries
 * with ngx_cmp_sockaddr to find or clone the matching listener. */
ngx_sockaddr_t sockaddr;
/* Number of valid bytes in `sockaddr`. */
socklen_t socklen;
/* Listening entry this binding owns; NULL-equivalent until
 * init_module assigns it after rebinding the socket. */
ngx_listening_t *listening; /* filled at init_module */
/* Set by the listen wrapper when an earlier binding already uses the
 * same sockaddr: the core listen handler is skipped for this directive
 * and init_module appends a cloned ngx_listening_t for it instead. */
unsigned needs_clone:1;
} ngx_http_ipng_stats_binding_t;
@@ -495,47 +503,33 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd,
i++;
}
if (device.len > 0 || source.len > 0) {
/* Force nginx to create a dedicated listening socket for this
* address even when a wildcard on the same port already exists.
* Without `bind`, nginx's optimizer eliminates specific-address
* sockets that are covered by a wildcard, which would prevent us
* from applying SO_BINDTODEVICE and tagging traffic per device. */
ngx_str_t *bind_arg = ngx_array_push(cf->args);
if (bind_arg == NULL) {
return NGX_CONF_ERROR;
}
ngx_str_set(bind_arg, "bind");
}
rv = ngx_http_core_listen_orig(cf, cmd, conf);
if (rv != NGX_CONF_OK) {
return rv;
}
if (device.len == 0 && source.len == 0) {
return NGX_CONF_OK;
/* Plain listen with no module-specific parameters — let nginx
* handle it end-to-end. */
return ngx_http_core_listen_orig(cf, cmd, conf);
}
if (cf->args->nelts < 2) {
return NGX_CONF_OK;
/* Force nginx to bind a dedicated socket for this address rather
* than folding it into a wildcard. Without `bind`, nginx's listen
* optimizer discards specific-address entries covered by a
* wildcard, which prevents us from applying SO_BINDTODEVICE. */
ngx_str_t *bind_arg = ngx_array_push(cf->args);
if (bind_arg == NULL) {
return NGX_CONF_ERROR;
}
ngx_str_set(bind_arg, "bind");
/* Listen options are not stored on the core srv conf in a way we
* can cheaply recover after the original handler runs (the core
* `listen` field is a 1-bit flag). Instead we reparse the address
* argument ourselves so we know which sockaddr to match against
* cycle->listening[] at init_module time. */
/* Parse the listen address ourselves: we need the sockaddr to
* detect duplicates (multiple `listen 80 device=X` at the same
* addr) and to match bindings to cycle->listening[] entries in
* init_module. */
ngx_memzero(&u, sizeof(ngx_url_t));
u.url = value[1];
u.listen = 1;
u.default_port = 80;
if (ngx_parse_url(cf->pool, &u) != NGX_OK || u.naddrs == 0) {
/* The original handler already accepted this address, so a
* reparse failure would be surprising. Skip binding rather
* than fail the reload. */
return NGX_CONF_OK;
return ngx_http_core_listen_orig(cf, cmd, conf);
}
imcf = ngx_http_conf_get_module_main_conf(cf, ngx_http_ipng_stats_module);
@@ -548,6 +542,31 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd,
}
}
/* Call the core handler at most once per (addr, port). Any
* subsequent listen at the same sockaddr would hit nginx's
* duplicate-listen check — we skip it and let init_module clone
* the first-seen listening entry for each duplicate. */
ngx_http_ipng_stats_binding_t *existing = imcf->bindings->elts;
ngx_uint_t dup = 0;
for (i = 0; i < imcf->bindings->nelts; i++) {
if (existing[i].socklen == u.addrs[0].socklen
&& ngx_cmp_sockaddr((struct sockaddr *) &existing[i].sockaddr,
existing[i].socklen,
u.addrs[0].sockaddr,
u.addrs[0].socklen, 1) == NGX_OK)
{
dup = 1;
break;
}
}
if (!dup) {
rv = ngx_http_core_listen_orig(cf, cmd, conf);
if (rv != NGX_CONF_OK) {
return rv;
}
}
/* Record one binding per resolved address (ngx_parse_url may yield
* multiple for a hostname; listen specs use literal addresses so
* naddrs is almost always 1). */
@@ -580,6 +599,7 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd,
b->socklen = u.addrs[i].socklen;
ngx_memcpy(&b->sockaddr, u.addrs[i].sockaddr, u.addrs[i].socklen);
b->needs_clone = dup ? 1 : 0;
}
return NGX_CONF_OK;
@@ -1160,98 +1180,226 @@ ngx_http_ipng_stats_init_zone(ngx_shm_zone_t *shm_zone, void *data)
/* ----------------------------------------------------------------- */
/* init_module: apply SO_BINDTODEVICE to the opened listen sockets */
/* init_module: rebind listen sockets with SO_BINDTODEVICE */
/* ----------------------------------------------------------------- */
/* Create a device-pinned, non-blocking listening socket.
 *
 * Sequence: socket() -> SO_REUSEADDR -> SO_REUSEPORT -> IPV6_V6ONLY
 * (AF_INET6 only) -> SO_BINDTODEVICE -> bind() -> listen() ->
 * O_NONBLOCK. SO_BINDTODEVICE is set *before* bind(), which is what
 * lets the kernel permit multiple sockets on the same wildcard
 * addr+port.
 *
 * SO_REUSEPORT is required so that on `nginx -s reload` the new
 * master's rebind doesn't collide with the still-bound sockets held
 * by the old workers during their graceful-drain window. The kernel
 * still uses SO_BINDTODEVICE to filter per device, so traffic
 * attribution stays correct.
 *
 * Parameters:
 *   cycle    - only cycle->log is used, for error reporting.
 *   sa/salen - address to bind; sa->sa_family selects the socket family.
 *   backlog  - passed unchanged to listen().
 *   device   - interface name (length-counted, not NUL-terminated).
 *              Must be shorter than IFNAMSIZ; a longer name is rejected
 *              rather than truncated, because a truncated name could
 *              silently pin the socket to a different interface.
 *
 * Returns the new fd, or -1 on failure. Every failure is logged at
 * EMERG, the partially set-up socket is closed, and errno is restored
 * to the value of the failing call (logging and close() may otherwise
 * overwrite it). An over-long device name fails with ENAMETOOLONG. */
static ngx_socket_t
ngx_http_ipng_stats_open_dev_socket(ngx_cycle_t *cycle,
    struct sockaddr *sa, socklen_t salen, int backlog, ngx_str_t *device)
{
    ngx_socket_t  s;
    int           one = 1;
    int           err;
    char          devname[IFNAMSIZ];

    if (device->len >= IFNAMSIZ) {
        ngx_log_error(NGX_LOG_EMERG, cycle->log, 0,
                      "ipng_stats: device name \"%V\" is too long "
                      "(at most %d bytes)",
                      device, (int) (IFNAMSIZ - 1));
        errno = ENAMETOOLONG;
        return (ngx_socket_t) -1;
    }

    /* NUL-terminated copy for setsockopt() and the %s log arguments.
     * An empty device yields "", which per socket(7) requests no device
     * binding at all. */
    ngx_memcpy(devname, device->data, device->len);
    devname[device->len] = '\0';

    s = socket(sa->sa_family, SOCK_STREAM, 0);
    if (s == (ngx_socket_t) -1) {
        err = ngx_errno;
        ngx_log_error(NGX_LOG_EMERG, cycle->log, err,
                      "ipng_stats: socket() failed");
        errno = err;
        return (ngx_socket_t) -1;
    }

    if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) == -1) {
        err = ngx_errno;
        ngx_log_error(NGX_LOG_EMERG, cycle->log, err,
                      "ipng_stats: setsockopt(SO_REUSEADDR) failed");
        goto failed;
    }

    if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) == -1) {
        err = ngx_errno;
        ngx_log_error(NGX_LOG_EMERG, cycle->log, err,
                      "ipng_stats: setsockopt(SO_REUSEPORT) failed");
        goto failed;
    }

#if (NGX_HAVE_INET6)
    if (sa->sa_family == AF_INET6) {
        /* Match nginx's default (ipv6only=on): keep the [::]:X listen
         * strictly IPv6. Without this, Linux's bindv6only=0 default
         * makes the socket claim the IPv4 wildcard too, which collides
         * with sibling IPv4-specific listens on the same port (e.g. a
         * mgmt-address listener). */
        if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one))
            == -1)
        {
            err = ngx_errno;
            ngx_log_error(NGX_LOG_EMERG, cycle->log, err,
                          "ipng_stats: setsockopt(IPV6_V6ONLY) failed");
            goto failed;
        }
    }
#endif

    /* Must precede bind(): the device pin is what allows several
     * sockets to share one addr+port. */
    if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, devname,
                   (socklen_t) (device->len + 1)) == -1)
    {
        err = ngx_errno;
        ngx_log_error(NGX_LOG_EMERG, cycle->log, err,
                      "ipng_stats: setsockopt(SO_BINDTODEVICE, \"%s\") failed",
                      devname);
        goto failed;
    }

    if (bind(s, sa, salen) == -1) {
        err = ngx_errno;
        ngx_log_error(NGX_LOG_EMERG, cycle->log, err,
                      "ipng_stats: bind() failed for device \"%s\"", devname);
        goto failed;
    }

    if (listen(s, backlog) == -1) {
        err = ngx_errno;
        ngx_log_error(NGX_LOG_EMERG, cycle->log, err,
                      "ipng_stats: listen() failed for device \"%s\"", devname);
        goto failed;
    }

    if (ngx_nonblocking(s) == -1) {
        err = ngx_errno;
        ngx_log_error(NGX_LOG_EMERG, cycle->log, err,
                      "ipng_stats: fcntl(O_NONBLOCK) failed");
        goto failed;
    }

    return s;

failed:

    /* Single cleanup path: release the socket, then put back the errno
     * of the call that actually failed so the caller can inspect it. */
    close(s);
    errno = err;

    return (ngx_socket_t) -1;
}
static ngx_int_t
ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle)
{
ngx_http_ipng_stats_main_conf_t *imcf;
ngx_http_ipng_stats_binding_t *bindings;
ngx_listening_t *ls;
ngx_listening_t *ls, *tmpl, *dup;
ngx_uint_t i, j;
char devname[IFNAMSIZ];
size_t dlen;
ngx_uint_t *target_idx;
u_char *claimed;
imcf = ngx_http_cycle_get_module_main_conf(cycle,
ngx_http_ipng_stats_module);
if (imcf == NULL) {
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
"ipng_stats: init_module: imcf is NULL");
return NGX_OK;
}
if (imcf->bindings == NULL) {
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
"ipng_stats: init_module: no bindings (no device= "
"or ipng_source_tag= on any listen)");
if (imcf == NULL || imcf->bindings == NULL) {
return NGX_OK;
}
bindings = imcf->bindings->elts;
ls = cycle->listening.elts;
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
"ipng_stats: init_module: %ui bindings, %ui listeners",
imcf->bindings->nelts, cycle->listening.nelts);
for (i = 0; i < cycle->listening.nelts; i++) {
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
"ipng_stats: listener[%ui]: fd=%d addr=%V socklen=%d",
i, (int) ls[i].fd, &ls[i].addr_text,
(int) ls[i].socklen);
/* Phase 1: append a cloned ngx_listening_t for every duplicate
* binding. ngx_array_push may reallocate elts, so we store indices
* not pointers. `target_idx[j]` is the index into cycle->listening
* that binding j ends up owning. */
target_idx = ngx_pcalloc(cycle->pool,
imcf->bindings->nelts * sizeof(ngx_uint_t));
if (target_idx == NULL) {
return NGX_ERROR;
}
for (j = 0; j < imcf->bindings->nelts; j++) {
ngx_int_t matched = 0;
if (!bindings[j].needs_clone) continue;
ls = cycle->listening.elts;
tmpl = NULL;
for (i = 0; i < cycle->listening.nelts; i++) {
if (ls[i].socklen != bindings[j].socklen) {
continue;
if (ls[i].socklen == bindings[j].socklen
&& ngx_cmp_sockaddr(ls[i].sockaddr, ls[i].socklen,
(struct sockaddr *) &bindings[j].sockaddr,
bindings[j].socklen, 1) == NGX_OK)
{
tmpl = &ls[i];
break;
}
}
if (tmpl == NULL) {
ngx_log_error(NGX_LOG_EMERG, cycle->log, 0,
"ipng_stats: init_module: no template listening "
"found for cloned binding (source=\"%V\")",
&bindings[j].source);
return NGX_ERROR;
}
dup = ngx_array_push(&cycle->listening);
if (dup == NULL) return NGX_ERROR;
*dup = *tmpl;
dup->fd = (ngx_socket_t) -1;
dup->previous = NULL;
target_idx[j] = cycle->listening.nelts - 1;
}
/* Phase 2: map every non-clone binding to an existing listening
* entry (by sockaddr, first unclaimed wins). */
claimed = ngx_pcalloc(cycle->pool, cycle->listening.nelts);
if (claimed == NULL) return NGX_ERROR;
for (j = 0; j < imcf->bindings->nelts; j++) {
if (bindings[j].needs_clone) {
claimed[target_idx[j]] = 1;
continue;
}
ls = cycle->listening.elts;
for (i = 0; i < cycle->listening.nelts; i++) {
if (claimed[i]) continue;
if (ls[i].socklen != bindings[j].socklen) continue;
if (ngx_cmp_sockaddr(ls[i].sockaddr, ls[i].socklen,
(struct sockaddr *) &bindings[j].sockaddr,
bindings[j].socklen, 1) != NGX_OK)
{
continue;
}
matched = 1;
bindings[j].listening = &ls[i];
if (bindings[j].device.len > 0 && ls[i].fd != (ngx_socket_t) -1) {
dlen = bindings[j].device.len < IFNAMSIZ - 1
? bindings[j].device.len : IFNAMSIZ - 1;
ngx_memcpy(devname, bindings[j].device.data, dlen);
devname[dlen] = '\0';
if (setsockopt(ls[i].fd, SOL_SOCKET, SO_BINDTODEVICE,
devname, (socklen_t) (dlen + 1)) == -1)
{
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
"ipng_stats: setsockopt(SO_BINDTODEVICE, "
"\"%s\") failed for listen fd %d",
devname, (int) ls[i].fd);
return NGX_ERROR;
}
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
"ipng_stats: bound listen fd %d to device "
"\"%s\" (source=\"%V\")",
(int) ls[i].fd, devname, &bindings[j].source);
}
break;
target_idx[j] = i;
claimed[i] = 1;
goto found;
}
if (!matched) {
u_char buf[NGX_SOCKADDR_STRLEN];
size_t len;
len = ngx_sock_ntop((struct sockaddr *) &bindings[j].sockaddr,
bindings[j].socklen, buf, sizeof(buf), 1);
ngx_log_error(NGX_LOG_WARN, cycle->log, 0,
"ipng_stats: no listener matched binding "
"source=\"%V\" addr=%*s socklen=%d",
&bindings[j].source, len, buf,
(int) bindings[j].socklen);
ngx_log_error(NGX_LOG_EMERG, cycle->log, 0,
"ipng_stats: init_module: no listening entry for "
"binding (source=\"%V\")", &bindings[j].source);
return NGX_ERROR;
found:
;
}
/* Phase 3: close every pre-bound fd on a target listening entry.
* nginx ran ngx_open_listening_sockets before init_module, so the
* first-seen listen at each sockaddr has a naked bind that would
* block subsequent device-pinned binds on the same addr. Free the
* ports before we rebind. */
ls = cycle->listening.elts;
for (j = 0; j < imcf->bindings->nelts; j++) {
ngx_listening_t *t = &ls[target_idx[j]];
if (t->fd != (ngx_socket_t) -1) {
close(t->fd);
t->fd = (ngx_socket_t) -1;
}
}
/* Phase 4: rebind each target with SO_BINDTODEVICE set before
* bind(). `ls->inherited = 1` tells nginx not to touch the socket
* in any subsequent setup pass. */
for (j = 0; j < imcf->bindings->nelts; j++) {
ngx_listening_t *t = &ls[target_idx[j]];
ngx_socket_t s;
s = ngx_http_ipng_stats_open_dev_socket(cycle,
t->sockaddr, t->socklen, t->backlog, &bindings[j].device);
if (s == (ngx_socket_t) -1) {
return NGX_ERROR;
}
t->fd = s;
t->inherited = 1;
bindings[j].listening = t;
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
"ipng_stats: listen %V bound to device \"%V\" "
"(source=\"%V\", fd=%d)",
&t->addr_text, &bindings[j].device,
&bindings[j].source, (int) s);
}
return NGX_OK;
}