Support multiple device-pinned listens sharing a single port
Nginx's config-level duplicate-listen check rejected the documented pattern of `listen 80 device=X ipng_source_tag=A; listen 80 device=Y ipng_source_tag=B;` with "a duplicate listen 0.0.0.0:80", and even when the dedup was bypassed the kernel refused the second bind() because the first socket was already holding the port without SO_BINDTODEVICE. The listen wrapper now detects same-sockaddr duplicates before the core handler sees them and records them with `needs_clone=1`. In init_module, phase 1 clones an ngx_listening_t for each such duplicate, phase 3 closes every inherited naked fd, and phase 4 rebinds every target with SO_REUSEADDR + SO_REUSEPORT + SO_BINDTODEVICE set before bind(). SO_REUSEPORT keeps `nginx -s reload` from colliding with the still-bound sockets held by old workers during graceful drain; IPV6_V6ONLY matches nginx's default so the IPv6 listen doesn't claim the IPv4 wildcard and collide with sibling IPv4-specific listens. Restructure 01-module to cover the pattern end-to-end: four device-pinned listens on port 8080 (eth1 shares tag `tag1` across v4 and v6; eth2 splits into `tag2-v4` / `tag2-v6`), clients and server both get IPv6 addresses, and a new "Per-(device, family) request count accuracy" case proves that 10 requests on each of the four combinations yields tag1=20, tag2-v4=10, tag2-v6=10. Mgmt/direct traffic moves to port 9180 so it no longer clashes with the shared-port wildcards. Document the constraint in docs/user-guide.md: all listens on a given port must carry `device=`, and direct traffic belongs on a separate port. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -182,13 +182,21 @@ typedef struct {
|
||||
|
||||
|
||||
/* Per-listen binding recorded by the listen wrapper at config parse
|
||||
* time. Resolved to an ngx_listening_t* at init_module time. */
|
||||
* time. Resolved to an ngx_listening_t* at init_module time.
|
||||
*
|
||||
* `needs_clone` marks a listen directive that shares a sockaddr with
|
||||
* an earlier binding. Nginx's core listen handler rejects such
|
||||
* duplicates with "a duplicate listen ..."; our wrapper therefore
|
||||
* skips the core call for these, and init_module manufactures an
|
||||
* ngx_listening_t for each by cloning the template it shares the
|
||||
* sockaddr with. */
|
||||
typedef struct {
|
||||
ngx_str_t device;
|
||||
ngx_str_t source;
|
||||
ngx_sockaddr_t sockaddr;
|
||||
socklen_t socklen;
|
||||
ngx_listening_t *listening; /* filled at init_module */
|
||||
unsigned needs_clone:1;
|
||||
} ngx_http_ipng_stats_binding_t;
|
||||
|
||||
|
||||
@@ -495,47 +503,33 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd,
|
||||
i++;
|
||||
}
|
||||
|
||||
if (device.len > 0 || source.len > 0) {
|
||||
/* Force nginx to create a dedicated listening socket for this
|
||||
* address even when a wildcard on the same port already exists.
|
||||
* Without `bind`, nginx's optimizer eliminates specific-address
|
||||
* sockets that are covered by a wildcard, which would prevent us
|
||||
* from applying SO_BINDTODEVICE and tagging traffic per device. */
|
||||
ngx_str_t *bind_arg = ngx_array_push(cf->args);
|
||||
if (bind_arg == NULL) {
|
||||
return NGX_CONF_ERROR;
|
||||
}
|
||||
ngx_str_set(bind_arg, "bind");
|
||||
}
|
||||
|
||||
rv = ngx_http_core_listen_orig(cf, cmd, conf);
|
||||
if (rv != NGX_CONF_OK) {
|
||||
return rv;
|
||||
}
|
||||
|
||||
if (device.len == 0 && source.len == 0) {
|
||||
return NGX_CONF_OK;
|
||||
/* Plain listen with no module-specific parameters — let nginx
|
||||
* handle it end-to-end. */
|
||||
return ngx_http_core_listen_orig(cf, cmd, conf);
|
||||
}
|
||||
|
||||
if (cf->args->nelts < 2) {
|
||||
return NGX_CONF_OK;
|
||||
/* Force nginx to bind a dedicated socket for this address rather
|
||||
* than folding it into a wildcard. Without `bind`, nginx's listen
|
||||
* optimizer discards specific-address entries covered by a
|
||||
* wildcard, which prevents us from applying SO_BINDTODEVICE. */
|
||||
ngx_str_t *bind_arg = ngx_array_push(cf->args);
|
||||
if (bind_arg == NULL) {
|
||||
return NGX_CONF_ERROR;
|
||||
}
|
||||
ngx_str_set(bind_arg, "bind");
|
||||
|
||||
/* Listen options are not stored on the core srv conf in a way we
|
||||
* can cheaply recover after the original handler runs (the core
|
||||
* `listen` field is a 1-bit flag). Instead we reparse the address
|
||||
* argument ourselves so we know which sockaddr to match against
|
||||
* cycle->listening[] at init_module time. */
|
||||
/* Parse the listen address ourselves: we need the sockaddr to
|
||||
* detect duplicates (multiple `listen 80 device=X` at the same
|
||||
* addr) and to match bindings to cycle->listening[] entries in
|
||||
* init_module. */
|
||||
ngx_memzero(&u, sizeof(ngx_url_t));
|
||||
u.url = value[1];
|
||||
u.listen = 1;
|
||||
u.default_port = 80;
|
||||
|
||||
if (ngx_parse_url(cf->pool, &u) != NGX_OK || u.naddrs == 0) {
|
||||
/* The original handler already accepted this address, so a
|
||||
* reparse failure would be surprising. Skip binding rather
|
||||
* than fail the reload. */
|
||||
return NGX_CONF_OK;
|
||||
return ngx_http_core_listen_orig(cf, cmd, conf);
|
||||
}
|
||||
|
||||
imcf = ngx_http_conf_get_module_main_conf(cf, ngx_http_ipng_stats_module);
|
||||
@@ -548,6 +542,31 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd,
|
||||
}
|
||||
}
|
||||
|
||||
/* Call the core handler at most once per (addr, port). Any
|
||||
* subsequent listen at the same sockaddr would hit nginx's
|
||||
* duplicate-listen check — we skip it and let init_module clone
|
||||
* the first-seen listening entry for each duplicate. */
|
||||
ngx_http_ipng_stats_binding_t *existing = imcf->bindings->elts;
|
||||
ngx_uint_t dup = 0;
|
||||
for (i = 0; i < imcf->bindings->nelts; i++) {
|
||||
if (existing[i].socklen == u.addrs[0].socklen
|
||||
&& ngx_cmp_sockaddr((struct sockaddr *) &existing[i].sockaddr,
|
||||
existing[i].socklen,
|
||||
u.addrs[0].sockaddr,
|
||||
u.addrs[0].socklen, 1) == NGX_OK)
|
||||
{
|
||||
dup = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!dup) {
|
||||
rv = ngx_http_core_listen_orig(cf, cmd, conf);
|
||||
if (rv != NGX_CONF_OK) {
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
|
||||
/* Record one binding per resolved address (ngx_parse_url may yield
|
||||
* multiple for a hostname; listen specs use literal addresses so
|
||||
* naddrs is almost always 1). */
|
||||
@@ -580,6 +599,7 @@ ngx_http_ipng_stats_listen_wrapper(ngx_conf_t *cf, ngx_command_t *cmd,
|
||||
|
||||
b->socklen = u.addrs[i].socklen;
|
||||
ngx_memcpy(&b->sockaddr, u.addrs[i].sockaddr, u.addrs[i].socklen);
|
||||
b->needs_clone = dup ? 1 : 0;
|
||||
}
|
||||
|
||||
return NGX_CONF_OK;
|
||||
@@ -1160,98 +1180,226 @@ ngx_http_ipng_stats_init_zone(ngx_shm_zone_t *shm_zone, void *data)
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------- */
|
||||
/* init_module: apply SO_BINDTODEVICE to the opened listen sockets */
|
||||
/* init_module: rebind listen sockets with SO_BINDTODEVICE */
|
||||
/* ----------------------------------------------------------------- */
|
||||
|
||||
/* Create a device-pinned listening socket: socket + SO_REUSEADDR +
|
||||
* SO_REUSEPORT + SO_BINDTODEVICE (set *before* bind, which is what
|
||||
* lets the kernel permit multiple sockets on the same wildcard
|
||||
* addr+port) + bind + listen + nonblocking.
|
||||
*
|
||||
* SO_REUSEPORT is required so that on `nginx -s reload` the new
|
||||
* master's rebind doesn't collide with the still-bound sockets held
|
||||
* by the old workers during their graceful-drain window. The kernel
|
||||
* still uses SO_BINDTODEVICE to filter per device, so traffic
|
||||
* attribution stays correct.
|
||||
*
|
||||
* Returns the new fd, or -1 on failure with errno preserved. */
|
||||
static ngx_socket_t
|
||||
ngx_http_ipng_stats_open_dev_socket(ngx_cycle_t *cycle,
|
||||
struct sockaddr *sa, socklen_t salen, int backlog, ngx_str_t *device)
|
||||
{
|
||||
ngx_socket_t s;
|
||||
int one = 1;
|
||||
char devname[IFNAMSIZ];
|
||||
size_t dlen;
|
||||
|
||||
s = socket(sa->sa_family, SOCK_STREAM, 0);
|
||||
if (s == (ngx_socket_t) -1) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: socket() failed");
|
||||
return (ngx_socket_t) -1;
|
||||
}
|
||||
if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) == -1) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: setsockopt(SO_REUSEADDR) failed");
|
||||
close(s);
|
||||
return (ngx_socket_t) -1;
|
||||
}
|
||||
if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) == -1) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: setsockopt(SO_REUSEPORT) failed");
|
||||
close(s);
|
||||
return (ngx_socket_t) -1;
|
||||
}
|
||||
#if (NGX_HAVE_INET6)
|
||||
if (sa->sa_family == AF_INET6) {
|
||||
/* Match nginx's default (ipv6only=on): keep the [::]:X listen
|
||||
* strictly IPv6. Without this, Linux's bindv6only=0 default
|
||||
* makes the socket claim the IPv4 wildcard too, which collides
|
||||
* with sibling IPv4-specific listens on the same port (e.g. a
|
||||
* mgmt-address listener). */
|
||||
if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)) == -1) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: setsockopt(IPV6_V6ONLY) failed");
|
||||
close(s);
|
||||
return (ngx_socket_t) -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
dlen = device->len < IFNAMSIZ - 1 ? device->len : IFNAMSIZ - 1;
|
||||
ngx_memcpy(devname, device->data, dlen);
|
||||
devname[dlen] = '\0';
|
||||
if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, devname,
|
||||
(socklen_t) (dlen + 1)) == -1)
|
||||
{
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: setsockopt(SO_BINDTODEVICE, \"%s\") failed",
|
||||
devname);
|
||||
close(s);
|
||||
return (ngx_socket_t) -1;
|
||||
}
|
||||
if (bind(s, sa, salen) == -1) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: bind() failed for device \"%s\"", devname);
|
||||
close(s);
|
||||
return (ngx_socket_t) -1;
|
||||
}
|
||||
if (listen(s, backlog) == -1) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: listen() failed for device \"%s\"", devname);
|
||||
close(s);
|
||||
return (ngx_socket_t) -1;
|
||||
}
|
||||
if (ngx_nonblocking(s) == -1) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: fcntl(O_NONBLOCK) failed");
|
||||
close(s);
|
||||
return (ngx_socket_t) -1;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
static ngx_int_t
|
||||
ngx_http_ipng_stats_init_module(ngx_cycle_t *cycle)
|
||||
{
|
||||
ngx_http_ipng_stats_main_conf_t *imcf;
|
||||
ngx_http_ipng_stats_binding_t *bindings;
|
||||
ngx_listening_t *ls;
|
||||
ngx_listening_t *ls, *tmpl, *dup;
|
||||
ngx_uint_t i, j;
|
||||
char devname[IFNAMSIZ];
|
||||
size_t dlen;
|
||||
ngx_uint_t *target_idx;
|
||||
u_char *claimed;
|
||||
|
||||
imcf = ngx_http_cycle_get_module_main_conf(cycle,
|
||||
ngx_http_ipng_stats_module);
|
||||
if (imcf == NULL) {
|
||||
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
|
||||
"ipng_stats: init_module: imcf is NULL");
|
||||
return NGX_OK;
|
||||
}
|
||||
if (imcf->bindings == NULL) {
|
||||
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
|
||||
"ipng_stats: init_module: no bindings (no device= "
|
||||
"or ipng_source_tag= on any listen)");
|
||||
if (imcf == NULL || imcf->bindings == NULL) {
|
||||
return NGX_OK;
|
||||
}
|
||||
|
||||
bindings = imcf->bindings->elts;
|
||||
ls = cycle->listening.elts;
|
||||
|
||||
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
|
||||
"ipng_stats: init_module: %ui bindings, %ui listeners",
|
||||
imcf->bindings->nelts, cycle->listening.nelts);
|
||||
|
||||
for (i = 0; i < cycle->listening.nelts; i++) {
|
||||
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
|
||||
"ipng_stats: listener[%ui]: fd=%d addr=%V socklen=%d",
|
||||
i, (int) ls[i].fd, &ls[i].addr_text,
|
||||
(int) ls[i].socklen);
|
||||
/* Phase 1: append a cloned ngx_listening_t for every duplicate
|
||||
* binding. ngx_array_push may reallocate elts, so we store indices
|
||||
* not pointers. `target_idx[j]` is the index into cycle->listening
|
||||
* that binding j ends up owning. */
|
||||
target_idx = ngx_pcalloc(cycle->pool,
|
||||
imcf->bindings->nelts * sizeof(ngx_uint_t));
|
||||
if (target_idx == NULL) {
|
||||
return NGX_ERROR;
|
||||
}
|
||||
|
||||
for (j = 0; j < imcf->bindings->nelts; j++) {
|
||||
ngx_int_t matched = 0;
|
||||
if (!bindings[j].needs_clone) continue;
|
||||
|
||||
ls = cycle->listening.elts;
|
||||
tmpl = NULL;
|
||||
for (i = 0; i < cycle->listening.nelts; i++) {
|
||||
if (ls[i].socklen != bindings[j].socklen) {
|
||||
continue;
|
||||
if (ls[i].socklen == bindings[j].socklen
|
||||
&& ngx_cmp_sockaddr(ls[i].sockaddr, ls[i].socklen,
|
||||
(struct sockaddr *) &bindings[j].sockaddr,
|
||||
bindings[j].socklen, 1) == NGX_OK)
|
||||
{
|
||||
tmpl = &ls[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (tmpl == NULL) {
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, 0,
|
||||
"ipng_stats: init_module: no template listening "
|
||||
"found for cloned binding (source=\"%V\")",
|
||||
&bindings[j].source);
|
||||
return NGX_ERROR;
|
||||
}
|
||||
|
||||
dup = ngx_array_push(&cycle->listening);
|
||||
if (dup == NULL) return NGX_ERROR;
|
||||
*dup = *tmpl;
|
||||
dup->fd = (ngx_socket_t) -1;
|
||||
dup->previous = NULL;
|
||||
|
||||
target_idx[j] = cycle->listening.nelts - 1;
|
||||
}
|
||||
|
||||
/* Phase 2: map every non-clone binding to an existing listening
|
||||
* entry (by sockaddr, first unclaimed wins). */
|
||||
claimed = ngx_pcalloc(cycle->pool, cycle->listening.nelts);
|
||||
if (claimed == NULL) return NGX_ERROR;
|
||||
|
||||
for (j = 0; j < imcf->bindings->nelts; j++) {
|
||||
if (bindings[j].needs_clone) {
|
||||
claimed[target_idx[j]] = 1;
|
||||
continue;
|
||||
}
|
||||
ls = cycle->listening.elts;
|
||||
for (i = 0; i < cycle->listening.nelts; i++) {
|
||||
if (claimed[i]) continue;
|
||||
if (ls[i].socklen != bindings[j].socklen) continue;
|
||||
if (ngx_cmp_sockaddr(ls[i].sockaddr, ls[i].socklen,
|
||||
(struct sockaddr *) &bindings[j].sockaddr,
|
||||
bindings[j].socklen, 1) != NGX_OK)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
matched = 1;
|
||||
bindings[j].listening = &ls[i];
|
||||
|
||||
if (bindings[j].device.len > 0 && ls[i].fd != (ngx_socket_t) -1) {
|
||||
dlen = bindings[j].device.len < IFNAMSIZ - 1
|
||||
? bindings[j].device.len : IFNAMSIZ - 1;
|
||||
ngx_memcpy(devname, bindings[j].device.data, dlen);
|
||||
devname[dlen] = '\0';
|
||||
|
||||
if (setsockopt(ls[i].fd, SOL_SOCKET, SO_BINDTODEVICE,
|
||||
devname, (socklen_t) (dlen + 1)) == -1)
|
||||
{
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
|
||||
"ipng_stats: setsockopt(SO_BINDTODEVICE, "
|
||||
"\"%s\") failed for listen fd %d",
|
||||
devname, (int) ls[i].fd);
|
||||
return NGX_ERROR;
|
||||
}
|
||||
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
|
||||
"ipng_stats: bound listen fd %d to device "
|
||||
"\"%s\" (source=\"%V\")",
|
||||
(int) ls[i].fd, devname, &bindings[j].source);
|
||||
}
|
||||
break;
|
||||
target_idx[j] = i;
|
||||
claimed[i] = 1;
|
||||
goto found;
|
||||
}
|
||||
if (!matched) {
|
||||
u_char buf[NGX_SOCKADDR_STRLEN];
|
||||
size_t len;
|
||||
len = ngx_sock_ntop((struct sockaddr *) &bindings[j].sockaddr,
|
||||
bindings[j].socklen, buf, sizeof(buf), 1);
|
||||
ngx_log_error(NGX_LOG_WARN, cycle->log, 0,
|
||||
"ipng_stats: no listener matched binding "
|
||||
"source=\"%V\" addr=%*s socklen=%d",
|
||||
&bindings[j].source, len, buf,
|
||||
(int) bindings[j].socklen);
|
||||
ngx_log_error(NGX_LOG_EMERG, cycle->log, 0,
|
||||
"ipng_stats: init_module: no listening entry for "
|
||||
"binding (source=\"%V\")", &bindings[j].source);
|
||||
return NGX_ERROR;
|
||||
found:
|
||||
;
|
||||
}
|
||||
|
||||
/* Phase 3: close every pre-bound fd on a target listening entry.
|
||||
* nginx ran ngx_open_listening_sockets before init_module, so the
|
||||
* first-seen listen at each sockaddr has a naked bind that would
|
||||
* block subsequent device-pinned binds on the same addr. Free the
|
||||
* ports before we rebind. */
|
||||
ls = cycle->listening.elts;
|
||||
for (j = 0; j < imcf->bindings->nelts; j++) {
|
||||
ngx_listening_t *t = &ls[target_idx[j]];
|
||||
if (t->fd != (ngx_socket_t) -1) {
|
||||
close(t->fd);
|
||||
t->fd = (ngx_socket_t) -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Phase 4: rebind each target with SO_BINDTODEVICE set before
|
||||
* bind(). `ls->inherited = 1` tells nginx not to touch the socket
|
||||
* in any subsequent setup pass. */
|
||||
for (j = 0; j < imcf->bindings->nelts; j++) {
|
||||
ngx_listening_t *t = &ls[target_idx[j]];
|
||||
ngx_socket_t s;
|
||||
|
||||
s = ngx_http_ipng_stats_open_dev_socket(cycle,
|
||||
t->sockaddr, t->socklen, t->backlog, &bindings[j].device);
|
||||
if (s == (ngx_socket_t) -1) {
|
||||
return NGX_ERROR;
|
||||
}
|
||||
t->fd = s;
|
||||
t->inherited = 1;
|
||||
bindings[j].listening = t;
|
||||
|
||||
ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0,
|
||||
"ipng_stats: listen %V bound to device \"%V\" "
|
||||
"(source=\"%V\", fd=%d)",
|
||||
&t->addr_text, &bindings[j].device,
|
||||
&bindings[j].source, (int) s);
|
||||
}
|
||||
|
||||
return NGX_OK;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user