From fdef2a552b207fca57fa77b2962f05a21219de0b Mon Sep 17 00:00:00 2001 From: Pim van Pelt Date: Sat, 18 Apr 2026 10:58:51 +0200 Subject: [PATCH] Harden scrape rendering and add AddressSanitizer test suite Move all heap allocation out of the slab-mutex critical section in render_prom/render_json: snapshot cardinality under a brief lock, allocate aggs/snaps/string tables outside the lock, then re-acquire only to deep-copy strings and walk the LRU into the pre-allocated buffers. A worker crash during output buffer allocation can no longer leave the shared-memory zone locked, and a corrupt cardinality count is caught by a 10k sanity cap rather than causing a runaway ngx_pcalloc. Add build-asan and tests/02-asan/: a full sanitizer-instrumented nginx + module built via apt-source, and a 2-node containerlab Robot suite that drives reload storms, concurrent scrape-during-reload, and intern-table growth, failing if AddressSanitizer or UBSan reports anything on stderr. The two Robot suites now check for their required build artifacts up front so `make robot-test` no longer rebuilds them on every invocation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Makefile | 84 +++- src/ngx_http_ipng_stats_module.c | 519 ++++++++++++--------- tests/01-module/01-e2e.robot | 11 + tests/02-asan/02-asan.robot | 184 ++++++++ tests/02-asan/lab/client/start.sh | 23 + tests/02-asan/lab/ipng-stats-asan.clab.yml | 46 ++ tests/02-asan/lab/server/nginx.conf | 50 ++ tests/02-asan/lab/server/start.sh | 61 +++ 8 files changed, 746 insertions(+), 232 deletions(-) create mode 100644 tests/02-asan/02-asan.robot create mode 100755 tests/02-asan/lab/client/start.sh create mode 100644 tests/02-asan/lab/ipng-stats-asan.clab.yml create mode 100644 tests/02-asan/lab/server/nginx.conf create mode 100755 tests/02-asan/lab/server/start.sh diff --git a/Makefile b/Makefile index 975ef71..8bd3493 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ VERSION := 0.2.0 NGINX_SRC ?= -.PHONY: help build pkg-deb robot-test install-deps clean fetch-nginx-src version-header +.PHONY: help build build-asan pkg-deb robot-test install-deps clean fetch-nginx-src version-header TEST ?= tests/ @@ -35,8 +35,10 @@ help: @echo "nginx-ipng-stats-plugin — make targets" @echo "" @echo " make build Build $(MODULE_NAME).so out-of-tree." + @echo " make build-asan Build a full nginx + module with AddressSanitizer" + @echo " into build/nginx-asan/ for local crash-hunting." @echo " make pkg-deb Build a Debian package via dpkg-buildpackage." - @echo " make robot-test Build .deb, then run Robot Framework e2e tests." + @echo " make robot-test Run Robot Framework e2e tests (all suites)." @echo " make install-deps Install build and test dependencies (apt)." @echo " make clean Remove build artifacts." @echo "" @@ -114,6 +116,80 @@ fetch-nginx-src: mv "$$NGX_DIR" $(BUILD_DIR)/nginx-src; \ rm -rf $(BUILD_DIR)/apt-src +# ---------------------------------------------------------------------- +# build-asan: full nginx + module built with AddressSanitizer. 
ASan +# needs to be present in the main binary to instrument dynamic modules; +# running a stock nginx with LD_PRELOAD=libasan.so does NOT work with +# a statically-linked module's allocations. So we build nginx itself +# here, into an isolated prefix under build/nginx-asan/, which you can +# run by hand against any config. Normal `make build` is unaffected. +# +# Typical workflow for the reload-crash hunt: +# make build-asan +# cp your-test.conf build/nginx-asan/conf/nginx.conf +# ASAN_OPTIONS=detect_odr_violation=0:abort_on_error=1:halt_on_error=1:detect_leaks=0 \ +# build/nginx-asan/sbin/nginx -p build/nginx-asan +# # in another shell: +# build/nginx-asan/sbin/nginx -p build/nginx-asan -s reload +# ASan writes any findings to stderr of the master before aborting. +# +# detect_odr_violation=0 suppresses false positives for symbols nginx +# intentionally duplicates between its main binary and dynamic modules +# (e.g. ngx_module_names in objs/ngx_http_ipng_stats_module_modules.c). +# ---------------------------------------------------------------------- + +# -fno-sanitize=nonnull-attribute suppresses a UBSan finding in nginx's +# ngx_cpymem (src/core/ngx_string.c:84): the macro expands to memcpy(...) +# which glibc declares _Nonnull, but nginx legitimately calls it with +# (dst, NULL, 0) in many places. Filtering just that check keeps real +# undefined behaviour visible. +ASAN_CFLAGS := -fsanitize=address -fsanitize=undefined -fno-sanitize=nonnull-attribute -fno-omit-frame-pointer -fno-common -g -O1 +ASAN_LDFLAGS := -fsanitize=address -fsanitize=undefined + +# The ASan build rebuilds nginx from source rather than only the module, +# because ASan must instrument the main binary to catch heap issues in +# modules it loads. The `nginx-dev` shim source tree under /usr/share +# only carries headers, so we always fall through to `apt source nginx` +# here and keep the full source tree isolated in build/nginx-asan-src/. 
+build-asan: version-header + @set -e; \ + NGX_SRC="$(BUILD_DIR)/nginx-asan-src"; \ + if [ ! -d "$$NGX_SRC" ] || [ ! -f "$$NGX_SRC/src/core/nginx.c" ]; then \ + echo "Fetching full nginx source via apt-source for ASan build"; \ + mkdir -p $(BUILD_DIR)/apt-src-asan; \ + rm -rf $$NGX_SRC; \ + cd $(BUILD_DIR)/apt-src-asan && apt source nginx; \ + NGX_DIR=$$(find $(BUILD_DIR)/apt-src-asan -maxdepth 1 -type d -name 'nginx-*' | head -n1); \ + if [ -z "$$NGX_DIR" ]; then \ + echo "error: could not find unpacked nginx source tree" >&2; \ + exit 1; \ + fi; \ + mv "$$NGX_DIR" $$NGX_SRC; \ + rm -rf $(BUILD_DIR)/apt-src-asan; \ + fi; \ + PREFIX="$(BUILD_DIR)/nginx-asan"; \ + echo "Configuring ASan nginx in $$NGX_SRC (prefix=$$PREFIX)"; \ + cd "$$NGX_SRC" && ./configure \ + --prefix="$$PREFIX" \ + --with-cc-opt="$(ASAN_CFLAGS)" \ + --with-ld-opt="$(ASAN_LDFLAGS)" \ + --with-compat \ + --with-debug \ + --add-dynamic-module=$(MODULE_DIR); \ + echo "Building ASan nginx + module (this takes a minute)"; \ + $(MAKE) -C "$$NGX_SRC" -f objs/Makefile build; \ + $(MAKE) -C "$$NGX_SRC" -f objs/Makefile install; \ + install -d $$PREFIX/modules; \ + install -m 0644 "$$NGX_SRC/objs/$(MODULE_NAME).so" $$PREFIX/modules/; \ + echo ""; \ + echo "ASan build ready:"; \ + echo " nginx: $$PREFIX/sbin/nginx"; \ + echo " module: $$PREFIX/modules/$(MODULE_NAME).so"; \ + echo ""; \ + echo "Run with:"; \ + echo " ASAN_OPTIONS=detect_odr_violation=0:abort_on_error=1:halt_on_error=1:detect_leaks=0 \\"; \ + echo " $$PREFIX/sbin/nginx -p $$PREFIX" + # ---------------------------------------------------------------------- # pkg-deb: build a .deb # ---------------------------------------------------------------------- @@ -146,10 +222,6 @@ tests/.venv: tests/requirements.txt tests/.venv/bin/pip install -q -r tests/requirements.txt robot-test: tests/.venv - @if [ ! -f $(BUILD_DIR)/libnginx-mod-http-ipng-stats_*.deb ]; then \ - echo "error: no .deb found in $(BUILD_DIR)/. Run 'make pkg-deb' first." 
>&2; \ - exit 1; \ - fi tests/rf-run.sh docker $(TEST) # ---------------------------------------------------------------------- diff --git a/src/ngx_http_ipng_stats_module.c b/src/ngx_http_ipng_stats_module.c index 376f254..c0e2ae6 100644 --- a/src/ngx_http_ipng_stats_module.c +++ b/src/ngx_http_ipng_stats_module.c @@ -2212,6 +2212,33 @@ ngx_http_ipng_stats_append(ngx_chain_t ***last, ngx_chain_t *cl) } +/* Per-node snapshot used by the Prometheus and JSON renderers. The + * renderers drain the shared-zone LRU into an array of these under + * the slab mutex, and then release the mutex before doing any output + * buffer allocation. That keeps ngx_pcalloc / malloc off the locked + * path — a worker crash during rendering can never leave the slab + * mutex held (NFR-4.3), and slab-lock contention no longer blocks on + * glibc's allocator. */ +typedef struct { + ngx_uint_t source_id; + ngx_uint_t vip_id; + ngx_uint_t class; + uint64_t requests; + uint64_t bytes_in; + uint64_t bytes_out; + uint64_t duration_sum_ms; + uint64_t upstream_sum_ms; +} ngx_http_ipng_stats_snap_t; + + +/* Sanity cap on shared-zone cardinality, above which the renderer + * refuses to snapshot rather than try to allocate a potentially + * runaway buffer. This is a belt-and-suspenders guard against a + * corrupt `sh->sources.nelts` or `sh->vips.nelts` — see design.md + * on reload hardening. */ +#define NGX_HTTP_IPNG_STATS_MAX_INTERN 10000 + + /* Find or create an aggregation entry for (source_id, vip_id). Linear * scan — the table is small in practice (< 100 entries). */ static ngx_http_ipng_stats_agg_t * @@ -2232,91 +2259,45 @@ ngx_http_ipng_stats_agg_get(ngx_http_ipng_stats_agg_t *aggs, ngx_uint_t *naggs, } -/* Walk matching nodes, emit per-node counters via `emit_counters`, - * accumulate histograms into `aggs`. Caller holds slab mutex. - * `ctx` is opaque — passed through to the emit callback. Returns number - * of distinct (source, vip) pairs observed, or -1 on error. 
*/ -typedef ngx_int_t (*ngx_http_ipng_stats_counters_pt)(ngx_http_request_t *r, - ngx_http_ipng_stats_shctx_t *sh, ngx_http_ipng_stats_node_t *n, - ngx_chain_t ***last, ngx_uint_t *emitted); - +/* Deep-copy the sources and vips interning tables into r->pool so the + * renderers can dereference strings without holding the slab mutex. + * Caller holds the slab mutex. Writes the number of valid entries + * (possibly smaller than the allocated length) via *n_src_out / *n_vip_out. */ static ngx_int_t -ngx_http_ipng_stats_walk_aggregate(ngx_http_request_t *r, - ngx_http_ipng_stats_main_conf_t *imcf, - ngx_str_t *filter_src, ngx_str_t *filter_vip, - ngx_http_ipng_stats_counters_pt emit_counters, - ngx_chain_t ***last, ngx_uint_t *emitted, - ngx_http_ipng_stats_agg_t *aggs, ngx_uint_t naggs_alloc, - ngx_uint_t *naggs_out) +ngx_http_ipng_stats_snapshot_strings(ngx_http_request_t *r, + ngx_http_ipng_stats_shctx_t *sh, + ngx_str_t *src_tbl, ngx_uint_t n_src_alloc, ngx_uint_t *n_src_out, + ngx_str_t *vip_tbl, ngx_uint_t n_vip_alloc, ngx_uint_t *n_vip_out) { - ngx_http_ipng_stats_shctx_t *sh = imcf->shm_zone->data; - ngx_queue_t *q; - ngx_http_ipng_stats_node_t *n; - ngx_str_t *src_entry, *vip_entry; - ngx_atomic_uint_t *lanes, *blanes; - ngx_http_ipng_stats_agg_t *a; - ngx_uint_t i; - ngx_uint_t nb = imcf->nbuckets; - ngx_uint_t nbb = imcf->nbytebuckets; - ngx_uint_t naggs = 0; - ngx_int_t rc; + ngx_uint_t i, n; + u_char *d; - for (q = ngx_queue_head(&sh->lru); - q != ngx_queue_sentinel(&sh->lru); - q = ngx_queue_next(q)) - { - n = ngx_queue_data(q, ngx_http_ipng_stats_node_t, lru); - if (n->source_id >= sh->sources.nelts - || n->vip_id >= sh->vips.nelts) - { - continue; - } - src_entry = &sh->sources.entries[n->source_id]; - vip_entry = &sh->vips.entries[n->vip_id]; - - if (filter_src->len > 0 - && (src_entry->len != filter_src->len - || ngx_memcmp(src_entry->data, filter_src->data, - filter_src->len) != 0)) - { - continue; - } - if (filter_vip->len > 0 - && (vip_entry->len 
!= filter_vip->len - || ngx_memcmp(vip_entry->data, filter_vip->data, - filter_vip->len) != 0)) - { - continue; - } - - rc = emit_counters(r, sh, n, last, emitted); - if (rc != NGX_OK) return NGX_ERROR; - - a = ngx_http_ipng_stats_agg_get(aggs, &naggs, naggs_alloc, - n->source_id, n->vip_id); - if (a == NULL) continue; - - a->duration_sum_ms += n->duration_sum_ms; - a->upstream_sum_ms += n->upstream_sum_ms; - a->bytes_in_sum += n->bytes_in; - a->bytes_out_sum += n->bytes_out; - a->req_total += n->requests; - - lanes = (ngx_atomic_uint_t *) (n + 1); - blanes = lanes + 2 * (nb + 1); - - for (i = 0; i <= nb; i++) { - a->dhist[i] += lanes[i]; - a->uhist[i] += lanes[nb + 1 + i]; - a->up_total += lanes[nb + 1 + i]; - } - for (i = 0; i <= nbb; i++) { - a->bin_hist[i] += blanes[i]; - a->bout_hist[i] += blanes[nbb + 1 + i]; + n = sh->sources.nelts < n_src_alloc ? sh->sources.nelts : n_src_alloc; + for (i = 0; i < n; i++) { + if (sh->sources.entries[i].len > 0) { + d = ngx_pnalloc(r->pool, sh->sources.entries[i].len); + if (d == NULL) return NGX_ERROR; + ngx_memcpy(d, sh->sources.entries[i].data, + sh->sources.entries[i].len); + src_tbl[i].data = d; + src_tbl[i].len = sh->sources.entries[i].len; } } + *n_src_out = n; + + n = sh->vips.nelts < n_vip_alloc ? 
sh->vips.nelts : n_vip_alloc; + for (i = 0; i < n; i++) { + if (sh->vips.entries[i].len > 0) { + d = ngx_pnalloc(r->pool, sh->vips.entries[i].len); + if (d == NULL) return NGX_ERROR; + ngx_memcpy(d, sh->vips.entries[i].data, + sh->vips.entries[i].len); + vip_tbl[i].data = d; + vip_tbl[i].len = sh->vips.entries[i].len; + } + } + *n_vip_out = n; - *naggs_out = naggs; return NGX_OK; } @@ -2352,32 +2333,105 @@ ngx_http_ipng_stats_agg_alloc(ngx_http_request_t *r, } -/* -- Prometheus ---------------------------------------------------- */ - -static ngx_int_t -ngx_http_ipng_stats_prom_counters(ngx_http_request_t *r, - ngx_http_ipng_stats_shctx_t *sh, ngx_http_ipng_stats_node_t *n, - ngx_chain_t ***last, ngx_uint_t *emitted) +/* Walk the LRU under the slab mutex, draining matching nodes into the + * caller-supplied `snaps` array and aggregating histogram lanes into + * `aggs`. All output buffers are pre-allocated by the caller — this + * function does not touch r->pool, so a crash here cannot leave the + * slab mutex held via a glibc abort. + * + * `src_tbl` / `vip_tbl` hold the deep-copied interning strings and are + * used only for filter comparisons; the renderer downstream indexes + * them again. Nodes whose ids fall outside the snapshotted range are + * skipped (a belt-and-suspenders guard against a stale id after a + * concurrent reload). 
*/ +static void +ngx_http_ipng_stats_snapshot_nodes(ngx_http_ipng_stats_shctx_t *sh, + ngx_http_ipng_stats_main_conf_t *imcf, + ngx_str_t *filter_src, ngx_str_t *filter_vip, + ngx_str_t *src_tbl, ngx_uint_t n_src, + ngx_str_t *vip_tbl, ngx_uint_t n_vip, + ngx_http_ipng_stats_snap_t *snaps, ngx_uint_t nsnaps_alloc, + ngx_uint_t *nsnaps_out, + ngx_http_ipng_stats_agg_t *aggs, ngx_uint_t naggs_alloc, + ngx_uint_t *naggs_out) { - ngx_str_t *src = &sh->sources.entries[n->source_id]; - ngx_str_t *vip = &sh->vips.entries[n->vip_id]; - const char *cls = ngx_http_ipng_stats_class_label(n->class); - ngx_chain_t *cl = ngx_http_ipng_stats_chain_buf(r, 1024); - if (cl == NULL) return NGX_ERROR; - cl->buf->last = ngx_sprintf(cl->buf->last, - "nginx_ipng_requests_total{source_tag=\"%V\",vip=\"%V\",code=\"%s\"} %uA\n" - "nginx_ipng_bytes_in_total{source_tag=\"%V\",vip=\"%V\",code=\"%s\"} %uA\n" - "nginx_ipng_bytes_out_total{source_tag=\"%V\",vip=\"%V\",code=\"%s\"} %uA\n" - "nginx_ipng_latency_total{source_tag=\"%V\",vip=\"%V\",code=\"%s\"} %.3f\n", - src, vip, cls, n->requests, - src, vip, cls, n->bytes_in, - src, vip, cls, n->bytes_out, - src, vip, cls, (double) n->duration_sum_ms / 1000.0); - (*emitted)++; - return ngx_http_ipng_stats_append(last, cl); + ngx_queue_t *q; + ngx_http_ipng_stats_node_t *n; + ngx_str_t *src_entry, *vip_entry; + ngx_atomic_uint_t *lanes, *blanes; + ngx_http_ipng_stats_agg_t *a; + ngx_uint_t i; + ngx_uint_t nb = imcf->nbuckets; + ngx_uint_t nbb = imcf->nbytebuckets; + ngx_uint_t nsnaps = 0, naggs = 0; + + for (q = ngx_queue_head(&sh->lru); + q != ngx_queue_sentinel(&sh->lru); + q = ngx_queue_next(q)) + { + n = ngx_queue_data(q, ngx_http_ipng_stats_node_t, lru); + if (n->source_id >= n_src || n->vip_id >= n_vip) continue; + + src_entry = &src_tbl[n->source_id]; + vip_entry = &vip_tbl[n->vip_id]; + + if (filter_src->len > 0 + && (src_entry->len != filter_src->len + || ngx_memcmp(src_entry->data, filter_src->data, + filter_src->len) != 0)) + { + 
continue; + } + if (filter_vip->len > 0 + && (vip_entry->len != filter_vip->len + || ngx_memcmp(vip_entry->data, filter_vip->data, + filter_vip->len) != 0)) + { + continue; + } + + if (nsnaps < nsnaps_alloc) { + snaps[nsnaps].source_id = n->source_id; + snaps[nsnaps].vip_id = n->vip_id; + snaps[nsnaps].class = n->class; + snaps[nsnaps].requests = n->requests; + snaps[nsnaps].bytes_in = n->bytes_in; + snaps[nsnaps].bytes_out = n->bytes_out; + snaps[nsnaps].duration_sum_ms = n->duration_sum_ms; + snaps[nsnaps].upstream_sum_ms = n->upstream_sum_ms; + nsnaps++; + } + + a = ngx_http_ipng_stats_agg_get(aggs, &naggs, naggs_alloc, + n->source_id, n->vip_id); + if (a == NULL) continue; + a->duration_sum_ms += n->duration_sum_ms; + a->upstream_sum_ms += n->upstream_sum_ms; + a->bytes_in_sum += n->bytes_in; + a->bytes_out_sum += n->bytes_out; + a->req_total += n->requests; + + lanes = (ngx_atomic_uint_t *) (n + 1); + blanes = lanes + 2 * (nb + 1); + for (i = 0; i <= nb; i++) { + a->dhist[i] += lanes[i]; + a->uhist[i] += lanes[nb + 1 + i]; + a->up_total += lanes[nb + 1 + i]; + } + for (i = 0; i <= nbb; i++) { + a->bin_hist[i] += blanes[i]; + a->bout_hist[i] += blanes[nbb + 1 + i]; + } + } + + *nsnaps_out = nsnaps; + *naggs_out = naggs; } +/* -- Prometheus ---------------------------------------------------- */ + + static u_char * ngx_http_ipng_stats_render_hist(u_char *p, const char *metric, ngx_str_t *src, ngx_str_t *vip, ngx_uint_t *bounds, ngx_uint_t nb, @@ -2422,12 +2476,16 @@ ngx_http_ipng_stats_render_prom(ngx_http_request_t *r, ngx_chain_t **last = &out; ngx_str_t ctype = ngx_string("text/plain; version=0.0.4"); ngx_http_ipng_stats_agg_t *aggs; + ngx_http_ipng_stats_snap_t *snaps; + ngx_str_t *src_tbl = NULL, *vip_tbl = NULL; + ngx_uint_t n_src_alloc, n_vip_alloc; + ngx_uint_t n_src = 0, n_vip = 0; ngx_uint_t naggs = 0, naggs_alloc; - ngx_uint_t emitted = 0, i; + ngx_uint_t nsnaps = 0, nsnaps_alloc; + ngx_uint_t i; ngx_uint_t nb = imcf->nbuckets; ngx_uint_t nbb = 
imcf->nbytebuckets; size_t hist_sz; - ngx_int_t rc; slab = (ngx_slab_pool_t *) imcf->shm_zone->shm.addr; @@ -2449,28 +2507,91 @@ ngx_http_ipng_stats_render_prom(ngx_http_request_t *r, "# TYPE nginx_ipng_upstream_response_seconds histogram\n" "# HELP nginx_ipng_bytes_in Request size histogram in bytes.\n" "# TYPE nginx_ipng_bytes_in histogram\n" - "# HELP nginx_ipng_bytes_out Response size histogram in bytes.\n" + "# HELP nginx_ipng_bytes_out Response size histogram in bytes.\n" "# TYPE nginx_ipng_bytes_out histogram\n", NGX_HTTP_IPNG_STATS_VERSION, NGX_HTTP_IPNG_STATS_SCHEMA_VERSION); if (ngx_http_ipng_stats_append(&last, cl) != NGX_OK) { return NGX_HTTP_INTERNAL_SERVER_ERROR; } + /* Read cardinality with a brief lock. We release before allocating + * the snapshot buffers so glibc's allocator is never entered with + * the slab mutex held — a worker crash during allocation cannot + * leave the shared-memory zone locked. */ + ngx_shmtx_lock(&slab->mutex); + n_src_alloc = sh->sources.nelts; + n_vip_alloc = sh->vips.nelts; + ngx_shmtx_unlock(&slab->mutex); + + if (n_src_alloc > NGX_HTTP_IPNG_STATS_MAX_INTERN + || n_vip_alloc > NGX_HTTP_IPNG_STATS_MAX_INTERN) + { + ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, + "ipng_stats: render: refusing to render, cardinality " + "out of range (sources=%ui, vips=%ui)", + n_src_alloc, n_vip_alloc); + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + naggs_alloc = n_src_alloc * n_vip_alloc; + nsnaps_alloc = naggs_alloc * NGX_HTTP_IPNG_STATS_NCLASSES; + if (naggs_alloc == 0) naggs_alloc = 1; + if (nsnaps_alloc == 0) nsnaps_alloc = 1; + + if (ngx_http_ipng_stats_agg_alloc(r, imcf, naggs_alloc, + &aggs, &naggs_alloc) != NGX_OK) + { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + snaps = ngx_pcalloc(r->pool, nsnaps_alloc * sizeof(*snaps)); + if (n_src_alloc > 0) { + src_tbl = ngx_pcalloc(r->pool, n_src_alloc * sizeof(ngx_str_t)); + if (src_tbl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + if (n_vip_alloc > 0) { + vip_tbl = 
ngx_pcalloc(r->pool, n_vip_alloc * sizeof(ngx_str_t)); + if (vip_tbl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + if (snaps == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; + ngx_shmtx_lock(&slab->mutex); - if (ngx_http_ipng_stats_agg_alloc(r, imcf, - sh->sources.nelts * sh->vips.nelts, &aggs, &naggs_alloc) != NGX_OK) + if (ngx_http_ipng_stats_snapshot_strings(r, sh, + src_tbl, n_src_alloc, &n_src, + vip_tbl, n_vip_alloc, &n_vip) != NGX_OK) { ngx_shmtx_unlock(&slab->mutex); return NGX_HTTP_INTERNAL_SERVER_ERROR; } - rc = ngx_http_ipng_stats_walk_aggregate(r, imcf, filter_source, filter_vip, - ngx_http_ipng_stats_prom_counters, &last, &emitted, - aggs, naggs_alloc, &naggs); - if (rc != NGX_OK) { - ngx_shmtx_unlock(&slab->mutex); - return NGX_HTTP_INTERNAL_SERVER_ERROR; + ngx_http_ipng_stats_snapshot_nodes(sh, imcf, + filter_source, filter_vip, + src_tbl, n_src, vip_tbl, n_vip, + snaps, nsnaps_alloc, &nsnaps, + aggs, naggs_alloc, &naggs); + + ngx_shmtx_unlock(&slab->mutex); + + /* Per-node counters. 
*/ + for (i = 0; i < nsnaps; i++) { + ngx_http_ipng_stats_snap_t *s = &snaps[i]; + ngx_str_t *src = &src_tbl[s->source_id]; + ngx_str_t *vip = &vip_tbl[s->vip_id]; + const char *cls = ngx_http_ipng_stats_class_label(s->class); + cl = ngx_http_ipng_stats_chain_buf(r, 1024); + if (cl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; + cl->buf->last = ngx_sprintf(cl->buf->last, + "nginx_ipng_requests_total{source_tag=\"%V\",vip=\"%V\",code=\"%s\"} %uL\n" + "nginx_ipng_bytes_in_total{source_tag=\"%V\",vip=\"%V\",code=\"%s\"} %uL\n" + "nginx_ipng_bytes_out_total{source_tag=\"%V\",vip=\"%V\",code=\"%s\"} %uL\n" + "nginx_ipng_latency_total{source_tag=\"%V\",vip=\"%V\",code=\"%s\"} %.3f\n", + src, vip, cls, s->requests, + src, vip, cls, s->bytes_in, + src, vip, cls, s->bytes_out, + src, vip, cls, (double) s->duration_sum_ms / 1000.0); + if (ngx_http_ipng_stats_append(&last, cl) != NGX_OK) { + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } } /* One chain link per (source, vip) for the four aggregated histograms. 
@@ -2479,15 +2600,12 @@ ngx_http_ipng_stats_render_prom(ngx_http_request_t *r, for (i = 0; i < naggs; i++) { ngx_http_ipng_stats_agg_t *a = &aggs[i]; - ngx_str_t *src = &sh->sources.entries[a->source_id]; - ngx_str_t *vip = &sh->vips.entries[a->vip_id]; + ngx_str_t *src = &src_tbl[a->source_id]; + ngx_str_t *vip = &vip_tbl[a->vip_id]; u_char *p; cl = ngx_http_ipng_stats_chain_buf(r, hist_sz); - if (cl == NULL) { - ngx_shmtx_unlock(&slab->mutex); - return NGX_HTTP_INTERNAL_SERVER_ERROR; - } + if (cl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; p = cl->buf->last; p = ngx_http_ipng_stats_render_hist(p, "nginx_ipng_request_duration_seconds", src, vip, @@ -2507,33 +2625,16 @@ ngx_http_ipng_stats_render_prom(ngx_http_request_t *r, (double) a->bytes_out_sum, 0); cl->buf->last = p; if (ngx_http_ipng_stats_append(&last, cl) != NGX_OK) { - ngx_shmtx_unlock(&slab->mutex); return NGX_HTTP_INTERNAL_SERVER_ERROR; } } - ngx_shmtx_unlock(&slab->mutex); return ngx_http_ipng_stats_send(r, &ctype, out); } /* -- JSON ---------------------------------------------------------- */ -/* Per-(source, vip, class) counter group. We render one JSON object per - * aggregated (source, vip) record, with the class breakdown stored in - * an interim table while the walk is in progress. 
*/ -typedef struct { - ngx_uint_t source_id; - ngx_uint_t vip_id; - ngx_uint_t class; - uint64_t requests; - uint64_t bytes_in; - uint64_t bytes_out; - uint64_t duration_sum_ms; - uint64_t upstream_sum_ms; -} ngx_http_ipng_stats_jnode_t; - - static ngx_int_t ngx_http_ipng_stats_render_json(ngx_http_request_t *r, ngx_http_ipng_stats_main_conf_t *imcf, @@ -2545,16 +2646,15 @@ ngx_http_ipng_stats_render_json(ngx_http_request_t *r, ngx_chain_t **last = &out; ngx_str_t ctype = ngx_string("application/json"); ngx_http_ipng_stats_agg_t *aggs; - ngx_http_ipng_stats_jnode_t *jnodes; - ngx_uint_t njnodes = 0, njnodes_alloc; + ngx_http_ipng_stats_snap_t *snaps; + ngx_str_t *src_tbl = NULL, *vip_tbl = NULL; + ngx_uint_t n_src_alloc, n_vip_alloc; + ngx_uint_t n_src = 0, n_vip = 0; + ngx_uint_t nsnaps = 0, nsnaps_alloc; ngx_uint_t naggs = 0, naggs_alloc; ngx_uint_t i, j, emitted = 0; ngx_uint_t nb = imcf->nbuckets; ngx_uint_t nbb = imcf->nbytebuckets; - ngx_queue_t *q; - ngx_http_ipng_stats_node_t *nd; - ngx_str_t *src_entry, *vip_entry; - ngx_atomic_uint_t *lanes, *blanes; ngx_http_ipng_stats_agg_t *a; size_t rec_sz; @@ -2570,86 +2670,59 @@ ngx_http_ipng_stats_render_json(ngx_http_request_t *r, } ngx_shmtx_lock(&slab->mutex); + n_src_alloc = sh->sources.nelts; + n_vip_alloc = sh->vips.nelts; + ngx_shmtx_unlock(&slab->mutex); + + if (n_src_alloc > NGX_HTTP_IPNG_STATS_MAX_INTERN + || n_vip_alloc > NGX_HTTP_IPNG_STATS_MAX_INTERN) + { + ngx_log_error(NGX_LOG_ERR, r->connection->log, 0, + "ipng_stats: render: refusing to render, cardinality " + "out of range (sources=%ui, vips=%ui)", + n_src_alloc, n_vip_alloc); + return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + + naggs_alloc = n_src_alloc * n_vip_alloc; + nsnaps_alloc = naggs_alloc * NGX_HTTP_IPNG_STATS_NCLASSES; + if (naggs_alloc == 0) naggs_alloc = 1; + if (nsnaps_alloc == 0) nsnaps_alloc = 1; - naggs_alloc = sh->sources.nelts * sh->vips.nelts; if (ngx_http_ipng_stats_agg_alloc(r, imcf, naggs_alloc, &aggs, &naggs_alloc) != NGX_OK) { 
- ngx_shmtx_unlock(&slab->mutex); return NGX_HTTP_INTERNAL_SERVER_ERROR; } - - /* Upper bound on jnodes = naggs_alloc * NCLASSES. */ - njnodes_alloc = naggs_alloc * NGX_HTTP_IPNG_STATS_NCLASSES; - if (njnodes_alloc == 0) njnodes_alloc = 1; - jnodes = ngx_pcalloc(r->pool, njnodes_alloc * sizeof(*jnodes)); - if (jnodes == NULL) { - ngx_shmtx_unlock(&slab->mutex); - return NGX_HTTP_INTERNAL_SERVER_ERROR; + snaps = ngx_pcalloc(r->pool, nsnaps_alloc * sizeof(*snaps)); + if (n_src_alloc > 0) { + src_tbl = ngx_pcalloc(r->pool, n_src_alloc * sizeof(ngx_str_t)); + if (src_tbl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; } + if (n_vip_alloc > 0) { + vip_tbl = ngx_pcalloc(r->pool, n_vip_alloc * sizeof(ngx_str_t)); + if (vip_tbl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; + } + if (snaps == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; - for (q = ngx_queue_head(&sh->lru); - q != ngx_queue_sentinel(&sh->lru); - q = ngx_queue_next(q)) + ngx_shmtx_lock(&slab->mutex); + + if (ngx_http_ipng_stats_snapshot_strings(r, sh, + src_tbl, n_src_alloc, &n_src, + vip_tbl, n_vip_alloc, &n_vip) != NGX_OK) { - nd = ngx_queue_data(q, ngx_http_ipng_stats_node_t, lru); - if (nd->source_id >= sh->sources.nelts - || nd->vip_id >= sh->vips.nelts) - { - continue; - } - src_entry = &sh->sources.entries[nd->source_id]; - vip_entry = &sh->vips.entries[nd->vip_id]; - - if (filter_source->len > 0 - && (src_entry->len != filter_source->len - || ngx_memcmp(src_entry->data, filter_source->data, - filter_source->len) != 0)) - { - continue; - } - if (filter_vip->len > 0 - && (vip_entry->len != filter_vip->len - || ngx_memcmp(vip_entry->data, filter_vip->data, - filter_vip->len) != 0)) - { - continue; - } - - if (njnodes < njnodes_alloc) { - jnodes[njnodes].source_id = nd->source_id; - jnodes[njnodes].vip_id = nd->vip_id; - jnodes[njnodes].class = nd->class; - jnodes[njnodes].requests = nd->requests; - jnodes[njnodes].bytes_in = nd->bytes_in; - jnodes[njnodes].bytes_out = nd->bytes_out; - 
jnodes[njnodes].duration_sum_ms = nd->duration_sum_ms; - jnodes[njnodes].upstream_sum_ms = nd->upstream_sum_ms; - njnodes++; - } - - a = ngx_http_ipng_stats_agg_get(aggs, &naggs, naggs_alloc, - nd->source_id, nd->vip_id); - if (a == NULL) continue; - a->duration_sum_ms += nd->duration_sum_ms; - a->upstream_sum_ms += nd->upstream_sum_ms; - a->bytes_in_sum += nd->bytes_in; - a->bytes_out_sum += nd->bytes_out; - a->req_total += nd->requests; - - lanes = (ngx_atomic_uint_t *) (nd + 1); - blanes = lanes + 2 * (nb + 1); - for (i = 0; i <= nb; i++) { - a->dhist[i] += lanes[i]; - a->uhist[i] += lanes[nb + 1 + i]; - a->up_total += lanes[nb + 1 + i]; - } - for (i = 0; i <= nbb; i++) { - a->bin_hist[i] += blanes[i]; - a->bout_hist[i] += blanes[nbb + 1 + i]; - } + ngx_shmtx_unlock(&slab->mutex); + return NGX_HTTP_INTERNAL_SERVER_ERROR; } + ngx_http_ipng_stats_snapshot_nodes(sh, imcf, + filter_source, filter_vip, + src_tbl, n_src, vip_tbl, n_vip, + snaps, nsnaps_alloc, &nsnaps, + aggs, naggs_alloc, &naggs); + + ngx_shmtx_unlock(&slab->mutex); + /* One JSON record per aggregated (source, vip). Size upper-bound * accounts for: fixed overhead, up to NCLASSES class entries, 4 * histograms. */ @@ -2660,33 +2733,30 @@ ngx_http_ipng_stats_render_json(ngx_http_request_t *r, for (i = 0; i < naggs; i++) { a = &aggs[i]; - ngx_str_t *src = &sh->sources.entries[a->source_id]; - ngx_str_t *vip = &sh->vips.entries[a->vip_id]; + ngx_str_t *src = &src_tbl[a->source_id]; + ngx_str_t *vip = &vip_tbl[a->vip_id]; u_char *p; ngx_uint_t first_class = 1; cl = ngx_http_ipng_stats_chain_buf(r, rec_sz); - if (cl == NULL) { - ngx_shmtx_unlock(&slab->mutex); - return NGX_HTTP_INTERNAL_SERVER_ERROR; - } + if (cl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; p = cl->buf->last; p = ngx_sprintf(p, "%s{\"source_tag\":\"%V\",\"vip\":\"%V\",\"classes\":{", (emitted == 0) ? 
"" : ",", src, vip); - for (j = 0; j < njnodes; j++) { - if (jnodes[j].source_id != a->source_id - || jnodes[j].vip_id != a->vip_id) continue; + for (j = 0; j < nsnaps; j++) { + if (snaps[j].source_id != a->source_id + || snaps[j].vip_id != a->vip_id) continue; p = ngx_sprintf(p, "%s\"%s\":{\"requests\":%uL,\"bytes_in\":%uL," "\"bytes_out\":%uL,\"latency_ms\":%uL," "\"upstream_latency_ms\":%uL}", first_class ? "" : ",", - ngx_http_ipng_stats_class_label(jnodes[j].class), - jnodes[j].requests, jnodes[j].bytes_in, - jnodes[j].bytes_out, jnodes[j].duration_sum_ms, - jnodes[j].upstream_sum_ms); + ngx_http_ipng_stats_class_label(snaps[j].class), + snaps[j].requests, snaps[j].bytes_in, + snaps[j].bytes_out, snaps[j].duration_sum_ms, + snaps[j].upstream_sum_ms); first_class = 0; } p = ngx_sprintf(p, "},\"request_duration_ms\":{\"sum\":%uL," @@ -2741,14 +2811,11 @@ ngx_http_ipng_stats_render_json(ngx_http_request_t *r, cl->buf->last = p; if (ngx_http_ipng_stats_append(&last, cl) != NGX_OK) { - ngx_shmtx_unlock(&slab->mutex); return NGX_HTTP_INTERNAL_SERVER_ERROR; } emitted++; } - ngx_shmtx_unlock(&slab->mutex); - cl = ngx_http_ipng_stats_chain_buf(r, 8); if (cl == NULL) return NGX_HTTP_INTERNAL_SERVER_ERROR; cl->buf->last = ngx_sprintf(cl->buf->last, "]}\n"); diff --git a/tests/01-module/01-e2e.robot b/tests/01-module/01-e2e.robot index 0e15d19..f32ada4 100644 --- a/tests/01-module/01-e2e.robot +++ b/tests/01-module/01-e2e.robot @@ -201,6 +201,7 @@ Request count accuracy # --- Lab lifecycle --- Deploy Lab + Require Deb Build Run ${CLAB_BIN} --runtime ${runtime} destroy -t ${CURDIR}/${lab-file} --cleanup 2>&1 || true ${rc} ${output} = Run And Return Rc And Output ... 
${CLAB_BIN} --runtime ${runtime} deploy -t ${CURDIR}/${lab-file} @@ -210,6 +211,16 @@ Deploy Lab Wait Until Keyword Succeeds 60s 3s Client Can Reach Server ${CLIENT1} 10.0.1.1 Wait Until Keyword Succeeds 60s 3s Client Can Reach Server ${CLIENT2} 10.0.2.1 +Require Deb Build + [Documentation] Fail fast with an actionable message if the user + ... forgot to run `make pkg-deb` before invoking this + ... suite. The server container dpkg-installs the + ... built .deb via its bind-mount of build/. + ${rc} ${output} = Run And Return Rc And Output + ... bash -c 'ls ${EXECDIR}/build/libnginx-mod-http-ipng-stats_*.deb 2>/dev/null' + Run Keyword If ${rc} != 0 + ... Fail Module .deb not found — run `make pkg-deb` first. + Server Is Ready ${rc} ${output} = Run And Return Rc And Output ... curl -sf ${SCRAPE_URL} diff --git a/tests/02-asan/02-asan.robot b/tests/02-asan/02-asan.robot new file mode 100644 index 0000000..2164a6e --- /dev/null +++ b/tests/02-asan/02-asan.robot @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 +*** Settings *** +Documentation AddressSanitizer + UBSan stress suite for +... ngx_http_ipng_stats_module. Deploys a 2-node containerlab +... topology running an ASan-instrumented nginx (built by +... `make build-asan`), exercises the code paths most likely +... to surface memory errors — shared-zone init and reuse, +... scrape rendering under the slab mutex, log-phase +... interning, logtail UDP flush — and fails if any +... AddressSanitizer or UBSan finding appears in the nginx +... stderr during the run. +... +... This suite is deliberately not a superset of 01-module — +... it's a landing zone for memory-correctness cases. +... Functional coverage (attribution, filters, counters) +... lives in 01-module. 
+Library OperatingSystem +Library String +Suite Setup Deploy Lab +Suite Teardown Cleanup Lab +Test Teardown Assert No Sanitizer Findings + +*** Variables *** +${lab-name} ipng-stats-asan +${lab-file} lab/ipng-stats-asan.clab.yml +${runtime} docker +${CLAB_BIN} sudo containerlab +${SERVER} clab-${lab-name}-server +${CLIENT} clab-${lab-name}-client +${SCRAPE_URL} http://172.20.41.2:9113/stats +${DATAPLANE_URL} http://10.0.1.1:8080 +${STRESS_RELOADS} 10 +${STRESS_REQ_PER_LOOP} 25 + +*** Test Cases *** + +ASan nginx starts and serves a scrape + [Documentation] The ASan-instrumented nginx boots with the module + ... loaded, and a bare scrape returns the expected + ... preamble. Touches init_zone, postconfig, and the + ... scrape renderer with an empty LRU. + ${output} = Scrape Prometheus + Should Contain ${output} nginx-ipng-stats-plugin + Should Contain ${output} nginx_ipng_requests_total + +Scrape an empty JSON report + [Documentation] JSON renderer path with zero records — catches + ... off-by-one errors in the bracket emission. + ${rc} ${output} = Run And Return Rc And Output + ... curl -sf -H 'Accept: application/json' ${SCRAPE_URL} + Should Be Equal As Integers ${rc} 0 + Should Contain ${output} "schema":2 + Should Contain ${output} "records":[ + +Reload storm without traffic + [Documentation] Back-to-back reloads with no traffic in between. + ... Exercises init_zone's zone-reuse branch and the + ... shctx magic check; the cardinality is zero so the + ... renderer's naggs_alloc == 0 path is also covered. + FOR ${i} IN RANGE ${STRESS_RELOADS} + Docker Exec ${SERVER} ngxasan -s reload + Sleep 200ms + Scrape Prometheus + END + +Reload storm with interleaved traffic + [Documentation] Generate traffic, reload, scrape, repeat. This is + ... the scenario that surfaced the original crash: the + ... scrape path walks the shared-zone LRU while workers + ... are being cycled. Also grows the interning table + ... by using a handful of distinct paths. 
+ FOR ${i} IN RANGE ${STRESS_RELOADS} + Generate Traffic ${STRESS_REQ_PER_LOOP} + Docker Exec ${SERVER} ngxasan -s reload + Sleep 200ms + Scrape Prometheus + END + +Concurrent scrape during reload + [Documentation] Scrape in a tight loop while issuing reloads from + ... a parallel shell. The renderer's snapshot step + ... deep-copies strings under the slab mutex; a + ... concurrent intern_shared grow during that window + ... would surface here as use-after-free. We run the + ... whole dance in one bash -c so Robot doesn't have + ... to babysit the background pid. + Generate Traffic ${STRESS_REQ_PER_LOOP} + ${rc} ${output} = Run And Return Rc And Output + ... bash -c '( for i in $(seq 1 200); do curl -sf ${SCRAPE_URL} > /dev/null || true; done ) & scraper=$!; for i in 1 2 3 4 5; do docker exec ${SERVER} ngxasan -s reload; sleep 0.3; done; wait $scraper' + Should Be Equal As Integers ${rc} 0 + +Large cardinality intern table growth + [Documentation] Drive enough distinct request paths that the + ... per-VIP vip/source interning array grows past its + ... initial slab_alloc — this exercises the realloc + ... path (ngx_slab_free_locked of the old entries + ... buffer, copy into the new one) inside the log + ... handler. + FOR ${i} IN RANGE 60 + Docker Exec Ignore Rc ${CLIENT} curl -s ${DATAPLANE_URL}/path${i} + END + Sleep 500ms + Scrape Prometheus + +*** Keywords *** + +# --- Lab lifecycle --- + +Deploy Lab + Require ASan Build + Run ${CLAB_BIN} --runtime ${runtime} destroy -t ${CURDIR}/${lab-file} --cleanup 2>&1 || true + ${rc} ${output} = Run And Return Rc And Output + ... ${CLAB_BIN} --runtime ${runtime} deploy -t ${CURDIR}/${lab-file} + Log ${output} + Should Be Equal As Integers ${rc} 0 + Wait Until Keyword Succeeds 90s 3s Server Is Ready + Wait Until Keyword Succeeds 60s 3s Client Can Reach Server + +Require ASan Build + [Documentation] Fail fast with an actionable message if the user + ... forgot to run `make build-asan` before invoking + ... this suite. 
+ ${rc} = Run And Return Rc test -x ${EXECDIR}/build/nginx-asan/sbin/nginx + Run Keyword If ${rc} != 0 + ... Fail ASan nginx not found — run `make build-asan` first. + +Server Is Ready + ${rc} ${output} = Run And Return Rc And Output curl -sf ${SCRAPE_URL} + Should Be Equal As Integers ${rc} 0 + +Client Can Reach Server + ${rc} ${output} = Run And Return Rc And Output + ... docker exec ${CLIENT} curl -sf ${DATAPLANE_URL}/ + Should Be Equal As Integers ${rc} 0 + +Cleanup Lab + Run docker logs ${SERVER} > ${EXECDIR}/tests/out/asan-server-docker.log 2>&1 + Run docker exec ${SERVER} cat /tmp/nginx.err > ${EXECDIR}/tests/out/asan-nginx-err.log 2>&1 + Run docker exec ${SERVER} cat /tmp/nginx.stderr > ${EXECDIR}/tests/out/asan-nginx-stderr.log 2>&1 + Run docker exec ${SERVER} bash -c 'cat /tmp/asan.* 2>/dev/null; cat /tmp/ubsan.* 2>/dev/null' > ${EXECDIR}/tests/out/asan-reports.log 2>&1 + Run ${CLAB_BIN} --runtime ${runtime} destroy -t ${CURDIR}/${lab-file} --cleanup + +# --- Sanitizer assertion --- + +Assert No Sanitizer Findings + [Documentation] Fail the current test if the ASan or UBSan + ... runtime wrote any findings to stderr or their + ... per-pid log files. Runs after every test case — + ... we want the failing test to be the one that + ... produced the finding, not a later one. + ${rc} ${hits} = Run And Return Rc And Output + ... docker exec ${SERVER} bash -c 'grep -E "AddressSanitizer|LeakSanitizer|runtime error|SUMMARY:" /tmp/nginx.stderr /tmp/asan.* /tmp/ubsan.* 2>/dev/null || true' + Run Keyword If '${hits}' != '${EMPTY}' + ... 
Fail Sanitizer findings detected:\n${hits} + +# --- Traffic generation --- + +Generate Traffic + [Arguments] ${count} + FOR ${i} IN RANGE ${count} + Docker Exec Ignore Rc ${CLIENT} curl -s ${DATAPLANE_URL}/ + END + +# --- Scraping --- + +Scrape Prometheus + ${rc} ${output} = Run And Return Rc And Output curl -sf ${SCRAPE_URL} + Should Be Equal As Integers ${rc} 0 + RETURN ${output} + +# --- Container helpers --- + +Docker Exec + [Arguments] ${container} ${cmd} + ${rc} ${output} = Run And Return Rc And Output + ... docker exec ${container} ${cmd} + Should Be Equal As Integers ${rc} 0 + RETURN ${output} + +Docker Exec Ignore Rc + [Arguments] ${container} ${cmd} + ${rc} ${output} = Run And Return Rc And Output + ... docker exec ${container} ${cmd} + RETURN ${output} diff --git a/tests/02-asan/lab/client/start.sh b/tests/02-asan/lab/client/start.sh new file mode 100755 index 0000000..1e1ea96 --- /dev/null +++ b/tests/02-asan/lab/client/start.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# Client container entrypoint for the ASan test suite. Identical in +# spirit to tests/01-module/lab/client/start.sh — kept as a separate +# file so this suite's lab can be torn down and redeployed without +# affecting 01-module state. + +set -e + +apt-get update -qq +apt-get install -y -qq curl iproute2 > /dev/null 2>&1 + +echo "Waiting for eth1 ..." +while ! ip link show eth1 > /dev/null 2>&1; do + sleep 0.2 +done +ip link set eth1 up +ip addr add ${MY_IP} dev eth1 + +# Drop the mgmt default route so data-plane traffic goes out eth1. +ip route del default 2>/dev/null || true + +exec sleep infinity diff --git a/tests/02-asan/lab/ipng-stats-asan.clab.yml b/tests/02-asan/lab/ipng-stats-asan.clab.yml new file mode 100644 index 0000000..2591d72 --- /dev/null +++ b/tests/02-asan/lab/ipng-stats-asan.clab.yml @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# Containerlab topology for the AddressSanitizer/UBSan test suite. 
+# +# The server container bind-mounts build/nginx-asan/ — the +# sanitizer-instrumented nginx built by `make build-asan`. The binary +# was compiled against host glibc, so the container image must match +# the host's Debian release (trixie/13) for the .so and libasan to be +# ABI-compatible. The binary is run directly (no .deb install): the +# `make pkg-deb` path is exercised by tests/01-module/. +# +# Topology: one server + one client with a single data-plane link. +# Unlike 01-module we don't need multi-interface attribution here — +# this suite is focused on memory correctness, not traffic tagging. + +name: ipng-stats-asan + +mgmt: + network: ipng-stats-asan-net + ipv4-subnet: 172.20.41.0/24 + +topology: + nodes: + server: + kind: linux + image: debian:trixie-slim + mgmt-ipv4: 172.20.41.2 + binds: + # RW because nginx chowns client_body_temp/ and writes to logs/ + # on master startup; it's a build artifact so we don't mind. + - ../../../build/nginx-asan:/opt/nginx-asan + - ./server/nginx.conf:/opt/nginx-asan/conf/nginx.conf:ro + - ./server/start.sh:/start.sh:ro + cmd: bash /start.sh + + client: + kind: linux + image: debian:trixie-slim + mgmt-ipv4: 172.20.41.11 + binds: + - ./client/start.sh:/start.sh:ro + cmd: bash /start.sh + env: + MY_IP: 10.0.1.2/24 + + links: + - endpoints: ["server:eth1", "client:eth1"] diff --git a/tests/02-asan/lab/server/nginx.conf b/tests/02-asan/lab/server/nginx.conf new file mode 100644 index 0000000..1c84994 --- /dev/null +++ b/tests/02-asan/lab/server/nginx.conf @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 +# Minimal nginx config for the ASan test suite. Exercises the code paths +# most likely to surface memory errors: shared-zone init/reload, the +# scrape renderer (under slab mutex), the log-phase handler's interning, +# and logtail UDP buffering. 
+ +load_module /opt/nginx-asan/modules/ngx_http_ipng_stats_module.so; + +daemon off; +master_process on; +worker_processes 2; +pid /tmp/nginx.pid; +error_log /tmp/nginx.err info; + +events { + worker_connections 128; +} + +http { + access_log off; + ipng_stats_zone ipng:1m; + ipng_stats_flush_interval 300ms; + ipng_stats_default_source direct; + + log_format logtail '$remote_addr\t$request_method\t$request_uri\t$status'; + ipng_stats_logtail logtail udp://127.0.0.1:9514 buffer=4k flush=300ms; + + server { + # Mgmt scrape endpoint. + listen 172.20.41.2:9113; + + location = /stats { + ipng_stats; + allow all; + } + } + + server { + # Data plane — client traffic lands here. + listen 10.0.1.1:8080 device=eth1 ipng_source_tag=cl1; + listen 172.20.41.2:8080; + + location / { + return 200 "ok $server_addr\n"; + } + location /notfound { + return 404 "nope\n"; + } + } +} diff --git a/tests/02-asan/lab/server/start.sh b/tests/02-asan/lab/server/start.sh new file mode 100755 index 0000000..0bdb4f1 --- /dev/null +++ b/tests/02-asan/lab/server/start.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# Server container entrypoint for the ASan test suite. Installs libasan +# runtime (the sanitizer-instrumented binary was linked against host +# gcc's libasan.so.8), wires up the data-plane interface, and execs the +# ASan nginx in the foreground with stderr captured so the Robot suite +# can grep for AddressSanitizer/UBSan findings at teardown. + +set -e + +apt-get update -qq +apt-get install -y -qq libasan8 libubsan1 ncat iproute2 curl > /dev/null 2>&1 + +# Wait for containerlab to attach the data-plane veth, configure the IP. +echo "Waiting for eth1 ..." +while ! ip link show eth1 > /dev/null 2>&1; do + sleep 0.2 +done +ip link set eth1 up +ip addr add 10.0.1.1/24 dev eth1 + +# UDP logtail listener — drains the module's datagrams so sendto() has +# a real destination. 
The test doesn't assert on this file's contents +# (01-module already covers logtail semantics); we just need the socket +# to exist so ASan sees a complete write/flush cycle in the module. +mkdir -p /var/log/nginx +ncat -u -l -k 127.0.0.1 9514 --recv-only >> /var/log/nginx/logtail-udp.log & + +# ASan options: +# detect_odr_violation=0 — nginx intentionally duplicates symbols like +# ngx_module_names between the main binary and each dynamic module. +# abort_on_error=1, halt_on_error=1 — fail fast so the Robot suite +# sees the exit status and the ASan report is preserved at the tail +# of /tmp/nginx.stderr. +# detect_leaks=0 — nginx exits without running its pool destructors in +# many paths; leak detection is not the goal here. +# log_path — ASan writes each finding to this prefix + pid, so even +# when nginx wipes its own error log on reload the ASan traces +# survive for post-run inspection. +ASAN_OPTS="detect_odr_violation=0:abort_on_error=1:halt_on_error=1:detect_leaks=0:log_path=/tmp/asan" +UBSAN_OPTS="print_stacktrace=1:halt_on_error=0:log_path=/tmp/ubsan" + +# Wrapper so every subsequent `docker exec ... ngxasan ...` (e.g. the +# reload signal from the Robot suite) inherits the same sanitizer +# settings. `docker exec` does not carry the master's env. +cat > /usr/local/bin/ngxasan < >(tee /tmp/nginx.stderr >&2)