Harden scrape rendering and add AddressSanitizer test suite
Move all heap allocation out of the slab-mutex critical section in render_prom/render_json: snapshot cardinality under a brief lock, allocate aggs/snaps/string tables outside the lock, then re-acquire only to deep-copy strings and walk the LRU into the pre-allocated buffers. A worker crash during output buffer allocation can no longer leave the shared-memory zone locked, and a corrupt cardinality count is caught by a 10k sanity cap rather than causing a runaway ngx_pcalloc. Add build-asan and tests/02-asan/: a full sanitizer-instrumented nginx + module built via apt-source, and a 2-node containerlab Robot suite that drives reload storms, concurrent scrape-during-reload, and intern-table growth, failing if AddressSanitizer or UBSan reports anything on stderr. The two Robot suites now check for their required build artifacts up front so `make robot-test` no longer rebuilds them on every invocation. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
50
tests/02-asan/lab/server/nginx.conf
Normal file
50
tests/02-asan/lab/server/nginx.conf
Normal file
@@ -0,0 +1,50 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Minimal nginx config for the ASan test suite. Exercises the code paths
|
||||
# most likely to surface memory errors: shared-zone init/reload, the
|
||||
# scrape renderer (under slab mutex), the log-phase handler's interning,
|
||||
# and logtail UDP buffering.
|
||||
|
||||
load_module /opt/nginx-asan/modules/ngx_http_ipng_stats_module.so;
|
||||
|
||||
# Stay in the foreground: start.sh execs this nginx as the container's main process.
daemon off;
|
||||
master_process on;
|
||||
worker_processes 2;
|
||||
pid /tmp/nginx.pid;
|
||||
error_log /tmp/nginx.err info;
|
||||
|
||||
events {
|
||||
worker_connections 128;
|
||||
}
|
||||
|
||||
http {
|
||||
access_log off;
|
||||
ipng_stats_zone ipng:1m;
|
||||
ipng_stats_flush_interval 300ms;
|
||||
ipng_stats_default_source direct;
|
||||
|
||||
log_format logtail '$remote_addr\t$request_method\t$request_uri\t$status';
|
||||
ipng_stats_logtail logtail udp://127.0.0.1:9514 buffer=4k flush=300ms;
|
||||
|
||||
server {
|
||||
# Mgmt scrape endpoint.
|
||||
listen 172.20.41.2:9113;
|
||||
|
||||
location = /stats {
|
||||
ipng_stats;
|
||||
allow all;
|
||||
}
|
||||
}
|
||||
|
||||
server {
|
||||
# Data plane — client traffic lands here.
|
||||
listen 10.0.1.1:8080 device=eth1 ipng_source_tag=cl1;
|
||||
listen 172.20.41.2:8080;
|
||||
|
||||
location / {
|
||||
return 200 "ok $server_addr\n";
|
||||
}
|
||||
location /notfound {
|
||||
return 404 "nope\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
61
tests/02-asan/lab/server/start.sh
Executable file
61
tests/02-asan/lab/server/start.sh
Executable file
@@ -0,0 +1,61 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Server container entrypoint for the ASan test suite. Installs libasan
|
||||
# runtime (the sanitizer-instrumented binary was linked against host
|
||||
# gcc's libasan.so.8), wires up the data-plane interface, and execs the
|
||||
# ASan nginx in the foreground with stderr captured so the Robot suite
|
||||
# can grep for AddressSanitizer/UBSan findings at teardown.
|
||||
|
||||
set -e
|
||||
|
||||
apt-get update -qq
|
||||
# Discard only stdout chatter; keep stderr so that when `set -e` aborts on a failed install, the apt error shows up in the container logs.
apt-get install -y -qq libasan8 libubsan1 ncat iproute2 curl > /dev/null
|
||||
|
||||
# Wait for containerlab to attach the data-plane veth, configure the IP.
|
||||
echo "Waiting for eth1 ..."
|
||||
while ! ip link show eth1 > /dev/null 2>&1; do
|
||||
sleep 0.2
|
||||
done
|
||||
ip link set eth1 up
|
||||
ip addr add 10.0.1.1/24 dev eth1
|
||||
|
||||
# UDP logtail listener — drains the module's datagrams so sendto() has
|
||||
# a real destination. The test doesn't assert on this file's contents
|
||||
# (01-module already covers logtail semantics); we just need the socket
|
||||
# to exist so ASan sees a complete write/flush cycle in the module.
|
||||
mkdir -p /var/log/nginx
|
||||
ncat -u -l -k 127.0.0.1 9514 --recv-only >> /var/log/nginx/logtail-udp.log &
|
||||
|
||||
# ASan options:
|
||||
# detect_odr_violation=0 — nginx intentionally duplicates symbols like
|
||||
# ngx_module_names between the main binary and each dynamic module.
|
||||
# abort_on_error=1, halt_on_error=1 — fail fast so the Robot suite
|
||||
# sees the exit status. Because log_path is set (see below), the ASan
|
||||
# report is written to /tmp/asan.PID rather than /tmp/nginx.stderr.
|
||||
# detect_leaks=0 — nginx exits without running its pool destructors in
|
||||
# many paths; leak detection is not the goal here.
|
||||
# log_path — ASan writes each finding to this prefix + pid, so even
|
||||
# when nginx wipes its own error log on reload the ASan traces
|
||||
# survive for post-run inspection.
|
||||
ASAN_OPTS="detect_odr_violation=0:abort_on_error=1:halt_on_error=1:detect_leaks=0:log_path=/tmp/asan"
|
||||
UBSAN_OPTS="print_stacktrace=1:halt_on_error=0:log_path=/tmp/ubsan"
|
||||
|
||||
# Wrapper so every subsequent `docker exec ... ngxasan ...` (e.g. the
|
||||
# reload signal from the Robot suite) inherits the same sanitizer
|
||||
# settings. `docker exec` does not carry the master's env.
|
||||
cat > /usr/local/bin/ngxasan <<EOF
|
||||
#!/bin/bash
|
||||
export ASAN_OPTIONS="${ASAN_OPTS}"
|
||||
export UBSAN_OPTIONS="${UBSAN_OPTS}"
|
||||
exec /opt/nginx-asan/sbin/nginx -p /opt/nginx-asan -c /opt/nginx-asan/conf/nginx.conf "\$@"
|
||||
EOF
|
||||
chmod +x /usr/local/bin/ngxasan
|
||||
|
||||
export ASAN_OPTIONS="${ASAN_OPTS}"
|
||||
export UBSAN_OPTIONS="${UBSAN_OPTS}"
|
||||
|
||||
# Tee stderr so both docker logs and /tmp/nginx.stderr see it. The
|
||||
# Robot suite inspects the file. NOTE(review): with log_path set, sanitizer
|
||||
# reports go to /tmp/asan.PID and /tmp/ubsan.PID, not stderr — confirm the suite checks those too.
|
||||
exec /opt/nginx-asan/sbin/nginx -p /opt/nginx-asan -c /opt/nginx-asan/conf/nginx.conf \
|
||||
2> >(tee /tmp/nginx.stderr >&2)
|
||||
Reference in New Issue
Block a user