fix: UDP listener parses batched datagrams

nginx-ipng-stats-plugin's ipng_stats_logtail directive buffers many log
lines into a single UDP datagram (default buffer=64k flush=1s). The
listener was treating each datagram as exactly one log line, so any
datagram with N>1 lines failed the v1 field-count check and was silently
dropped. In production this showed up as logtail_udp_packets_received_total
running at roughly 4x logtail_udp_loglines_success_total — matching typical
burst-coalesced 4-lines-per-batch ratios.

Fix: strip trailing CRLF, split the payload on '\n', parse each
non-empty line independently. Counter semantics now match the names:

  packets_received  — datagrams off the socket (one per recvfrom)
  loglines_success  — log lines parsed OK (may be many per datagram)
  loglines_consumed — log lines forwarded to the store (not dropped)

After the fix, loglines_success ≈ packets_received × avg_lines_per_batch.

Regression test TestUDPListenerBatchedDatagram sends one datagram with
three '\n'-separated v1 lines and asserts all three LogRecords arrive,
plus loglines_success >= 3 * packets_received.

Docs (user-guide.md, design.md) now explain the datagram-vs-line unit
distinction so operators don't misread the ratio.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-17 11:59:43 +02:00
parent a554cfc2ee
commit e1f8bc5eb4
4 changed files with 110 additions and 23 deletions

View File

@@ -74,21 +74,31 @@ func (u *UDPListener) Run(ctx context.Context) {
if u.prom != nil {
u.prom.IncUDPPacket()
}
line := strings.TrimRight(string(buf[:n]), "\r\n")
rec, ok := ParseUDPLine(line, u.v4bits, u.v6bits)
if !ok {
continue
}
if u.prom != nil {
u.prom.IncUDPSuccess()
}
select {
case u.ch <- rec:
if u.prom != nil {
u.prom.IncUDPConsumed()
// nginx-ipng-stats-plugin batches log lines into a single UDP
// datagram (default buffer=64k flush=1s), so one packet may carry
// many lines. nginx's log_format always ends a rendered line with
// '\n'; split on that and process each line independently.
payload := strings.TrimRight(string(buf[:n]), "\r\n")
for _, line := range strings.Split(payload, "\n") {
line = strings.TrimSuffix(line, "\r")
if line == "" {
continue
}
rec, ok := ParseUDPLine(line, u.v4bits, u.v6bits)
if !ok {
continue
}
if u.prom != nil {
u.prom.IncUDPSuccess()
}
select {
case u.ch <- rec:
if u.prom != nil {
u.prom.IncUDPConsumed()
}
default:
// Channel full — drop rather than block the read loop.
}
default:
// Channel full — drop rather than block the read loop.
}
}
}

View File

@@ -66,6 +66,76 @@ func TestUDPListenerRoundTrip(t *testing.T) {
t.Fatal("no record received within 1s")
}
// TestUDPListenerBatchedDatagram covers the batching behaviour of
// nginx-ipng-stats-plugin's buffer/flush mode, where one UDP datagram can
// hold several log lines separated by '\n'. Every line in the datagram
// MUST be parsed and counted on its own, which means
// loglines_success ≈ packets_received * avg_lines_per_packet rather than
// loglines_success == packets_received (the assumption the earlier
// one-line-per-datagram code made).
func TestUDPListenerBatchedDatagram(t *testing.T) {
	// Three v1 lines in a single payload, each ending in '\n' the way
	// nginx's log_format renders them.
	const batchText = "v1\ta.example.com\t1.2.3.4\tGET\t/a\t200\t10\t0.001\t0\t0\tdirect\t10.0.0.1\thttps\n" +
		"v1\tb.example.com\t1.2.3.5\tGET\t/b\t404\t20\t0.002\t0\t0\tdirect\t10.0.0.1\thttps\n" +
		"v1\tc.example.com\t1.2.3.6\tGET\t/c\t500\t30\t0.003\t0\t0\tdirect\t10.0.0.1\thttps\n"

	records := make(chan LogRecord, 16)
	store := NewPromStore()

	// Open an ephemeral loopback socket only to learn a usable address,
	// then close it; that address is what the listener is handed below.
	probe, err := net.ListenPacket("udp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("listen probe: %v", err)
	}
	target := probe.LocalAddr().(*net.UDPAddr)
	probe.Close()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	listener := NewUDPListener(target.String(), 24, 48, records)
	listener.SetProm(store)
	go listener.Run(ctx)

	datagram := []byte(batchText)

	sender, err := net.ListenPacket("udp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("src listen: %v", err)
	}
	defer sender.Close()

	// Resend the whole batch until every website has been observed or the
	// one-second budget is spent. NOTE(review): presumably the retry exists
	// because the listener goroutine may not be receiving yet when the
	// first datagram is written — confirm.
	seen := make(map[string]bool)
	stopAt := time.Now().Add(time.Second)
	for len(seen) < 3 && time.Now().Before(stopAt) {
		if _, err := sender.WriteTo(datagram, target); err != nil {
			t.Fatalf("write: %v", err)
		}
		// Collect records until the channel has been silent for 50ms.
		for quiet := false; !quiet; {
			select {
			case rec := <-records:
				seen[rec.Website] = true
			case <-time.After(50 * time.Millisecond):
				quiet = true
			}
		}
	}

	wanted := []string{"a.example.com", "b.example.com", "c.example.com"}
	for _, site := range wanted {
		if seen[site] {
			continue
		}
		t.Errorf("missing record for %s; got=%v", site, seen)
	}

	// Each datagram that arrived carried three lines, so even when retries
	// multiply both counters, successes must be at least three times the
	// packet count once every record has been seen.
	store.udpMu.Lock()
	packets := store.udpPacketsReceived
	successes := store.udpLoglinesSuccess
	store.udpMu.Unlock()
	if packets == 0 || packets*3 > successes {
		t.Errorf("packets=%d success=%d: expected success >= 3*packets", packets, successes)
	}
}
// TestUDPListenerMultipleSources exercises the nginx-reload path: a fresh
// nginx worker set opens brand-new send sockets (different ephemeral source
// ports) and the listener MUST keep accepting their packets without