New maglevt TUI component: out-of-band VIP health monitor

A small bubbletea TUI that reads maglev.yaml (repeatable --config),
enumerates every VIP, and probes each from outside the load balancer
on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get
a GET against a configurable URI (default /.well-known/ipng/healthz)
with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL
counters, LAST status, and a response-header tally. Non-HTTP VIPs
get a TCP connect probe. A bounded error panel classifies anomalies
as timeout / http-err / net-err / spike and auto-sizes to fill the
screen.

Utility: during a failover drill (backend flap, AS drain, config
push) the tally panel shows which backend each VIP is actually
steering to, with two-colour activity highlighting over a 5s
window — white = receiving traffic, grey = drained. Paired with
the rolling OK%/latency columns it gives an at-a-glance answer to
"is the VIP healthy from the outside right now, and which backend
is it hitting", without relying on maglevd's own view of the
world.

Also bumps Makefile/go.mod to build the new binary.
2026-04-15 01:23:34 +02:00
parent 744b1cb3d2
commit 6293521157
8 changed files with 1890 additions and 1 deletions

cmd/tester/stats.go

@@ -0,0 +1,162 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import "sort"
// rollingSize is the bounded history maglevt keeps per VIP for
// latency percentiles and success-ratio display. At 100 samples
// and the default 100ms probe interval, this is a ~10s window —
// short enough to react quickly to failover events, long enough
// that p50/p95 are statistically meaningful.
const rollingSize = 100
// rolling is a bounded-window (rollingSize) counter for per-VIP probe
// results. It tracks success/failure totals, running sum for mean
// latency, and a ring of individual samples so percentiles can be
// computed on demand. Non-thread-safe: everything that touches a
// rolling lives on the bubbletea dispatch goroutine, so no locking
// is needed. The reset-on-rotate logic keeps ok/fail/sumNS in sync
// with the ring contents as old samples get overwritten.
type rolling struct {
samples []sample
idx int // next write position
n int // number of valid samples (0..rollingSize)
ok int
fail int
sumNS uint64
minNS uint64 // 0 while n == 0
maxNS uint64
}
type sample struct {
ns uint64
ok bool
}
// newRolling returns an empty rolling window ready to accept records.
func newRolling() *rolling {
return &rolling{samples: make([]sample, rollingSize)}
}
// reset zeroes every field and the ring so the rolling window starts
// fresh. Called by the 'r' keybinding and by the --reset flow.
func (r *rolling) reset() {
for i := range r.samples {
r.samples[i] = sample{}
}
r.idx = 0
r.n = 0
r.ok = 0
r.fail = 0
r.sumNS = 0
r.minNS = 0
r.maxNS = 0
}
// record appends a single probe result to the rolling window, evicting
// the oldest sample if the ring is already full. All aggregate fields
// (ok/fail/sumNS) are kept in sync incrementally to avoid re-walking
// the ring on every insert. min/max are re-derived from the ring
// after the write — that's O(n) but n is bounded at rollingSize so
// the cost is trivial and avoids the bookkeeping complexity of an
// incremental extremum counter.
func (r *rolling) record(ns uint64, ok bool) {
if r.n == rollingSize {
// Ring is full — subtract the contribution of the sample
// we're about to overwrite.
old := r.samples[r.idx]
r.sumNS -= old.ns
if old.ok {
r.ok--
} else {
r.fail--
}
} else {
r.n++
}
r.samples[r.idx] = sample{ns: ns, ok: ok}
r.idx = (r.idx + 1) % rollingSize
r.sumNS += ns
if ok {
r.ok++
} else {
r.fail++
}
// Recompute min/max from the live ring. O(rollingSize) but
// that's 100 uint64 reads — noise on any machine maglevt
// would ever run on.
r.minNS = ^uint64(0)
r.maxNS = 0
for i := 0; i < r.n; i++ {
s := r.samples[i]
if s.ns < r.minNS {
r.minNS = s.ns
}
if s.ns > r.maxNS {
r.maxNS = s.ns
}
}
}
// percentiles returns (p50, p95, p99) in nanoseconds over the
// current window, or zeros if empty. Implemented by copying the
// ring into a fresh slice, sort.Slice, and index lookup — the
// 100-element sort is cheap enough to do per UI frame (roughly
// every 250ms). With integer division, r.n*95/100 and r.n*99/100
// are always strictly below r.n, so the clamp at r.n-1 is purely
// defensive; during warmup the window may simply not have enough
// samples for p50/p95/p99 to land in distinct slots.
func (r *rolling) percentiles() (p50, p95, p99 uint64) {
if r.n == 0 {
return 0, 0, 0
}
buf := make([]uint64, r.n)
for i := 0; i < r.n; i++ {
buf[i] = r.samples[i].ns
}
sort.Slice(buf, func(i, j int) bool { return buf[i] < buf[j] })
p50 = buf[r.n/2]
p95Idx := r.n * 95 / 100
if p95Idx >= r.n {
p95Idx = r.n - 1
}
p95 = buf[p95Idx]
p99Idx := r.n * 99 / 100
if p99Idx >= r.n {
p99Idx = r.n - 1
}
p99 = buf[p99Idx]
return p50, p95, p99
}
// successPct returns the percentage of probes currently in the window
// that completed successfully (2xx/3xx for HTTP, TCP connect OK for
// tcp). Returns 0 when the window is empty.
func (r *rolling) successPct() float64 {
if r.n == 0 {
return 0
}
return 100.0 * float64(r.ok) / float64(r.n)
}
// isSpike reports whether ns is more than 25% above the current
// window maximum. A spike-warmup guard (n < 10) prevents the first
// handful of cold-start samples from each flagging as spikes — max
// during warmup is whatever happened to come in first, so the 1.25×
// threshold is meaningless until we have a stable baseline.
//
// Intended to be called *before* record() on the same sample, so the
// comparison runs against the previous window max rather than the
// one the new sample would produce. If a spike is detected, the
// caller typically records an errEvent and then calls record() to
// fold the sample into the rolling stats as usual.
func (r *rolling) isSpike(ns uint64) bool {
if r.n < 10 || r.maxNS == 0 {
return false
}
// ns > maxNS * 1.25, written without float conversion.
return ns > r.maxNS+r.maxNS/4
}