Files
vpp-maglev/cmd/tester/stats.go
Pim van Pelt 6293521157 New maglevt TUI component: out-of-band VIP health monitor
A small bubbletea TUI that reads maglev.yaml (repeatable --config),
enumerates every VIP, and probes each from outside the load balancer
on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get
a GET against a configurable URI (default /.well-known/ipng/healthz)
with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL
counters, LAST status, and a response-header tally. Non-HTTP VIPs
get a TCP connect probe. A bounded error panel classifies anomalies
as timeout / http-err / net-err / spike and auto-sizes to fill the
screen.

Utility: during a failover drill (backend flap, AS drain, config
push) the tally panel shows which backend each VIP is actually
steering to, with two-colour activity highlighting over a 5s
window — white = receiving traffic, grey = drained. Paired with
the rolling OK%/latency columns it gives an at-a-glance answer to
"is the VIP healthy from the outside right now, and which backend
is it hitting", without relying on maglevd's own view of the
world.

Also bumps Makefile/go.mod to build the new binary.
2026-04-15 01:23:52 +02:00

163 lines
4.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import "sort"
// rollingSize is the capacity of the per-VIP sample ring: the number
// of most recent probe results that feed the latency percentiles and
// the success-ratio display. With 100 samples and the default 100ms
// probe interval this is a ~10s window — short enough to react
// quickly to failover events, long enough that p50/p95 are
// statistically meaningful.
const rollingSize = 100
// sample is a single probe result as stored in the ring: the measured
// latency in nanoseconds and whether the probe counted as a success.
type sample struct {
	ns uint64
	ok bool
}

// rolling holds the most recent probe results for one VIP, capped at
// rollingSize entries, together with aggregates derived from them.
//
// The individual results live in a ring so percentiles can be computed
// on demand; the ok/fail counts and the latency sum are maintained
// incrementally by record, which subtracts the contribution of a slot
// before overwriting it so the aggregates always describe exactly what
// is in the ring.
//
// The type carries no locking and is not safe for concurrent use; it
// is intended to be touched only from the bubbletea dispatch goroutine.
type rolling struct {
	// samples is the ring storage.
	samples []sample
	// idx is the next write position in samples.
	idx int
	// n is the number of valid samples (0..rollingSize).
	n int
	// ok is the count of successful samples currently in the ring.
	ok int
	// fail is the count of failed samples currently in the ring.
	fail int
	// sumNS is the sum of the latencies currently in the ring.
	sumNS uint64
	// minNS is the smallest latency in the ring; 0 while n == 0.
	minNS uint64
	// maxNS is the largest latency in the ring.
	maxNS uint64
}
// newRolling allocates a rolling window whose ring already has
// rollingSize slots, with every counter at zero, so record can be
// called on it straight away.
func newRolling() *rolling {
	w := new(rolling)
	w.samples = make([]sample, rollingSize)
	return w
}
// reset returns the window to its just-constructed state: every ring
// slot is cleared in place and all counters and aggregates go back to
// zero. The existing ring storage is kept, so no allocation happens.
func (r *rolling) reset() {
	ring := r.samples
	for i := range ring {
		ring[i] = sample{}
	}
	// Replace all bookkeeping in one assignment, keeping only the
	// (now cleared) ring.
	*r = rolling{samples: ring}
}
// record folds one probe result (latency ns, success flag ok) into the
// window. While the ring is still filling, the sample simply takes the
// next free slot; once it holds rollingSize samples, the slot at idx —
// the oldest one — is overwritten and its contribution is removed from
// ok/fail/sumNS first, so those aggregates stay in step with the ring
// without re-walking it.
//
// minNS and maxNS are not maintained incrementally: they are re-derived
// by scanning the valid part of the ring after the write. That is
// O(rollingSize) per call, which is a fixed 100 comparisons, and spares
// the bookkeeping an incremental extremum would need on eviction.
func (r *rolling) record(ns uint64, ok bool) {
	pos := r.idx
	if r.n != rollingSize {
		// Still warming up: the slot is unused, the window grows.
		r.n++
	} else {
		// Full ring: forget the sample we are about to replace.
		evicted := r.samples[pos]
		r.sumNS -= evicted.ns
		if evicted.ok {
			r.ok--
		} else {
			r.fail--
		}
	}

	r.samples[pos] = sample{ns: ns, ok: ok}
	r.idx = (pos + 1) % rollingSize

	r.sumNS += ns
	if ok {
		r.ok++
	} else {
		r.fail++
	}

	// Valid samples always occupy slots [0, n): the ring fills from
	// slot 0 upwards and n stays at rollingSize once full.
	lo, hi := ^uint64(0), uint64(0)
	for _, s := range r.samples[:r.n] {
		if s.ns < lo {
			lo = s.ns
		}
		if s.ns > hi {
			hi = s.ns
		}
	}
	r.minNS, r.maxNS = lo, hi
}
// percentiles returns the p50, p95 and p99 latencies, in nanoseconds,
// over the samples currently in the window, or all zeros when the
// window is empty. Every sample in the window contributes, whether it
// was recorded as a success or a failure.
//
// The valid part of the ring is copied into a scratch slice and sorted,
// so the ring itself is never reordered; the copy is at most
// rollingSize elements, which keeps the sort cheap enough to run on
// every call. Each percentile is read at index n*pct/100 of the sorted
// copy. For n >= 1 and pct < 100 that index is always below n, so no
// clamping is required; while the window is still small, several
// percentiles simply land on the same slot (with n == 1 all three
// return the single sample).
func (r *rolling) percentiles() (p50, p95, p99 uint64) {
	if r.n == 0 {
		return 0, 0, 0
	}
	buf := make([]uint64, r.n)
	for i := range buf {
		buf[i] = r.samples[i].ns
	}
	sort.Slice(buf, func(i, j int) bool { return buf[i] < buf[j] })

	// n*pct/100 truncates, so at(50) is the same slot as buf[n/2].
	at := func(pct int) uint64 { return buf[r.n*pct/100] }
	return at(50), at(95), at(99)
}
// successPct reports what share of the samples currently in the
// window were recorded as successful, as a percentage in [0, 100].
// What counts as a success is decided by whoever calls record; this
// method only divides the ok count by the window size. An empty
// window yields 0 rather than dividing by zero.
func (r *rolling) successPct() float64 {
	if r.n != 0 {
		good := float64(r.ok)
		total := float64(r.n)
		return 100.0 * good / total
	}
	return 0
}
// isSpike reports whether ns is more than 25% above the current
// window maximum, i.e. ns > maxNS + maxNS/4 using integer division.
//
// It always returns false while the window holds fewer than 10
// samples or while maxNS is zero: during warm-up the maximum is just
// whatever arrived first, so a 1.25× threshold against it would flag
// ordinary cold-start samples.
//
// Intended to be called *before* record() on the same sample, so the
// comparison runs against the previous window maximum rather than one
// that already includes ns. isSpike itself never modifies the window.
func (r *rolling) isSpike(ns uint64) bool {
	if r.n < 10 || r.maxNS == 0 {
		return false
	}
	if ns <= r.maxNS {
		return false
	}
	// Equivalent to ns > maxNS + maxNS/4, but compared on the excess
	// over maxNS so the sum cannot wrap around uint64 for very large
	// maxNS (a wrapped threshold would report bogus spikes).
	return ns-r.maxNS > r.maxNS/4
}