A small bubbletea TUI that reads maglev.yaml (repeatable --config), enumerates every VIP, and probes each from outside the load balancer on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get a GET against a configurable URI (default /.well-known/ipng/healthz) with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL counters, LAST status, and a response-header tally. Non-HTTP VIPs get a TCP connect probe. A bounded error panel classifies anomalies as timeout / http-err / net-err / spike and auto-sizes to fill the screen. Utility: during a failover drill (backend flap, AS drain, config push) the tally panel shows which backend each VIP is actually steering to, with two-colour activity highlighting over a 5s window — white = receiving traffic, grey = drained. Paired with the rolling OK%/latency columns it gives an at-a-glance answer to "is the VIP healthy from the outside right now, and which backend is it hitting", without relying on maglevd's own view of the world. Also bumps Makefile/go.mod to build the new binary.
163 lines
4.7 KiB
Go
163 lines
4.7 KiB
Go
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
|
||
|
||
package main
|
||
|
||
import "sort"
|
||
|
||
// rollingSize caps how many probe samples each VIP's rolling window
// retains for latency percentiles and the success ratio. With 100
// samples and the default 100ms probe interval the window spans
// roughly 10s: short enough to follow a failover as it happens, long
// enough for p50/p95 to be statistically meaningful.
const (
	rollingSize = 100
)
|
||
|
||
// rolling is a bounded-window (rollingSize) counter for per-VIP probe
|
||
// results. It tracks success/failure totals, running sum for mean
|
||
// latency, and a ring of individual samples so percentiles can be
|
||
// computed on demand. Non-thread-safe: everything that touches a
|
||
// rolling lives on the bubbletea dispatch goroutine, so no locking
|
||
// is needed. The reset-on-rotate logic keeps ok/fail/sumNS in sync
|
||
// with the ring contents as old samples get overwritten.
|
||
type rolling struct {
|
||
samples []sample
|
||
idx int // next write position
|
||
n int // number of valid samples (0..rollingSize)
|
||
|
||
ok int
|
||
fail int
|
||
sumNS uint64
|
||
minNS uint64 // 0 while n == 0
|
||
maxNS uint64
|
||
}
|
||
|
||
// sample is a single probe result as stored in a rolling ring: the
// value handed to record as ns (a latency in nanoseconds) and whether
// the probe was reported as successful.
type sample struct {
	ns uint64 // probe latency in nanoseconds
	ok bool   // true when the probe was recorded as a success
}
|
||
|
||
// newRolling returns an empty rolling window ready to accept records.
|
||
func newRolling() *rolling {
|
||
return &rolling{samples: make([]sample, rollingSize)}
|
||
}
|
||
|
||
// reset zeroes every field and the ring so the rolling window starts
|
||
// fresh. Called by the 'r' keybinding and by the --reset flow.
|
||
func (r *rolling) reset() {
|
||
for i := range r.samples {
|
||
r.samples[i] = sample{}
|
||
}
|
||
r.idx = 0
|
||
r.n = 0
|
||
r.ok = 0
|
||
r.fail = 0
|
||
r.sumNS = 0
|
||
r.minNS = 0
|
||
r.maxNS = 0
|
||
}
|
||
|
||
// record appends a single probe result to the rolling window, evicting
|
||
// the oldest sample if the ring is already full. All aggregate fields
|
||
// (ok/fail/sumNS) are kept in sync incrementally to avoid re-walking
|
||
// the ring on every insert. min/max are re-derived from the ring
|
||
// after the write — that's O(n) but n is bounded at rollingSize so
|
||
// the cost is trivial and avoids the bookkeeping complexity of an
|
||
// incremental extremum counter.
|
||
func (r *rolling) record(ns uint64, ok bool) {
|
||
if r.n == rollingSize {
|
||
// Ring is full — subtract the contribution of the sample
|
||
// we're about to overwrite.
|
||
old := r.samples[r.idx]
|
||
r.sumNS -= old.ns
|
||
if old.ok {
|
||
r.ok--
|
||
} else {
|
||
r.fail--
|
||
}
|
||
} else {
|
||
r.n++
|
||
}
|
||
|
||
r.samples[r.idx] = sample{ns: ns, ok: ok}
|
||
r.idx = (r.idx + 1) % rollingSize
|
||
r.sumNS += ns
|
||
if ok {
|
||
r.ok++
|
||
} else {
|
||
r.fail++
|
||
}
|
||
|
||
// Recompute min/max from the live ring. O(rollingSize) but
|
||
// that's 100 uint64 reads — noise on any machine maglevt
|
||
// would ever run on.
|
||
r.minNS = ^uint64(0)
|
||
r.maxNS = 0
|
||
for i := 0; i < r.n; i++ {
|
||
s := r.samples[i]
|
||
if s.ns < r.minNS {
|
||
r.minNS = s.ns
|
||
}
|
||
if s.ns > r.maxNS {
|
||
r.maxNS = s.ns
|
||
}
|
||
}
|
||
}
|
||
|
||
// percentiles returns (p50, p95, p99) in nanoseconds over the
|
||
// current window, or zeros if empty. Implemented by copying the
|
||
// ring into a fresh slice, sort.Slice, and index lookup — the
|
||
// 100-element sort is cheap enough to do per UI frame (roughly
|
||
// every 250ms). Index clamping at r.n-1 handles the warmup case
|
||
// where the rolling window doesn't yet have enough samples for
|
||
// p95/p99 to fall in distinct slots.
|
||
func (r *rolling) percentiles() (p50, p95, p99 uint64) {
|
||
if r.n == 0 {
|
||
return 0, 0, 0
|
||
}
|
||
buf := make([]uint64, r.n)
|
||
for i := 0; i < r.n; i++ {
|
||
buf[i] = r.samples[i].ns
|
||
}
|
||
sort.Slice(buf, func(i, j int) bool { return buf[i] < buf[j] })
|
||
p50 = buf[r.n/2]
|
||
p95Idx := r.n * 95 / 100
|
||
if p95Idx >= r.n {
|
||
p95Idx = r.n - 1
|
||
}
|
||
p95 = buf[p95Idx]
|
||
p99Idx := r.n * 99 / 100
|
||
if p99Idx >= r.n {
|
||
p99Idx = r.n - 1
|
||
}
|
||
p99 = buf[p99Idx]
|
||
return p50, p95, p99
|
||
}
|
||
|
||
// successPct returns the percentage of probes currently in the window
|
||
// that completed successfully (2xx/3xx for HTTP, TCP connect OK for
|
||
// tcp). Returns 0 when the window is empty.
|
||
func (r *rolling) successPct() float64 {
|
||
if r.n == 0 {
|
||
return 0
|
||
}
|
||
return 100.0 * float64(r.ok) / float64(r.n)
|
||
}
|
||
|
||
// isSpike reports whether ns is more than 25% above the current
|
||
// window maximum. A spike-warmup guard (n < 10) prevents the first
|
||
// handful of cold-start samples from each flagging as spikes — max
|
||
// during warmup is whatever happened to come in first, so the 1.25×
|
||
// threshold is meaningless until we have a stable baseline.
|
||
//
|
||
// Intended to be called *before* record() on the same sample, so the
|
||
// comparison runs against the previous window max rather than the
|
||
// one the new sample would produce. If a spike is detected, the
|
||
// caller typically records an errEvent and then calls record() to
|
||
// fold the sample into the rolling stats as usual.
|
||
func (r *rolling) isSpike(ns uint64) bool {
|
||
if r.n < 10 || r.maxNS == 0 {
|
||
return false
|
||
}
|
||
// ns > maxNS * 1.25, written without float conversion.
|
||
return ns > r.maxNS+r.maxNS/4
|
||
}
|