New maglevt TUI component: out-of-band VIP health monitor

A small bubbletea TUI that reads maglev.yaml (repeatable --config),
enumerates every VIP, and probes each from outside the load balancer
on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get
a GET against a configurable URI (default /.well-known/ipng/healthz)
with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL
counters, LAST status, and a response-header tally. Non-HTTP VIPs
get a TCP connect probe. A bounded error panel classifies anomalies
as timeout / http-err / net-err / spike and auto-sizes to fill the
screen.

Utility: during a failover drill (backend flap, AS drain, config
push) the tally panel shows which backend each VIP is actually
steering to, with two-colour activity highlighting over a 5s
window — white = receiving traffic, grey = drained. Paired with
the rolling OK%/latency columns it gives an at-a-glance answer to
"is the VIP healthy from the outside right now, and which backend
is it hitting", without relying on maglevd's own view of the
world.

Also bumps Makefile/go.mod to build the new binary.
This commit is contained in:
2026-04-15 01:23:34 +02:00
parent 744b1cb3d2
commit 6293521157
8 changed files with 1890 additions and 1 deletions

305
cmd/tester/main.go Normal file
View File

@@ -0,0 +1,305 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
// maglevt is a tiny TUI that reads maglev.yaml, enumerates every VIP
// and hits it on a tight cadence (default 100ms) from outside the load
// balancer. HTTP/HTTPS VIPs get a GET request with per-VIP rolling
// latency stats, success/failure ratios, and a running tally of a
// configurable response header (default: X-IPng-Frontend) so pool-
// failover events show up as visible reshuffles in the tally. Non-HTTP
// VIPs get a plain TCP-connect probe for liveness. See maglevt --help
// for the flag surface.
package main
import (
"context"
"flag"
"fmt"
"net"
"os"
"regexp"
"sort"
"strings"
"time"
tea "github.com/charmbracelet/bubbletea"
buildinfo "git.ipng.ch/ipng/vpp-maglev/cmd"
"git.ipng.ch/ipng/vpp-maglev/internal/config"
)
// main is the process entry point: run the tool and translate any
// error into a one-line stderr report plus a non-zero exit status.
func main() {
	err := run()
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "maglevt: %v\n", err)
	os.Exit(1)
}
// run parses the flag surface, loads and validates every --config
// file, builds the deduplicated VIP probe list, starts one probe
// goroutine (plus a one-shot reverse-DNS lookup) per VIP, and then
// blocks in the bubbletea UI until the operator quits. Errors bubble
// up to main for a one-line stderr report.
func run() error {
	var cfgPaths multiFlag
	flag.Var(&cfgPaths, "config", "path to maglev.yaml (repeatable; also accepts a comma-separated list). Frontends are unioned across files, deduplicated by (address, protocol, port).")
	interval := flag.Duration("interval", 100*time.Millisecond, "probe interval per VIP (±10% jitter)")
	timeout := flag.Duration("timeout", 2*time.Second, "per-request timeout")
	host := flag.String("host", "", "Host header override (default: VIP address literal)")
	// Default probe URI: a small, deliberate health-check path that
	// typically returns 204 No Content and doesn't hit the backend
	// app logs. /.well-known/ipng/healthz is the convention for
	// IPng deployments; override with --uri for anything else.
	// --path is registered as a synonym for backward compatibility
	// with the pre-1.0 flag name — both set the same variable, so
	// whichever the operator types last on the command line wins.
	const defaultURI = "/.well-known/ipng/healthz"
	path := flag.String("uri", defaultURI, "HTTP request path (URI) used in the GET request")
	flag.StringVar(path, "path", defaultURI, "alias for --uri")
	header := flag.String("header", "X-IPng-Frontend", "response header to extract and tally")
	insecure := flag.Bool("insecure", true, "skip TLS verification for HTTPS")
	keepalive := flag.Bool("keepalive", false, "enable HTTP keep-alives (disabled by default so each probe opens a fresh connection — required for failover visibility)")
	flag.BoolVar(keepalive, "k", false, "shorthand for --keepalive")
	filter := flag.String("filter", "", "only probe frontends whose name matches this regex")
	printVersion := flag.Bool("version", false, "print version and exit")
	flag.Parse()
	if *printVersion {
		fmt.Printf("maglevt %s (commit %s, built %s)\n",
			buildinfo.Version(), buildinfo.Commit(), buildinfo.Date())
		return nil
	}
	// No --config given: fall back to the packaged default location.
	if len(cfgPaths) == 0 {
		cfgPaths = multiFlag{"/etc/vpp-maglev/maglev.yaml"}
	}
	// Load every requested config. A parse/semantic error on any of
	// them is fatal — we want the user to see it rather than silently
	// probing a reduced set of VIPs because one file was broken.
	configs := make([]*config.Config, 0, len(cfgPaths))
	for _, p := range cfgPaths {
		cfg, res := config.Check(p)
		if !res.OK() {
			if res.ParseError != "" {
				return fmt.Errorf("config parse %s: %s", p, res.ParseError)
			}
			return fmt.Errorf("config semantic %s: %s", p, res.SemanticError)
		}
		configs = append(configs, cfg)
	}
	// Compile the optional name filter up front so a bad regex fails
	// fast instead of surfacing later.
	var filterRe *regexp.Regexp
	if *filter != "" {
		var err error
		filterRe, err = regexp.Compile(*filter)
		if err != nil {
			return fmt.Errorf("invalid --filter regex: %w", err)
		}
	}
	opts := probeOpts{
		Interval:  *interval,
		Timeout:   *timeout,
		Host:      *host,
		Path:      *path,
		Header:    *header,
		Insecure:  *insecure,
		KeepAlive: *keepalive,
	}
	vips := buildVIPsUnion(configs, cfgPaths, filterRe, opts)
	if len(vips) == 0 {
		return fmt.Errorf("no matching frontends in %s", strings.Join(cfgPaths, ", "))
	}
	m := Model{
		cfgPath: strings.Join(cfgPaths, ", "),
		vips:    vips,
		opts:    opts,
		startAt: time.Now(),
		showDNS: true,
	}
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	prog := tea.NewProgram(m, tea.WithAltScreen())
	// Spawn one probe goroutine per VIP. Each sends probeResultMsg
	// into the tea.Program via prog.Send, which is thread-safe.
	// Alongside the prober we kick off a one-shot reverse-DNS
	// lookup so the 'd' toggle has a hostname to display; the
	// lookup is best-effort and simply drops on timeout or NXDOMAIN.
	for _, v := range vips {
		go runProbeLoop(ctx, v.info, opts, prog.Send)
		go runDNSLookup(ctx, v.info, prog.Send)
	}
	_, err := prog.Run()
	cancel()
	// Give the workers a beat to observe ctx and exit. This isn't
	// strictly required — the process is exiting anyway — but a clean
	// shutdown avoids the "unexpected EOF writing to a closed
	// transport" spam that HTTP clients sometimes emit on ctrl-C.
	time.Sleep(50 * time.Millisecond)
	return err
}
// buildVIPsUnion flattens frontends from multiple configs into a
// single deduplicated probe list, keyed by the (scheme, address,
// port) tuple. The typical use case is a pair of maglevds fronting
// the same two VIPs (vip0 / vip1, IPv4 + IPv6, × port 80 + 443 = 8
// probers) — the operator passes both yaml files and maglevt unions
// them so the probe grid doesn't grow duplicates from mirrored
// configs. The symbolic frontend name from yaml is intentionally
// dropped: when two files use the same name for different tuples
// (common in cross-deployment comparisons) the name would be
// ambiguous, and the tuple is the only stable identity. Only the
// --filter regex still uses the name, as a pre-dedup match.
//
// Dedup key uses net.IP.String() which canonicalises IPv6 zero-
// compression, so 2001:db8::1 and 2001:db8:0:0:0:0:0:1 collapse
// onto one entry. Iteration order across files is stable for
// deterministic TUI layout: within a file, frontends are visited
// in name-sorted order; across files, the first occurrence of each
// tuple wins and fixes its slot in the output.
func buildVIPsUnion(cfgs []*config.Config, cfgPaths []string, filterRe *regexp.Regexp, opts probeOpts) []*vipState {
	_ = cfgPaths // reserved for future diagnostics (e.g. which file this tuple came from)
	type key struct {
		ip     string
		scheme string
		port   uint16
	}
	seen := map[key]*vipState{}
	var order []key
	for _, cfg := range cfgs {
		// Map iteration order is random; sort the names so the
		// per-file visit order (and thus first-wins slotting) is
		// deterministic across runs.
		names := make([]string, 0, len(cfg.Frontends))
		for name := range cfg.Frontends {
			names = append(names, name)
		}
		sortStringsInPlace(names)
		for _, name := range names {
			fe := cfg.Frontends[name]
			if filterRe != nil && !filterRe.MatchString(name) {
				continue
			}
			// Only TCP frontends with a concrete port are probeable.
			if strings.ToLower(fe.Protocol) != "tcp" || fe.Port == 0 {
				continue
			}
			scheme := schemeForPort(fe.Port)
			k := key{ip: fe.Address.String(), scheme: scheme, port: fe.Port}
			if _, ok := seen[k]; ok {
				continue // already claimed by an earlier file
			}
			info := &vipInfo{
				idx:    len(order),
				scheme: scheme,
				ip:     fe.Address,
				port:   fe.Port,
			}
			if scheme == "http" || scheme == "https" {
				info.url = buildURL(scheme, fe.Address, fe.Port, opts.Path)
				info.client = newHTTPClient(opts)
			}
			v := &vipState{
				info:     info,
				rolling:  newRolling(),
				tally:    map[string]int{},
				tallyOld: map[string]int{},
				tallyNew: map[string]int{},
			}
			seen[k] = v
			order = append(order, k)
		}
	}
	out := make([]*vipState, len(order))
	for i, k := range order {
		out[i] = seen[k]
	}
	// Display order: IPv6 before IPv4, higher ports before lower
	// within each address family, then address string as a final
	// tiebreaker for determinism across runs. HTTPS :443 sitting
	// above HTTP :80 matches the "secure first" reading order most
	// operators expect, and clustering all the IPv6 rows at the top
	// keeps a mixed-family deployment visually coherent as the
	// operator scans down the table.
	sort.SliceStable(out, func(i, j int) bool {
		vi, vj := out[i].info, out[j].info
		iIs6 := vi.ip.To4() == nil
		jIs6 := vj.ip.To4() == nil
		if iIs6 != jIs6 {
			return iIs6
		}
		if vi.port != vj.port {
			return vi.port > vj.port
		}
		return vi.ip.String() < vj.ip.String()
	})
	// Re-index after the sort so info.idx matches the slot each VIP
	// now occupies in out — probeResultMsg.VIPIdx is looked up via
	// this index in Model.Update, so they must agree.
	for i, v := range out {
		v.info.idx = i
	}
	return out
}
// multiFlag collects values of a repeatable string flag. A single
// occurrence may also carry several comma-separated entries, so
// `--config a.yaml,b.yaml` and `--config a.yaml --config b.yaml`
// produce the same slice. Blank entries are discarded.
type multiFlag []string

// String renders the accumulated values as a comma-joined list,
// which is what the flag package shows in usage/default output.
func (m *multiFlag) String() string {
	return strings.Join(*m, ",")
}

// Set records one flag occurrence, splitting on commas and trimming
// surrounding whitespace from each piece. Always returns nil.
func (m *multiFlag) Set(v string) error {
	for _, piece := range strings.Split(v, ",") {
		if trimmed := strings.TrimSpace(piece); trimmed != "" {
			*m = append(*m, trimmed)
		}
	}
	return nil
}
// schemeForPort maps a VIP's TCP port to its probe scheme. Only the
// two unambiguous web ports classify as HTTP(S); everything else
// falls back to "tcp" (connect-only probe). Intentionally narrow —
// better to under-classify than send GET / at an IMAPS VIP and spew
// protocol errors into the logs. Extending the mapping (8080/8443,
// ...) is fine later, but the default stays conservative.
func schemeForPort(port uint16) string {
	if port == 80 {
		return "http"
	}
	if port == 443 {
		return "https"
	}
	return "tcp"
}
// buildURL constructs the probe URL for an HTTP/HTTPS VIP. IPv6
// literals are bracketed per RFC 3986 §3.2.2 so the colon in the
// address isn't confused with the port separator.
func buildURL(scheme string, ip net.IP, port uint16, path string) string {
host := ip.String()
if ip.To4() == nil {
host = "[" + host + "]"
}
if path == "" {
path = "/"
}
return fmt.Sprintf("%s://%s:%d%s", scheme, host, port, path)
}
// sortStringsInPlace sorts s ascending, in place. The previous
// hand-rolled insertion sort justified itself as avoiding the "sort"
// import, but this file already imports sort (for the display-order
// sort.SliceStable in buildVIPsUnion), so the rationale no longer
// held — delegate to the standard library instead.
func sortStringsInPlace(s []string) {
	sort.Strings(s)
}

349
cmd/tester/model.go Normal file
View File

@@ -0,0 +1,349 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import (
"fmt"
"time"
tea "github.com/charmbracelet/bubbletea"
)
// tallyWindow is the sliding-window length used to classify tally
// entries as "actively receiving traffic" versus "drained". A probe
// snapshot of each VIP's tally is rotated into vipState.tallyOld
// every tallyWindow (see the tickMsg handler in Update), so on
// steady state the delta (tally - tallyOld) reflects somewhere
// between one and two windows of activity — long enough to be
// noise-resistant, short enough that a flush or graceful drain
// shows up within a few redraw cycles.
const tallyWindow = 5 * time.Second

// errWindowSize is the hard storage cap for Model.events. It isn't
// the number of rows rendered to screen — that's computed per-frame
// in View() from the terminal height so the events panel fills
// whatever space is left after the header, table, tally, and
// footer. This cap only exists to stop the ring from growing
// unbounded on a very long-running session that's seeing constant
// anomalies: 500 × ~100 bytes per event is ~50 KB, negligible.
const errWindowSize = 500
// errKind classifies why an event landed in the error panel. One
// value per anomaly Update can flag from a probeResultMsg: the probe
// hit its timeout deadline, the probe came back with an HTTP
// 4xx/5xx, the probe failed at the network layer (refused, reset,
// unreachable, TLS handshake error), or the probe succeeded but
// with a latency the rolling window flags as a spike.
type errKind int

const (
	kindTimeout errKind = iota
	kindHTTPErr
	kindNetErr
	kindSpike
)

// String returns the short label shown in the error panel's kind
// column; values outside the known range render as "unknown".
func (k errKind) String() string {
	labels := [...]string{
		kindTimeout: "timeout",
		kindHTTPErr: "http-err",
		kindNetErr:  "net-err",
		kindSpike:   "spike",
	}
	if k >= 0 && int(k) < len(labels) {
		return labels[k]
	}
	return "unknown"
}
// errEvent is one entry in the bounded error-panel ring. VIPIdx
// points back into Model.vips so the view can look up the scheme
// and address for the row label at render time (we don't store a
// formatted label here to keep the event struct cheap and to let
// the view decide how to style it).
type errEvent struct {
	At     time.Time // wall-clock arrival time of the anomalous sample
	VIPIdx int       // index into Model.vips identifying the VIP
	Kind   errKind   // timeout / http-err / net-err / spike
	Detail string    // short human-readable cause, e.g. "refused" or "HTTP 503"
}
// Model is the bubbletea Model for maglevt. Held by value throughout
// so bubbletea's copy-on-Update semantics work naturally; mutable
// per-VIP state lives behind *vipState pointers in the vips slice so
// probeResultMsg handlers can update rolling/tally without copying
// the whole model.
type Model struct {
	cfgPath string      // comma-joined list of loaded config paths (header display)
	vips    []*vipState // per-VIP mutable state; indexed by vipInfo.idx
	opts    probeOpts   // probe configuration snapshot taken from flags at startup
	startAt time.Time   // uptime origin for the header clock; reset by 'r'
	width   int         // last terminal width seen via tea.WindowSizeMsg
	height  int         // last terminal height seen via tea.WindowSizeMsg
	help    bool        // whether the help overlay is currently shown
	events  []errEvent  // bounded ring of recent anomalies (size errWindowSize)
	// showDNS toggles between hostname and IP-literal display in
	// the ADDR column and the tally/events labels. On by default:
	// operators usually know VIPs by name, and the 'd' key flips
	// to the raw literal when they need to see which address
	// family or which specific IP the row represents. Hostnames
	// come in asynchronously via hostnameMsg, so vipState.hostname
	// may still be empty for a VIP even when showDNS is true —
	// the display falls back to the IP literal in that case.
	showDNS bool
}
// vipState is the mutable per-VIP record threaded through the tea
// dispatch loop. vipState.info is the immutable descriptor built at
// startup (see probe.go::vipInfo), while everything else on this
// struct is rewritten as probe results arrive.
type vipState struct {
	info *vipInfo
	// hostname is the PTR-record lookup result for info.ip, filled
	// in asynchronously by runDNSLookup via hostnameMsg. Empty
	// until the lookup returns (or forever, if it fails or times
	// out). The UI consults Model.showDNS to decide whether to
	// use it.
	hostname string
	// Rolling stats populated from every probeResultMsg. Separate
	// from tally so reset semantics match the user's mental model:
	// pressing 'r' blows away both, but a future pause-clear-resume
	// cadence could reset just one.
	rolling *rolling
	tally   map[string]int
	// tallyOld / tallyNew are the two-slot rotating snapshots used
	// by the tally panel to classify each backend by recent
	// activity — still receiving traffic versus drained (the exact
	// colour palette lives in the view code). tallyNew is captured
	// every tallyWindow; on the next rotation it shifts into
	// tallyOld, so the delta (tally - tallyOld) always spans
	// between 1 and 2 tallyWindow units of activity. tallyAt is
	// the wall-clock time tallyNew was captured and drives the
	// rotation decision in tickMsg.
	tallyOld map[string]int
	tallyNew map[string]int
	tallyAt  time.Time
	// Lifetime counters. Unlike the rolling window these never
	// forget until the operator hits 'r'. The N column in the
	// probe table renders totalProbes; FAIL renders totalFails
	// tinted red when non-zero so a failure that rolled off the
	// 100-sample rolling window still leaves a visible mark on
	// the cumulative count.
	totalProbes int64
	totalFails  int64
	// Last-seen values for the rightmost LAST column. These are
	// the "what happened on the most recent probe" snapshot the
	// UI shows as green/yellow/red.
	lastAt     time.Time
	lastOK     bool
	lastCode   int
	lastErr    string
	lastHeader string
	lastDur    time.Duration
}
// tickMsg is the periodic heartbeat that forces a redraw even when
// no probe results are flowing, so e.g. the header uptime counter
// keeps moving on a completely idle VIP set.
type tickMsg time.Time

// tickCmd arms one 250ms tick; Update re-arms it on every tickMsg,
// giving a steady redraw cadence that looks live without burning
// CPU on layout work.
func tickCmd() tea.Cmd {
	const redrawEvery = 250 * time.Millisecond
	return tea.Tick(redrawEvery, func(now time.Time) tea.Msg {
		return tickMsg(now)
	})
}
// Init starts the periodic redraw heartbeat. Alt-screen entry and
// the window title are configured when the tea.Program is built in
// main.go, so there is nothing else to do here.
func (m Model) Init() tea.Cmd {
	cmd := tickCmd()
	return cmd
}
// Update handles every tea.Msg delivered to the program. Five
// message classes:
//
//   - tea.WindowSizeMsg — resize; cache width/height for the View.
//   - tea.KeyMsg — keybindings (quit, pause, reset, help, DNS toggle).
//   - hostnameMsg — async reverse-DNS result; store the hostname.
//   - probeResultMsg — probe goroutine delivered a new sample;
//     update rolling/tally/last* and maybe the event ring.
//   - tickMsg — periodic redraw; rotate tally snapshots, re-arm
//     the timer.
func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
	switch msg := msg.(type) {
	case tea.WindowSizeMsg:
		m.width = msg.Width
		m.height = msg.Height
		return m, nil
	case tea.KeyMsg:
		switch msg.String() {
		case "q", "ctrl+c":
			return m, tea.Quit
		case " ":
			// Flip the global pause flag read by every probe loop.
			paused.Store(!paused.Load())
			return m, nil
		case "r":
			// Unified reset: wipe per-VIP rolling windows, per-VIP
			// tallies, the last-probe snapshot, the global event
			// ring, and the global uptime origin so the header
			// clock starts fresh. 'r' is the one-key way to start
			// a clean capture window, which matches the "I'm about
			// to do a failover, watch this" flow.
			for _, v := range m.vips {
				v.rolling.reset()
				v.tally = map[string]int{}
				v.tallyOld = map[string]int{}
				v.tallyNew = map[string]int{}
				v.tallyAt = time.Time{}
				v.totalProbes = 0
				v.totalFails = 0
				v.lastAt = time.Time{}
				v.lastOK = false
				v.lastCode = 0
				v.lastErr = ""
				v.lastHeader = ""
				v.lastDur = 0
			}
			m.events = nil
			m.startAt = time.Now()
			return m, nil
		case "h", "?":
			m.help = !m.help
			return m, nil
		case "d":
			m.showDNS = !m.showDNS
			return m, nil
		}
	case hostnameMsg:
		// Bounds-check so a stale message can never index out of
		// range.
		if msg.VIPIdx >= 0 && msg.VIPIdx < len(m.vips) {
			m.vips[msg.VIPIdx].hostname = msg.Hostname
		}
		return m, nil
	case probeResultMsg:
		if msg.VIPIdx < 0 || msg.VIPIdx >= len(m.vips) {
			return m, nil
		}
		v := m.vips[msg.VIPIdx]
		ns := uint64(msg.Duration.Nanoseconds())
		// Capture the spike verdict and the previous window max
		// BEFORE recording the sample, so the sample can't mask
		// its own anomaly.
		prevMax := v.rolling.maxNS
		spike := v.rolling.isSpike(ns)
		v.lastAt = msg.At
		v.lastOK = msg.OK
		v.lastCode = msg.Code
		v.lastErr = msg.Err
		v.lastHeader = msg.Header
		v.lastDur = msg.Duration
		v.totalProbes++
		if !msg.OK {
			v.totalFails++
		}
		v.rolling.record(ns, msg.OK)
		if msg.Header != "" {
			v.tally[msg.Header]++
		}
		// Classify the sample for the error panel. Order matters:
		// a network error is always more interesting than a latency
		// observation (the latency is noise from the failure
		// itself), an HTTP error is more interesting than a spike
		// (a 503 dominates a 10ms vs 12ms latency blip), and a
		// spike is only flagged on otherwise-successful samples.
		if ev, ok := classifyEvent(msg, v, spike, prevMax); ok {
			m.events = append(m.events, ev)
			if len(m.events) > errWindowSize {
				m.events = m.events[len(m.events)-errWindowSize:]
			}
		}
		return m, nil
	case tickMsg:
		// Rotate each VIP's tally snapshot once the window has
		// elapsed. Skipping while paused keeps the tally colours
		// frozen at their pre-pause state instead of decaying
		// everything to grey as deltas naturally fall to zero.
		if !paused.Load() {
			now := time.Time(msg)
			for _, v := range m.vips {
				if v.tallyAt.IsZero() || now.Sub(v.tallyAt) >= tallyWindow {
					v.tallyOld = v.tallyNew
					v.tallyNew = cloneTally(v.tally)
					v.tallyAt = now
				}
			}
		}
		return m, tickCmd()
	}
	return m, nil
}
// cloneTally makes an independent shallow copy of src for the
// two-slot snapshot rotation in vipState. The copy matters: probes
// keep mutating the live tally, and an aliased snapshot would make
// every delta read as zero.
func cloneTally(src map[string]int) map[string]int {
	dst := make(map[string]int, len(src))
	for backend, count := range src {
		dst[backend] = count
	}
	return dst
}
// classifyEvent decides whether a probe sample deserves an entry in
// the error panel, returning (_, false) for uninteresting samples —
// boring 2xx/3xx responses or successful TCP connects with no spike.
// Checks run in priority order: network/timeout failures trump HTTP
// status, which trumps latency spikes, because a failed probe's
// measured latency is just noise inherited from the failure.
//
// prevMax is the rolling-window max seen *before* this sample was
// recorded. It's included in the spike Detail so the operator sees
// the baseline the current probe blew past rather than a bare
// absolute number.
func classifyEvent(msg probeResultMsg, v *vipState, spike bool, prevMax uint64) (errEvent, bool) {
	mk := func(kind errKind, detail string) (errEvent, bool) {
		return errEvent{At: msg.At, VIPIdx: msg.VIPIdx, Kind: kind, Detail: detail}, true
	}
	switch {
	case msg.Err != "":
		// shortError collapses "i/o timeout" and "context deadline
		// exceeded" to the literal "timeout" token, so equality is
		// enough to split deadline failures from refused / reset /
		// unreachable errors.
		if msg.Err == "timeout" {
			return mk(kindTimeout, msg.Err)
		}
		return mk(kindNetErr, msg.Err)
	case v.info.scheme != "tcp" && msg.Code >= 400:
		return mk(kindHTTPErr, fmt.Sprintf("HTTP %d", msg.Code))
	case spike:
		return mk(kindSpike, fmt.Sprintf("%s (prev max %s)",
			formatDur(msg.Duration),
			formatDur(time.Duration(prevMax))))
	}
	return errEvent{}, false
}

292
cmd/tester/probe.go Normal file
View File

@@ -0,0 +1,292 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import (
"context"
"crypto/tls"
"math/rand/v2"
"net"
"net/http"
"strconv"
"strings"
"sync/atomic"
"time"
tea "github.com/charmbracelet/bubbletea"
)
// probeOpts bundles the runtime probe configuration coming from the
// command-line flags. Each probe goroutine gets a copy at startup;
// nothing in here mutates during the program's lifetime (pauses flow
// through the global `paused` atomic instead).
type probeOpts struct {
	Interval  time.Duration // per-VIP probe cadence, before jitter is applied
	Timeout   time.Duration // per-request deadline (HTTP client and TCP dial)
	Host      string        // optional Host header override; empty = URL authority
	Path      string        // request path used for HTTP/HTTPS GET probes
	Header    string        // response header name to extract and tally
	Insecure  bool          // skip TLS certificate verification (curl -k semantics)
	KeepAlive bool          // allow HTTP keep-alives (pins probes to one backend)
}
// vipInfo is the immutable per-VIP descriptor built from the maglev
// config at startup. Kept separate from vipState (which holds mutable
// stats) so the probe goroutine can read its target without racing
// the Update loop in model.go.
//
// Identity of a VIP in maglevt is the (scheme, ip, port) tuple, not
// the symbolic name from the config: when we union multiple maglev
// yaml files, the same name can describe different tuples across
// deployments and the tuple is the unambiguous key. The TUI shows
// scheme + ip:port and nothing else, so there's no need to carry a
// display name at all.
type vipInfo struct {
	idx    int    // index into Model.vips (stable for the process lifetime)
	scheme string // "http", "https", or "tcp"
	ip     net.IP // the VIP address
	port   uint16 // the VIP port (TCP)
	url    string // assembled probe URL for http/https; empty for tcp
	client *http.Client // shared per-VIP HTTP client; nil for tcp-only probes
}
// probeResultMsg is the tea.Msg sent from probe goroutines to the UI
// on every probe completion. Bubbletea delivers it into Model.Update
// on the tea dispatch goroutine, so the model can mutate its per-VIP
// state without locks.
type probeResultMsg struct {
	VIPIdx   int           // index into Model.vips (vipInfo.idx of the probed VIP)
	At       time.Time     // wall-clock start time of the probe
	Duration time.Duration // elapsed time from start to completion/failure
	OK       bool          // whether the probe counts as a success
	Code     int           // HTTP status code (0 for tcp-only probes)
	Header   string        // extracted response header value, empty if absent
	Err      string        // empty when OK; populated with a short error string otherwise
}
// hostnameMsg is the tea.Msg the DNS resolver worker sends once it
// has a PTR record for a VIP. The UI uses it to populate
// vipState.hostname so the 'd' toggle has something to show. One
// message per VIP at most — lookup failures just drop silently
// and the VIP stays on its IP literal.
type hostnameMsg struct {
	VIPIdx   int    // index into Model.vips identifying the VIP
	Hostname string // PTR result with the trailing dot stripped
}
// paused is the global pause flag flipped by the spacebar binding in
// Model.Update. Using an atomic rather than a channel keeps the probe
// loop dead-simple — no extra select case, no risk of wedging a
// goroutine on a full buffered channel. Probe loops only Load it;
// the single writer is the tea dispatch goroutine.
var paused atomic.Bool
// newHTTPClient builds the *http.Client used for every probe against
// one VIP. Two settings carry the tool's semantics:
//
//   - DisableKeepAlives = !opts.KeepAlive: off by default, because
//     failover testing needs every probe to open a fresh TCP+TLS
//     connection. A single persistent connection would pin probes to
//     one backend until the keep-alive timer expires, making the
//     tally lie about where the load balancer steers NEW flows.
//     --keepalive/-k restores the pinned-session view on request.
//   - InsecureSkipVerify = opts.Insecure: on by default, matching
//     `curl -k` — VIP certificates almost never match the raw IP
//     literal being probed, so verification would fail on the first
//     probe. Pass --insecure=false for strict verification.
//
// CheckRedirect returns ErrUseLastResponse so a 302 from one backend
// never silently advances to another host and pollutes the tally.
func newHTTPClient(opts probeOpts) *http.Client {
	tlsCfg := &tls.Config{
		InsecureSkipVerify: opts.Insecure, //nolint:gosec // deliberate curl -k semantics, see comment above
	}
	transport := &http.Transport{
		TLSClientConfig:   tlsCfg,
		DisableKeepAlives: !opts.KeepAlive,
		// We dial the VIP literal directly — no DNS involvement —
		// so the default DialContext is already what we want.
	}
	noRedirect := func(*http.Request, []*http.Request) error {
		return http.ErrUseLastResponse
	}
	return &http.Client{
		Transport:     transport,
		Timeout:       opts.Timeout,
		CheckRedirect: noRedirect,
	}
}
// runProbeLoop is the per-VIP probe worker. It waits an initial random
// delay (so N VIPs don't phase-lock onto the same tick after startup),
// then fires a probe every opts.Interval ± 10% jitter until ctx is
// cancelled. Each completed probe posts a probeResultMsg into the
// tea.Program via send.
//
// The loop honors the global `paused` flag by simply skipping the
// probe call while paused — the ticker keeps ticking so a resume
// picks up at the next natural tick boundary instead of fast-
// forwarding through a burst of back-to-back probes.
func runProbeLoop(ctx context.Context, vip *vipInfo, opts probeOpts, send func(tea.Msg)) {
	// Initial offset: uniformly in [0, interval) so N goroutines
	// started together spread out across one interval window rather
	// than all firing at t=0. Guard the non-positive case explicitly:
	// rand.Int64N panics when its argument is <= 0, which an operator
	// can otherwise trigger with --interval 0.
	if opts.Interval > 0 {
		select {
		case <-ctx.Done():
			return
		case <-time.After(time.Duration(rand.Int64N(int64(opts.Interval)))):
		}
	}
	for {
		sleepFor := jitter(opts.Interval)
		if !paused.Load() {
			result := doProbe(ctx, vip, opts)
			send(result)
		}
		select {
		case <-ctx.Done():
			return
		case <-time.After(sleepFor):
		}
	}
}
// runDNSLookup performs one reverse-DNS (PTR) lookup for vip.ip and,
// on success, delivers a hostnameMsg to the UI. It runs once per VIP
// at startup and then exits — PTR records change rarely enough that
// re-querying isn't worth it, and a restart picks up a new mapping.
//
// A 3-second timeout keeps a broken resolver from wedging the worker
// for the life of the program; on timeout or NXDOMAIN we give up
// silently and the VIP keeps showing its IP literal. The trailing
// dot is stripped so the rendered hostname matches what the operator
// typed in their zone file.
func runDNSLookup(parent context.Context, vip *vipInfo, send func(tea.Msg)) {
	ctx, cancel := context.WithTimeout(parent, 3*time.Second)
	defer cancel()
	names, err := net.DefaultResolver.LookupAddr(ctx, vip.ip.String())
	if err != nil || len(names) == 0 {
		return // best-effort: keep the IP literal on any failure
	}
	hostname := strings.TrimSuffix(names[0], ".")
	send(hostnameMsg{VIPIdx: vip.idx, Hostname: hostname})
}
// jitter scales d by a uniform factor in [0.9, 1.1) — the same ±10%
// jitter the checker uses, same rationale: probes don't phase-lock on
// a wall-clock tick across every VIP in the config.
func jitter(d time.Duration) time.Duration {
if d <= 0 {
return d
}
return time.Duration(float64(d) * (0.9 + 0.2*rand.Float64()))
}
// doProbe issues one probe against vip and returns a filled-in
// probeResultMsg. HTTP / HTTPS go through vip.client with a GET
// request against opts.Path (default /.well-known/ipng/healthz),
// a Host-header override, and header extraction. Non-HTTP VIPs do
// a plain TCP connect — success if the three-way handshake
// completes before opts.Timeout.
//
// GET rather than HEAD: a common health-check path (healthz,
// status, /-/healthy) often returns 204 or 200 and is cheap to
// serve, but some handlers don't wire HEAD and would 405 us back.
// GET works against every reasonable implementation and the
// rolling-window latency is unchanged (we time until headers, not
// until the body completes — resp.Body.Close() discards the body
// without reading it). The defer below still closes the body so
// the transport can recycle the connection if --keepalive is on.
func doProbe(parent context.Context, vip *vipInfo, opts probeOpts) probeResultMsg {
	start := time.Now()
	result := probeResultMsg{VIPIdx: vip.idx, At: start}
	ctx, cancel := context.WithTimeout(parent, opts.Timeout)
	defer cancel()
	if vip.scheme == "tcp" {
		// Connect-only liveness probe: success means the handshake
		// completed within the timeout; we close immediately.
		d := net.Dialer{Timeout: opts.Timeout}
		conn, err := d.DialContext(ctx, "tcp",
			net.JoinHostPort(vip.ip.String(), strconv.Itoa(int(vip.port))))
		result.Duration = time.Since(start)
		if err != nil {
			result.Err = shortError(err)
			return result
		}
		_ = conn.Close()
		result.OK = true
		return result
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, vip.url, nil)
	if err != nil {
		result.Duration = time.Since(start)
		result.Err = shortError(err)
		return result
	}
	// Host header: user override, else derive from the VIP itself
	// (which the kernel already put in the URL, so leaving req.Host
	// empty means "use the URL authority"). We only touch req.Host
	// when the operator explicitly passed --host.
	if opts.Host != "" {
		req.Host = opts.Host
	}
	resp, err := vip.client.Do(req)
	result.Duration = time.Since(start)
	if err != nil {
		result.Err = shortError(err)
		return result
	}
	defer func() { _ = resp.Body.Close() }()
	result.Code = resp.StatusCode
	// 2xx and 3xx both count as OK; redirects are never followed
	// (see newHTTPClient), so a 3xx is a terminal, healthy answer.
	result.OK = resp.StatusCode >= 200 && resp.StatusCode < 400
	if opts.Header != "" {
		result.Header = resp.Header.Get(opts.Header)
	}
	return result
}
// shortError collapses common Go net errors into a short string
// suitable for a narrow table cell. url.Error wrappings, dial
// contexts, and "i/o timeout" trailers all get trimmed so the LAST
// column shows something legible like "refused" / "timeout" /
// "no route" instead of a 120-char wrapped error.
func shortError(err error) string {
if err == nil {
return ""
}
msg := err.Error()
// net/url-wrapped errors: keep only the last segment, which
// holds the actual cause.
if i := strings.LastIndex(msg, ": "); i >= 0 {
msg = msg[i+2:]
}
// Normalise common kernel errnos and Go's wrappers.
switch {
case strings.Contains(msg, "connection refused"):
return "refused"
case strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "context deadline exceeded"):
return "timeout"
case strings.Contains(msg, "no route to host"):
return "no-route"
case strings.Contains(msg, "network is unreachable"):
return "net-unrch"
case strings.Contains(msg, "host is unreachable"):
return "host-unrch"
case strings.Contains(msg, "connection reset"):
return "reset"
case strings.Contains(msg, "EOF"):
return "eof"
case strings.Contains(strings.ToLower(msg), "tls"):
return "tls-err"
}
// Last resort: truncate anything longer than the LAST column.
if len(msg) > 8 {
msg = msg[:8]
}
return msg
}

162
cmd/tester/stats.go Normal file
View File

@@ -0,0 +1,162 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import "sort"
// rollingSize is the bounded history maglevt keeps per VIP for
// latency percentiles and success-ratio display. At 100 samples
// and the default 100ms probe interval, this is a ~10s window —
// short enough to react quickly to failover events, long enough
// that p50/p95 are statistically meaningful.
const rollingSize = 100

// rolling is a bounded-window (rollingSize) counter for per-VIP probe
// results. It tracks success/failure totals, running sum for mean
// latency, and a ring of individual samples so percentiles can be
// computed on demand. Non-thread-safe: everything that touches a
// rolling lives on the bubbletea dispatch goroutine, so no locking
// is needed. The reset-on-rotate logic keeps ok/fail/sumNS in sync
// with the ring contents as old samples get overwritten.
type rolling struct {
	samples []sample
	idx     int    // next write position
	n       int    // number of valid samples (0..rollingSize)
	ok      int    // successful samples currently in the window
	fail    int    // failed samples currently in the window
	sumNS   uint64 // running latency sum (ns) over window samples
	minNS   uint64 // 0 while n == 0
	maxNS   uint64 // largest latency (ns) currently in the window
}

// sample is one probe observation stored in the rolling ring.
type sample struct {
	ns uint64 // probe latency in nanoseconds
	ok bool   // whether the probe counted as a success
}
// newRolling constructs an empty rolling window with its sample
// ring pre-allocated at rollingSize, ready for the first record().
func newRolling() *rolling {
	ring := make([]sample, rollingSize)
	return &rolling{samples: ring}
}
// reset wipes the window back to its freshly-constructed state,
// zeroing every aggregate and every ring slot while keeping the
// ring's backing array. Triggered by the 'r' keybinding and by
// the --reset flow.
func (r *rolling) reset() {
	ring := r.samples
	for i := range ring {
		ring[i] = sample{}
	}
	*r = rolling{samples: ring}
}
// record folds one probe result into the rolling window. When the
// ring is already full, the sample about to be overwritten first
// has its contribution backed out of the aggregates, so ok/fail/
// sumNS always mirror the ring's live contents. min/max are then
// re-derived with a full scan of the valid samples — O(rollingSize)
// per insert, which is trivially cheap at 100 entries and far
// simpler than maintaining incremental extrema under eviction.
func (r *rolling) record(ns uint64, ok bool) {
	if r.n < rollingSize {
		r.n++
	} else {
		// Ring full: retire the oldest sample's contribution.
		evicted := r.samples[r.idx]
		r.sumNS -= evicted.ns
		if evicted.ok {
			r.ok--
		} else {
			r.fail--
		}
	}
	r.samples[r.idx] = sample{ns: ns, ok: ok}
	r.idx = (r.idx + 1) % rollingSize
	r.sumNS += ns
	if ok {
		r.ok++
	} else {
		r.fail++
	}
	// Rescan the live portion of the ring for min/max; 100 uint64
	// reads is noise on any machine maglevt would ever run on.
	r.minNS, r.maxNS = ^uint64(0), 0
	for _, s := range r.samples[:r.n] {
		if s.ns < r.minNS {
			r.minNS = s.ns
		}
		if s.ns > r.maxNS {
			r.maxNS = s.ns
		}
	}
}
// percentiles reports (p50, p95, p99) in nanoseconds across the
// live window, or all zeros when no samples exist yet. The ring is
// copied into a scratch slice and sorted on every call; at most
// rollingSize elements, that's cheap enough for a per-UI-frame call
// (roughly every 250ms). Indexes clamp to the last element so a
// half-filled warmup window still yields sane values even when the
// p95/p99 slots don't exist yet as distinct positions.
func (r *rolling) percentiles() (p50, p95, p99 uint64) {
	if r.n == 0 {
		return 0, 0, 0
	}
	scratch := make([]uint64, 0, r.n)
	for _, s := range r.samples[:r.n] {
		scratch = append(scratch, s.ns)
	}
	sort.Slice(scratch, func(a, b int) bool { return scratch[a] < scratch[b] })
	at := func(pct int) uint64 {
		i := r.n * pct / 100
		if i >= r.n {
			i = r.n - 1
		}
		return scratch[i]
	}
	return at(50), at(95), at(99)
}
// successPct reports, as a percentage, how many probes currently in
// the window succeeded (2xx/3xx for HTTP, connect-OK for tcp).
// An empty window yields 0.
func (r *rolling) successPct() float64 {
	if r.n > 0 {
		return 100.0 * float64(r.ok) / float64(r.n)
	}
	return 0
}
// isSpike reports whether ns lands more than 25% above the window's
// current maximum. Intended to run *before* record() on the same
// sample, so the comparison uses the previous window max rather
// than the one the new sample would produce; a caller that sees a
// spike typically logs an errEvent and then record()s the sample
// as usual. A warmup guard (fewer than 10 samples, or a zero max)
// suppresses verdicts until a stable baseline exists — during
// warmup the max is whatever happened to arrive first, and a 1.25×
// threshold against that is meaningless.
func (r *rolling) isSpike(ns uint64) bool {
	if r.n < 10 || r.maxNS == 0 {
		return false
	}
	// 1.25 × maxNS in pure integer arithmetic.
	limit := r.maxNS + r.maxNS/4
	return ns > limit
}

724
cmd/tester/view.go Normal file
View File

@@ -0,0 +1,724 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import (
"fmt"
"sort"
"strings"
"time"
"github.com/charmbracelet/lipgloss"
)
// Styles. Colours are ANSI 256 indices so maglevt renders the same
// across iTerm, Alacritty, xterm, tmux, and screen without depending
// on truecolor support.
var (
	styleHeader  = lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("14")) // top status line
	styleSection = lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("14")) // panel section titles
	styleDim     = lipgloss.NewStyle().Foreground(lipgloss.Color("8"))             // de-emphasised / idle cells
	styleHint    = lipgloss.NewStyle().Foreground(lipgloss.Color("244"))           // footer keybinding hints
	styleOK      = lipgloss.NewStyle().Foreground(lipgloss.Color("10")).Bold(true) // success (green)
	styleWarn    = lipgloss.NewStyle().Foreground(lipgloss.Color("11")).Bold(true) // warning (yellow)
	styleErr     = lipgloss.NewStyle().Foreground(lipgloss.Color("9")).Bold(true)  // failure (red)
	styleWhite   = lipgloss.NewStyle().Foreground(lipgloss.Color("15")).Bold(true) // active tally entry
	styleHTTP    = lipgloss.NewStyle().Foreground(lipgloss.Color("12"))            // http/https PROTO tag
	styleTCP     = lipgloss.NewStyle().Foreground(lipgloss.Color("13"))            // tcp PROTO tag
	styleRunning = lipgloss.NewStyle().Foreground(lipgloss.Color("10")).Bold(true) // RUNNING header state
	stylePaused  = lipgloss.NewStyle().Foreground(lipgloss.Color("11")).Bold(true) // PAUSED header state
)
// View renders the full TUI. One call per redraw; re-invoked whenever
// bubbletea dispatches a message. No mutation — every field it reads
// is in Model or vipState, so it's safe to call concurrently with
// Update's message handling (tea serialises them anyway).
//
// Section order: header → probe table → tally (if --header collected
// any samples) → events panel (if any anomalies have been observed)
// → [blank padding] → footer. The tally lives above the events so a
// growing events list pushes itself against the footer rather than
// shoving the tally up and down every time a new anomaly lands —
// the tally is the panel the operator stares at during a failover
// test, and it should stay put.
//
// The footer is pinned to the last row of the terminal: we count
// how many lines the above sections produced, then pad with enough
// blank lines to push the footer to row m.height. That keeps the
// `[q] quit …` hint visible at the bottom even when the content
// doesn't fill the screen, and matches the convention most TUIs
// follow. When the content DOES overflow (tiny terminal / huge
// config), we fall back to a single blank line between content
// and footer so the footer is still reachable by scrolling.
func (m Model) View() string {
	// Help overlay replaces the whole screen when toggled on.
	if m.help {
		return m.viewHelp()
	}
	// Phase 1: build everything that isn't the events panel, so we
	// can measure how many rows are left for events. This is the
	// "fixed" content — header, table, and (maybe) tally. Events
	// then absorb whatever space remains between this block and
	// the screen-pinned footer.
	var pre strings.Builder
	pre.WriteString(m.viewHeader())
	pre.WriteString("\n\n")
	pre.WriteString(m.viewTable())
	if m.opts.Header != "" && m.anyTallied() {
		pre.WriteString("\n")
		pre.WriteString(m.viewTally())
	}
	preContent := pre.String()
	footer := m.viewFooter()
	// Phase 2: compute how many event rows can fit.
	//
	// Screen budget:
	//   preContent lines (header + table + optional tally)
	//   + 1 blank separator
	//   + 1 events section header
	//   + N events rows        ← the unknown we're solving for
	//   + M footer-pad blank rows
	//   + 1 footer line
	//   = m.height
	//
	// Solving for max N (before the footer-pad term kicks in):
	//   N = m.height - preLines - 3  (3 = separator + events hdr + footer)
	//
	// When there's no room for events (tiny terminal, or tally
	// already fills the screen), maxEvents clamps to 0 and the
	// events section is skipped entirely, letting the footer pad
	// logic below handle the spacing.
	preLines := strings.Count(preContent, "\n")
	// Budget arithmetic for the events panel:
	//   preLines + 1 (separator) + 1 (events hdr) + maxEvents + 1 (footer) = m.height
	// So maxEvents = m.height - preLines - 3. A negative value
	// means the terminal is too tight to even frame the panel
	// (section header + separator + footer wouldn't fit), in
	// which case we skip the whole section. A zero value means
	// we render only the "Recent events: (none)" placeholder
	// with no rows below it — enough to mark the panel's
	// position on the screen during the quiet period before the
	// first anomaly arrives.
	maxEvents := -1
	if m.height > 0 {
		maxEvents = m.height - preLines - 3
	}
	var content strings.Builder
	content.WriteString(preContent)
	if maxEvents >= 0 {
		content.WriteString("\n")
		content.WriteString(m.viewEvents(maxEvents))
	}
	contentStr := content.String()
	// Phase 3: pin footer to last screen row. Alt-screen guarantees
	// the view starts at row 1, so we need (m.height - 1) lines of
	// content above the footer. m.height == 0 means no
	// WindowSizeMsg yet (first frame); degrade to a single blank
	// separator so the footer is still visible even if the exact
	// row is wrong.
	if m.height <= 0 {
		return contentStr + "\n\n" + footer
	}
	// contentStr ends with a newline, so strings.Count is exactly the
	// number of visible rows it occupies, and the cursor is parked at
	// row contentLines+1 after it's written. To land the footer on
	// the last terminal row we need padLines = m.height-contentLines-1
	// extra newlines between content and footer. padLines==0 is the
	// perfect-fit case (events panel was sized with this in mind) and
	// must NOT be bumped to 1 — that would push the footer to row
	// m.height+1 and scroll the header off the top of the alt-screen.
	contentLines := strings.Count(contentStr, "\n")
	padLines := m.height - contentLines - 1
	if padLines < 0 {
		padLines = 0
	}
	return contentStr + strings.Repeat("\n", padLines) + footer
}
// viewHeader renders the single top status line: config path, probe
// cadence and timeout, the tally header name, the RUNNING/PAUSED
// state (read from the package-level paused atomic), and uptime
// rounded to whole seconds.
func (m Model) viewHeader() string {
	state := stylePaused.Render("PAUSED")
	if !paused.Load() {
		state = styleRunning.Render("RUNNING")
	}
	line := fmt.Sprintf(
		"maglevt — %s — interval: %s timeout: %s header: %s [%s] uptime: %s",
		m.cfgPath, m.opts.Interval, m.opts.Timeout, m.opts.Header,
		state, time.Since(m.startAt).Round(time.Second))
	return styleHeader.Render(line)
}
// Table column widths (visible characters, not bytes). All cell
// rendering goes through padVisibleLeft / padVisibleRight which
// measure width via lipgloss.Width — that strips ANSI escape
// sequences before counting, so ANSI-styled cells pad correctly
// instead of under-counting by the escape-code overhead.
//
// There's no VIP-name column: identity is the (scheme, ip, port)
// tuple. PROTO + ADDR together are the row key, which also makes
// the display independent of whatever names the source yaml files
// used (potentially conflicting across a multi-config union).
const (
	colSchemeW = 5  // "http", "https", "tcp" — widest is 5
	colAFW     = 2  // "v4" / "v6"
	colAddrW   = 40 // address + port, bracketed IPv6; full 8-group expansion fits
	colLastW   = 10 // lastCell output: status code, short error token, ok/fail, or dash
	colNW      = 7  // "N" lifetime probe count; 7 digits handles >24h @ 100ms
	colFailW   = 6  // "FAIL" lifetime failure count
	colOKW     = 7  // rolling success percentage, e.g. "100.0"
	colP50W    = 9  // latency columns share one width; formatDur output fits
	colP95W    = 9
	colP99W    = 9
	colMaxW    = 9
)
// padVisibleRight left-aligns s inside a cell of the given visible
// width by appending spaces. Width is measured via lipgloss.Width,
// which strips ANSI escapes before counting, so styled cells pad
// correctly. Strings already at or beyond the target width come
// back untouched.
func padVisibleRight(s string, width int) string {
	if gap := width - lipgloss.Width(s); gap > 0 {
		return s + strings.Repeat(" ", gap)
	}
	return s
}
// padVisibleLeft right-aligns s inside a cell of the given visible
// width — the mirror image of padVisibleRight.
func padVisibleLeft(s string, width int) string {
	if gap := width - lipgloss.Width(s); gap > 0 {
		return strings.Repeat(" ", gap) + s
	}
	return s
}
// truncateVisible clamps s to at most width *visible* characters,
// preserving embedded ANSI escape sequences by copying runs between
// escapes. A single ellipsis replaces the last visible character
// when truncation happens so the operator can see the cell was cut.
//
// When any escape sequences were copied into the truncated prefix,
// an explicit SGR reset is appended: the cut may land inside a
// styled run whose closing reset was in the discarded tail, and
// without one the open style would bleed into the trailing padding
// and the next cell on the row.
func truncateVisible(s string, width int) string {
	if lipgloss.Width(s) <= width {
		return s
	}
	if width <= 0 {
		return ""
	}
	// Walk runes, copying ANSI escape sequences verbatim (they
	// don't consume visible width) and counting printable runes.
	var b strings.Builder
	visible := 0
	inEscape := false
	sawEscape := false
	for _, r := range s {
		if inEscape {
			b.WriteRune(r)
			if r == 'm' {
				inEscape = false
			}
			continue
		}
		if r == 0x1b {
			b.WriteRune(r)
			inEscape = true
			sawEscape = true
			continue
		}
		if visible+1 >= width {
			b.WriteRune('…')
			break
		}
		b.WriteRune(r)
		visible++
	}
	if sawEscape {
		// Close any SGR state left open by the cut. Redundant when
		// the prefix already ended with a reset, but a duplicate
		// reset is harmless and has zero visible width.
		b.WriteString("\x1b[0m")
	}
	return b.String()
}
// viewTable renders the probe table: one dim header row followed by
// one line per VIP. Header labels are plain text (no per-cell
// styling) so their widths match the data rows 1:1 without
// lipgloss.Width gymnastics; each label must fit its column —
// "PROTO" at 5 chars fills colSchemeW exactly. LAST is left-aligned
// so it lines up with the first tally entry on the row below,
// letting the operator eye-align a status code with the backend it
// corresponds to.
func (m Model) viewTable() string {
	cells := []string{
		padVisibleRight("PROTO", colSchemeW),
		padVisibleRight("AF", colAFW),
		padVisibleRight("ADDR", colAddrW),
		padVisibleRight("LAST", colLastW),
		padVisibleLeft("N", colNW),
		padVisibleLeft("FAIL", colFailW),
		padVisibleLeft("OK%", colOKW),
		padVisibleLeft("p50", colP50W),
		padVisibleLeft("p95", colP95W),
		padVisibleLeft("p99", colP99W),
		padVisibleLeft("max", colMaxW),
	}
	var b strings.Builder
	b.WriteString(styleDim.Render(" " + strings.Join(cells, " ")))
	b.WriteString("\n")
	for _, v := range m.vips {
		b.WriteString(m.viewRow(v))
		b.WriteString("\n")
	}
	return b.String()
}
// viewRow renders one data line of the probe table for a single VIP.
// N and FAIL are dim reference counters (FAIL switches to red once
// non-zero) — the primary health signals are LAST and OK%. The OK%
// and latency cells show an em-dash until the rolling window holds
// at least one sample; colourOK additionally keeps OK% dim until
// the window is 10 samples deep.
func (m Model) viewRow(v *vipState) string {
	okStr, p50Str, p95Str, p99Str, maxStr := "—", "—", "—", "—", "—"
	if v.rolling.n > 0 {
		okStr = fmt.Sprintf("%.1f", v.rolling.successPct())
		p50, p95, p99 := v.rolling.percentiles()
		p50Str = formatDur(time.Duration(p50))
		p95Str = formatDur(time.Duration(p95))
		p99Str = formatDur(time.Duration(p99))
		maxStr = formatDur(time.Duration(v.rolling.maxNS))
	}
	failStyled := styleDim.Render(fmt.Sprintf("%d", v.totalFails))
	if v.totalFails > 0 {
		// Red only once a failure actually happened; zero reads as
		// "fine" and stays dim.
		failStyled = styleErr.Render(fmt.Sprintf("%d", v.totalFails))
	}
	cells := []string{
		padVisibleRight(schemeLabel(v.info.scheme), colSchemeW),
		padVisibleRight(afLabel(v.info), colAFW),
		padVisibleRight(truncateVisible(m.displayAddr(v), colAddrW), colAddrW),
		padVisibleRight(lastCell(v), colLastW),
		padVisibleLeft(styleDim.Render(fmt.Sprintf("%d", v.totalProbes)), colNW),
		padVisibleLeft(failStyled, colFailW),
		padVisibleLeft(colourOK(okStr, v.rolling.successPct(), v.rolling.n), colOKW),
		padVisibleLeft(p50Str, colP50W),
		padVisibleLeft(p95Str, colP95W),
		padVisibleLeft(p99Str, colP99W),
		padVisibleLeft(maxStr, colMaxW),
	}
	return " " + strings.Join(cells, " ")
}
// afLabel maps a VIP to its AF-column tag: "v4" when the address
// has a 4-byte form, "v6" otherwise (net.IP.To4 returns nil for a
// native IPv6 address, non-nil for v4 and 4-in-6 mapped ones).
func afLabel(v *vipInfo) string {
	if v.ip.To4() != nil {
		return "v4"
	}
	return "v6"
}
// displayAddr picks the ADDR cell text for one VIP: with the DNS
// toggle on and a PTR result available it shows "hostname:port",
// otherwise the raw IP literal (bracketed for IPv6). The toggle
// lives on the Model rather than per-VIP, so pressing 'd' flips
// every row on the next redraw — the behaviour the operator
// expects.
func (m Model) displayAddr(v *vipState) string {
	if !m.showDNS || v.hostname == "" {
		return vipAddrString(v.info)
	}
	return fmt.Sprintf("%s:%d", v.hostname, v.info.port)
}
// schemeLabel wraps the scheme token in its PROTO-column colour:
// styleHTTP for "http"/"https", styleTCP (rendering the literal
// "tcp") for anything else. The result is unpadded; callers align
// it with padVisibleRight, or use schemeAddrLabel for a pre-padded
// "PROTO AF ADDR" tuple aligned with the main table.
func schemeLabel(scheme string) string {
	if scheme == "http" || scheme == "https" {
		return styleHTTP.Render(scheme)
	}
	return styleTCP.Render("tcp")
}
// schemeAddrLabel builds the "PROTO AF ADDR" prefix shared by the
// tally and events panels, padding each fixed-width cell before the
// next is appended so both panels line up exactly under the main
// probe table's first three columns. Being the single source of
// truth keeps the two call sites from drifting out of sync with
// the table layout, and ADDR goes through displayAddr so the 'd'
// DNS toggle flips every section of the TUI in lockstep.
func (m Model) schemeAddrLabel(v *vipState) string {
	proto := padVisibleRight(schemeLabel(v.info.scheme), colSchemeW)
	af := padVisibleRight(afLabel(v.info), colAFW)
	return proto + " " + af + " " + m.displayAddr(v)
}
// vipAddrString formats an address+port for the ADDR column. IPv6
// literals get brackets so the address's own colons don't blur
// into the port separator; no scheme prefix — the PROTO column
// carries that, freeing width for the address itself.
func vipAddrString(v *vipInfo) string {
	host := v.ip.String()
	if v.ip.To4() != nil {
		return fmt.Sprintf("%s:%d", host, v.port)
	}
	return fmt.Sprintf("[%s]:%d", host, v.port)
}
// lastCell renders the LAST column for one VIP: a dim dash before
// the first probe, the short error token in red when the latest
// probe errored, ok/fail for tcp probes, and otherwise the HTTP
// status code coloured by class — 2xx green, 3xx/4xx yellow,
// everything else red.
func lastCell(v *vipState) string {
	switch {
	case v.lastAt.IsZero():
		return styleDim.Render("—")
	case v.lastErr != "":
		return styleErr.Render(v.lastErr)
	case v.info.scheme == "tcp":
		if v.lastOK {
			return styleOK.Render("ok")
		}
		return styleErr.Render("fail")
	}
	// HTTP / HTTPS: show the status code, coloured by class.
	code := fmt.Sprintf("%d", v.lastCode)
	switch {
	case v.lastCode >= 200 && v.lastCode < 300:
		return styleOK.Render(code)
	case v.lastCode >= 300 && v.lastCode < 500:
		// 3xx and 4xx share the warning colour.
		return styleWarn.Render(code)
	default:
		return styleErr.Render(code)
	}
}
// colourOK styles the pre-formatted OK% cell text by threshold: dim
// until the window holds at least 10 samples (no verdict on a
// cold start — one lucky probe shouldn't paint a green "100.0"),
// then green at >=99%, yellow at >=95%, red below. The decision
// uses the raw float rather than the rendered text so "99.9" and
// "99.0" can't collapse into the same colour via display rounding.
func colourOK(txt string, pct float64, n int) string {
	if n < 10 {
		return styleDim.Render(txt)
	}
	if pct >= 99 {
		return styleOK.Render(txt)
	}
	if pct >= 95 {
		return styleWarn.Render(txt)
	}
	return styleErr.Render(txt)
}
// anyTallied reports whether at least one VIP has collected a
// response-header tally entry; it gates the tally panel's
// visibility in View.
func (m Model) anyTallied() bool {
	for _, v := range m.vips {
		if len(v.tally) != 0 {
			return true
		}
	}
	return false
}
// viewTally renders the response-header tally panel. Each row is
// labelled with the same PROTO + AF + ADDR tuple as the probe
// table so the operator can correlate rows without a symbolic VIP
// name; labelW is the combined width of those columns plus their
// inter-column gaps. A pre-pass finds the widest "name:count"
// entry across every VIP, and every rendered entry pads to that
// width so the grid stays vertically aligned as counts grow — one
// backend ticking from 999 to 1000 widens all entries at once
// instead of shifting a single row's tail. TCP VIPs carry no
// response headers and are skipped.
func (m Model) viewTally() string {
	const labelW = colSchemeW + 2 + colAFW + 2 + colAddrW
	widest := 0
	for _, v := range m.vips {
		if v.info.scheme == "tcp" {
			continue
		}
		for name, count := range v.tally {
			if w := len(name) + 1 + countDigits(count); w > widest {
				widest = w
			}
		}
	}
	var out strings.Builder
	out.WriteString(styleSection.Render(fmt.Sprintf("%s tally:", m.opts.Header)))
	out.WriteString("\n")
	for _, v := range m.vips {
		if v.info.scheme != "tcp" && len(v.tally) > 0 {
			out.WriteString(m.renderTallyRow(v, widest, labelW))
		}
	}
	return out.String()
}
// renderTallyRow emits one tally line for a single VIP. Backend
// names are sorted alphabetically — stable across failovers and
// under the jitter where "equally loaded" backends shuffle exact
// counts probe to probe; a count-sorted order looks informative in
// a screenshot but flickers on a live display, so alphabetical
// pins every label to its own column for the process lifetime.
//
// Colour is a binary activity marker: white when the backend
// gained at least one hit since the last tallyOld snapshot
// (~tallyWindow), dim grey otherwise. Green is deliberately not
// used — it means "success" elsewhere in the TUI (OK%, 2xx), and
// an active tally entry only means "in the rotation right now",
// not "good". A negative delta (transient post-reset state where
// the tally dipped below the snapshot) counts as inactive until
// the snapshot rotates on the next tick. Right after startup or a
// reset the snapshot map is empty, so every positive count renders
// white — correct, since no drain has been observed yet.
func (m Model) renderTallyRow(v *vipState, maxEntryW, labelW int) string {
	names := make([]string, 0, len(v.tally))
	for name := range v.tally {
		names = append(names, name)
	}
	sort.Strings(names)
	entries := make([]string, 0, len(names))
	for _, name := range names {
		count := v.tally[name]
		cell := fmt.Sprintf("%s:%d", name, count)
		if count-v.tallyOld[name] > 0 {
			cell = styleWhite.Render(cell)
		} else {
			cell = styleDim.Render(cell)
		}
		entries = append(entries, padVisibleRight(cell, maxEntryW))
	}
	label := truncateVisible(m.schemeAddrLabel(v), labelW)
	return " " + padVisibleRight(label, labelW) + " " +
		strings.Join(entries, " ") + "\n"
}
// countDigits reports how many decimal digits n occupies (zero
// takes one digit). Allocation-free alternative to measuring
// fmt.Sprintf("%d", n), and sufficient for the non-negative tally
// counts it's fed.
func countDigits(n int) int {
	if n == 0 {
		return 1
	}
	digits := 0
	for ; n > 0; n /= 10 {
		digits++
	}
	return digits
}
// viewEvents renders the anomaly panel into at most maxRows event
// lines; the budget is computed per-frame by View from the
// terminal height, and a negative budget suppresses the section
// entirely. Only the newest events fit — when older history is
// clipped the section header reads "showing N of M" so the
// operator knows there's more than what's on screen. With no
// events recorded yet, a dim "(none)" placeholder still marks
// where the panel lives, so its empty state doesn't look broken.
//
// Each row: a millisecond-precision timestamp, the VIP's
// PROTO+AF+ADDR label (same columns as the main table so the eye
// can jump between the two), the event kind in a fixed 9-char
// slot, and a free-form detail string. Spikes render yellow to
// distinguish "responded, but slowly" from the red network/HTTP
// failures. maxRows budgets event rows only — View already
// reserved an extra row for this section's header.
func (m Model) viewEvents(maxRows int) string {
	if maxRows < 0 {
		return ""
	}
	const (
		labelW = colSchemeW + 2 + colAFW + 2 + colAddrW
		kindW  = 9
		tsW    = 12 // "HH:MM:SS.mmm"
	)
	var out strings.Builder
	if len(m.events) == 0 {
		fmt.Fprintf(&out, "%s %s\n",
			styleSection.Render("Recent events:"),
			styleDim.Render("(none)"))
		return out.String()
	}
	shown := m.events
	if len(shown) > maxRows {
		shown = shown[len(shown)-maxRows:]
		fmt.Fprintf(&out, "%s\n", styleSection.Render(
			fmt.Sprintf("Recent events (showing %d of %d):", len(shown), len(m.events))))
	} else {
		fmt.Fprintf(&out, "%s\n", styleSection.Render("Recent events:"))
	}
	for _, ev := range shown {
		vip := m.vips[ev.VIPIdx]
		stamp := styleDim.Render(ev.At.Format("15:04:05.000"))
		kindTxt := ev.Kind.String()
		kindCell := styleErr.Render(padVisibleRight(kindTxt, kindW))
		if ev.Kind == kindSpike {
			kindCell = styleWarn.Render(padVisibleRight(kindTxt, kindW))
		}
		fmt.Fprintf(&out, " %s %s %s %s\n",
			padVisibleRight(stamp, tsW),
			padVisibleRight(truncateVisible(m.schemeAddrLabel(vip), labelW), labelW),
			kindCell,
			ev.Detail,
		)
	}
	return out.String()
}
// viewFooter renders the keybinding hint line. The dns hint flips
// its wording to show the action the next 'd' press will perform.
func (m Model) viewFooter() string {
	dns := "[d] dns on"
	if m.showDNS {
		dns = "[d] dns off"
	}
	return styleHint.Render(strings.Join([]string{
		"[q] quit",
		"[space] pause/resume",
		"[r] reset",
		dns,
		"[h] help",
	}, " "))
}
// viewHelp renders the full-screen help overlay. View short-circuits
// to it while m.help is set, replacing every other section; the
// same 'h' / '?' toggle dismisses it.
func (m Model) viewHelp() string {
	lines := []string{
		styleHeader.Render("maglevt — keybindings"),
		"",
		" q / ctrl-c quit",
		" space pause / resume all probe loops",
		" r reset rolling stats + tally + uptime",
		" d toggle hostname / IP-literal ADDR display",
		" h / ? toggle this help overlay",
		"",
		styleSection.Render("columns"),
		"",
		" PROTO http / https (port 80/443) or tcp (everything else)",
		" AF address family — v4 or v6",
		" ADDR VIP address + port, or reverse-DNS hostname (toggle 'd')",
		" LAST most-recent probe result, coloured by class",
		" N lifetime probe count since startup (or last 'r')",
		" FAIL lifetime failure count; red when non-zero",
		" OK% success ratio over the last 100 samples",
		" p50/p95/p99 latency percentiles over the last 100 samples",
		" max worst-case latency over the last 100 samples",
		"",
		styleSection.Render("tally"),
		"",
		" Running count of the response header configured via --header",
		" (default X-IPng-Frontend), grouped by PROTO + ADDR. Entries",
		" are sorted alphabetically so each backend owns its column",
		" for the lifetime of the process. Colour marks recent",
		" activity over the last ~5s window:",
		" white received at least one hit in the window",
		" grey idle — flushed or fully drained",
		" Reset with 'r'.",
		"",
		styleSection.Render("events panel"),
		"",
		" Rolling list of probes that warrant attention. The panel auto-",
		" sizes to fill whatever vertical space is left between the tally",
		" and the footer, so a taller terminal shows more history. Older",
		" events are still stored (up to 500) and re-appear if the",
		" terminal is enlarged; the header notes \"showing N of M\" when",
		" the display is clipping. Four event kinds:",
		" timeout probe hit its --timeout deadline",
		" http-err response carried a 4xx or 5xx status code",
		" net-err TCP refused, reset, unreachable, or TLS error",
		" spike successful probe more than 25% above the rolling",
		" window max (warmup: at least 10 samples required)",
		" Cleared along with everything else by 'r'.",
		"",
		styleHint.Render("Press h or ? again to dismiss."),
	}
	return strings.Join(lines, "\n")
}
// formatDur renders a time.Duration in a compact form for the
// p50/p95 columns: sub-millisecond shows in µs, sub-second in ms,
// and anything longer in seconds with a decimal place. Fits within
// the 9-char column cleanly.
func formatDur(d time.Duration) string {
if d <= 0 {
return "-"
}
switch {
case d < time.Millisecond:
return fmt.Sprintf("%dµs", d.Microseconds())
case d < time.Second:
return fmt.Sprintf("%.1fms", float64(d.Microseconds())/1000)
default:
return fmt.Sprintf("%.2fs", d.Seconds())
}
}