New maglevt TUI component: out-of-band VIP health monitor

A small bubbletea TUI that reads maglev.yaml (repeatable --config),
enumerates every VIP, and probes each from outside the load balancer
on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get
a GET against a configurable URI (default /.well-known/ipng/healthz)
with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL
counters, LAST status, and a response-header tally. Non-HTTP VIPs
get a TCP connect probe. A bounded error panel classifies anomalies
as timeout / http-err / net-err / spike and auto-sizes to fill the
screen.

Utility: during a failover drill (backend flap, AS drain, config
push) the tally panel shows which backend each VIP is actually
steering to, with two-colour activity highlighting over a 5s
window — white = receiving traffic, grey = drained. Paired with
the rolling OK%/latency columns it gives an at-a-glance answer to
"is the VIP healthy from the outside right now, and which backend
is it hitting", without relying on maglevd's own view of the
world.

Also bumps Makefile/go.mod to build the new binary.
This commit is contained in:
2026-04-15 01:23:34 +02:00
parent 744b1cb3d2
commit 6293521157
8 changed files with 1890 additions and 1 deletions

305
cmd/tester/main.go Normal file
View File

@@ -0,0 +1,305 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
// maglevt is a tiny TUI that reads maglev.yaml, enumerates every VIP
// and hits it on a tight cadence (default 100ms) from outside the load
// balancer. HTTP/HTTPS VIPs get a GET request with per-VIP rolling
// latency stats, success/failure ratios, and a running tally of a
// configurable response header (default: X-IPng-Frontend) so pool-
// failover events show up as visible reshuffles in the tally. Non-HTTP
// VIPs get a plain TCP-connect probe for liveness. See maglevt --help
// for the flag surface.
package main
import (
"context"
"flag"
"fmt"
"net"
"os"
"regexp"
"sort"
"strings"
"time"
tea "github.com/charmbracelet/bubbletea"
buildinfo "git.ipng.ch/ipng/vpp-maglev/cmd"
"git.ipng.ch/ipng/vpp-maglev/internal/config"
)
// main is the process entry point: run the tool and translate any
// error into a one-line stderr report plus a non-zero exit status.
func main() {
	err := run()
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "maglevt: %v\n", err)
	os.Exit(1)
}
// run parses the flag surface, loads and validates every --config
// file, builds the deduplicated VIP probe list, starts one probe
// goroutine (plus a one-shot reverse-DNS lookup) per VIP, and then
// blocks in the bubbletea UI until the operator quits. Errors bubble
// up to main for a one-line stderr report.
func run() error {
	var cfgPaths multiFlag
	flag.Var(&cfgPaths, "config", "path to maglev.yaml (repeatable; also accepts a comma-separated list). Frontends are unioned across files, deduplicated by (address, protocol, port).")
	interval := flag.Duration("interval", 100*time.Millisecond, "probe interval per VIP (±10% jitter)")
	timeout := flag.Duration("timeout", 2*time.Second, "per-request timeout")
	host := flag.String("host", "", "Host header override (default: VIP address literal)")
	// Default probe URI: a small, deliberate health-check path that
	// typically returns 204 No Content and doesn't hit the backend
	// app logs. /.well-known/ipng/healthz is the convention for
	// IPng deployments; override with --uri for anything else.
	// --path is registered as a synonym for backward compatibility
	// with the pre-1.0 flag name — both set the same variable, so
	// whichever the operator types last on the command line wins.
	const defaultURI = "/.well-known/ipng/healthz"
	path := flag.String("uri", defaultURI, "HTTP request path (URI) used in the GET request")
	flag.StringVar(path, "path", defaultURI, "alias for --uri")
	header := flag.String("header", "X-IPng-Frontend", "response header to extract and tally")
	insecure := flag.Bool("insecure", true, "skip TLS verification for HTTPS")
	keepalive := flag.Bool("keepalive", false, "enable HTTP keep-alives (disabled by default so each probe opens a fresh connection — required for failover visibility)")
	flag.BoolVar(keepalive, "k", false, "shorthand for --keepalive")
	filter := flag.String("filter", "", "only probe frontends whose name matches this regex")
	printVersion := flag.Bool("version", false, "print version and exit")
	flag.Parse()
	if *printVersion {
		fmt.Printf("maglevt %s (commit %s, built %s)\n",
			buildinfo.Version(), buildinfo.Commit(), buildinfo.Date())
		return nil
	}
	// No --config given: fall back to the packaged default location.
	if len(cfgPaths) == 0 {
		cfgPaths = multiFlag{"/etc/vpp-maglev/maglev.yaml"}
	}
	// Load every requested config. A parse/semantic error on any of
	// them is fatal — we want the user to see it rather than silently
	// probing a reduced set of VIPs because one file was broken.
	configs := make([]*config.Config, 0, len(cfgPaths))
	for _, p := range cfgPaths {
		cfg, res := config.Check(p)
		if !res.OK() {
			if res.ParseError != "" {
				return fmt.Errorf("config parse %s: %s", p, res.ParseError)
			}
			return fmt.Errorf("config semantic %s: %s", p, res.SemanticError)
		}
		configs = append(configs, cfg)
	}
	// Compile the optional name filter up front so a bad regex fails
	// fast instead of surfacing later.
	var filterRe *regexp.Regexp
	if *filter != "" {
		var err error
		filterRe, err = regexp.Compile(*filter)
		if err != nil {
			return fmt.Errorf("invalid --filter regex: %w", err)
		}
	}
	opts := probeOpts{
		Interval:  *interval,
		Timeout:   *timeout,
		Host:      *host,
		Path:      *path,
		Header:    *header,
		Insecure:  *insecure,
		KeepAlive: *keepalive,
	}
	vips := buildVIPsUnion(configs, cfgPaths, filterRe, opts)
	if len(vips) == 0 {
		return fmt.Errorf("no matching frontends in %s", strings.Join(cfgPaths, ", "))
	}
	m := Model{
		cfgPath: strings.Join(cfgPaths, ", "),
		vips:    vips,
		opts:    opts,
		startAt: time.Now(),
		showDNS: true,
	}
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	prog := tea.NewProgram(m, tea.WithAltScreen())
	// Spawn one probe goroutine per VIP. Each sends probeResultMsg
	// into the tea.Program via prog.Send, which is thread-safe.
	// Alongside the prober we kick off a one-shot reverse-DNS
	// lookup so the 'd' toggle has a hostname to display; the
	// lookup is best-effort and simply drops on timeout or NXDOMAIN.
	for _, v := range vips {
		go runProbeLoop(ctx, v.info, opts, prog.Send)
		go runDNSLookup(ctx, v.info, prog.Send)
	}
	_, err := prog.Run()
	cancel()
	// Give the workers a beat to observe ctx and exit. This isn't
	// strictly required — the process is exiting anyway — but a clean
	// shutdown avoids the "unexpected EOF writing to a closed
	// transport" spam that HTTP clients sometimes emit on ctrl-C.
	time.Sleep(50 * time.Millisecond)
	return err
}
// buildVIPsUnion flattens frontends from multiple configs into a
// single deduplicated probe list, keyed by the (scheme, address,
// port) tuple. The typical use case is a pair of maglevds fronting
// the same two VIPs (vip0 / vip1, IPv4 + IPv6, × port 80 + 443 = 8
// probers) — the operator passes both yaml files and maglevt unions
// them so the probe grid doesn't grow duplicates from mirrored
// configs. The symbolic frontend name from yaml is intentionally
// dropped: when two files use the same name for different tuples
// (common in cross-deployment comparisons) the name would be
// ambiguous, and the tuple is the only stable identity. Only the
// --filter regex still uses the name, as a pre-dedup match.
//
// Dedup key uses net.IP.String() which canonicalises IPv6 zero-
// compression, so 2001:db8::1 and 2001:db8:0:0:0:0:0:1 collapse
// onto one entry. Iteration order across files is stable for
// deterministic TUI layout: within a file, frontends are visited
// in name-sorted order; across files, the first occurrence of each
// tuple wins and fixes its slot in the output.
func buildVIPsUnion(cfgs []*config.Config, cfgPaths []string, filterRe *regexp.Regexp, opts probeOpts) []*vipState {
	_ = cfgPaths // reserved for future diagnostics (e.g. which file this tuple came from)
	type key struct {
		ip     string
		scheme string
		port   uint16
	}
	seen := map[key]*vipState{}
	var order []key
	for _, cfg := range cfgs {
		// Map iteration order is random; sort the names so the
		// per-file visit order (and thus first-wins slotting) is
		// deterministic across runs.
		names := make([]string, 0, len(cfg.Frontends))
		for name := range cfg.Frontends {
			names = append(names, name)
		}
		sortStringsInPlace(names)
		for _, name := range names {
			fe := cfg.Frontends[name]
			if filterRe != nil && !filterRe.MatchString(name) {
				continue
			}
			// Only TCP frontends with a concrete port are probeable.
			if strings.ToLower(fe.Protocol) != "tcp" || fe.Port == 0 {
				continue
			}
			scheme := schemeForPort(fe.Port)
			k := key{ip: fe.Address.String(), scheme: scheme, port: fe.Port}
			if _, ok := seen[k]; ok {
				continue // already claimed by an earlier file
			}
			info := &vipInfo{
				idx:    len(order),
				scheme: scheme,
				ip:     fe.Address,
				port:   fe.Port,
			}
			if scheme == "http" || scheme == "https" {
				info.url = buildURL(scheme, fe.Address, fe.Port, opts.Path)
				info.client = newHTTPClient(opts)
			}
			v := &vipState{
				info:     info,
				rolling:  newRolling(),
				tally:    map[string]int{},
				tallyOld: map[string]int{},
				tallyNew: map[string]int{},
			}
			seen[k] = v
			order = append(order, k)
		}
	}
	out := make([]*vipState, len(order))
	for i, k := range order {
		out[i] = seen[k]
	}
	// Display order: IPv6 before IPv4, higher ports before lower
	// within each address family, then address string as a final
	// tiebreaker for determinism across runs. HTTPS :443 sitting
	// above HTTP :80 matches the "secure first" reading order most
	// operators expect, and clustering all the IPv6 rows at the top
	// keeps a mixed-family deployment visually coherent as the
	// operator scans down the table.
	sort.SliceStable(out, func(i, j int) bool {
		vi, vj := out[i].info, out[j].info
		iIs6 := vi.ip.To4() == nil
		jIs6 := vj.ip.To4() == nil
		if iIs6 != jIs6 {
			return iIs6
		}
		if vi.port != vj.port {
			return vi.port > vj.port
		}
		return vi.ip.String() < vj.ip.String()
	})
	// Re-index after the sort so info.idx matches the slot each VIP
	// now occupies in out — probeResultMsg.VIPIdx is looked up via
	// this index in Model.Update, so they must agree.
	for i, v := range out {
		v.info.idx = i
	}
	return out
}
// multiFlag collects values of a repeatable string flag. A single
// occurrence may also carry several comma-separated entries, so
// `--config a.yaml,b.yaml` and `--config a.yaml --config b.yaml`
// produce the same slice. Blank entries are discarded.
type multiFlag []string

// String renders the accumulated values as a comma-joined list,
// which is what the flag package shows in usage/default output.
func (m *multiFlag) String() string {
	return strings.Join(*m, ",")
}

// Set records one flag occurrence, splitting on commas and trimming
// surrounding whitespace from each piece. Always returns nil.
func (m *multiFlag) Set(v string) error {
	for _, piece := range strings.Split(v, ",") {
		if trimmed := strings.TrimSpace(piece); trimmed != "" {
			*m = append(*m, trimmed)
		}
	}
	return nil
}
// schemeForPort maps a VIP's TCP port to its probe scheme. Only the
// two unambiguous web ports classify as HTTP(S); everything else
// falls back to "tcp" (connect-only probe). Intentionally narrow —
// better to under-classify than send GET / at an IMAPS VIP and spew
// protocol errors into the logs. Extending the mapping (8080/8443,
// ...) is fine later, but the default stays conservative.
func schemeForPort(port uint16) string {
	if port == 80 {
		return "http"
	}
	if port == 443 {
		return "https"
	}
	return "tcp"
}
// buildURL constructs the probe URL for an HTTP/HTTPS VIP. IPv6
// literals are bracketed per RFC 3986 §3.2.2 so the colon in the
// address isn't confused with the port separator.
func buildURL(scheme string, ip net.IP, port uint16, path string) string {
host := ip.String()
if ip.To4() == nil {
host = "[" + host + "]"
}
if path == "" {
path = "/"
}
return fmt.Sprintf("%s://%s:%d%s", scheme, host, port, path)
}
// sortStringsInPlace sorts s ascending, in place. The previous
// hand-rolled insertion sort justified itself as avoiding the "sort"
// import, but this file already imports sort (for the display-order
// sort.SliceStable in buildVIPsUnion), so the rationale no longer
// held — delegate to the standard library instead.
func sortStringsInPlace(s []string) {
	sort.Strings(s)
}

349
cmd/tester/model.go Normal file
View File

@@ -0,0 +1,349 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import (
"fmt"
"time"
tea "github.com/charmbracelet/bubbletea"
)
// tallyWindow is the sliding-window length used to classify tally
// entries as "actively receiving traffic" versus "drained". A probe
// snapshot of each VIP's tally is rotated into vipState.tallyOld
// every tallyWindow (see the tickMsg handler in Update), so on
// steady state the delta (tally - tallyOld) reflects somewhere
// between one and two windows of activity — long enough to be
// noise-resistant, short enough that a flush or graceful drain
// shows up within a few redraw cycles.
const tallyWindow = 5 * time.Second

// errWindowSize is the hard storage cap for Model.events. It isn't
// the number of rows rendered to screen — that's computed per-frame
// in View() from the terminal height so the events panel fills
// whatever space is left after the header, table, tally, and
// footer. This cap only exists to stop the ring from growing
// unbounded on a very long-running session that's seeing constant
// anomalies: 500 × ~100 bytes per event is ~50 KB, negligible.
const errWindowSize = 500
// errKind classifies why an event landed in the error panel. One
// value per anomaly Update can flag from a probeResultMsg: the probe
// hit its timeout deadline, the probe came back with an HTTP
// 4xx/5xx, the probe failed at the network layer (refused, reset,
// unreachable, TLS handshake error), or the probe succeeded but
// with a latency the rolling window flags as a spike.
type errKind int

const (
	kindTimeout errKind = iota
	kindHTTPErr
	kindNetErr
	kindSpike
)

// String returns the short label shown in the error panel's kind
// column; values outside the known range render as "unknown".
func (k errKind) String() string {
	labels := [...]string{
		kindTimeout: "timeout",
		kindHTTPErr: "http-err",
		kindNetErr:  "net-err",
		kindSpike:   "spike",
	}
	if k >= 0 && int(k) < len(labels) {
		return labels[k]
	}
	return "unknown"
}
// errEvent is one entry in the bounded error-panel ring. VIPIdx
// points back into Model.vips so the view can look up the scheme
// and address for the row label at render time (we don't store a
// formatted label here to keep the event struct cheap and to let
// the view decide how to style it).
type errEvent struct {
	At     time.Time // wall-clock arrival time of the anomalous sample
	VIPIdx int       // index into Model.vips identifying the VIP
	Kind   errKind   // timeout / http-err / net-err / spike
	Detail string    // short human-readable cause, e.g. "refused" or "HTTP 503"
}
// Model is the bubbletea Model for maglevt. Held by value throughout
// so bubbletea's copy-on-Update semantics work naturally; mutable
// per-VIP state lives behind *vipState pointers in the vips slice so
// probeResultMsg handlers can update rolling/tally without copying
// the whole model.
type Model struct {
	cfgPath string      // comma-joined list of loaded config paths (header display)
	vips    []*vipState // per-VIP mutable state; indexed by vipInfo.idx
	opts    probeOpts   // probe configuration snapshot taken from flags at startup
	startAt time.Time   // uptime origin for the header clock; reset by 'r'
	width   int         // last terminal width seen via tea.WindowSizeMsg
	height  int         // last terminal height seen via tea.WindowSizeMsg
	help    bool        // whether the help overlay is currently shown
	events  []errEvent  // bounded ring of recent anomalies (size errWindowSize)
	// showDNS toggles between hostname and IP-literal display in
	// the ADDR column and the tally/events labels. On by default:
	// operators usually know VIPs by name, and the 'd' key flips
	// to the raw literal when they need to see which address
	// family or which specific IP the row represents. Hostnames
	// come in asynchronously via hostnameMsg, so vipState.hostname
	// may still be empty for a VIP even when showDNS is true —
	// the display falls back to the IP literal in that case.
	showDNS bool
}
// vipState is the mutable per-VIP record threaded through the tea
// dispatch loop. vipState.info is the immutable descriptor built at
// startup (see probe.go::vipInfo), while everything else on this
// struct is rewritten as probe results arrive.
type vipState struct {
	info *vipInfo
	// hostname is the PTR-record lookup result for info.ip, filled
	// in asynchronously by runDNSLookup via hostnameMsg. Empty
	// until the lookup returns (or forever, if it fails or times
	// out). The UI consults Model.showDNS to decide whether to
	// use it.
	hostname string
	// Rolling stats populated from every probeResultMsg. Separate
	// from tally so reset semantics match the user's mental model:
	// pressing 'r' blows away both, but a future pause-clear-resume
	// cadence could reset just one.
	rolling *rolling
	tally   map[string]int
	// tallyOld / tallyNew are the two-slot rotating snapshots used
	// by the tally panel to classify each backend by recent
	// activity — still receiving traffic versus drained (the exact
	// colour palette lives in the view code). tallyNew is captured
	// every tallyWindow; on the next rotation it shifts into
	// tallyOld, so the delta (tally - tallyOld) always spans
	// between 1 and 2 tallyWindow units of activity. tallyAt is
	// the wall-clock time tallyNew was captured and drives the
	// rotation decision in tickMsg.
	tallyOld map[string]int
	tallyNew map[string]int
	tallyAt  time.Time
	// Lifetime counters. Unlike the rolling window these never
	// forget until the operator hits 'r'. The N column in the
	// probe table renders totalProbes; FAIL renders totalFails
	// tinted red when non-zero so a failure that rolled off the
	// 100-sample rolling window still leaves a visible mark on
	// the cumulative count.
	totalProbes int64
	totalFails  int64
	// Last-seen values for the rightmost LAST column. These are
	// the "what happened on the most recent probe" snapshot the
	// UI shows as green/yellow/red.
	lastAt     time.Time
	lastOK     bool
	lastCode   int
	lastErr    string
	lastHeader string
	lastDur    time.Duration
}
// tickMsg is the periodic heartbeat that forces a redraw even when
// no probe results are flowing, so e.g. the header uptime counter
// keeps moving on a completely idle VIP set.
type tickMsg time.Time

// tickCmd arms one 250ms tick; Update re-arms it on every tickMsg,
// giving a steady redraw cadence that looks live without burning
// CPU on layout work.
func tickCmd() tea.Cmd {
	const redrawEvery = 250 * time.Millisecond
	return tea.Tick(redrawEvery, func(now time.Time) tea.Msg {
		return tickMsg(now)
	})
}
// Init starts the periodic redraw heartbeat. Alt-screen entry and
// the window title are configured when the tea.Program is built in
// main.go, so there is nothing else to do here.
func (m Model) Init() tea.Cmd {
	cmd := tickCmd()
	return cmd
}
// Update handles every tea.Msg delivered to the program. Five
// message classes:
//
//   - tea.WindowSizeMsg — resize; cache width/height for the View.
//   - tea.KeyMsg — keybindings (quit, pause, reset, help, DNS toggle).
//   - hostnameMsg — async reverse-DNS result; store the hostname.
//   - probeResultMsg — probe goroutine delivered a new sample;
//     update rolling/tally/last* and maybe the event ring.
//   - tickMsg — periodic redraw; rotate tally snapshots, re-arm
//     the timer.
func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
	switch msg := msg.(type) {
	case tea.WindowSizeMsg:
		m.width = msg.Width
		m.height = msg.Height
		return m, nil
	case tea.KeyMsg:
		switch msg.String() {
		case "q", "ctrl+c":
			return m, tea.Quit
		case " ":
			// Flip the global pause flag read by every probe loop.
			paused.Store(!paused.Load())
			return m, nil
		case "r":
			// Unified reset: wipe per-VIP rolling windows, per-VIP
			// tallies, the last-probe snapshot, the global event
			// ring, and the global uptime origin so the header
			// clock starts fresh. 'r' is the one-key way to start
			// a clean capture window, which matches the "I'm about
			// to do a failover, watch this" flow.
			for _, v := range m.vips {
				v.rolling.reset()
				v.tally = map[string]int{}
				v.tallyOld = map[string]int{}
				v.tallyNew = map[string]int{}
				v.tallyAt = time.Time{}
				v.totalProbes = 0
				v.totalFails = 0
				v.lastAt = time.Time{}
				v.lastOK = false
				v.lastCode = 0
				v.lastErr = ""
				v.lastHeader = ""
				v.lastDur = 0
			}
			m.events = nil
			m.startAt = time.Now()
			return m, nil
		case "h", "?":
			m.help = !m.help
			return m, nil
		case "d":
			m.showDNS = !m.showDNS
			return m, nil
		}
	case hostnameMsg:
		// Bounds-check so a stale message can never index out of
		// range.
		if msg.VIPIdx >= 0 && msg.VIPIdx < len(m.vips) {
			m.vips[msg.VIPIdx].hostname = msg.Hostname
		}
		return m, nil
	case probeResultMsg:
		if msg.VIPIdx < 0 || msg.VIPIdx >= len(m.vips) {
			return m, nil
		}
		v := m.vips[msg.VIPIdx]
		ns := uint64(msg.Duration.Nanoseconds())
		// Capture the spike verdict and the previous window max
		// BEFORE recording the sample, so the sample can't mask
		// its own anomaly.
		prevMax := v.rolling.maxNS
		spike := v.rolling.isSpike(ns)
		v.lastAt = msg.At
		v.lastOK = msg.OK
		v.lastCode = msg.Code
		v.lastErr = msg.Err
		v.lastHeader = msg.Header
		v.lastDur = msg.Duration
		v.totalProbes++
		if !msg.OK {
			v.totalFails++
		}
		v.rolling.record(ns, msg.OK)
		if msg.Header != "" {
			v.tally[msg.Header]++
		}
		// Classify the sample for the error panel. Order matters:
		// a network error is always more interesting than a latency
		// observation (the latency is noise from the failure
		// itself), an HTTP error is more interesting than a spike
		// (a 503 dominates a 10ms vs 12ms latency blip), and a
		// spike is only flagged on otherwise-successful samples.
		if ev, ok := classifyEvent(msg, v, spike, prevMax); ok {
			m.events = append(m.events, ev)
			if len(m.events) > errWindowSize {
				m.events = m.events[len(m.events)-errWindowSize:]
			}
		}
		return m, nil
	case tickMsg:
		// Rotate each VIP's tally snapshot once the window has
		// elapsed. Skipping while paused keeps the tally colours
		// frozen at their pre-pause state instead of decaying
		// everything to grey as deltas naturally fall to zero.
		if !paused.Load() {
			now := time.Time(msg)
			for _, v := range m.vips {
				if v.tallyAt.IsZero() || now.Sub(v.tallyAt) >= tallyWindow {
					v.tallyOld = v.tallyNew
					v.tallyNew = cloneTally(v.tally)
					v.tallyAt = now
				}
			}
		}
		return m, tickCmd()
	}
	return m, nil
}
// cloneTally makes an independent shallow copy of src for the
// two-slot snapshot rotation in vipState. The copy matters: probes
// keep mutating the live tally, and an aliased snapshot would make
// every delta read as zero.
func cloneTally(src map[string]int) map[string]int {
	dst := make(map[string]int, len(src))
	for backend, count := range src {
		dst[backend] = count
	}
	return dst
}
// classifyEvent decides whether a probe sample deserves an entry in
// the error panel, returning (_, false) for uninteresting samples —
// boring 2xx/3xx responses or successful TCP connects with no spike.
// Checks run in priority order: network/timeout failures trump HTTP
// status, which trumps latency spikes, because a failed probe's
// measured latency is just noise inherited from the failure.
//
// prevMax is the rolling-window max seen *before* this sample was
// recorded. It's included in the spike Detail so the operator sees
// the baseline the current probe blew past rather than a bare
// absolute number.
func classifyEvent(msg probeResultMsg, v *vipState, spike bool, prevMax uint64) (errEvent, bool) {
	mk := func(kind errKind, detail string) (errEvent, bool) {
		return errEvent{At: msg.At, VIPIdx: msg.VIPIdx, Kind: kind, Detail: detail}, true
	}
	switch {
	case msg.Err != "":
		// shortError collapses "i/o timeout" and "context deadline
		// exceeded" to the literal "timeout" token, so equality is
		// enough to split deadline failures from refused / reset /
		// unreachable errors.
		if msg.Err == "timeout" {
			return mk(kindTimeout, msg.Err)
		}
		return mk(kindNetErr, msg.Err)
	case v.info.scheme != "tcp" && msg.Code >= 400:
		return mk(kindHTTPErr, fmt.Sprintf("HTTP %d", msg.Code))
	case spike:
		return mk(kindSpike, fmt.Sprintf("%s (prev max %s)",
			formatDur(msg.Duration),
			formatDur(time.Duration(prevMax))))
	}
	return errEvent{}, false
}

292
cmd/tester/probe.go Normal file
View File

@@ -0,0 +1,292 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import (
"context"
"crypto/tls"
"math/rand/v2"
"net"
"net/http"
"strconv"
"strings"
"sync/atomic"
"time"
tea "github.com/charmbracelet/bubbletea"
)
// probeOpts bundles the runtime probe configuration coming from the
// command-line flags. Each probe goroutine gets a copy at startup;
// nothing in here mutates during the program's lifetime (pauses flow
// through the global `paused` atomic instead).
type probeOpts struct {
	Interval  time.Duration // per-VIP probe cadence, before jitter is applied
	Timeout   time.Duration // per-request deadline (HTTP client and TCP dial)
	Host      string        // optional Host header override; empty = URL authority
	Path      string        // request path used for HTTP/HTTPS GET probes
	Header    string        // response header name to extract and tally
	Insecure  bool          // skip TLS certificate verification (curl -k semantics)
	KeepAlive bool          // allow HTTP keep-alives (pins probes to one backend)
}
// vipInfo is the immutable per-VIP descriptor built from the maglev
// config at startup. Kept separate from vipState (which holds mutable
// stats) so the probe goroutine can read its target without racing
// the Update loop in model.go.
//
// Identity of a VIP in maglevt is the (scheme, ip, port) tuple, not
// the symbolic name from the config: when we union multiple maglev
// yaml files, the same name can describe different tuples across
// deployments and the tuple is the unambiguous key. The TUI shows
// scheme + ip:port and nothing else, so there's no need to carry a
// display name at all.
type vipInfo struct {
	idx    int    // index into Model.vips (stable for the process lifetime)
	scheme string // "http", "https", or "tcp"
	ip     net.IP // the VIP address
	port   uint16 // the VIP port (TCP)
	url    string // assembled probe URL for http/https; empty for tcp
	client *http.Client // shared per-VIP HTTP client; nil for tcp-only probes
}
// probeResultMsg is the tea.Msg sent from probe goroutines to the UI
// on every probe completion. Bubbletea delivers it into Model.Update
// on the tea dispatch goroutine, so the model can mutate its per-VIP
// state without locks.
type probeResultMsg struct {
	VIPIdx   int           // index into Model.vips (vipInfo.idx of the probed VIP)
	At       time.Time     // wall-clock start time of the probe
	Duration time.Duration // elapsed time from start to completion/failure
	OK       bool          // whether the probe counts as a success
	Code     int           // HTTP status code (0 for tcp-only probes)
	Header   string        // extracted response header value, empty if absent
	Err      string        // empty when OK; populated with a short error string otherwise
}
// hostnameMsg is the tea.Msg the DNS resolver worker sends once it
// has a PTR record for a VIP. The UI uses it to populate
// vipState.hostname so the 'd' toggle has something to show. One
// message per VIP at most — lookup failures just drop silently
// and the VIP stays on its IP literal.
type hostnameMsg struct {
	VIPIdx   int    // index into Model.vips identifying the VIP
	Hostname string // PTR result with the trailing dot stripped
}
// paused is the global pause flag flipped by the spacebar binding in
// Model.Update. Using an atomic rather than a channel keeps the probe
// loop dead-simple — no extra select case, no risk of wedging a
// goroutine on a full buffered channel. Probe loops only Load it;
// the single writer is the tea dispatch goroutine.
var paused atomic.Bool
// newHTTPClient builds the *http.Client used for every probe against
// one VIP. Two settings carry the tool's semantics:
//
//   - DisableKeepAlives = !opts.KeepAlive: off by default, because
//     failover testing needs every probe to open a fresh TCP+TLS
//     connection. A single persistent connection would pin probes to
//     one backend until the keep-alive timer expires, making the
//     tally lie about where the load balancer steers NEW flows.
//     --keepalive/-k restores the pinned-session view on request.
//   - InsecureSkipVerify = opts.Insecure: on by default, matching
//     `curl -k` — VIP certificates almost never match the raw IP
//     literal being probed, so verification would fail on the first
//     probe. Pass --insecure=false for strict verification.
//
// CheckRedirect returns ErrUseLastResponse so a 302 from one backend
// never silently advances to another host and pollutes the tally.
func newHTTPClient(opts probeOpts) *http.Client {
	tlsCfg := &tls.Config{
		InsecureSkipVerify: opts.Insecure, //nolint:gosec // deliberate curl -k semantics, see comment above
	}
	transport := &http.Transport{
		TLSClientConfig:   tlsCfg,
		DisableKeepAlives: !opts.KeepAlive,
		// We dial the VIP literal directly — no DNS involvement —
		// so the default DialContext is already what we want.
	}
	noRedirect := func(*http.Request, []*http.Request) error {
		return http.ErrUseLastResponse
	}
	return &http.Client{
		Transport:     transport,
		Timeout:       opts.Timeout,
		CheckRedirect: noRedirect,
	}
}
// runProbeLoop is the per-VIP probe worker. It waits an initial random
// delay (so N VIPs don't phase-lock onto the same tick after startup),
// then fires a probe every opts.Interval ± 10% jitter until ctx is
// cancelled. Each completed probe posts a probeResultMsg into the
// tea.Program via send.
//
// The loop honors the global `paused` flag by simply skipping the
// probe call while paused — the ticker keeps ticking so a resume
// picks up at the next natural tick boundary instead of fast-
// forwarding through a burst of back-to-back probes.
func runProbeLoop(ctx context.Context, vip *vipInfo, opts probeOpts, send func(tea.Msg)) {
	// Initial offset: uniformly in [0, interval) so N goroutines
	// started together spread out across one interval window rather
	// than all firing at t=0. Guard the non-positive case explicitly:
	// rand.Int64N panics when its argument is <= 0, which an operator
	// can otherwise trigger with --interval 0.
	if opts.Interval > 0 {
		select {
		case <-ctx.Done():
			return
		case <-time.After(time.Duration(rand.Int64N(int64(opts.Interval)))):
		}
	}
	for {
		sleepFor := jitter(opts.Interval)
		if !paused.Load() {
			result := doProbe(ctx, vip, opts)
			send(result)
		}
		select {
		case <-ctx.Done():
			return
		case <-time.After(sleepFor):
		}
	}
}
// runDNSLookup performs one reverse-DNS (PTR) lookup for vip.ip and,
// on success, delivers a hostnameMsg to the UI. It runs once per VIP
// at startup and then exits — PTR records change rarely enough that
// re-querying isn't worth it, and a restart picks up a new mapping.
//
// A 3-second timeout keeps a broken resolver from wedging the worker
// for the life of the program; on timeout or NXDOMAIN we give up
// silently and the VIP keeps showing its IP literal. The trailing
// dot is stripped so the rendered hostname matches what the operator
// typed in their zone file.
func runDNSLookup(parent context.Context, vip *vipInfo, send func(tea.Msg)) {
	ctx, cancel := context.WithTimeout(parent, 3*time.Second)
	defer cancel()
	names, err := net.DefaultResolver.LookupAddr(ctx, vip.ip.String())
	if err != nil || len(names) == 0 {
		return // best-effort: keep the IP literal on any failure
	}
	hostname := strings.TrimSuffix(names[0], ".")
	send(hostnameMsg{VIPIdx: vip.idx, Hostname: hostname})
}
// jitter scales d by a uniform factor in [0.9, 1.1) — the same ±10%
// jitter the checker uses, same rationale: probes don't phase-lock on
// a wall-clock tick across every VIP in the config.
func jitter(d time.Duration) time.Duration {
if d <= 0 {
return d
}
return time.Duration(float64(d) * (0.9 + 0.2*rand.Float64()))
}
// doProbe issues one probe against vip and returns a filled-in
// probeResultMsg. HTTP / HTTPS go through vip.client with a GET
// request against opts.Path (default /.well-known/ipng/healthz),
// a Host-header override, and header extraction. Non-HTTP VIPs do
// a plain TCP connect — success if the three-way handshake
// completes before opts.Timeout.
//
// GET rather than HEAD: a common health-check path (healthz,
// status, /-/healthy) often returns 204 or 200 and is cheap to
// serve, but some handlers don't wire HEAD and would 405 us back.
// GET works against every reasonable implementation and the
// rolling-window latency is unchanged (we time until headers, not
// until the body completes — resp.Body.Close() discards the body
// without reading it). The defer below still closes the body so
// the transport can recycle the connection if --keepalive is on.
func doProbe(parent context.Context, vip *vipInfo, opts probeOpts) probeResultMsg {
	start := time.Now()
	result := probeResultMsg{VIPIdx: vip.idx, At: start}
	ctx, cancel := context.WithTimeout(parent, opts.Timeout)
	defer cancel()
	if vip.scheme == "tcp" {
		// Connect-only liveness probe: success means the handshake
		// completed within the timeout; we close immediately.
		d := net.Dialer{Timeout: opts.Timeout}
		conn, err := d.DialContext(ctx, "tcp",
			net.JoinHostPort(vip.ip.String(), strconv.Itoa(int(vip.port))))
		result.Duration = time.Since(start)
		if err != nil {
			result.Err = shortError(err)
			return result
		}
		_ = conn.Close()
		result.OK = true
		return result
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, vip.url, nil)
	if err != nil {
		result.Duration = time.Since(start)
		result.Err = shortError(err)
		return result
	}
	// Host header: user override, else derive from the VIP itself
	// (which the kernel already put in the URL, so leaving req.Host
	// empty means "use the URL authority"). We only touch req.Host
	// when the operator explicitly passed --host.
	if opts.Host != "" {
		req.Host = opts.Host
	}
	resp, err := vip.client.Do(req)
	result.Duration = time.Since(start)
	if err != nil {
		result.Err = shortError(err)
		return result
	}
	defer func() { _ = resp.Body.Close() }()
	result.Code = resp.StatusCode
	// 2xx and 3xx both count as OK; redirects are never followed
	// (see newHTTPClient), so a 3xx is a terminal, healthy answer.
	result.OK = resp.StatusCode >= 200 && resp.StatusCode < 400
	if opts.Header != "" {
		result.Header = resp.Header.Get(opts.Header)
	}
	return result
}
// shortError collapses common Go net errors into a short string
// suitable for a narrow table cell. url.Error wrappings, dial
// contexts, and "i/o timeout" trailers all get trimmed so the LAST
// column shows something legible like "refused" / "timeout" /
// "no route" instead of a 120-char wrapped error.
func shortError(err error) string {
if err == nil {
return ""
}
msg := err.Error()
// net/url-wrapped errors: keep only the last segment, which
// holds the actual cause.
if i := strings.LastIndex(msg, ": "); i >= 0 {
msg = msg[i+2:]
}
// Normalise common kernel errnos and Go's wrappers.
switch {
case strings.Contains(msg, "connection refused"):
return "refused"
case strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "context deadline exceeded"):
return "timeout"
case strings.Contains(msg, "no route to host"):
return "no-route"
case strings.Contains(msg, "network is unreachable"):
return "net-unrch"
case strings.Contains(msg, "host is unreachable"):
return "host-unrch"
case strings.Contains(msg, "connection reset"):
return "reset"
case strings.Contains(msg, "EOF"):
return "eof"
case strings.Contains(strings.ToLower(msg), "tls"):
return "tls-err"
}
// Last resort: truncate anything longer than the LAST column.
if len(msg) > 8 {
msg = msg[:8]
}
return msg
}

162
cmd/tester/stats.go Normal file
View File

@@ -0,0 +1,162 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import "sort"
// rollingSize is the bounded history maglevt keeps per VIP for
// latency percentiles and success-ratio display. At 100 samples
// and the default 100ms probe interval, this is a ~10s window —
// short enough to react quickly to failover events, long enough
// that p50/p95 are statistically meaningful.
const rollingSize = 100

// rolling is a bounded-window (rollingSize) counter for per-VIP probe
// results. It tracks success/failure totals, running sum for mean
// latency, and a ring of individual samples so percentiles can be
// computed on demand. Non-thread-safe: everything that touches a
// rolling lives on the bubbletea dispatch goroutine, so no locking
// is needed. The reset-on-rotate logic keeps ok/fail/sumNS in sync
// with the ring contents as old samples get overwritten.
type rolling struct {
	samples []sample
	idx     int    // next write position
	n       int    // number of valid samples (0..rollingSize)
	ok      int    // successful samples currently in the window
	fail    int    // failed samples currently in the window
	sumNS   uint64 // running latency sum (ns) over window samples
	minNS   uint64 // 0 while n == 0
	maxNS   uint64 // largest latency (ns) currently in the window
}

// sample is one probe observation stored in the rolling ring.
type sample struct {
	ns uint64 // probe latency in nanoseconds
	ok bool   // whether the probe counted as a success
}
// newRolling constructs an empty rolling window with its sample
// ring pre-allocated at rollingSize, ready for the first record().
func newRolling() *rolling {
	ring := make([]sample, rollingSize)
	return &rolling{samples: ring}
}
// reset wipes the window back to its freshly-constructed state,
// zeroing every aggregate and every ring slot while keeping the
// ring's backing array. Triggered by the 'r' keybinding and by
// the --reset flow.
func (r *rolling) reset() {
	ring := r.samples
	for i := range ring {
		ring[i] = sample{}
	}
	*r = rolling{samples: ring}
}
// record folds one probe result into the rolling window. When the
// ring is already full, the sample about to be overwritten first
// has its contribution backed out of the aggregates, so ok/fail/
// sumNS always mirror the ring's live contents. min/max are then
// re-derived with a full scan of the valid samples — O(rollingSize)
// per insert, which is trivially cheap at 100 entries and far
// simpler than maintaining incremental extrema under eviction.
func (r *rolling) record(ns uint64, ok bool) {
	if r.n < rollingSize {
		r.n++
	} else {
		// Ring full: retire the oldest sample's contribution.
		evicted := r.samples[r.idx]
		r.sumNS -= evicted.ns
		if evicted.ok {
			r.ok--
		} else {
			r.fail--
		}
	}
	r.samples[r.idx] = sample{ns: ns, ok: ok}
	r.idx = (r.idx + 1) % rollingSize
	r.sumNS += ns
	if ok {
		r.ok++
	} else {
		r.fail++
	}
	// Rescan the live portion of the ring for min/max; 100 uint64
	// reads is noise on any machine maglevt would ever run on.
	r.minNS, r.maxNS = ^uint64(0), 0
	for _, s := range r.samples[:r.n] {
		if s.ns < r.minNS {
			r.minNS = s.ns
		}
		if s.ns > r.maxNS {
			r.maxNS = s.ns
		}
	}
}
// percentiles reports (p50, p95, p99) in nanoseconds across the
// live window, or all zeros when no samples exist yet. The ring is
// copied into a scratch slice and sorted on every call; at most
// rollingSize elements, that's cheap enough for a per-UI-frame call
// (roughly every 250ms). Indexes clamp to the last element so a
// half-filled warmup window still yields sane values even when the
// p95/p99 slots don't exist yet as distinct positions.
func (r *rolling) percentiles() (p50, p95, p99 uint64) {
	if r.n == 0 {
		return 0, 0, 0
	}
	scratch := make([]uint64, 0, r.n)
	for _, s := range r.samples[:r.n] {
		scratch = append(scratch, s.ns)
	}
	sort.Slice(scratch, func(a, b int) bool { return scratch[a] < scratch[b] })
	at := func(pct int) uint64 {
		i := r.n * pct / 100
		if i >= r.n {
			i = r.n - 1
		}
		return scratch[i]
	}
	return at(50), at(95), at(99)
}
// successPct reports, as a percentage, how many probes currently in
// the window succeeded (2xx/3xx for HTTP, connect-OK for tcp).
// An empty window yields 0.
func (r *rolling) successPct() float64 {
	if r.n > 0 {
		return 100.0 * float64(r.ok) / float64(r.n)
	}
	return 0
}
// isSpike reports whether ns lands more than 25% above the window's
// current maximum. Intended to run *before* record() on the same
// sample, so the comparison uses the previous window max rather
// than the one the new sample would produce; a caller that sees a
// spike typically logs an errEvent and then record()s the sample
// as usual. A warmup guard (fewer than 10 samples, or a zero max)
// suppresses verdicts until a stable baseline exists — during
// warmup the max is whatever happened to arrive first, and a 1.25×
// threshold against that is meaningless.
func (r *rolling) isSpike(ns uint64) bool {
	if r.n < 10 || r.maxNS == 0 {
		return false
	}
	// 1.25 × maxNS in pure integer arithmetic.
	limit := r.maxNS + r.maxNS/4
	return ns > limit
}

724
cmd/tester/view.go Normal file
View File

@@ -0,0 +1,724 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
package main
import (
"fmt"
"sort"
"strings"
"time"
"github.com/charmbracelet/lipgloss"
)
// Styles. Colours are ANSI 256 indices so maglevt renders the same
// across iTerm, Alacritty, xterm, tmux, and screen without depending
// on truecolor support.
var (
	styleHeader  = lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("14")) // top status line
	styleSection = lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("14")) // panel section titles
	styleDim     = lipgloss.NewStyle().Foreground(lipgloss.Color("8"))             // de-emphasised / idle cells
	styleHint    = lipgloss.NewStyle().Foreground(lipgloss.Color("244"))           // footer keybinding hints
	styleOK      = lipgloss.NewStyle().Foreground(lipgloss.Color("10")).Bold(true) // success (green)
	styleWarn    = lipgloss.NewStyle().Foreground(lipgloss.Color("11")).Bold(true) // warning (yellow)
	styleErr     = lipgloss.NewStyle().Foreground(lipgloss.Color("9")).Bold(true)  // failure (red)
	styleWhite   = lipgloss.NewStyle().Foreground(lipgloss.Color("15")).Bold(true) // active tally entry
	styleHTTP    = lipgloss.NewStyle().Foreground(lipgloss.Color("12"))            // http/https PROTO tag
	styleTCP     = lipgloss.NewStyle().Foreground(lipgloss.Color("13"))            // tcp PROTO tag
	styleRunning = lipgloss.NewStyle().Foreground(lipgloss.Color("10")).Bold(true) // RUNNING header state
	stylePaused  = lipgloss.NewStyle().Foreground(lipgloss.Color("11")).Bold(true) // PAUSED header state
)
// View renders the full TUI. One call per redraw; re-invoked whenever
// bubbletea dispatches a message. No mutation — every field it reads
// is in Model or vipState, so it's safe to call concurrently with
// Update's message handling (tea serialises them anyway).
//
// Section order: header → probe table → tally (if --header collected
// any samples) → events panel (if any anomalies have been observed)
// → [blank padding] → footer. The tally lives above the events so a
// growing events list pushes itself against the footer rather than
// shoving the tally up and down every time a new anomaly lands —
// the tally is the panel the operator stares at during a failover
// test, and it should stay put.
//
// The footer is pinned to the last row of the terminal: we count
// how many lines the above sections produced, then pad with enough
// blank lines to push the footer to row m.height. That keeps the
// `[q] quit …` hint visible at the bottom even when the content
// doesn't fill the screen, and matches the convention most TUIs
// follow. When the content DOES overflow (tiny terminal / huge
// config), we fall back to a single blank line between content
// and footer so the footer is still reachable by scrolling.
func (m Model) View() string {
	// Help overlay replaces the whole screen when toggled on.
	if m.help {
		return m.viewHelp()
	}
	// Phase 1: build everything that isn't the events panel, so we
	// can measure how many rows are left for events. This is the
	// "fixed" content — header, table, and (maybe) tally. Events
	// then absorb whatever space remains between this block and
	// the screen-pinned footer.
	var pre strings.Builder
	pre.WriteString(m.viewHeader())
	pre.WriteString("\n\n")
	pre.WriteString(m.viewTable())
	if m.opts.Header != "" && m.anyTallied() {
		pre.WriteString("\n")
		pre.WriteString(m.viewTally())
	}
	preContent := pre.String()
	footer := m.viewFooter()
	// Phase 2: compute how many event rows can fit.
	//
	// Screen budget:
	//   preContent lines (header + table + optional tally)
	//   + 1 blank separator
	//   + 1 events section header
	//   + N events rows        ← the unknown we're solving for
	//   + M footer-pad blank rows
	//   + 1 footer line
	//   = m.height
	//
	// Solving for max N (before the footer-pad term kicks in):
	//   N = m.height - preLines - 3  (3 = separator + events hdr + footer)
	//
	// When there's no room for events (tiny terminal, or tally
	// already fills the screen), maxEvents clamps to 0 and the
	// events section is skipped entirely, letting the footer pad
	// logic below handle the spacing.
	preLines := strings.Count(preContent, "\n")
	// Budget arithmetic for the events panel:
	//   preLines + 1 (separator) + 1 (events hdr) + maxEvents + 1 (footer) = m.height
	// So maxEvents = m.height - preLines - 3. A negative value
	// means the terminal is too tight to even frame the panel
	// (section header + separator + footer wouldn't fit), in
	// which case we skip the whole section. A zero value means
	// we render only the "Recent events: (none)" placeholder
	// with no rows below it — enough to mark the panel's
	// position on the screen during the quiet period before the
	// first anomaly arrives.
	maxEvents := -1
	if m.height > 0 {
		maxEvents = m.height - preLines - 3
	}
	var content strings.Builder
	content.WriteString(preContent)
	if maxEvents >= 0 {
		content.WriteString("\n")
		content.WriteString(m.viewEvents(maxEvents))
	}
	contentStr := content.String()
	// Phase 3: pin footer to last screen row. Alt-screen guarantees
	// the view starts at row 1, so we need (m.height - 1) lines of
	// content above the footer. m.height == 0 means no
	// WindowSizeMsg yet (first frame); degrade to a single blank
	// separator so the footer is still visible even if the exact
	// row is wrong.
	if m.height <= 0 {
		return contentStr + "\n\n" + footer
	}
	// contentStr ends with a newline, so strings.Count is exactly the
	// number of visible rows it occupies, and the cursor is parked at
	// row contentLines+1 after it's written. To land the footer on
	// the last terminal row we need padLines = m.height-contentLines-1
	// extra newlines between content and footer. padLines==0 is the
	// perfect-fit case (events panel was sized with this in mind) and
	// must NOT be bumped to 1 — that would push the footer to row
	// m.height+1 and scroll the header off the top of the alt-screen.
	contentLines := strings.Count(contentStr, "\n")
	padLines := m.height - contentLines - 1
	if padLines < 0 {
		padLines = 0
	}
	return contentStr + strings.Repeat("\n", padLines) + footer
}
// viewHeader renders the single top status line: config path, probe
// cadence and timeout, the tally header name, the RUNNING/PAUSED
// state (read from the package-level paused atomic), and uptime
// rounded to whole seconds.
func (m Model) viewHeader() string {
	state := stylePaused.Render("PAUSED")
	if !paused.Load() {
		state = styleRunning.Render("RUNNING")
	}
	line := fmt.Sprintf(
		"maglevt — %s — interval: %s timeout: %s header: %s [%s] uptime: %s",
		m.cfgPath, m.opts.Interval, m.opts.Timeout, m.opts.Header,
		state, time.Since(m.startAt).Round(time.Second))
	return styleHeader.Render(line)
}
// Table column widths (visible characters, not bytes). All cell
// rendering goes through padVisibleLeft / padVisibleRight which
// measure width via lipgloss.Width — that strips ANSI escape
// sequences before counting, so ANSI-styled cells pad correctly
// instead of under-counting by the escape-code overhead.
//
// There's no VIP-name column: identity is the (scheme, ip, port)
// tuple. PROTO + ADDR together are the row key, which also makes
// the display independent of whatever names the source yaml files
// used (potentially conflicting across a multi-config union).
const (
	colSchemeW = 5  // "http", "https", "tcp" — widest is 5
	colAFW     = 2  // "v4" / "v6"
	colAddrW   = 40 // address + port, bracketed IPv6; full 8-group expansion fits
	colLastW   = 10 // lastCell output: status code, short error token, ok/fail, or dash
	colNW      = 7  // "N" lifetime probe count; 7 digits handles >24h @ 100ms
	colFailW   = 6  // "FAIL" lifetime failure count
	colOKW     = 7  // rolling success percentage, e.g. "100.0"
	colP50W    = 9  // latency columns share one width; formatDur output fits
	colP95W    = 9
	colP99W    = 9
	colMaxW    = 9
)
// padVisibleRight left-aligns s inside a cell of the given visible
// width by appending spaces. Width is measured via lipgloss.Width,
// which strips ANSI escapes before counting, so styled cells pad
// correctly. Strings already at or beyond the target width come
// back untouched.
func padVisibleRight(s string, width int) string {
	if gap := width - lipgloss.Width(s); gap > 0 {
		return s + strings.Repeat(" ", gap)
	}
	return s
}
// padVisibleLeft right-aligns s inside a cell of the given visible
// width — the mirror image of padVisibleRight.
func padVisibleLeft(s string, width int) string {
	if gap := width - lipgloss.Width(s); gap > 0 {
		return strings.Repeat(" ", gap) + s
	}
	return s
}
// truncateVisible clamps s to at most width *visible* characters,
// preserving embedded ANSI escape sequences by copying runs between
// escapes. A single ellipsis replaces the last visible character
// when truncation happens so the operator can see the cell was cut.
//
// When any escape sequences were copied into the truncated prefix,
// an explicit SGR reset is appended: the cut may land inside a
// styled run whose closing reset was in the discarded tail, and
// without one the open style would bleed into the trailing padding
// and the next cell on the row.
func truncateVisible(s string, width int) string {
	if lipgloss.Width(s) <= width {
		return s
	}
	if width <= 0 {
		return ""
	}
	// Walk runes, copying ANSI escape sequences verbatim (they
	// don't consume visible width) and counting printable runes.
	var b strings.Builder
	visible := 0
	inEscape := false
	sawEscape := false
	for _, r := range s {
		if inEscape {
			b.WriteRune(r)
			if r == 'm' {
				inEscape = false
			}
			continue
		}
		if r == 0x1b {
			b.WriteRune(r)
			inEscape = true
			sawEscape = true
			continue
		}
		if visible+1 >= width {
			b.WriteRune('…')
			break
		}
		b.WriteRune(r)
		visible++
	}
	if sawEscape {
		// Close any SGR state left open by the cut. Redundant when
		// the prefix already ended with a reset, but a duplicate
		// reset is harmless and has zero visible width.
		b.WriteString("\x1b[0m")
	}
	return b.String()
}
// viewTable renders the probe table: one dim header row followed by
// one line per VIP. Header labels are plain text (no per-cell
// styling) so their widths match the data rows 1:1 without
// lipgloss.Width gymnastics; each label must fit its column —
// "PROTO" at 5 chars fills colSchemeW exactly. LAST is left-aligned
// so it lines up with the first tally entry on the row below,
// letting the operator eye-align a status code with the backend it
// corresponds to.
func (m Model) viewTable() string {
	cells := []string{
		padVisibleRight("PROTO", colSchemeW),
		padVisibleRight("AF", colAFW),
		padVisibleRight("ADDR", colAddrW),
		padVisibleRight("LAST", colLastW),
		padVisibleLeft("N", colNW),
		padVisibleLeft("FAIL", colFailW),
		padVisibleLeft("OK%", colOKW),
		padVisibleLeft("p50", colP50W),
		padVisibleLeft("p95", colP95W),
		padVisibleLeft("p99", colP99W),
		padVisibleLeft("max", colMaxW),
	}
	var b strings.Builder
	b.WriteString(styleDim.Render(" " + strings.Join(cells, " ")))
	b.WriteString("\n")
	for _, v := range m.vips {
		b.WriteString(m.viewRow(v))
		b.WriteString("\n")
	}
	return b.String()
}
// viewRow renders one data line of the probe table for a single VIP.
// N and FAIL are dim reference counters (FAIL switches to red once
// non-zero) — the primary health signals are LAST and OK%. The OK%
// and latency cells show an em-dash until the rolling window holds
// at least one sample; colourOK additionally keeps OK% dim until
// the window is 10 samples deep.
func (m Model) viewRow(v *vipState) string {
	okStr, p50Str, p95Str, p99Str, maxStr := "—", "—", "—", "—", "—"
	if v.rolling.n > 0 {
		okStr = fmt.Sprintf("%.1f", v.rolling.successPct())
		p50, p95, p99 := v.rolling.percentiles()
		p50Str = formatDur(time.Duration(p50))
		p95Str = formatDur(time.Duration(p95))
		p99Str = formatDur(time.Duration(p99))
		maxStr = formatDur(time.Duration(v.rolling.maxNS))
	}
	failStyled := styleDim.Render(fmt.Sprintf("%d", v.totalFails))
	if v.totalFails > 0 {
		// Red only once a failure actually happened; zero reads as
		// "fine" and stays dim.
		failStyled = styleErr.Render(fmt.Sprintf("%d", v.totalFails))
	}
	cells := []string{
		padVisibleRight(schemeLabel(v.info.scheme), colSchemeW),
		padVisibleRight(afLabel(v.info), colAFW),
		padVisibleRight(truncateVisible(m.displayAddr(v), colAddrW), colAddrW),
		padVisibleRight(lastCell(v), colLastW),
		padVisibleLeft(styleDim.Render(fmt.Sprintf("%d", v.totalProbes)), colNW),
		padVisibleLeft(failStyled, colFailW),
		padVisibleLeft(colourOK(okStr, v.rolling.successPct(), v.rolling.n), colOKW),
		padVisibleLeft(p50Str, colP50W),
		padVisibleLeft(p95Str, colP95W),
		padVisibleLeft(p99Str, colP99W),
		padVisibleLeft(maxStr, colMaxW),
	}
	return " " + strings.Join(cells, " ")
}
// afLabel maps a VIP to its AF-column tag: "v4" when the address
// has a 4-byte form, "v6" otherwise (net.IP.To4 returns nil for a
// native IPv6 address, non-nil for v4 and 4-in-6 mapped ones).
func afLabel(v *vipInfo) string {
	if v.ip.To4() != nil {
		return "v4"
	}
	return "v6"
}
// displayAddr picks the ADDR cell text for one VIP: with the DNS
// toggle on and a PTR result available it shows "hostname:port",
// otherwise the raw IP literal (bracketed for IPv6). The toggle
// lives on the Model rather than per-VIP, so pressing 'd' flips
// every row on the next redraw — the behaviour the operator
// expects.
func (m Model) displayAddr(v *vipState) string {
	if !m.showDNS || v.hostname == "" {
		return vipAddrString(v.info)
	}
	return fmt.Sprintf("%s:%d", v.hostname, v.info.port)
}
// schemeLabel wraps the scheme token in its PROTO-column colour:
// styleHTTP for "http"/"https", styleTCP (rendering the literal
// "tcp") for anything else. The result is unpadded; callers align
// it with padVisibleRight, or use schemeAddrLabel for a pre-padded
// "PROTO AF ADDR" tuple aligned with the main table.
func schemeLabel(scheme string) string {
	if scheme == "http" || scheme == "https" {
		return styleHTTP.Render(scheme)
	}
	return styleTCP.Render("tcp")
}
// schemeAddrLabel builds the "PROTO AF ADDR" prefix shared by the
// tally and events panels, padding each fixed-width cell before the
// next is appended so both panels line up exactly under the main
// probe table's first three columns. Being the single source of
// truth keeps the two call sites from drifting out of sync with
// the table layout, and ADDR goes through displayAddr so the 'd'
// DNS toggle flips every section of the TUI in lockstep.
func (m Model) schemeAddrLabel(v *vipState) string {
	proto := padVisibleRight(schemeLabel(v.info.scheme), colSchemeW)
	af := padVisibleRight(afLabel(v.info), colAFW)
	return proto + " " + af + " " + m.displayAddr(v)
}
// vipAddrString formats an address+port for the ADDR column. IPv6
// literals get brackets so the address's own colons don't blur
// into the port separator; no scheme prefix — the PROTO column
// carries that, freeing width for the address itself.
func vipAddrString(v *vipInfo) string {
	host := v.ip.String()
	if v.ip.To4() != nil {
		return fmt.Sprintf("%s:%d", host, v.port)
	}
	return fmt.Sprintf("[%s]:%d", host, v.port)
}
// lastCell renders the LAST column for one VIP: a dim dash before
// the first probe, the short error token in red when the latest
// probe errored, ok/fail for tcp probes, and otherwise the HTTP
// status code coloured by class — 2xx green, 3xx/4xx yellow,
// everything else red.
func lastCell(v *vipState) string {
	switch {
	case v.lastAt.IsZero():
		return styleDim.Render("—")
	case v.lastErr != "":
		return styleErr.Render(v.lastErr)
	case v.info.scheme == "tcp":
		if v.lastOK {
			return styleOK.Render("ok")
		}
		return styleErr.Render("fail")
	}
	// HTTP / HTTPS: show the status code, coloured by class.
	code := fmt.Sprintf("%d", v.lastCode)
	switch {
	case v.lastCode >= 200 && v.lastCode < 300:
		return styleOK.Render(code)
	case v.lastCode >= 300 && v.lastCode < 500:
		// 3xx and 4xx share the warning colour.
		return styleWarn.Render(code)
	default:
		return styleErr.Render(code)
	}
}
// colourOK styles the pre-formatted OK% cell text by threshold: dim
// until the window holds at least 10 samples (no verdict on a
// cold start — one lucky probe shouldn't paint a green "100.0"),
// then green at >=99%, yellow at >=95%, red below. The decision
// uses the raw float rather than the rendered text so "99.9" and
// "99.0" can't collapse into the same colour via display rounding.
func colourOK(txt string, pct float64, n int) string {
	if n < 10 {
		return styleDim.Render(txt)
	}
	if pct >= 99 {
		return styleOK.Render(txt)
	}
	if pct >= 95 {
		return styleWarn.Render(txt)
	}
	return styleErr.Render(txt)
}
// anyTallied reports whether at least one VIP has collected a
// response-header tally entry; it gates the tally panel's
// visibility in View.
func (m Model) anyTallied() bool {
	for _, v := range m.vips {
		if len(v.tally) != 0 {
			return true
		}
	}
	return false
}
// viewTally renders the response-header tally panel. Each row is
// labelled with the same PROTO + AF + ADDR tuple as the probe
// table so the operator can correlate rows without a symbolic VIP
// name; labelW is the combined width of those columns plus their
// inter-column gaps. A pre-pass finds the widest "name:count"
// entry across every VIP, and every rendered entry pads to that
// width so the grid stays vertically aligned as counts grow — one
// backend ticking from 999 to 1000 widens all entries at once
// instead of shifting a single row's tail. TCP VIPs carry no
// response headers and are skipped.
func (m Model) viewTally() string {
	const labelW = colSchemeW + 2 + colAFW + 2 + colAddrW
	widest := 0
	for _, v := range m.vips {
		if v.info.scheme == "tcp" {
			continue
		}
		for name, count := range v.tally {
			if w := len(name) + 1 + countDigits(count); w > widest {
				widest = w
			}
		}
	}
	var out strings.Builder
	out.WriteString(styleSection.Render(fmt.Sprintf("%s tally:", m.opts.Header)))
	out.WriteString("\n")
	for _, v := range m.vips {
		if v.info.scheme != "tcp" && len(v.tally) > 0 {
			out.WriteString(m.renderTallyRow(v, widest, labelW))
		}
	}
	return out.String()
}
// renderTallyRow emits one tally line for a single VIP. Backend
// names are sorted alphabetically — stable across failovers and
// under the jitter where "equally loaded" backends shuffle exact
// counts probe to probe; a count-sorted order looks informative in
// a screenshot but flickers on a live display, so alphabetical
// pins every label to its own column for the process lifetime.
//
// Colour is a binary activity marker: white when the backend
// gained at least one hit since the last tallyOld snapshot
// (~tallyWindow), dim grey otherwise. Green is deliberately not
// used — it means "success" elsewhere in the TUI (OK%, 2xx), and
// an active tally entry only means "in the rotation right now",
// not "good". A negative delta (transient post-reset state where
// the tally dipped below the snapshot) counts as inactive until
// the snapshot rotates on the next tick. Right after startup or a
// reset the snapshot map is empty, so every positive count renders
// white — correct, since no drain has been observed yet.
func (m Model) renderTallyRow(v *vipState, maxEntryW, labelW int) string {
	names := make([]string, 0, len(v.tally))
	for name := range v.tally {
		names = append(names, name)
	}
	sort.Strings(names)
	entries := make([]string, 0, len(names))
	for _, name := range names {
		count := v.tally[name]
		cell := fmt.Sprintf("%s:%d", name, count)
		if count-v.tallyOld[name] > 0 {
			cell = styleWhite.Render(cell)
		} else {
			cell = styleDim.Render(cell)
		}
		entries = append(entries, padVisibleRight(cell, maxEntryW))
	}
	label := truncateVisible(m.schemeAddrLabel(v), labelW)
	return " " + padVisibleRight(label, labelW) + " " +
		strings.Join(entries, " ") + "\n"
}
// countDigits reports how many decimal digits n occupies (zero
// takes one digit). Allocation-free alternative to measuring
// fmt.Sprintf("%d", n), and sufficient for the non-negative tally
// counts it's fed.
func countDigits(n int) int {
	if n == 0 {
		return 1
	}
	digits := 0
	for ; n > 0; n /= 10 {
		digits++
	}
	return digits
}
// viewEvents renders the anomaly panel into at most maxRows event
// lines; the budget is computed per-frame by View from the
// terminal height, and a negative budget suppresses the section
// entirely. Only the newest events fit — when older history is
// clipped the section header reads "showing N of M" so the
// operator knows there's more than what's on screen. With no
// events recorded yet, a dim "(none)" placeholder still marks
// where the panel lives, so its empty state doesn't look broken.
//
// Each row: a millisecond-precision timestamp, the VIP's
// PROTO+AF+ADDR label (same columns as the main table so the eye
// can jump between the two), the event kind in a fixed 9-char
// slot, and a free-form detail string. Spikes render yellow to
// distinguish "responded, but slowly" from the red network/HTTP
// failures. maxRows budgets event rows only — View already
// reserved an extra row for this section's header.
func (m Model) viewEvents(maxRows int) string {
	if maxRows < 0 {
		return ""
	}
	const (
		labelW = colSchemeW + 2 + colAFW + 2 + colAddrW
		kindW  = 9
		tsW    = 12 // "HH:MM:SS.mmm"
	)
	var out strings.Builder
	if len(m.events) == 0 {
		fmt.Fprintf(&out, "%s %s\n",
			styleSection.Render("Recent events:"),
			styleDim.Render("(none)"))
		return out.String()
	}
	shown := m.events
	if len(shown) > maxRows {
		shown = shown[len(shown)-maxRows:]
		fmt.Fprintf(&out, "%s\n", styleSection.Render(
			fmt.Sprintf("Recent events (showing %d of %d):", len(shown), len(m.events))))
	} else {
		fmt.Fprintf(&out, "%s\n", styleSection.Render("Recent events:"))
	}
	for _, ev := range shown {
		vip := m.vips[ev.VIPIdx]
		stamp := styleDim.Render(ev.At.Format("15:04:05.000"))
		kindTxt := ev.Kind.String()
		kindCell := styleErr.Render(padVisibleRight(kindTxt, kindW))
		if ev.Kind == kindSpike {
			kindCell = styleWarn.Render(padVisibleRight(kindTxt, kindW))
		}
		fmt.Fprintf(&out, " %s %s %s %s\n",
			padVisibleRight(stamp, tsW),
			padVisibleRight(truncateVisible(m.schemeAddrLabel(vip), labelW), labelW),
			kindCell,
			ev.Detail,
		)
	}
	return out.String()
}
// viewFooter renders the keybinding hint line. The dns hint flips
// its wording to show the action the next 'd' press will perform.
func (m Model) viewFooter() string {
	dns := "[d] dns on"
	if m.showDNS {
		dns = "[d] dns off"
	}
	return styleHint.Render(strings.Join([]string{
		"[q] quit",
		"[space] pause/resume",
		"[r] reset",
		dns,
		"[h] help",
	}, " "))
}
// viewHelp renders the full-screen help overlay. View short-circuits
// to it while m.help is set, replacing every other section; the
// same 'h' / '?' toggle dismisses it.
func (m Model) viewHelp() string {
	lines := []string{
		styleHeader.Render("maglevt — keybindings"),
		"",
		" q / ctrl-c quit",
		" space pause / resume all probe loops",
		" r reset rolling stats + tally + uptime",
		" d toggle hostname / IP-literal ADDR display",
		" h / ? toggle this help overlay",
		"",
		styleSection.Render("columns"),
		"",
		" PROTO http / https (port 80/443) or tcp (everything else)",
		" AF address family — v4 or v6",
		" ADDR VIP address + port, or reverse-DNS hostname (toggle 'd')",
		" LAST most-recent probe result, coloured by class",
		" N lifetime probe count since startup (or last 'r')",
		" FAIL lifetime failure count; red when non-zero",
		" OK% success ratio over the last 100 samples",
		" p50/p95/p99 latency percentiles over the last 100 samples",
		" max worst-case latency over the last 100 samples",
		"",
		styleSection.Render("tally"),
		"",
		" Running count of the response header configured via --header",
		" (default X-IPng-Frontend), grouped by PROTO + ADDR. Entries",
		" are sorted alphabetically so each backend owns its column",
		" for the lifetime of the process. Colour marks recent",
		" activity over the last ~5s window:",
		" white received at least one hit in the window",
		" grey idle — flushed or fully drained",
		" Reset with 'r'.",
		"",
		styleSection.Render("events panel"),
		"",
		" Rolling list of probes that warrant attention. The panel auto-",
		" sizes to fill whatever vertical space is left between the tally",
		" and the footer, so a taller terminal shows more history. Older",
		" events are still stored (up to 500) and re-appear if the",
		" terminal is enlarged; the header notes \"showing N of M\" when",
		" the display is clipping. Four event kinds:",
		" timeout probe hit its --timeout deadline",
		" http-err response carried a 4xx or 5xx status code",
		" net-err TCP refused, reset, unreachable, or TLS error",
		" spike successful probe more than 25% above the rolling",
		" window max (warmup: at least 10 samples required)",
		" Cleared along with everything else by 'r'.",
		"",
		styleHint.Render("Press h or ? again to dismiss."),
	}
	return strings.Join(lines, "\n")
}
// formatDur renders a time.Duration in a compact form for the
// p50/p95 columns: sub-millisecond shows in µs, sub-second in ms,
// and anything longer in seconds with a decimal place. Fits within
// the 9-char column cleanly.
func formatDur(d time.Duration) string {
if d <= 0 {
return "-"
}
switch {
case d < time.Millisecond:
return fmt.Sprintf("%dµs", d.Microseconds())
case d < time.Second:
return fmt.Sprintf("%.1fms", float64(d.Microseconds())/1000)
default:
return fmt.Sprintf("%.2fs", d.Seconds())
}
}