A small bubbletea TUI that reads maglev.yaml (repeatable --config), enumerates every VIP, and probes each from outside the load balancer on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get a GET against a configurable URI (default /.well-known/ipng/healthz) with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL counters, LAST status, and a response-header tally. Non-HTTP VIPs get a TCP connect probe. A bounded error panel classifies anomalies as timeout / http-err / net-err / spike and auto-sizes to fill the screen. Utility: during a failover drill (backend flap, AS drain, config push) the tally panel shows which backend each VIP is actually steering to, with two-colour activity highlighting over a 5s window — white = receiving traffic, grey = drained. Paired with the rolling OK%/latency columns it gives an at-a-glance answer to "is the VIP healthy from the outside right now, and which backend is it hitting", without relying on maglevd's own view of the world. Also bumps Makefile/go.mod to build the new binary.
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>

package main

import (
	"fmt"
	"time"

	tea "github.com/charmbracelet/bubbletea"
)

// tallyWindow is the sliding-window length used to classify tally
// entries as "actively receiving traffic" versus "drained". A probe
// snapshot of each VIP's tally is rotated into vipState.tallyOld once
// per window, so on steady state the delta (tally - tallyOld) reflects
// somewhere between one and two windows (5–10s) of activity — long
// enough to be noise-resistant, short enough that a flush or graceful
// drain shows up within the next window of UI redraws.
const tallyWindow = 5 * time.Second
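
// A minimal sketch (not part of maglevt) of the per-window delta the
// tally panel derives from the two snapshots: a backend whose count
// grew since tallyOld is still receiving traffic, while a zero delta
// means drained. tallyDeltaSketch is a hypothetical name used only
// for illustration here.
func tallyDeltaSketch(cur, old map[string]int) map[string]int {
	d := make(map[string]int, len(cur))
	for k, v := range cur {
		d[k] = v - old[k] // keys absent from old read as zero
	}
	return d
}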

// errWindowSize is the hard storage cap for Model.events. It isn't
// the number of rows rendered to screen — that's computed per-frame
// in View() from the terminal height so the events panel fills
// whatever space is left after the header, table, tally, and
// footer. This cap only exists to stop the ring from growing
// unbounded on a very long-running session that's seeing constant
// anomalies: 500 × ~100 bytes per event is ~50 KB, negligible.
const errWindowSize = 500

// errKind classifies why an event ended up in the error panel. The
// four kinds map one-to-one to the four situations Update flags
// from a probeResultMsg: a probe hit its timeout deadline, a probe
// came back with an HTTP 4xx/5xx, a probe failed at the network
// layer (connection refused, reset, unreachable, TLS handshake
// error), or a probe completed successfully but with a latency
// more than 25% above the rolling-window max.
type errKind int

const (
	kindTimeout errKind = iota
	kindHTTPErr
	kindNetErr
	kindSpike
)

func (k errKind) String() string {
	switch k {
	case kindTimeout:
		return "timeout"
	case kindHTTPErr:
		return "http-err"
	case kindNetErr:
		return "net-err"
	case kindSpike:
		return "spike"
	}
	return "unknown"
}

// errEvent is one entry in the bounded error-panel ring. VIPIdx
// points back into Model.vips so the view can look up the scheme
// and address for the row label at render time (we don't store a
// formatted label here to keep the event struct cheap and to let
// the view decide how to style it).
type errEvent struct {
	At     time.Time
	VIPIdx int
	Kind   errKind
	Detail string
}

// Model is the bubbletea Model for maglevt. Held by value throughout
// so bubbletea's copy-on-Update semantics work naturally; mutable
// per-VIP state lives behind *vipState pointers in the vips slice so
// probeResultMsg handlers can update rolling/tally without copying
// the whole model.
type Model struct {
	cfgPath string
	vips    []*vipState
	opts    probeOpts
	startAt time.Time
	width   int
	height  int
	help    bool       // whether the help overlay is currently shown
	events  []errEvent // bounded ring of recent anomalies (size errWindowSize)

	// showDNS toggles between hostname and IP-literal display in
	// the ADDR column and the tally/events labels. On by default:
	// operators usually know VIPs by name, and the 'd' key flips
	// to the raw literal when they need to see which address
	// family or which specific IP the row represents. Hostnames
	// come in asynchronously via hostnameMsg, so vipState.hostname
	// may still be empty for a VIP even when showDNS is true —
	// the display falls back to the IP literal in that case.
	showDNS bool
}

// vipState is the mutable per-VIP record threaded through the tea
// dispatch loop. vipState.info is the immutable descriptor built at
// startup (see probe.go::vipInfo), while everything else on this
// struct is rewritten as probe results arrive.
type vipState struct {
	info *vipInfo

	// hostname is the PTR-record lookup result for info.ip, filled
	// in asynchronously by runDNSLookup via hostnameMsg. Empty
	// until the lookup returns (or forever, if it fails or times
	// out). The UI consults Model.showDNS to decide whether to
	// use it.
	hostname string

	// Rolling stats populated from every probeResultMsg. Separate
	// from tally so reset semantics match the user's mental model:
	// pressing 'r' blows away both, but a future pause-clear-resume
	// cadence could reset just one.
	rolling *rolling
	tally   map[string]int

	// tallyOld / tallyNew are the two-slot rotating snapshots used
	// by the tally panel to classify each backend as green (still
	// receiving traffic), orange (receiving less than the leader),
	// or grey (drained). tallyNew is captured every tallyWindow;
	// on the next rotation it shifts into tallyOld, so the delta
	// (tally - tallyOld) always spans between 1 and 2 tallyWindow
	// units of activity. tallyAt is the wall-clock time tallyNew
	// was captured and drives the rotation decision in tickMsg.
	tallyOld map[string]int
	tallyNew map[string]int
	tallyAt  time.Time

	// Lifetime counters. Unlike the rolling window these never
	// forget until the operator hits 'r'. The N column in the
	// probe table renders totalProbes; FAIL renders totalFails
	// tinted red when non-zero so a failure that rolled off the
	// 100-sample rolling window still leaves a visible mark on
	// the cumulative count.
	totalProbes int64
	totalFails  int64

	// Last-seen values for the rightmost LAST column. These are
	// the "what happened on the most recent probe" snapshot the
	// UI shows as green/yellow/red.
	lastAt     time.Time
	lastOK     bool
	lastCode   int
	lastErr    string
	lastHeader string
	lastDur    time.Duration
}
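
// A hypothetical helper (sketch, not in this file) for the label
// fallback described on Model.showDNS: prefer the async PTR result
// when the toggle is on, otherwise the IP literal. Assumes vipInfo
// exposes the literal as a string field named ip, as the hostname
// comment above implies.
func displayLabelSketch(v *vipState, showDNS bool) string {
	if showDNS && v.hostname != "" {
		return v.hostname
	}
	return v.info.ip
}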

// tickMsg drives the periodic redraw even when no probe results are
// arriving (so the uptime counter in the header ticks along even on
// a completely idle VIP set). 250ms is fast enough to look live
// without burning CPU on layout work.
type tickMsg time.Time

func tickCmd() tea.Cmd {
	return tea.Tick(250*time.Millisecond, func(t time.Time) tea.Msg {
		return tickMsg(t)
	})
}
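
// For contrast with the fixed UI tick above, a sketch of the "default
// 100ms, ±10% jitter" probe cadence from the description; hypothetical,
// since the real scheduler lives in probe.go. Assumes math/rand/v2 is
// imported as rand.
func jitteredIntervalSketch(base time.Duration) time.Duration {
	// Scale base by a factor drawn uniformly from [0.9, 1.1).
	return time.Duration(float64(base) * (0.9 + 0.2*rand.Float64()))
}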

// Init kicks off the periodic redraw. The alt-screen entry and window
// title are set at NewProgram time in main.go.
func (m Model) Init() tea.Cmd {
	return tickCmd()
}

// Update handles every tea.Msg delivered to the program. Five
// message classes:
//
// - tea.WindowSizeMsg — resize; cache width/height for the View.
// - tea.KeyMsg — keybindings (quit, pause, reset, help, DNS toggle).
// - hostnameMsg — async PTR lookup finished; fill in the hostname.
// - probeResultMsg — probe goroutine delivered a new sample;
//   update rolling/tally/last* and the sparkline.
// - tickMsg — periodic redraw; re-arm the timer.
func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
	switch msg := msg.(type) {
	case tea.WindowSizeMsg:
		m.width = msg.Width
		m.height = msg.Height
		return m, nil

	case tea.KeyMsg:
		switch msg.String() {
		case "q", "ctrl+c":
			return m, tea.Quit
		case " ":
			// paused is the pause flag shared with the probe
			// scheduler, an atomic declared elsewhere in the package.
			paused.Store(!paused.Load())
			return m, nil
		case "r":
			// Unified reset: wipe per-VIP rolling windows, per-VIP
			// tallies, per-VIP sparklines, the global event ring,
			// and the global uptime origin so the header clock
			// starts fresh. 'r' is the one-key way to start a
			// clean capture window, which matches the "I'm about
			// to do a failover, watch this" flow.
			for _, v := range m.vips {
				v.rolling.reset()
				v.tally = map[string]int{}
				v.tallyOld = map[string]int{}
				v.tallyNew = map[string]int{}
				v.tallyAt = time.Time{}
				v.totalProbes = 0
				v.totalFails = 0
				v.lastAt = time.Time{}
				v.lastOK = false
				v.lastCode = 0
				v.lastErr = ""
				v.lastHeader = ""
				v.lastDur = 0
			}
			m.events = nil
			m.startAt = time.Now()
			return m, nil
		case "h", "?":
			m.help = !m.help
			return m, nil
		case "d":
			m.showDNS = !m.showDNS
			return m, nil
		}

	case hostnameMsg:
		if msg.VIPIdx >= 0 && msg.VIPIdx < len(m.vips) {
			m.vips[msg.VIPIdx].hostname = msg.Hostname
		}
		return m, nil

	case probeResultMsg:
		if msg.VIPIdx < 0 || msg.VIPIdx >= len(m.vips) {
			return m, nil
		}
		v := m.vips[msg.VIPIdx]
		ns := uint64(msg.Duration.Nanoseconds())
		prevMax := v.rolling.maxNS
		spike := v.rolling.isSpike(ns)

		v.lastAt = msg.At
		v.lastOK = msg.OK
		v.lastCode = msg.Code
		v.lastErr = msg.Err
		v.lastHeader = msg.Header
		v.lastDur = msg.Duration
		v.totalProbes++
		if !msg.OK {
			v.totalFails++
		}
		v.rolling.record(ns, msg.OK)
		if msg.Header != "" {
			v.tally[msg.Header]++
		}

		// Classify the sample for the error panel. Order matters:
		// a network error is always more interesting than a latency
		// observation (the latency is noise from the failure
		// itself), an HTTP error is more interesting than a spike
		// (a 503 dominates a 10ms vs 12ms latency blip), and a
		// spike is only flagged on otherwise-successful samples.
		if ev, ok := classifyEvent(msg, v, spike, prevMax); ok {
			m.events = append(m.events, ev)
			if len(m.events) > errWindowSize {
				m.events = m.events[len(m.events)-errWindowSize:]
			}
		}
		return m, nil

	case tickMsg:
		// Rotate each VIP's tally snapshot once the window has
		// elapsed. Skipping while paused keeps the tally colours
		// frozen at their pre-pause state instead of decaying
		// everything to grey as deltas naturally fall to zero.
		if !paused.Load() {
			now := time.Time(msg)
			for _, v := range m.vips {
				if v.tallyAt.IsZero() || now.Sub(v.tallyAt) >= tallyWindow {
					v.tallyOld = v.tallyNew
					v.tallyNew = cloneTally(v.tally)
					v.tallyAt = now
				}
			}
		}
		return m, tickCmd()
	}
	return m, nil
}
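
// A sketch of the async PTR lookup feeding the hostnameMsg case above.
// Hypothetical: the real runDNSLookup and the hostnameMsg type live in
// another file. Assumes the net and strings packages are imported and
// that hostnameMsg carries the VIPIdx/Hostname fields Update reads.
func runDNSLookupSketch(idx int, ip string) tea.Cmd {
	return func() tea.Msg {
		names, err := net.LookupAddr(ip)
		if err != nil || len(names) == 0 {
			// Leave Hostname empty; the UI falls back to the literal.
			return hostnameMsg{VIPIdx: idx}
		}
		return hostnameMsg{VIPIdx: idx, Hostname: strings.TrimSuffix(names[0], ".")}
	}
}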

// cloneTally returns a shallow copy of src suitable for the two-slot
// rotation in vipState. The snapshot must be independent of the live
// tally because subsequent probes keep mutating the original map;
// without the copy, tallyNew and tally would alias and the delta
// would always be zero.
func cloneTally(src map[string]int) map[string]int {
	out := make(map[string]int, len(src))
	for k, v := range src {
		out[k] = v
	}
	return out
}
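
// Aside: on Go 1.21+ the standard library's maps.Clone performs the
// same shallow copy (modulo returning nil for a nil input), so
// maps.Clone(v.tally) could replace cloneTally where the toolchain
// allows it.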

// classifyEvent inspects a probeResultMsg and returns the matching
// errEvent (if any) for the error panel. Returns (_, false) when
// the sample is uninteresting — a boring 2xx/3xx HTTP response or
// a successful TCP connect. The four classes are checked in
// priority order: network/timeout errors trump HTTP status, which
// trumps latency spikes, because a failed probe's latency is noise
// inherited from the failure.
//
// prevMax is the rolling-window max seen *before* this sample was
// recorded. It's included in the spike Detail so the operator can
// see the baseline the current probe blew past ("482ms (was
// 98ms)") rather than just an absolute number with no context.
func classifyEvent(msg probeResultMsg, v *vipState, spike bool, prevMax uint64) (errEvent, bool) {
	if msg.Err != "" {
		kind := kindNetErr
		// shortError already collapses "i/o timeout" and
		// "context deadline exceeded" to the literal "timeout"
		// token, so an equality check is enough to distinguish
		// hit-the-deadline failures from refused / reset /
		// unreachable errors.
		if msg.Err == "timeout" {
			kind = kindTimeout
		}
		return errEvent{
			At:     msg.At,
			VIPIdx: msg.VIPIdx,
			Kind:   kind,
			Detail: msg.Err,
		}, true
	}
	if v.info.scheme != "tcp" && msg.Code >= 400 {
		return errEvent{
			At:     msg.At,
			VIPIdx: msg.VIPIdx,
			Kind:   kindHTTPErr,
			Detail: fmt.Sprintf("HTTP %d", msg.Code),
		}, true
	}
	if spike {
		return errEvent{
			At:     msg.At,
			VIPIdx: msg.VIPIdx,
			Kind:   kindSpike,
			Detail: fmt.Sprintf("%s (prev max %s)",
				formatDur(msg.Duration),
				formatDur(time.Duration(prevMax))),
		}, true
	}
	return errEvent{}, false
}
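
// A sketch of the 25%-above-rolling-max spike rule described on errKind.
// Hypothetical stand-in: the real predicate is rolling.isSpike, defined
// alongside the rolling-window implementation in another file. The
// prevMaxNS > 0 guard (an assumption here) avoids flagging the very
// first sample, which has no baseline to compare against.
func isSpikeSketch(ns, prevMaxNS uint64) bool {
	return prevMaxNS > 0 && float64(ns) > 1.25*float64(prevMaxNS)
}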