// SPDX-License-Identifier: Apache-2.0 package main import ( "fmt" "time" tea "github.com/charmbracelet/bubbletea" ) // tallyWindow is the sliding-window length used to classify tally // entries as "actively receiving traffic" versus "drained". A probe // snapshot of each VIP's tally is rotated into vipState.tallyOld once // a second, so on steady state the delta (tally - tallyOld) reflects // somewhere between 1 and 2 seconds of activity — long enough to be // noise-resistant, short enough that a flush or graceful drain shows // up in the next UI redraw. const tallyWindow = 5 * time.Second // errWindowSize is the hard storage cap for Model.events. It isn't // the number of rows rendered to screen — that's computed per-frame // in View() from the terminal height so the events panel fills // whatever space is left after the header, table, tally, and // footer. This cap only exists to stop the ring from growing // unbounded on a very long-running session that's seeing constant // anomalies: 500 × ~100 bytes per event is ~50 KB, negligible. const errWindowSize = 500 // errKind classifies why an event ended up in the error panel. The // four kinds map one-to-one to the four situations Update flags // from a probeResultMsg: a probe hit its timeout deadline, a probe // came back with an HTTP 4xx/5xx, a probe failed at the network // layer (connection refused, reset, unreachable, TLS handshake // error), or a probe completed successfully but with a latency // more than 25% above the rolling-window max. type errKind int const ( kindTimeout errKind = iota kindHTTPErr kindNetErr kindSpike ) func (k errKind) String() string { switch k { case kindTimeout: return "timeout" case kindHTTPErr: return "http-err" case kindNetErr: return "net-err" case kindSpike: return "spike" } return "unknown" } // errEvent is one entry in the bounded error-panel ring. VIPIdx // points back into Model.vips so the view can look up the scheme // and address for the row label at render time (we don't store a // formatted label here to keep the event struct cheap and to let // the view decide how to style it). type errEvent struct { At time.Time VIPIdx int Kind errKind Detail string } // Model is the bubbletea Model for maglevt. Held by value throughout // so bubbletea's copy-on-Update semantics work naturally; mutable // per-VIP state lives behind *vipState pointers in the vips slice so // probeResultMsg handlers can update rolling/tally without copying // the whole model. type Model struct { cfgPath string vips []*vipState opts probeOpts startAt time.Time width int height int help bool // whether the help overlay is currently shown events []errEvent // bounded ring of recent anomalies (size errWindowSize) // showDNS toggles between hostname and IP-literal display in // the ADDR column and the tally/events labels. On by default: // operators usually know VIPs by name, and the 'd' key flips // to the raw literal when they need to see which address // family or which specific IP the row represents. Hostnames // come in asynchronously via hostnameMsg, so vipState.hostname // may still be empty for a VIP even when showDNS is true — // the display falls back to the IP literal in that case. showDNS bool } // vipState is the mutable per-VIP record threaded through the tea // dispatch loop. vipState.info is the immutable descriptor built at // startup (see probe.go::vipInfo), while everything else on this // struct is rewritten as probe results arrive. type vipState struct { info *vipInfo // hostname is the PTR-record lookup result for info.ip, filled // in asynchronously by runDNSLookup via hostnameMsg. Empty // until the lookup returns (or forever, if it fails or times // out). The UI consults Model.showDNS to decide whether to // use it. hostname string // Rolling stats populated from every probeResultMsg. Separate // from tally so reset semantics match the user's mental model: // pressing 'r' blows away both, but a future pause-clear-resume // cadence could reset just one. rolling *rolling tally map[string]int // tallyOld / tallyNew are the two-slot rotating snapshots used // by the tally panel to classify each backend as green (still // receiving traffic), orange (receiving less than the leader), // or grey (drained). tallyNew is captured every tallyWindow; // on the next rotation it shifts into tallyOld, so the delta // (tally - tallyOld) always spans between 1 and 2 tallyWindow // units of activity. tallyAt is the wall-clock time tallyNew // was captured and drives the rotation decision in tickMsg. tallyOld map[string]int tallyNew map[string]int tallyAt time.Time // Lifetime counters. Unlike the rolling window these never // forget until the operator hits 'r'. The N column in the // probe table renders totalProbes; FAIL renders totalFails // tinted red when non-zero so a failure that rolled off the // 100-sample rolling window still leaves a visible mark on // the cumulative count. totalProbes int64 totalFails int64 // Last-seen values for the rightmost LAST column. These are // the "what happened on the most recent probe" snapshot the // UI shows as green/yellow/red. lastAt time.Time lastOK bool lastCode int lastErr string lastHeader string lastDur time.Duration } // tickMsg drives the periodic redraw even when no probe results are // arriving (so the uptime counter in the header ticks along even on // a completely idle VIP set). 250ms is fast enough to look live // without burning CPU on layout work. type tickMsg time.Time func tickCmd() tea.Cmd { return tea.Tick(250*time.Millisecond, func(t time.Time) tea.Msg { return tickMsg(t) }) } // Init kicks off the periodic redraw. The alt-screen entry and window // title are set at NewProgram time in main.go. func (m Model) Init() tea.Cmd { return tickCmd() } // Update handles every tea.Msg delivered to the program. Four // message classes: // // - tea.WindowSizeMsg — resize; cache width/height for the View. // - tea.KeyMsg — keybindings (quit, pause, reset, help). // - probeResultMsg — probe goroutine delivered a new sample; // update rolling/tally/last* and the sparkline. // - tickMsg — periodic redraw; re-arm the timer. func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { switch msg := msg.(type) { case tea.WindowSizeMsg: m.width = msg.Width m.height = msg.Height return m, nil case tea.KeyMsg: switch msg.String() { case "q", "ctrl+c": return m, tea.Quit case " ": paused.Store(!paused.Load()) return m, nil case "r": // Unified reset: wipe per-VIP rolling windows, per-VIP // tallies, per-VIP sparklines, the global event ring, // and the global uptime origin so the header clock // starts fresh. 'r' is the one-key way to start a // clean capture window, which matches the "I'm about // to do a failover, watch this" flow. for _, v := range m.vips { v.rolling.reset() v.tally = map[string]int{} v.tallyOld = map[string]int{} v.tallyNew = map[string]int{} v.tallyAt = time.Time{} v.totalProbes = 0 v.totalFails = 0 v.lastAt = time.Time{} v.lastOK = false v.lastCode = 0 v.lastErr = "" v.lastHeader = "" v.lastDur = 0 } m.events = nil m.startAt = time.Now() return m, nil case "h", "?": m.help = !m.help return m, nil case "d": m.showDNS = !m.showDNS return m, nil } case hostnameMsg: if msg.VIPIdx >= 0 && msg.VIPIdx < len(m.vips) { m.vips[msg.VIPIdx].hostname = msg.Hostname } return m, nil case probeResultMsg: if msg.VIPIdx < 0 || msg.VIPIdx >= len(m.vips) { return m, nil } v := m.vips[msg.VIPIdx] ns := uint64(msg.Duration.Nanoseconds()) prevMax := v.rolling.maxNS spike := v.rolling.isSpike(ns) v.lastAt = msg.At v.lastOK = msg.OK v.lastCode = msg.Code v.lastErr = msg.Err v.lastHeader = msg.Header v.lastDur = msg.Duration v.totalProbes++ if !msg.OK { v.totalFails++ } v.rolling.record(ns, msg.OK) if msg.Header != "" { v.tally[msg.Header]++ } // Classify the sample for the error panel. Order matters: // a network error is always more interesting than a latency // observation (the latency is noise from the failure // itself), an HTTP error is more interesting than a spike // (a 503 dominates a 10ms vs 12ms latency blip), and a // spike is only flagged on otherwise-successful samples. if ev, ok := classifyEvent(msg, v, spike, prevMax); ok { m.events = append(m.events, ev) if len(m.events) > errWindowSize { m.events = m.events[len(m.events)-errWindowSize:] } } return m, nil case tickMsg: // Rotate each VIP's tally snapshot once the window has // elapsed. Skipping while paused keeps the tally colours // frozen at their pre-pause state instead of decaying // everything to grey as deltas naturally fall to zero. if !paused.Load() { now := time.Time(msg) for _, v := range m.vips { if v.tallyAt.IsZero() || now.Sub(v.tallyAt) >= tallyWindow { v.tallyOld = v.tallyNew v.tallyNew = cloneTally(v.tally) v.tallyAt = now } } } return m, tickCmd() } return m, nil } // cloneTally returns a shallow copy of src suitable for the two-slot // rotation in vipState. The snapshot must be independent of the live // tally because subsequent probes keep mutating the original map; // without the copy, tallyNew and tally would alias and the delta // would always be zero. func cloneTally(src map[string]int) map[string]int { out := make(map[string]int, len(src)) for k, v := range src { out[k] = v } return out } // classifyEvent inspects a probeResultMsg and returns the matching // errEvent (if any) for the error panel. Returns (_, false) when // the sample is uninteresting — a boring 2xx/3xx HTTP response or // a successful TCP connect. The four classes are checked in // priority order: network/timeout errors trump HTTP status trump // latency spikes, because a failed probe's latency is noise // inherited from the failure. // // prevMax is the rolling-window max seen *before* this sample was // recorded. It's included in the spike Detail so the operator can // see the baseline the current probe blew past ("482ms (was // 98ms)") rather than just an absolute number with no context. func classifyEvent(msg probeResultMsg, v *vipState, spike bool, prevMax uint64) (errEvent, bool) { if msg.Err != "" { kind := kindNetErr // shortError already collapses "i/o timeout" and // "context deadline exceeded" to the literal "timeout" // token, so an equality check is enough to distinguish // hit-the-deadline failures from refused / reset / // unreachable errors. if msg.Err == "timeout" { kind = kindTimeout } return errEvent{ At: msg.At, VIPIdx: msg.VIPIdx, Kind: kind, Detail: msg.Err, }, true } if v.info.scheme != "tcp" && msg.Code >= 400 { return errEvent{ At: msg.At, VIPIdx: msg.VIPIdx, Kind: kindHTTPErr, Detail: fmt.Sprintf("HTTP %d", msg.Code), }, true } if spike { return errEvent{ At: msg.At, VIPIdx: msg.VIPIdx, Kind: kindSpike, Detail: fmt.Sprintf("%s (prev max %s)", formatDur(msg.Duration), formatDur(time.Duration(prevMax))), }, true } return errEvent{}, false }