A small bubbletea TUI that reads maglev.yaml (repeatable --config), enumerates every VIP, and probes each from outside the load balancer on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get a GET against a configurable URI (default /.well-known/ipng/healthz) with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL counters, LAST status, and a response-header tally. Non-HTTP VIPs get a TCP connect probe. A bounded error panel classifies anomalies as timeout / http-err / net-err / spike and auto-sizes to fill the screen. Utility: during a failover drill (backend flap, AS drain, config push) the tally panel shows which backend each VIP is actually steering to, with two-colour activity highlighting over a 5s window — white = receiving traffic, grey = drained. Paired with the rolling OK%/latency columns it gives an at-a-glance answer to "is the VIP healthy from the outside right now, and which backend is it hitting", without relying on maglevd's own view of the world. Also bumps Makefile/go.mod to build the new binary.
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>

package main

import (
	"fmt"
	"time"

	tea "github.com/charmbracelet/bubbletea"
)

// tallyWindow is the sliding-window length used to classify tally
// entries as "actively receiving traffic" versus "drained". A probe
// snapshot of each VIP's tally is rotated into vipState.tallyOld once
// per window, so on steady state the delta (tally - tallyOld) reflects
// somewhere between one and two windows (5–10s) of activity — long
// enough to be noise-resistant, short enough that a flush or graceful
// drain shows up within the next window of UI redraws.
const tallyWindow = 5 * time.Second
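
// A minimal sketch (not part of maglevt) of the per-window delta the
// tally panel derives from the two snapshots: a backend whose count
// grew since tallyOld is still receiving traffic, while a zero delta
// means drained. tallyDeltaSketch is a hypothetical name used only
// for illustration here.
func tallyDeltaSketch(cur, old map[string]int) map[string]int {
	d := make(map[string]int, len(cur))
	for k, v := range cur {
		d[k] = v - old[k] // keys absent from old read as zero
	}
	return d
}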

// errWindowSize is the hard storage cap for Model.events. It isn't
// the number of rows rendered to screen — that's computed per-frame
// in View() from the terminal height so the events panel fills
// whatever space is left after the header, table, tally, and
// footer. This cap only exists to stop the ring from growing
// unbounded on a very long-running session that's seeing constant
// anomalies: 500 × ~100 bytes per event is ~50 KB, negligible.
const errWindowSize = 500

// errKind classifies why an event ended up in the error panel. The
// four kinds map one-to-one to the four situations Update flags
// from a probeResultMsg: a probe hit its timeout deadline, a probe
// came back with an HTTP 4xx/5xx, a probe failed at the network
// layer (connection refused, reset, unreachable, TLS handshake
// error), or a probe completed successfully but with a latency
// more than 25% above the rolling-window max.
type errKind int

const (
	kindTimeout errKind = iota
	kindHTTPErr
	kindNetErr
	kindSpike
)

func (k errKind) String() string {
	switch k {
	case kindTimeout:
		return "timeout"
	case kindHTTPErr:
		return "http-err"
	case kindNetErr:
		return "net-err"
	case kindSpike:
		return "spike"
	}
	return "unknown"
}

// errEvent is one entry in the bounded error-panel ring. VIPIdx
// points back into Model.vips so the view can look up the scheme
// and address for the row label at render time (we don't store a
// formatted label here to keep the event struct cheap and to let
// the view decide how to style it).
type errEvent struct {
	At     time.Time
	VIPIdx int
	Kind   errKind
	Detail string
}

// Model is the bubbletea Model for maglevt. Held by value throughout
// so bubbletea's copy-on-Update semantics work naturally; mutable
// per-VIP state lives behind *vipState pointers in the vips slice so
// probeResultMsg handlers can update rolling/tally without copying
// the whole model.
type Model struct {
	cfgPath string
	vips    []*vipState
	opts    probeOpts
	startAt time.Time
	width   int
	height  int
	help    bool       // whether the help overlay is currently shown
	events  []errEvent // bounded ring of recent anomalies (size errWindowSize)

	// showDNS toggles between hostname and IP-literal display in
	// the ADDR column and the tally/events labels. On by default:
	// operators usually know VIPs by name, and the 'd' key flips
	// to the raw literal when they need to see which address
	// family or which specific IP the row represents. Hostnames
	// come in asynchronously via hostnameMsg, so vipState.hostname
	// may still be empty for a VIP even when showDNS is true —
	// the display falls back to the IP literal in that case.
	showDNS bool
}

// vipState is the mutable per-VIP record threaded through the tea
// dispatch loop. vipState.info is the immutable descriptor built at
// startup (see probe.go::vipInfo), while everything else on this
// struct is rewritten as probe results arrive.
type vipState struct {
	info *vipInfo

	// hostname is the PTR-record lookup result for info.ip, filled
	// in asynchronously by runDNSLookup via hostnameMsg. Empty
	// until the lookup returns (or forever, if it fails or times
	// out). The UI consults Model.showDNS to decide whether to
	// use it.
	hostname string

	// Rolling stats populated from every probeResultMsg. Separate
	// from tally so reset semantics match the user's mental model:
	// pressing 'r' blows away both, but a future pause-clear-resume
	// cadence could reset just one.
	rolling *rolling
	tally   map[string]int

	// tallyOld / tallyNew are the two-slot rotating snapshots used
	// by the tally panel to classify each backend as green (still
	// receiving traffic), orange (receiving less than the leader),
	// or grey (drained). tallyNew is captured every tallyWindow;
	// on the next rotation it shifts into tallyOld, so the delta
	// (tally - tallyOld) always spans between 1 and 2 tallyWindow
	// units of activity. tallyAt is the wall-clock time tallyNew
	// was captured and drives the rotation decision in tickMsg.
	tallyOld map[string]int
	tallyNew map[string]int
	tallyAt  time.Time

	// Lifetime counters. Unlike the rolling window these never
	// forget until the operator hits 'r'. The N column in the
	// probe table renders totalProbes; FAIL renders totalFails
	// tinted red when non-zero so a failure that rolled off the
	// 100-sample rolling window still leaves a visible mark on
	// the cumulative count.
	totalProbes int64
	totalFails  int64

	// Last-seen values for the rightmost LAST column. These are
	// the "what happened on the most recent probe" snapshot the
	// UI shows as green/yellow/red.
	lastAt     time.Time
	lastOK     bool
	lastCode   int
	lastErr    string
	lastHeader string
	lastDur    time.Duration
}
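
// A hypothetical helper (sketch, not in this file) for the label
// fallback described on Model.showDNS: prefer the async PTR result
// when the toggle is on, otherwise the IP literal. Assumes vipInfo
// exposes the literal as a string field named ip, as the hostname
// comment above implies.
func displayLabelSketch(v *vipState, showDNS bool) string {
	if showDNS && v.hostname != "" {
		return v.hostname
	}
	return v.info.ip
}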

// tickMsg drives the periodic redraw even when no probe results are
// arriving (so the uptime counter in the header ticks along even on
// a completely idle VIP set). 250ms is fast enough to look live
// without burning CPU on layout work.
type tickMsg time.Time

func tickCmd() tea.Cmd {
	return tea.Tick(250*time.Millisecond, func(t time.Time) tea.Msg {
		return tickMsg(t)
	})
}
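
// For contrast with the fixed UI tick above, a sketch of the "default
// 100ms, ±10% jitter" probe cadence from the description; hypothetical,
// since the real scheduler lives in probe.go. Assumes math/rand/v2 is
// imported as rand.
func jitteredIntervalSketch(base time.Duration) time.Duration {
	// Scale base by a factor drawn uniformly from [0.9, 1.1).
	return time.Duration(float64(base) * (0.9 + 0.2*rand.Float64()))
}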

// Init kicks off the periodic redraw. The alt-screen entry and window
// title are set at NewProgram time in main.go.
func (m Model) Init() tea.Cmd {
	return tickCmd()
}

// Update handles every tea.Msg delivered to the program. Five
// message classes:
//
// - tea.WindowSizeMsg — resize; cache width/height for the View.
// - tea.KeyMsg — keybindings (quit, pause, reset, help, DNS toggle).
// - hostnameMsg — async PTR lookup finished; fill in the hostname.
// - probeResultMsg — probe goroutine delivered a new sample;
//   update rolling/tally/last* and the sparkline.
// - tickMsg — periodic redraw; re-arm the timer.
func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
	switch msg := msg.(type) {
	case tea.WindowSizeMsg:
		m.width = msg.Width
		m.height = msg.Height
		return m, nil

	case tea.KeyMsg:
		switch msg.String() {
		case "q", "ctrl+c":
			return m, tea.Quit
		case " ":
			// paused is the pause flag shared with the probe
			// scheduler, an atomic declared elsewhere in the package.
			paused.Store(!paused.Load())
			return m, nil
		case "r":
			// Unified reset: wipe per-VIP rolling windows, per-VIP
			// tallies, per-VIP sparklines, the global event ring,
			// and the global uptime origin so the header clock
			// starts fresh. 'r' is the one-key way to start a
			// clean capture window, which matches the "I'm about
			// to do a failover, watch this" flow.
			for _, v := range m.vips {
				v.rolling.reset()
				v.tally = map[string]int{}
				v.tallyOld = map[string]int{}
				v.tallyNew = map[string]int{}
				v.tallyAt = time.Time{}
				v.totalProbes = 0
				v.totalFails = 0
				v.lastAt = time.Time{}
				v.lastOK = false
				v.lastCode = 0
				v.lastErr = ""
				v.lastHeader = ""
				v.lastDur = 0
			}
			m.events = nil
			m.startAt = time.Now()
			return m, nil
		case "h", "?":
			m.help = !m.help
			return m, nil
		case "d":
			m.showDNS = !m.showDNS
			return m, nil
		}

	case hostnameMsg:
		if msg.VIPIdx >= 0 && msg.VIPIdx < len(m.vips) {
			m.vips[msg.VIPIdx].hostname = msg.Hostname
		}
		return m, nil

	case probeResultMsg:
		if msg.VIPIdx < 0 || msg.VIPIdx >= len(m.vips) {
			return m, nil
		}
		v := m.vips[msg.VIPIdx]
		ns := uint64(msg.Duration.Nanoseconds())
		prevMax := v.rolling.maxNS
		spike := v.rolling.isSpike(ns)

		v.lastAt = msg.At
		v.lastOK = msg.OK
		v.lastCode = msg.Code
		v.lastErr = msg.Err
		v.lastHeader = msg.Header
		v.lastDur = msg.Duration
		v.totalProbes++
		if !msg.OK {
			v.totalFails++
		}
		v.rolling.record(ns, msg.OK)
		if msg.Header != "" {
			v.tally[msg.Header]++
		}

		// Classify the sample for the error panel. Order matters:
		// a network error is always more interesting than a latency
		// observation (the latency is noise from the failure
		// itself), an HTTP error is more interesting than a spike
		// (a 503 dominates a 10ms vs 12ms latency blip), and a
		// spike is only flagged on otherwise-successful samples.
		if ev, ok := classifyEvent(msg, v, spike, prevMax); ok {
			m.events = append(m.events, ev)
			if len(m.events) > errWindowSize {
				m.events = m.events[len(m.events)-errWindowSize:]
			}
		}
		return m, nil

	case tickMsg:
		// Rotate each VIP's tally snapshot once the window has
		// elapsed. Skipping while paused keeps the tally colours
		// frozen at their pre-pause state instead of decaying
		// everything to grey as deltas naturally fall to zero.
		if !paused.Load() {
			now := time.Time(msg)
			for _, v := range m.vips {
				if v.tallyAt.IsZero() || now.Sub(v.tallyAt) >= tallyWindow {
					v.tallyOld = v.tallyNew
					v.tallyNew = cloneTally(v.tally)
					v.tallyAt = now
				}
			}
		}
		return m, tickCmd()
	}
	return m, nil
}
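
// A sketch of the async PTR lookup feeding the hostnameMsg case above.
// Hypothetical: the real runDNSLookup and the hostnameMsg type live in
// another file. Assumes the net and strings packages are imported and
// that hostnameMsg carries the VIPIdx/Hostname fields Update reads.
func runDNSLookupSketch(idx int, ip string) tea.Cmd {
	return func() tea.Msg {
		names, err := net.LookupAddr(ip)
		if err != nil || len(names) == 0 {
			// Leave Hostname empty; the UI falls back to the literal.
			return hostnameMsg{VIPIdx: idx}
		}
		return hostnameMsg{VIPIdx: idx, Hostname: strings.TrimSuffix(names[0], ".")}
	}
}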

// cloneTally returns a shallow copy of src suitable for the two-slot
// rotation in vipState. The snapshot must be independent of the live
// tally because subsequent probes keep mutating the original map;
// without the copy, tallyNew and tally would alias and the delta
// would always be zero.
func cloneTally(src map[string]int) map[string]int {
	out := make(map[string]int, len(src))
	for k, v := range src {
		out[k] = v
	}
	return out
}
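
// Aside: on Go 1.21+ the standard library's maps.Clone performs the
// same shallow copy (modulo returning nil for a nil input), so
// maps.Clone(v.tally) could replace cloneTally where the toolchain
// allows it.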

// classifyEvent inspects a probeResultMsg and returns the matching
// errEvent (if any) for the error panel. Returns (_, false) when
// the sample is uninteresting — a boring 2xx/3xx HTTP response or
// a successful TCP connect. The four classes are checked in
// priority order: network/timeout errors trump HTTP status, which
// trumps latency spikes, because a failed probe's latency is noise
// inherited from the failure.
//
// prevMax is the rolling-window max seen *before* this sample was
// recorded. It's included in the spike Detail so the operator can
// see the baseline the current probe blew past ("482ms (was
// 98ms)") rather than just an absolute number with no context.
func classifyEvent(msg probeResultMsg, v *vipState, spike bool, prevMax uint64) (errEvent, bool) {
	if msg.Err != "" {
		kind := kindNetErr
		// shortError already collapses "i/o timeout" and
		// "context deadline exceeded" to the literal "timeout"
		// token, so an equality check is enough to distinguish
		// hit-the-deadline failures from refused / reset /
		// unreachable errors.
		if msg.Err == "timeout" {
			kind = kindTimeout
		}
		return errEvent{
			At:     msg.At,
			VIPIdx: msg.VIPIdx,
			Kind:   kind,
			Detail: msg.Err,
		}, true
	}
	if v.info.scheme != "tcp" && msg.Code >= 400 {
		return errEvent{
			At:     msg.At,
			VIPIdx: msg.VIPIdx,
			Kind:   kindHTTPErr,
			Detail: fmt.Sprintf("HTTP %d", msg.Code),
		}, true
	}
	if spike {
		return errEvent{
			At:     msg.At,
			VIPIdx: msg.VIPIdx,
			Kind:   kindSpike,
			Detail: fmt.Sprintf("%s (prev max %s)",
				formatDur(msg.Duration),
				formatDur(time.Duration(prevMax))),
		}, true
	}
	return errEvent{}, false
}
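
// A sketch of the 25%-above-rolling-max spike rule described on errKind.
// Hypothetical stand-in: the real predicate is rolling.isSpike, defined
// alongside the rolling-window implementation in another file. The
// prevMaxNS > 0 guard (an assumption here) avoids flagging the very
// first sample, which has no baseline to compare against.
func isSpikeSketch(ns, prevMaxNS uint64) bool {
	return prevMaxNS > 0 && float64(ns) > 1.25*float64(prevMaxNS)
}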