// vpp-maglev/cmd/tester/stats.go

// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>

package main

import "sort"

// rollingSize is the bounded history maglevt keeps per VIP for
// latency percentiles and success-ratio display. At 100 samples
// and the default 100ms probe interval, this is a ~10s window —
// short enough to react quickly to failover events, long enough
// that p50/p95 are statistically meaningful.
//
// It is both the length of the ring that newRolling allocates and
// the modulus record uses to wrap the write index, so changing it
// resizes every rolling window.
const rollingSize = 100

// rolling holds the last rollingSize probe results for one VIP plus
// aggregates derived from them. Individual samples live in a ring so
// percentiles can be computed on demand; the success/failure counts
// and the latency sum are maintained incrementally by record(), which
// backs out an evicted sample's contribution before overwriting it.
//
// Not safe for concurrent use: no method takes a lock. Per the
// original design note, every user of a rolling runs on the bubbletea
// dispatch goroutine, so none is needed.
type rolling struct {
	// samples is the ring storage; newRolling allocates it with
	// rollingSize slots.
	samples []sample

	// idx is the slot the next record() call writes to. n is the
	// number of slots holding valid samples (0..rollingSize).
	idx, n int

	// ok and fail count the successful and failed samples currently
	// in the window.
	ok, fail int

	// sumNS is the sum of ns over the samples in the window. minNS
	// and maxNS are the smallest and largest ns in the window; all
	// three are 0 while n == 0.
	sumNS, minNS, maxNS uint64
}

// sample is one probe result as stored in a rolling window's ring.
type sample struct {
	ns uint64 // probe latency in nanoseconds; feeds sumNS, minNS/maxNS and percentiles()
	ok bool   // true counts toward rolling.ok, false toward rolling.fail
}

// newRolling allocates a rolling window whose ring already has
// rollingSize slots, with every counter at zero, so record() can be
// called on it immediately.
func newRolling() *rolling {
	r := new(rolling)
	r.samples = make([]sample, rollingSize)
	return r
}

// reset returns the window to its freshly-constructed state: every
// ring slot is zeroed in place and all counters and extrema go back
// to 0. The existing ring storage is kept, not reallocated. Per the
// original note this is driven by the 'r' keybinding and the --reset
// flow (those callers are not visible in this file).
func (r *rolling) reset() {
	ring := r.samples
	for i := 0; i < len(ring); i++ {
		ring[i] = sample{}
	}
	// Rebuild the struct around the same backing array; every other
	// field takes its zero value.
	*r = rolling{samples: ring}
}

// record folds one probe result (latency ns, success flag ok) into
// the window. While the window is still filling it simply grows;
// once it holds rollingSize samples the slot at the write index is
// the oldest one, so its contribution to sumNS and to ok/fail is
// backed out before it is overwritten. That keeps the aggregates
// equal to what a full re-walk of the ring would produce without
// doing one per insert.
//
// minNS/maxNS are the exception: they are rebuilt by scanning the
// valid samples after the write. The scan is bounded by rollingSize,
// which is cheaper to reason about than incremental extremum
// tracking across evictions.
func (r *rolling) record(ns uint64, ok bool) {
	slot := r.idx

	if r.n != rollingSize {
		// Still warming up: nothing to evict, the window grows by one.
		r.n++
	} else {
		// Full ring: undo the aggregates of the sample being replaced.
		evicted := r.samples[slot]
		r.sumNS -= evicted.ns
		if evicted.ok {
			r.ok--
		} else {
			r.fail--
		}
	}

	// Store the new sample and advance the write index with wrap.
	r.samples[slot] = sample{ns: ns, ok: ok}
	r.idx = (slot + 1) % rollingSize

	r.sumNS += ns
	if ok {
		r.ok++
	} else {
		r.fail++
	}

	// Re-derive the extrema from the valid part of the ring. During
	// warmup the valid samples occupy slots [0, n); once full, n
	// equals the ring length, so the same slice covers both cases.
	lo, hi := ^uint64(0), uint64(0)
	for _, s := range r.samples[:r.n] {
		if s.ns < lo {
			lo = s.ns
		}
		if s.ns > hi {
			hi = s.ns
		}
	}
	r.minNS, r.maxNS = lo, hi
}

// percentiles returns (p50, p95, p99) in nanoseconds over the
// samples currently in the window, or all zeros if the window is
// empty.
//
// The n valid samples are copied into a scratch slice (the ring
// itself is never reordered), sorted ascending, and each percentile
// is read at index n*pct/100 using integer division. Because
// pct < 100, that index is always < n, so no clamping is required.
// During warmup the truncation means several percentiles can land
// on the same slot (e.g. with n == 1 all three return the only
// sample); they separate as the window fills.
//
// Every call allocates and sorts at most rollingSize values.
func (r *rolling) percentiles() (p50, p95, p99 uint64) {
	if r.n == 0 {
		return 0, 0, 0
	}
	buf := make([]uint64, r.n)
	for i := range buf {
		buf[i] = r.samples[i].ns
	}
	sort.Slice(buf, func(i, j int) bool { return buf[i] < buf[j] })

	// at reads the sorted value for percentile pct (0 <= pct < 100).
	at := func(pct int) uint64 { return buf[len(buf)*pct/100] }
	return at(50), at(95), at(99)
}

// successPct reports which share of the samples currently in the
// window were recorded as successful, as a percentage in [0, 100].
// An empty window yields 0 rather than dividing by zero.
func (r *rolling) successPct() float64 {
	var pct float64
	if r.n != 0 {
		pct = 100.0 * float64(r.ok) / float64(r.n)
	}
	return pct
}

// isSpike reports whether ns is more than 25% above the current
// window maximum (ns > maxNS + maxNS/4, integer arithmetic).
//
// It returns false while the window holds fewer than 10 samples or
// while maxNS is 0: during warmup the maximum is just whatever
// arrived first, so a 1.25× threshold against it is meaningless.
//
// Intended to be called *before* record() on the same sample, so the
// comparison runs against the previous window maximum rather than
// one the new sample has already raised. isSpike itself does not
// modify the window.
func (r *rolling) isSpike(ns uint64) bool {
	if r.n < 10 || r.maxNS == 0 {
		return false
	}
	if ns <= r.maxNS {
		return false
	}
	// Equivalent to ns > maxNS + maxNS/4, but compares the excess over
	// maxNS instead of forming the sum: the sum wraps around in uint64
	// when maxNS is very large, which would shrink the threshold and
	// flag ordinary samples as spikes.
	return ns-r.maxNS > r.maxNS/4
}