New maglevt TUI component: out-of-band VIP health monitor

A small bubbletea TUI that reads maglev.yaml (repeatable --config),
enumerates every VIP, and probes each from outside the load balancer
on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get
a GET against a configurable URI (default /.well-known/ipng/healthz)
with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL
counters, LAST status, and a response-header tally. Non-HTTP VIPs
get a TCP connect probe. A bounded error panel classifies anomalies
as timeout / http-err / net-err / spike and auto-sizes to fill the
screen.

Utility: during a failover drill (backend flap, AS drain, config
push) the tally panel shows which backend each VIP is actually
steering to, with two-colour activity highlighting over a 5s
window — white = receiving traffic, grey = drained. Paired with
the rolling OK%/latency columns it gives an at-a-glance answer to
"is the VIP healthy from the outside right now, and which backend
is it hitting", without relying on maglevd's own view of the
world.

Also bumps Makefile/go.mod to build the new binary.
This commit is contained in:
2026-04-15 01:23:34 +02:00
parent 744b1cb3d2
commit 6293521157
8 changed files with 1890 additions and 1 deletions

305
cmd/tester/main.go Normal file
View File

@@ -0,0 +1,305 @@
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
// maglevt is a tiny TUI that reads maglev.yaml, enumerates every VIP
// and hits it on a tight cadence (default 100ms) from outside the load
// balancer. HTTP/HTTPS VIPs get a GET request with per-VIP rolling
// latency stats, success/failure ratios, and a running tally of a
// configurable response header (default: X-IPng-Frontend) so pool-
// failover events show up as visible reshuffles in the tally. Non-HTTP
// VIPs get a plain TCP-connect probe for liveness. See maglevt --help
// for the flag surface.
package main
import (
"context"
"flag"
"fmt"
"net"
"os"
"regexp"
"sort"
"strings"
"time"
tea "github.com/charmbracelet/bubbletea"
buildinfo "git.ipng.ch/ipng/vpp-maglev/cmd"
"git.ipng.ch/ipng/vpp-maglev/internal/config"
)
// main delegates all work to run and converts a returned error into a
// single "maglevt: ..." line on stderr plus exit status 1.
func main() {
	err := run()
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "maglevt: %v\n", err)
	os.Exit(1)
}
// run is the real entry point: it parses the command line, loads
// every requested config, builds the deduplicated VIP list, starts
// one probe goroutine and one reverse-DNS lookup per VIP, and then
// drives the TUI until it exits.
//
// It returns an error when --interval is not positive, when any
// config fails its parse/semantic check, when --filter is not a valid
// regex, when no frontend survives filtering, or when the TUI itself
// returns one. --version prints the build info and returns nil
// without touching any config.
func run() error {
	var cfgPaths multiFlag
	flag.Var(&cfgPaths, "config", "path to maglev.yaml (repeatable; also accepts a comma-separated list). Frontends are unioned across files, deduplicated by (address, protocol, port).")
	interval := flag.Duration("interval", 100*time.Millisecond, "probe interval per VIP (±10% jitter)")
	timeout := flag.Duration("timeout", 2*time.Second, "per-request timeout")
	host := flag.String("host", "", "Host header override (default: VIP address literal)")
	// Default probe URI: a small, deliberate health-check path that
	// typically returns 204 No Content and doesn't hit the backend
	// app logs. /.well-known/ipng/healthz is the convention for
	// IPng deployments; override with --uri for anything else.
	// --path is registered as a synonym for backward compatibility
	// with the pre-1.0 flag name — both set the same variable, so
	// whichever the operator types last on the command line wins.
	const defaultURI = "/.well-known/ipng/healthz"
	path := flag.String("uri", defaultURI, "HTTP request path (URI) used in the GET request")
	flag.StringVar(path, "path", defaultURI, "alias for --uri")
	header := flag.String("header", "X-IPng-Frontend", "response header to extract and tally")
	insecure := flag.Bool("insecure", true, "skip TLS verification for HTTPS")
	keepalive := flag.Bool("keepalive", false, "enable HTTP keep-alives (disabled by default so each probe opens a fresh connection — required for failover visibility)")
	flag.BoolVar(keepalive, "k", false, "shorthand for --keepalive")
	filter := flag.String("filter", "", "only probe frontends whose name matches this regex")
	printVersion := flag.Bool("version", false, "print version and exit")
	flag.Parse()

	if *printVersion {
		fmt.Printf("maglevt %s (commit %s, built %s)\n",
			buildinfo.Version(), buildinfo.Commit(), buildinfo.Date())
		return nil
	}

	// A zero or negative interval is not a usable probe cadence.
	// Reject it here, before any goroutine is started, instead of
	// handing it to every per-VIP probe loop.
	if *interval <= 0 {
		return fmt.Errorf("invalid --interval %v: must be greater than zero", *interval)
	}

	if len(cfgPaths) == 0 {
		cfgPaths = multiFlag{"/etc/vpp-maglev/maglev.yaml"}
	}

	// Load every requested config. A parse/semantic error on any of
	// them is fatal — we want the user to see it rather than silently
	// probing a reduced set of VIPs because one file was broken.
	configs := make([]*config.Config, 0, len(cfgPaths))
	for _, p := range cfgPaths {
		cfg, res := config.Check(p)
		if !res.OK() {
			if res.ParseError != "" {
				return fmt.Errorf("config parse %s: %s", p, res.ParseError)
			}
			return fmt.Errorf("config semantic %s: %s", p, res.SemanticError)
		}
		configs = append(configs, cfg)
	}

	var filterRe *regexp.Regexp
	if *filter != "" {
		var err error
		filterRe, err = regexp.Compile(*filter)
		if err != nil {
			return fmt.Errorf("invalid --filter regex: %w", err)
		}
	}

	opts := probeOpts{
		Interval:  *interval,
		Timeout:   *timeout,
		Host:      *host,
		Path:      *path,
		Header:    *header,
		Insecure:  *insecure,
		KeepAlive: *keepalive,
	}
	vips := buildVIPsUnion(configs, cfgPaths, filterRe, opts)
	if len(vips) == 0 {
		return fmt.Errorf("no matching frontends in %s", strings.Join(cfgPaths, ", "))
	}

	m := Model{
		cfgPath: strings.Join(cfgPaths, ", "),
		vips:    vips,
		opts:    opts,
		startAt: time.Now(),
		showDNS: true,
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	prog := tea.NewProgram(m, tea.WithAltScreen())

	// Spawn one probe goroutine per VIP. Each sends probeResultMsg
	// into the tea.Program via prog.Send, which is thread-safe.
	// Alongside the prober we kick off a one-shot reverse-DNS
	// lookup so the 'd' toggle has a hostname to display; the
	// lookup is best-effort and simply drops on timeout or NXDOMAIN.
	for _, v := range vips {
		go runProbeLoop(ctx, v.info, opts, prog.Send)
		go runDNSLookup(ctx, v.info, prog.Send)
	}

	_, err := prog.Run()
	cancel()
	// Give the workers a beat to observe ctx and exit. This isn't
	// strictly required — the process is exiting anyway — but a clean
	// shutdown avoids the "unexpected EOF writing to a closed
	// transport" spam that HTTP clients sometimes emit on ctrl-C.
	time.Sleep(50 * time.Millisecond)
	return err
}
// buildVIPsUnion merges the frontends of every config in cfgs into a
// single deduplicated, display-ordered list of probe targets.
//
// A target's identity is its (address, scheme, port) tuple rather
// than its yaml frontend name: two files may reuse one name for
// different tuples, so the name is consulted only for the --filter
// regex (applied before dedup) and is then discarded. The address
// part of the tuple is the address's String() form, so two spellings
// that stringify identically collapse onto one entry. When a tuple
// appears in more than one file, the first file to mention it wins.
//
// Frontends whose protocol is not TCP (case-insensitive) or whose
// port is zero are skipped. HTTP and HTTPS targets additionally get
// a probe URL built from opts.Path and an HTTP client built from
// opts; everything else is left as a bare TCP target.
//
// cfgPaths is accepted but currently unused. The returned slice is
// never nil, and each entry's info.idx equals its position in it.
func buildVIPsUnion(cfgs []*config.Config, cfgPaths []string, filterRe *regexp.Regexp, opts probeOpts) []*vipState {
	_ = cfgPaths // reserved for future diagnostics (e.g. which file this tuple came from)

	type tuple struct {
		ip     string
		scheme string
		port   uint16
	}
	claimed := map[tuple]struct{}{}
	out := make([]*vipState, 0)

	for _, cfg := range cfgs {
		// Map iteration order is random, so walk the frontend names
		// in sorted order to keep "first occurrence wins" stable.
		names := make([]string, 0, len(cfg.Frontends))
		for name := range cfg.Frontends {
			names = append(names, name)
		}
		sortStringsInPlace(names)

		for _, name := range names {
			if filterRe != nil && !filterRe.MatchString(name) {
				continue
			}
			fe := cfg.Frontends[name]
			if fe.Port == 0 || strings.ToLower(fe.Protocol) != "tcp" {
				continue
			}

			scheme := schemeForPort(fe.Port)
			id := tuple{ip: fe.Address.String(), scheme: scheme, port: fe.Port}
			if _, dup := claimed[id]; dup {
				continue // an earlier file (or name) already owns this tuple
			}
			claimed[id] = struct{}{}

			info := &vipInfo{
				idx:    len(out),
				scheme: scheme,
				ip:     fe.Address,
				port:   fe.Port,
			}
			switch scheme {
			case "http", "https":
				info.url = buildURL(scheme, fe.Address, fe.Port, opts.Path)
				info.client = newHTTPClient(opts)
			}

			out = append(out, &vipState{
				info:     info,
				rolling:  newRolling(),
				tally:    map[string]int{},
				tallyOld: map[string]int{},
				tallyNew: map[string]int{},
			})
		}
	}

	// Display order: IPv6 rows first, then higher ports above lower
	// ones within an address family (so :443 sits above :80), and
	// finally the address string as a deterministic tiebreaker.
	sort.SliceStable(out, func(a, b int) bool {
		left, right := out[a].info, out[b].info
		leftIs6 := left.ip.To4() == nil
		rightIs6 := right.ip.To4() == nil
		switch {
		case leftIs6 != rightIs6:
			return leftIs6
		case left.port != right.port:
			return left.port > right.port
		default:
			return left.ip.String() < right.ip.String()
		}
	})

	// The sort moved entries around, so rewrite every idx to the slot
	// the VIP now occupies; the original comment notes that
	// probeResultMsg.VIPIdx is resolved through this index.
	for slot := range out {
		out[slot].info.idx = slot
	}
	return out
}
// multiFlag is a flag.Value that collects config paths. Every Set
// call appends to the slice, and a single value may itself carry
// several comma-separated paths, so `--config a.yaml,b.yaml` and
// `--config a.yaml --config b.yaml` end up with the same contents.
type multiFlag []string

// String renders the collected paths as one comma-joined string.
func (m *multiFlag) String() string {
	paths := []string(*m)
	return strings.Join(paths, ",")
}

// Set splits v on commas, trims surrounding whitespace from each
// piece, drops pieces that are empty after trimming, and appends the
// rest in order. It never returns an error.
func (m *multiFlag) Set(v string) error {
	var added []string
	for _, field := range strings.Split(v, ",") {
		if trimmed := strings.TrimSpace(field); trimmed != "" {
			added = append(added, trimmed)
		}
	}
	*m = append(*m, added...)
	return nil
}
// schemeForPort picks the probe scheme from a VIP's TCP port: 80 is
// "http", 443 is "https", and every other port is "tcp". The mapping
// is deliberately narrow so that only ports that are unambiguously
// web get an HTTP request; other services are not sent HTTP traffic
// they would not understand. More ports (e.g. 8080/8443) can be
// added later, but the default should stay conservative.
func schemeForPort(port uint16) string {
	if port == 80 {
		return "http"
	}
	if port == 443 {
		return "https"
	}
	return "tcp"
}
// buildURL constructs the probe URL for an HTTP/HTTPS VIP as
// scheme://host:port/path.
//
// IPv6 literals are bracketed per RFC 3986 §3.2.2 so the colons in
// the address aren't confused with the port separator; IPv4
// addresses are used as-is. The path is normalised to begin with
// "/": an empty path becomes "/", and a path supplied without its
// leading slash (e.g. "healthz") gets one prepended, since it would
// otherwise be glued directly onto the port digits.
func buildURL(scheme string, ip net.IP, port uint16, path string) string {
	host := ip.String()
	if ip.To4() == nil {
		host = "[" + host + "]"
	}
	if !strings.HasPrefix(path, "/") {
		path = "/" + path
	}
	return fmt.Sprintf("%s://%s:%d%s", scheme, host, port, path)
}
// sortStringsInPlace sorts s into ascending order, in place. It is
// kept as a named helper for its existing caller but simply delegates
// to sort.Strings: this file already imports "sort", so a hand-rolled
// insertion sort saves no import and only adds quadratic worst-case
// behaviour.
func sortStringsInPlace(s []string) {
	sort.Strings(s)
}