New maglevt TUI component: out-of-band VIP health monitor
A small bubbletea TUI that reads maglev.yaml (repeatable --config), enumerates every VIP, and probes each from outside the load balancer on a tight cadence (default 100ms, ±10% jitter). HTTP/HTTPS VIPs get a GET against a configurable URI (default /.well-known/ipng/healthz) with per-VIP rolling latency (p50/p95/p99/max), lifetime N/FAIL counters, LAST status, and a response-header tally. Non-HTTP VIPs get a TCP connect probe. A bounded error panel classifies anomalies as timeout / http-err / net-err / spike and auto-sizes to fill the screen. Utility: during a failover drill (backend flap, AS drain, config push) the tally panel shows which backend each VIP is actually steering to, with two-colour activity highlighting over a 5s window — white = receiving traffic, grey = drained. Paired with the rolling OK%/latency columns it gives an at-a-glance answer to "is the VIP healthy from the outside right now, and which backend is it hitting", without relying on maglevd's own view of the world. Also bumps Makefile/go.mod to build the new binary.
This commit is contained in:
292
cmd/tester/probe.go
Normal file
292
cmd/tester/probe.go
Normal file
@@ -0,0 +1,292 @@
|
||||
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"math/rand/v2"
|
||||
"net"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
// probeOpts bundles the runtime probe configuration coming from the
// command-line flags. Each probe goroutine gets a copy at startup;
// nothing in here mutates during the program's lifetime (pauses flow
// through the global `paused` atomic instead).
type probeOpts struct {
	Interval  time.Duration // target gap between consecutive probes of one VIP (±10% jitter applied per tick)
	Timeout   time.Duration // per-probe deadline covering connect, TLS, and response headers
	Host      string        // optional Host-header override for HTTP(S) probes; empty means "use the URL authority"
	Path      string        // request path probed on HTTP(S) VIPs (default /.well-known/ipng/healthz)
	Header    string        // response header to extract into probeResultMsg.Header; empty disables extraction
	Insecure  bool          // skip TLS certificate verification (curl -k behaviour); on by default
	KeepAlive bool          // reuse connections across probes instead of opening a fresh one each time
}
|
||||
|
||||
// vipInfo is the immutable per-VIP descriptor built from the maglev
// config at startup. Kept separate from vipState (which holds mutable
// stats) so the probe goroutine can read its target without racing
// the Update loop in model.go.
//
// Identity of a VIP in maglevt is the (scheme, ip, port) tuple, not
// the symbolic name from the config: when we union multiple maglev
// yaml files, the same name can describe different tuples across
// deployments and the tuple is the unambiguous key. The TUI shows
// scheme + ip:port and nothing else, so there's no need to carry a
// display name at all.
type vipInfo struct {
	idx    int          // index into Model.vips (stable for the process lifetime)
	scheme string       // "http", "https", or "tcp"
	ip     net.IP       // the VIP address
	port   uint16       // the VIP port (TCP)
	url    string       // assembled probe URL for http/https; empty for tcp
	client *http.Client // per-VIP client built by newHTTPClient; presumably unset for tcp-only VIPs — TODO confirm at construction site
}
|
||||
|
||||
// probeResultMsg is the tea.Msg sent from probe goroutines to the UI
// on every probe completion. Bubbletea delivers it into Model.Update
// on the tea dispatch goroutine, so the model can mutate its per-VIP
// state without locks.
type probeResultMsg struct {
	VIPIdx   int           // index into Model.vips identifying which VIP this result belongs to
	At       time.Time     // wall-clock instant the probe started
	Duration time.Duration // elapsed time until success or failure (for HTTP: until response headers)
	OK       bool          // true on success: TCP handshake completed, or HTTP status in [200, 400)
	Code     int           // HTTP status code (0 for tcp-only probes)
	Header   string        // extracted response header value, empty if absent
	Err      string        // empty when OK; populated with a short error string otherwise
}
|
||||
|
||||
// hostnameMsg is the tea.Msg the DNS resolver worker sends once it
// has a PTR record for a VIP. The UI uses it to populate
// vipState.hostname so the 'd' toggle has something to show. One
// message per VIP at most — lookup failures just drop silently
// and the VIP stays on its IP literal.
type hostnameMsg struct {
	VIPIdx   int    // index into Model.vips identifying the VIP the PTR lookup was for
	Hostname string // resolved PTR name with any trailing dot stripped
}
|
||||
|
||||
// paused is the global pause flag flipped by the spacebar binding in
// Model.Update. Using an atomic rather than a channel keeps the probe
// loop dead-simple — no extra select case, no risk of wedging a
// goroutine on a full buffered channel. While set, probe workers skip
// the probe call but their timers keep running, so resume picks up at
// the next natural tick.
var paused atomic.Bool
|
||||
|
||||
// newHTTPClient builds an *http.Client with the transport configured
|
||||
// the way maglevt wants its probes to behave. The two flags that
|
||||
// actually matter here are:
|
||||
//
|
||||
// - DisableKeepAlives = !opts.KeepAlive: off by default, because
|
||||
// failover testing relies on every probe opening a fresh TCP+TLS
|
||||
// connection. A single persistent connection would pin us to one
|
||||
// backend until the keep-alive timer expires, and the tally would
|
||||
// silently lie about which backend the load balancer is actually
|
||||
// steering new flows to. --keepalive/-k flips this back on for
|
||||
// users who explicitly want the pinned-session view.
|
||||
// - InsecureSkipVerify: on by default, matching `curl -k`. VIP
|
||||
// certificates almost never match the raw IP literal we're
|
||||
// targeting, so verification would fail on the first probe. Users
|
||||
// who want strict verification can pass --insecure=false.
|
||||
//
|
||||
// Redirects are refused (CheckRedirect returns ErrUseLastResponse) so
|
||||
// a 302 from one backend doesn't silently advance to another VIP and
|
||||
// pollute the tally.
|
||||
func newHTTPClient(opts probeOpts) *http.Client {
|
||||
tr := &http.Transport{
|
||||
TLSClientConfig: &tls.Config{
|
||||
InsecureSkipVerify: opts.Insecure, //nolint:gosec // matching curl -k, see comment above
|
||||
},
|
||||
DisableKeepAlives: !opts.KeepAlive,
|
||||
// We dial the VIP literal directly — no DNS involvement —
|
||||
// so the default DialContext is already what we want.
|
||||
}
|
||||
return &http.Client{
|
||||
Transport: tr,
|
||||
Timeout: opts.Timeout,
|
||||
CheckRedirect: func(*http.Request, []*http.Request) error {
|
||||
return http.ErrUseLastResponse
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// runProbeLoop is the per-VIP probe worker. It waits an initial random
|
||||
// delay (so N VIPs don't phase-lock onto the same tick after startup),
|
||||
// then fires a probe every opts.Interval ± 10% jitter until ctx is
|
||||
// cancelled. Each completed probe posts a probeResultMsg into the
|
||||
// tea.Program via send.
|
||||
//
|
||||
// The loop honors the global `paused` flag by simply skipping the
|
||||
// probe call while paused — the ticker keeps ticking so a resume
|
||||
// picks up at the next natural tick boundary instead of fast-
|
||||
// forwarding through a burst of back-to-back probes.
|
||||
func runProbeLoop(ctx context.Context, vip *vipInfo, opts probeOpts, send func(tea.Msg)) {
|
||||
// Initial offset: uniformly in [0, interval) so N goroutines
|
||||
// started together spread out across one interval window rather
|
||||
// than all firing at t=0.
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(time.Duration(rand.Int64N(int64(opts.Interval)))):
|
||||
}
|
||||
for {
|
||||
sleepFor := jitter(opts.Interval)
|
||||
if !paused.Load() {
|
||||
result := doProbe(ctx, vip, opts)
|
||||
send(result)
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(sleepFor):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runDNSLookup does a single reverse-DNS (PTR) lookup for vip.ip
|
||||
// and, on success, delivers a hostnameMsg to the UI. Runs once
|
||||
// per VIP at startup and then exits — PTR records change rarely
|
||||
// enough that we don't bother re-querying them, and the operator
|
||||
// can always restart maglevt to pick up a new mapping.
|
||||
//
|
||||
// A 3-second timeout keeps a broken resolver from wedging the
|
||||
// worker for the full life of the program; on timeout we just
|
||||
// give up silently and the VIP displays its IP literal forever.
|
||||
// Trailing dots are stripped so the rendered hostname matches
|
||||
// what the operator typed in their zone file.
|
||||
func runDNSLookup(parent context.Context, vip *vipInfo, send func(tea.Msg)) {
|
||||
ctx, cancel := context.WithTimeout(parent, 3*time.Second)
|
||||
defer cancel()
|
||||
names, err := net.DefaultResolver.LookupAddr(ctx, vip.ip.String())
|
||||
if err != nil || len(names) == 0 {
|
||||
return
|
||||
}
|
||||
send(hostnameMsg{
|
||||
VIPIdx: vip.idx,
|
||||
Hostname: strings.TrimSuffix(names[0], "."),
|
||||
})
|
||||
}
|
||||
|
||||
// jitter scales d by a uniform factor in [0.9, 1.1) — the same ±10%
|
||||
// jitter the checker uses, same rationale: probes don't phase-lock on
|
||||
// a wall-clock tick across every VIP in the config.
|
||||
func jitter(d time.Duration) time.Duration {
|
||||
if d <= 0 {
|
||||
return d
|
||||
}
|
||||
return time.Duration(float64(d) * (0.9 + 0.2*rand.Float64()))
|
||||
}
|
||||
|
||||
// doProbe issues one probe against vip and returns a filled-in
|
||||
// probeResultMsg. HTTP / HTTPS go through vip.client with a GET
|
||||
// request against opts.Path (default /.well-known/ipng/healthz),
|
||||
// a Host-header override, and header extraction. Non-HTTP VIPs do
|
||||
// a plain TCP connect — success if the three-way handshake
|
||||
// completes before opts.Timeout.
|
||||
//
|
||||
// GET rather than HEAD: a common health-check path (healthz,
|
||||
// status, /-/healthy) often returns 204 or 200 and is cheap to
|
||||
// serve, but some handlers don't wire HEAD and would 405 us back.
|
||||
// GET works against every reasonable implementation and the
|
||||
// rolling-window latency is unchanged (we time until headers, not
|
||||
// until the body completes — resp.Body.Close() discards the body
|
||||
// without reading it). The defer below still closes the body so
|
||||
// the transport can recycle the connection if --keepalive is on.
|
||||
func doProbe(parent context.Context, vip *vipInfo, opts probeOpts) probeResultMsg {
|
||||
start := time.Now()
|
||||
result := probeResultMsg{VIPIdx: vip.idx, At: start}
|
||||
|
||||
ctx, cancel := context.WithTimeout(parent, opts.Timeout)
|
||||
defer cancel()
|
||||
|
||||
if vip.scheme == "tcp" {
|
||||
d := net.Dialer{Timeout: opts.Timeout}
|
||||
conn, err := d.DialContext(ctx, "tcp",
|
||||
net.JoinHostPort(vip.ip.String(), strconv.Itoa(int(vip.port))))
|
||||
result.Duration = time.Since(start)
|
||||
if err != nil {
|
||||
result.Err = shortError(err)
|
||||
return result
|
||||
}
|
||||
_ = conn.Close()
|
||||
result.OK = true
|
||||
return result
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, vip.url, nil)
|
||||
if err != nil {
|
||||
result.Duration = time.Since(start)
|
||||
result.Err = shortError(err)
|
||||
return result
|
||||
}
|
||||
// Host header: user override, else derive from the VIP itself
|
||||
// (which the kernel already put in the URL, so leaving req.Host
|
||||
// empty means "use the URL authority"). We only touch req.Host
|
||||
// when the operator explicitly passed --host.
|
||||
if opts.Host != "" {
|
||||
req.Host = opts.Host
|
||||
}
|
||||
|
||||
resp, err := vip.client.Do(req)
|
||||
result.Duration = time.Since(start)
|
||||
if err != nil {
|
||||
result.Err = shortError(err)
|
||||
return result
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
result.Code = resp.StatusCode
|
||||
result.OK = resp.StatusCode >= 200 && resp.StatusCode < 400
|
||||
if opts.Header != "" {
|
||||
result.Header = resp.Header.Get(opts.Header)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// shortError collapses common Go net errors into a short string
|
||||
// suitable for a narrow table cell. url.Error wrappings, dial
|
||||
// contexts, and "i/o timeout" trailers all get trimmed so the LAST
|
||||
// column shows something legible like "refused" / "timeout" /
|
||||
// "no route" instead of a 120-char wrapped error.
|
||||
func shortError(err error) string {
|
||||
if err == nil {
|
||||
return ""
|
||||
}
|
||||
msg := err.Error()
|
||||
// net/url-wrapped errors: keep only the last segment, which
|
||||
// holds the actual cause.
|
||||
if i := strings.LastIndex(msg, ": "); i >= 0 {
|
||||
msg = msg[i+2:]
|
||||
}
|
||||
// Normalise common kernel errnos and Go's wrappers.
|
||||
switch {
|
||||
case strings.Contains(msg, "connection refused"):
|
||||
return "refused"
|
||||
case strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "context deadline exceeded"):
|
||||
return "timeout"
|
||||
case strings.Contains(msg, "no route to host"):
|
||||
return "no-route"
|
||||
case strings.Contains(msg, "network is unreachable"):
|
||||
return "net-unrch"
|
||||
case strings.Contains(msg, "host is unreachable"):
|
||||
return "host-unrch"
|
||||
case strings.Contains(msg, "connection reset"):
|
||||
return "reset"
|
||||
case strings.Contains(msg, "EOF"):
|
||||
return "eof"
|
||||
case strings.Contains(strings.ToLower(msg), "tls"):
|
||||
return "tls-err"
|
||||
}
|
||||
// Last resort: truncate anything longer than the LAST column.
|
||||
if len(msg) > 8 {
|
||||
msg = msg[:8]
|
||||
}
|
||||
return msg
|
||||
}
|
||||
Reference in New Issue
Block a user