// Copyright (c) 2026, Pim van Pelt

package main

import (
	"context"
	"crypto/tls"
	"math/rand/v2"
	"net"
	"net/http"
	"strconv"
	"strings"
	"sync/atomic"
	"time"

	tea "github.com/charmbracelet/bubbletea"
)

// probeOpts bundles the runtime probe configuration coming from the
// command-line flags. Each probe goroutine gets a copy at startup;
// nothing in here mutates during the program's lifetime (pauses flow
// through the global `paused` atomic instead).
type probeOpts struct {
	Interval  time.Duration // base delay between probes; jittered ±10% per tick
	Timeout   time.Duration // per-probe deadline (dial + TLS + response headers)
	Host      string        // optional Host-header override for http/https probes
	Path      string        // URL path probed on http/https VIPs
	Header    string        // response header to extract into probeResultMsg.Header
	Insecure  bool          // skip TLS certificate verification (curl -k behaviour)
	KeepAlive bool          // allow connection reuse between probes
}

// vipInfo is the immutable per-VIP descriptor built from the maglev
// config at startup. Kept separate from vipState (which holds mutable
// stats) so the probe goroutine can read its target without racing
// the Update loop in model.go.
//
// Identity of a VIP in maglevt is the (scheme, ip, port) tuple, not
// the symbolic name from the config: when we union multiple maglev
// yaml files, the same name can describe different tuples across
// deployments and the tuple is the unambiguous key. The TUI shows
// scheme + ip:port and nothing else, so there's no need to carry a
// display name at all.
type vipInfo struct {
	idx    int          // index into Model.vips (stable for the process lifetime)
	scheme string       // "http", "https", or "tcp"
	ip     net.IP       // the VIP address
	port   uint16       // the VIP port (TCP)
	url    string       // assembled probe URL for http/https; empty for tcp
	client *http.Client // probe client for http/https; unused by tcp probes
}

// probeResultMsg is the tea.Msg sent from probe goroutines to the UI
// on every probe completion. Bubbletea delivers it into Model.Update
// on the tea dispatch goroutine, so the model can mutate its per-VIP
// state without locks.
type probeResultMsg struct {
	VIPIdx   int           // index into Model.vips identifying the probed VIP
	At       time.Time     // when the probe started
	Duration time.Duration // wall time until completion (or failure)
	OK       bool          // true when the probe succeeded
	Code     int           // HTTP status code (0 for tcp-only probes)
	Header   string        // extracted response header value, empty if absent
	Err      string        // empty when OK; populated with a short error string otherwise
}

// hostnameMsg is the tea.Msg the DNS resolver worker sends once it
// has a PTR record for a VIP. The UI uses it to populate
// vipState.hostname so the 'd' toggle has something to show. One
// message per VIP at most — lookup failures just drop silently
// and the VIP stays on its IP literal.
type hostnameMsg struct {
	VIPIdx   int    // index into Model.vips identifying the VIP
	Hostname string // PTR result with the trailing dot stripped
}

// paused is the global pause flag flipped by the spacebar binding in
// Model.Update. Using an atomic rather than a channel keeps the probe
// loop dead-simple — no extra select case, no risk of wedging a
// goroutine on a full buffered channel.
var paused atomic.Bool

// newHTTPClient builds an *http.Client with the transport configured
// the way maglevt wants its probes to behave. The two flags that
// actually matter here are:
//
//   - DisableKeepAlives = !opts.KeepAlive: off by default, because
//     failover testing relies on every probe opening a fresh TCP+TLS
//     connection. A single persistent connection would pin us to one
//     backend until the keep-alive timer expires, and the tally would
//     silently lie about which backend the load balancer is actually
//     steering new flows to. --keepalive/-k flips this back on for
//     users who explicitly want the pinned-session view.
//   - InsecureSkipVerify: on by default, matching `curl -k`. VIP
//     certificates almost never match the raw IP literal we're
//     targeting, so verification would fail on the first probe. Users
//     who want strict verification can pass --insecure=false.
//
// Redirects are refused (CheckRedirect returns ErrUseLastResponse) so
// a 302 from one backend doesn't silently advance to another VIP and
// pollute the tally.
func newHTTPClient(opts probeOpts) *http.Client { tr := &http.Transport{ TLSClientConfig: &tls.Config{ InsecureSkipVerify: opts.Insecure, //nolint:gosec // matching curl -k, see comment above }, DisableKeepAlives: !opts.KeepAlive, // We dial the VIP literal directly — no DNS involvement — // so the default DialContext is already what we want. } return &http.Client{ Transport: tr, Timeout: opts.Timeout, CheckRedirect: func(*http.Request, []*http.Request) error { return http.ErrUseLastResponse }, } } // runProbeLoop is the per-VIP probe worker. It waits an initial random // delay (so N VIPs don't phase-lock onto the same tick after startup), // then fires a probe every opts.Interval ± 10% jitter until ctx is // cancelled. Each completed probe posts a probeResultMsg into the // tea.Program via send. // // The loop honors the global `paused` flag by simply skipping the // probe call while paused — the ticker keeps ticking so a resume // picks up at the next natural tick boundary instead of fast- // forwarding through a burst of back-to-back probes. func runProbeLoop(ctx context.Context, vip *vipInfo, opts probeOpts, send func(tea.Msg)) { // Initial offset: uniformly in [0, interval) so N goroutines // started together spread out across one interval window rather // than all firing at t=0. select { case <-ctx.Done(): return case <-time.After(time.Duration(rand.Int64N(int64(opts.Interval)))): } for { sleepFor := jitter(opts.Interval) if !paused.Load() { result := doProbe(ctx, vip, opts) send(result) } select { case <-ctx.Done(): return case <-time.After(sleepFor): } } } // runDNSLookup does a single reverse-DNS (PTR) lookup for vip.ip // and, on success, delivers a hostnameMsg to the UI. Runs once // per VIP at startup and then exits — PTR records change rarely // enough that we don't bother re-querying them, and the operator // can always restart maglevt to pick up a new mapping. 
// // A 3-second timeout keeps a broken resolver from wedging the // worker for the full life of the program; on timeout we just // give up silently and the VIP displays its IP literal forever. // Trailing dots are stripped so the rendered hostname matches // what the operator typed in their zone file. func runDNSLookup(parent context.Context, vip *vipInfo, send func(tea.Msg)) { ctx, cancel := context.WithTimeout(parent, 3*time.Second) defer cancel() names, err := net.DefaultResolver.LookupAddr(ctx, vip.ip.String()) if err != nil || len(names) == 0 { return } send(hostnameMsg{ VIPIdx: vip.idx, Hostname: strings.TrimSuffix(names[0], "."), }) } // jitter scales d by a uniform factor in [0.9, 1.1) — the same ±10% // jitter the checker uses, same rationale: probes don't phase-lock on // a wall-clock tick across every VIP in the config. func jitter(d time.Duration) time.Duration { if d <= 0 { return d } return time.Duration(float64(d) * (0.9 + 0.2*rand.Float64())) } // doProbe issues one probe against vip and returns a filled-in // probeResultMsg. HTTP / HTTPS go through vip.client with a GET // request against opts.Path (default /.well-known/ipng/healthz), // a Host-header override, and header extraction. Non-HTTP VIPs do // a plain TCP connect — success if the three-way handshake // completes before opts.Timeout. // // GET rather than HEAD: a common health-check path (healthz, // status, /-/healthy) often returns 204 or 200 and is cheap to // serve, but some handlers don't wire HEAD and would 405 us back. // GET works against every reasonable implementation and the // rolling-window latency is unchanged (we time until headers, not // until the body completes — resp.Body.Close() discards the body // without reading it). The defer below still closes the body so // the transport can recycle the connection if --keepalive is on. 
func doProbe(parent context.Context, vip *vipInfo, opts probeOpts) probeResultMsg { start := time.Now() result := probeResultMsg{VIPIdx: vip.idx, At: start} ctx, cancel := context.WithTimeout(parent, opts.Timeout) defer cancel() if vip.scheme == "tcp" { d := net.Dialer{Timeout: opts.Timeout} conn, err := d.DialContext(ctx, "tcp", net.JoinHostPort(vip.ip.String(), strconv.Itoa(int(vip.port)))) result.Duration = time.Since(start) if err != nil { result.Err = shortError(err) return result } _ = conn.Close() result.OK = true return result } req, err := http.NewRequestWithContext(ctx, http.MethodGet, vip.url, nil) if err != nil { result.Duration = time.Since(start) result.Err = shortError(err) return result } // Host header: user override, else derive from the VIP itself // (which the kernel already put in the URL, so leaving req.Host // empty means "use the URL authority"). We only touch req.Host // when the operator explicitly passed --host. if opts.Host != "" { req.Host = opts.Host } resp, err := vip.client.Do(req) result.Duration = time.Since(start) if err != nil { result.Err = shortError(err) return result } defer func() { _ = resp.Body.Close() }() result.Code = resp.StatusCode result.OK = resp.StatusCode >= 200 && resp.StatusCode < 400 if opts.Header != "" { result.Header = resp.Header.Get(opts.Header) } return result } // shortError collapses common Go net errors into a short string // suitable for a narrow table cell. url.Error wrappings, dial // contexts, and "i/o timeout" trailers all get trimmed so the LAST // column shows something legible like "refused" / "timeout" / // "no route" instead of a 120-char wrapped error. func shortError(err error) string { if err == nil { return "" } msg := err.Error() // net/url-wrapped errors: keep only the last segment, which // holds the actual cause. if i := strings.LastIndex(msg, ": "); i >= 0 { msg = msg[i+2:] } // Normalise common kernel errnos and Go's wrappers. 
switch { case strings.Contains(msg, "connection refused"): return "refused" case strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "context deadline exceeded"): return "timeout" case strings.Contains(msg, "no route to host"): return "no-route" case strings.Contains(msg, "network is unreachable"): return "net-unrch" case strings.Contains(msg, "host is unreachable"): return "host-unrch" case strings.Contains(msg, "connection reset"): return "reset" case strings.Contains(msg, "EOF"): return "eof" case strings.Contains(strings.ToLower(msg), "tls"): return "tls-err" } // Last resort: truncate anything longer than the LAST column. if len(msg) > 8 { msg = msg[:8] } return msg }