// SPDX-License-Identifier: Apache-2.0
|
|
|
|
package main
|
|
|
|
import (
	"context"
	"crypto/tls"
	"math/rand/v2"
	"net"
	"net/http"
	"strconv"
	"strings"
	"sync/atomic"
	"time"
	"unicode/utf8"

	tea "github.com/charmbracelet/bubbletea"
)
|
|
|
|
// probeOpts bundles the runtime probe configuration coming from the
// command-line flags. Each probe goroutine gets a copy at startup;
// nothing in here mutates during the program's lifetime (pauses flow
// through the global `paused` atomic instead).
type probeOpts struct {
	Interval  time.Duration // target period between probes; jittered ±10% by runProbeLoop
	Timeout   time.Duration // per-probe deadline: HTTP client timeout and TCP dial timeout
	Host      string        // optional Host-header override for HTTP probes; empty = use URL authority
	Path      string        // probe path; not read in this file — presumably baked into vipInfo.url at startup, TODO confirm
	Header    string        // response header name to copy into probeResultMsg.Header; empty disables extraction
	Insecure  bool          // skip TLS certificate verification (curl -k semantics; see newHTTPClient)
	KeepAlive bool          // allow HTTP keep-alives; off by default so every probe dials a fresh connection
}
|
|
|
|
// vipInfo is the immutable per-VIP descriptor built from the maglev
// config at startup. Kept separate from vipState (which holds mutable
// stats) so the probe goroutine can read its target without racing
// the Update loop in model.go.
//
// Identity of a VIP in maglevt is the (scheme, ip, port) tuple, not
// the symbolic name from the config: when we union multiple maglev
// yaml files, the same name can describe different tuples across
// deployments and the tuple is the unambiguous key. The TUI shows
// scheme + ip:port and nothing else, so there's no need to carry a
// display name at all.
type vipInfo struct {
	idx    int          // index into Model.vips (stable for the process lifetime)
	scheme string       // "http", "https", or "tcp"
	ip     net.IP       // the VIP address
	port   uint16       // the VIP port (TCP)
	url    string       // assembled probe URL for http/https; empty for tcp
	client *http.Client // HTTP client doProbe uses for http/https probes; unused on the tcp path
}
|
|
|
|
// probeResultMsg is the tea.Msg sent from probe goroutines to the UI
// on every probe completion. Bubbletea delivers it into Model.Update
// on the tea dispatch goroutine, so the model can mutate its per-VIP
// state without locks.
type probeResultMsg struct {
	VIPIdx   int           // index of the probed VIP (vipInfo.idx)
	At       time.Time     // wall-clock time the probe started
	Duration time.Duration // elapsed from probe start to completion, success or failure
	OK       bool          // true on TCP connect success, or HTTP status in [200, 400)
	Code     int           // HTTP status code (0 for tcp-only probes)
	Header   string        // extracted response header value, empty if absent
	Err      string        // empty when OK; populated with a short error string otherwise
}
|
|
|
|
// hostnameMsg is the tea.Msg the DNS resolver worker sends once it
// has a PTR record for a VIP. The UI uses it to populate
// vipState.hostname so the 'd' toggle has something to show. One
// message per VIP at most — lookup failures just drop silently
// and the VIP stays on its IP literal.
type hostnameMsg struct {
	VIPIdx   int    // index of the resolved VIP (vipInfo.idx)
	Hostname string // first PTR answer with the trailing dot stripped
}
|
|
|
|
// paused is the global pause flag flipped by the spacebar binding in
// Model.Update. Using an atomic rather than a channel keeps the probe
// loop dead-simple — no extra select case, no risk of wedging a
// goroutine on a full buffered channel.
//
// Read by every probe worker at the top of its loop (runProbeLoop
// skips the probe call while the flag is set but keeps ticking).
var paused atomic.Bool
|
|
|
|
// newHTTPClient builds an *http.Client with the transport configured
|
|
// the way maglevt wants its probes to behave. The two flags that
|
|
// actually matter here are:
|
|
//
|
|
// - DisableKeepAlives = !opts.KeepAlive: off by default, because
|
|
// failover testing relies on every probe opening a fresh TCP+TLS
|
|
// connection. A single persistent connection would pin us to one
|
|
// backend until the keep-alive timer expires, and the tally would
|
|
// silently lie about which backend the load balancer is actually
|
|
// steering new flows to. --keepalive/-k flips this back on for
|
|
// users who explicitly want the pinned-session view.
|
|
// - InsecureSkipVerify: on by default, matching `curl -k`. VIP
|
|
// certificates almost never match the raw IP literal we're
|
|
// targeting, so verification would fail on the first probe. Users
|
|
// who want strict verification can pass --insecure=false.
|
|
//
|
|
// Redirects are refused (CheckRedirect returns ErrUseLastResponse) so
|
|
// a 302 from one backend doesn't silently advance to another VIP and
|
|
// pollute the tally.
|
|
func newHTTPClient(opts probeOpts) *http.Client {
|
|
tr := &http.Transport{
|
|
TLSClientConfig: &tls.Config{
|
|
InsecureSkipVerify: opts.Insecure, //nolint:gosec // matching curl -k, see comment above
|
|
},
|
|
DisableKeepAlives: !opts.KeepAlive,
|
|
// We dial the VIP literal directly — no DNS involvement —
|
|
// so the default DialContext is already what we want.
|
|
}
|
|
return &http.Client{
|
|
Transport: tr,
|
|
Timeout: opts.Timeout,
|
|
CheckRedirect: func(*http.Request, []*http.Request) error {
|
|
return http.ErrUseLastResponse
|
|
},
|
|
}
|
|
}
|
|
|
|
// runProbeLoop is the per-VIP probe worker. It waits an initial random
|
|
// delay (so N VIPs don't phase-lock onto the same tick after startup),
|
|
// then fires a probe every opts.Interval ± 10% jitter until ctx is
|
|
// cancelled. Each completed probe posts a probeResultMsg into the
|
|
// tea.Program via send.
|
|
//
|
|
// The probe duration is subtracted from the sleep so the loop's
|
|
// *period* is the interval, not interval + probe time. If a 100ms
|
|
// interval probe takes 30ms, the next sleep is 70ms (minus jitter),
|
|
// not 100ms — otherwise a VIP with 30ms probes would actually fire
|
|
// every 130ms and the --interval flag would quietly lie. On the
|
|
// pathological case where a probe overruns the interval (slow VIP
|
|
// under load) we clamp the sleep to zero and fire the next probe
|
|
// immediately, but we never try to "catch up" by firing multiple
|
|
// back-to-back probes — that would flood an already-struggling
|
|
// backend right when it's slow.
|
|
//
|
|
// The loop honors the global `paused` flag by simply skipping the
|
|
// probe call while paused — the ticker keeps ticking so a resume
|
|
// picks up at the next natural tick boundary instead of fast-
|
|
// forwarding through a burst of back-to-back probes.
|
|
func runProbeLoop(ctx context.Context, vip *vipInfo, opts probeOpts, send func(tea.Msg)) {
|
|
// Initial offset: uniformly in [0, interval) so N goroutines
|
|
// started together spread out across one interval window rather
|
|
// than all firing at t=0.
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-time.After(time.Duration(rand.Int64N(int64(opts.Interval)))):
|
|
}
|
|
for {
|
|
sleepFor := jitter(opts.Interval)
|
|
if !paused.Load() {
|
|
result := doProbe(ctx, vip, opts)
|
|
send(result)
|
|
sleepFor -= result.Duration
|
|
if sleepFor < 0 {
|
|
sleepFor = 0
|
|
}
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-time.After(sleepFor):
|
|
}
|
|
}
|
|
}
|
|
|
|
// runDNSLookup does a single reverse-DNS (PTR) lookup for vip.ip
|
|
// and, on success, delivers a hostnameMsg to the UI. Runs once
|
|
// per VIP at startup and then exits — PTR records change rarely
|
|
// enough that we don't bother re-querying them, and the operator
|
|
// can always restart maglevt to pick up a new mapping.
|
|
//
|
|
// A 3-second timeout keeps a broken resolver from wedging the
|
|
// worker for the full life of the program; on timeout we just
|
|
// give up silently and the VIP displays its IP literal forever.
|
|
// Trailing dots are stripped so the rendered hostname matches
|
|
// what the operator typed in their zone file.
|
|
func runDNSLookup(parent context.Context, vip *vipInfo, send func(tea.Msg)) {
|
|
ctx, cancel := context.WithTimeout(parent, 3*time.Second)
|
|
defer cancel()
|
|
names, err := net.DefaultResolver.LookupAddr(ctx, vip.ip.String())
|
|
if err != nil || len(names) == 0 {
|
|
return
|
|
}
|
|
send(hostnameMsg{
|
|
VIPIdx: vip.idx,
|
|
Hostname: strings.TrimSuffix(names[0], "."),
|
|
})
|
|
}
|
|
|
|
// jitter scales d by a uniform factor in [0.9, 1.1) — the same ±10%
|
|
// jitter the checker uses, same rationale: probes don't phase-lock on
|
|
// a wall-clock tick across every VIP in the config.
|
|
func jitter(d time.Duration) time.Duration {
|
|
if d <= 0 {
|
|
return d
|
|
}
|
|
return time.Duration(float64(d) * (0.9 + 0.2*rand.Float64()))
|
|
}
|
|
|
|
// doProbe issues one probe against vip and returns a filled-in
|
|
// probeResultMsg. HTTP / HTTPS go through vip.client with a GET
|
|
// request against opts.Path (default /.well-known/ipng/healthz),
|
|
// a Host-header override, and header extraction. Non-HTTP VIPs do
|
|
// a plain TCP connect — success if the three-way handshake
|
|
// completes before opts.Timeout.
|
|
//
|
|
// GET rather than HEAD: a common health-check path (healthz,
|
|
// status, /-/healthy) often returns 204 or 200 and is cheap to
|
|
// serve, but some handlers don't wire HEAD and would 405 us back.
|
|
// GET works against every reasonable implementation and the
|
|
// rolling-window latency is unchanged (we time until headers, not
|
|
// until the body completes — resp.Body.Close() discards the body
|
|
// without reading it). The defer below still closes the body so
|
|
// the transport can recycle the connection if --keepalive is on.
|
|
func doProbe(parent context.Context, vip *vipInfo, opts probeOpts) probeResultMsg {
|
|
start := time.Now()
|
|
result := probeResultMsg{VIPIdx: vip.idx, At: start}
|
|
|
|
ctx, cancel := context.WithTimeout(parent, opts.Timeout)
|
|
defer cancel()
|
|
|
|
if vip.scheme == "tcp" {
|
|
d := net.Dialer{Timeout: opts.Timeout}
|
|
conn, err := d.DialContext(ctx, "tcp",
|
|
net.JoinHostPort(vip.ip.String(), strconv.Itoa(int(vip.port))))
|
|
result.Duration = time.Since(start)
|
|
if err != nil {
|
|
result.Err = shortError(err)
|
|
return result
|
|
}
|
|
_ = conn.Close()
|
|
result.OK = true
|
|
return result
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, vip.url, nil)
|
|
if err != nil {
|
|
result.Duration = time.Since(start)
|
|
result.Err = shortError(err)
|
|
return result
|
|
}
|
|
// Host header: user override, else derive from the VIP itself
|
|
// (which the kernel already put in the URL, so leaving req.Host
|
|
// empty means "use the URL authority"). We only touch req.Host
|
|
// when the operator explicitly passed --host.
|
|
if opts.Host != "" {
|
|
req.Host = opts.Host
|
|
}
|
|
|
|
resp, err := vip.client.Do(req)
|
|
result.Duration = time.Since(start)
|
|
if err != nil {
|
|
result.Err = shortError(err)
|
|
return result
|
|
}
|
|
defer func() { _ = resp.Body.Close() }()
|
|
|
|
result.Code = resp.StatusCode
|
|
result.OK = resp.StatusCode >= 200 && resp.StatusCode < 400
|
|
if opts.Header != "" {
|
|
result.Header = resp.Header.Get(opts.Header)
|
|
}
|
|
return result
|
|
}
|
|
|
|
// shortError collapses common Go net errors into a short string
// suitable for a narrow table cell. url.Error wrappings, dial
// contexts, and "i/o timeout" trailers all get trimmed so the LAST
// column shows something legible like "refused" / "timeout" /
// "no route" instead of a 120-char wrapped error.
//
// Returns "" for a nil error. Unrecognized errors are truncated to
// at most 8 bytes, backing off to a UTF-8 rune boundary so the cell
// never renders an invalid partial rune.
func shortError(err error) string {
	if err == nil {
		return ""
	}
	msg := err.Error()
	// net/url-wrapped errors: keep only the last ": "-separated
	// segment, which holds the actual cause.
	if i := strings.LastIndex(msg, ": "); i >= 0 {
		msg = msg[i+2:]
	}
	// Normalise common kernel errnos and Go's wrappers.
	switch {
	case strings.Contains(msg, "connection refused"):
		return "refused"
	case strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "context deadline exceeded"):
		return "timeout"
	case strings.Contains(msg, "no route to host"):
		return "no-route"
	case strings.Contains(msg, "network is unreachable"):
		return "net-unrch"
	case strings.Contains(msg, "host is unreachable"):
		return "host-unrch"
	case strings.Contains(msg, "connection reset"):
		return "reset"
	case strings.Contains(msg, "EOF"):
		return "eof"
	case strings.Contains(strings.ToLower(msg), "tls"):
		return "tls-err"
	}
	// Last resort: truncate anything longer than the LAST column.
	// Back the cut off to a rune boundary — msg[:8] alone could
	// split a multi-byte UTF-8 sequence and emit an invalid rune.
	if len(msg) > 8 {
		cut := 8
		for cut > 0 && !utf8.RuneStart(msg[cut]) {
			cut--
		}
		msg = msg[:cut]
	}
	return msg
}
|