SPA (cmd/frontend/web):
- New "lb buckets" column backed by a 1s-debounced GetVPPLBState fetch loop with leading+trailing edge coalesce.
- Per-frontend health icon (✅/⚠️/❗/‼️/❓) in the Zippy header, gated by a settling flag that suppresses ‼️ until the next lb-state reconciliation after a backend transition or weight change.
- In-place leaf merge on lb-state so stable bucket values (e.g. "0") don't retrigger the Flash animation on every refresh.
- Zippy cards remember open state in a cookie, default closed on fresh load; fixed-width frontend-title-name + reserved icon slot so headers line up across all cards.
- Clock-drift watchdog in sse.ts that forces a fresh EventSource on laptop-wake so the broker emits a resync instead of hanging on a dead half-open socket.

Frontend service (cmd/frontend):
- maglevClient.lbStateLoop, triggered on backend transitions + vpp-connect, best-effort fetch on refreshAll.
- Admin handlers explicitly wake the lb-state loop after lifecycle ops and set-weight (the latter emits no transition event on the maglevd side, so the WatchEvents path wouldn't have caught it).
- /favicon.ico served from the embedded web/public IPng logo.

VPP integration:
- internal/vpp/lbstate.go: dumpASesForVIP drops Pfx from the dump request (setting it silently wipes IPv4 replies in the LB plugin) and filters results by prefix on the response side instead, which also demuxes multi-VIP-on-same-port cases correctly.

maglevc:
- Walk now returns the unconsumed token tail; dispatch and the question listener reject unknown commands with a targeted error instead of dumping the full command tree prefixed with garbage.
- On '?', echo the current line (including the '?') before the help list so the output reads like birdc.

Checker / prober:
- internal/checker: ±10% jitter on NextInterval so probes across a restart don't all fire on the same tick.
- internal/prober: HTTP User-Agent now carries the build version and project URL.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>

package checker

import (
	"context"
	"fmt"
	"log/slog"
	"math/rand/v2"
	"net"
	"sort"
	"sync"
	"time"

	"git.ipng.ch/ipng/vpp-maglev/internal/config"
	"git.ipng.ch/ipng/vpp-maglev/internal/health"
	"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
	"git.ipng.ch/ipng/vpp-maglev/internal/prober"
)

// BackendSnapshot combines the live health state with the config entry for a backend.
type BackendSnapshot struct {
	Health *health.Backend
	Config config.Backend
}

// Event is emitted on every state transition the checker observes. There are
// two kinds, distinguished by which of BackendName or FrontendTransition is
// populated:
//
//   - Backend transition: FrontendName is the frontend that references the
//     backend (one event per frontend per backend transition), BackendName
//     and Backend are set, and Transition carries the health.Transition.
//     FrontendTransition is nil.
//   - Frontend transition: FrontendName is the frontend whose aggregate state
//     changed, FrontendTransition is non-nil. BackendName and Backend are
//     empty, Transition is the zero value.
//
// Consumers dispatch on FrontendTransition != nil.
type Event struct {
	FrontendName string
	BackendName  string
	Backend      net.IP
	Transition   health.Transition

	FrontendTransition *health.FrontendTransition
}
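
// A minimal dispatch sketch for consumers (events is a channel obtained from
// Subscribe below; handleFrontend and handleBackend are hypothetical helpers,
// not part of this package):
//
//	for ev := range events {
//		if ev.FrontendTransition != nil {
//			handleFrontend(ev.FrontendName, *ev.FrontendTransition)
//		} else {
//			handleBackend(ev.FrontendName, ev.BackendName, ev.Transition)
//		}
//	}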

type worker struct {
	backend *health.Backend
	hc      config.HealthCheck
	entry   config.Backend
	cancel  context.CancelFunc
	wakeCh  chan struct{} // closed/signalled to interrupt probe sleep on resume
}

// Checker orchestrates health probing for all backends.
// Each backend is probed exactly once, regardless of how many frontends
// reference it.
type Checker struct {
	runCtx  context.Context // set in Run; used by EnableBackend to start new goroutines
	cfg     *config.Config
	mu      sync.RWMutex
	workers map[string]*worker // keyed by backend name

	// frontendStates tracks the aggregated state of every configured frontend
	// (unknown/up/down). Updated whenever a backend transition happens; a
	// change emits a frontend-transition Event. The zero value for a missing
	// key is FrontendStateUnknown, so initial-reference accesses behave
	// correctly even without explicit seeding.
	frontendStates map[string]health.FrontendState

	subsMu  sync.Mutex
	nextID  int
	subs    map[int]chan Event
	eventCh chan Event
}

// New creates a Checker. Call Run to start probing.
func New(cfg *config.Config) *Checker {
	return &Checker{
		cfg:            cfg,
		workers:        make(map[string]*worker),
		frontendStates: make(map[string]health.FrontendState),
		subs:           make(map[int]chan Event),
		eventCh:        make(chan Event, 256),
	}
}

// Run starts all probe goroutines and blocks until ctx is cancelled.
func (c *Checker) Run(ctx context.Context) error {
	go c.fanOut(ctx)

	c.mu.Lock()
	c.runCtx = ctx // safe: held under mu before any EnableBackend call can read it
	names := activeBackendNames(c.cfg)
	maxHistory := c.cfg.HealthChecker.TransitionHistory
	for i, name := range names {
		b := c.cfg.Backends[name]
		hc := c.cfg.HealthChecks[b.HealthCheck]
		c.startWorker(ctx, name, b, hc, i, len(names), maxHistory)
	}
	c.mu.Unlock()

	<-ctx.Done()
	return nil
}
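
// Minimal wiring sketch (illustrative; cfg is a loaded *config.Config and
// ctx governs shutdown):
//
//	c := checker.New(cfg)
//	go func() { _ = c.Run(ctx) }()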

// Reload applies a new config without restarting the process.
// New backends are added, removed backends are stopped, changed backends are
// restarted. Backends whose healthcheck config is unchanged continue
// uninterrupted, even if the set of frontends referencing them changes.
func (c *Checker) Reload(ctx context.Context, cfg *config.Config) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	maxHistory := cfg.HealthChecker.TransitionHistory

	desired := map[string]struct{}{}
	for _, name := range activeBackendNames(cfg) {
		desired[name] = struct{}{}
	}

	// Stop workers no longer needed; emit a removed event using the old frontends.
	for name, w := range c.workers {
		if _, ok := desired[name]; !ok {
			slog.Info("backend-stop", "backend", name)
			t := w.backend.Remove(maxHistory)
			c.emitForBackend(name, w.backend.Address, t, c.cfg.Frontends)
			w.cancel()
			delete(c.workers, name)
		}
	}

	// Add new or restart changed workers; emit an unknown event using the new frontends.
	names := activeBackendNames(cfg)
	for i, name := range names {
		b := cfg.Backends[name]
		hc := cfg.HealthChecks[b.HealthCheck]
		if w, ok := c.workers[name]; ok {
			if healthCheckEqual(w.hc, hc) {
				// Update entry metadata (address, healthcheck name)
				// in place without restart. Preserve the runtime
				// Enabled flag — the operator's
				// PauseBackend/DisableBackend/EnableBackend state
				// must outlive config reloads, so an operator who
				// disabled a backend and then reloaded config
				// (e.g. to adjust weights on an unrelated
				// frontend) doesn't find their disabled backend
				// silently re-enabled while its worker state
				// remains stuck at StateDisabled. The YAML's
				// Enabled field is still authoritative on the
				// worker-restart path below (where the backend
				// is structurally new to this worker instance).
				runtimeEnabled := w.entry.Enabled
				w.entry = b
				w.entry.Enabled = runtimeEnabled
				continue
			}
			slog.Info("backend-restart", "backend", name)
			w.cancel()
			c.startWorker(ctx, name, b, hc, i, len(names), maxHistory)
		} else {
			slog.Info("backend-start", "backend", name)
			c.startWorker(ctx, name, b, hc, i, len(names), maxHistory)
		}
		c.emitForBackend(name, c.workers[name].backend.Address, c.workers[name].backend.Transitions[0], cfg.Frontends)
	}

	// Drop frontendStates entries for frontends no longer in config.
	for feName := range c.frontendStates {
		if _, ok := cfg.Frontends[feName]; !ok {
			delete(c.frontendStates, feName)
		}
	}

	c.cfg = cfg
	return nil
}

// Subscribe returns a channel that receives Events for every state transition.
// Call the returned cancel function to unsubscribe.
func (c *Checker) Subscribe() (<-chan Event, func()) {
	c.subsMu.Lock()
	defer c.subsMu.Unlock()
	id := c.nextID
	c.nextID++
	ch := make(chan Event, 64)
	c.subs[id] = ch
	return ch, func() {
		c.subsMu.Lock()
		defer c.subsMu.Unlock()
		delete(c.subs, id)
		close(ch)
	}
}
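
// Typical subscriber loop (a sketch; cancel closes the channel, which in
// turn ends the range):
//
//	events, cancel := c.Subscribe()
//	defer cancel()
//	for ev := range events {
//		// ... dispatch on ev.FrontendTransition != nil ...
//	}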

// Config returns the live config pointer held by the checker. Callers must
// treat the returned value as read-only. The pointer is swapped on Reload,
// so callers that cache it across reloads may see stale data.
func (c *Checker) Config() *config.Config {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.cfg
}

// BackendState returns the current health state of a backend. Returns
// (StateUnknown, false) when the backend has no worker. Satisfies
// vpp.StateSource.
func (c *Checker) BackendState(name string) (health.State, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	w, ok := c.workers[name]
	if !ok {
		return health.StateUnknown, false
	}
	return w.backend.State, true
}

// FrontendState returns the current aggregate state of a frontend (unknown,
// up, or down). Returns (FrontendStateUnknown, false) when the frontend is
// not known to the checker.
func (c *Checker) FrontendState(name string) (health.FrontendState, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if _, ok := c.cfg.Frontends[name]; !ok {
		return health.FrontendStateUnknown, false
	}
	return c.frontendStates[name], true
}

// ListFrontends returns the names of all configured frontends.
func (c *Checker) ListFrontends() []string {
	c.mu.RLock()
	defer c.mu.RUnlock()
	names := make([]string, 0, len(c.cfg.Frontends))
	for name := range c.cfg.Frontends {
		names = append(names, name)
	}
	sort.Strings(names)
	return names
}

// GetFrontend returns the frontend config for the given name.
func (c *Checker) GetFrontend(name string) (config.Frontend, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	v, ok := c.cfg.Frontends[name]
	return v, ok
}

// SetFrontendPoolBackendWeight updates the weight of a backend within a named
// pool of a frontend. Returns the updated frontend config, or a descriptive
// error if the frontend, pool, or backend is not found or the weight is out
// of range.
//
// After mutating the weight, updateFrontendState is re-run for the affected
// frontend so the aggregate state reflects the new effective weights. A
// weight change can flip a frontend between UP and DOWN (e.g. zeroing the
// last non-zero-weighted backend in the active pool), and without this
// call the checker's cached frontend state would drift from reality until
// the next genuine backend transition happens to trigger a recompute.
func (c *Checker) SetFrontendPoolBackendWeight(frontendName, poolName, backendName string, weight int) (config.Frontend, error) {
	if weight < 0 || weight > 100 {
		return config.Frontend{}, fmt.Errorf("weight %d out of range [0, 100]", weight)
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	fe, ok := c.cfg.Frontends[frontendName]
	if !ok {
		return config.Frontend{}, fmt.Errorf("frontend %q not found", frontendName)
	}
	for i, pool := range fe.Pools {
		if pool.Name != poolName {
			continue
		}
		pb, ok := pool.Backends[backendName]
		if !ok {
			return config.Frontend{}, fmt.Errorf("backend %q not found in pool %q", backendName, poolName)
		}
		pb.Weight = weight
		fe.Pools[i].Backends[backendName] = pb
		c.cfg.Frontends[frontendName] = fe
		slog.Info("frontend-pool-weight", "frontend", frontendName, "pool", poolName, "backend", backendName, "weight", weight)
		c.updateFrontendState(frontendName, fe)
		return fe, nil
	}
	return config.Frontend{}, fmt.Errorf("pool %q not found in frontend %q", poolName, frontendName)
}

// ListHealthChecks returns the names of all configured health checks, sorted.
func (c *Checker) ListHealthChecks() []string {
	c.mu.RLock()
	defer c.mu.RUnlock()
	names := make([]string, 0, len(c.cfg.HealthChecks))
	for name := range c.cfg.HealthChecks {
		names = append(names, name)
	}
	sort.Strings(names)
	return names
}

// GetHealthCheck returns the config for a health check by name.
func (c *Checker) GetHealthCheck(name string) (config.HealthCheck, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	hc, ok := c.cfg.HealthChecks[name]
	return hc, ok
}

// ListBackends returns the names of all active backends.
func (c *Checker) ListBackends() []string {
	c.mu.RLock()
	defer c.mu.RUnlock()
	names := make([]string, 0, len(c.workers))
	for name := range c.workers {
		names = append(names, name)
	}
	sort.Strings(names)
	return names
}

// ListFrontendBackends returns the backend health states for all backends of a frontend.
func (c *Checker) ListFrontendBackends(frontendName string) []*health.Backend {
	c.mu.RLock()
	defer c.mu.RUnlock()
	fe, ok := c.cfg.Frontends[frontendName]
	if !ok {
		return nil
	}
	var out []*health.Backend
	seen := map[string]struct{}{}
	for _, pool := range fe.Pools {
		for name := range pool.Backends {
			if _, already := seen[name]; already {
				continue
			}
			seen[name] = struct{}{}
			if w, ok := c.workers[name]; ok {
				out = append(out, w.backend)
			}
		}
	}
	return out
}

// GetBackend returns a snapshot of the health state and config for a backend by name.
func (c *Checker) GetBackend(name string) (BackendSnapshot, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	w, ok := c.workers[name]
	if !ok {
		return BackendSnapshot{}, false
	}
	return BackendSnapshot{Health: w.backend, Config: w.entry}, true
}

// GetBackendInfo returns the health state and key config fields for a backend.
// Satisfies metrics.StateSource.
func (c *Checker) GetBackendInfo(name string) (metrics.BackendInfo, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	w, ok := c.workers[name]
	if !ok {
		return metrics.BackendInfo{}, false
	}
	return metrics.BackendInfo{
		Health:  w.backend,
		Enabled: w.entry.Enabled,
		HCName:  w.entry.HealthCheck,
	}, true
}

// PauseBackend pauses health checking for a backend by name. The probe
// goroutine is cancelled so no further traffic is sent to the backend. The
// backend's state is set to paused and remains frozen until ResumeBackend is
// called (which starts a fresh probe goroutine).
// Returns an error if the backend is not found or is disabled.
func (c *Checker) PauseBackend(name string) (BackendSnapshot, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	w, ok := c.workers[name]
	if !ok {
		return BackendSnapshot{}, fmt.Errorf("backend %q not found", name)
	}
	if !w.entry.Enabled {
		return BackendSnapshot{}, fmt.Errorf("backend %q is disabled; enable it first", name)
	}
	maxHistory := c.cfg.HealthChecker.TransitionHistory
	if w.backend.Pause(maxHistory) {
		t := w.backend.Transitions[0]
		slog.Info("backend-transition", "backend", name,
			"from", t.From.String(),
			"to", t.To.String(),
		)
		c.emitForBackend(name, w.backend.Address, t, c.cfg.Frontends)
	}
	w.cancel()
	return BackendSnapshot{Health: w.backend, Config: w.entry}, nil
}

// ResumeBackend resumes health checking for a backend by name. A fresh probe
// goroutine is started and the backend re-enters StateUnknown. The existing
// transition history is preserved.
// Returns an error if the backend is not found or is disabled.
func (c *Checker) ResumeBackend(name string) (BackendSnapshot, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	w, ok := c.workers[name]
	if !ok {
		return BackendSnapshot{}, fmt.Errorf("backend %q not found", name)
	}
	if !w.entry.Enabled {
		return BackendSnapshot{}, fmt.Errorf("backend %q is disabled; enable it first", name)
	}
	maxHistory := c.cfg.HealthChecker.TransitionHistory
	if w.backend.Resume(maxHistory) {
		t := w.backend.Transitions[0]
		slog.Info("backend-transition", "backend", name,
			"from", t.From.String(),
			"to", t.To.String(),
		)
		c.emitForBackend(name, w.backend.Address, t, c.cfg.Frontends)
	}
	// Launch a fresh probe goroutine with a new cancellable context,
	// keeping the existing worker and its transition history.
	wCtx, cancel := context.WithCancel(c.runCtx)
	w.cancel = cancel
	w.wakeCh = make(chan struct{}, 1)
	go c.runProbe(wCtx, name, 0, 1)
	return BackendSnapshot{Health: w.backend, Config: w.entry}, nil
}

// DisableBackend stops health checking for a backend and removes it from active
// rotation. The worker entry is kept in the map so the backend remains visible
// via GetBackend and can be re-enabled with EnableBackend.
//
// Preconditions are keyed on w.backend.State rather than w.entry.Enabled so
// that any drift between the two fields (e.g. a past Reload that reset the
// flag without transitioning state) is self-healing: if the state is not
// already disabled we always do the full transition and bring the flag in
// line, and if the state is already disabled we fix up the flag without a
// no-op transition.
func (c *Checker) DisableBackend(name string) (BackendSnapshot, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()
	w, ok := c.workers[name]
	if !ok {
		return BackendSnapshot{}, false
	}
	if w.backend.State == health.StateDisabled {
		// Already disabled at the state level; make sure the flag
		// reflects reality without emitting a redundant transition.
		w.entry.Enabled = false
		if b, ok := c.cfg.Backends[name]; ok {
			b.Enabled = false
			c.cfg.Backends[name] = b
		}
		return BackendSnapshot{Health: w.backend, Config: w.entry}, true
	}
	maxHistory := c.cfg.HealthChecker.TransitionHistory
	t := w.backend.Disable(maxHistory)
	slog.Info("backend-transition", "backend", name,
		"from", t.From.String(),
		"to", t.To.String(),
	)
	c.emitForBackend(name, w.backend.Address, t, c.cfg.Frontends)
	w.cancel()
	w.entry.Enabled = false
	if b, ok := c.cfg.Backends[name]; ok {
		b.Enabled = false
		c.cfg.Backends[name] = b
	}
	return BackendSnapshot{Health: w.backend, Config: w.entry}, true
}

// EnableBackend re-enables a previously disabled backend. The existing
// Backend struct is reused — its transition history is preserved — and a
// fresh probe goroutine is launched. The backend re-enters StateUnknown.
//
// Preconditions are keyed on w.backend.State rather than w.entry.Enabled, so
// drift between the two (most commonly caused by a Reload that reset the
// flag while the worker state was still disabled) doesn't wedge the backend
// — we always do the full transition when the state is disabled, and skip
// it (while syncing the flag) when it's not.
func (c *Checker) EnableBackend(name string) (BackendSnapshot, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()
	w, ok := c.workers[name]
	if !ok {
		return BackendSnapshot{}, false
	}
	if w.backend.State != health.StateDisabled {
		// Not in the disabled state — just make the flag match.
		w.entry.Enabled = true
		if b, ok := c.cfg.Backends[name]; ok {
			b.Enabled = true
			c.cfg.Backends[name] = b
		}
		return BackendSnapshot{Health: w.backend, Config: w.entry}, true
	}
	w.entry.Enabled = true
	if b, ok := c.cfg.Backends[name]; ok {
		b.Enabled = true
		c.cfg.Backends[name] = b
	}
	maxHistory := c.cfg.HealthChecker.TransitionHistory
	t := w.backend.Enable(maxHistory)
	slog.Info("backend-transition", "backend", name,
		"from", t.From.String(),
		"to", t.To.String(),
	)
	c.emitForBackend(name, w.backend.Address, t, c.cfg.Frontends)

	// Launch a fresh probe goroutine with a new cancellable context,
	// keeping the existing worker and its transition history.
	wCtx, cancel := context.WithCancel(c.runCtx)
	w.cancel = cancel
	w.wakeCh = make(chan struct{}, 1)
	go c.runProbe(wCtx, name, 0, 1)
	return BackendSnapshot{Health: w.backend, Config: w.entry}, true
}

// ---- internal --------------------------------------------------------------

// startWorker creates a Backend and launches a probe goroutine.
// Must be called with c.mu held.
func (c *Checker) startWorker(ctx context.Context, name string, entry config.Backend, hc config.HealthCheck, pos, total, maxHistory int) {
	rise, fall := hc.Rise, hc.Fall
	if entry.HealthCheck == "" {
		// No healthcheck: one synthetic pass drives the backend to Up immediately.
		rise, fall = 1, 1
	}
	wCtx, cancel := context.WithCancel(ctx)
	w := &worker{
		backend: health.New(name, entry.Address, rise, fall),
		hc:      hc,
		entry:   entry,
		cancel:  cancel,
		wakeCh:  make(chan struct{}, 1),
	}
	w.backend.Start(maxHistory)
	c.workers[name] = w
	go c.runProbe(wCtx, name, pos, total)
}

// runProbe is the per-backend probe loop.
func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
	c.mu.RLock()
	w, ok := c.workers[name]
	if !ok {
		c.mu.RUnlock()
		return
	}
	initialDelay := staggerDelay(w.hc.Interval, pos, total)
	c.mu.RUnlock()

	if initialDelay > 0 {
		select {
		case <-ctx.Done():
			return
		case <-time.After(initialDelay):
		}
	}

	first := true
	for {
		c.mu.RLock()
		w, ok := c.workers[name]
		if !ok {
			c.mu.RUnlock()
			return
		}
		hc := w.hc
		entry := w.entry
		maxHistory := c.cfg.HealthChecker.TransitionHistory
		netns := c.cfg.HealthChecker.Netns
		wakeCh := w.wakeCh
		var sleepFor time.Duration
		if entry.HealthCheck == "" {
			// Static (no-healthcheck) backends: the first iteration fires
			// the synthetic pass immediately so the backend reaches "up"
			// without delay; subsequent iterations idle at 30s since there's
			// nothing to do anyway.
			if first {
				sleepFor = 0
			} else {
				sleepFor = 30 * time.Second
			}
		} else {
			sleepFor = jitterInterval(w.backend.NextInterval(hc.Interval, hc.FastInterval, hc.DownInterval))
		}
		c.mu.RUnlock()

		select {
		case <-ctx.Done():
			return
		case <-time.After(sleepFor):
		case <-wakeCh:
		}
		first = false

		var result health.ProbeResult
		if entry.HealthCheck == "" {
			// No healthcheck configured: synthesise a passing result so the
			// backend is assumed healthy without any network activity.
			result = health.ProbeResult{OK: true, Layer: health.LayerL7, Code: "L7OK"}
		} else {
			var probeSrc net.IP
			if entry.Address.To4() != nil {
				probeSrc = hc.ProbeIPv4Src
			} else {
				probeSrc = hc.ProbeIPv6Src
			}
			pcfg := prober.ProbeConfig{
				Target:           entry.Address,
				Port:             hc.Port,
				ProbeSrc:         probeSrc,
				HealthCheckNetns: netns,
				Timeout:          hc.Timeout,
				HTTP:             hc.HTTP,
				TCP:              hc.TCP,
			}
			probeCtx, cancel := context.WithTimeout(ctx, hc.Timeout)
			slog.Debug("probe-start", "backend", name, "type", hc.Type)
			start := time.Now()
			result = prober.ForType(hc.Type)(probeCtx, pcfg)
			elapsed := time.Since(start)
			cancel()
			slog.Debug("probe-done",
				"backend", name,
				"type", hc.Type,
				"ok", result.OK,
				"code", result.Code,
				"detail", result.Detail,
				"elapsed", elapsed.Round(time.Millisecond).String(),
			)
			res := "success"
			if !result.OK {
				res = "failure"
			}
			metrics.ProbeTotal.WithLabelValues(name, hc.Type, res, result.Code).Inc()
			metrics.ProbeDuration.WithLabelValues(name, hc.Type).Observe(elapsed.Seconds())
		}

		c.mu.Lock()
		w, exists := c.workers[name]
		if !exists {
			c.mu.Unlock()
			return
		}
		if w.backend.Record(result, maxHistory) {
			t := w.backend.Transitions[0]
			addr := w.backend.Address
			slog.Info("backend-transition",
				"backend", name,
				"from", t.From.String(),
				"to", t.To.String(),
				"code", result.Code,
				"detail", result.Detail,
			)
			metrics.TransitionTotal.WithLabelValues(name, t.From.String(), t.To.String()).Inc()
			c.emitForBackend(name, addr, t, c.cfg.Frontends)
		}
		c.mu.Unlock()
	}
}

// emitForBackend emits one backend-transition Event per frontend that
// references backendName (in any pool), using the provided frontends map.
// After emitting the backend event for a frontend, it also re-computes that
// frontend's aggregate state and emits a frontend-transition Event if the
// state has changed. Must be called with c.mu held.
func (c *Checker) emitForBackend(backendName string, addr net.IP, t health.Transition, frontends map[string]config.Frontend) {
	for feName, fe := range frontends {
		if !frontendReferencesBackend(fe, backendName) {
			continue
		}
		c.emit(Event{FrontendName: feName, BackendName: backendName, Backend: addr, Transition: t})
		c.updateFrontendState(feName, fe)
	}
}

// frontendReferencesBackend reports whether fe has the named backend in any
// of its pools.
func frontendReferencesBackend(fe config.Frontend, backendName string) bool {
	for _, pool := range fe.Pools {
		if _, ok := pool.Backends[backendName]; ok {
			return true
		}
	}
	return false
}

// updateFrontendState recomputes the aggregate state of fe, compares against
// the last known state, and emits a frontend-transition Event on change.
// Must be called with c.mu held. The current state is read from the worker
// map — so the caller (who already holds c.mu) sees a consistent view.
func (c *Checker) updateFrontendState(feName string, fe config.Frontend) {
	states := make(map[string]health.State)
	for _, pool := range fe.Pools {
		for bName := range pool.Backends {
			if w, ok := c.workers[bName]; ok {
				states[bName] = w.backend.State
			} else {
				states[bName] = health.StateUnknown
			}
		}
	}
	newState := health.ComputeFrontendState(fe, states)
	old := c.frontendStates[feName] // zero value (Unknown) on first access
	if old == newState {
		return
	}
	c.frontendStates[feName] = newState
	ft := health.FrontendTransition{From: old, To: newState, At: time.Now()}
	slog.Info("frontend-transition",
		"frontend", feName,
		"from", old.String(),
		"to", newState.String(),
	)
	c.emit(Event{FrontendName: feName, FrontendTransition: &ft})
}

// emit sends an event to the internal fan-out channel (non-blocking).
// Must be called with c.mu held.
func (c *Checker) emit(e Event) {
	select {
	case c.eventCh <- e:
	default:
		slog.Warn("event-drop", "frontend", e.FrontendName, "backend", e.BackendName)
	}
}

// fanOut reads from eventCh and distributes to all subscribers.
func (c *Checker) fanOut(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case e := <-c.eventCh:
			c.subsMu.Lock()
			for _, ch := range c.subs {
				select {
				case ch <- e:
				default:
					// Slow subscriber — drop rather than block.
				}
			}
			c.subsMu.Unlock()
		}
	}
}

// healthCheckEqual returns true if two HealthCheck configs are functionally identical.
func healthCheckEqual(a, b config.HealthCheck) bool {
	if a.Type != b.Type ||
		a.Interval != b.Interval ||
		a.FastInterval != b.FastInterval ||
		a.DownInterval != b.DownInterval ||
		a.Timeout != b.Timeout ||
		a.Rise != b.Rise ||
		a.Fall != b.Fall {
		return false
	}
	return httpParamsEqual(a.HTTP, b.HTTP) && tcpParamsEqual(a.TCP, b.TCP)
}

func httpParamsEqual(a, b *config.HTTPParams) bool {
	if a == nil && b == nil {
		return true
	}
	if a == nil || b == nil {
		return false
	}
	aRe, bRe := "", ""
	if a.ResponseRegexp != nil {
		aRe = a.ResponseRegexp.String()
	}
	if b.ResponseRegexp != nil {
		bRe = b.ResponseRegexp.String()
	}
	return a.Path == b.Path &&
		a.Host == b.Host &&
		a.ResponseCodeMin == b.ResponseCodeMin &&
		a.ResponseCodeMax == b.ResponseCodeMax &&
		aRe == bRe &&
		a.ServerName == b.ServerName &&
		a.InsecureSkipVerify == b.InsecureSkipVerify
}

func tcpParamsEqual(a, b *config.TCPParams) bool {
	if a == nil && b == nil {
		return true
	}
	if a == nil || b == nil {
		return false
	}
	return a.SSL == b.SSL &&
		a.ServerName == b.ServerName &&
		a.InsecureSkipVerify == b.InsecureSkipVerify
}

// activeBackendNames returns a sorted, deduplicated list of backend names that
// are referenced by at least one frontend pool and have Enabled: true.
func activeBackendNames(cfg *config.Config) []string {
	seen := map[string]struct{}{}
	for _, fe := range cfg.Frontends {
		for _, pool := range fe.Pools {
			for name := range pool.Backends {
				if b, ok := cfg.Backends[name]; ok && b.Enabled {
					seen[name] = struct{}{}
				}
			}
		}
	}
	names := make([]string, 0, len(seen))
	for name := range seen {
		names = append(names, name)
	}
	sort.Strings(names)
	return names
}

// staggerDelay computes the initial probe delay for position pos out of total.
func staggerDelay(interval time.Duration, pos, total int) time.Duration {
	if total <= 1 {
		return 0
	}
	return time.Duration(int64(interval) * int64(pos) / int64(total))
}
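
// For example, with a 10s interval and four backends the probe starts land
// at 0s, 2.5s, 5s and 7.5s, spreading the initial probes evenly across one
// interval instead of bursting them all at process start.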

// jitterInterval scales d by a uniformly-random factor in [0.9, 1.1) so that
// probe schedules across many backends drift apart instead of all firing on
// the same tick after process start (or after a config reload re-staggers them
// onto identical phases).
func jitterInterval(d time.Duration) time.Duration {
	if d <= 0 {
		return d
	}
	return time.Duration(float64(d) * (0.9 + 0.2*rand.Float64()))
}
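
// Worked example: a 10s base interval sleeps anywhere in [9s, 11s), so two
// backends that start on the same tick drift apart within a few cycles.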