// vpp-maglev/internal/vpp/warmup.go
// SPDX-License-Identifier: Apache-2.0
package vpp
import (
"context"
"log/slog"
"sync"
"time"
"git.ipng.ch/ipng/vpp-maglev/internal/config"
"git.ipng.ch/ipng/vpp-maglev/internal/health"
)
// warmupPollInterval is how often runWarmup re-checks per-VIP backend
// state during the [minDelay, maxDelay) per-VIP release phase. 250ms
// is fast enough that a VIP whose last backend just completed rise
// probes gets released within a quarter-second of settling, and slow
// enough that the polling cost is negligible compared to probe work
// the checker is doing on the same core at the same time.
const warmupPollInterval = 250 * time.Millisecond
// warmupTracker is the per-process gate for the VPP LB sync path
// during the first StartupMaxDelay seconds of maglevd's life. It
// exists to keep a maglevd restart dataplane-neutral: without it,
// the first SyncLBStateAll would fire before any probes had
// completed, every backend would still be in StateUnknown, and
// BackendEffectiveWeight would reduce every AS to weight 0 — which
// on VPP's side means the new-flow table empties and every new
// connection hits the "no server" drop counter until the rise
// counters catch up.
//
// The tracker expresses three states and the transitions between
// them:
//
// 1. inside [0, minDelay) — "min-delay window". No sync of any
// kind is allowed to touch VPP, neither the periodic SyncLBStateAll
// loop nor the per-transition SyncLBStateVIP path from the
// reconciler. This is the absolute hands-off window the operator
// configures with vpp.lb.startup-min-delay.
//
// 2. inside [minDelay, maxDelay), per-VIP gating — "release phase".
// Each frontend is released (and one SyncLBStateVIP runs against
// it) as soon as every backend it references has reached a
// non-Unknown state. Both the warmup driver goroutine (which
// polls at warmupPollInterval) and the reconciler event path
// (which checks on every received transition) attempt to release
// VIPs; tryRelease arbitrates.
//
// 3. allDone — warmup is complete. Either every VIP has been
// individually released during the release phase, or the
// maxDelay watchdog expired and the warmup driver ran a final
// SyncLBStateAll for any stragglers. After allDone every gate
// is open, the reconciler runs normally on every transition,
// and the periodic lbSyncLoop's ticker starts.
//
// The clock is process-relative: startAt is set in Client.New()
// and does not reset across VPP reconnects. If VPP drops at t=8s
// while the release phase is mid-run and reconnects at t=12s, the
// warmup driver re-enters the release phase knowing that 12s of
// the 30s maxDelay have already been consumed. If VPP stays down
// past maxDelay, the first connect after that jumps straight to
// the final SyncLBStateAll and marks allDone.
type warmupTracker struct {
	startAt  time.Time     // set once in Client.New(); never reset across VPP reconnects
	minDelay time.Duration // latched once by configure(); absolute hands-off window
	maxDelay time.Duration // latched once by configure(); watchdog deadline for the release phase
	mu       sync.Mutex    // guards released, allDone, and the one-time close of doneCh
	released map[string]bool // frontend name → released-for-sync
	allDone  bool          // warmup fully complete; every gate open
	doneCh   chan struct{} // closed when allDone is first set
}
// newWarmupTracker constructs a tracker whose startAt is pinned to
// time.Now(). Delay values are deliberately not read here — they are
// latched later by configure() from the config via runWarmup — so
// main.go can construct the Client before the config has been fully
// propagated.
func newWarmupTracker() *warmupTracker {
	tracker := &warmupTracker{
		startAt:  time.Now(),
		released: map[string]bool{},
		doneCh:   make(chan struct{}),
	}
	return tracker
}
// configure latches the min/max delay values onto the tracker. Only the
// first call has any effect: subsequent calls (e.g. after a config
// reload) are no-ops, so warmup never re-runs against new — possibly
// shorter — delays. It is separate from the constructor so the tracker
// can exist before the config has been parsed, and so runWarmup always
// reads a consistent pair of values even if the config is reloaded
// mid-warmup (per design decision, config reload does not reset the
// warmup clock).
func (w *warmupTracker) configure(minDelay, maxDelay time.Duration) {
	w.mu.Lock()
	defer w.mu.Unlock()
	// Latch exactly once: both fields still zero means nothing has
	// been latched yet.
	if w.minDelay == 0 && w.maxDelay == 0 {
		w.minDelay = minDelay
		w.maxDelay = maxDelay
	}
}
// inMinDelay reports whether the absolute hands-off window is still
// active. Always false once warmup has fully completed.
func (w *warmupTracker) inMinDelay() bool {
	w.mu.Lock()
	defer w.mu.Unlock()
	// Short-circuit: the elapsed-time check only matters while
	// warmup is still in progress.
	stillGated := !w.allDone && time.Since(w.startAt) < w.minDelay
	return stillGated
}
// isReleased reports whether the given frontend may be synced: true if
// warmup is fully done, or if this specific frontend has been
// individually released during the release phase. This is the cheap
// fast-path check the reconciler event handler performs before it
// considers attempting a release.
func (w *warmupTracker) isReleased(feName string) bool {
	w.mu.Lock()
	defer w.mu.Unlock()
	if w.allDone {
		return true
	}
	return w.released[feName]
}
// tryRelease atomically decides whether to release feName. Returns true
// if the frontend is (now) eligible for sync:
//
//   - already released (by a previous caller, including allDone) → true
//   - inside minDelay window → false
//   - past minDelay and caller has verified allKnown externally → true,
//     and the tracker is mutated to remember the release
//
// tryRelease does NOT check the allKnown precondition itself — the
// caller is responsible for evaluating backend states before calling.
// Separating the checks this way lets two independent release drivers
// (the warmup poll goroutine and the reconciler event handler) share
// the same gating state without exposing a mid-check race.
//
// Returns true for the "already released" case so callers have a
// single branch: if tryRelease(fe) is true, proceed to sync.
func (w *warmupTracker) tryRelease(feName string) bool {
	w.mu.Lock()
	defer w.mu.Unlock()
	switch {
	case w.allDone, w.released[feName]:
		// Idempotent: a prior release (or global completion)
		// already opened this gate.
		return true
	case time.Since(w.startAt) < w.minDelay:
		// Still inside the absolute hands-off window.
		return false
	}
	w.released[feName] = true
	return true
}
// finishAll marks warmup fully complete. Called once by runWarmup when
// either every frontend has been released via the per-VIP path or the
// maxDelay watchdog has expired. Idempotent: repeat calls are no-ops,
// which also guarantees doneCh is closed exactly once so waiters in
// lbSyncLoop unblock without a double-close panic.
func (w *warmupTracker) finishAll() {
	w.mu.Lock()
	defer w.mu.Unlock()
	if !w.allDone {
		w.allDone = true
		close(w.doneCh)
	}
}
// doneChan returns a channel that is closed when finishAll is called.
// Waiters block on this to defer periodic sync work until after the
// warmup phase has completed (or been skipped entirely). No lock is
// needed: doneCh is assigned once at construction and never replaced,
// only closed (under w.mu) by finishAll.
func (w *warmupTracker) doneChan() <-chan struct{} {
	return w.doneCh
}
// isAllDone is the non-blocking companion to doneChan: true iff
// finishAll has been called. Used by lbSyncLoop to decide whether
// to re-enter runWarmup on each VPP reconnect.
func (w *warmupTracker) isAllDone() bool {
	w.mu.Lock()
	done := w.allDone
	w.mu.Unlock()
	return done
}
// elapsed returns how long the tracker has been running, formatted
// as a human-readable Go duration string (e.g. "959ms", "5.2s") for
// use in slog attributes. Returning the string form directly —
// rather than a time.Duration — is deliberate: slog's default JSON
// handler renders time.Duration as a raw nanosecond int64 which is
// unreadable in a log viewer, while a pre-formatted string lands
// as "5.2s" and matches how the config values are written in YAML.
func (w *warmupTracker) elapsed() string {
	runtime := time.Since(w.startAt)
	return runtime.Round(time.Millisecond).String()
}
// allBackendsKnown reports whether every backend referenced by fe is
// in a non-Unknown state. This is the precondition for releasing a
// frontend during the per-VIP release phase: desiredFromFrontend can
// only produce correct weights for a frontend whose backends have
// all been probed at least once through the health checker's rise
// counter (unknown → up/down).
//
// "Known" here is the literal reading: StateUnknown disqualifies,
// everything else qualifies. That means a legitimately-down backend
// counts as known and contributes its weight=0 to the desired set,
// which is the correct restart behaviour — a backend that was down
// before the restart stays down across the restart without waiting
// for it to come back up.
func allBackendsKnown(fe config.Frontend, src StateSource) bool {
	for _, pool := range fe.Pools {
		for backendName := range pool.Backends {
			state, tracked := src.BackendState(backendName)
			if !tracked {
				return false
			}
			if state == health.StateUnknown {
				return false
			}
		}
	}
	return true
}
// runWarmup drives the warmup state machine. Called from lbSyncLoop
// on first entry (subsequent reconnect entries find allDone == true
// and skip straight to the periodic ticker).
//
// Phases:
//
//  1. Latch delay values from the current config.
//  2. If maxDelay == 0 (warmup disabled): run SyncLBStateAll
//     immediately, mark allDone, return.
//  3. Sleep until minDelay has elapsed (absolute hands-off).
//  4. Poll every warmupPollInterval, releasing any frontend whose
//     backends are all known. Each release fires a single-VIP sync.
//     Exit the poll when all frontends are released OR maxDelay
//     elapses.
//  5. Run SyncLBStateAll for any stragglers and mark allDone.
//
// Exits early if ctx is cancelled at any point.
func (c *Client) runWarmup(ctx context.Context) {
	src := c.getStateSource()
	if src == nil {
		// No state source ever registered; nothing meaningful to do.
		// Close the gate so lbSyncLoop doesn't hang.
		c.warmup.finishAll()
		return
	}
	cfg := src.Config()
	if cfg == nil {
		c.warmup.finishAll()
		return
	}
	c.warmup.configure(cfg.VPP.LB.StartupMinDelay, cfg.VPP.LB.StartupMaxDelay)
	w := c.warmup
	// maxDelay == 0 is the "no warmup" escape hatch: sync immediately
	// and mark the gate open. Operators pick this for tests and dev
	// setups where a few seconds of startup black-hole on bounce is
	// acceptable in exchange for not having to wait out the warmup.
	if w.maxDelay == 0 {
		slog.Info("vpp-lb-warmup-skipped",
			"impact", "VPP LB update skipped")
		if err := c.SyncLBStateAll(cfg); err != nil {
			slog.Warn("vpp-lb-sync-error", "err", err)
		}
		w.finishAll()
		return
	}
	slog.Info("vpp-lb-warmup-start",
		"min-delay", w.minDelay.String(),
		"max-delay", w.maxDelay.String(),
		"impact", "Gating all VPP LB updates")
	// Phase 3: wait out the min-delay absolute hands-off window.
	// startAt is process-relative, so on a reconnect mid-warmup the
	// remaining wait may already be zero or negative.
	minDeadline := w.startAt.Add(w.minDelay)
	if wait := time.Until(minDeadline); wait > 0 {
		select {
		case <-ctx.Done():
			return
		case <-time.After(wait):
		}
	}
	slog.Info("vpp-lb-warmup-min-delay-elapsed",
		"elapsed", w.elapsed(),
		"impact", "Ungating VPP LB updates for VIPs with known backend state")
	// Phase 4: poll for per-VIP release until maxDelay expires.
	// happyPath is set to true if we exit the poll loop because
	// every frontend has been released individually via SyncLBStateVIP.
	// In that case Phase 5 below skips the SyncLBStateAll entirely —
	// running it would be a redundant full reconcile over a dataplane
	// that's already in the desired state, and the log would misreport
	// the warmup-complete event as a "max-delay-final" stragglers sweep.
	maxDeadline := w.startAt.Add(w.maxDelay)
	happyPath := false
	for time.Now().Before(maxDeadline) {
		// Re-read the state source every tick: the config may not
		// have been available at loop entry (e.g. first connect
		// beat the config load race) but could be present now.
		src = c.getStateSource()
		if src != nil {
			cfg = src.Config()
		}
		// Guard on src as well as cfg: the re-read above can return a
		// nil src (state source deregistered mid-warmup) while cfg
		// still holds the previous snapshot; passing a nil src into
		// allBackendsKnown would panic on its BackendState call.
		if src != nil && cfg != nil {
			// Release any frontend whose backends have all settled.
			allReleased := true
			for feName, fe := range cfg.Frontends {
				if w.isReleased(feName) {
					continue
				}
				if !allBackendsKnown(fe, src) {
					allReleased = false
					continue
				}
				if !w.tryRelease(feName) {
					// Still inside minDelay — shouldn't happen here
					// because we waited above, but guard anyway.
					allReleased = false
					continue
				}
				slog.Info("vpp-lb-warmup-release",
					"frontend", feName,
					"trigger", "poll",
					"elapsed", w.elapsed())
				if err := c.SyncLBStateVIP(cfg, feName, ""); err != nil {
					slog.Warn("vpp-lb-warmup-release-error",
						"frontend", feName,
						"err", err)
				}
			}
			if allReleased {
				// Everything settled before maxDelay — fast path
				// out of the poll so we don't sit idle for the
				// remainder of the watchdog window.
				happyPath = true
				break
			}
		}
		// Sleep warmupPollInterval or until maxDelay, whichever
		// is shorter, before trying again.
		wait := warmupPollInterval
		if rem := time.Until(maxDeadline); rem < wait {
			wait = rem
		}
		select {
		case <-ctx.Done():
			return
		case <-time.After(wait):
		}
	}
	// Phase 5: close out warmup. Two paths, but both emit
	// vpp-lb-warmup-max-delay-elapsed at the max-delay boundary so
	// the log timeline (start → min-delay-elapsed → (releases
	// happen) → max-delay-elapsed) is consistent regardless of
	// whether the warmup ended early or the watchdog tripped.
	//
	//   - happyPath: every frontend was released individually during
	//     Phase 4 and each one's SyncLBStateVIP already ran. VPP is
	//     in the desired state; finishAll is called immediately so
	//     the periodic sync loop can start drift-correction without
	//     waiting out the rest of max-delay. The warmup driver then
	//     sleeps until max-delay and emits -max-delay-elapsed as a
	//     gratuitous timeline marker — the gate is already open,
	//     but the line completes the warmup picture for an operator
	//     reading the log and keeps the event sequence symmetric
	//     with the watchdog path.
	//
	//   - watchdog: max-delay elapsed with stragglers remaining. At
	//     least one frontend never made it through allBackendsKnown,
	//     so its effective weight computation still treats some
	//     backends as StateUnknown and will program weight=0 for
	//     them. Emit -max-delay-elapsed at the boundary, run
	//     SyncLBStateAll to sweep stragglers, then finishAll.
	if happyPath {
		slog.Info("vpp-lb-warmup-complete",
			"elapsed", w.elapsed(),
			"impact", "Ungating VPP LB updates, all frontends released")
		w.finishAll()
		if wait := time.Until(maxDeadline); wait > 0 {
			select {
			case <-ctx.Done():
				return
			case <-time.After(wait):
			}
		}
		slog.Info("vpp-lb-warmup-max-delay-elapsed",
			"elapsed", w.elapsed(),
			"impact", "Ungating all VPP LB updates")
		return
	}
	slog.Info("vpp-lb-warmup-max-delay-elapsed",
		"elapsed", w.elapsed(),
		"impact", "Ungating all VPP LB updates")
	// Re-read the freshest config for the straggler sweep; SyncLBStateAll
	// is only dereferencing cfg here, so a nil src merely means we keep
	// the last snapshot.
	src = c.getStateSource()
	if src != nil {
		cfg = src.Config()
	}
	if cfg != nil {
		if err := c.SyncLBStateAll(cfg); err != nil {
			slog.Warn("vpp-lb-sync-error", "err", err)
		}
	}
	w.finishAll()
}