// SPDX-License-Identifier: Apache-2.0

package vpp

import (
	"context"
	"log/slog"
	"sync"
	"time"

	"git.ipng.ch/ipng/vpp-maglev/internal/config"
	"git.ipng.ch/ipng/vpp-maglev/internal/health"
)

// warmupPollInterval is how often runWarmup re-checks per-VIP backend
// state during the [minDelay, maxDelay) per-VIP release phase. 250ms
// is fast enough that a VIP whose last backend just completed rise
// probes gets released within a quarter-second of settling, and slow
// enough that the polling cost is negligible compared to probe work
// the checker is doing on the same core at the same time.
const warmupPollInterval = 250 * time.Millisecond

// warmupTracker is the per-process gate for the VPP LB sync path
// during the first StartupMaxDelay seconds of maglevd's life. It
// exists to keep a maglevd restart dataplane-neutral: without it, the
// first SyncLBStateAll would fire before any probes had completed,
// every backend would still be in StateUnknown, and
// BackendEffectiveWeight would reduce every AS to weight 0 — which on
// VPP's side means the new-flow table empties and every new
// connection hits the "no server" drop counter until the rise
// counters catch up.
//
// The tracker expresses three states and the transitions between
// them:
//
//  1. inside [0, minDelay) — "min-delay window". No sync of any kind
//     is allowed to touch VPP, neither the periodic SyncLBStateAll
//     loop nor the per-transition SyncLBStateVIP path from the
//     reconciler. This is the absolute hands-off window the operator
//     configures with vpp.lb.startup-min-delay.
//
//  2. inside [minDelay, maxDelay), per-VIP gating — "release phase".
//     Each frontend is released (and one SyncLBStateVIP runs against
//     it) as soon as every backend it references has reached a
//     non-Unknown state. Both the warmup driver goroutine (which
//     polls at warmupPollInterval) and the reconciler event path
//     (which checks on every received transition) attempt to release
//     VIPs; tryRelease arbitrates.
//
//  3. allDone — warmup is complete. Either every VIP has been
//     individually released during the release phase, or the
//     maxDelay watchdog expired and the warmup driver ran a final
//     SyncLBStateAll for any stragglers. After allDone every gate is
//     open, the reconciler runs normally on every transition, and
//     the periodic lbSyncLoop's ticker starts.
//
// The clock is process-relative: startAt is set in Client.New() and
// does not reset across VPP reconnects. If VPP drops at t=8s while
// the release phase is mid-run and reconnects at t=12s, the warmup
// driver re-enters the release phase knowing that 12s of the 30s
// maxDelay have already been consumed. If VPP stays down past
// maxDelay, the first connect after that jumps straight to the final
// SyncLBStateAll and marks allDone.
type warmupTracker struct {
	startAt  time.Time     // process-relative warmup clock; pinned once in newWarmupTracker
	minDelay time.Duration // absolute hands-off window; latched by configure()
	maxDelay time.Duration // watchdog deadline; latched by configure()

	mu       sync.Mutex
	released map[string]bool // frontend name → released-for-sync
	allDone  bool            // true once warmup has fully completed
	doneCh   chan struct{}   // closed when allDone is first set
}

// newWarmupTracker constructs a tracker with startAt pinned to time.Now().
// Delay values are not read at construction time — they come from the
// config via runWarmup's call to getStateSource().Config() — so main.go
// can construct the Client before the config has been fully propagated.
func newWarmupTracker() *warmupTracker {
	return &warmupTracker{
		startAt:  time.Now(),
		released: make(map[string]bool),
		doneCh:   make(chan struct{}),
	}
}

// configure latches the min/max delay values onto the tracker. Idempotent
// if called with the same values; separate from the constructor so the
// tracker exists before we've parsed the config, and so runWarmup can
// read a consistent pair of values even if the config is reloaded mid-
// warmup (per design decision, config reload does not reset the warmup
// clock, and the delay values latched at first configure() are authoritative
// for the lifetime of the warmup phase).
func (w *warmupTracker) configure(minDelay, maxDelay time.Duration) { w.mu.Lock() defer w.mu.Unlock() // Only latch once. Subsequent calls are no-ops so a config reload // doesn't re-run warmup against new (possibly shorter) delays. if w.minDelay != 0 || w.maxDelay != 0 { return } w.minDelay = minDelay w.maxDelay = maxDelay } // inMinDelay reports whether the absolute hands-off window is still active. func (w *warmupTracker) inMinDelay() bool { w.mu.Lock() defer w.mu.Unlock() if w.allDone { return false } return time.Since(w.startAt) < w.minDelay } // isReleased reports whether the given frontend may be synced. True if // warmup is fully done or this specific frontend has been individually // released during the release phase. Fast path for the reconciler event // handler: a cheap check before it considers attempting a release. func (w *warmupTracker) isReleased(feName string) bool { w.mu.Lock() defer w.mu.Unlock() return w.allDone || w.released[feName] } // tryRelease atomically decides whether to release feName. Returns true // if the frontend is (now) eligible for sync: // // - already released (by a previous caller, including allDone) → true // - inside minDelay window → false // - past minDelay and caller has verified allKnown externally → true, // and the tracker is mutated to remember the release // // tryRelease does NOT check the allKnown precondition itself — the // caller is responsible for evaluating backend states before calling. // Separating the checks this way lets two independent release drivers // (the warmup poll goroutine and the reconciler event handler) share // the same gating state without exposing a mid-check race. // // Returns true for the "already released" case so callers have a // single branch: if tryRelease(fe) is true, proceed to sync. 
func (w *warmupTracker) tryRelease(feName string) bool { w.mu.Lock() defer w.mu.Unlock() if w.allDone || w.released[feName] { return true } if time.Since(w.startAt) < w.minDelay { return false } w.released[feName] = true return true } // finishAll marks warmup fully complete. Called once by runWarmup when // either every frontend has been released via the per-VIP path or the // maxDelay watchdog has expired. Idempotent: repeat calls are no-ops. // Closes doneCh on the first call so waiters in lbSyncLoop unblock. func (w *warmupTracker) finishAll() { w.mu.Lock() defer w.mu.Unlock() if w.allDone { return } w.allDone = true close(w.doneCh) } // doneChan returns a channel that is closed when finishAll is called. // Waiters block on this to defer periodic sync work until after the // warmup phase has completed (or been skipped entirely). func (w *warmupTracker) doneChan() <-chan struct{} { return w.doneCh } // isAllDone is the non-blocking companion to doneChan: true iff // finishAll has been called. Used by lbSyncLoop to decide whether // to re-enter runWarmup on each VPP reconnect. func (w *warmupTracker) isAllDone() bool { w.mu.Lock() defer w.mu.Unlock() return w.allDone } // elapsed returns how long the tracker has been running, formatted // as a human-readable Go duration string (e.g. "959ms", "5.2s") for // use in slog attributes. Returning the string form directly — // rather than a time.Duration — is deliberate: slog's default JSON // handler renders time.Duration as a raw nanosecond int64 which is // unreadable in a log viewer, while a pre-formatted string lands // as "5.2s" and matches how the config values are written in YAML. func (w *warmupTracker) elapsed() string { return time.Since(w.startAt).Round(time.Millisecond).String() } // allBackendsKnown reports whether every backend referenced by fe is // in a non-Unknown state. 
This is the precondition for releasing a // frontend during the per-VIP release phase: desiredFromFrontend can // only produce correct weights for a frontend whose backends have // all been probed at least once through the health checker's rise // counter (unknown → up/down). // // "Known" here is the literal reading: StateUnknown disqualifies, // everything else qualifies. That means a legitimately-down backend // counts as known and contributes its weight=0 to the desired set, // which is the correct restart behaviour — a backend that was down // before the restart stays down across the restart without waiting // for it to come back up. func allBackendsKnown(fe config.Frontend, src StateSource) bool { for _, pool := range fe.Pools { for bName := range pool.Backends { s, ok := src.BackendState(bName) if !ok || s == health.StateUnknown { return false } } } return true } // runWarmup drives the warmup state machine. Called from lbSyncLoop // on first entry (subsequent reconnect entries find allDone == true // and skip straight to the periodic ticker). // // Phases: // // 1. Latch delay values from the current config. // 2. If maxDelay == 0 (warmup disabled): run SyncLBStateAll // immediately, mark allDone, return. // 3. Sleep until minDelay has elapsed (absolute hands-off). // 4. Poll every warmupPollInterval, releasing any frontend whose // backends are all known. Each release fires a single-VIP sync. // Exit the poll when all frontends are released OR maxDelay // elapses. // 5. Run SyncLBStateAll for any stragglers and mark allDone. // // Exits early if ctx is cancelled at any point. func (c *Client) runWarmup(ctx context.Context) { src := c.getStateSource() if src == nil { // No state source ever registered; nothing meaningful to do. // Close the gate so lbSyncLoop doesn't hang. 
c.warmup.finishAll() return } cfg := src.Config() if cfg == nil { c.warmup.finishAll() return } c.warmup.configure(cfg.VPP.LB.StartupMinDelay, cfg.VPP.LB.StartupMaxDelay) w := c.warmup // maxDelay == 0 is the "no warmup" escape hatch: sync immediately // and mark the gate open. Operators pick this for tests and dev // setups where a few seconds of startup black-hole on bounce is // acceptable in exchange for not having to wait out the warmup. if w.maxDelay == 0 { slog.Info("vpp-lb-warmup-skipped", "impact", "VPP LB update skipped") if err := c.SyncLBStateAll(cfg); err != nil { slog.Warn("vpp-lb-sync-error", "err", err) } w.finishAll() return } slog.Info("vpp-lb-warmup-start", "min-delay", w.minDelay.String(), "max-delay", w.maxDelay.String(), "impact", "Gating all VPP LB updates") // Phase 3: wait out the min-delay absolute hands-off window. minDeadline := w.startAt.Add(w.minDelay) if wait := time.Until(minDeadline); wait > 0 { select { case <-ctx.Done(): return case <-time.After(wait): } } slog.Info("vpp-lb-warmup-min-delay-elapsed", "elapsed", w.elapsed(), "impact", "Ungating VPP LB updates for VIPs with known backend state") // Phase 4: poll for per-VIP release until maxDelay expires. // happyPath is set to true if we exit the poll loop because // every frontend has been released individually via SyncLBStateVIP. // In that case Phase 5 below skips the SyncLBStateAll entirely — // running it would be a redundant full reconcile over a dataplane // that's already in the desired state, and the log would misreport // the warmup-complete event as a "max-delay-final" stragglers sweep. maxDeadline := w.startAt.Add(w.maxDelay) happyPath := false for time.Now().Before(maxDeadline) { // Re-read the state source every tick: the config may not // have been available at loop entry (e.g. first connect // beat the config load race) but could be present now. 
src = c.getStateSource() if src != nil { cfg = src.Config() } if cfg != nil { // Release any frontend whose backends have all settled. allReleased := true for feName, fe := range cfg.Frontends { if w.isReleased(feName) { continue } if !allBackendsKnown(fe, src) { allReleased = false continue } if !w.tryRelease(feName) { // Still inside minDelay — shouldn't happen here // because we waited above, but guard anyway. allReleased = false continue } slog.Info("vpp-lb-warmup-release", "frontend", feName, "trigger", "poll", "elapsed", w.elapsed()) if err := c.SyncLBStateVIP(cfg, feName, ""); err != nil { slog.Warn("vpp-lb-warmup-release-error", "frontend", feName, "err", err) } } if allReleased { // Everything settled before maxDelay — fast path // out of the poll so we don't sit idle for the // remainder of the watchdog window. happyPath = true break } } // Sleep warmupPollInterval or until maxDelay, whichever // is shorter, before trying again. wait := warmupPollInterval if rem := time.Until(maxDeadline); rem < wait { wait = rem } select { case <-ctx.Done(): return case <-time.After(wait): } } // Phase 5: close out warmup. Two paths, but both emit // vpp-lb-warmup-max-delay-elapsed at the max-delay boundary so // the log timeline (start → min-delay-elapsed → (releases // happen) → max-delay-elapsed) is consistent regardless of // whether the warmup ended early or the watchdog tripped. // // - happyPath: every frontend was released individually during // Phase 4 and each one's SyncLBStateVIP already ran. VPP is // in the desired state; finishAll is called immediately so // the periodic sync loop can start drift-correction without // waiting out the rest of max-delay. The warmup driver then // sleeps until max-delay and emits -max-delay-elapsed as a // gratuitous timeline marker — the gate is already open, // but the line completes the warmup picture for an operator // reading the log and keeps the event sequence symmetric // with the watchdog path. 
// // - watchdog: max-delay elapsed with stragglers remaining. At // least one frontend never made it through allBackendsKnown, // so its effective weight computation still treats some // backends as StateUnknown and will program weight=0 for // them. Emit -max-delay-elapsed at the boundary, run // SyncLBStateAll to sweep stragglers, then finishAll. if happyPath { slog.Info("vpp-lb-warmup-complete", "elapsed", w.elapsed(), "impact", "Ungating VPP LB updates, all frontends released") w.finishAll() if wait := time.Until(maxDeadline); wait > 0 { select { case <-ctx.Done(): return case <-time.After(wait): } } slog.Info("vpp-lb-warmup-max-delay-elapsed", "elapsed", w.elapsed(), "impact", "Ungating all VPP LB updates") return } slog.Info("vpp-lb-warmup-max-delay-elapsed", "elapsed", w.elapsed(), "impact", "Ungating all VPP LB updates") src = c.getStateSource() if src != nil { cfg = src.Config() } if cfg != nil { if err := c.SyncLBStateAll(cfg); err != nil { slog.Warn("vpp-lb-sync-error", "err", err) } } w.finishAll() }