// SPDX-License-Identifier: Apache-2.0

package vpp

import (
	"context"
	"log/slog"
	"sync"
	"time"

	"git.ipng.ch/ipng/vpp-maglev/internal/config"
	"git.ipng.ch/ipng/vpp-maglev/internal/health"
)
// warmupPollInterval is how often runWarmup re-checks per-VIP backend
// state during the [minDelay, maxDelay) per-VIP release phase. 250ms
// is fast enough that a VIP whose last backend just completed rise
// probes gets released within a quarter-second of settling, and slow
// enough that the polling cost is negligible compared to probe work
// the checker is doing on the same core at the same time.
const warmupPollInterval = 250 * time.Millisecond
// warmupTracker is the per-process gate for the VPP LB sync path
// during the first StartupMaxDelay seconds of maglevd's life. It
// exists to keep a maglevd restart dataplane-neutral: without it,
// the first SyncLBStateAll would fire before any probes had
// completed, every backend would still be in StateUnknown, and
// BackendEffectiveWeight would reduce every AS to weight 0 — which
// on VPP's side means the new-flow table empties and every new
// connection hits the "no server" drop counter until the rise
// counters catch up.
//
// The tracker expresses three states and the transitions between
// them:
//
//  1. inside [0, minDelay) — "min-delay window". No sync of any
//     kind is allowed to touch VPP, neither the periodic SyncLBStateAll
//     loop nor the per-transition SyncLBStateVIP path from the
//     reconciler. This is the absolute hands-off window the operator
//     configures with vpp.lb.startup-min-delay.
//
//  2. inside [minDelay, maxDelay), per-VIP gating — "release phase".
//     Each frontend is released (and one SyncLBStateVIP runs against
//     it) as soon as every backend it references has reached a
//     non-Unknown state. Both the warmup driver goroutine (which
//     polls at warmupPollInterval) and the reconciler event path
//     (which checks on every received transition) attempt to release
//     VIPs; tryRelease arbitrates.
//
//  3. allDone — warmup is complete. Either every VIP has been
//     individually released during the release phase, or the
//     maxDelay watchdog expired and the warmup driver ran a final
//     SyncLBStateAll for any stragglers. After allDone every gate
//     is open, the reconciler runs normally on every transition,
//     and the periodic lbSyncLoop's ticker starts.
//
// The clock is process-relative: startAt is set in Client.New()
// and does not reset across VPP reconnects. If VPP drops at t=8s
// while the release phase is mid-run and reconnects at t=12s, the
// warmup driver re-enters the release phase knowing that 12s of
// the 30s maxDelay have already been consumed. If VPP stays down
// past maxDelay, the first connect after that jumps straight to
// the final SyncLBStateAll and marks allDone.
type warmupTracker struct {
	startAt  time.Time     // process-relative warmup epoch, pinned at construction
	minDelay time.Duration // latched once by configure(); absolute hands-off window
	maxDelay time.Duration // latched once by configure(); watchdog for the release phase

	mu       sync.Mutex
	released map[string]bool // frontend name → released-for-sync
	allDone  bool
	doneCh   chan struct{} // closed when allDone is first set
}
// newWarmupTracker constructs a tracker with startAt pinned to time.Now().
|
|
// Delay values are not read at construction time — they come from the
|
|
// config via runWarmup's call to getStateSource().Config() — so main.go
|
|
// can construct the Client before the config has been fully propagated.
|
|
func newWarmupTracker() *warmupTracker {
|
|
return &warmupTracker{
|
|
startAt: time.Now(),
|
|
released: make(map[string]bool),
|
|
doneCh: make(chan struct{}),
|
|
}
|
|
}
|
|
|
|
// configure latches the min/max delay values onto the tracker. Idempotent
|
|
// if called with the same values; separate from the constructor so the
|
|
// tracker exists before we've parsed the config, and so runWarmup can
|
|
// read a consistent pair of values even if the config is reloaded mid-
|
|
// warmup (per design decision, config reload does not reset the warmup
|
|
// clock, and the delay values latched at first configure() are authoritative
|
|
// for the lifetime of the warmup phase).
|
|
func (w *warmupTracker) configure(minDelay, maxDelay time.Duration) {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
// Only latch once. Subsequent calls are no-ops so a config reload
|
|
// doesn't re-run warmup against new (possibly shorter) delays.
|
|
if w.minDelay != 0 || w.maxDelay != 0 {
|
|
return
|
|
}
|
|
w.minDelay = minDelay
|
|
w.maxDelay = maxDelay
|
|
}
|
|
|
|
// inMinDelay reports whether the absolute hands-off window is still active.
|
|
func (w *warmupTracker) inMinDelay() bool {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
if w.allDone {
|
|
return false
|
|
}
|
|
return time.Since(w.startAt) < w.minDelay
|
|
}
|
|
|
|
// isReleased reports whether the given frontend may be synced. True if
|
|
// warmup is fully done or this specific frontend has been individually
|
|
// released during the release phase. Fast path for the reconciler event
|
|
// handler: a cheap check before it considers attempting a release.
|
|
func (w *warmupTracker) isReleased(feName string) bool {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
return w.allDone || w.released[feName]
|
|
}
|
|
|
|
// tryRelease atomically decides whether to release feName. Returns true
|
|
// if the frontend is (now) eligible for sync:
|
|
//
|
|
// - already released (by a previous caller, including allDone) → true
|
|
// - inside minDelay window → false
|
|
// - past minDelay and caller has verified allKnown externally → true,
|
|
// and the tracker is mutated to remember the release
|
|
//
|
|
// tryRelease does NOT check the allKnown precondition itself — the
|
|
// caller is responsible for evaluating backend states before calling.
|
|
// Separating the checks this way lets two independent release drivers
|
|
// (the warmup poll goroutine and the reconciler event handler) share
|
|
// the same gating state without exposing a mid-check race.
|
|
//
|
|
// Returns true for the "already released" case so callers have a
|
|
// single branch: if tryRelease(fe) is true, proceed to sync.
|
|
func (w *warmupTracker) tryRelease(feName string) bool {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
if w.allDone || w.released[feName] {
|
|
return true
|
|
}
|
|
if time.Since(w.startAt) < w.minDelay {
|
|
return false
|
|
}
|
|
w.released[feName] = true
|
|
return true
|
|
}
|
|
|
|
// finishAll marks warmup fully complete. Called once by runWarmup when
|
|
// either every frontend has been released via the per-VIP path or the
|
|
// maxDelay watchdog has expired. Idempotent: repeat calls are no-ops.
|
|
// Closes doneCh on the first call so waiters in lbSyncLoop unblock.
|
|
func (w *warmupTracker) finishAll() {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
if w.allDone {
|
|
return
|
|
}
|
|
w.allDone = true
|
|
close(w.doneCh)
|
|
}
|
|
|
|
// doneChan returns a channel that is closed when finishAll is called.
// Waiters block on this to defer periodic sync work until after the
// warmup phase has completed (or been skipped entirely). Safe to call
// without the mutex: doneCh is assigned once at construction and only
// ever closed, never replaced.
func (w *warmupTracker) doneChan() <-chan struct{} {
	return w.doneCh
}
// isAllDone is the non-blocking companion to doneChan: true iff
|
|
// finishAll has been called. Used by lbSyncLoop to decide whether
|
|
// to re-enter runWarmup on each VPP reconnect.
|
|
func (w *warmupTracker) isAllDone() bool {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
return w.allDone
|
|
}
|
|
|
|
// elapsed returns how long the tracker has been running, formatted
|
|
// as a human-readable Go duration string (e.g. "959ms", "5.2s") for
|
|
// use in slog attributes. Returning the string form directly —
|
|
// rather than a time.Duration — is deliberate: slog's default JSON
|
|
// handler renders time.Duration as a raw nanosecond int64 which is
|
|
// unreadable in a log viewer, while a pre-formatted string lands
|
|
// as "5.2s" and matches how the config values are written in YAML.
|
|
func (w *warmupTracker) elapsed() string {
|
|
return time.Since(w.startAt).Round(time.Millisecond).String()
|
|
}
|
|
|
|
// allBackendsKnown reports whether every backend referenced by fe is
|
|
// in a non-Unknown state. This is the precondition for releasing a
|
|
// frontend during the per-VIP release phase: desiredFromFrontend can
|
|
// only produce correct weights for a frontend whose backends have
|
|
// all been probed at least once through the health checker's rise
|
|
// counter (unknown → up/down).
|
|
//
|
|
// "Known" here is the literal reading: StateUnknown disqualifies,
|
|
// everything else qualifies. That means a legitimately-down backend
|
|
// counts as known and contributes its weight=0 to the desired set,
|
|
// which is the correct restart behaviour — a backend that was down
|
|
// before the restart stays down across the restart without waiting
|
|
// for it to come back up.
|
|
func allBackendsKnown(fe config.Frontend, src StateSource) bool {
|
|
for _, pool := range fe.Pools {
|
|
for bName := range pool.Backends {
|
|
s, ok := src.BackendState(bName)
|
|
if !ok || s == health.StateUnknown {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// runWarmup drives the warmup state machine. Called from lbSyncLoop
// on first entry (subsequent reconnect entries find allDone == true
// and skip straight to the periodic ticker).
//
// Phases:
//
//  1. Latch delay values from the current config.
//  2. If maxDelay == 0 (warmup disabled): run SyncLBStateAll
//     immediately, mark allDone, return.
//  3. Sleep until minDelay has elapsed (absolute hands-off).
//  4. Poll every warmupPollInterval, releasing any frontend whose
//     backends are all known. Each release fires a single-VIP sync.
//     Exit the poll when all frontends are released OR maxDelay
//     elapses.
//  5. Run SyncLBStateAll for any stragglers and mark allDone.
//
// Exits early if ctx is cancelled at any point. Every early-return
// path except cancellation calls finishAll() so lbSyncLoop waiters
// on doneChan() never hang.
func (c *Client) runWarmup(ctx context.Context) {
	src := c.getStateSource()
	if src == nil {
		// No state source ever registered; nothing meaningful to do.
		// Close the gate so lbSyncLoop doesn't hang.
		c.warmup.finishAll()
		return
	}
	cfg := src.Config()
	if cfg == nil {
		// State source exists but no config yet; open the gate rather
		// than gate syncs forever on a config that may never arrive.
		c.warmup.finishAll()
		return
	}
	// Phase 1: latch delay values (no-op if a prior call latched first).
	c.warmup.configure(cfg.VPP.LB.StartupMinDelay, cfg.VPP.LB.StartupMaxDelay)

	w := c.warmup

	// Phase 2: maxDelay == 0 is the "no warmup" escape hatch: sync
	// immediately and mark the gate open. Operators pick this for tests
	// and dev setups where a few seconds of startup black-hole on bounce
	// is acceptable in exchange for not having to wait out the warmup.
	if w.maxDelay == 0 {
		slog.Info("vpp-lb-warmup-skipped",
			"impact", "VPP LB update skipped")
		if err := c.SyncLBStateAll(cfg); err != nil {
			slog.Warn("vpp-lb-sync-error", "err", err)
		}
		w.finishAll()
		return
	}

	slog.Info("vpp-lb-warmup-start",
		"min-delay", w.minDelay.String(),
		"max-delay", w.maxDelay.String(),
		"impact", "Gating all VPP LB updates")

	// Phase 3: wait out the min-delay absolute hands-off window.
	// Deadlines are anchored to w.startAt, not time.Now(), so a VPP
	// reconnect mid-warmup does not restart the clock.
	minDeadline := w.startAt.Add(w.minDelay)
	if wait := time.Until(minDeadline); wait > 0 {
		select {
		case <-ctx.Done():
			return
		case <-time.After(wait):
		}
	}
	slog.Info("vpp-lb-warmup-min-delay-elapsed",
		"elapsed", w.elapsed(),
		"impact", "Ungating VPP LB updates for VIPs with known backend state")

	// Phase 4: poll for per-VIP release until maxDelay expires.
	// happyPath is set to true if we exit the poll loop because
	// every frontend has been released individually via SyncLBStateVIP.
	// In that case Phase 5 below skips the SyncLBStateAll entirely —
	// running it would be a redundant full reconcile over a dataplane
	// that's already in the desired state, and the log would misreport
	// the warmup-complete event as a "max-delay-final" stragglers sweep.
	maxDeadline := w.startAt.Add(w.maxDelay)
	happyPath := false
	for time.Now().Before(maxDeadline) {
		// Re-read the state source every tick: the config may not
		// have been available at loop entry (e.g. first connect
		// beat the config load race) but could be present now.
		src = c.getStateSource()
		if src != nil {
			cfg = src.Config()
		}
		if cfg != nil {
			// Release any frontend whose backends have all settled.
			allReleased := true
			for feName, fe := range cfg.Frontends {
				if w.isReleased(feName) {
					continue
				}
				if !allBackendsKnown(fe, src) {
					allReleased = false
					continue
				}
				if !w.tryRelease(feName) {
					// Still inside minDelay — shouldn't happen here
					// because we waited above, but guard anyway.
					allReleased = false
					continue
				}
				slog.Info("vpp-lb-warmup-release",
					"frontend", feName,
					"trigger", "poll",
					"elapsed", w.elapsed())
				if err := c.SyncLBStateVIP(cfg, feName, ""); err != nil {
					slog.Warn("vpp-lb-warmup-release-error",
						"frontend", feName,
						"err", err)
				}
			}
			if allReleased {
				// Everything settled before maxDelay — fast path
				// out of the poll so we don't sit idle for the
				// remainder of the watchdog window.
				happyPath = true
				break
			}
		}

		// Sleep warmupPollInterval or until maxDelay, whichever
		// is shorter, before trying again.
		wait := warmupPollInterval
		if rem := time.Until(maxDeadline); rem < wait {
			wait = rem
		}
		select {
		case <-ctx.Done():
			return
		case <-time.After(wait):
		}
	}

	// Phase 5: close out warmup. Two paths, but both emit
	// vpp-lb-warmup-max-delay-elapsed at the max-delay boundary so
	// the log timeline (start → min-delay-elapsed → (releases
	// happen) → max-delay-elapsed) is consistent regardless of
	// whether the warmup ended early or the watchdog tripped.
	//
	//   - happyPath: every frontend was released individually during
	//     Phase 4 and each one's SyncLBStateVIP already ran. VPP is
	//     in the desired state; finishAll is called immediately so
	//     the periodic sync loop can start drift-correction without
	//     waiting out the rest of max-delay. The warmup driver then
	//     sleeps until max-delay and emits -max-delay-elapsed as a
	//     gratuitous timeline marker — the gate is already open,
	//     but the line completes the warmup picture for an operator
	//     reading the log and keeps the event sequence symmetric
	//     with the watchdog path.
	//
	//   - watchdog: max-delay elapsed with stragglers remaining. At
	//     least one frontend never made it through allBackendsKnown,
	//     so its effective weight computation still treats some
	//     backends as StateUnknown and will program weight=0 for
	//     them. Emit -max-delay-elapsed at the boundary, run
	//     SyncLBStateAll to sweep stragglers, then finishAll.
	if happyPath {
		slog.Info("vpp-lb-warmup-complete",
			"elapsed", w.elapsed(),
			"impact", "Ungating VPP LB updates, all frontends released")
		w.finishAll()

		if wait := time.Until(maxDeadline); wait > 0 {
			select {
			case <-ctx.Done():
				return
			case <-time.After(wait):
			}
		}
		slog.Info("vpp-lb-warmup-max-delay-elapsed",
			"elapsed", w.elapsed(),
			"impact", "Ungating all VPP LB updates")
		return
	}

	slog.Info("vpp-lb-warmup-max-delay-elapsed",
		"elapsed", w.elapsed(),
		"impact", "Ungating all VPP LB updates")
	// Re-read one last time: the config may have appeared after the
	// final poll tick but before the watchdog fired.
	src = c.getStateSource()
	if src != nil {
		cfg = src.Config()
	}
	if cfg != nil {
		if err := c.SyncLBStateAll(cfg); err != nil {
			slog.Warn("vpp-lb-sync-error", "err", err)
		}
	}
	w.finishAll()
}