// SPDX-License-Identifier: Apache-2.0

package vpp

import (
	"context"
	"log/slog"
	"sync"
	"time"

	"git.ipng.ch/ipng/vpp-maglev/internal/config"
	"git.ipng.ch/ipng/vpp-maglev/internal/health"
)
// warmupPollInterval is how often runWarmup re-checks per-VIP backend
// state during the [minDelay, maxDelay) per-VIP release phase. 250ms
// is fast enough that a VIP whose last backend just completed rise
// probes gets released within a quarter-second of settling, and slow
// enough that the polling cost is negligible compared to probe work
// the checker is doing on the same core at the same time.
const warmupPollInterval = 250 * time.Millisecond
// warmupTracker is the per-process gate for the VPP LB sync path
// during the first StartupMaxDelay seconds of maglevd's life. It
// exists to keep a maglevd restart dataplane-neutral: without it,
// the first SyncLBStateAll would fire before any probes had
// completed, every backend would still be in StateUnknown, and
// BackendEffectiveWeight would reduce every AS to weight 0 — which
// on VPP's side means the new-flow table empties and every new
// connection hits the "no server" drop counter until the rise
// counters catch up.
//
// The tracker expresses three states and the transitions between
// them:
//
//  1. inside [0, minDelay) — "min-delay window". No sync of any
//     kind is allowed to touch VPP, neither the periodic SyncLBStateAll
//     loop nor the per-transition SyncLBStateVIP path from the
//     reconciler. This is the absolute hands-off window the operator
//     configures with vpp.lb.startup-min-delay.
//
//  2. inside [minDelay, maxDelay), per-VIP gating — "release phase".
//     Each frontend is released (and one SyncLBStateVIP runs against
//     it) as soon as every backend it references has reached a
//     non-Unknown state. Both the warmup driver goroutine (which
//     polls at warmupPollInterval) and the reconciler event path
//     (which checks on every received transition) attempt to release
//     VIPs; tryRelease arbitrates.
//
//  3. allDone — warmup is complete. Either every VIP has been
//     individually released during the release phase, or the
//     maxDelay watchdog expired and the warmup driver ran a final
//     SyncLBStateAll for any stragglers. After allDone every gate
//     is open, the reconciler runs normally on every transition,
//     and the periodic lbSyncLoop's ticker starts.
//
// The clock is process-relative: startAt is set in Client.New()
// and does not reset across VPP reconnects. If VPP drops at t=8s
// while the release phase is mid-run and reconnects at t=12s, the
// warmup driver re-enters the release phase knowing that 12s of
// the 30s maxDelay have already been consumed. If VPP stays down
// past maxDelay, the first connect after that jumps straight to
// the final SyncLBStateAll and marks allDone.
type warmupTracker struct {
	startAt  time.Time     // process-relative warmup epoch, pinned at construction
	minDelay time.Duration // latched once by configure(); absolute hands-off window
	maxDelay time.Duration // latched once by configure(); watchdog for the release phase

	mu       sync.Mutex
	released map[string]bool // frontend name → released-for-sync
	allDone  bool
	doneCh   chan struct{} // closed when allDone is first set
}
// newWarmupTracker constructs a tracker with startAt pinned to time.Now().
|
|
// Delay values are not read at construction time — they come from the
|
|
// config via runWarmup's call to getStateSource().Config() — so main.go
|
|
// can construct the Client before the config has been fully propagated.
|
|
func newWarmupTracker() *warmupTracker {
|
|
return &warmupTracker{
|
|
startAt: time.Now(),
|
|
released: make(map[string]bool),
|
|
doneCh: make(chan struct{}),
|
|
}
|
|
}
|
|
|
|
// configure latches the min/max delay values onto the tracker. Idempotent
|
|
// if called with the same values; separate from the constructor so the
|
|
// tracker exists before we've parsed the config, and so runWarmup can
|
|
// read a consistent pair of values even if the config is reloaded mid-
|
|
// warmup (per design decision, config reload does not reset the warmup
|
|
// clock, and the delay values latched at first configure() are authoritative
|
|
// for the lifetime of the warmup phase).
|
|
func (w *warmupTracker) configure(minDelay, maxDelay time.Duration) {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
// Only latch once. Subsequent calls are no-ops so a config reload
|
|
// doesn't re-run warmup against new (possibly shorter) delays.
|
|
if w.minDelay != 0 || w.maxDelay != 0 {
|
|
return
|
|
}
|
|
w.minDelay = minDelay
|
|
w.maxDelay = maxDelay
|
|
}
|
|
|
|
// inMinDelay reports whether the absolute hands-off window is still active.
|
|
func (w *warmupTracker) inMinDelay() bool {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
if w.allDone {
|
|
return false
|
|
}
|
|
return time.Since(w.startAt) < w.minDelay
|
|
}
|
|
|
|
// isReleased reports whether the given frontend may be synced. True if
|
|
// warmup is fully done or this specific frontend has been individually
|
|
// released during the release phase. Fast path for the reconciler event
|
|
// handler: a cheap check before it considers attempting a release.
|
|
func (w *warmupTracker) isReleased(feName string) bool {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
return w.allDone || w.released[feName]
|
|
}
|
|
|
|
// tryRelease atomically decides whether to release feName. Returns true
|
|
// if the frontend is (now) eligible for sync:
|
|
//
|
|
// - already released (by a previous caller, including allDone) → true
|
|
// - inside minDelay window → false
|
|
// - past minDelay and caller has verified allKnown externally → true,
|
|
// and the tracker is mutated to remember the release
|
|
//
|
|
// tryRelease does NOT check the allKnown precondition itself — the
|
|
// caller is responsible for evaluating backend states before calling.
|
|
// Separating the checks this way lets two independent release drivers
|
|
// (the warmup poll goroutine and the reconciler event handler) share
|
|
// the same gating state without exposing a mid-check race.
|
|
//
|
|
// Returns true for the "already released" case so callers have a
|
|
// single branch: if tryRelease(fe) is true, proceed to sync.
|
|
func (w *warmupTracker) tryRelease(feName string) bool {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
if w.allDone || w.released[feName] {
|
|
return true
|
|
}
|
|
if time.Since(w.startAt) < w.minDelay {
|
|
return false
|
|
}
|
|
w.released[feName] = true
|
|
return true
|
|
}
|
|
|
|
// finishAll marks warmup fully complete. Called once by runWarmup when
|
|
// either every frontend has been released via the per-VIP path or the
|
|
// maxDelay watchdog has expired. Idempotent: repeat calls are no-ops.
|
|
// Closes doneCh on the first call so waiters in lbSyncLoop unblock.
|
|
func (w *warmupTracker) finishAll() {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
if w.allDone {
|
|
return
|
|
}
|
|
w.allDone = true
|
|
close(w.doneCh)
|
|
}
|
|
|
|
// doneChan returns a channel that is closed when finishAll is called.
// Waiters block on this to defer periodic sync work until after the
// warmup phase has completed (or been skipped entirely). Safe to call
// without the mutex: doneCh is assigned once at construction and only
// ever closed, never replaced.
func (w *warmupTracker) doneChan() <-chan struct{} {
	return w.doneCh
}
// isAllDone is the non-blocking companion to doneChan: true iff
|
|
// finishAll has been called. Used by lbSyncLoop to decide whether
|
|
// to re-enter runWarmup on each VPP reconnect.
|
|
func (w *warmupTracker) isAllDone() bool {
|
|
w.mu.Lock()
|
|
defer w.mu.Unlock()
|
|
return w.allDone
|
|
}
|
|
|
|
// elapsed returns how long the tracker has been running, formatted
|
|
// as a human-readable Go duration string (e.g. "959ms", "5.2s") for
|
|
// use in slog attributes. Returning the string form directly —
|
|
// rather than a time.Duration — is deliberate: slog's default JSON
|
|
// handler renders time.Duration as a raw nanosecond int64 which is
|
|
// unreadable in a log viewer, while a pre-formatted string lands
|
|
// as "5.2s" and matches how the config values are written in YAML.
|
|
func (w *warmupTracker) elapsed() string {
|
|
return time.Since(w.startAt).Round(time.Millisecond).String()
|
|
}
|
|
|
|
// allBackendsKnown reports whether every backend referenced by fe is
|
|
// in a non-Unknown state. This is the precondition for releasing a
|
|
// frontend during the per-VIP release phase: desiredFromFrontend can
|
|
// only produce correct weights for a frontend whose backends have
|
|
// all been probed at least once through the health checker's rise
|
|
// counter (unknown → up/down).
|
|
//
|
|
// "Known" here is the literal reading: StateUnknown disqualifies,
|
|
// everything else qualifies. That means a legitimately-down backend
|
|
// counts as known and contributes its weight=0 to the desired set,
|
|
// which is the correct restart behaviour — a backend that was down
|
|
// before the restart stays down across the restart without waiting
|
|
// for it to come back up.
|
|
func allBackendsKnown(fe config.Frontend, src StateSource) bool {
|
|
for _, pool := range fe.Pools {
|
|
for bName := range pool.Backends {
|
|
s, ok := src.BackendState(bName)
|
|
if !ok || s == health.StateUnknown {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// runWarmup drives the warmup state machine. Called from lbSyncLoop
// on first entry (subsequent reconnect entries find allDone == true
// and skip straight to the periodic ticker).
//
// Phases:
//
//  1. Latch delay values from the current config.
//  2. If maxDelay == 0 (warmup disabled): run SyncLBStateAll
//     immediately, mark allDone, return.
//  3. Sleep until minDelay has elapsed (absolute hands-off).
//  4. Poll every warmupPollInterval, releasing any frontend whose
//     backends are all known. Each release fires a single-VIP sync.
//     Exit the poll when all frontends are released OR maxDelay
//     elapses.
//  5. Run SyncLBStateAll for any stragglers and mark allDone.
//
// Exits early if ctx is cancelled at any point. Every early-return
// path except cancellation calls finishAll() so lbSyncLoop waiters
// on doneChan() never hang.
func (c *Client) runWarmup(ctx context.Context) {
	src := c.getStateSource()
	if src == nil {
		// No state source ever registered; nothing meaningful to do.
		// Close the gate so lbSyncLoop doesn't hang.
		c.warmup.finishAll()
		return
	}
	cfg := src.Config()
	if cfg == nil {
		// State source exists but no config yet; open the gate rather
		// than gate syncs forever on a config that may never arrive.
		c.warmup.finishAll()
		return
	}
	// Phase 1: latch delay values (no-op if a prior call latched first).
	c.warmup.configure(cfg.VPP.LB.StartupMinDelay, cfg.VPP.LB.StartupMaxDelay)

	w := c.warmup

	// Phase 2: maxDelay == 0 is the "no warmup" escape hatch: sync
	// immediately and mark the gate open. Operators pick this for tests
	// and dev setups where a few seconds of startup black-hole on bounce
	// is acceptable in exchange for not having to wait out the warmup.
	if w.maxDelay == 0 {
		slog.Info("vpp-lb-warmup-skipped",
			"impact", "VPP LB update skipped")
		if err := c.SyncLBStateAll(cfg); err != nil {
			slog.Warn("vpp-lb-sync-error", "err", err)
		}
		w.finishAll()
		return
	}

	slog.Info("vpp-lb-warmup-start",
		"min-delay", w.minDelay.String(),
		"max-delay", w.maxDelay.String(),
		"impact", "Gating all VPP LB updates")

	// Phase 3: wait out the min-delay absolute hands-off window.
	// Deadlines are anchored to w.startAt, not time.Now(), so a VPP
	// reconnect mid-warmup does not restart the clock.
	minDeadline := w.startAt.Add(w.minDelay)
	if wait := time.Until(minDeadline); wait > 0 {
		select {
		case <-ctx.Done():
			return
		case <-time.After(wait):
		}
	}
	slog.Info("vpp-lb-warmup-min-delay-elapsed",
		"elapsed", w.elapsed(),
		"impact", "Ungating VPP LB updates for VIPs with known backend state")

	// Phase 4: poll for per-VIP release until maxDelay expires.
	// happyPath is set to true if we exit the poll loop because
	// every frontend has been released individually via SyncLBStateVIP.
	// In that case Phase 5 below skips the SyncLBStateAll entirely —
	// running it would be a redundant full reconcile over a dataplane
	// that's already in the desired state, and the log would misreport
	// the warmup-complete event as a "max-delay-final" stragglers sweep.
	maxDeadline := w.startAt.Add(w.maxDelay)
	happyPath := false
	for time.Now().Before(maxDeadline) {
		// Re-read the state source every tick: the config may not
		// have been available at loop entry (e.g. first connect
		// beat the config load race) but could be present now.
		src = c.getStateSource()
		if src != nil {
			cfg = src.Config()
		}
		if cfg != nil {
			// Release any frontend whose backends have all settled.
			allReleased := true
			for feName, fe := range cfg.Frontends {
				if w.isReleased(feName) {
					continue
				}
				if !allBackendsKnown(fe, src) {
					allReleased = false
					continue
				}
				if !w.tryRelease(feName) {
					// Still inside minDelay — shouldn't happen here
					// because we waited above, but guard anyway.
					allReleased = false
					continue
				}
				slog.Info("vpp-lb-warmup-release",
					"frontend", feName,
					"trigger", "poll",
					"elapsed", w.elapsed())
				if err := c.SyncLBStateVIP(cfg, feName, ""); err != nil {
					slog.Warn("vpp-lb-warmup-release-error",
						"frontend", feName,
						"err", err)
				}
			}
			if allReleased {
				// Everything settled before maxDelay — fast path
				// out of the poll so we don't sit idle for the
				// remainder of the watchdog window.
				happyPath = true
				break
			}
		}

		// Sleep warmupPollInterval or until maxDelay, whichever
		// is shorter, before trying again.
		wait := warmupPollInterval
		if rem := time.Until(maxDeadline); rem < wait {
			wait = rem
		}
		select {
		case <-ctx.Done():
			return
		case <-time.After(wait):
		}
	}

	// Phase 5: close out warmup. Two paths, but both emit
	// vpp-lb-warmup-max-delay-elapsed at the max-delay boundary so
	// the log timeline (start → min-delay-elapsed → (releases
	// happen) → max-delay-elapsed) is consistent regardless of
	// whether the warmup ended early or the watchdog tripped.
	//
	//   - happyPath: every frontend was released individually during
	//     Phase 4 and each one's SyncLBStateVIP already ran. VPP is
	//     in the desired state; finishAll is called immediately so
	//     the periodic sync loop can start drift-correction without
	//     waiting out the rest of max-delay. The warmup driver then
	//     sleeps until max-delay and emits -max-delay-elapsed as a
	//     gratuitous timeline marker — the gate is already open,
	//     but the line completes the warmup picture for an operator
	//     reading the log and keeps the event sequence symmetric
	//     with the watchdog path.
	//
	//   - watchdog: max-delay elapsed with stragglers remaining. At
	//     least one frontend never made it through allBackendsKnown,
	//     so its effective weight computation still treats some
	//     backends as StateUnknown and will program weight=0 for
	//     them. Emit -max-delay-elapsed at the boundary, run
	//     SyncLBStateAll to sweep stragglers, then finishAll.
	if happyPath {
		slog.Info("vpp-lb-warmup-complete",
			"elapsed", w.elapsed(),
			"impact", "Ungating VPP LB updates, all frontends released")
		w.finishAll()

		if wait := time.Until(maxDeadline); wait > 0 {
			select {
			case <-ctx.Done():
				return
			case <-time.After(wait):
			}
		}
		slog.Info("vpp-lb-warmup-max-delay-elapsed",
			"elapsed", w.elapsed(),
			"impact", "Ungating all VPP LB updates")
		return
	}

	slog.Info("vpp-lb-warmup-max-delay-elapsed",
		"elapsed", w.elapsed(),
		"impact", "Ungating all VPP LB updates")
	// Re-read one last time: the config may have appeared after the
	// final poll tick but before the watchdog fired.
	src = c.getStateSource()
	if src != nil {
		cfg = src.Config()
	}
	if cfg != nil {
		if err := c.SyncLBStateAll(cfg); err != nil {
			slog.Warn("vpp-lb-sync-error", "err", err)
		}
	}
	w.finishAll()
}