481 lines
19 KiB
TypeScript
481 lines
19 KiB
TypeScript
// SPDX-License-Identifier: Apache-2.0
|
||
|
||
import { createStore, produce } from "solid-js/store";
|
||
import type {
|
||
BackendEventPayload,
|
||
FrontendEventPayload,
|
||
FrontendSnapshot,
|
||
LBStatePayload,
|
||
MaglevdStatusPayload,
|
||
StateSnapshot,
|
||
TransitionRecord,
|
||
} from "../types";
|
||
import { tick } from "./tick";
|
||
|
||
// recomputeDerivedState mirrors the server-side
|
||
// health.EffectiveWeights / ActivePoolIndex / ComputeFrontendState
|
||
// logic so the SPA can keep pool.effective_weight AND the
|
||
// per-frontend aggregate state correct the moment any backend
|
||
// transitions or any configured weight changes, without waiting for
|
||
// the 30s refresh. Walking every frontend is cheap — O(frontends ×
|
||
// pools × backends-per-pool) with tiny constants — and it's
|
||
// strictly a function of the backend state map + configured
|
||
// weights, so there's no risk of drift vs. the server as long as
|
||
// the rules stay identical. The SPA is the authoritative source of
|
||
// truth for *display* state: the server's cached frontendStates
|
||
// field can be stale (e.g. after a SetFrontendPoolBackendWeight
|
||
// call that doesn't re-run updateFrontendState, or after a long-
|
||
// lived WatchEvents stream where a past transition corrupted the
|
||
// client's cache) and the SPA recomputes from its own live
|
||
// backends array to avoid inheriting any staleness.
|
||
//
|
||
// Effective weight rule: a backend gets its configured pool weight
|
||
// iff it is up AND belongs to the currently-active pool; everything
|
||
// else is 0. The active pool is the first pool containing a backend
|
||
// that is both up AND has a non-zero configured weight — a pool
|
||
// whose up backends are all weight=0 contributes no serving
|
||
// capacity and gets skipped over in priority failover. Kept in
|
||
// lock-step with internal/health/weights.go ActivePoolIndex.
|
||
//
|
||
// Frontend state rule: unknown if no backends or every referenced
|
||
// backend is still in StateUnknown; up if any backend in any pool
|
||
// has effective_weight > 0; otherwise down. Kept in lock-step with
|
||
// internal/health/weights.go ComputeFrontendState.
|
||
function recomputeDerivedState(snap: StateSnapshot) {
|
||
const stateOf: Record<string, string> = {};
|
||
for (const b of snap.backends) stateOf[b.name] = b.state;
|
||
for (const fe of snap.frontends) {
|
||
let activePool = 0;
|
||
for (let i = 0; i < fe.pools.length; i++) {
|
||
let anyServing = false;
|
||
for (const pb of fe.pools[i].backends) {
|
||
if (stateOf[pb.name] === "up" && pb.weight > 0) {
|
||
anyServing = true;
|
||
break;
|
||
}
|
||
}
|
||
if (anyServing) {
|
||
activePool = i;
|
||
break;
|
||
}
|
||
}
|
||
let anyEffective = false;
|
||
let seenAny = false;
|
||
let allUnknown = true;
|
||
const seen = new Set<string>();
|
||
for (let i = 0; i < fe.pools.length; i++) {
|
||
for (const pb of fe.pools[i].backends) {
|
||
const st = stateOf[pb.name];
|
||
pb.effective_weight = st === "up" && i === activePool ? pb.weight : 0;
|
||
if (pb.effective_weight > 0) anyEffective = true;
|
||
if (!seen.has(pb.name)) {
|
||
seen.add(pb.name);
|
||
seenAny = true;
|
||
if (st !== "unknown") allUnknown = false;
|
||
}
|
||
}
|
||
}
|
||
if (!seenAny || allUnknown) {
|
||
fe.state = "unknown";
|
||
} else if (anyEffective) {
|
||
fe.state = "up";
|
||
} else {
|
||
fe.state = "down";
|
||
}
|
||
}
|
||
}
|
||
|
||
// FrontendState keys snapshots by maglevd name. A single store drives the
// whole UI; reducers produce() into the right branch.
//
// settling is a per-(maglevd, frontend) flag flipped to true on any
// event that changes which backends should be serving — backend
// transitions, configured weight edits — and auto-cleared after a
// fixed grace window. While true, frontendHealth suppresses the
// bug-buckets verdict so a transient race between the new control-
// plane state and the lagging GetVPPLBState refetch doesn't flash
// the ‼️ icon. A real, persistent dataplane disagreement still shows
// up the moment the grace window expires.
export type FrontendState = {
  // Latest full snapshot per maglevd, keyed by maglevd name.
  byName: Record<string, StateSnapshot>;
  // settling[maglevd][frontend] === true while the bug-buckets verdict
  // is suppressed for that frontend (see comment above).
  settling: Record<string, Record<string, true>>;
};

const [state, setState] = createStore<FrontendState>({ byName: {}, settling: {} });

export { state };

// Grace window (ms) before a settling flag auto-clears when no fresh
// lb-state event arrives to clear it first.
const SETTLE_GRACE_MS = 2000;

// Outside-the-store map of pending auto-clear timers, keyed by
// (maglevd, frontend). Timer ids aren't UI state so they don't
// belong in the reactive store; keeping them in a plain Map lets a
// fresh transition cancel and restart the timer cleanly.
const settlingTimers = new Map<string, ReturnType<typeof setTimeout>>();
// settleKey builds the composite (maglevd, frontend) timer key. NUL is
// used as the separator because it cannot appear in either name, so
// the key is unambiguous.
function settleKey(m: string, f: string): string {
  return `${m}\x00${f}`;
}

function markFrontendSettling(maglevd: string, frontend: string) {
|
||
setState(
|
||
produce((s) => {
|
||
if (!s.settling[maglevd]) s.settling[maglevd] = {};
|
||
s.settling[maglevd][frontend] = true;
|
||
}),
|
||
);
|
||
const k = settleKey(maglevd, frontend);
|
||
const existing = settlingTimers.get(k);
|
||
if (existing) clearTimeout(existing);
|
||
settlingTimers.set(
|
||
k,
|
||
setTimeout(() => {
|
||
settlingTimers.delete(k);
|
||
setState(
|
||
produce((s) => {
|
||
if (s.settling[maglevd]) delete s.settling[maglevd][frontend];
|
||
}),
|
||
);
|
||
}, SETTLE_GRACE_MS),
|
||
);
|
||
}
|
||
|
||
// clearMaglevdSettling is called from applyLBState the moment a fresh
|
||
// GetVPPLBState reconciliation lands. The dataplane data is now at
|
||
// least as new as whatever transitions triggered the wait, so any
|
||
// remaining bug-buckets discrepancy is real and worth surfacing.
|
||
// The 2s safety timer in markFrontendSettling exists only as a
|
||
// fallback for the case where VPP is disconnected (or the fetch is
|
||
// failing) and an lb-state event would never arrive — without the
|
||
// timer, settling would get stuck and the icon would silently
|
||
// suppress real bugs.
|
||
function clearMaglevdSettling(maglevd: string) {
|
||
for (const [k, id] of settlingTimers) {
|
||
if (k.startsWith(maglevd + "\x00")) {
|
||
clearTimeout(id);
|
||
settlingTimers.delete(k);
|
||
}
|
||
}
|
||
setState(
|
||
produce((s) => {
|
||
if (s.settling[maglevd]) s.settling[maglevd] = {};
|
||
}),
|
||
);
|
||
}
|
||
|
||
export function replaceSnapshot(snap: StateSnapshot) {
|
||
// Recompute effective weights + aggregate frontend state locally
|
||
// from the snapshot's backends array, rather than trusting the
|
||
// server's state field verbatim. The server can be stale (the
|
||
// checker's frontendStates map is only updated on backend
|
||
// transitions, not on weight changes), so deriving from our own
|
||
// backend data is the only way to guarantee the display stays
|
||
// consistent with reality.
|
||
recomputeDerivedState(snap);
|
||
setState(
|
||
produce((s) => {
|
||
s.byName[snap.maglevd.name] = snap;
|
||
}),
|
||
);
|
||
}
|
||
|
||
export function replaceAll(snaps: StateSnapshot[]) {
|
||
const byName: Record<string, StateSnapshot> = {};
|
||
for (const s of snaps) {
|
||
recomputeDerivedState(s);
|
||
byName[s.maglevd.name] = s;
|
||
}
|
||
setState({ byName });
|
||
}
|
||
|
||
export function applyBackendTransition(maglevd: string, p: BackendEventPayload) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
const b = snap.backends.find((x) => x.name === p.backend);
|
||
if (!b) return;
|
||
b.state = p.transition.to;
|
||
// Derive enabled from state — see the matching comment in
|
||
// cmd/frontend/client.go applyBackendTransition. state="disabled"
|
||
// and enabled=false are two expressions of the same condition
|
||
// in maglevd, so keeping them in sync locally closes a drift
|
||
// window where the UI would show the wrong [disabled] tag.
|
||
b.enabled = p.transition.to !== "disabled";
|
||
b.last_transition = p.transition;
|
||
if (!b.transitions) b.transitions = [];
|
||
b.transitions.push(p.transition);
|
||
if (b.transitions.length > 20) {
|
||
b.transitions = b.transitions.slice(b.transitions.length - 20);
|
||
}
|
||
// A backend state change can shift which pool is active and
|
||
// therefore which pool-memberships get non-zero effective
|
||
// weights, and in turn can flip the frontend's aggregate
|
||
// state. Recompute for every frontend — not just the one
|
||
// pointed at by this backend — because pool-failover is a
|
||
// per-frontend computation and the same backend can appear in
|
||
// multiple frontends with different pool placements.
|
||
recomputeDerivedState(snap);
|
||
}),
|
||
);
|
||
// Mark every frontend that references this backend as settling so
|
||
// the bug-buckets verdict is gated on the next fresh GetVPPLBState
|
||
// reconciliation (or the 2s safety timer, whichever fires first).
|
||
const snap = state.byName[maglevd];
|
||
if (snap) {
|
||
for (const fe of snap.frontends) {
|
||
if (fe.pools.some((pool) => pool.backends.some((pb) => pb.name === p.backend))) {
|
||
markFrontendSettling(maglevd, fe.name);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Frontend-transition events arrive from the server's checker, but
// the SPA no longer trusts their `to` field — recomputeDerivedState
// walks the local backends array on every backend event and every
// hydration to produce an up-to-date frontend state that the server
// can't make stale. Kept as a named reducer so sse.ts's dispatch
// table still has a landing spot for "frontend" events (they also
// flow into the DebugPanel via pushEvent); the body is deliberately
// empty — not a bug.
export function applyFrontendTransition(_maglevd: string, _p: FrontendEventPayload) {
  // no-op — state is derived client-side, see recomputeDerivedState
}

// applyLBState merges the per-frontend bucket map for one maglevd
|
||
// from a freshly-arrived "lb-state" SSE event. A null/undefined
|
||
// per_frontend payload (sent on VPP disconnect or fetch failure)
|
||
// clears the cached map so the SPA renders em-dashes in the buckets
|
||
// column instead of stale numbers.
|
||
//
|
||
// The merge is done leaf-by-leaf rather than via wholesale assignment.
|
||
// produce's proxy only emits a signal when a property is actually
|
||
// written, so guarding each write with `!==` keeps unchanged numbers
|
||
// (in particular every drained-to-0 backend) from invalidating their
|
||
// downstream reactive reads. Without this, the periodic 30s refresh
|
||
// and every same-value re-fetch would re-trigger the Flash animation
|
||
// on every cell — which is exactly the visual storm we're avoiding.
|
||
export function applyLBState(maglevd: string, p: LBStatePayload) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
const next = p.per_frontend;
|
||
const empty = !next || Object.keys(next).length === 0;
|
||
if (empty) {
|
||
if (snap.lb_state !== undefined) snap.lb_state = undefined;
|
||
return;
|
||
}
|
||
if (!snap.lb_state) {
|
||
snap.lb_state = { per_frontend: {} };
|
||
}
|
||
const cur = snap.lb_state.per_frontend;
|
||
// Update / insert leaves that actually changed.
|
||
for (const fe of Object.keys(next)) {
|
||
if (!cur[fe]) cur[fe] = {};
|
||
const curRow = cur[fe];
|
||
const nextRow = next[fe];
|
||
for (const be of Object.keys(nextRow)) {
|
||
if (curRow[be] !== nextRow[be]) curRow[be] = nextRow[be];
|
||
}
|
||
for (const be of Object.keys(curRow)) {
|
||
if (!(be in nextRow)) delete curRow[be];
|
||
}
|
||
}
|
||
// Drop frontends that disappeared from the new snapshot.
|
||
for (const fe of Object.keys(cur)) {
|
||
if (!(fe in next)) delete cur[fe];
|
||
}
|
||
}),
|
||
);
|
||
// A fresh lb-state event means the dataplane data is now at least
|
||
// as new as anything we were waiting on — re-enable bug detection.
|
||
clearMaglevdSettling(maglevd);
|
||
}
|
||
|
||
// lbBucketsFor looks up the bucket count VPP currently routes to a
|
||
// given backend on a given frontend. Returns undefined when the
|
||
// snapshot has no LB state at all (VPP disconnected, no fetch yet) or
|
||
// when the backend isn't programmed into VPP for that VIP — the view
|
||
// renders an em-dash in both cases.
|
||
export function lbBucketsFor(
|
||
snap: StateSnapshot | undefined,
|
||
frontend: string,
|
||
backend: string,
|
||
): number | undefined {
|
||
return snap?.lb_state?.per_frontend?.[frontend]?.[backend];
|
||
}
|
||
|
||
export function applyVPPStatus(maglevd: string, state: string) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
snap.vpp_state = state;
|
||
}),
|
||
);
|
||
}
|
||
|
||
export function applyMaglevdStatus(maglevd: string, p: MaglevdStatusPayload) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
snap.maglevd.connected = p.connected;
|
||
snap.maglevd.last_error = p.last_error;
|
||
}),
|
||
);
|
||
}
|
||
|
||
// applyConfiguredWeight updates the configured weight of a specific
|
||
// backend's pool-membership within a named frontend/pool, then
|
||
// recomputes effective weights so pool-failover semantics stay
|
||
// consistent. Called from the BackendActionsMenu after a successful
|
||
// admin "set weight" POST so the UI reflects the change instantly
|
||
// without waiting for the 30s refresh tick. Unlike the previous
|
||
// log-event-driven reducer, this one is scoped to exactly the
|
||
// pool-membership the operator edited, so it can't leak weights
|
||
// across frontends that share the backend.
|
||
export function applyConfiguredWeight(
|
||
maglevd: string,
|
||
frontend: string,
|
||
pool: string,
|
||
backend: string,
|
||
weight: number,
|
||
) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
const fe = snap.frontends.find((f) => f.name === frontend);
|
||
if (!fe) return;
|
||
const p = fe.pools.find((x) => x.name === pool);
|
||
if (!p) return;
|
||
const pb = p.backends.find((x) => x.name === backend);
|
||
if (!pb) return;
|
||
pb.weight = weight;
|
||
recomputeDerivedState(snap);
|
||
}),
|
||
);
|
||
markFrontendSettling(maglevd, frontend);
|
||
}
|
||
|
||
// FrontendHealth is the per-frontend "is everything actually working"
|
||
// verdict computed from backend states, effective weights, and (when
|
||
// available) the VPP bucket map. The cascade is intentionally
|
||
// priority-ordered: a data-plane disagreement (control says serve,
|
||
// VPP routes nothing) is the loudest signal because it usually means
|
||
// something is broken in the sync path, not just an unhealthy backend.
|
||
//
|
||
// "ok" → all backends up, primary serving, every
|
||
// eff>0 backend has VPP buckets>0
|
||
// "bug-buckets" → some backend with effective_weight>0 has 0
|
||
// buckets in VPP — control plane and data
|
||
// plane disagree, almost always a bug
|
||
// "primary-drained" → primary pool is not serving any traffic
|
||
// (every backend in pool[0] has eff=0); the
|
||
// frontend is on its fallback or fully down
|
||
// "degraded" → at least one backend isn't 'up' but nothing
|
||
// worse — typical maintenance / outage state
|
||
// "unknown" → fallthrough; should be unreachable, kept as
|
||
// a safety net for logic bugs in this function
|
||
export type FrontendHealth = "ok" | "bug-buckets" | "primary-drained" | "degraded" | "unknown";
|
||
|
||
export function frontendHealth(snap: StateSnapshot, fe: FrontendSnapshot): FrontendHealth {
|
||
const stateOf: Record<string, string> = {};
|
||
for (const b of snap.backends) stateOf[b.name] = b.state;
|
||
|
||
// The bucket check is only meaningful when we actually have an LB
|
||
// state snapshot. On a fresh page load (or with VPP disconnected)
|
||
// lb_state is undefined; in that window we fall back to "trust the
|
||
// control plane" so the icon still settles to ✅ instead of
|
||
// perpetual ❓ until the first GetVPPLBState round-trip.
|
||
const lbAvailable = !!snap.lb_state;
|
||
const feBuckets = snap.lb_state?.per_frontend?.[fe.name];
|
||
// Reactive read of the per-frontend settling flag. While true,
|
||
// we're still waiting for the next GetVPPLBState reconciliation
|
||
// after a recent control-plane change; the dataplane may be mid-
|
||
// reconverge so any "weight>0 but buckets==0" we'd see here is
|
||
// almost certainly a race, not a real bug.
|
||
const settling = !!state.settling[snap.maglevd.name]?.[fe.name];
|
||
|
||
let anyDown = false;
|
||
let dataplaneBug = false;
|
||
for (const pool of fe.pools) {
|
||
for (const pb of pool.backends) {
|
||
if (stateOf[pb.name] !== "up") anyDown = true;
|
||
if (!settling && lbAvailable && pb.effective_weight > 0) {
|
||
const b = feBuckets?.[pb.name];
|
||
if (b === undefined || b === 0) dataplaneBug = true;
|
||
}
|
||
}
|
||
}
|
||
|
||
const primary = fe.pools[0];
|
||
const primaryHasWeights = !!primary && primary.backends.some((pb) => pb.weight > 0);
|
||
const primaryAllZero = !primary || primary.backends.every((pb) => pb.effective_weight === 0);
|
||
|
||
if (!anyDown && primaryHasWeights && !dataplaneBug) return "ok";
|
||
if (dataplaneBug) return "bug-buckets";
|
||
if (primaryAllZero) return "primary-drained";
|
||
if (anyDown) return "degraded";
|
||
return "unknown";
|
||
}
|
||
|
||
export function frontendHealthIcon(snap: StateSnapshot, fe: FrontendSnapshot): string {
|
||
switch (frontendHealth(snap, fe)) {
|
||
case "ok":
|
||
return "✅";
|
||
case "bug-buckets":
|
||
return "‼️";
|
||
case "primary-drained":
|
||
return "❗";
|
||
case "degraded":
|
||
return "⚠️";
|
||
case "unknown":
|
||
return "❓";
|
||
}
|
||
}
|
||
|
||
// Helpers used by views.
|
||
|
||
// formatVIPAddress renders an address:port string with IPv6 addresses
|
||
// wrapped in square brackets. This matches the URL-authority
|
||
// convention (RFC 3986 §3.2.2) — without the brackets the colons in
|
||
// an IPv6 literal are ambiguous against the port separator. IPv4 is
|
||
// left bare.
|
||
export function formatVIPAddress(address: string, port: number): string {
|
||
if (address.includes(":")) return `[${address}]:${port}`;
|
||
return `${address}:${port}`;
|
||
}
|
||
|
||
export function lastTransitionAge(t?: TransitionRecord): string {
|
||
// Subscribe to the 1s ticker so the age string updates live as a
|
||
// real-time countdown. No effect on layout — the age column is
|
||
// unwrapped so the Flash animation never fires for these periodic
|
||
// updates.
|
||
tick();
|
||
if (!t || !t.at_unix_ns || t.at_unix_ns <= 0) return "";
|
||
const ms = Date.now() - t.at_unix_ns / 1e6;
|
||
const totalSec = Math.floor(ms / 1000);
|
||
// Clock skew between maglevd and the browser, plus the fact that
|
||
// "1s ago" reads awkwardly, means anything at or below 1s is best
|
||
// rendered as "now". Also catches negative values from a future-
|
||
// skewed server clock.
|
||
if (totalSec <= 1) return "now";
|
||
// Render the two most significant units so fresh transitions show
|
||
// sub-minute detail ("10m30s") while older transitions round cleanly
|
||
// ("1d16h"). A single unit is shown only below one minute, since
|
||
// "Xs" has nothing smaller beneath it.
|
||
const s = totalSec % 60;
|
||
const totalMin = Math.floor(totalSec / 60);
|
||
if (totalMin < 1) return `${totalSec}s ago`;
|
||
const m = totalMin % 60;
|
||
const totalHr = Math.floor(totalMin / 60);
|
||
if (totalHr < 1) return `${m}m${s}s ago`;
|
||
const h = totalHr % 24;
|
||
const d = Math.floor(totalHr / 24);
|
||
if (d < 1) return `${totalHr}h${m}m ago`;
|
||
return `${d}d${h}h ago`;
|
||
}
|