481 lines
19 KiB
TypeScript
481 lines
19 KiB
TypeScript
// SPDX-License-Identifier: Apache-2.0
|
||
|
||
import { createStore, produce } from "solid-js/store";
|
||
import type {
|
||
BackendEventPayload,
|
||
FrontendEventPayload,
|
||
FrontendSnapshot,
|
||
LBStatePayload,
|
||
MaglevdStatusPayload,
|
||
StateSnapshot,
|
||
TransitionRecord,
|
||
} from "../types";
|
||
import { tick } from "./tick";
|
||
|
||
// recomputeDerivedState mirrors the server-side
|
||
// health.EffectiveWeights / ActivePoolIndex / ComputeFrontendState
|
||
// logic so the SPA can keep pool.effective_weight AND the
|
||
// per-frontend aggregate state correct the moment any backend
|
||
// transitions or any configured weight changes, without waiting for
|
||
// the 30s refresh. Walking every frontend is cheap — O(frontends ×
|
||
// pools × backends-per-pool) with tiny constants — and it's
|
||
// strictly a function of the backend state map + configured
|
||
// weights, so there's no risk of drift vs. the server as long as
|
||
// the rules stay identical. The SPA is the authoritative source of
|
||
// truth for *display* state: the server's cached frontendStates
|
||
// field can be stale (e.g. after a SetFrontendPoolBackendWeight
|
||
// call that doesn't re-run updateFrontendState, or after a long-
|
||
// lived WatchEvents stream where a past transition corrupted the
|
||
// client's cache) and the SPA recomputes from its own live
|
||
// backends array to avoid inheriting any staleness.
|
||
//
|
||
// Effective weight rule: a backend gets its configured pool weight
|
||
// iff it is up AND belongs to the currently-active pool; everything
|
||
// else is 0. The active pool is the first pool containing a backend
|
||
// that is both up AND has a non-zero configured weight — a pool
|
||
// whose up backends are all weight=0 contributes no serving
|
||
// capacity and gets skipped over in priority failover. Kept in
|
||
// lock-step with internal/health/weights.go ActivePoolIndex.
|
||
//
|
||
// Frontend state rule: unknown if no backends or every referenced
|
||
// backend is still in StateUnknown; up if any backend in any pool
|
||
// has effective_weight > 0; otherwise down. Kept in lock-step with
|
||
// internal/health/weights.go ComputeFrontendState.
|
||
function recomputeDerivedState(snap: StateSnapshot) {
|
||
const stateOf: Record<string, string> = {};
|
||
for (const b of snap.backends) stateOf[b.name] = b.state;
|
||
for (const fe of snap.frontends) {
|
||
let activePool = 0;
|
||
for (let i = 0; i < fe.pools.length; i++) {
|
||
let anyServing = false;
|
||
for (const pb of fe.pools[i].backends) {
|
||
if (stateOf[pb.name] === "up" && pb.weight > 0) {
|
||
anyServing = true;
|
||
break;
|
||
}
|
||
}
|
||
if (anyServing) {
|
||
activePool = i;
|
||
break;
|
||
}
|
||
}
|
||
let anyEffective = false;
|
||
let seenAny = false;
|
||
let allUnknown = true;
|
||
const seen = new Set<string>();
|
||
for (let i = 0; i < fe.pools.length; i++) {
|
||
for (const pb of fe.pools[i].backends) {
|
||
const st = stateOf[pb.name];
|
||
pb.effective_weight = st === "up" && i === activePool ? pb.weight : 0;
|
||
if (pb.effective_weight > 0) anyEffective = true;
|
||
if (!seen.has(pb.name)) {
|
||
seen.add(pb.name);
|
||
seenAny = true;
|
||
if (st !== "unknown") allUnknown = false;
|
||
}
|
||
}
|
||
}
|
||
if (!seenAny || allUnknown) {
|
||
fe.state = "unknown";
|
||
} else if (anyEffective) {
|
||
fe.state = "up";
|
||
} else {
|
||
fe.state = "down";
|
||
}
|
||
}
|
||
}
|
||
|
||
// FrontendState keys snapshots by maglevd name. A single store drives the
// whole UI; reducers produce() into the right branch.
//
// settling is a per-(maglevd, frontend) flag flipped to true on any
// event that changes which backends should be serving — backend
// transitions, configured weight edits — and auto-cleared after a
// fixed grace window. While true, frontendHealth suppresses the
// bug-buckets verdict so a transient race between the new control-
// plane state and the lagging GetVPPLBState refetch doesn't flash
// the ‼️ icon. A real, persistent dataplane disagreement still shows
// up the moment the grace window expires.
export type FrontendState = {
  // Latest full snapshot per maglevd, keyed by maglevd name.
  byName: Record<string, StateSnapshot>;
  // settling[maglevd][frontend] === true while the bug-buckets verdict
  // is suppressed for that frontend (see comment above).
  settling: Record<string, Record<string, true>>;
};

const [state, setState] = createStore<FrontendState>({ byName: {}, settling: {} });

export { state };

// Grace window (ms) before a settling flag auto-clears when no fresh
// lb-state event arrives to clear it first.
const SETTLE_GRACE_MS = 2000;

// Outside-the-store map of pending auto-clear timers, keyed by
// (maglevd, frontend). Timer ids aren't UI state so they don't
// belong in the reactive store; keeping them in a plain Map lets a
// fresh transition cancel and restart the timer cleanly.
const settlingTimers = new Map<string, ReturnType<typeof setTimeout>>();
// settleKey builds the composite (maglevd, frontend) timer key. NUL is
// used as the separator because it cannot appear in either name, so
// the key is unambiguous.
function settleKey(m: string, f: string): string {
  return `${m}\x00${f}`;
}

function markFrontendSettling(maglevd: string, frontend: string) {
|
||
setState(
|
||
produce((s) => {
|
||
if (!s.settling[maglevd]) s.settling[maglevd] = {};
|
||
s.settling[maglevd][frontend] = true;
|
||
}),
|
||
);
|
||
const k = settleKey(maglevd, frontend);
|
||
const existing = settlingTimers.get(k);
|
||
if (existing) clearTimeout(existing);
|
||
settlingTimers.set(
|
||
k,
|
||
setTimeout(() => {
|
||
settlingTimers.delete(k);
|
||
setState(
|
||
produce((s) => {
|
||
if (s.settling[maglevd]) delete s.settling[maglevd][frontend];
|
||
}),
|
||
);
|
||
}, SETTLE_GRACE_MS),
|
||
);
|
||
}
|
||
|
||
// clearMaglevdSettling is called from applyLBState the moment a fresh
|
||
// GetVPPLBState reconciliation lands. The dataplane data is now at
|
||
// least as new as whatever transitions triggered the wait, so any
|
||
// remaining bug-buckets discrepancy is real and worth surfacing.
|
||
// The 2s safety timer in markFrontendSettling exists only as a
|
||
// fallback for the case where VPP is disconnected (or the fetch is
|
||
// failing) and an lb-state event would never arrive — without the
|
||
// timer, settling would get stuck and the icon would silently
|
||
// suppress real bugs.
|
||
function clearMaglevdSettling(maglevd: string) {
|
||
for (const [k, id] of settlingTimers) {
|
||
if (k.startsWith(maglevd + "\x00")) {
|
||
clearTimeout(id);
|
||
settlingTimers.delete(k);
|
||
}
|
||
}
|
||
setState(
|
||
produce((s) => {
|
||
if (s.settling[maglevd]) s.settling[maglevd] = {};
|
||
}),
|
||
);
|
||
}
|
||
|
||
export function replaceSnapshot(snap: StateSnapshot) {
|
||
// Recompute effective weights + aggregate frontend state locally
|
||
// from the snapshot's backends array, rather than trusting the
|
||
// server's state field verbatim. The server can be stale (the
|
||
// checker's frontendStates map is only updated on backend
|
||
// transitions, not on weight changes), so deriving from our own
|
||
// backend data is the only way to guarantee the display stays
|
||
// consistent with reality.
|
||
recomputeDerivedState(snap);
|
||
setState(
|
||
produce((s) => {
|
||
s.byName[snap.maglevd.name] = snap;
|
||
}),
|
||
);
|
||
}
|
||
|
||
export function replaceAll(snaps: StateSnapshot[]) {
|
||
const byName: Record<string, StateSnapshot> = {};
|
||
for (const s of snaps) {
|
||
recomputeDerivedState(s);
|
||
byName[s.maglevd.name] = s;
|
||
}
|
||
setState({ byName });
|
||
}
|
||
|
||
export function applyBackendTransition(maglevd: string, p: BackendEventPayload) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
const b = snap.backends.find((x) => x.name === p.backend);
|
||
if (!b) return;
|
||
b.state = p.transition.to;
|
||
// Derive enabled from state — see the matching comment in
|
||
// cmd/frontend/client.go applyBackendTransition. state="disabled"
|
||
// and enabled=false are two expressions of the same condition
|
||
// in maglevd, so keeping them in sync locally closes a drift
|
||
// window where the UI would show the wrong [disabled] tag.
|
||
b.enabled = p.transition.to !== "disabled";
|
||
b.last_transition = p.transition;
|
||
if (!b.transitions) b.transitions = [];
|
||
b.transitions.push(p.transition);
|
||
if (b.transitions.length > 20) {
|
||
b.transitions = b.transitions.slice(b.transitions.length - 20);
|
||
}
|
||
// A backend state change can shift which pool is active and
|
||
// therefore which pool-memberships get non-zero effective
|
||
// weights, and in turn can flip the frontend's aggregate
|
||
// state. Recompute for every frontend — not just the one
|
||
// pointed at by this backend — because pool-failover is a
|
||
// per-frontend computation and the same backend can appear in
|
||
// multiple frontends with different pool placements.
|
||
recomputeDerivedState(snap);
|
||
}),
|
||
);
|
||
// Mark every frontend that references this backend as settling so
|
||
// the bug-buckets verdict is gated on the next fresh GetVPPLBState
|
||
// reconciliation (or the 2s safety timer, whichever fires first).
|
||
const snap = state.byName[maglevd];
|
||
if (snap) {
|
||
for (const fe of snap.frontends) {
|
||
if (fe.pools.some((pool) => pool.backends.some((pb) => pb.name === p.backend))) {
|
||
markFrontendSettling(maglevd, fe.name);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Frontend-transition events arrive from the server's checker, but
// the SPA no longer trusts their `to` field — recomputeDerivedState
// walks the local backends array on every backend event and every
// hydration to produce an up-to-date frontend state that the server
// can't make stale. Kept as a named reducer so sse.ts's dispatch
// table still has a landing spot for "frontend" events (they also
// flow into the DebugPanel via pushEvent); the body is deliberately
// empty — not a bug.
export function applyFrontendTransition(_maglevd: string, _p: FrontendEventPayload) {
  // no-op — state is derived client-side, see recomputeDerivedState
}

// applyLBState merges the per-frontend bucket map for one maglevd
|
||
// from a freshly-arrived "lb-state" SSE event. A null/undefined
|
||
// per_frontend payload (sent on VPP disconnect or fetch failure)
|
||
// clears the cached map so the SPA renders em-dashes in the buckets
|
||
// column instead of stale numbers.
|
||
//
|
||
// The merge is done leaf-by-leaf rather than via wholesale assignment.
|
||
// produce's proxy only emits a signal when a property is actually
|
||
// written, so guarding each write with `!==` keeps unchanged numbers
|
||
// (in particular every drained-to-0 backend) from invalidating their
|
||
// downstream reactive reads. Without this, the periodic 30s refresh
|
||
// and every same-value re-fetch would re-trigger the Flash animation
|
||
// on every cell — which is exactly the visual storm we're avoiding.
|
||
export function applyLBState(maglevd: string, p: LBStatePayload) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
const next = p.per_frontend;
|
||
const empty = !next || Object.keys(next).length === 0;
|
||
if (empty) {
|
||
if (snap.lb_state !== undefined) snap.lb_state = undefined;
|
||
return;
|
||
}
|
||
if (!snap.lb_state) {
|
||
snap.lb_state = { per_frontend: {} };
|
||
}
|
||
const cur = snap.lb_state.per_frontend;
|
||
// Update / insert leaves that actually changed.
|
||
for (const fe of Object.keys(next)) {
|
||
if (!cur[fe]) cur[fe] = {};
|
||
const curRow = cur[fe];
|
||
const nextRow = next[fe];
|
||
for (const be of Object.keys(nextRow)) {
|
||
if (curRow[be] !== nextRow[be]) curRow[be] = nextRow[be];
|
||
}
|
||
for (const be of Object.keys(curRow)) {
|
||
if (!(be in nextRow)) delete curRow[be];
|
||
}
|
||
}
|
||
// Drop frontends that disappeared from the new snapshot.
|
||
for (const fe of Object.keys(cur)) {
|
||
if (!(fe in next)) delete cur[fe];
|
||
}
|
||
}),
|
||
);
|
||
// A fresh lb-state event means the dataplane data is now at least
|
||
// as new as anything we were waiting on — re-enable bug detection.
|
||
clearMaglevdSettling(maglevd);
|
||
}
|
||
|
||
// lbBucketsFor looks up the bucket count VPP currently routes to a
|
||
// given backend on a given frontend. Returns undefined when the
|
||
// snapshot has no LB state at all (VPP disconnected, no fetch yet) or
|
||
// when the backend isn't programmed into VPP for that VIP — the view
|
||
// renders an em-dash in both cases.
|
||
export function lbBucketsFor(
|
||
snap: StateSnapshot | undefined,
|
||
frontend: string,
|
||
backend: string,
|
||
): number | undefined {
|
||
return snap?.lb_state?.per_frontend?.[frontend]?.[backend];
|
||
}
|
||
|
||
export function applyVPPStatus(maglevd: string, state: string) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
snap.vpp_state = state;
|
||
}),
|
||
);
|
||
}
|
||
|
||
export function applyMaglevdStatus(maglevd: string, p: MaglevdStatusPayload) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
snap.maglevd.connected = p.connected;
|
||
snap.maglevd.last_error = p.last_error;
|
||
}),
|
||
);
|
||
}
|
||
|
||
// applyConfiguredWeight updates the configured weight of a specific
|
||
// backend's pool-membership within a named frontend/pool, then
|
||
// recomputes effective weights so pool-failover semantics stay
|
||
// consistent. Called from the BackendActionsMenu after a successful
|
||
// admin "set weight" POST so the UI reflects the change instantly
|
||
// without waiting for the 30s refresh tick. Unlike the previous
|
||
// log-event-driven reducer, this one is scoped to exactly the
|
||
// pool-membership the operator edited, so it can't leak weights
|
||
// across frontends that share the backend.
|
||
export function applyConfiguredWeight(
|
||
maglevd: string,
|
||
frontend: string,
|
||
pool: string,
|
||
backend: string,
|
||
weight: number,
|
||
) {
|
||
setState(
|
||
produce((s) => {
|
||
const snap = s.byName[maglevd];
|
||
if (!snap) return;
|
||
const fe = snap.frontends.find((f) => f.name === frontend);
|
||
if (!fe) return;
|
||
const p = fe.pools.find((x) => x.name === pool);
|
||
if (!p) return;
|
||
const pb = p.backends.find((x) => x.name === backend);
|
||
if (!pb) return;
|
||
pb.weight = weight;
|
||
recomputeDerivedState(snap);
|
||
}),
|
||
);
|
||
markFrontendSettling(maglevd, frontend);
|
||
}
|
||
|
||
// FrontendHealth is the per-frontend "is everything actually working"
|
||
// verdict computed from backend states, effective weights, and (when
|
||
// available) the VPP bucket map. The cascade is intentionally
|
||
// priority-ordered: a data-plane disagreement (control says serve,
|
||
// VPP routes nothing) is the loudest signal because it usually means
|
||
// something is broken in the sync path, not just an unhealthy backend.
|
||
//
|
||
// "ok" → all backends up, primary serving, every
|
||
// eff>0 backend has VPP buckets>0
|
||
// "bug-buckets" → some backend with effective_weight>0 has 0
|
||
// buckets in VPP — control plane and data
|
||
// plane disagree, almost always a bug
|
||
// "primary-drained" → primary pool is not serving any traffic
|
||
// (every backend in pool[0] has eff=0); the
|
||
// frontend is on its fallback or fully down
|
||
// "degraded" → at least one backend isn't 'up' but nothing
|
||
// worse — typical maintenance / outage state
|
||
// "unknown" → fallthrough; should be unreachable, kept as
|
||
// a safety net for logic bugs in this function
|
||
export type FrontendHealth = "ok" | "bug-buckets" | "primary-drained" | "degraded" | "unknown";
|
||
|
||
export function frontendHealth(snap: StateSnapshot, fe: FrontendSnapshot): FrontendHealth {
|
||
const stateOf: Record<string, string> = {};
|
||
for (const b of snap.backends) stateOf[b.name] = b.state;
|
||
|
||
// The bucket check is only meaningful when we actually have an LB
|
||
// state snapshot. On a fresh page load (or with VPP disconnected)
|
||
// lb_state is undefined; in that window we fall back to "trust the
|
||
// control plane" so the icon still settles to ✅ instead of
|
||
// perpetual ❓ until the first GetVPPLBState round-trip.
|
||
const lbAvailable = !!snap.lb_state;
|
||
const feBuckets = snap.lb_state?.per_frontend?.[fe.name];
|
||
// Reactive read of the per-frontend settling flag. While true,
|
||
// we're still waiting for the next GetVPPLBState reconciliation
|
||
// after a recent control-plane change; the dataplane may be mid-
|
||
// reconverge so any "weight>0 but buckets==0" we'd see here is
|
||
// almost certainly a race, not a real bug.
|
||
const settling = !!state.settling[snap.maglevd.name]?.[fe.name];
|
||
|
||
let anyDown = false;
|
||
let dataplaneBug = false;
|
||
for (const pool of fe.pools) {
|
||
for (const pb of pool.backends) {
|
||
if (stateOf[pb.name] !== "up") anyDown = true;
|
||
if (!settling && lbAvailable && pb.effective_weight > 0) {
|
||
const b = feBuckets?.[pb.name];
|
||
if (b === undefined || b === 0) dataplaneBug = true;
|
||
}
|
||
}
|
||
}
|
||
|
||
const primary = fe.pools[0];
|
||
const primaryHasWeights = !!primary && primary.backends.some((pb) => pb.weight > 0);
|
||
const primaryAllZero = !primary || primary.backends.every((pb) => pb.effective_weight === 0);
|
||
|
||
if (!anyDown && primaryHasWeights && !dataplaneBug) return "ok";
|
||
if (dataplaneBug) return "bug-buckets";
|
||
if (primaryAllZero) return "primary-drained";
|
||
if (anyDown) return "degraded";
|
||
return "unknown";
|
||
}
|
||
|
||
export function frontendHealthIcon(snap: StateSnapshot, fe: FrontendSnapshot): string {
|
||
switch (frontendHealth(snap, fe)) {
|
||
case "ok":
|
||
return "✅";
|
||
case "bug-buckets":
|
||
return "‼️";
|
||
case "primary-drained":
|
||
return "❗";
|
||
case "degraded":
|
||
return "⚠️";
|
||
case "unknown":
|
||
return "❓";
|
||
}
|
||
}
|
||
|
||
// Helpers used by views.
|
||
|
||
// formatVIPAddress renders an address:port string with IPv6 addresses
|
||
// wrapped in square brackets. This matches the URL-authority
|
||
// convention (RFC 3986 §3.2.2) — without the brackets the colons in
|
||
// an IPv6 literal are ambiguous against the port separator. IPv4 is
|
||
// left bare.
|
||
export function formatVIPAddress(address: string, port: number): string {
|
||
if (address.includes(":")) return `[${address}]:${port}`;
|
||
return `${address}:${port}`;
|
||
}
|
||
|
||
export function lastTransitionAge(t?: TransitionRecord): string {
|
||
// Subscribe to the 1s ticker so the age string updates live as a
|
||
// real-time countdown. No effect on layout — the age column is
|
||
// unwrapped so the Flash animation never fires for these periodic
|
||
// updates.
|
||
tick();
|
||
if (!t || !t.at_unix_ns || t.at_unix_ns <= 0) return "";
|
||
const ms = Date.now() - t.at_unix_ns / 1e6;
|
||
const totalSec = Math.floor(ms / 1000);
|
||
// Clock skew between maglevd and the browser, plus the fact that
|
||
// "1s ago" reads awkwardly, means anything at or below 1s is best
|
||
// rendered as "now". Also catches negative values from a future-
|
||
// skewed server clock.
|
||
if (totalSec <= 1) return "now";
|
||
// Render the two most significant units so fresh transitions show
|
||
// sub-minute detail ("10m30s") while older transitions round cleanly
|
||
// ("1d16h"). A single unit is shown only below one minute, since
|
||
// "Xs" has nothing smaller beneath it.
|
||
const s = totalSec % 60;
|
||
const totalMin = Math.floor(totalSec / 60);
|
||
if (totalMin < 1) return `${totalSec}s ago`;
|
||
const m = totalMin % 60;
|
||
const totalHr = Math.floor(totalMin / 60);
|
||
if (totalHr < 1) return `${m}m${s}s ago`;
|
||
const h = totalHr % 24;
|
||
const d = Math.floor(totalHr / 24);
|
||
if (d < 1) return `${totalHr}h${m}m ago`;
|
||
return `${d}d${h}h ago`;
|
||
}
|