LB buckets column + health cascade; VPP dump fix; maglevc strictness

SPA (cmd/frontend/web):
- New "lb buckets" column backed by a 1s-debounced GetVPPLBState
  fetch loop with leading+trailing edge coalesce.
- Per-frontend health icon (✅/⚠️/❗/‼️/❓) in the Zippy header,
  gated by a settling flag that suppresses ‼️ until the next lb-state
  reconciliation after a backend transition or weight change.
- In-place leaf merge on lb-state so stable bucket values (e.g. "0")
  don't retrigger the Flash animation on every refresh.
- Zippy cards remember open state in a cookie, default closed on
  fresh load; fixed-width frontend-title-name + reserved icon slot
  so headers line up across all cards.
- Clock-drift watchdog in sse.ts that forces a fresh EventSource on
  laptop wake, so the broker emits a resync instead of the SPA hanging
  on a dead half-open socket.

Frontend service (cmd/frontend):
- New maglevClient.lbStateLoop, triggered on backend transitions and
  vpp-connect, plus a best-effort fetch on refreshAll.
- Admin handlers explicitly wake the lb-state loop after lifecycle
  ops and set-weight (the latter emits no transition event on the
  maglevd side, so the WatchEvents path wouldn't have caught it).
- /favicon.ico served from embedded web/public IPng logo.

VPP integration:
- internal/vpp/lbstate.go: dumpASesForVIP drops Pfx from the dump
  request (setting it silently wipes IPv4 replies in the LB plugin)
  and filters results by prefix on the response side instead, which
  also demuxes multi-VIP-on-same-port cases correctly.

maglevc:
- Walk now returns the unconsumed token tail; dispatch and the
  question listener reject unknown commands with a targeted error
  instead of dumping the full command tree prefixed with garbage.
- On '?', echo the current line (including the '?') before the help
  list so the output reads like birdc.

Checker / prober:
- internal/checker: ±10% jitter on NextInterval so probes don't all
  fire on the same tick after a restart.
- internal/prober: HTTP User-Agent now carries the build version
  and project URL.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 14:23:34 +02:00
parent 35643fd774
commit 1a1c48ef54
28 changed files with 828 additions and 57 deletions

View File

@@ -33,6 +33,12 @@ type maglevClient struct {
connected bool
lastErr string
cache cachedState
// lbWakeCh is a buffer-1 trigger channel feeding lbStateLoop. Every
// backend transition (and a few other events) does a non-blocking send
// here; the loop coalesces bursts into at most one GetVPPLBState call
// per second. See lbStateLoop for the leading+trailing-edge debounce.
lbWakeCh chan struct{}
}
// cachedState is the per-maglevd snapshot served via the REST handlers.
@@ -49,6 +55,7 @@ type cachedState struct {
HealthCheckOrder []string
VPPInfo *VPPInfoSnapshot
VPPState string // "", "connected", "disconnected"
LBState *LBStateSnapshot
LastRefresh time.Time
}
@@ -69,6 +76,7 @@ func newMaglevClient(address string, broker *Broker) (*maglevClient, error) {
Backends: map[string]*BackendSnapshot{},
HealthChecks: map[string]*HealthCheckSnapshot{},
},
lbWakeCh: make(chan struct{}, 1),
}, nil
}
@@ -147,6 +155,7 @@ func (c *maglevClient) Start(ctx context.Context) {
go c.watchLoop(ctx)
go c.refreshLoop(ctx)
go c.healthLoop(ctx)
go c.lbStateLoop(ctx)
}
func (c *maglevClient) setConnected(ok bool, errMsg string) {
@@ -196,6 +205,7 @@ func (c *maglevClient) Snapshot() *StateSnapshot {
HealthChecks: make([]*HealthCheckSnapshot, 0, len(c.cache.HealthCheckOrder)),
VPPInfo: c.cache.VPPInfo,
VPPState: c.cache.VPPState,
LBState: c.cache.LBState,
}
for _, name := range c.cache.FrontendsOrder {
if f, ok := c.cache.Frontends[name]; ok {
@@ -302,6 +312,11 @@ func (c *maglevClient) refreshAll(ctx context.Context) error {
c.cache.VPPState = vppState
c.cache.LastRefresh = time.Now()
c.mu.Unlock()
// Best-effort LB state pull so /view/api/state served on a fresh
// page load already carries the bucket column. Errors are
// swallowed by fetchLBStateAndPublish (which clears the cache and
// emits an empty event so the SPA renders "—").
c.fetchLBStateAndPublish(ctx)
return nil
}
@@ -434,6 +449,11 @@ func (c *maglevClient) handleEvent(ev *grpcapi.Event) {
AtUnixNs: tr.AtUnixNs,
Payload: payload,
})
// A real transition means VPP is about to (or already did)
// reshuffle bucket allocations across the affected VIP. Wake
// the lb-state loop so the SPA's bucket column converges
// without waiting for the 30s refresh.
c.triggerLBStateFetch()
case *grpcapi.Event_Frontend:
fe := body.Frontend
@@ -544,6 +564,11 @@ func (c *maglevClient) applyVPPLogHeartbeat(msg string) {
AtUnixNs: time.Now().UnixNano(),
Payload: payload,
})
// VPP just came back: pull fresh LB state so the bucket column
// repopulates immediately instead of waiting up to 30s for the
// next refresh tick. On vpp-disconnect the next fetch will fail
// and clear the cache, which is also the right behaviour.
c.triggerLBStateFetch()
}
func (c *maglevClient) applyBackendTransition(name string, tr *TransitionRecord) {
@@ -677,6 +702,175 @@ func transitionFromProto(t *grpcapi.TransitionRecord) *TransitionRecord {
}
}
// triggerLBStateFetch sends a non-blocking wake to lbStateLoop. The
// channel has buffer 1 so coalesced bursts never block the publisher.
func (c *maglevClient) triggerLBStateFetch() {
select {
case c.lbWakeCh <- struct{}{}:
default:
}
}
// lbStateLoop consumes wake signals and calls GetVPPLBState, with a
// leading+trailing-edge debounce so we never exceed one fetch per
// minLBInterval (1s). The leading edge means the very first wake after
// an idle period fires immediately — important so a single isolated
// transition isn't artificially delayed by a second. The trailing edge
// means a burst of wakes during the cool-down still gets one final
// fetch right after the gate opens, so the SPA always converges to a
// post-burst snapshot rather than missing the last update.
func (c *maglevClient) lbStateLoop(ctx context.Context) {
const minLBInterval = time.Second
var (
timer *time.Timer
lastFetch time.Time
)
timerCh := func() <-chan time.Time {
if timer == nil {
return nil
}
return timer.C
}
fire := func() {
if timer != nil {
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
timer = nil
}
c.fetchLBStateAndPublish(ctx)
lastFetch = time.Now()
}
for {
select {
case <-ctx.Done():
return
case <-c.lbWakeCh:
wait := minLBInterval - time.Since(lastFetch)
if wait <= 0 {
fire()
} else if timer == nil {
timer = time.NewTimer(wait)
}
case <-timerCh():
timer = nil
fire()
}
}
}
// fetchLBStateAndPublish runs one GetVPPLBState round-trip, rebuilds
// the per-frontend bucket map, swaps it into the cache, and broadcasts
// a "lb-state" BrowserEvent. On error the cache is cleared and an
// empty event is published so the SPA can switch the bucket column to
// em-dashes — clear-on-error is simpler than stale-but-visible and
// doesn't risk showing a confusing snapshot from before VPP died.
func (c *maglevClient) fetchLBStateAndPublish(ctx context.Context) {
fctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
lbs, err := c.api.GetVPPLBState(fctx, &grpcapi.GetVPPLBStateRequest{})
if err != nil {
c.mu.Lock()
had := c.cache.LBState != nil
c.cache.LBState = nil
c.mu.Unlock()
slog.Debug("lb-state-fetch", "maglevd", c.name, "err", err)
if had {
c.publishLBState(nil)
}
return
}
snap := c.buildLBStateSnapshot(lbs)
c.mu.Lock()
c.cache.LBState = snap
c.mu.Unlock()
c.publishLBState(snap.PerFrontend)
}
func (c *maglevClient) publishLBState(perFrontend map[string]map[string]int32) {
payload, _ := json.Marshal(LBStatePayload{PerFrontend: perFrontend})
c.broker.Publish(BrowserEvent{
Maglevd: c.name,
Type: "lb-state",
AtUnixNs: time.Now().UnixNano(),
Payload: payload,
})
}
// buildLBStateSnapshot translates a VPP-side state record (keyed by
// CIDR/protocol/port and AS address) into a maglev-side record (keyed
// by frontend name and backend name). Unmatched VIPs and unmatched AS
// addresses are silently skipped — they're benign side effects of a
// transient sync gap or a backend address that's only present in one
// of the two universes.
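//
// Illustrative example (names and addresses made up): a VPP VIP
// 192.0.2.1/32 tcp/443 whose AS 10.0.0.10 owns 128 buckets becomes
// PerFrontend["www"]["nginx0"] = 128, provided the cache knows a
// frontend "www" at 192.0.2.1:443/tcp and a backend "nginx0" at
// 10.0.0.10; anything that doesn't resolve on both sides is skipped.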
func (c *maglevClient) buildLBStateSnapshot(lbs *grpcapi.VPPLBState) *LBStateSnapshot {
c.mu.RLock()
feByVIP := make(map[string]string, len(c.cache.Frontends))
for _, f := range c.cache.Frontends {
feByVIP[lbVIPKey(f.Address, f.Protocol, f.Port)] = f.Name
}
backendByAddr := make(map[string]string, len(c.cache.Backends))
for _, b := range c.cache.Backends {
backendByAddr[b.Address] = b.Name
}
c.mu.RUnlock()
out := &LBStateSnapshot{PerFrontend: map[string]map[string]int32{}}
for _, v := range lbs.GetVips() {
feName, ok := feByVIP[lbVIPKey(stripLBHostMask(v.GetPrefix()), lbProtoString(v.GetProtocol()), v.GetPort())]
if !ok {
continue
}
row := out.PerFrontend[feName]
if row == nil {
row = map[string]int32{}
out.PerFrontend[feName] = row
}
for _, as := range v.GetApplicationServers() {
bname, ok := backendByAddr[as.GetAddress()]
if !ok {
continue
}
row[bname] = int32(as.GetNumBuckets())
}
}
return out
}
// lbVIPKey is the join key between a maglev FrontendSnapshot and a
// VPP-side VPPLBVIP record. Stripping the mask and lower-casing the
// protocol gives a canonical form that both sides can produce.
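// For example, ("192.0.2.1", "TCP", 443) keys as "192.0.2.1/tcp/443".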
func lbVIPKey(addr, proto string, port uint32) string {
return fmt.Sprintf("%s/%s/%d", addr, strings.ToLower(proto), port)
}
// lbProtoString mirrors maglevc's protoString — kept local to avoid a
// cross-package import for two trivial helpers.
func lbProtoString(p uint32) string {
switch p {
case 6:
return "tcp"
case 17:
return "udp"
case 255:
return "any"
}
return fmt.Sprintf("%d", p)
}
// stripLBHostMask trims "/32" or "/128" from a VPP host-prefix VIP so
// it can be compared against a maglev FrontendSnapshot.Address (which
// is bare). Other shapes are returned unchanged.
func stripLBHostMask(prefix string) string {
if strings.HasSuffix(prefix, "/32") || strings.HasSuffix(prefix, "/128") {
return prefix[:strings.LastIndexByte(prefix, '/')]
}
return prefix
}
func healthCheckFromProto(h *grpcapi.HealthCheckInfo) *HealthCheckSnapshot {
return &HealthCheckSnapshot{
Name: h.GetName(),
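
A quick, self-contained sketch of the contract the wake channel and lbStateLoop above are meant to uphold: a burst of wakes collapses to one fetch on the leading edge and one catch-up on the trailing edge, never more than one per interval. This is a simulation rather than the real client: the fetch is a counter, the 1s gate is shortened to 100ms, and the names (wake, trigger, fire) are illustrative.
package main
import (
	"fmt"
	"sync/atomic"
	"time"
)
func main() {
	const minInterval = 100 * time.Millisecond // stands in for the loop's 1s gate
	wake := make(chan struct{}, 1)             // mirrors lbWakeCh: buffer 1, senders never block
	trigger := func() {                        // mirrors triggerLBStateFetch
		select {
		case wake <- struct{}{}:
		default: // a wake is already pending; coalesce
		}
	}
	var fetches atomic.Int32
	done := make(chan struct{})
	go func() {
		var (
			timer     *time.Timer
			lastFetch time.Time
		)
		timerCh := func() <-chan time.Time {
			if timer == nil {
				return nil
			}
			return timer.C
		}
		fire := func() {
			if timer != nil {
				if !timer.Stop() {
					select {
					case <-timer.C:
					default:
					}
				}
				timer = nil
			}
			fetches.Add(1) // stands in for fetchLBStateAndPublish
			lastFetch = time.Now()
		}
		for {
			select {
			case <-done:
				return
			case <-wake:
				if wait := minInterval - time.Since(lastFetch); wait <= 0 {
					fire() // leading edge: first wake after an idle period fires immediately
				} else if timer == nil {
					timer = time.NewTimer(wait) // trailing edge: defer exactly one catch-up fetch
				}
			case <-timerCh():
				timer = nil
				fire()
			}
		}
	}()
	for i := 0; i < 20; i++ { // burst of 20 wakes well inside one cool-down window
		trigger()
		time.Sleep(time.Millisecond)
	}
	time.Sleep(2 * minInterval) // let the trailing-edge timer fire
	close(done)
	fmt.Println("fetches:", fetches.Load()) // typically prints 2: one leading, one trailing
}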

View File

@@ -37,6 +37,23 @@ func registerHandlers(mux *http.ServeMux, clients []*maglevClient, broker *Broke
_, _ = w.Write([]byte("ok\n"))
})
// Favicon served from the same embedded dist tree Vite produced.
// Browsers auto-fetch /favicon.ico from the document root regardless
// of where the SPA itself is mounted, so we register a top-level
// handler in addition to whatever /view/favicon.ico picks up via the
// static file server below. Read once at registration so we don't
// touch the embed.FS on every request, and serve with a long
// max-age since the bytes never change for a given binary.
if favicon, ferr := fs.ReadFile(webFS, "web/dist/favicon.ico"); ferr == nil {
mux.HandleFunc("/favicon.ico", func(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "image/x-icon")
w.Header().Set("Cache-Control", "public, max-age=86400")
_, _ = w.Write(favicon)
})
} else {
slog.Warn("favicon-missing", "err", ferr)
}
mux.HandleFunc("/view/api/version", func(w http.ResponseWriter, _ *http.Request) {
writeJSON(w, VersionInfo{
Version: buildinfo.Version(),
@@ -188,6 +205,13 @@ func handleBackendLifecycle(w http.ResponseWriter, r *http.Request, c *maglevCli
}
slog.Info("admin-backend-action",
"maglevd", c.name, "backend", name, "action", action, "state", snap.State)
// The maglevd→watch path will deliver a transition event that
// also wakes the lb-state loop, but firing here too makes the
// admin path self-contained and shaves the worst-case race
// where the SPA is still waiting on the WatchEvents replay
// when the POST response lands. The debouncer coalesces any
// duplicate wake.
c.triggerLBStateFetch()
writeJSON(w, snap)
}
@@ -219,6 +243,14 @@ func handleBackendWeight(w http.ResponseWriter, r *http.Request, c *maglevClient
slog.Info("admin-set-weight",
"maglevd", c.name, "frontend", frontend, "pool", pool, "backend", backend,
"weight", body.Weight, "flush", body.Flush)
// Weight changes never produce a transition event on the maglevd
// side (the backend's state is unchanged), so the WatchEvents
// stream won't wake the lb-state loop for us — without an explicit
// trigger here the SPA's bucket column would stay stale until the
// next 30s refresh tick. SyncLBStateVIP on the maglevd side has
// already pushed the new weights into VPP synchronously, so the
// fetch we kick off will see fresh post-mutation buckets.
c.triggerLBStateFetch()
writeJSON(w, snap)
}

View File

@@ -15,6 +15,18 @@ type StateSnapshot struct {
// from vpp-connect / vpp-disconnect / vpp-api-{send,recv} log
// events and re-seeded on every refreshAll tick.
VPPState string `json:"vpp_state,omitempty"`
// LBState is the most recent VPP LB plugin view of buckets-per-backend,
// keyed by frontend name → backend name → bucket count. nil when VPP is
// disconnected or no fetch has succeeded yet.
LBState *LBStateSnapshot `json:"lb_state,omitempty"`
}
// LBStateSnapshot is a per-(frontend, backend) view of VPP's bucket
// allocation. The frontend collects this with GetVPPLBState and matches
// VPP's VIP records back to maglev frontend/backend names so the SPA
// never has to know about VPP-side prefixes or AS addresses.
type LBStateSnapshot struct {
PerFrontend map[string]map[string]int32 `json:"per_frontend"`
}
// MaglevdInfo is the per-maglevd connection status record.
@@ -98,11 +110,19 @@ type VPPInfoSnapshot struct {
// BrowserEvent is the wire shape sent over SSE to the browser.
type BrowserEvent struct {
Maglevd string `json:"maglevd"`
Type string `json:"type"` // log|backend|frontend|maglevd-status|resync
Type string `json:"type"` // log|backend|frontend|maglevd-status|vpp-status|lb-state|resync
AtUnixNs int64 `json:"at_unix_ns"`
Payload json.RawMessage `json:"payload"`
}
// LBStatePayload rides on a "lb-state" BrowserEvent and carries the
// freshly-fetched bucket map. PerFrontend may be nil (or empty) to
// signal "no LB state available" — the SPA renders such backends
// with an em-dash in the buckets column.
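//
// Illustrative wire shape (names and counts made up), as the SPA receives it:
//
//	{"per_frontend":{"www":{"nginx0":512,"nginx1":512,"nginx2":0}}}
//
// where the explicit 0 for nginx2 means "programmed into VPP but drained".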
type LBStatePayload struct {
PerFrontend map[string]map[string]int32 `json:"per_frontend"`
}
// BackendEventPayload is what we ship inside BrowserEvent.Payload for
// type == "backend".
type BackendEventPayload struct {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

cmd/frontend/web/dist/favicon.ico: new vendored binary file (15 KiB, not shown)

View File

@@ -3,9 +3,10 @@
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<link rel="icon" type="image/x-icon" href="/view/favicon.ico" />
<title>maglev</title>
<script type="module" crossorigin src="/view/assets/index-DjixLt11.js"></script>
<link rel="stylesheet" crossorigin href="/view/assets/index-CExoCDXh.css">
<script type="module" crossorigin src="/view/assets/index-DCJJqBMY.js"></script>
<link rel="stylesheet" crossorigin href="/view/assets/index-3BvNJ7QB.css">
</head>
<body>
<div id="root"></div>

View File

@@ -3,6 +3,7 @@
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<link rel="icon" type="image/x-icon" href="/favicon.ico" />
<title>maglev</title>
</head>
<body>

New binary file (15 KiB, not shown)

View File

@@ -2,6 +2,7 @@ import type {
BackendEventPayload,
BrowserEvent,
FrontendEventPayload,
LBStatePayload,
MaglevdStatusPayload,
VPPStatusPayload,
} from "../types";
@@ -9,6 +10,7 @@ import { fetchAllState } from "./rest";
import {
applyBackendTransition,
applyFrontendTransition,
applyLBState,
applyMaglevdStatus,
applyVPPStatus,
replaceAll,
@@ -19,10 +21,24 @@ import { pushEvent } from "../stores/events";
// reconnects with the Last-Event-ID header set, which the Go broker uses
// to replay events from its 30s ring buffer. A "resync" event tells us to
// refetch full state and redraw.
export function openEventStream(): EventSource {
const es = new EventSource("/view/api/events");
//
// On top of the browser's built-in reconnect we also run a clock-drift
// watchdog: a setInterval that doesn't fire during OS suspend, so a tick
// that arrives much later than expected almost always means the laptop
// just came back from sleep. EventSource doesn't notice when its TCP
// connection has been silently torn down during sleep (no FIN was
// delivered, so readyState stays OPEN forever), so we force-reconnect
// ourselves on a wake. The new connection sends no Last-Event-ID, which
// makes the broker emit a "resync" event and the handler below refetches
// full state.
const SSE_WAKE_TICK_MS = 10_000;
const SSE_WAKE_THRESHOLD_MS = 30_000;
es.onmessage = (msg) => {
export function openEventStream(): void {
let es: EventSource | undefined;
let reconnecting = false;
const onMessage = (msg: MessageEvent) => {
try {
const ev = JSON.parse(msg.data) as BrowserEvent;
dispatch(ev);
@@ -31,23 +47,60 @@ export function openEventStream(): EventSource {
}
};
// "resync" is emitted as a named event so we can listen for it
// without it going through the default onmessage dispatch.
es.addEventListener("resync", async () => {
const onResync = async () => {
try {
const snaps = await fetchAllState();
replaceAll(snaps);
} catch (err) {
console.error("resync refetch failed", err);
}
});
};
const connect = () => {
if (es) {
es.close();
es = undefined;
}
es = new EventSource("/view/api/events");
es.onmessage = onMessage;
es.addEventListener("resync", onResync);
es.onerror = (err) => {
// EventSource handles reconnection on its own — just log.
console.debug("sse error, browser will reconnect", err);
};
};
return es;
const reconnect = (reason: string) => {
// Coalesce multiple wake signals that fire within the same instant
// (e.g. clock-drift tick AND a future visibility hook). One brief
// window is enough; subsequent calls are no-ops.
if (reconnecting) return;
reconnecting = true;
console.info("sse reconnecting:", reason);
connect();
setTimeout(() => {
reconnecting = false;
}, 1000);
};
// Wake detector. The interval is short enough (10s) to catch even
// brief naps, the threshold (30s) is well above the interval + JS
// jitter so a clean wake reads unambiguously, and we never trigger
// on normal background-tab throttling because that doesn't usually
// pause setInterval for 30+ seconds at a time. If a future Chrome
// policy starts throttling that aggressively, the worst case is one
// extra reconnect every few minutes — still cheap.
let lastTick = Date.now();
setInterval(() => {
const now = Date.now();
const elapsed = now - lastTick;
lastTick = now;
if (elapsed > SSE_WAKE_THRESHOLD_MS) {
reconnect(`wake detected (${Math.round(elapsed / 1000)}s gap)`);
}
}, SSE_WAKE_TICK_MS);
connect();
}
function dispatch(ev: BrowserEvent) {
@@ -68,6 +121,9 @@ function dispatch(ev: BrowserEvent) {
case "vpp-status":
applyVPPStatus(ev.maglevd, (ev.payload as VPPStatusPayload).state);
break;
case "lb-state":
applyLBState(ev.maglevd, ev.payload as LBStatePayload);
break;
case "log":
// Log events are displayed in the DebugPanel but no longer
// mutate the state tree. The previous vpp-lb-sync-as-*

View File

@@ -1,14 +1,23 @@
import type { Component, JSX } from "solid-js";
import { isZippyOpen, setZippyOpen } from "../stores/zippy";
type Props = {
// Stable identifier used as the cookie key. Must be unique within
// the app; a fresh page load opens the Zippy iff its id is present
// in the persisted open-set, so changing the id "forgets" prior
// user state for that panel.
id: string;
title: JSX.Element;
open?: boolean;
children: JSX.Element;
};
const Zippy: Component<Props> = (props) => {
return (
<details class="zippy" open={props.open}>
<details
class="zippy"
open={isZippyOpen(props.id)}
onToggle={(e) => setZippyOpen(props.id, e.currentTarget.open)}
>
<summary>{props.title}</summary>
<div class="zippy-body">{props.children}</div>
</details>

View File

@@ -2,6 +2,8 @@ import { createStore, produce } from "solid-js/store";
import type {
BackendEventPayload,
FrontendEventPayload,
FrontendSnapshot,
LBStatePayload,
MaglevdStatusPayload,
StateSnapshot,
TransitionRecord,
@@ -83,14 +85,81 @@ function recomputeDerivedState(snap: StateSnapshot) {
// FrontendState keys snapshots by maglevd name. A single store drives the
// whole UI; reducers produce() into the right branch.
//
// settling is a per-(maglevd, frontend) flag flipped to true on any
// event that changes which backends should be serving — backend
// transitions, configured weight edits — and auto-cleared after a
// fixed grace window. While true, frontendHealth suppresses the
// bug-buckets verdict so a transient race between the new control-
// plane state and the lagging GetVPPLBState refetch doesn't flash
// the ‼️ icon. A real, persistent dataplane disagreement still shows
// up the moment the grace window expires.
export type FrontendState = {
byName: Record<string, StateSnapshot>;
settling: Record<string, Record<string, true>>;
};
const [state, setState] = createStore<FrontendState>({ byName: {} });
const [state, setState] = createStore<FrontendState>({ byName: {}, settling: {} });
export { state };
const SETTLE_GRACE_MS = 2000;
// Outside-the-store map of pending auto-clear timers, keyed by
// (maglevd, frontend). Timer ids aren't UI state so they don't
// belong in the reactive store; keeping them in a plain Map lets a
// fresh transition cancel and restart the timer cleanly.
const settlingTimers = new Map<string, ReturnType<typeof setTimeout>>();
function settleKey(m: string, f: string): string {
return `${m}\x00${f}`;
}
function markFrontendSettling(maglevd: string, frontend: string) {
setState(
produce((s) => {
if (!s.settling[maglevd]) s.settling[maglevd] = {};
s.settling[maglevd][frontend] = true;
}),
);
const k = settleKey(maglevd, frontend);
const existing = settlingTimers.get(k);
if (existing) clearTimeout(existing);
settlingTimers.set(
k,
setTimeout(() => {
settlingTimers.delete(k);
setState(
produce((s) => {
if (s.settling[maglevd]) delete s.settling[maglevd][frontend];
}),
);
}, SETTLE_GRACE_MS),
);
}
// clearMaglevdSettling is called from applyLBState the moment a fresh
// GetVPPLBState reconciliation lands. The dataplane data is now at
// least as new as whatever transitions triggered the wait, so any
// remaining bug-buckets discrepancy is real and worth surfacing.
// The 2s safety timer in markFrontendSettling exists only as a
// fallback for the case where VPP is disconnected (or the fetch is
// failing) and an lb-state event would never arrive — without the
// timer, settling would get stuck and the icon would silently
// suppress real bugs.
function clearMaglevdSettling(maglevd: string) {
for (const [k, id] of settlingTimers) {
if (k.startsWith(maglevd + "\x00")) {
clearTimeout(id);
settlingTimers.delete(k);
}
}
setState(
produce((s) => {
if (s.settling[maglevd]) s.settling[maglevd] = {};
}),
);
}
export function replaceSnapshot(snap: StateSnapshot) {
// Recompute effective weights + aggregate frontend state locally
// from the snapshot's backends array, rather than trusting the
@@ -146,6 +215,17 @@ export function applyBackendTransition(maglevd: string, p: BackendEventPayload)
recomputeDerivedState(snap);
}),
);
// Mark every frontend that references this backend as settling so
// the bug-buckets verdict is gated on the next fresh GetVPPLBState
// reconciliation (or the 2s safety timer, whichever fires first).
const snap = state.byName[maglevd];
if (snap) {
for (const fe of snap.frontends) {
if (fe.pools.some((pool) => pool.backends.some((pb) => pb.name === p.backend))) {
markFrontendSettling(maglevd, fe.name);
}
}
}
}
// Frontend-transition events arrive from the server's checker, but
@@ -160,6 +240,70 @@ export function applyFrontendTransition(_maglevd: string, _p: FrontendEventPaylo
// no-op — state is derived client-side, see recomputeDerivedState
}
// applyLBState merges the per-frontend bucket map for one maglevd
// from a freshly-arrived "lb-state" SSE event. A null/undefined
// per_frontend payload (sent on VPP disconnect or fetch failure)
// clears the cached map so the SPA renders em-dashes in the buckets
// column instead of stale numbers.
//
// The merge is done leaf-by-leaf rather than via wholesale assignment.
// produce's proxy only emits a signal when a property is actually
// written, so guarding each write with `!==` keeps unchanged numbers
// (in particular every drained-to-0 backend) from invalidating their
// downstream reactive reads. Without this, the periodic 30s refresh
// and every same-value re-fetch would re-trigger the Flash animation
// on every cell — which is exactly the visual storm we're avoiding.
export function applyLBState(maglevd: string, p: LBStatePayload) {
setState(
produce((s) => {
const snap = s.byName[maglevd];
if (!snap) return;
const next = p.per_frontend;
const empty = !next || Object.keys(next).length === 0;
if (empty) {
if (snap.lb_state !== undefined) snap.lb_state = undefined;
return;
}
if (!snap.lb_state) {
snap.lb_state = { per_frontend: {} };
}
const cur = snap.lb_state.per_frontend;
// Update / insert leaves that actually changed.
for (const fe of Object.keys(next)) {
if (!cur[fe]) cur[fe] = {};
const curRow = cur[fe];
const nextRow = next[fe];
for (const be of Object.keys(nextRow)) {
if (curRow[be] !== nextRow[be]) curRow[be] = nextRow[be];
}
for (const be of Object.keys(curRow)) {
if (!(be in nextRow)) delete curRow[be];
}
}
// Drop frontends that disappeared from the new snapshot.
for (const fe of Object.keys(cur)) {
if (!(fe in next)) delete cur[fe];
}
}),
);
// A fresh lb-state event means the dataplane data is now at least
// as new as anything we were waiting on — re-enable bug detection.
clearMaglevdSettling(maglevd);
}
// lbBucketsFor looks up the bucket count VPP currently routes to a
// given backend on a given frontend. Returns undefined when the
// snapshot has no LB state at all (VPP disconnected, no fetch yet) or
// when the backend isn't programmed into VPP for that VIP — the view
// renders an em-dash in both cases.
export function lbBucketsFor(
snap: StateSnapshot | undefined,
frontend: string,
backend: string,
): number | undefined {
return snap?.lb_state?.per_frontend?.[frontend]?.[backend];
}
export function applyVPPStatus(maglevd: string, state: string) {
setState(
produce((s) => {
@@ -211,6 +355,89 @@ export function applyConfiguredWeight(
recomputeDerivedState(snap);
}),
);
markFrontendSettling(maglevd, frontend);
}
// FrontendHealth is the per-frontend "is everything actually working"
// verdict computed from backend states, effective weights, and (when
// available) the VPP bucket map. The cascade is intentionally
// priority-ordered: a data-plane disagreement (control says serve,
// VPP routes nothing) is the loudest signal because it usually means
// something is broken in the sync path, not just an unhealthy backend.
//
// "ok" → all backends up, primary serving, every
// eff>0 backend has VPP buckets>0
// "bug-buckets" → some backend with effective_weight>0 has 0
// buckets in VPP — control plane and data
// plane disagree, almost always a bug
// "primary-drained" → primary pool is not serving any traffic
// (every backend in pool[0] has eff=0); the
// frontend is on its fallback or fully down
// "degraded" → at least one backend isn't 'up' but nothing
// worse — typical maintenance / outage state
// "unknown" → fallthrough; should be unreachable, kept as
// a safety net for logic bugs in this function
export type FrontendHealth =
| "ok"
| "bug-buckets"
| "primary-drained"
| "degraded"
| "unknown";
export function frontendHealth(snap: StateSnapshot, fe: FrontendSnapshot): FrontendHealth {
const stateOf: Record<string, string> = {};
for (const b of snap.backends) stateOf[b.name] = b.state;
// The bucket check is only meaningful when we actually have an LB
// state snapshot. On a fresh page load (or with VPP disconnected)
// lb_state is undefined; in that window we fall back to "trust the
// control plane" so the icon still settles to ✅ instead of
// perpetual ❓ until the first GetVPPLBState round-trip.
const lbAvailable = !!snap.lb_state;
const feBuckets = snap.lb_state?.per_frontend?.[fe.name];
// Reactive read of the per-frontend settling flag. While true,
// we're still waiting for the next GetVPPLBState reconciliation
// after a recent control-plane change; the dataplane may be mid-
// reconverge so any "weight>0 but buckets==0" we'd see here is
// almost certainly a race, not a real bug.
const settling = !!state.settling[snap.maglevd.name]?.[fe.name];
let anyDown = false;
let dataplaneBug = false;
for (const pool of fe.pools) {
for (const pb of pool.backends) {
if (stateOf[pb.name] !== "up") anyDown = true;
if (!settling && lbAvailable && pb.effective_weight > 0) {
const b = feBuckets?.[pb.name];
if (b === undefined || b === 0) dataplaneBug = true;
}
}
}
const primary = fe.pools[0];
const primaryHasWeights = !!primary && primary.backends.some((pb) => pb.weight > 0);
const primaryAllZero = !primary || primary.backends.every((pb) => pb.effective_weight === 0);
if (!anyDown && primaryHasWeights && !dataplaneBug) return "ok";
if (dataplaneBug) return "bug-buckets";
if (primaryAllZero) return "primary-drained";
if (anyDown) return "degraded";
return "unknown";
}
export function frontendHealthIcon(snap: StateSnapshot, fe: FrontendSnapshot): string {
switch (frontendHealth(snap, fe)) {
case "ok":
return "✅";
case "bug-buckets":
return "‼️";
case "primary-drained":
return "❗";
case "degraded":
return "⚠️";
case "unknown":
return "❓";
}
}
// Helpers used by views.

View File

@@ -0,0 +1,55 @@
import { createSignal } from "solid-js";
// Persistence layer for collapsible (Zippy) panels. The cookie is a
// best-effort hint: the page always renders all Zippies closed unless
// their stable id is in the cookie's open-set, but a missing or
// corrupt cookie just falls back to "everything closed", so losing it
// (browser data clear, expiry, private window, write failure) is a
// pure cosmetic regression.
//
// localStorage would arguably be a tidier home for this — it's
// client-only and doesn't ride on every HTTP request — but the
// payload is tiny and the user asked for a cookie, so a cookie it
// is. SameSite=Lax keeps it from leaking to third-party iframes.
const COOKIE_NAME = "maglev_zippy_open";
const COOKIE_MAX_AGE = 60 * 60 * 24 * 365; // 1 year
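// Illustrative cookie value with two panels open ("debug-events" is the
// DebugPanel's real id; the frontend/maglevd names are made up):
//   maglev_zippy_open=debug-events%2Cfrontend-maglev0-www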
function readCookie(): Set<string> {
try {
const raw = document.cookie
.split("; ")
.find((c) => c.startsWith(COOKIE_NAME + "="));
if (!raw) return new Set();
const value = decodeURIComponent(raw.slice(COOKIE_NAME.length + 1));
if (!value) return new Set();
return new Set(value.split(","));
} catch {
return new Set();
}
}
function writeCookie(ids: Set<string>) {
try {
const value = encodeURIComponent([...ids].join(","));
document.cookie = `${COOKIE_NAME}=${value}; Path=/; Max-Age=${COOKIE_MAX_AGE}; SameSite=Lax`;
} catch {
// best-effort — quota, third-party-cookie blocks, etc. all silently fall back
}
}
const [openSet, setOpenSet] = createSignal<Set<string>>(readCookie());
export function isZippyOpen(id: string): boolean {
return openSet().has(id);
}
export function setZippyOpen(id: string, open: boolean) {
const cur = openSet();
if (open === cur.has(id)) return;
const next = new Set(cur);
if (open) next.add(id);
else next.delete(id);
setOpenSet(next);
writeCookie(next);
}

View File

@@ -174,9 +174,27 @@
gap: 10px;
flex-wrap: wrap;
}
.frontend-title-icon {
display: inline-block;
width: 1.5em;
text-align: center;
font-size: 14px;
line-height: 1;
}
.frontend-title-name {
font-size: 15px;
font-weight: 600;
/* Fixed-width slot so the state badge (and everything after it)
* lines up across every Zippy header. 40ch is wide enough for the
* longest realistic frontend name without crowding the icon to its
* left. Names that exceed the slot get an ellipsis rather than
* pushing the badge sideways. */
display: inline-block;
width: 40ch;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
vertical-align: middle;
}
.frontend-title-addr {
font-family: "SF Mono", Menlo, Consolas, monospace;
@@ -280,6 +298,9 @@
.backend-table .col-effective {
width: 95px;
}
.backend-table .col-buckets {
width: 95px;
}
.backend-table .col-age {
width: 110px;
}

View File

@@ -79,15 +79,29 @@ export type StateSnapshot = {
healthchecks: HealthCheckSnapshot[];
vpp_info?: VPPInfoSnapshot;
vpp_state?: string; // "connected" | "disconnected" | ""
lb_state?: LBStateSnapshot;
};
// LBStateSnapshot is VPP's view of buckets-per-backend, scoped per
// frontend. lb_state.per_frontend[frontendName][backendName] is the
// number of consistent-hash buckets VPP currently routes to that
// backend on that VIP. Missing entries (or a missing snapshot) render
// as an em-dash in the SPA.
export type LBStateSnapshot = {
per_frontend: Record<string, Record<string, number>>;
};
export type BrowserEvent = {
maglevd: string;
type: "log" | "backend" | "frontend" | "maglevd-status" | "vpp-status" | "resync";
type: "log" | "backend" | "frontend" | "maglevd-status" | "vpp-status" | "lb-state" | "resync";
at_unix_ns: number;
payload: unknown;
};
export type LBStatePayload = {
per_frontend: Record<string, Record<string, number>> | null;
};
export type BackendEventPayload = {
backend: string;
transition: TransitionRecord;

View File

@@ -1,10 +1,10 @@
import { Show, type Component } from "solid-js";
import { Show, createMemo, type Component } from "solid-js";
import type { BackendSnapshot, PoolBackendSnapshot } from "../types";
import StatusBadge from "../components/StatusBadge";
import ProbeHeartbeat from "../components/ProbeHeartbeat";
import Flash from "../components/Flash";
import BackendActionsMenu from "../components/BackendActionsMenu";
import { lastTransitionAge } from "../stores/state";
import { lastTransitionAge, lbBucketsFor, state as appState } from "../stores/state";
import { isAdmin } from "../stores/mode";
type Props = {
@@ -27,6 +27,18 @@ type Props = {
const BackendRow: Component<Props> = (props) => {
const b = () => props.backend;
// Subscribed lookup: lbBucketsFor reads from the reactive store, so
// the cell re-renders the moment a "lb-state" SSE event mutates the
// map. A missing entry (VPP disconnected, backend not yet programmed)
// renders as an em-dash; an explicit 0 means "in VPP, drained".
// createMemo guarantees Flash only sees a new value when the leaf
// actually changed — without it, any spurious upstream re-run (e.g.
// a sibling backend's transition triggering recomputeDerivedState)
// would pop the bucket cell on every backend in the table.
const bucketsLabel = createMemo<number | "—">(() => {
const v = lbBucketsFor(appState.byName[props.maglevd], props.frontend, b().name);
return v === undefined ? "—" : v;
});
return (
<tr
class="backend-row"
@@ -50,6 +62,9 @@ const BackendRow: Component<Props> = (props) => {
<td class="numeric">
<Flash value={props.poolBackend.effective_weight} />
</td>
<td class="numeric">
<Flash value={bucketsLabel()} />
</td>
<td class="age">{lastTransitionAge(b().last_transition)}</td>
<Show when={isAdmin}>
<td class="actions">

View File

@@ -44,7 +44,7 @@ const DebugPanel: Component = () => {
});
return (
<Zippy title="Event stream">
<Zippy id="debug-events" title="Event stream">
<ol class="event-tail" ref={olRef}>
<For each={filtered()}>
{(ev) => (

View File

@@ -5,7 +5,7 @@ import StatusBadge from "../components/StatusBadge";
import Flash from "../components/Flash";
import Zippy from "../components/Zippy";
import { isAdmin } from "../stores/mode";
import { formatVIPAddress } from "../stores/state";
import { formatVIPAddress, frontendHealthIcon } from "../stores/state";
type Props = {
snap: StateSnapshot;
@@ -26,8 +26,15 @@ const FrontendCard: Component<Props> = (props) => {
const backendByName = () => Object.fromEntries(props.snap.backends.map((b) => [b.name, b]));
const fe = () => props.frontend;
// The icon span has a fixed width so the rest of the title doesn't
// jiggle horizontally when the verdict changes (✅ ↔ ⚠️ ↔ ❗ etc.).
// The role/aria-label gives the meaning without depending on the
// emoji glyph reading well to a screen reader.
const title = (
<span class="frontend-title">
<span class="frontend-title-icon" aria-label="health" role="img">
{frontendHealthIcon(props.snap, fe())}
</span>
<span class="frontend-title-name">{fe().name}</span>
<Flash value={fe().state ?? "unknown"}>
<StatusBadge state={fe().state ?? "unknown"} />
@@ -40,7 +47,7 @@ const FrontendCard: Component<Props> = (props) => {
);
return (
<Zippy title={title} open>
<Zippy id={`frontend-${props.snap.maglevd.name}-${fe().name}`} title={title}>
<table class="backend-table">
<thead>
<tr>
@@ -50,6 +57,7 @@ const FrontendCard: Component<Props> = (props) => {
<th class="col-state">state</th>
<th class="col-weight numeric">weight</th>
<th class="col-effective numeric">effective</th>
<th class="col-buckets numeric">lb buckets</th>
<th class="col-age">last transition</th>
<Show when={isAdmin}>
<th class="col-actions actions" />

View File

@@ -77,7 +77,7 @@ const VPPInfoPanel: Component<Props> = (props) => {
);
return (
<Zippy title={title}>
<Zippy id={`vpp-${props.name}`} title={title}>
<Show when={props.info} fallback={<p class="empty">No VPP information available.</p>}>
{(i) => (
<dl class="kv">

View File

@@ -88,9 +88,25 @@ func (ql *questionListener) OnChange(line []rune, pos int, key rune) (newLine []
// Walk the confirmed prefix to the current node, then try to advance one
// more step using the partial token (via prefix-match or slot fallback).
// This mirrors birdc: "sh?" expands "sh" to "show" and shows show's subtree.
node, _ := Walk(ql.root, prefix)
node, _, remaining := Walk(ql.root, prefix)
displayPrefix := strings.Join(prefix, " ")
if partial != "" {
var unknownMsg string
if len(remaining) > 0 {
// One of the confirmed prefix tokens was unknown. Show an
// "unknown" banner, then list what's available at the deepest
// node we *did* reach so the operator can see what they could
// have typed instead. The partial at the cursor is irrelevant
// once the left context is already broken.
consumed := prefix[:len(prefix)-len(remaining)]
bad := remaining[0]
if len(consumed) == 0 {
unknownMsg = fmt.Sprintf("unknown command: %s", bad)
} else {
unknownMsg = fmt.Sprintf("unknown subcommand %q after %q", bad, strings.Join(consumed, " "))
}
displayPrefix = strings.Join(consumed, " ")
partial = ""
} else if partial != "" {
if next := matchFixedChild(node.Children, partial); next != nil {
// Partial uniquely matched a fixed child — descend into it.
node = next
@@ -127,7 +143,19 @@ func (ql *questionListener) OnChange(line []rune, pos int, key rune) (newLine []
}
// Emit output. Raw terminal mode requires \r\n.
fmt.Fprintf(ql.rl.Stderr(), "\r\n")
//
// readline's wrapWriter wraps every Write in a clean-write-print
// cycle: it erases the current input line, runs our closure, and
// redraws the prompt+buffer afterwards. That means starting the
// output with a bare "\r\n" leaves the original row blank, so the
// operator loses sight of what they typed. Instead we echo the
// full "maglev> show vpp lb ?" ourselves as the first write —
// that lands on the just-cleaned row, birdc-style, and the
// subsequent Fprintfs each redraw a fresh prompt below the help.
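//
// Illustratively, with the default prompt and an operator who has typed
// "show vpp lb" before hitting '?', the terminal ends up reading roughly:
//
//	maglev> show vpp lb ?
//	  ...help lines for the show-vpp-lb subtree...
//	maglev> show vpp lb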
fmt.Fprintf(ql.rl.Stderr(), "%s%s\r\n", ql.rl.Config.Prompt, string(line))
if unknownMsg != "" {
fmt.Fprintf(ql.rl.Stderr(), " %s\r\n", unknownMsg)
}
if len(lines) == 0 {
fmt.Fprintf(ql.rl.Stderr(), " <no completions>\r\n")
} else {

View File

@@ -66,20 +66,40 @@ func runShell(ctx context.Context, client grpcapi.MaglevClient) error {
// dispatch walks the tree and executes the matched command.
func dispatch(ctx context.Context, root *Node, client grpcapi.MaglevClient, tokens []string) error {
node, args := Walk(root, tokens)
node, args, remaining := Walk(root, tokens)
if len(remaining) > 0 {
// One or more tokens couldn't be matched. Report the first
// offending token with the consumed prefix for context; don't
// dump the full command tree prefixed with garbage, which is
// what the previous code did and what prompted this fix.
consumed := tokens[:len(tokens)-len(remaining)]
return unknownCommandError(consumed, remaining[0])
}
if node.Run == nil {
showHelp(root, tokens)
showHelpAt(node, strings.Join(tokens, " "))
return nil
}
return node.Run(ctx, client, args)
}
// showHelp prints all reachable commands from the given token path, birdc-style.
func showHelp(root *Node, tokens []string) {
node, _ := Walk(root, tokens)
prefix := strings.Join(tokens, " ")
// unknownCommandError builds the error returned by dispatch when the
// tree walk couldn't consume the full token list. The format differs
// slightly depending on whether any tokens were consumed, so the
// message always points at the first unknown token and its context.
func unknownCommandError(consumed []string, bad string) error {
if len(consumed) == 0 {
return fmt.Errorf("unknown command: %s", bad)
}
return fmt.Errorf("unknown subcommand %q after %q", bad, strings.Join(consumed, " "))
}
// showHelpAt prints the reachable leaves below node, each displayed
// with the given prefix. Split from dispatch so the caller can decide
// which node to anchor the help at without re-walking the tree.
func showHelpAt(node *Node, prefix string) {
lines := expandPaths(node, prefix, make(map[*Node]bool))
maxLen := 0

View File

@@ -23,8 +23,12 @@ type Node struct {
// Walk descends the tree following tokens. At each step it tries fixed
// children first (exact then prefix), then falls back to a slot child
// (Dynamic != nil). Tokens consumed by slot children are collected as args.
// Returns the deepest node reached and the args collected from slot nodes.
func Walk(root *Node, tokens []string) (*Node, []string) {
// Returns the deepest node reached, the args collected from slot nodes,
// and any tokens that could not be matched. A non-empty remaining slice
// means the input contained a token that neither matched a fixed child
// at the current node nor fed into a slot — callers should treat that
// as "unknown command" rather than silently anchoring help at the root.
func Walk(root *Node, tokens []string) (*Node, []string, []string) {
node := root
var args []string
for len(tokens) > 0 {
@@ -47,10 +51,11 @@ func Walk(root *Node, tokens []string) (*Node, []string) {
continue
}
// Dead end — no match.
// Dead end — no match. The caller gets the still-unconsumed tail
// in the third return value.
break
}
return node, args
return node, args, tokens
}
// matchFixedChild returns the child matching tok by exact then unique prefix,
@@ -124,8 +129,14 @@ func expandPaths(node *Node, prefix string, visited map[*Node]bool) []helpLine {
// Candidates returns the completable children at the current position given
// the already-typed tokens and the partial token being completed.
func Candidates(root *Node, tokens []string, partial string, ctx context.Context, client grpcapi.MaglevClient) []*Node {
// Walk the already-confirmed tokens.
node, _ := Walk(root, tokens)
// Walk the already-confirmed tokens. If any of them are unknown,
// offer no completions at all — continuing to suggest children off
// the partially-walked node would mislead the user into "completing"
// an invalid command.
node, _, remaining := Walk(root, tokens)
if len(remaining) > 0 {
return nil
}
// Now look at what could follow at this node.
// Check fixed children filtered by partial.

View File

@@ -53,7 +53,7 @@ func TestExpandPathsRoot(t *testing.T) {
func TestExpandPathsShow(t *testing.T) {
root := buildTree()
showNode, _ := Walk(root, []string{"show"})
showNode, _, _ := Walk(root, []string{"show"})
lines := expandPaths(showNode, "show", make(map[*Node]bool))
for _, l := range lines {
@@ -75,7 +75,7 @@ func TestExpandPathsShow(t *testing.T) {
func TestExpandPathsNoCycles(t *testing.T) {
root := buildTree()
// watch events has a self-referencing slot; expandPaths must terminate.
watchEvents, _ := Walk(root, []string{"watch", "events"})
watchEvents, _, _ := Walk(root, []string{"watch", "events"})
lines := expandPaths(watchEvents, "watch events", make(map[*Node]bool))
// Should produce exactly 2 lines: "watch events" and "watch events <opt>".
@@ -87,7 +87,7 @@ func TestExpandPathsNoCycles(t *testing.T) {
func TestExpandPathsSetBackendName(t *testing.T) {
root := buildTree()
// Walk to the name slot so displayPrefix carries the actual arg.
node, _ := Walk(root, []string{"set", "backend", "mybackend"})
node, _, _ := Walk(root, []string{"set", "backend", "mybackend"})
lines := expandPaths(node, "set backend mybackend", make(map[*Node]bool))
want := []string{
@@ -110,31 +110,37 @@ func TestPrefixMatchCollapsedNouns(t *testing.T) {
root := buildTree()
// "sh ba" → show backends (list all) via prefix matching.
node, args := Walk(root, []string{"sh", "ba"})
node, args, rem := Walk(root, []string{"sh", "ba"})
if node.Run == nil {
t.Fatal("'sh ba' did not reach a Run node")
}
if len(args) != 0 {
t.Errorf("'sh ba' should have 0 args, got %v", args)
}
if len(rem) != 0 {
t.Errorf("'sh ba' should fully consume tokens, got remaining %v", rem)
}
// "sh ba nginx0" → show backends <name> (get specific) via slot.
node, args = Walk(root, []string{"sh", "ba", "nginx0"})
node, args, rem = Walk(root, []string{"sh", "ba", "nginx0"})
if node.Run == nil {
t.Fatal("'sh ba nginx0' did not reach a Run node")
}
if len(args) != 1 || args[0] != "nginx0" {
t.Errorf("'sh ba nginx0' args: got %v, want [nginx0]", args)
}
if len(rem) != 0 {
t.Errorf("'sh ba nginx0' should fully consume tokens, got remaining %v", rem)
}
// "sh fr" → show frontends (list all).
node, _ = Walk(root, []string{"sh", "fr"})
node, _, _ = Walk(root, []string{"sh", "fr"})
if node.Run == nil {
t.Fatal("'sh fr' did not reach a Run node")
}
// "sh he icmp" → show healthchecks icmp (get specific).
node, args = Walk(root, []string{"sh", "he", "icmp"})
node, args, _ = Walk(root, []string{"sh", "he", "icmp"})
if node.Run == nil {
t.Fatal("'sh he icmp' did not reach a Run node")
}
@@ -143,11 +149,37 @@ func TestPrefixMatchCollapsedNouns(t *testing.T) {
}
}
func TestWalkUnknownTokens(t *testing.T) {
root := buildTree()
// A bare unknown word leaves every token unconsumed and anchors
// the returned node at the root — callers must treat this as
// "unknown command" rather than silently showing the whole tree.
node, _, rem := Walk(root, []string{"foo"})
if node != root {
t.Errorf("'foo' should leave walk at root, got %q", node.Word)
}
if len(rem) != 1 || rem[0] != "foo" {
t.Errorf("'foo' remaining: got %v, want [foo]", rem)
}
// Partial consumption: "show" matches but "bogus" doesn't. The
// returned remaining is the first unmatched token onwards so the
// caller can point at exactly what was wrong.
node, _, rem = Walk(root, []string{"show", "bogus", "tail"})
if node.Word != "show" {
t.Errorf("'show bogus tail' should stop at show, got %q", node.Word)
}
if len(rem) != 2 || rem[0] != "bogus" || rem[1] != "tail" {
t.Errorf("'show bogus tail' remaining: got %v, want [bogus tail]", rem)
}
}
func TestExpandPathsWeightSlotWalk(t *testing.T) {
// Verify the weight command is fully walkable (fixes bug: setWeightValue
// and setFrontendPoolName were non-slot nodes that couldn't capture tokens).
root := buildTree()
node, args := Walk(root, []string{"set", "frontend", "web", "pool", "primary", "backend", "be0", "weight", "42"})
node, args, _ := Walk(root, []string{"set", "frontend", "web", "pool", "primary", "backend", "be0", "weight", "42"})
if node.Run == nil {
t.Fatal("Walk did not reach a Run node for full weight command")
}

View File

@@ -6,6 +6,7 @@ import (
"context"
"fmt"
"log/slog"
"math/rand/v2"
"net"
"sort"
"sync"
@@ -586,7 +587,7 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
sleepFor = 30 * time.Second
}
} else {
sleepFor = w.backend.NextInterval(hc.Interval, hc.FastInterval, hc.DownInterval)
sleepFor = jitterInterval(w.backend.NextInterval(hc.Interval, hc.FastInterval, hc.DownInterval))
}
c.mu.RUnlock()
@@ -827,3 +828,14 @@ func staggerDelay(interval time.Duration, pos, total int) time.Duration {
}
return time.Duration(int64(interval) * int64(pos) / int64(total))
}
// jitterInterval scales d by a uniformly-random factor in [0.9, 1.1) so that
// probe schedules across many backends drift apart instead of all firing on
// the same tick after process start (or after a config reload re-staggers them
// onto identical phases).
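// For example, a 10s base interval comes back anywhere in [9s, 11s).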
func jitterInterval(d time.Duration) time.Duration {
if d <= 0 {
return d
}
return time.Duration(float64(d) * (0.9 + 0.2*rand.Float64()))
}

View File

@@ -12,9 +12,12 @@ import (
"strconv"
"strings"
buildinfo "git.ipng.ch/ipng/vpp-maglev/cmd"
"git.ipng.ch/ipng/vpp-maglev/internal/health"
)
var userAgent = "maglev-healthchecker/" + buildinfo.Version() + " (+https://git.ipng.ch/ipng/vpp-maglev)"
// HTTPProbe sends a plain HTTP GET to cfg.Target inside the healthcheck netns.
func HTTPProbe(ctx context.Context, cfg ProbeConfig) health.ProbeResult {
return doHTTPProbe(ctx, cfg, false)
@@ -110,7 +113,7 @@ func doHTTPProbe(ctx context.Context, cfg ProbeConfig, useTLS bool) health.Probe
return health.ProbeResult{OK: false, Layer: health.LayerL7, Code: "L7RSP", Detail: err.Error()}
}
req.Host = hostHeader
req.Header.Set("User-Agent", "maglev-healthchecker/1.0")
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {

View File

@@ -79,7 +79,7 @@ func (c *Client) GetLBStateAll() (*LBState, error) {
return nil, err
}
for i := range vips {
ases, err := dumpASesForVIP(ch, vips[i].Protocol, vips[i].Port)
ases, err := dumpASesForVIP(ch, vips[i].Prefix, vips[i].Protocol, vips[i].Port)
if err != nil {
return nil, err
}
@@ -159,7 +159,7 @@ func lookupVIP(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16
if all[i].Protocol != protocol || all[i].Port != port {
continue
}
ases, err := dumpASesForVIP(ch, protocol, port)
ases, err := dumpASesForVIP(ch, all[i].Prefix, protocol, port)
if err != nil {
return nil, err
}
@@ -170,13 +170,22 @@ func lookupVIP(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16
}
// dumpASesForVIP returns the application servers bound to the VIP identified
// by (protocol, port). VPP's lb_as_v2_dump filter is used; we also guard
// defensively against replies for other VIPs.
func dumpASesForVIP(ch *loggedChannel, protocol uint8, port uint16) ([]LBAS, error) {
// by (prefix, protocol, port).
//
// VPP's lb_as_v2_dump does not honour the request's Pfx field — the LB
// plugin only filters on (protocol, port), so a single dump call returns
// ASes for every VIP sharing that proto+port pair (e.g. an IPv4 and an
// IPv6 VIP both listening on TCP/80). We do the prefix filter in Go on
// the response side. Earlier we tried setting Pfx in the request as
// well; for reasons unknown that silently dropped every IPv4 reply, so
// this code intentionally leaves Pfx zero and relies entirely on
// post-filtering.
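//
// For example (addresses illustrative): with 192.0.2.1/32 and
// 2001:db8::1/128 both configured as TCP/80 VIPs, a single dump returns
// the ASes of both, and the prefix check below keeps only the requested
// VIP's entries.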
func dumpASesForVIP(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16) ([]LBAS, error) {
req := &lb.LbAsV2Dump{
Protocol: protocol,
Port: port,
}
want := prefix.String()
reqCtx := ch.SendMultiRequest(req)
var out []LBAS
for {
@@ -191,6 +200,9 @@ func dumpASesForVIP(ch *loggedChannel, protocol uint8, port uint16) ([]LBAS, err
if reply.Vip.Port != port || uint8(reply.Vip.Protocol) != protocol {
continue
}
if lbVipPrefix(reply.Vip).String() != want {
continue
}
var inUse time.Time
if reply.InUseSince != 0 {
inUse = time.Unix(int64(reply.InUseSince), 0)