LB buckets column + health cascade; VPP dump fix; maglevc strictness
SPA (cmd/frontend/web):
- New "lb buckets" column backed by a 1s-debounced GetVPPLBState fetch loop with leading+trailing edge coalesce.
- Per-frontend health icon (✅/⚠️/❗/‼️/❓) in the Zippy header, gated by a settling flag that suppresses ‼️ until the next lb-state reconciliation after a backend transition or weight change.
- In-place leaf merge on lb-state so stable bucket values (e.g. "0") don't retrigger the Flash animation on every refresh.
- Zippy cards remember open state in a cookie, default closed on fresh load; fixed-width frontend-title-name + reserved icon slot so headers line up across all cards.
- Clock-drift watchdog in sse.ts that forces a fresh EventSource on laptop-wake so the broker emits a resync instead of hanging on a dead half-open socket.

Frontend service (cmd/frontend):
- maglevClient.lbStateLoop, triggered on backend transitions + vpp-connect, plus a best-effort fetch on refreshAll.
- Admin handlers explicitly wake the lb-state loop after lifecycle ops and set-weight (the latter emits no transition event on the maglevd side, so the WatchEvents path wouldn't have caught it).
- /favicon.ico served from the embedded web/public IPng logo.

VPP integration:
- internal/vpp/lbstate.go: dumpASesForVIP drops Pfx from the dump request (setting it silently wipes IPv4 replies in the LB plugin) and filters results by prefix on the response side instead, which also demuxes multi-VIP-on-same-port cases correctly (a hedged sketch of this response-side filtering follows this message).

maglevc:
- Walk now returns the unconsumed token tail; dispatch and the question listener reject unknown commands with a targeted error instead of dumping the full command tree prefixed with garbage.
- On '?', echo the current line (including the '?') before the help list so the output reads like birdc.

Checker / prober:
- internal/checker: ±10% jitter on NextInterval so probes across a restart don't all fire on the same tick (see the jitter sketch below).
- internal/prober: HTTP User-Agent now carries the build version and project URL.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
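A minimal sketch of the response-side filtering mentioned under "VPP integration", assuming a stand-in asDetail record rather than the real binapi reply type; filterASesForVIP, VIPPrefix, Protocol, Port and Address are illustrative names, not the actual lbstate.go API:

package vpp

// filterASesForVIP keeps only the dump replies that belong to the requested
// VIP prefix/protocol/port tuple. Matching on the response side (instead of
// setting Pfx in the dump request, which wipes IPv4 replies in the LB plugin)
// also demuxes several VIPs that share the same port.
type asDetail struct {
    VIPPrefix string // e.g. "192.0.2.1/32"
    Protocol  uint32 // 6 = tcp, 17 = udp, 255 = any
    Port      uint32
    Address   string // application-server address
}

func filterASesForVIP(all []asDetail, prefix string, proto, port uint32) []asDetail {
    var out []asDetail
    for _, d := range all {
        if d.VIPPrefix == prefix && d.Protocol == proto && d.Port == port {
            out = append(out, d)
        }
    }
    return out
}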
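And a minimal sketch of the ±10% probe-interval jitter mentioned under "Checker / prober", assuming a standalone helper rather than the real internal/checker types:

package checker

import (
    "math/rand"
    "time"
)

// NextInterval perturbs the base probe interval by up to ±10% so probes
// that all started on the same tick (e.g. right after a restart) drift
// apart instead of firing together forever.
func NextInterval(base time.Duration) time.Duration {
    // rand.Float64() is in [0, 1); shift it to [-1, 1) and scale to ±10%.
    jitter := (rand.Float64()*2 - 1) * 0.10
    return time.Duration(float64(base) * (1 + jitter))
}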
@@ -33,6 +33,12 @@ type maglevClient struct {
    connected bool
    lastErr   string
    cache     cachedState

    // lbWakeCh is a buffer-1 trigger channel feeding lbStateLoop. Every
    // backend transition (and a few other events) does a non-blocking send
    // here; the loop coalesces bursts into at most one GetVPPLBState call
    // per second. See lbStateLoop for the leading+trailing-edge debounce.
    lbWakeCh chan struct{}
}

// cachedState is the per-maglevd snapshot served via the REST handlers.
@@ -49,6 +55,7 @@ type cachedState struct {
    HealthCheckOrder []string
    VPPInfo          *VPPInfoSnapshot
    VPPState         string // "", "connected", "disconnected"
    LBState          *LBStateSnapshot
    LastRefresh      time.Time
}

@@ -69,6 +76,7 @@ func newMaglevClient(address string, broker *Broker) (*maglevClient, error) {
            Backends:     map[string]*BackendSnapshot{},
            HealthChecks: map[string]*HealthCheckSnapshot{},
        },
        lbWakeCh: make(chan struct{}, 1),
    }, nil
}

@@ -147,6 +155,7 @@ func (c *maglevClient) Start(ctx context.Context) {
    go c.watchLoop(ctx)
    go c.refreshLoop(ctx)
    go c.healthLoop(ctx)
    go c.lbStateLoop(ctx)
}

func (c *maglevClient) setConnected(ok bool, errMsg string) {
@@ -196,6 +205,7 @@ func (c *maglevClient) Snapshot() *StateSnapshot {
        HealthChecks: make([]*HealthCheckSnapshot, 0, len(c.cache.HealthCheckOrder)),
        VPPInfo:      c.cache.VPPInfo,
        VPPState:     c.cache.VPPState,
        LBState:      c.cache.LBState,
    }
    for _, name := range c.cache.FrontendsOrder {
        if f, ok := c.cache.Frontends[name]; ok {
@@ -302,6 +312,11 @@ func (c *maglevClient) refreshAll(ctx context.Context) error {
    c.cache.VPPState = vppState
    c.cache.LastRefresh = time.Now()
    c.mu.Unlock()
    // Best-effort LB state pull so /view/api/state served on a fresh
    // page load already carries the bucket column. Errors are
    // swallowed by fetchLBStateAndPublish (which clears the cache and
    // emits an empty event so the SPA renders "—").
    c.fetchLBStateAndPublish(ctx)
    return nil
}

@@ -434,6 +449,11 @@ func (c *maglevClient) handleEvent(ev *grpcapi.Event) {
            AtUnixNs: tr.AtUnixNs,
            Payload:  payload,
        })
        // A real transition means VPP is about to (or already did)
        // reshuffle bucket allocations across the affected VIP. Wake
        // the lb-state loop so the SPA's bucket column converges
        // without waiting for the 30s refresh.
        c.triggerLBStateFetch()

    case *grpcapi.Event_Frontend:
        fe := body.Frontend
@@ -544,6 +564,11 @@ func (c *maglevClient) applyVPPLogHeartbeat(msg string) {
        AtUnixNs: time.Now().UnixNano(),
        Payload:  payload,
    })
    // VPP just came back: pull fresh LB state so the bucket column
    // repopulates immediately instead of waiting up to 30s for the
    // next refresh tick. On vpp-disconnect the next fetch will fail
    // and clear the cache, which is also the right behaviour.
    c.triggerLBStateFetch()
}

func (c *maglevClient) applyBackendTransition(name string, tr *TransitionRecord) {
@@ -677,6 +702,175 @@ func transitionFromProto(t *grpcapi.TransitionRecord) *TransitionRecord {
    }
}

// triggerLBStateFetch sends a non-blocking wake to lbStateLoop. The
// channel has buffer 1 so coalesced bursts never block the publisher.
func (c *maglevClient) triggerLBStateFetch() {
    select {
    case c.lbWakeCh <- struct{}{}:
    default:
    }
}

// lbStateLoop consumes wake signals and calls GetVPPLBState, with a
// leading+trailing-edge debounce so we never exceed one fetch per
// minLBInterval (1s). The leading edge means the very first wake after
// an idle period fires immediately — important so a single isolated
// transition isn't artificially delayed by a second. The trailing edge
// means a burst of wakes during the cool-down still gets one final
// fetch right after the gate opens, so the SPA always converges to a
// post-burst snapshot rather than missing the last update.
func (c *maglevClient) lbStateLoop(ctx context.Context) {
    const minLBInterval = time.Second
    var (
        timer     *time.Timer
        lastFetch time.Time
    )
    timerCh := func() <-chan time.Time {
        if timer == nil {
            return nil
        }
        return timer.C
    }
    fire := func() {
        if timer != nil {
            if !timer.Stop() {
                select {
                case <-timer.C:
                default:
                }
            }
            timer = nil
        }
        c.fetchLBStateAndPublish(ctx)
        lastFetch = time.Now()
    }
    for {
        select {
        case <-ctx.Done():
            return
        case <-c.lbWakeCh:
            wait := minLBInterval - time.Since(lastFetch)
            if wait <= 0 {
                fire()
            } else if timer == nil {
                timer = time.NewTimer(wait)
            }
        case <-timerCh():
            timer = nil
            fire()
        }
    }
}

// fetchLBStateAndPublish runs one GetVPPLBState round-trip, rebuilds
// the per-frontend bucket map, swaps it into the cache, and broadcasts
// a "lb-state" BrowserEvent. On error the cache is cleared and an
// empty event is published so the SPA can switch the bucket column to
// em-dashes — clear-on-error is simpler than stale-but-visible and
// doesn't risk showing a confusing snapshot from before VPP died.
func (c *maglevClient) fetchLBStateAndPublish(ctx context.Context) {
    fctx, cancel := context.WithTimeout(ctx, 5*time.Second)
    defer cancel()
    lbs, err := c.api.GetVPPLBState(fctx, &grpcapi.GetVPPLBStateRequest{})
    if err != nil {
        c.mu.Lock()
        had := c.cache.LBState != nil
        c.cache.LBState = nil
        c.mu.Unlock()
        slog.Debug("lb-state-fetch", "maglevd", c.name, "err", err)
        if had {
            c.publishLBState(nil)
        }
        return
    }
    snap := c.buildLBStateSnapshot(lbs)
    c.mu.Lock()
    c.cache.LBState = snap
    c.mu.Unlock()
    c.publishLBState(snap.PerFrontend)
}

func (c *maglevClient) publishLBState(perFrontend map[string]map[string]int32) {
    payload, _ := json.Marshal(LBStatePayload{PerFrontend: perFrontend})
    c.broker.Publish(BrowserEvent{
        Maglevd:  c.name,
        Type:     "lb-state",
        AtUnixNs: time.Now().UnixNano(),
        Payload:  payload,
    })
}

// buildLBStateSnapshot translates a VPP-side state record (keyed by
// CIDR/protocol/port and AS address) into a maglev-side record (keyed
// by frontend name and backend name). Unmatched VIPs and unmatched AS
// addresses are silently skipped — they're benign side effects of a
// transient sync gap or a backend address that's only present in one
// of the two universes.
func (c *maglevClient) buildLBStateSnapshot(lbs *grpcapi.VPPLBState) *LBStateSnapshot {
    c.mu.RLock()
    feByVIP := make(map[string]string, len(c.cache.Frontends))
    for _, f := range c.cache.Frontends {
        feByVIP[lbVIPKey(f.Address, f.Protocol, f.Port)] = f.Name
    }
    backendByAddr := make(map[string]string, len(c.cache.Backends))
    for _, b := range c.cache.Backends {
        backendByAddr[b.Address] = b.Name
    }
    c.mu.RUnlock()

    out := &LBStateSnapshot{PerFrontend: map[string]map[string]int32{}}
    for _, v := range lbs.GetVips() {
        feName, ok := feByVIP[lbVIPKey(stripLBHostMask(v.GetPrefix()), lbProtoString(v.GetProtocol()), v.GetPort())]
        if !ok {
            continue
        }
        row := out.PerFrontend[feName]
        if row == nil {
            row = map[string]int32{}
            out.PerFrontend[feName] = row
        }
        for _, as := range v.GetApplicationServers() {
            bname, ok := backendByAddr[as.GetAddress()]
            if !ok {
                continue
            }
            row[bname] = int32(as.GetNumBuckets())
        }
    }
    return out
}

// lbVIPKey is the join key between a maglev FrontendSnapshot and a
// VPP-side VPPLBVIP record. Stripping the mask and lower-casing the
// protocol gives a canonical form that both sides can produce.
func lbVIPKey(addr, proto string, port uint32) string {
    return fmt.Sprintf("%s/%s/%d", addr, strings.ToLower(proto), port)
}

// lbProtoString mirrors maglevc's protoString — kept local to avoid a
// cross-package import for two trivial helpers.
func lbProtoString(p uint32) string {
    switch p {
    case 6:
        return "tcp"
    case 17:
        return "udp"
    case 255:
        return "any"
    }
    return fmt.Sprintf("%d", p)
}

// stripLBHostMask trims "/32" or "/128" from a VPP host-prefix VIP so
// it can be compared against a maglev FrontendSnapshot.Address (which
// is bare). Other shapes are returned unchanged.
func stripLBHostMask(prefix string) string {
    if strings.HasSuffix(prefix, "/32") || strings.HasSuffix(prefix, "/128") {
        return prefix[:strings.LastIndexByte(prefix, '/')]
    }
    return prefix
}

func healthCheckFromProto(h *grpcapi.HealthCheckInfo) *HealthCheckSnapshot {
    return &HealthCheckSnapshot{
        Name: h.GetName(),