Files
vpp-maglev/cmd/frontend/client.go
Pim van Pelt 284b4cc9a4 New maglev-frontend component; promote LB sync events to INFO
Introduces maglev-frontend, a responsive, real-time web dashboard for one
or more running maglevd instances. Source lives at cmd/frontend/; the
built binary is maglev-frontend. It is a single Go process with the
SolidJS SPA embedded via //go:embed — no runtime file dependencies.
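
The embed wiring can look roughly like the sketch below; spaHandler is an
illustrative name, not necessarily the actual code, but web/dist/ and the
/view/ mount match what this commit describes:

    package main

    import (
        "embed"
        "io/fs"
        "net/http"
    )

    // The Vite build output is checked in at web/dist/ so a bare
    // `go build` works without running npm.
    //
    //go:embed all:web/dist
    var webFS embed.FS

    // spaHandler serves the embedded bundle under /view/.
    func spaHandler() http.Handler {
        sub, err := fs.Sub(webFS, "web/dist")
        if err != nil {
            panic(err) // embed paths are fixed at build time
        }
        return http.StripPrefix("/view/", http.FileServer(http.FS(sub)))
    }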

Architecture
 - One persistent gRPC connection per configured maglevd (-server A,B,C).
   Each connection runs three background loops: a WatchEvents stream
   subscribed at log_level=debug for live events, a 30s refresh loop as
   a safety net for drift, and a 5s health loop that surfaces connection
   drops quickly.
 - In-process pub/sub broker with a 30s / 2000-event replay ring using
   <epoch>-<seq> monotonic IDs. Short browser reconnects (nginx idle,
   wifi flap, laptop wake) silently replay buffered events via the
   EventSource Last-Event-ID header; longer outages or frontend restarts
   fall through to a "resync" event that triggers a full state refetch.
 - HTTP surface: /view/ (SPA), /view/api/state, /view/api/state/{name},
   /view/api/maglevds, /view/api/version, /view/api/events (SSE),
   /healthz, and an /admin/* placeholder returning 501 for a future
   basic-auth mutation surface.
 - SSE handler follows the full operational checklist: retry hint, 15s
   : ping heartbeat, Flush after every write, r.Context().Done() teardown,
   X-Accel-Buffering: no, and no gzip. A sketch follows this list.
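
For reference, a self-contained sketch of an SSE handler that follows the
checklist above. The subscribe hook and sseEvent type are stand-ins for
the real broker API, which is not shown here; the headers and wire format
are standard SSE:

    package main

    import (
        "fmt"
        "net/http"
        "time"
    )

    // sseEvent stands in for what the broker delivers: an <epoch>-<seq>
    // ID, an event type, and a JSON payload.
    type sseEvent struct {
        ID, Type string
        Data     []byte
    }

    // handleEvents takes a hypothetical broker hook: given the browser's
    // Last-Event-ID it either replays from the ring or starts with a
    // synthetic "resync" event.
    func handleEvents(subscribe func(lastID string) (<-chan sseEvent, func())) http.HandlerFunc {
        return func(w http.ResponseWriter, r *http.Request) {
            flusher, ok := w.(http.Flusher)
            if !ok {
                http.Error(w, "streaming unsupported", http.StatusInternalServerError)
                return
            }
            w.Header().Set("Content-Type", "text/event-stream")
            w.Header().Set("Cache-Control", "no-cache")
            w.Header().Set("X-Accel-Buffering", "no") // tell nginx not to buffer

            fmt.Fprint(w, "retry: 2000\n\n") // reconnect hint for the browser
            flusher.Flush()

            events, cancel := subscribe(r.Header.Get("Last-Event-ID"))
            defer cancel()

            ping := time.NewTicker(15 * time.Second)
            defer ping.Stop()
            for {
                select {
                case <-r.Context().Done(): // browser went away; tear down
                    return
                case <-ping.C:
                    fmt.Fprint(w, ": ping\n\n") // comment heartbeat keeps idle proxies open
                    flusher.Flush()
                case ev := <-events:
                    fmt.Fprintf(w, "id: %s\nevent: %s\ndata: %s\n\n", ev.ID, ev.Type, ev.Data)
                    flusher.Flush()
                }
            }
        }
    }

Mounting this handler outside any gzip middleware covers the final
checklist item.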

SolidJS SPA (cmd/frontend/web/, Vite + TypeScript)
 - solid-js/store for a reactive per-maglevd state tree; reducers apply
   backend transitions, maglevd-status flips, and resync refetches.
 - Scope selector tabs for multi-maglevd support, per-maglevd frontend
   cards with pool tables showing state, configured weight, effective
   weight, and last-transition age.
 - ProbeHeartbeat component turns a middle-dot into ❤️ on probe-start and
   back on probe-done, driven by real log events; fixed-size wrapper so
   the emoji swap doesn't jiggle the row.
 - Flash wrapper animates any primitive on change (1s yellow fade via
   Web Animations API, skipped on first mount). Wired into the state
   badge, configured weight, and effective weight columns.
 - DebugPanel: chronological rolling event tail with tail-style
   auto-scroll, pause/resume, and a scope/firehose filter. Syntax
   highlighting for vpp-lb-sync-* events with fixed-order attribute
   formatting.
 - Live effective_weight updates: vpp-lb-sync-as-added/removed/weight-
   updated log events are routed through a reducer that walks the
   snapshot's pool rows and sets effective_weight on every match
   without waiting for the 30s refresh.
 - Header shows build version + commit with build date in a tooltip,
   fetched once from /view/api/version on mount.
 - Prettier wired in as the web-side fixstyle; make fixstyle now tidies
   both Go and web in one shot via a new fixstyle-web target.

Per-mutation VPP LB sync logging
 - Promotes the addVIP/delVIP/addAS/delAS/setASWeight helpers from
   slog.Debug to slog.Info and renames them from vpp-lbsync-* to
   vpp-lb-sync-{vip-added,vip-removed,as-added,as-removed,as-weight-
   updated}. Matching rename for vpp-lb-sync-start / -done / -error /
   -vip-recreate. The Prometheus metric name (maglev_vpp_lbsync_total)
   is left alone to preserve dashboards.
 - setASWeight now takes the prior weight so the event can emit
   from=X to=Y and the UI can show the delta; see the sketch after
   this list.
 - The vip field in every event is the bare address (no /32 or /128
   mask), matching the CLI output style.
 - Any listener on the gRPC WatchEvents stream — CLI watch events or
   maglev-frontend — now sees every VIP/AS dataplane change in real
   time without needing to raise the log level.
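
As a sketch, the renamed weight event can be emitted like so;
logASWeightUpdated and its signature are illustrative, not the actual
maglevd helper:

    package main

    import (
        "log/slog"
        "net/netip"
    )

    // logASWeightUpdated shows the renamed event with the prior weight
    // threaded through and the VIP logged as a bare address.
    func logASWeightUpdated(vip netip.Prefix, as netip.Addr, from, to uint32) {
        slog.Info("vpp-lb-sync-as-weight-updated",
            "vip", vip.Addr().String(), // bare address, no /32 or /128 mask
            "as", as.String(),
            "from", from,
            "to", to,
        )
    }

    func main() {
        vip := netip.MustParsePrefix("192.0.2.10/32")
        as := netip.MustParseAddr("10.0.0.5")
        logASWeightUpdated(vip, as, 100, 80)
    }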

Build and tooling
 - Makefile: maglev-frontend added to BINARIES; build / build-amd64 /
   build-arm64 emit the binary alongside maglevd and maglevc. A new
   maglev-frontend-web target rebuilds the SolidJS bundle via npm.
 - web/dist/ is tracked so a bare `go build` keeps working for Go-only
   contributors and CI.
 - .gitignore skips cmd/frontend/web/node_modules/.

Stability fixes
 - maglevd's WatchEvents synthetic replay events (from==to, at_unix_ns=0)
   were corrupting the frontend's LastTransition cache with at=0,
   rendering as "20555d ago" in the browser. Client now skips synthetic
   events: the cache comes from refreshAll and doesn't need them.
 - Frontends, Backends, and HealthChecks are now served in the order
   returned by the corresponding List* RPC instead of Go map iteration
   order, so reloads and refreshes keep the SPA stable.
2026-04-12 17:48:31 +02:00

501 lines · 14 KiB · Go

// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>

package main

import (
    "context"
    "encoding/json"
    "errors"
    "fmt"
    "io"
    "log/slog"
    "net"
    "strings"
    "sync"
    "time"

    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"

    "git.ipng.ch/ipng/vpp-maglev/internal/grpcapi"
)

// maglevClient is a per-maglevd gRPC client plus cache and background loops.
type maglevClient struct {
    name    string
    address string
    conn    *grpc.ClientConn
    api     grpcapi.MaglevClient
    broker  *Broker

    mu        sync.RWMutex
    connected bool
    lastErr   string
    cache     cachedState
}

// cachedState is the per-maglevd snapshot served via the REST handlers.
// Frontends / Backends / HealthChecks are maps for O(1) lookup from the
// event path, and the *Order slices preserve the order returned by the
// corresponding List* RPC so the UI renders in a stable order across
// reloads instead of Go map iteration's randomised order.
type cachedState struct {
    Frontends        map[string]*FrontendSnapshot
    FrontendsOrder   []string
    Backends         map[string]*BackendSnapshot
    BackendsOrder    []string
    HealthChecks     map[string]*HealthCheckSnapshot
    HealthCheckOrder []string
    VPPInfo          *VPPInfoSnapshot
    LastRefresh      time.Time
}

// newMaglevClient dials the given maglevd address (non-blocking; gRPC
// connects lazily) and returns a client with an empty cache.
func newMaglevClient(address string, broker *Broker) (*maglevClient, error) {
    conn, err := grpc.NewClient(address,
        grpc.WithTransportCredentials(insecure.NewCredentials()))
    if err != nil {
        return nil, err
    }
    return &maglevClient{
        name:    hostnameOf(address),
        address: address,
        conn:    conn,
        api:     grpcapi.NewMaglevClient(conn),
        broker:  broker,
        cache: cachedState{
            Frontends:    map[string]*FrontendSnapshot{},
            Backends:     map[string]*BackendSnapshot{},
            HealthChecks: map[string]*HealthCheckSnapshot{},
        },
    }, nil
}

// hostnameOf strips the port from an address and returns a short display
// name. For DNS names we take the first label ("lb-ams.internal:9090" →
// "lb-ams"). For IP literals we return the full address so we don't
// accidentally truncate "127.0.0.1" to "127".
func hostnameOf(address string) string {
    host := address
    if h, _, err := net.SplitHostPort(address); err == nil {
        host = h
    }
    host = strings.TrimPrefix(strings.TrimSuffix(host, "]"), "[")
    if net.ParseIP(host) != nil {
        return host
    }
    if i := strings.Index(host, "."); i >= 0 {
        return host[:i]
    }
    return host
}

// Close tears down the underlying gRPC connection.
func (c *maglevClient) Close() {
    _ = c.conn.Close()
}

// Start launches the three background loops; they run until ctx is
// cancelled.
func (c *maglevClient) Start(ctx context.Context) {
    go c.watchLoop(ctx)
    go c.refreshLoop(ctx)
    go c.healthLoop(ctx)
}

// setConnected records the connection state and, on every change,
// publishes a maglevd-status event so browsers flip their indicator.
func (c *maglevClient) setConnected(ok bool, errMsg string) {
    c.mu.Lock()
    prev := c.connected
    c.connected = ok
    c.lastErr = errMsg
    c.mu.Unlock()
    if prev != ok {
        payload, _ := json.Marshal(MaglevdStatusPayload{Connected: ok, LastError: errMsg})
        c.broker.Publish(BrowserEvent{
            Maglevd:  c.name,
            Type:     "maglevd-status",
            AtUnixNs: time.Now().UnixNano(),
            Payload:  payload,
        })
    }
}

// Info returns the current connection status for this maglevd.
func (c *maglevClient) Info() MaglevdInfo {
    c.mu.RLock()
    defer c.mu.RUnlock()
    return MaglevdInfo{
        Name:      c.name,
        Address:   c.address,
        Connected: c.connected,
        LastError: c.lastErr,
    }
}

// Snapshot returns a deep-ish copy of the cached state for REST handlers.
// Iteration order follows the corresponding *Order slice so the UI sees a
// stable, RPC-defined order across reloads.
func (c *maglevClient) Snapshot() *StateSnapshot {
    c.mu.RLock()
    defer c.mu.RUnlock()
    snap := &StateSnapshot{
        Maglevd: MaglevdInfo{
            Name:      c.name,
            Address:   c.address,
            Connected: c.connected,
            LastError: c.lastErr,
        },
        Frontends:    make([]*FrontendSnapshot, 0, len(c.cache.FrontendsOrder)),
        Backends:     make([]*BackendSnapshot, 0, len(c.cache.BackendsOrder)),
        HealthChecks: make([]*HealthCheckSnapshot, 0, len(c.cache.HealthCheckOrder)),
        VPPInfo:      c.cache.VPPInfo,
    }
    for _, name := range c.cache.FrontendsOrder {
        if f, ok := c.cache.Frontends[name]; ok {
            snap.Frontends = append(snap.Frontends, f)
        }
    }
    for _, name := range c.cache.BackendsOrder {
        if b, ok := c.cache.Backends[name]; ok {
            snap.Backends = append(snap.Backends, b)
        }
    }
    for _, name := range c.cache.HealthCheckOrder {
        if h, ok := c.cache.HealthChecks[name]; ok {
            snap.HealthChecks = append(snap.HealthChecks, h)
        }
    }
    return snap
}

// refreshAll pulls a full fresh view of the maglevd's state into the cache.
// Called from the refreshLoop every 30s and immediately after a successful
// reconnect.
func (c *maglevClient) refreshAll(ctx context.Context) error {
    rctx, cancel := context.WithTimeout(ctx, 10*time.Second)
    defer cancel()

    frontends := map[string]*FrontendSnapshot{}
    fl, err := c.api.ListFrontends(rctx, &grpcapi.ListFrontendsRequest{})
    if err != nil {
        return fmt.Errorf("list frontends: %w", err)
    }
    frontendsOrder := append([]string(nil), fl.GetFrontendNames()...)
    for _, name := range frontendsOrder {
        fi, err := c.api.GetFrontend(rctx, &grpcapi.GetFrontendRequest{Name: name})
        if err != nil {
            return fmt.Errorf("get frontend %s: %w", name, err)
        }
        frontends[name] = frontendFromProto(fi)
    }

    backends := map[string]*BackendSnapshot{}
    bl, err := c.api.ListBackends(rctx, &grpcapi.ListBackendsRequest{})
    if err != nil {
        return fmt.Errorf("list backends: %w", err)
    }
    backendsOrder := append([]string(nil), bl.GetBackendNames()...)
    for _, name := range backendsOrder {
        bi, err := c.api.GetBackend(rctx, &grpcapi.GetBackendRequest{Name: name})
        if err != nil {
            return fmt.Errorf("get backend %s: %w", name, err)
        }
        backends[name] = backendFromProto(bi)
    }

    healthchecks := map[string]*HealthCheckSnapshot{}
    hl, err := c.api.ListHealthChecks(rctx, &grpcapi.ListHealthChecksRequest{})
    if err != nil {
        return fmt.Errorf("list healthchecks: %w", err)
    }
    healthCheckOrder := append([]string(nil), hl.GetNames()...)
    for _, name := range healthCheckOrder {
        hi, err := c.api.GetHealthCheck(rctx, &grpcapi.GetHealthCheckRequest{Name: name})
        if err != nil {
            return fmt.Errorf("get healthcheck %s: %w", name, err)
        }
        healthchecks[name] = healthCheckFromProto(hi)
    }

    var vppInfo *VPPInfoSnapshot
    if vi, err := c.api.GetVPPInfo(rctx, &grpcapi.GetVPPInfoRequest{}); err == nil {
        vppInfo = &VPPInfoSnapshot{
            Version:       vi.GetVersion(),
            BuildDate:     vi.GetBuildDate(),
            PID:           vi.GetPid(),
            BoottimeNs:    vi.GetBoottimeNs(),
            ConnecttimeNs: vi.GetConnecttimeNs(),
        }
    }

    c.mu.Lock()
    c.cache.Frontends = frontends
    c.cache.FrontendsOrder = frontendsOrder
    c.cache.Backends = backends
    c.cache.BackendsOrder = backendsOrder
    c.cache.HealthChecks = healthchecks
    c.cache.HealthCheckOrder = healthCheckOrder
    c.cache.VPPInfo = vppInfo
    c.cache.LastRefresh = time.Now()
    c.mu.Unlock()
    return nil
}

// watchLoop subscribes to WatchEvents and feeds the broker until the context
// is cancelled. Reconnects with exponential backoff on stream errors.
func (c *maglevClient) watchLoop(ctx context.Context) {
    backoff := time.Second
    maxBackoff := 30 * time.Second
    for {
        if ctx.Err() != nil {
            return
        }
        if err := c.watchOnce(ctx); err != nil {
            if ctx.Err() != nil {
                return
            }
            slog.Warn("watch-disconnected", "maglevd", c.name, "err", err)
            c.setConnected(false, err.Error())
            select {
            case <-ctx.Done():
                return
            case <-time.After(backoff):
            }
            backoff *= 2
            if backoff > maxBackoff {
                backoff = maxBackoff
            }
            continue
        }
        backoff = time.Second
    }
}

// watchOnce opens a single WatchEvents stream (debug-level logs plus
// backend and frontend transitions) and pumps it until the stream or the
// context ends. A clean EOF returns nil so the caller resubscribes
// without backoff.
func (c *maglevClient) watchOnce(ctx context.Context) error {
    logFlag := true
    backendFlag := true
    frontendFlag := true
    req := &grpcapi.WatchRequest{
        Log:      &logFlag,
        LogLevel: "debug",
        Backend:  &backendFlag,
        Frontend: &frontendFlag,
    }
    stream, err := c.api.WatchEvents(ctx, req)
    if err != nil {
        return fmt.Errorf("open stream: %w", err)
    }
    // Successful subscribe: mark connected and pull a fresh snapshot so
    // the REST cache is immediately ground-truth accurate. WatchEvents
    // itself replays current state as synthetic from==to events, but
    // handleEvent deliberately skips those; the cache relies on
    // refreshAll alone.
    c.setConnected(true, "")
    if err := c.refreshAll(ctx); err != nil {
        slog.Warn("refresh-after-watch", "maglevd", c.name, "err", err)
    }
    for {
        ev, err := stream.Recv()
        if err != nil {
            if errors.Is(err, io.EOF) || ctx.Err() != nil {
                return nil
            }
            return err
        }
        c.handleEvent(ev)
    }
}

// handleEvent applies an incoming gRPC event to the local cache and
// publishes a corresponding BrowserEvent on the broker.
func (c *maglevClient) handleEvent(ev *grpcapi.Event) {
    switch body := ev.GetEvent().(type) {
    case *grpcapi.Event_Log:
        le := body.Log
        if le == nil {
            return
        }
        attrs := make(map[string]string, len(le.GetAttrs()))
        for _, a := range le.GetAttrs() {
            attrs[a.GetKey()] = a.GetValue()
        }
        payload, _ := json.Marshal(LogEventPayload{
            Level: le.GetLevel(),
            Msg:   le.GetMsg(),
            Attrs: attrs,
        })
        c.broker.Publish(BrowserEvent{
            Maglevd:  c.name,
            Type:     "log",
            AtUnixNs: le.GetAtUnixNs(),
            Payload:  payload,
        })
    case *grpcapi.Event_Backend:
        be := body.Backend
        if be == nil || be.GetTransition() == nil {
            return
        }
        tr := transitionFromProto(be.GetTransition())
        // maglevd replays current state on WatchEvents subscribe as a
        // synthetic event with from==to and at_unix_ns=0 (see
        // internal/grpcapi/server.go). It is not a real transition — the
        // in-process cache is already correct from refreshAll, so don't
        // touch LastTransition (which would clobber it with at=0 and
        // render as "55 years ago" in the browser) and don't forward to
        // the broker.
        if tr.From == tr.To {
            return
        }
        c.applyBackendTransition(be.GetBackendName(), tr)
        payload, _ := json.Marshal(BackendEventPayload{
            Backend:    be.GetBackendName(),
            Transition: *tr,
        })
        c.broker.Publish(BrowserEvent{
            Maglevd:  c.name,
            Type:     "backend",
            AtUnixNs: tr.AtUnixNs,
            Payload:  payload,
        })
    case *grpcapi.Event_Frontend:
        fe := body.Frontend
        if fe == nil || fe.GetTransition() == nil {
            return
        }
        tr := transitionFromProto(fe.GetTransition())
        if tr.From == tr.To {
            return
        }
        payload, _ := json.Marshal(FrontendEventPayload{
            Frontend:   fe.GetFrontendName(),
            Transition: *tr,
        })
        c.broker.Publish(BrowserEvent{
            Maglevd:  c.name,
            Type:     "frontend",
            AtUnixNs: tr.AtUnixNs,
            Payload:  payload,
        })
    }
}

// applyBackendTransition folds a live backend state change into the cache,
// creating the snapshot entry (and appending to the stable order) the
// first time a backend is seen via the event path.
func (c *maglevClient) applyBackendTransition(name string, tr *TransitionRecord) {
    c.mu.Lock()
    defer c.mu.Unlock()
    b, ok := c.cache.Backends[name]
    if !ok {
        b = &BackendSnapshot{Name: name}
        c.cache.Backends[name] = b
        c.cache.BackendsOrder = append(c.cache.BackendsOrder, name)
    }
    b.State = tr.To
    b.LastTransition = tr
    b.Transitions = append(b.Transitions, tr)
    // Cap history to the most recent 20 entries to mirror what maglevd
    // returns from GetBackend.
    if len(b.Transitions) > 20 {
        b.Transitions = b.Transitions[len(b.Transitions)-20:]
    }
}

// refreshLoop pulls a fresh snapshot every 30s to catch anything the live
// event stream may have missed (e.g. during a brief gRPC reconnect).
func (c *maglevClient) refreshLoop(ctx context.Context) {
    t := time.NewTicker(30 * time.Second)
    defer t.Stop()
    for {
        select {
        case <-ctx.Done():
            return
        case <-t.C:
            if err := c.refreshAll(ctx); err != nil {
                slog.Debug("refresh-all", "maglevd", c.name, "err", err)
            }
        }
    }
}

// healthLoop issues a cheap GetVPPInfo every 5s to surface connection drops
// quickly. Errors flip the connection indicator; recoveries trigger a
// refreshAll so the cache catches up.
func (c *maglevClient) healthLoop(ctx context.Context) {
    t := time.NewTicker(5 * time.Second)
    defer t.Stop()
    for {
        select {
        case <-ctx.Done():
            return
        case <-t.C:
            hctx, cancel := context.WithTimeout(ctx, 2*time.Second)
            _, err := c.api.GetVPPInfo(hctx, &grpcapi.GetVPPInfoRequest{})
            cancel()
            if err != nil {
                c.setConnected(false, err.Error())
                continue
            }
            c.mu.RLock()
            recovered := !c.connected
            c.mu.RUnlock()
            c.setConnected(true, "")
            if recovered {
                if err := c.refreshAll(ctx); err != nil {
                    slog.Debug("refresh-after-recover", "maglevd", c.name, "err", err)
                }
            }
        }
    }
}

// ---- proto → JSON helpers --------------------------------------------------

// frontendFromProto converts a FrontendInfo proto into its REST snapshot,
// including per-pool backend weights.
func frontendFromProto(fi *grpcapi.FrontendInfo) *FrontendSnapshot {
    out := &FrontendSnapshot{
        Name:        fi.GetName(),
        Address:     fi.GetAddress(),
        Protocol:    fi.GetProtocol(),
        Port:        fi.GetPort(),
        Description: fi.GetDescription(),
        SrcIPSticky: fi.GetSrcIpSticky(),
    }
    for _, p := range fi.GetPools() {
        ps := &PoolSnapshot{Name: p.GetName()}
        for _, pb := range p.GetBackends() {
            ps.Backends = append(ps.Backends, &PoolBackendSnapshot{
                Name:            pb.GetName(),
                Weight:          pb.GetWeight(),
                EffectiveWeight: pb.GetEffectiveWeight(),
            })
        }
        out.Pools = append(out.Pools, ps)
    }
    return out
}

// backendFromProto converts a BackendInfo proto, deriving LastTransition
// from the newest entry in the transition history.
func backendFromProto(bi *grpcapi.BackendInfo) *BackendSnapshot {
    out := &BackendSnapshot{
        Name:        bi.GetName(),
        Address:     bi.GetAddress(),
        State:       bi.GetState(),
        Enabled:     bi.GetEnabled(),
        HealthCheck: bi.GetHealthcheck(),
    }
    for _, t := range bi.GetTransitions() {
        out.Transitions = append(out.Transitions, transitionFromProto(t))
    }
    if n := len(out.Transitions); n > 0 {
        out.LastTransition = out.Transitions[n-1]
    }
    return out
}

// transitionFromProto converts a single TransitionRecord proto.
func transitionFromProto(t *grpcapi.TransitionRecord) *TransitionRecord {
    return &TransitionRecord{
        From:     t.GetFrom(),
        To:       t.GetTo(),
        AtUnixNs: t.GetAtUnixNs(),
    }
}

// healthCheckFromProto converts a HealthCheckInfo proto.
func healthCheckFromProto(h *grpcapi.HealthCheckInfo) *HealthCheckSnapshot {
    return &HealthCheckSnapshot{
        Name:           h.GetName(),
        Type:           h.GetType(),
        Port:           h.GetPort(),
        IntervalNs:     h.GetIntervalNs(),
        FastIntervalNs: h.GetFastIntervalNs(),
        DownIntervalNs: h.GetDownIntervalNs(),
        TimeoutNs:      h.GetTimeoutNs(),
        Rise:           h.GetRise(),
        Fall:           h.GetFall(),
    }
}