From d5fbf5c640e4ab1cbe1595a509337dfac3dcb398 Mon Sep 17 00:00:00 2001 From: Pim van Pelt Date: Sun, 12 Apr 2026 13:00:29 +0200 Subject: [PATCH] Prometheus: add VPP, LB sync, and gRPC metrics; expand docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New metrics plus the corresponding documentation for everything that's accumulated since the last Prometheus pass. internal/metrics/metrics.go - New VPPSource interface (IsConnected, VPPInfo) plus a metrics-local VPPInfo struct that mirrors vpp.Info. Decoupling via interface + struct-mirror keeps the dependency direction one-way (vpp → metrics), so vpp can import metrics to update inline counters without a cycle. - New Collector gauges scraped on demand: maglev_vpp_connected, maglev_vpp_uptime_seconds (from /sys/boottime), maglev_vpp_connected_seconds (time since maglevd connected), and maglev_vpp_info (static 1-gauge carrying version, build_date, and pid as labels). - New inline counters: - maglev_vpp_api_total{msg, direction, result} — bumped from the loggedChannel wrapper on every VPP binary-API send/recv. Gives full visibility into what maglevd is doing with VPP, broken down by message name, direction (send/recv), and result (success/failure). - maglev_vpp_lbsync_total{scope, kind} — bumped from the reconciler at the end of each SyncLBStateAll/SyncLBStateVIP run. kind ∈ {vip_added, vip_removed, as_added, as_removed, as_weight_updated}; scope ∈ {all, vip}. Zero-valued kinds are not emitted so noise stays low. - Register() signature now takes a VPPSource (may be nil) alongside the existing StateSource. internal/vpp/client.go - New VPPInfo() (metrics.VPPInfo, bool) shim method on *Client that satisfies metrics.VPPSource. Returns (_, false) when disconnected so the collector skips the vpp_* gauges cleanly. 
internal/vpp/apilog.go - The loggedChannel's SendRequest / SendMultiRequest / ReceiveReply paths now call metrics.VPPAPITotal.WithLabelValues(...).Inc() in addition to slog.Debug. Since every VPP API call in the codebase must go through loggedChannel (NewAPIChannel is unexported), this one instrumentation point catches everything. internal/vpp/lbsync.go - New recordSyncStats(scope, st) helper called once at the end of SyncLBStateAll and SyncLBStateVIP to bump maglev_vpp_lbsync_total. Zero-valued stats are skipped. cmd/maglevd/main.go - Added github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus for the standard gRPC server metrics (grpc_server_started_total, grpc_server_handled_total, grpc_server_handling_seconds, etc., labelled by service/method/type/code). - Constructs grpcprom.NewServerMetrics(WithServerHandlingTimeHistogram()) before creating the grpc.Server, installs it as UnaryInterceptor + StreamInterceptor, then calls InitializeMetrics(srv) after service registration so every method appears at 0 on the first scrape instead of materialising lazily on first RPC. - Passes the vppClient (or nil) as a metrics.VPPSource to metrics.Register so the vpp_* gauges are emitted when integration is enabled and silently omitted otherwise. docs/user-guide.md - New 'Prometheus metrics' section in the maglevd chapter, tabulating every metric family: backend state gauges, probe counters/histogram, transition counters, the new VPP gauges and counters, and the standard gRPC server metrics. - 'show frontends ' description updated to mention the two weight columns ('weight' = configured from YAML, 'effective' = state-aware after pool-failover logic). - Pause / disable descriptions clarified: transition history is preserved across these operator actions. 
docs/healthchecks.md - New 'Static (no-healthcheck) backends' section explaining that backends without a healthcheck use rise/fall=1, fire a synthetic passing probe immediately on startup (no 30s wait), and idle at 30s between iterations thereafter. - New 'Pool failover' section documenting the priority-tier model, the active-pool definition, when promotion happens, cascading to further tiers, and graceful drain on demotion. Points readers at 'maglevc show frontends ' as the inspection interface. docs/config-guide.md - healthcheck field doc now describes static-backend behavior and cross-references healthchecks.md. - pools field doc now explains failover semantics at a high level and cross-references the detailed healthchecks.md section. --- cmd/maglevd/main.go | 25 ++++++- docs/config-guide.md | 14 ++-- docs/healthchecks.md | 49 ++++++++++++++ docs/user-guide.md | 64 +++++++++++++++--- go.mod | 2 + go.sum | 4 ++ internal/metrics/metrics.go | 129 +++++++++++++++++++++++++++++++++--- internal/vpp/apilog.go | 18 +++-- internal/vpp/client.go | 20 ++++++ internal/vpp/lbsync.go | 25 +++++++ 10 files changed, 322 insertions(+), 28 deletions(-) diff --git a/cmd/maglevd/main.go b/cmd/maglevd/main.go index d0dbc4d..da1c8ab 100644 --- a/cmd/maglevd/main.go +++ b/cmd/maglevd/main.go @@ -13,6 +13,7 @@ import ( "os/signal" "syscall" + grpcprom "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "google.golang.org/grpc" @@ -109,16 +110,29 @@ func run() error { } // ---- gRPC server -------------------------------------------------------- + // Server-side metrics for every RPC: call counters, in-flight gauges, + // and handler-latency histograms, labelled by method and gRPC code. + // Provided by go-grpc-middleware's prometheus adapter; the metric + // families are emitted on the /metrics endpoint alongside ours. 
+ grpcMetrics := grpcprom.NewServerMetrics( + grpcprom.WithServerHandlingTimeHistogram(), + ) lis, err := net.Listen("tcp", *grpcAddr) if err != nil { return fmt.Errorf("listen %s: %w", *grpcAddr, err) } - srv := grpc.NewServer() + srv := grpc.NewServer( + grpc.UnaryInterceptor(grpcMetrics.UnaryServerInterceptor()), + grpc.StreamInterceptor(grpcMetrics.StreamServerInterceptor()), + ) maglevServer := grpcapi.NewServer(ctx, chkr, logBroadcaster, *configPath, vppClient) grpcapi.RegisterMaglevServer(srv, maglevServer) if *enableReflection { reflection.Register(srv) } + // Pre-register every method with 0 so the metric shows up on first + // scrape even before any RPC has been received. + grpcMetrics.InitializeMetrics(srv) slog.Info("grpc-listening", "addr", *grpcAddr, "reflection", *enableReflection) go func() { @@ -130,7 +144,14 @@ func run() error { // ---- Prometheus metrics ------------------------------------------------- if *metricsAddr != "" { reg := prometheus.DefaultRegisterer - metrics.Register(reg, chkr) + // vppClient may be nil when VPP integration is disabled; the + // collector handles that by skipping the vpp_* gauges. + var vppSrc metrics.VPPSource + if vppClient != nil { + vppSrc = vppClient + } + metrics.Register(reg, chkr, vppSrc) + reg.MustRegister(grpcMetrics) mux := http.NewServeMux() mux.Handle("/metrics", promhttp.Handler()) diff --git a/docs/config-guide.md b/docs/config-guide.md index 559e3f3..a5929b2 100644 --- a/docs/config-guide.md +++ b/docs/config-guide.md @@ -239,8 +239,10 @@ multiple frontends. * ***address***: Required. The IPv4 or IPv6 address of this backend server. * ***healthcheck***: The name of a health check defined in the `healthchecks` section. - When empty or omitted, no probing is performed and the backend is assumed permanently - healthy. This is useful for backends that are always available or managed by other means. 
+ When empty or omitted, the backend is static: no probing is performed and the backend + enters `StateUp` immediately on startup (via a synthetic pass, rise/fall forced to 1/1). + This is useful for backends that are always available or managed by other means. See + [healthchecks.md](healthchecks.md) for details on the static-backend behavior. * ***enabled***: A boolean controlling whether this backend participates in any frontend. When `false`, the backend is excluded entirely and no probe goroutine is started. Defaults to `true`. @@ -280,8 +282,12 @@ ordered list of backend pools. The gRPC API exposes frontends by name. frontend port is independent of the healthcheck port: a frontend on port 443 may use a healthcheck that probes port 80. * ***pools***: Required. A non-empty ordered list of pool objects. Pools express priority: - the first pool is preferred; subsequent pools act as fallbacks. All backends across all - pools in a frontend must have addresses of the same address family (all IPv4 or all IPv6). + the first pool is preferred; subsequent pools act as fallbacks. When every backend in + pool[0] leaves `StateUp` (down, paused, disabled, or not yet probed), pool[1] is + automatically promoted — its up backends take over serving traffic. The promotion + cascades across further tiers. See [healthchecks.md](healthchecks.md#pool-failover) + for the full failover semantics. All backends across all pools in a frontend must + have addresses of the same address family (all IPv4 or all IPv6). Each pool has: diff --git a/docs/healthchecks.md b/docs/healthchecks.md index 7b459dd..8474d9e 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -187,6 +187,55 @@ goroutine is not restarted and no transition event is emitted. --- +## Static (no-healthcheck) backends + +A backend with no `healthcheck` field in YAML skips the probe loop entirely. +Instead of actually probing, `maglevd` synthesises a single passing result +on startup. 
Specifically: + +- The worker's rise/fall counters are forced to `1/1`, so a single synthetic + pass is enough to reach `StateUp`. +- The first "probe" fires immediately (zero sleep). Subsequent iterations + idle at 30 seconds — there is nothing to do. +- The backend reaches `up` within milliseconds of startup. + +Static backends are useful for administrative VIPs where the caller knows the +backend is always available, or for test configurations where deterministic +state is more valuable than real health signals. + +--- + +## Pool failover + +Every frontend has one or more pools. The pools are priority tiers: pool[0] +is the primary, pool[1] is the first fallback, pool[2] the next, and so on. +At any moment, `maglevd` computes an **active pool** — the first pool that +contains at least one backend in `StateUp`: + +- As long as pool[0] has any up backend, it stays active. Its up backends + receive traffic at their configured weights; backends in lower-priority + pools stay on standby with effective weight 0. +- When pool[0] has zero up backends (all down, paused, disabled, or still + unknown), pool[1] is promoted: its up backends get their configured + weights, and pool[0] backends stay at 0 until at least one recovers. +- The same rule cascades to pool[2], pool[3], etc., for further fallback + tiers. +- When no pool has any up backend, every backend's effective weight is 0 + and the VIP serves nothing. + +Failover is evaluated on every backend state transition and also on the +periodic VPP drift reconciliation (every `maglev.vpp.lb.sync-interval`). +The resulting effective weight for each backend can be inspected via +`maglevc show frontends <name>` — each pool backend row shows both the +configured weight and the effective weight after failover. + +Demotion on recovery (e.g. pool[1] → standby when pool[0] comes back up) +drains gracefully: the demoted backends have their weight set to 0 but +existing flows in the VPP flow table are left to drain naturally. 
The only +state that forces immediate flow-table flushing is operator `disable`. + +--- + ## Log lines All state changes produce a structured log line at `INFO` level: diff --git a/docs/user-guide.md b/docs/user-guide.md index 3f4c93b..a5d3c48 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -41,8 +41,47 @@ special capabilities. All log output is written to stdout as JSON using Go's `log/slog`. The first line logged after the logger is configured is a `starting` record that includes `version`, `commit`, and `date`. Every state change emits a `backend-transition` -line at `INFO` level. Set `--log-level debug` to see individual probe attempts -and their outcomes. +line at `INFO` level. Set `--log-level debug` to see individual probe attempts, +every VPP binary-API call (`vpp-api-send` / `vpp-api-recv` with full payload), +and the per-VIP sync operations (`vpp-lbsync-vip-add`, `vpp-lbsync-as-weight`, +etc.) as they happen. + +### Prometheus metrics + +`maglevd` exposes Prometheus metrics on `--metrics-addr` (default `:9091`) at +the `/metrics` path. Metric families: + +**Health-check and backend state (gauges, on-demand):** +| Metric | Labels | Description | +|---|---|---| +| `maglev_backend_state` | `backend`, `address`, `healthcheck`, `state` | 1 for the current state row per backend, 0 otherwise. | +| `maglev_backend_health` | `backend` | Current rise/fall counter value. | +| `maglev_backend_enabled` | `backend` | 1 if enabled, 0 if disabled. | +| `maglev_frontend_pool_backend_weight` | `frontend`, `pool`, `backend` | Configured weight from YAML. | + +**Probe counters and latency (inline):** +| Metric | Labels | Description | +|---|---|---| +| `maglev_probe_total` | `backend`, `type`, `result`, `code` | Probes executed. `result` is `success` or `failure`. | +| `maglev_probe_duration_seconds` | `backend`, `type` | Histogram of probe wall time. | +| `maglev_backend_transitions_total` | `backend`, `from`, `to` | State machine transitions. 
| + +**VPP integration (when enabled):** +| Metric | Labels | Description | +|---|---|---| +| `maglev_vpp_connected` | — | 1 if maglevd currently has a live VPP connection. | +| `maglev_vpp_uptime_seconds` | — | Seconds since VPP started (from `/sys/boottime`). | +| `maglev_vpp_connected_seconds` | — | Seconds since maglevd established the current VPP connection. | +| `maglev_vpp_info` | `version`, `build_date`, `pid` | Static VPP build metadata; always 1. | +| `maglev_vpp_api_total` | `msg`, `direction`, `result` | VPP binary-API calls. `direction` is `send` or `recv`; `result` is `success` or `failure`. | +| `maglev_vpp_lbsync_total` | `scope`, `kind` | Per-mutation sync counters. `scope` is `all` or `vip`; `kind` is one of `vip_added`, `vip_removed`, `as_added`, `as_removed`, `as_weight_updated`. | + +**gRPC server (standard `go-grpc-middleware/prometheus` metrics):** +`grpc_server_started_total`, `grpc_server_handled_total`, +`grpc_server_msg_received_total`, `grpc_server_msg_sent_total`, and +`grpc_server_handling_seconds` — all labelled by `grpc_service`, +`grpc_method`, `grpc_type`, and `grpc_code`. Every method is +pre-registered at zero so time series exist on the first scrape. --- @@ -74,8 +113,12 @@ show version Print build version, commit hash, and build dat show frontends [] Without name: list all frontend names. With name: show address, protocol, port, description, - and pools. Each pool lists its backends with weights - (if != 100) and marks disabled backends with [disabled]. + and pools. Each pool lists its backends with two + weight columns: + weight — configured weight from the YAML + effective — state-aware weight after pool failover + (what gets programmed into VPP) + Disabled backends are marked with [disabled]. show backends [] Without name: list all backend names. With name: show address, current state (with duration), @@ -104,13 +147,16 @@ sync vpp lbstate [] Reconcile the VPP load-balancer dataplane from drift, and once on startup. 
set backend pause <name> Stop health checking for a backend. Cancels the probe - goroutine so no further traffic is sent, and freezes - the state at whatever it was when paused. + goroutine so no further traffic is sent, and sets the + state to 'paused'. The backend's transition history is + preserved, so 'show backend <name>' still shows where + it came from. set backend resume <name> Resume health checking. A fresh probe goroutine is started and the backend re-enters unknown state. -set backend disable <name> Stop probing entirely and remove the backend from rotation. - The backend remains visible (state: disabled) and can be - re-enabled without reloading configuration. +set backend disable <name> Stop probing entirely and remove the backend from + rotation. The backend remains visible (state: disabled) + with its transition history intact and can be re-enabled + without reloading configuration. set backend enable <name> Re-enable a disabled backend. A fresh probe goroutine is started and the backend re-enters unknown state. 
diff --git a/go.mod b/go.mod index 66b89d7..aa2a5a2 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,8 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/ftrvxmtrx/fd v0.0.0-20150925145434-c6d800382fff // indirect + github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.1.0 // indirect + github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect github.com/kr/text v0.2.0 // indirect github.com/lunixbochs/struc v0.0.0-20200521075829-a4cb8d33dbbe // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect diff --git a/go.sum b/go.sum index a2dbf33..a7719e1 100644 --- a/go.sum +++ b/go.sum @@ -26,6 +26,10 @@ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.1.0 h1:QGLs/O40yoNK9vmy4rhUGBVyMf1lISBGtXRpsu/Qu/o= +github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.1.0/go.mod h1:hM2alZsMUni80N33RBe6J0e423LB+odMj7d3EMP9l20= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 h1:pRhl55Yx1eC7BZ1N+BBWwnKaMyD8uC+34TLdndZMAKk= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0/go.mod h1:XKMd7iuf/RGPSMJ/U4HP0zS2Z9Fh8Ps9a+6X26m/tmI= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 8c0119e..c6f43b2 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -2,13 +2,17 @@ // Package metrics exposes 
Prometheus metrics for maglevd. // -// Gauge-type metrics (backend state, health counter, weights) are collected -// on demand when Prometheus scrapes /metrics via the Collector. Counter and -// histogram metrics (probe totals, probe duration, transitions) are updated -// inline from the probe loop. +// Gauge-type metrics (backend state, health counter, weights, VPP connection +// info) are collected on demand when Prometheus scrapes /metrics via the +// Collector. Counter and histogram metrics (probe totals, probe duration, +// transitions, VPP API calls, LB sync operations) are updated inline from +// the probe loop and VPP sync paths. package metrics import ( + "fmt" + "time" + "git.ipng.ch/ipng/vpp-maglev/internal/config" "git.ipng.ch/ipng/vpp-maglev/internal/health" @@ -30,6 +34,25 @@ type StateSource interface { GetFrontend(name string) (config.Frontend, bool) } +// VPPInfo mirrors vpp.Info so the metrics package doesn't need to import +// internal/vpp (which would create an import cycle — vpp imports metrics +// to update counters inline). +type VPPInfo struct { + Version string + BuildDate string + PID uint32 + BootTime time.Time + ConnectedSince time.Time +} + +// VPPSource provides read-only access to the VPP client's state. vpp.Client +// is adapted to this interface via a small shim in the collector so the +// metrics package stays decoupled from the vpp package's concrete types. 
+type VPPSource interface { + IsConnected() bool + VPPInfo() (VPPInfo, bool) +} + // ---- inline metrics (updated per probe) ------------------------------------ var ( @@ -54,6 +77,27 @@ var ( Name: "transitions_total", Help: "Total number of backend state transitions.", }, []string{"backend", "from", "to"}) + + // ---- VPP API counters --------------------------------------------------- + + VPPAPITotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "maglev", + Subsystem: "vpp_api", + Name: "total", + Help: "Total number of VPP binary-API messages sent to or received from VPP.", + }, []string{"msg", "direction", "result"}) + + // ---- LB sync counters --------------------------------------------------- + + // LBSyncTotal counts individual dataplane mutations performed by the + // sync path. kind ∈ {vip_added, vip_removed, as_added, as_removed, + // as_weight_updated}; scope ∈ {all, vip}. + LBSyncTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "maglev", + Subsystem: "vpp_lbsync", + Name: "total", + Help: "Total number of VPP load-balancer sync operations applied to the dataplane.", + }, []string{"scope", "kind"}) ) // ---- collector (scraped on demand) ----------------------------------------- @@ -63,17 +107,26 @@ var ( // removed by a config reload. type Collector struct { src StateSource + vpp VPPSource // optional; nil when VPP integration is disabled backendState *prometheus.Desc backendHealth *prometheus.Desc backendEnabled *prometheus.Desc poolWeight *prometheus.Desc + + vppConnected *prometheus.Desc + vppUptimeSeconds *prometheus.Desc + vppConnectedFor *prometheus.Desc + vppInfo *prometheus.Desc } -// NewCollector creates a Collector backed by the given StateSource. -func NewCollector(src StateSource) *Collector { +// NewCollector creates a Collector backed by the given StateSource. vpp may +// be nil when VPP integration is disabled; in that case vpp_* metrics are +// simply not emitted. 
+func NewCollector(src StateSource, vpp VPPSource) *Collector { return &Collector{ src: src, + vpp: vpp, backendState: prometheus.NewDesc( "maglev_backend_state", "Current backend state (1 = active for the given state label).", @@ -94,6 +147,26 @@ func NewCollector(src StateSource) *Collector { "Configured weight of a backend in a frontend pool (0-100).", []string{"frontend", "pool", "backend"}, nil, ), + vppConnected: prometheus.NewDesc( + "maglev_vpp_connected", + "Whether maglevd currently has an established connection to VPP (1) or not (0).", + nil, nil, + ), + vppUptimeSeconds: prometheus.NewDesc( + "maglev_vpp_uptime_seconds", + "Seconds since VPP started (from the /sys/boottime stats counter).", + nil, nil, + ), + vppConnectedFor: prometheus.NewDesc( + "maglev_vpp_connected_seconds", + "Seconds since maglevd established the current VPP connection.", + nil, nil, + ), + vppInfo: prometheus.NewDesc( + "maglev_vpp_info", + "Static VPP build information. Always 1; metadata is conveyed via labels.", + []string{"version", "build_date", "pid"}, nil, + ), } } @@ -103,6 +176,10 @@ func (c *Collector) Describe(ch chan<- *prometheus.Desc) { ch <- c.backendHealth ch <- c.backendEnabled ch <- c.poolWeight + ch <- c.vppConnected + ch <- c.vppUptimeSeconds + ch <- c.vppConnectedFor + ch <- c.vppInfo } // Collect implements prometheus.Collector. 
@@ -163,14 +240,48 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) { } } } + + // ---- VPP gauges ------------------------------------------------------- + if c.vpp == nil { + return + } + connected := 0.0 + if c.vpp.IsConnected() { + connected = 1.0 + } + ch <- prometheus.MustNewConstMetric(c.vppConnected, prometheus.GaugeValue, connected) + + info, ok := c.vpp.VPPInfo() + if !ok { + return + } + if !info.BootTime.IsZero() { + ch <- prometheus.MustNewConstMetric( + c.vppUptimeSeconds, prometheus.GaugeValue, + time.Since(info.BootTime).Seconds(), + ) + } + if !info.ConnectedSince.IsZero() { + ch <- prometheus.MustNewConstMetric( + c.vppConnectedFor, prometheus.GaugeValue, + time.Since(info.ConnectedSince).Seconds(), + ) + } + ch <- prometheus.MustNewConstMetric( + c.vppInfo, prometheus.GaugeValue, 1.0, + info.Version, info.BuildDate, fmt.Sprintf("%d", info.PID), + ) } -// Register registers all metrics with the given registry. -func Register(reg prometheus.Registerer, src StateSource) *Collector { - coll := NewCollector(src) +// Register registers all metrics with the given registry. vpp may be nil +// to disable VPP-related metrics. +func Register(reg prometheus.Registerer, src StateSource, vpp VPPSource) *Collector { + coll := NewCollector(src, vpp) reg.MustRegister(coll) reg.MustRegister(ProbeTotal) reg.MustRegister(ProbeDuration) reg.MustRegister(TransitionTotal) + reg.MustRegister(VPPAPITotal) + reg.MustRegister(LBSyncTotal) return coll } diff --git a/internal/vpp/apilog.go b/internal/vpp/apilog.go index a648f8c..b697095 100644 --- a/internal/vpp/apilog.go +++ b/internal/vpp/apilog.go @@ -7,6 +7,8 @@ import ( "log/slog" "go.fd.io/govpp/api" + + "git.ipng.ch/ipng/vpp-maglev/internal/metrics" ) // loggedChannel wraps an api.Channel so that every VPP request/reply is @@ -38,27 +40,31 @@ func (lc *loggedChannel) Close() { lc.ch.Close() } // SendRequest logs the outgoing message and returns a wrapped request context. 
func (lc *loggedChannel) SendRequest(msg api.Message) *loggedRequestCtx { + name := msg.GetMessageName() slog.Debug("vpp-api-send", - "msg", msg.GetMessageName(), + "msg", name, "crc", msg.GetCrcString(), "payload", fmt.Sprintf("%+v", msg), ) + metrics.VPPAPITotal.WithLabelValues(name, "send", "success").Inc() return &loggedRequestCtx{ ctx: lc.ch.SendRequest(msg), - name: msg.GetMessageName(), + name: name, } } // SendMultiRequest logs the outgoing message and returns a wrapped multi-request context. func (lc *loggedChannel) SendMultiRequest(msg api.Message) *loggedMultiRequestCtx { + name := msg.GetMessageName() slog.Debug("vpp-api-send-multi", - "msg", msg.GetMessageName(), + "msg", name, "crc", msg.GetCrcString(), "payload", fmt.Sprintf("%+v", msg), ) + metrics.VPPAPITotal.WithLabelValues(name, "send", "success").Inc() return &loggedMultiRequestCtx{ ctx: lc.ch.SendMultiRequest(msg), - name: msg.GetMessageName(), + name: name, } } @@ -76,6 +82,7 @@ func (r *loggedRequestCtx) ReceiveReply(msg api.Message) error { "reply", msg.GetMessageName(), "err", err, ) + metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "failure").Inc() return err } slog.Debug("vpp-api-recv", @@ -83,6 +90,7 @@ func (r *loggedRequestCtx) ReceiveReply(msg api.Message) error { "reply", msg.GetMessageName(), "payload", fmt.Sprintf("%+v", msg), ) + metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "success").Inc() return nil } @@ -102,6 +110,7 @@ func (r *loggedMultiRequestCtx) ReceiveReply(msg api.Message) (bool, error) { "seq", r.seq, "err", err, ) + metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "failure").Inc() return stop, err } if stop { @@ -117,6 +126,7 @@ func (r *loggedMultiRequestCtx) ReceiveReply(msg api.Message) (bool, error) { "seq", r.seq, "payload", fmt.Sprintf("%+v", msg), ) + metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "success").Inc() r.seq++ return stop, nil } diff --git a/internal/vpp/client.go b/internal/vpp/client.go index 9166a01..6507c61 100644 --- 
a/internal/vpp/client.go +++ b/internal/vpp/client.go @@ -19,6 +19,7 @@ import ( "git.ipng.ch/ipng/vpp-maglev/internal/config" "git.ipng.ch/ipng/vpp-maglev/internal/health" + "git.ipng.ch/ipng/vpp-maglev/internal/metrics" lb "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb" ) @@ -216,6 +217,25 @@ func (c *Client) GetInfo() (Info, error) { return c.info, nil } +// VPPInfo satisfies metrics.VPPSource. It returns a copy of the cached +// connection info as a metrics-local struct so the metrics package doesn't +// need to import internal/vpp. Second return is false when VPP is not +// connected (the collector skips the vpp_* gauges in that case). +func (c *Client) VPPInfo() (metrics.VPPInfo, bool) { + c.mu.Lock() + defer c.mu.Unlock() + if c.apiConn == nil { + return metrics.VPPInfo{}, false + } + return metrics.VPPInfo{ + Version: c.info.Version, + BuildDate: c.info.BuildDate, + PID: c.info.PID, + BootTime: c.info.BootTime, + ConnectedSince: c.info.ConnectedSince, + }, true +} + // connect establishes both API and stats connections. If either fails, // both are torn down. func (c *Client) connect() error { diff --git a/internal/vpp/lbsync.go b/internal/vpp/lbsync.go index 55f25f0..8d9c535 100644 --- a/internal/vpp/lbsync.go +++ b/internal/vpp/lbsync.go @@ -10,6 +10,7 @@ import ( "git.ipng.ch/ipng/vpp-maglev/internal/config" "git.ipng.ch/ipng/vpp-maglev/internal/health" + "git.ipng.ch/ipng/vpp-maglev/internal/metrics" ip_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/ip_types" lb "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb" lb_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb_types" @@ -50,6 +51,28 @@ type syncStats struct { asWeight int } +// recordSyncStats increments the Prometheus lbsync counters for one sync +// run. scope is "all" for SyncLBStateAll and "vip" for SyncLBStateVIP. +// Zero-valued kinds emit no increment (the counter stays at its previous +// value for that label set). 
+func recordSyncStats(scope string, st *syncStats) { + if st.vipAdd > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "vip_added").Add(float64(st.vipAdd)) + } + if st.vipDel > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "vip_removed").Add(float64(st.vipDel)) + } + if st.asAdd > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "as_added").Add(float64(st.asAdd)) + } + if st.asDel > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "as_removed").Add(float64(st.asDel)) + } + if st.asWeight > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "as_weight_updated").Add(float64(st.asWeight)) + } +} + // SyncLBStateAll reconciles the full VPP load-balancer state with the given // config. For every frontend in cfg: // - if the VIP does not exist in VPP, create it; @@ -118,6 +141,7 @@ func (c *Client) SyncLBStateAll(cfg *config.Config) error { } } + recordSyncStats("all", &st) slog.Info("vpp-lbsync-done", "scope", "all", "vip-added", st.vipAdd, @@ -169,6 +193,7 @@ func (c *Client) SyncLBStateVIP(cfg *config.Config, feName string) error { if err := reconcileVIP(ch, d, cur, &st); err != nil { return err } + recordSyncStats("vip", &st) slog.Info("vpp-lbsync-done", "scope", "vip", "frontend", feName,