Prometheus: add VPP, LB sync, and gRPC metrics; expand docs

New metrics plus the corresponding documentation for everything that's
accumulated since the last Prometheus pass.

internal/metrics/metrics.go
- New VPPSource interface (IsConnected, VPPInfo) plus a metrics-local
  VPPInfo struct that mirrors vpp.Info. Decoupling via interface +
  struct-mirror keeps the dependency direction one-way (vpp → metrics),
  so vpp can import metrics to update inline counters without a cycle.
- New Collector gauges scraped on demand: maglev_vpp_connected,
  maglev_vpp_uptime_seconds (from /sys/boottime), maglev_vpp_connected_seconds
  (time since maglevd connected), and maglev_vpp_info (an info-style gauge
  fixed at 1, carrying version, build_date, and pid as labels).
- New inline counters:
  - maglev_vpp_api_total{msg, direction, result} — bumped from the
    loggedChannel wrapper on every VPP binary-API send/recv. Gives full
    visibility into what maglevd is doing with VPP, broken down by
    message name, direction (send/recv), and result (success/failure).
  - maglev_vpp_lbsync_total{scope, kind} — bumped from the reconciler
    at the end of each SyncLBStateAll/SyncLBStateVIP run. kind ∈
    {vip_added, vip_removed, as_added, as_removed, as_weight_updated};
    scope ∈ {all, vip}. Zero-valued kinds are not emitted so noise
    stays low.
- Register() signature now takes a VPPSource (may be nil) alongside
  the existing StateSource.
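
  The interface + struct-mirror decoupling can be sketched as follows.
  This is an illustrative, stdlib-only sketch, not the actual metrics.go:
  gaugesFor and fakeSource are invented for illustration (the real code
  emits these values through a prometheus.Collector), while the
  VPPInfo/VPPSource names follow this commit.

  ```go
  package main

  import (
  	"fmt"
  	"time"
  )

  // VPPInfo is a metrics-local mirror of vpp.Info, so the dependency stays
  // one-way (vpp -> metrics) and vpp can import metrics without a cycle.
  type VPPInfo struct {
  	Version        string
  	BuildDate      string
  	PID            int
  	BootTime       time.Time
  	ConnectedSince time.Time
  }

  // VPPSource is what the collector scrapes on demand; *vpp.Client satisfies it.
  type VPPSource interface {
  	VPPInfo() (VPPInfo, bool)
  }

  // gaugesFor shows the on-demand derivation: when the source reports
  // disconnected, only maglev_vpp_connected=0 is emitted and the other
  // vpp_* gauges are skipped.
  func gaugesFor(src VPPSource, now time.Time) map[string]float64 {
  	g := map[string]float64{}
  	info, ok := src.VPPInfo()
  	if !ok {
  		g["maglev_vpp_connected"] = 0
  		return g
  	}
  	g["maglev_vpp_connected"] = 1
  	g["maglev_vpp_uptime_seconds"] = now.Sub(info.BootTime).Seconds()
  	g["maglev_vpp_connected_seconds"] = now.Sub(info.ConnectedSince).Seconds()
  	g["maglev_vpp_info"] = 1 // labels: version, build_date, pid
  	return g
  }

  type fakeSource struct {
  	info VPPInfo
  	up   bool
  }

  func (f fakeSource) VPPInfo() (VPPInfo, bool) { return f.info, f.up }

  func main() {
  	now := time.Date(2026, 4, 12, 12, 0, 0, 0, time.UTC)
  	src := fakeSource{info: VPPInfo{
  		BootTime:       now.Add(-90 * time.Second),
  		ConnectedSince: now.Add(-60 * time.Second),
  	}, up: true}
  	fmt.Println(gaugesFor(src, now)["maglev_vpp_uptime_seconds"]) // 90
  	fmt.Println(gaugesFor(fakeSource{}, now)["maglev_vpp_connected"]) // 0
  }
  ```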

internal/vpp/client.go
- New VPPInfo() (metrics.VPPInfo, bool) shim method on *Client that
  satisfies metrics.VPPSource. Returns (_, false) when disconnected so
  the collector skips the vpp_* gauges cleanly.

internal/vpp/apilog.go
- The loggedChannel's SendRequest / SendMultiRequest / ReceiveReply
  paths now call metrics.VPPAPITotal.WithLabelValues(...).Inc() in
  addition to slog.Debug. Since every VPP API call in the codebase
  must go through loggedChannel (NewAPIChannel is unexported), this
  one instrumentation point catches everything.

internal/vpp/lbsync.go
- New recordSyncStats(scope, st) helper called once at the end of
  SyncLBStateAll and SyncLBStateVIP to bump maglev_vpp_lbsync_total.
  Zero-valued stats are skipped.

cmd/maglevd/main.go
- Added github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus
  for the standard gRPC server metrics (grpc_server_started_total,
  grpc_server_handled_total, grpc_server_handling_seconds, etc.,
  labelled by service/method/type/code).
- Constructs grpcprom.NewServerMetrics(WithServerHandlingTimeHistogram())
  before creating the grpc.Server, installs it as UnaryInterceptor +
  StreamInterceptor, then calls InitializeMetrics(srv) after service
  registration so every method appears at 0 on the first scrape
  instead of materialising lazily on first RPC.
- Passes the vppClient (or nil) as a metrics.VPPSource to
  metrics.Register so the vpp_* gauges are emitted when integration
  is enabled and silently omitted otherwise.
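
  The wiring described above looks roughly like this. Hedged sketch, not
  the actual main.go: the function name newGRPCServer and the reg
  parameter are invented, but the grpcprom calls are the real API of
  go-grpc-middleware/providers/prometheus.

  ```go
  package main

  import (
  	grpcprom "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
  	"github.com/prometheus/client_golang/prometheus"
  	"google.golang.org/grpc"
  )

  func newGRPCServer(reg prometheus.Registerer) *grpc.Server {
  	// Histogram option enables grpc_server_handling_seconds.
  	srvMetrics := grpcprom.NewServerMetrics(
  		grpcprom.WithServerHandlingTimeHistogram(),
  	)
  	reg.MustRegister(srvMetrics)

  	// Interceptors must be installed at server-construction time.
  	srv := grpc.NewServer(
  		grpc.UnaryInterceptor(srvMetrics.UnaryServerInterceptor()),
  		grpc.StreamInterceptor(srvMetrics.StreamServerInterceptor()),
  	)

  	// ... register services on srv here ...

  	// Pre-populate every registered method's counters at 0 so they all
  	// appear on the first scrape instead of materialising on first RPC.
  	srvMetrics.InitializeMetrics(srv)
  	return srv
  }
  ```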

docs/user-guide.md
- New 'Prometheus metrics' section in the maglevd chapter,
  tabulating every metric family: backend state gauges, probe
  counters/histogram, transition counters, the new VPP gauges and
  counters, and the standard gRPC server metrics.
- 'show frontends <name>' description updated to mention the two
  weight columns ('weight' = configured from YAML, 'effective' =
  state-aware after pool-failover logic).
- Pause / disable descriptions clarified: transition history is
  preserved across these operator actions.

docs/healthchecks.md
- New 'Static (no-healthcheck) backends' section explaining that
  backends without a healthcheck use rise/fall=1, fire a synthetic
  passing probe immediately on startup (no 30s wait), and idle at
  30s between iterations thereafter.
- New 'Pool failover' section documenting the priority-tier model,
  the active-pool definition, when promotion happens, cascading to
  further tiers, and graceful drain on demotion. Points readers at
  'maglevc show frontends <name>' as the inspection interface.
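
  The active-pool selection documented there can be sketched as below.
  This is a hedged, self-contained illustration of the tier model only;
  Pool, Backend, and activePool are invented names, not the maglevd types,
  and graceful drain on demotion is out of scope here.

  ```go
  package main

  import "fmt"

  type Backend struct {
  	Name    string
  	Healthy bool
  }

  type Pool struct {
  	Priority int // lower = more-preferred tier
  	Backends []Backend
  }

  // activePool returns the most-preferred tier with at least one healthy
  // backend; promotion to a later tier happens only when every earlier
  // tier is fully down, cascading until some tier has a healthy backend.
  func activePool(pools []Pool) (Pool, bool) {
  	var best Pool
  	found := false
  	for _, p := range pools {
  		healthy := false
  		for _, b := range p.Backends {
  			if b.Healthy {
  				healthy = true
  				break
  			}
  		}
  		if healthy && (!found || p.Priority < best.Priority) {
  			best, found = p, true
  		}
  	}
  	return best, found
  }

  func main() {
  	pools := []Pool{
  		{Priority: 20, Backends: []Backend{{Name: "spare", Healthy: true}}},
  		{Priority: 10, Backends: []Backend{{Name: "primary", Healthy: false}}},
  	}
  	p, _ := activePool(pools)
  	fmt.Println(p.Priority) // 20: the primary tier is down, so spare is promoted
  }
  ```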

docs/config-guide.md
- healthcheck field doc now describes static-backend behavior and
  cross-references healthchecks.md.
- pools field doc now explains failover semantics at a high level
  and cross-references the detailed healthchecks.md section.
2026-04-12 13:00:29 +02:00
parent 0049c2ae73
commit d5fbf5c640
10 changed files with 322 additions and 28 deletions


--- a/internal/vpp/apilog.go
+++ b/internal/vpp/apilog.go
@@ -7,6 +7,8 @@ import (
 	"log/slog"
 
 	"go.fd.io/govpp/api"
+
+	"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
 )
 
 // loggedChannel wraps an api.Channel so that every VPP request/reply is
@@ -38,27 +40,31 @@ func (lc *loggedChannel) Close() { lc.ch.Close() }
 
 // SendRequest logs the outgoing message and returns a wrapped request context.
 func (lc *loggedChannel) SendRequest(msg api.Message) *loggedRequestCtx {
+	name := msg.GetMessageName()
 	slog.Debug("vpp-api-send",
-		"msg", msg.GetMessageName(),
+		"msg", name,
 		"crc", msg.GetCrcString(),
 		"payload", fmt.Sprintf("%+v", msg),
 	)
+	metrics.VPPAPITotal.WithLabelValues(name, "send", "success").Inc()
 	return &loggedRequestCtx{
 		ctx:  lc.ch.SendRequest(msg),
-		name: msg.GetMessageName(),
+		name: name,
 	}
 }
 
 // SendMultiRequest logs the outgoing message and returns a wrapped multi-request context.
 func (lc *loggedChannel) SendMultiRequest(msg api.Message) *loggedMultiRequestCtx {
+	name := msg.GetMessageName()
 	slog.Debug("vpp-api-send-multi",
-		"msg", msg.GetMessageName(),
+		"msg", name,
 		"crc", msg.GetCrcString(),
 		"payload", fmt.Sprintf("%+v", msg),
 	)
+	metrics.VPPAPITotal.WithLabelValues(name, "send", "success").Inc()
 	return &loggedMultiRequestCtx{
 		ctx:  lc.ch.SendMultiRequest(msg),
-		name: msg.GetMessageName(),
+		name: name,
 	}
 }
@@ -76,6 +82,7 @@ func (r *loggedRequestCtx) ReceiveReply(msg api.Message) error {
 			"reply", msg.GetMessageName(),
 			"err", err,
 		)
+		metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "failure").Inc()
 		return err
 	}
 	slog.Debug("vpp-api-recv",
@@ -83,6 +90,7 @@ func (r *loggedRequestCtx) ReceiveReply(msg api.Message) error {
 		"reply", msg.GetMessageName(),
 		"payload", fmt.Sprintf("%+v", msg),
 	)
+	metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "success").Inc()
 	return nil
 }
@@ -102,6 +110,7 @@ func (r *loggedMultiRequestCtx) ReceiveReply(msg api.Message) (bool, error) {
 			"seq", r.seq,
 			"err", err,
 		)
+		metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "failure").Inc()
 		return stop, err
 	}
 	if stop {
@@ -117,6 +126,7 @@ func (r *loggedMultiRequestCtx) ReceiveReply(msg api.Message) (bool, error) {
 		"seq", r.seq,
 		"payload", fmt.Sprintf("%+v", msg),
 	)
+	metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "success").Inc()
 	r.seq++
 	return stop, nil
 }


--- a/internal/vpp/client.go
+++ b/internal/vpp/client.go
@@ -19,6 +19,7 @@ import (
 	"git.ipng.ch/ipng/vpp-maglev/internal/config"
 	"git.ipng.ch/ipng/vpp-maglev/internal/health"
+	"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
 	lb "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb"
 )
@@ -216,6 +217,25 @@ func (c *Client) GetInfo() (Info, error) {
 	return c.info, nil
 }
 
+// VPPInfo satisfies metrics.VPPSource. It returns a copy of the cached
+// connection info as a metrics-local struct so the metrics package doesn't
+// need to import internal/vpp. Second return is false when VPP is not
+// connected (the collector skips the vpp_* gauges in that case).
+func (c *Client) VPPInfo() (metrics.VPPInfo, bool) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if c.apiConn == nil {
+		return metrics.VPPInfo{}, false
+	}
+	return metrics.VPPInfo{
+		Version:        c.info.Version,
+		BuildDate:      c.info.BuildDate,
+		PID:            c.info.PID,
+		BootTime:       c.info.BootTime,
+		ConnectedSince: c.info.ConnectedSince,
+	}, true
+}
+
 // connect establishes both API and stats connections. If either fails,
 // both are torn down.
 func (c *Client) connect() error {


--- a/internal/vpp/lbsync.go
+++ b/internal/vpp/lbsync.go
@@ -10,6 +10,7 @@ import (
 	"git.ipng.ch/ipng/vpp-maglev/internal/config"
 	"git.ipng.ch/ipng/vpp-maglev/internal/health"
+	"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
 	ip_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/ip_types"
 	lb "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb"
 	lb_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb_types"
@@ -50,6 +51,28 @@ type syncStats struct {
 	asWeight int
 }
 
+// recordSyncStats increments the Prometheus lbsync counters for one sync
+// run. scope is "all" for SyncLBStateAll and "vip" for SyncLBStateVIP.
+// Zero-valued kinds emit no increment (the counter stays at its previous
+// value for that label set).
+func recordSyncStats(scope string, st *syncStats) {
+	if st.vipAdd > 0 {
+		metrics.LBSyncTotal.WithLabelValues(scope, "vip_added").Add(float64(st.vipAdd))
+	}
+	if st.vipDel > 0 {
+		metrics.LBSyncTotal.WithLabelValues(scope, "vip_removed").Add(float64(st.vipDel))
+	}
+	if st.asAdd > 0 {
+		metrics.LBSyncTotal.WithLabelValues(scope, "as_added").Add(float64(st.asAdd))
+	}
+	if st.asDel > 0 {
+		metrics.LBSyncTotal.WithLabelValues(scope, "as_removed").Add(float64(st.asDel))
+	}
+	if st.asWeight > 0 {
+		metrics.LBSyncTotal.WithLabelValues(scope, "as_weight_updated").Add(float64(st.asWeight))
+	}
+}
+
 // SyncLBStateAll reconciles the full VPP load-balancer state with the given
 // config. For every frontend in cfg:
 //   - if the VIP does not exist in VPP, create it;
@@ -118,6 +141,7 @@ func (c *Client) SyncLBStateAll(cfg *config.Config) error {
 		}
 	}
 
+	recordSyncStats("all", &st)
 	slog.Info("vpp-lbsync-done",
 		"scope", "all",
 		"vip-added", st.vipAdd,
@@ -169,6 +193,7 @@ func (c *Client) SyncLBStateVIP(cfg *config.Config, feName string) error {
 	if err := reconcileVIP(ch, d, cur, &st); err != nil {
 		return err
 	}
+	recordSyncStats("vip", &st)
 	slog.Info("vpp-lbsync-done",
 		"scope", "vip",
 		"frontend", feName,