Add Prometheus metrics endpoint; containerize integration tests

Prometheus metrics (internal/metrics/, cmd/maglevd/)
- New --metrics-addr flag (default :9091, env MAGLEV_METRICS_ADDR) serving /metrics via promhttp.
- Gauge metrics scraped on demand via a custom prometheus.Collector: maglev_backend_state, maglev_backend_health, maglev_backend_enabled, maglev_frontend_pool_backend_weight.
- Inline counter/histogram metrics updated per probe: maglev_probe_total (by backend, type, result, code), maglev_probe_duration_seconds (by backend, type), maglev_backend_transitions_total (by backend, from, to).
- StateSource interface in metrics package breaks the import cycle with checker; checker.Checker satisfies it via GetBackendInfo.

Integration tests
- Run maglevd inside a containerlab node (debian:trixie-slim with build/ bind-mounted) instead of on the host. Eliminates port collisions with any host maglevd.
- maglevc commands run via docker exec into the maglevd container.
- Add 6 Prometheus test cases: endpoint reachable, all backends report state=up, probe counters non-zero, duration histogram populated, pool weights correct, transition counters present.
2026-04-11 20:50:59 +02:00
parent 8bde00eb61
commit 4ab3096c8b
9 changed files with 311 additions and 18 deletions
--- a/internal/checker/checker.go
+++ b/internal/checker/checker.go
@@ -13,6 +13,7 @@ import (

 	"git.ipng.ch/ipng/vpp-maglev/internal/config"
 	"git.ipng.ch/ipng/vpp-maglev/internal/health"
+	"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
 	"git.ipng.ch/ipng/vpp-maglev/internal/prober"
 )

@@ -267,6 +268,22 @@ func (c *Checker) GetBackend(name string) (BackendSnapshot, bool) {
 	return BackendSnapshot{Health: w.backend, Config: w.entry}, true
 }

+// GetBackendInfo looks up the named backend and reports the subset of its
+// state that the metrics collector needs: the health record, the configured
+// enabled flag and the name of its health check. The second result is false
+// when no worker exists under that name. Satisfies metrics.StateSource.
+//
+// The Health field is the worker's own pointer rather than a copy.
+// NOTE(review): callers dereference it after the read lock is released;
+// confirm that is safe against concurrent probe updates.
+func (c *Checker) GetBackendInfo(name string) (metrics.BackendInfo, bool) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	if worker, found := c.workers[name]; found {
+		info := metrics.BackendInfo{
+			Health:  worker.backend,
+			Enabled: worker.entry.Enabled,
+			HCName:  worker.entry.HealthCheck,
+		}
+		return info, true
+	}
+	return metrics.BackendInfo{}, false
+}
+
 // PauseBackend pauses health checking for a backend by name. The probe
 // goroutine is cancelled so no further traffic is sent to the backend. The
 // backend's state is set to paused and remains frozen until ResumeBackend is
@@ -466,6 +483,7 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
 			slog.Debug("probe-start", "backend", name, "type", hc.Type)
 			start := time.Now()
 			result = prober.ForType(hc.Type)(probeCtx, pcfg)
+			elapsed := time.Since(start)
 			cancel()
 			slog.Debug("probe-done",
 				"backend", name,
@@ -473,8 +491,14 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
 				"ok", result.OK,
 				"code", result.Code,
 				"detail", result.Detail,
-				"elapsed", time.Since(start).Round(time.Millisecond).String(),
+				"elapsed", elapsed.Round(time.Millisecond).String(),
 			)
+			res := "success"
+			if !result.OK {
+				res = "failure"
+			}
+			metrics.ProbeTotal.WithLabelValues(name, hc.Type, res, result.Code).Inc()
+			metrics.ProbeDuration.WithLabelValues(name, hc.Type).Observe(elapsed.Seconds())
 		}

 		c.mu.Lock()
@@ -493,6 +517,7 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
 				"code", result.Code,
 				"detail", result.Detail,
 			)
+			metrics.TransitionTotal.WithLabelValues(name, t.From.String(), t.To.String()).Inc()
 			c.emitForBackend(name, addr, t, c.cfg.Frontends)
 		}
 		c.mu.Unlock()
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -0,0 +1,176 @@
+// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>
+
+// Package metrics exposes Prometheus metrics for maglevd.
+//
+// Gauge-type metrics (backend state, health counter, weights) are collected
+// on demand when Prometheus scrapes /metrics via the Collector. Counter and
+// histogram metrics (probe totals, probe duration, transitions) are updated
+// inline from the probe loop.
+package metrics
+
+import (
+	"git.ipng.ch/ipng/vpp-maglev/internal/config"
+	"git.ipng.ch/ipng/vpp-maglev/internal/health"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// BackendInfo holds the health and config state needed by the collector.
+//
+// It is the per-backend value returned by StateSource.GetBackendInfo. The
+// collector reads Health.Address, Health.State and Health.Counter.Health, so
+// Health must be set for the backend's metrics to be exported. Enabled is
+// exported as the maglev_backend_enabled gauge (1 when true, 0 otherwise) and
+// HCName becomes the "healthcheck" label of maglev_backend_state.
+type BackendInfo struct {
+	Health  *health.Backend
+	Enabled bool
+	HCName  string // healthcheck name from config
+}
+
+// StateSource provides read-only access to the running checker state.
+//
+// The Collector calls these methods on every scrape. Declaring the interface
+// in this package lets the checker import metrics without metrics having to
+// import the checker.
+type StateSource interface {
+	// ListBackends returns the names of the backends to report on.
+	ListBackends() []string
+	// GetBackendInfo returns the state of one backend; when the second
+	// result is false the collector skips that backend.
+	GetBackendInfo(name string) (BackendInfo, bool)
+	// ListFrontends returns the names of the frontends to report on.
+	ListFrontends() []string
+	// GetFrontend returns the configuration of one frontend; when the second
+	// result is false the collector skips that frontend.
+	GetFrontend(name string) (config.Frontend, bool)
+}
+
+// ---- inline metrics (updated per probe) ------------------------------------
+
+var (
+	// ProbeTotal is the maglev_probe_total counter. It is labelled by
+	// backend name, probe type, result and probe result code; the probe
+	// loop sets result to "success" or "failure".
+	ProbeTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "maglev",
+		Subsystem: "probe",
+		Name:      "total",
+		Help:      "Total number of health-check probes executed.",
+	}, []string{"backend", "type", "result", "code"})
+
+	// ProbeDuration is the maglev_probe_duration_seconds histogram,
+	// labelled by backend name and probe type. The explicit buckets span
+	// 1ms to 2.5s; slower probes are counted only in the implicit +Inf
+	// bucket.
+	ProbeDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Namespace: "maglev",
+		Subsystem: "probe",
+		Name:      "duration_seconds",
+		Help:      "Health-check probe duration in seconds.",
+		Buckets:   []float64{.001, .0025, .005, .01, .025, .05, .1, .25, .5, 1, 2.5},
+	}, []string{"backend", "type"})
+
+	// TransitionTotal is the maglev_backend_transitions_total counter,
+	// labelled by backend name and by the state names the backend moved
+	// from and to.
+	TransitionTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Namespace: "maglev",
+		Subsystem: "backend",
+		Name:      "transitions_total",
+		Help:      "Total number of backend state transitions.",
+	}, []string{"backend", "from", "to"})
+)
+
+// ---- collector (scraped on demand) -----------------------------------------
+
+// Collector implements prometheus.Collector by querying the running checker
+// on each scrape. This avoids stale label sets when backends are added or
+// removed by a config reload.
+//
+// Use NewCollector to construct one; the descriptor fields are created there
+// and reused by Describe and Collect.
+type Collector struct {
+	// src is queried for backends and frontends on every Collect call.
+	src StateSource
+
+	// Descriptors for the four metric families emitted by Collect.
+	backendState   *prometheus.Desc
+	backendHealth  *prometheus.Desc
+	backendEnabled *prometheus.Desc
+	poolWeight     *prometheus.Desc
+}
+
+// NewCollector builds a Collector that reads its data from src. It only
+// prepares the four metric descriptors; the returned Collector still has to
+// be registered with a Prometheus registry before it is scraped.
+func NewCollector(src StateSource) *Collector {
+	// None of the families carries constant labels, so a descriptor is fully
+	// defined by its name, help text and variable label names.
+	desc := func(fqName, help string, labels ...string) *prometheus.Desc {
+		return prometheus.NewDesc(fqName, help, labels, nil)
+	}
+
+	coll := &Collector{src: src}
+	coll.backendState = desc(
+		"maglev_backend_state",
+		"Current backend state (1 = active for the given state label).",
+		"backend", "address", "healthcheck", "state",
+	)
+	coll.backendHealth = desc(
+		"maglev_backend_health",
+		"Current health counter value.",
+		"backend",
+	)
+	coll.backendEnabled = desc(
+		"maglev_backend_enabled",
+		"Whether the backend is enabled (1) or disabled (0).",
+		"backend",
+	)
+	coll.poolWeight = desc(
+		"maglev_frontend_pool_backend_weight",
+		"Configured weight of a backend in a frontend pool (0-100).",
+		"frontend", "pool", "backend",
+	)
+	return coll
+}
+
+// Describe implements prometheus.Collector by sending the descriptor of
+// every metric family that Collect can emit.
+func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
+	descs := []*prometheus.Desc{
+		c.backendState,
+		c.backendHealth,
+		c.backendEnabled,
+		c.poolWeight,
+	}
+	for _, d := range descs {
+		ch <- d
+	}
+}
+
+// Collect implements prometheus.Collector. On every call it queries the
+// StateSource and sends, per backend:
+//
+//   - one maglev_backend_state series for each known state, valued 1 for the
+//     backend's current state and 0 for the others;
+//   - maglev_backend_health with the current health counter value;
+//   - maglev_backend_enabled as 1 or 0.
+//
+// and, per frontend, one maglev_frontend_pool_backend_weight series for each
+// backend of each pool. Backends and frontends that the source no longer
+// knows by the time they are looked up are skipped, as are backends whose
+// info carries no health record.
+func (c *Collector) Collect(ch chan<- prometheus.Metric) {
+	states := []health.State{
+		health.StateUnknown,
+		health.StateUp,
+		health.StateDown,
+		health.StatePaused,
+		health.StateRemoved,
+	}
+
+	for _, name := range c.src.ListBackends() {
+		info, ok := c.src.GetBackendInfo(name)
+		if !ok || info.Health == nil {
+			// Either the backend disappeared between listing and lookup,
+			// or the source supplied no health record. Skip it instead of
+			// dereferencing a nil pointer in the middle of a scrape.
+			continue
+		}
+		addr := info.Health.Address.String()
+
+		// Read the state once: Health is a shared pointer, and comparing
+		// against a single snapshot guarantees that at most one of the
+		// series below is 1 even if the state changes during the loop.
+		current := info.Health.State
+
+		// One time-series per possible state; the current state is 1, rest 0.
+		for _, s := range states {
+			val := 0.0
+			if s == current {
+				val = 1.0
+			}
+			ch <- prometheus.MustNewConstMetric(
+				c.backendState, prometheus.GaugeValue, val,
+				name, addr, info.HCName, s.String(),
+			)
+		}
+
+		ch <- prometheus.MustNewConstMetric(
+			c.backendHealth, prometheus.GaugeValue,
+			float64(info.Health.Counter.Health), name,
+		)
+
+		enabled := 0.0
+		if info.Enabled {
+			enabled = 1.0
+		}
+		ch <- prometheus.MustNewConstMetric(
+			c.backendEnabled, prometheus.GaugeValue, enabled, name,
+		)
+	}
+
+	for _, feName := range c.src.ListFrontends() {
+		fe, ok := c.src.GetFrontend(feName)
+		if !ok {
+			// Frontend disappeared between listing and lookup.
+			continue
+		}
+		for _, pool := range fe.Pools {
+			for beName, pb := range pool.Backends {
+				ch <- prometheus.MustNewConstMetric(
+					c.poolWeight, prometheus.GaugeValue,
+					float64(pb.Weight), feName, pool.Name, beName,
+				)
+			}
+		}
+	}
+}
+
+// Register creates a Collector for src and registers it with reg, followed
+// by the ProbeTotal, ProbeDuration and TransitionTotal metrics. It returns
+// the new Collector. Registration uses MustRegister, so a failed
+// registration (for example a duplicate) panics.
+func Register(reg prometheus.Registerer, src StateSource) *Collector {
+	collector := NewCollector(src)
+	reg.MustRegister(collector, ProbeTotal, ProbeDuration, TransitionTotal)
+	return collector
+}
+