Add Prometheus metrics endpoint; containerize integration tests

Prometheus metrics (internal/metrics/, cmd/maglevd/)
- New --metrics-addr flag (default :9091, env MAGLEV_METRICS_ADDR)
  serving /metrics via promhttp.
- Gauge metrics scraped on demand via a custom prometheus.Collector:
  maglev_backend_state, maglev_backend_health, maglev_backend_enabled,
  maglev_frontend_pool_backend_weight.
- Inline counter/histogram metrics updated per probe:
  maglev_probe_total (by backend, type, result, code),
  maglev_probe_duration_seconds (by backend, type),
  maglev_backend_transitions_total (by backend, from, to).
- StateSource interface in metrics package breaks the import cycle
  with checker; checker.Checker satisfies it via GetBackendInfo.

Integration tests
- Run maglevd inside a containerlab node (debian:trixie-slim with
  build/ bind-mounted) instead of on the host. Eliminates port
  collisions with any host maglevd.
- maglevc commands run via docker exec into the maglevd container.
- Add 6 Prometheus test cases: endpoint reachable, all backends
  report state=up, probe counters non-zero, duration histogram
  populated, pool weights correct, transition counters present.
This commit is contained in:
2026-04-11 20:50:59 +02:00
parent 8bde00eb61
commit 4ab3096c8b
9 changed files with 311 additions and 18 deletions

View File

@@ -13,6 +13,7 @@ import (
"git.ipng.ch/ipng/vpp-maglev/internal/config"
"git.ipng.ch/ipng/vpp-maglev/internal/health"
"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
"git.ipng.ch/ipng/vpp-maglev/internal/prober"
)
@@ -267,6 +268,22 @@ func (c *Checker) GetBackend(name string) (BackendSnapshot, bool) {
return BackendSnapshot{Health: w.backend, Config: w.entry}, true
}
// GetBackendInfo looks up the worker registered under name and reports its
// health state together with the Enabled flag and health-check name taken
// from the worker's config entry. The boolean result is false, and the
// returned BackendInfo is the zero value, when no such worker exists.
//
// The lookup and field reads happen under the checker's read lock.
// Implements metrics.StateSource.
//
// NOTE(review): Health is handed out exactly as stored in w.backend; if that
// is a pointer, callers will read through it after the lock is released —
// confirm this is safe for the metrics collector.
func (c *Checker) GetBackendInfo(name string) (metrics.BackendInfo, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	var info metrics.BackendInfo
	worker, found := c.workers[name]
	if found {
		info.Health = worker.backend
		info.Enabled = worker.entry.Enabled
		info.HCName = worker.entry.HealthCheck
	}
	return info, found
}
// PauseBackend pauses health checking for a backend by name. The probe
// goroutine is cancelled so no further traffic is sent to the backend. The
// backend's state is set to paused and remains frozen until ResumeBackend is
@@ -466,6 +483,7 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
slog.Debug("probe-start", "backend", name, "type", hc.Type)
start := time.Now()
result = prober.ForType(hc.Type)(probeCtx, pcfg)
elapsed := time.Since(start)
cancel()
slog.Debug("probe-done",
"backend", name,
@@ -473,8 +491,14 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
"ok", result.OK,
"code", result.Code,
"detail", result.Detail,
"elapsed", time.Since(start).Round(time.Millisecond).String(),
"elapsed", elapsed.Round(time.Millisecond).String(),
)
res := "success"
if !result.OK {
res = "failure"
}
metrics.ProbeTotal.WithLabelValues(name, hc.Type, res, result.Code).Inc()
metrics.ProbeDuration.WithLabelValues(name, hc.Type).Observe(elapsed.Seconds())
}
c.mu.Lock()
@@ -493,6 +517,7 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
"code", result.Code,
"detail", result.Detail,
)
metrics.TransitionTotal.WithLabelValues(name, t.From.String(), t.To.String()).Inc()
c.emitForBackend(name, addr, t, c.cfg.Frontends)
}
c.mu.Unlock()