Add Prometheus metrics endpoint; containerize integration tests
Prometheus metrics (internal/metrics/, cmd/maglevd/):
- New --metrics-addr flag (default :9091, env MAGLEV_METRICS_ADDR) serving /metrics via promhttp.
- Gauge metrics scraped on demand via a custom prometheus.Collector: maglev_backend_state, maglev_backend_health, maglev_backend_enabled, maglev_frontend_pool_backend_weight.
- Inline counter/histogram metrics updated per probe: maglev_probe_total (by backend, type, result, code), maglev_probe_duration_seconds (by backend, type), maglev_backend_transitions_total (by backend, from, to).
- StateSource interface in the metrics package breaks the import cycle with checker; checker.Checker satisfies it via GetBackendInfo.

Integration tests:
- Run maglevd inside a containerlab node (debian:trixie-slim with build/ bind-mounted) instead of on the host. This eliminates port collisions with any maglevd running on the host.
- maglevc commands run via docker exec into the maglevd container.
- Add 6 Prometheus test cases: endpoint reachable, all backends report state=up, probe counters non-zero, duration histogram populated, pool weights correct, transition counters present.
This commit is contained in:
@@ -13,6 +13,7 @@ import (
|
||||
|
||||
"git.ipng.ch/ipng/vpp-maglev/internal/config"
|
||||
"git.ipng.ch/ipng/vpp-maglev/internal/health"
|
||||
"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
|
||||
"git.ipng.ch/ipng/vpp-maglev/internal/prober"
|
||||
)
|
||||
|
||||
@@ -267,6 +268,22 @@ func (c *Checker) GetBackend(name string) (BackendSnapshot, bool) {
|
||||
return BackendSnapshot{Health: w.backend, Config: w.entry}, true
|
||||
}
|
||||
|
||||
// GetBackendInfo returns the health state and key config fields for a backend.
|
||||
// Satisfies metrics.StateSource.
|
||||
func (c *Checker) GetBackendInfo(name string) (metrics.BackendInfo, bool) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
w, ok := c.workers[name]
|
||||
if !ok {
|
||||
return metrics.BackendInfo{}, false
|
||||
}
|
||||
return metrics.BackendInfo{
|
||||
Health: w.backend,
|
||||
Enabled: w.entry.Enabled,
|
||||
HCName: w.entry.HealthCheck,
|
||||
}, true
|
||||
}
|
||||
|
||||
// PauseBackend pauses health checking for a backend by name. The probe
|
||||
// goroutine is cancelled so no further traffic is sent to the backend. The
|
||||
// backend's state is set to paused and remains frozen until ResumeBackend is
|
||||
@@ -466,6 +483,7 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
|
||||
slog.Debug("probe-start", "backend", name, "type", hc.Type)
|
||||
start := time.Now()
|
||||
result = prober.ForType(hc.Type)(probeCtx, pcfg)
|
||||
elapsed := time.Since(start)
|
||||
cancel()
|
||||
slog.Debug("probe-done",
|
||||
"backend", name,
|
||||
@@ -473,8 +491,14 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
|
||||
"ok", result.OK,
|
||||
"code", result.Code,
|
||||
"detail", result.Detail,
|
||||
"elapsed", time.Since(start).Round(time.Millisecond).String(),
|
||||
"elapsed", elapsed.Round(time.Millisecond).String(),
|
||||
)
|
||||
res := "success"
|
||||
if !result.OK {
|
||||
res = "failure"
|
||||
}
|
||||
metrics.ProbeTotal.WithLabelValues(name, hc.Type, res, result.Code).Inc()
|
||||
metrics.ProbeDuration.WithLabelValues(name, hc.Type).Observe(elapsed.Seconds())
|
||||
}
|
||||
|
||||
c.mu.Lock()
|
||||
@@ -493,6 +517,7 @@ func (c *Checker) runProbe(ctx context.Context, name string, pos, total int) {
|
||||
"code", result.Code,
|
||||
"detail", result.Detail,
|
||||
)
|
||||
metrics.TransitionTotal.WithLabelValues(name, t.From.String(), t.To.String()).Inc()
|
||||
c.emitForBackend(name, addr, t, c.cfg.Frontends)
|
||||
}
|
||||
c.mu.Unlock()
|
||||
|
||||
Reference in New Issue
Block a user