diff --git a/cmd/maglevd/main.go b/cmd/maglevd/main.go index d0dbc4d..da1c8ab 100644 --- a/cmd/maglevd/main.go +++ b/cmd/maglevd/main.go @@ -13,6 +13,7 @@ import ( "os/signal" "syscall" + grpcprom "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "google.golang.org/grpc" @@ -109,16 +110,29 @@ func run() error { } // ---- gRPC server -------------------------------------------------------- + // Server-side metrics for every RPC: call counters, in-flight gauges, + // and handler-latency histograms, labelled by method and gRPC code. + // Provided by go-grpc-middleware's prometheus adapter; the metric + // families are emitted on the /metrics endpoint alongside ours. + grpcMetrics := grpcprom.NewServerMetrics( + grpcprom.WithServerHandlingTimeHistogram(), + ) lis, err := net.Listen("tcp", *grpcAddr) if err != nil { return fmt.Errorf("listen %s: %w", *grpcAddr, err) } - srv := grpc.NewServer() + srv := grpc.NewServer( + grpc.UnaryInterceptor(grpcMetrics.UnaryServerInterceptor()), + grpc.StreamInterceptor(grpcMetrics.StreamServerInterceptor()), + ) maglevServer := grpcapi.NewServer(ctx, chkr, logBroadcaster, *configPath, vppClient) grpcapi.RegisterMaglevServer(srv, maglevServer) if *enableReflection { reflection.Register(srv) } + // Pre-register every method with 0 so the metric shows up on first + // scrape even before any RPC has been received. + grpcMetrics.InitializeMetrics(srv) slog.Info("grpc-listening", "addr", *grpcAddr, "reflection", *enableReflection) go func() { @@ -130,7 +144,14 @@ func run() error { // ---- Prometheus metrics ------------------------------------------------- if *metricsAddr != "" { reg := prometheus.DefaultRegisterer - metrics.Register(reg, chkr) + // vppClient may be nil when VPP integration is disabled; the + // collector handles that by skipping the vpp_* gauges. 
+ var vppSrc metrics.VPPSource + if vppClient != nil { + vppSrc = vppClient + } + metrics.Register(reg, chkr, vppSrc) + reg.MustRegister(grpcMetrics) mux := http.NewServeMux() mux.Handle("/metrics", promhttp.Handler()) diff --git a/docs/config-guide.md b/docs/config-guide.md index 559e3f3..a5929b2 100644 --- a/docs/config-guide.md +++ b/docs/config-guide.md @@ -239,8 +239,10 @@ multiple frontends. * ***address***: Required. The IPv4 or IPv6 address of this backend server. * ***healthcheck***: The name of a health check defined in the `healthchecks` section. - When empty or omitted, no probing is performed and the backend is assumed permanently - healthy. This is useful for backends that are always available or managed by other means. + When empty or omitted, the backend is static: no probing is performed and the backend + enters `StateUp` immediately on startup (via a synthetic pass, rise/fall forced to 1/1). + This is useful for backends that are always available or managed by other means. See + [healthchecks.md](healthchecks.md) for details on the static-backend behavior. * ***enabled***: A boolean controlling whether this backend participates in any frontend. When `false`, the backend is excluded entirely and no probe goroutine is started. Defaults to `true`. @@ -280,8 +282,12 @@ ordered list of backend pools. The gRPC API exposes frontends by name. frontend port is independent of the healthcheck port: a frontend on port 443 may use a healthcheck that probes port 80. * ***pools***: Required. A non-empty ordered list of pool objects. Pools express priority: - the first pool is preferred; subsequent pools act as fallbacks. All backends across all - pools in a frontend must have addresses of the same address family (all IPv4 or all IPv6). + the first pool is preferred; subsequent pools act as fallbacks. 
When every backend in + pool[0] leaves `StateUp` (down, paused, disabled, or not yet probed), pool[1] is + automatically promoted — its up backends take over serving traffic. The promotion + cascades across further tiers. See [healthchecks.md](healthchecks.md#pool-failover) + for the full failover semantics. All backends across all pools in a frontend must + have addresses of the same address family (all IPv4 or all IPv6). Each pool has: diff --git a/docs/healthchecks.md b/docs/healthchecks.md index 7b459dd..8474d9e 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -187,6 +187,55 @@ goroutine is not restarted and no transition event is emitted. --- +## Static (no-healthcheck) backends + +A backend with no `healthcheck` field in YAML skips the probe loop entirely. +Instead of actually probing, `maglevd` synthesises a single passing result +on startup. Specifically: + +- The worker's rise/fall counters are forced to `1/1`, so a single synthetic + pass is enough to reach `StateUp`. +- The first "probe" fires immediately (zero sleep). Subsequent iterations + idle at 30 seconds — there is nothing to do. +- The backend reaches `up` within milliseconds of startup. + +Static backends are useful for administrative VIPs where the caller knows the +backend is always available, or for test configurations where deterministic +state is more valuable than real health signals. + +--- + +## Pool failover + +Every frontend has one or more pools. The pools are priority tiers: pool[0] +is the primary, pool[1] is the first fallback, pool[2] the next, and so on. +At any moment, `maglevd` computes an **active pool** — the first pool that +contains at least one backend in `StateUp`: + +- As long as pool[0] has any up backend, it stays active. Its up backends + receive traffic at their configured weights; backends in lower-priority + pools stay on standby with effective weight 0. 
+- When pool[0] has zero up backends (all down, paused, disabled, or still + unknown), pool[1] is promoted: its up backends get their configured + weights, and pool[0] backends stay at 0 until at least one recovers. +- The same rule cascades to pool[2], pool[3], etc., for further fallback + tiers. +- When no pool has any up backend, every backend's effective weight is 0 + and the VIP serves nothing. + +Failover is evaluated on every backend state transition and also on the +periodic VPP drift reconciliation (every `maglev.vpp.lb.sync-interval`). +The resulting effective weight for each backend can be inspected via +`maglevc show frontends <name>` — each pool backend row shows both the +configured weight and the effective weight after failover. + +Demotion on recovery (e.g. pool[1] → standby when pool[0] comes back up) +drains gracefully: the demoted backends have their weight set to 0 but +existing flows in the VPP flow table are left to drain naturally. The only +state that forces immediate flow-table flushing is operator `disable`. + +--- + ## Log lines All state changes produce a structured log line at `INFO` level: diff --git a/docs/user-guide.md b/docs/user-guide.md index 3f4c93b..a5d3c48 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -41,8 +41,47 @@ special capabilities. All log output is written to stdout as JSON using Go's `log/slog`. The first line logged after the logger is configured is a `starting` record that includes `version`, `commit`, and `date`. Every state change emits a `backend-transition` -line at `INFO` level. Set `--log-level debug` to see individual probe attempts -and their outcomes. +line at `INFO` level. Set `--log-level debug` to see individual probe attempts, +every VPP binary-API call (`vpp-api-send` / `vpp-api-recv` with full payload), +and the per-VIP sync operations (`vpp-lbsync-vip-add`, `vpp-lbsync-as-weight`, +etc.) as they happen.
+ +### Prometheus metrics + +`maglevd` exposes Prometheus metrics on `--metrics-addr` (default `:9091`) at +the `/metrics` path. Metric families: + +**Health-check and backend state (gauges, on-demand):** +| Metric | Labels | Description | +|---|---|---| +| `maglev_backend_state` | `backend`, `address`, `healthcheck`, `state` | 1 for the current state row per backend, 0 otherwise. | +| `maglev_backend_health` | `backend` | Current rise/fall counter value. | +| `maglev_backend_enabled` | `backend` | 1 if enabled, 0 if disabled. | +| `maglev_frontend_pool_backend_weight` | `frontend`, `pool`, `backend` | Configured weight from YAML. | + +**Probe counters and latency (inline):** +| Metric | Labels | Description | +|---|---|---| +| `maglev_probe_total` | `backend`, `type`, `result`, `code` | Probes executed. `result` is `success` or `failure`. | +| `maglev_probe_duration_seconds` | `backend`, `type` | Histogram of probe wall time. | +| `maglev_backend_transitions_total` | `backend`, `from`, `to` | State machine transitions. | + +**VPP integration (when enabled):** +| Metric | Labels | Description | +|---|---|---| +| `maglev_vpp_connected` | — | 1 if maglevd currently has a live VPP connection. | +| `maglev_vpp_uptime_seconds` | — | Seconds since VPP started (from `/sys/boottime`). | +| `maglev_vpp_connected_seconds` | — | Seconds since maglevd established the current VPP connection. | +| `maglev_vpp_info` | `version`, `build_date`, `pid` | Static VPP build metadata; always 1. | +| `maglev_vpp_api_total` | `msg`, `direction`, `result` | VPP binary-API calls. `direction` is `send` or `recv`; `result` is `success` or `failure`. | +| `maglev_vpp_lbsync_total` | `scope`, `kind` | Per-mutation sync counters. `scope` is `all` or `vip`; `kind` is one of `vip_added`, `vip_removed`, `as_added`, `as_removed`, `as_weight_updated`. 
| + +**gRPC server (standard `go-grpc-middleware/prometheus` metrics):** +`grpc_server_started_total`, `grpc_server_handled_total`, +`grpc_server_msg_received_total`, `grpc_server_msg_sent_total`, and +`grpc_server_handling_seconds` — all labelled by `grpc_service`, +`grpc_method`, and `grpc_type`; `grpc_server_handled_total` also carries `grpc_code`. Every method is +pre-registered at zero so time series exist on the first scrape. --- @@ -74,8 +113,12 @@ show version Print build version, commit hash, and build dat show frontends [<name>] Without name: list all frontend names. With name: show address, protocol, port, description, - and pools. Each pool lists its backends with weights - (if != 100) and marks disabled backends with [disabled]. + and pools. Each pool lists its backends with two + weight columns: + weight — configured weight from the YAML + effective — state-aware weight after pool failover + (what gets programmed into VPP) + Disabled backends are marked with [disabled]. show backends [<name>] Without name: list all backend names. With name: show address, current state (with duration), @@ -104,13 +147,16 @@ sync vpp lbstate [<name>] Reconcile the VPP load-balancer dataplane from drift, and once on startup. set backend <name> pause Stop health checking for a backend. Cancels the probe - goroutine so no further traffic is sent, and freezes - the state at whatever it was when paused. + goroutine so no further traffic is sent, and sets the + state to 'paused'. The backend's transition history is + preserved, so 'show backend <name>' still shows where + it came from. set backend <name> resume Resume health checking. A fresh probe goroutine is started and the backend re-enters unknown state. -set backend <name> disable Stop probing entirely and remove the backend from rotation. - The backend remains visible (state: disabled) and can be - re-enabled without reloading configuration. +set backend <name> disable Stop probing entirely and remove the backend from + rotation.
The backend remains visible (state: disabled) + with its transition history intact and can be re-enabled + without reloading configuration. set backend <name> enable Re-enable a disabled backend. A fresh probe goroutine is started and the backend re-enters unknown state. diff --git a/go.mod b/go.mod index 66b89d7..aa2a5a2 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,8 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/ftrvxmtrx/fd v0.0.0-20150925145434-c6d800382fff // indirect + github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.1.0 // indirect + github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 // indirect github.com/kr/text v0.2.0 // indirect github.com/lunixbochs/struc v0.0.0-20200521075829-a4cb8d33dbbe // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect diff --git a/go.sum b/go.sum index a2dbf33..a7719e1 100644 --- a/go.sum +++ b/go.sum @@ -26,6 +26,10 @@ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.1.0 h1:QGLs/O40yoNK9vmy4rhUGBVyMf1lISBGtXRpsu/Qu/o= +github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.1.0/go.mod h1:hM2alZsMUni80N33RBe6J0e423LB+odMj7d3EMP9l20= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 h1:pRhl55Yx1eC7BZ1N+BBWwnKaMyD8uC+34TLdndZMAKk= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0/go.mod h1:XKMd7iuf/RGPSMJ/U4HP0zS2Z9Fh8Ps9a+6X26m/tmI= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1
h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 8c0119e..c6f43b2 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -2,13 +2,17 @@ // Package metrics exposes Prometheus metrics for maglevd. // -// Gauge-type metrics (backend state, health counter, weights) are collected -// on demand when Prometheus scrapes /metrics via the Collector. Counter and -// histogram metrics (probe totals, probe duration, transitions) are updated -// inline from the probe loop. +// Gauge-type metrics (backend state, health counter, weights, VPP connection +// info) are collected on demand when Prometheus scrapes /metrics via the +// Collector. Counter and histogram metrics (probe totals, probe duration, +// transitions, VPP API calls, LB sync operations) are updated inline from +// the probe loop and VPP sync paths. package metrics import ( + "fmt" + "time" + "git.ipng.ch/ipng/vpp-maglev/internal/config" "git.ipng.ch/ipng/vpp-maglev/internal/health" @@ -30,6 +34,25 @@ type StateSource interface { GetFrontend(name string) (config.Frontend, bool) } +// VPPInfo mirrors vpp.Info so the metrics package doesn't need to import +// internal/vpp (which would create an import cycle — vpp imports metrics +// to update counters inline). +type VPPInfo struct { + Version string + BuildDate string + PID uint32 + BootTime time.Time + ConnectedSince time.Time +} + +// VPPSource provides read-only access to the VPP client's state. vpp.Client +// satisfies this interface directly by returning the metrics-local VPPInfo, so +// the metrics package stays decoupled from the vpp package's concrete types.
+type VPPSource interface { + IsConnected() bool + VPPInfo() (VPPInfo, bool) +} + // ---- inline metrics (updated per probe) ------------------------------------ var ( @@ -54,6 +77,27 @@ var ( Name: "transitions_total", Help: "Total number of backend state transitions.", }, []string{"backend", "from", "to"}) + + // ---- VPP API counters --------------------------------------------------- + + VPPAPITotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "maglev", + Subsystem: "vpp_api", + Name: "total", + Help: "Total number of VPP binary-API messages sent to or received from VPP.", + }, []string{"msg", "direction", "result"}) + + // ---- LB sync counters --------------------------------------------------- + + // LBSyncTotal counts individual dataplane mutations performed by the + // sync path. kind ∈ {vip_added, vip_removed, as_added, as_removed, + // as_weight_updated}; scope ∈ {all, vip}. + LBSyncTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "maglev", + Subsystem: "vpp_lbsync", + Name: "total", + Help: "Total number of VPP load-balancer sync operations applied to the dataplane.", + }, []string{"scope", "kind"}) ) // ---- collector (scraped on demand) ----------------------------------------- @@ -63,17 +107,26 @@ var ( // removed by a config reload. type Collector struct { src StateSource + vpp VPPSource // optional; nil when VPP integration is disabled backendState *prometheus.Desc backendHealth *prometheus.Desc backendEnabled *prometheus.Desc poolWeight *prometheus.Desc + + vppConnected *prometheus.Desc + vppUptimeSeconds *prometheus.Desc + vppConnectedFor *prometheus.Desc + vppInfo *prometheus.Desc } -// NewCollector creates a Collector backed by the given StateSource. -func NewCollector(src StateSource) *Collector { +// NewCollector creates a Collector backed by the given StateSource. vpp may +// be nil when VPP integration is disabled; in that case vpp_* metrics are +// simply not emitted. 
+func NewCollector(src StateSource, vpp VPPSource) *Collector { return &Collector{ src: src, + vpp: vpp, backendState: prometheus.NewDesc( "maglev_backend_state", "Current backend state (1 = active for the given state label).", @@ -94,6 +147,26 @@ func NewCollector(src StateSource) *Collector { "Configured weight of a backend in a frontend pool (0-100).", []string{"frontend", "pool", "backend"}, nil, ), + vppConnected: prometheus.NewDesc( + "maglev_vpp_connected", + "Whether maglevd currently has an established connection to VPP (1) or not (0).", + nil, nil, + ), + vppUptimeSeconds: prometheus.NewDesc( + "maglev_vpp_uptime_seconds", + "Seconds since VPP started (from the /sys/boottime stats counter).", + nil, nil, + ), + vppConnectedFor: prometheus.NewDesc( + "maglev_vpp_connected_seconds", + "Seconds since maglevd established the current VPP connection.", + nil, nil, + ), + vppInfo: prometheus.NewDesc( + "maglev_vpp_info", + "Static VPP build information. Always 1; metadata is conveyed via labels.", + []string{"version", "build_date", "pid"}, nil, + ), } } @@ -103,6 +176,10 @@ func (c *Collector) Describe(ch chan<- *prometheus.Desc) { ch <- c.backendHealth ch <- c.backendEnabled ch <- c.poolWeight + ch <- c.vppConnected + ch <- c.vppUptimeSeconds + ch <- c.vppConnectedFor + ch <- c.vppInfo } // Collect implements prometheus.Collector. 
@@ -163,14 +240,48 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) { } } } + + // ---- VPP gauges ------------------------------------------------------- + if c.vpp == nil { + return + } + connected := 0.0 + if c.vpp.IsConnected() { + connected = 1.0 + } + ch <- prometheus.MustNewConstMetric(c.vppConnected, prometheus.GaugeValue, connected) + + info, ok := c.vpp.VPPInfo() + if !ok { + return + } + if !info.BootTime.IsZero() { + ch <- prometheus.MustNewConstMetric( + c.vppUptimeSeconds, prometheus.GaugeValue, + time.Since(info.BootTime).Seconds(), + ) + } + if !info.ConnectedSince.IsZero() { + ch <- prometheus.MustNewConstMetric( + c.vppConnectedFor, prometheus.GaugeValue, + time.Since(info.ConnectedSince).Seconds(), + ) + } + ch <- prometheus.MustNewConstMetric( + c.vppInfo, prometheus.GaugeValue, 1.0, + info.Version, info.BuildDate, fmt.Sprintf("%d", info.PID), + ) } -// Register registers all metrics with the given registry. -func Register(reg prometheus.Registerer, src StateSource) *Collector { - coll := NewCollector(src) +// Register registers all metrics with the given registry. vpp may be nil +// to disable VPP-related metrics. +func Register(reg prometheus.Registerer, src StateSource, vpp VPPSource) *Collector { + coll := NewCollector(src, vpp) reg.MustRegister(coll) reg.MustRegister(ProbeTotal) reg.MustRegister(ProbeDuration) reg.MustRegister(TransitionTotal) + reg.MustRegister(VPPAPITotal) + reg.MustRegister(LBSyncTotal) return coll } diff --git a/internal/vpp/apilog.go b/internal/vpp/apilog.go index a648f8c..b697095 100644 --- a/internal/vpp/apilog.go +++ b/internal/vpp/apilog.go @@ -7,6 +7,8 @@ import ( "log/slog" "go.fd.io/govpp/api" + + "git.ipng.ch/ipng/vpp-maglev/internal/metrics" ) // loggedChannel wraps an api.Channel so that every VPP request/reply is @@ -38,27 +40,31 @@ func (lc *loggedChannel) Close() { lc.ch.Close() } // SendRequest logs the outgoing message and returns a wrapped request context. 
func (lc *loggedChannel) SendRequest(msg api.Message) *loggedRequestCtx { + name := msg.GetMessageName() slog.Debug("vpp-api-send", - "msg", msg.GetMessageName(), + "msg", name, "crc", msg.GetCrcString(), "payload", fmt.Sprintf("%+v", msg), ) + metrics.VPPAPITotal.WithLabelValues(name, "send", "success").Inc() return &loggedRequestCtx{ ctx: lc.ch.SendRequest(msg), - name: msg.GetMessageName(), + name: name, } } // SendMultiRequest logs the outgoing message and returns a wrapped multi-request context. func (lc *loggedChannel) SendMultiRequest(msg api.Message) *loggedMultiRequestCtx { + name := msg.GetMessageName() slog.Debug("vpp-api-send-multi", - "msg", msg.GetMessageName(), + "msg", name, "crc", msg.GetCrcString(), "payload", fmt.Sprintf("%+v", msg), ) + metrics.VPPAPITotal.WithLabelValues(name, "send", "success").Inc() return &loggedMultiRequestCtx{ ctx: lc.ch.SendMultiRequest(msg), - name: msg.GetMessageName(), + name: name, } } @@ -76,6 +82,7 @@ func (r *loggedRequestCtx) ReceiveReply(msg api.Message) error { "reply", msg.GetMessageName(), "err", err, ) + metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "failure").Inc() return err } slog.Debug("vpp-api-recv", @@ -83,6 +90,7 @@ func (r *loggedRequestCtx) ReceiveReply(msg api.Message) error { "reply", msg.GetMessageName(), "payload", fmt.Sprintf("%+v", msg), ) + metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "success").Inc() return nil } @@ -102,6 +110,7 @@ func (r *loggedMultiRequestCtx) ReceiveReply(msg api.Message) (bool, error) { "seq", r.seq, "err", err, ) + metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "failure").Inc() return stop, err } if stop { @@ -117,6 +126,7 @@ func (r *loggedMultiRequestCtx) ReceiveReply(msg api.Message) (bool, error) { "seq", r.seq, "payload", fmt.Sprintf("%+v", msg), ) + metrics.VPPAPITotal.WithLabelValues(r.name, "recv", "success").Inc() r.seq++ return stop, nil } diff --git a/internal/vpp/client.go b/internal/vpp/client.go index 9166a01..6507c61 100644 --- 
a/internal/vpp/client.go +++ b/internal/vpp/client.go @@ -19,6 +19,7 @@ import ( "git.ipng.ch/ipng/vpp-maglev/internal/config" "git.ipng.ch/ipng/vpp-maglev/internal/health" + "git.ipng.ch/ipng/vpp-maglev/internal/metrics" lb "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb" ) @@ -216,6 +217,25 @@ func (c *Client) GetInfo() (Info, error) { return c.info, nil } +// VPPInfo satisfies metrics.VPPSource. It returns a copy of the cached +// connection info as a metrics-local struct so the metrics package doesn't +// need to import internal/vpp. Second return is false when VPP is not +// connected (the collector then skips every vpp_* gauge except maglev_vpp_connected). +func (c *Client) VPPInfo() (metrics.VPPInfo, bool) { + c.mu.Lock() + defer c.mu.Unlock() + if c.apiConn == nil { + return metrics.VPPInfo{}, false + } + return metrics.VPPInfo{ + Version: c.info.Version, + BuildDate: c.info.BuildDate, + PID: c.info.PID, + BootTime: c.info.BootTime, + ConnectedSince: c.info.ConnectedSince, + }, true +} + // connect establishes both API and stats connections. If either fails, // both are torn down. func (c *Client) connect() error { diff --git a/internal/vpp/lbsync.go b/internal/vpp/lbsync.go index 55f25f0..8d9c535 100644 --- a/internal/vpp/lbsync.go +++ b/internal/vpp/lbsync.go @@ -10,6 +10,7 @@ import ( "git.ipng.ch/ipng/vpp-maglev/internal/config" "git.ipng.ch/ipng/vpp-maglev/internal/health" + "git.ipng.ch/ipng/vpp-maglev/internal/metrics" ip_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/ip_types" lb "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb" lb_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb_types" @@ -50,6 +51,28 @@ type syncStats struct { asWeight int } +// recordSyncStats increments the Prometheus lbsync counters for one sync +// run. scope is "all" for SyncLBStateAll and "vip" for SyncLBStateVIP. +// Zero-valued kinds emit no increment (the counter stays at its previous +// value for that label set).
+func recordSyncStats(scope string, st *syncStats) { + if st.vipAdd > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "vip_added").Add(float64(st.vipAdd)) + } + if st.vipDel > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "vip_removed").Add(float64(st.vipDel)) + } + if st.asAdd > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "as_added").Add(float64(st.asAdd)) + } + if st.asDel > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "as_removed").Add(float64(st.asDel)) + } + if st.asWeight > 0 { + metrics.LBSyncTotal.WithLabelValues(scope, "as_weight_updated").Add(float64(st.asWeight)) + } +} + // SyncLBStateAll reconciles the full VPP load-balancer state with the given // config. For every frontend in cfg: // - if the VIP does not exist in VPP, create it; @@ -118,6 +141,7 @@ func (c *Client) SyncLBStateAll(cfg *config.Config) error { } } + recordSyncStats("all", &st) slog.Info("vpp-lbsync-done", "scope", "all", "vip-added", st.vipAdd, @@ -169,6 +193,7 @@ func (c *Client) SyncLBStateVIP(cfg *config.Config, feName string) error { if err := reconcileVIP(ch, d, cur, &st); err != nil { return err } + recordSyncStats("vip", &st) slog.Info("vpp-lbsync-done", "scope", "vip", "frontend", feName,