From 1815675fb6cd076084211ffab07c08ea33eadb00 Mon Sep 17 00:00:00 2001 From: Pim van Pelt Date: Sat, 11 Apr 2026 21:04:17 +0200 Subject: [PATCH] Distinguish disabled from removed backend state; add make fixstyle Add StateDisabled for operator-initiated disable, keeping StateRemoved for backends that disappear during a config reload. Previously both used StateRemoved, which was confusing: "removed" implies the backend no longer exists in config, but a disabled backend is still present and can be re-enabled on the fly. - health: add StateDisabled with String() "disabled", Disable() method with probe code "disabled". Record() rejects probes in all three inactive states (paused, disabled, removed). - checker: DisableBackend calls backend.Disable() instead of Remove(). - docs: healthchecks.md rewritten for pause (goroutine cancelled, not just results discarded), and separate disabled/removed state rows. user-guide.md updated to match. - Makefile: add fixstyle target (gofmt -w .). --- Makefile | 5 ++- docs/healthchecks.md | 54 +++++++++++++++++++-------- docs/user-guide.md | 2 +- internal/checker/checker.go | 2 +- internal/checker/checker_test.go | 4 +- internal/grpcapi/server_test.go | 4 +- internal/health/state.go | 22 ++++++++--- internal/health/state_test.go | 1 + internal/metrics/metrics.go | 2 +- tests/01-maglevd/01-healthcheck.robot | 2 +- 10 files changed, 68 insertions(+), 30 deletions(-) diff --git a/Makefile b/Makefile index 08b21d0..d00af87 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ LDFLAGS := -X '$(MODULE)/cmd.version=$(VERSION)' \ TEST ?= tests/ -.PHONY: all build build-amd64 build-arm64 test proto lint pkg-deb robot-test clean +.PHONY: all build build-amd64 build-arm64 test proto lint fixstyle pkg-deb robot-test clean all: build @@ -48,6 +48,9 @@ $(GEN_FILES): $(PROTO_FILE) --go-grpc_out=. --go-grpc_opt=module=$(MODULE) \ $(PROTO_FILE) +fixstyle: + gofmt -w . + lint: golangci-lint run ./... diff --git a/docs/healthchecks.md b/docs/healthchecks.md index e05f3b0..7b459dd 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -2,7 +2,7 @@ `maglevd` probes each backend independently of how many frontends reference it. Every backend runs exactly one probe goroutine. State changes are broadcast as -gRPC events to all connected `WatchBackendEvents` subscribers. +gRPC events to all connected `WatchEvents` subscribers. --- @@ -10,11 +10,12 @@ gRPC events to all connected `WatchBackendEvents` subscribers. | State | Meaning | |---|---| -| `unknown` | Initial state; also entered after a resume or backend restart. | +| `unknown` | Initial state; also entered after a resume or enable. | | `up` | Backend is healthy and eligible to receive traffic. | | `down` | Backend has failed enough consecutive probes to be considered offline. | -| `paused` | Health checking suspended by an operator. Probes fire but results are discarded. | -| `removed` | Backend was removed from configuration. No further probes are accepted. | +| `paused` | Health checking stopped by an operator. No probes are sent. | +| `disabled` | Backend disabled by an operator. No probes are sent. | +| `removed` | Backend removed from configuration by a reload. No probes are sent. | --- @@ -41,9 +42,9 @@ without bouncing between up and down. ### Expedited unknown resolution -When a backend enters `unknown` state (new, restarted, or resumed) its counter -is pre-loaded to `rise − 1`. This means a single probe result is enough to -resolve the state: +When a backend enters `unknown` state (new, restarted, resumed, or re-enabled) +its counter is pre-loaded to `rise − 1`. This means a single probe result is +enough to resolve the state: - **1 pass** → `up` - **1 fail** → `down` (also via the special unknown shortcut below) @@ -92,7 +93,7 @@ that are known to be offline. ## Transition events Every state change is logged as `backend-transition` and emitted as a gRPC -`BackendEvent` to all active `WatchBackendEvents` streams. +`BackendEvent` to all active `WatchEvents` streams. ### Backend added (config load or reload) @@ -100,7 +101,7 @@ Every state change is logged as `backend-transition` and emitted as a gRPC unknown → unknown (code: start) ``` -The counter is pre-loaded to `rise − 1`. The first probe fires immediately at +The counter is pre-loaded to `rise − 1`. The first probe fires after `fast-interval` (or `interval` if not configured). One pass produces `unknown → up`; one fail produces `unknown → down`. @@ -127,8 +128,9 @@ If multiple backends start together they are staggered across the first → paused (operator action) ``` -The counter is reset to 0. Probes continue to fire on their normal schedule but -all results are discarded. The backend stays `paused` until explicitly resumed. +The counter is reset to 0. The probe goroutine is cancelled — no further +probes are sent and no traffic reaches the backend while it is paused. The +backend stays `paused` until explicitly resumed. ### Resume @@ -136,9 +138,31 @@ all results are discarded. The backend stays `paused` until explicitly resumed. paused → unknown (operator action) ``` -The counter is reset to `rise − 1`. The probe goroutine is woken immediately -(no wait for the next scheduled probe). One subsequent pass produces `unknown → -up`; one fail produces `unknown → down`. +The counter is reset to `rise − 1`. A fresh probe goroutine is started, +which fires its first probe after `fast-interval` (or `interval` if not +configured). One pass produces `unknown → up`; one fail produces `unknown → +down`. + +### Disable + +``` + → disabled (operator action) +``` + +The probe goroutine is cancelled and the backend is marked `enabled: false`. +No further probes are sent. The backend remains visible via the gRPC API (state +`disabled`) and can be re-enabled without a config reload. + +### Enable + +``` +disabled → unknown (operator action, via fresh goroutine) +``` + +A new probe goroutine is started and the backend re-enters `unknown` with the +counter pre-loaded to `rise − 1`. The `enabled` flag is set back to `true`. +The first probe fires after `fast-interval` and resolves state as described +under *Backend added*. ### Backend removed (config reload) @@ -175,4 +199,4 @@ All state changes produce a structured log line at `INFO` level: Probe-driven transitions also carry `code` and `detail` fields from the probe result (e.g. `L4CON`, `L7STS`, `connection refused`). Operator-driven -transitions (pause, resume) carry empty code and detail. +transitions (pause, resume, disable, enable) carry empty code and detail. diff --git a/docs/user-guide.md b/docs/user-guide.md index 7dd03c9..677d812 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -87,7 +87,7 @@ set backend pause Suspend health checking for a backend, freezing set backend resume Resume health checking; backend re-enters unknown state and is probed immediately. set backend disable Stop probing entirely and remove the backend from rotation. - The backend remains visible (state: removed) and can be + The backend remains visible (state: disabled) and can be re-enabled without reloading configuration. set backend enable Re-enable a disabled backend. A fresh probe goroutine is started and the backend re-enters unknown state. diff --git a/internal/checker/checker.go b/internal/checker/checker.go index 41f4a62..e00c915 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -350,7 +350,7 @@ func (c *Checker) DisableBackend(name string) (BackendSnapshot, bool) { return BackendSnapshot{Health: w.backend, Config: w.entry}, true } maxHistory := c.cfg.HealthChecker.TransitionHistory - t := w.backend.Remove(maxHistory) + t := w.backend.Disable(maxHistory) slog.Info("backend-disable", "backend", name) c.emitForBackend(name, w.backend.Address, t, c.cfg.Frontends) w.cancel() diff --git a/internal/checker/checker_test.go b/internal/checker/checker_test.go index 05fb889..d15b992 100644 --- a/internal/checker/checker_test.go +++ b/internal/checker/checker_test.go @@ -310,8 +310,8 @@ func TestEnableDisable(t *testing.T) { if !ok { t.Fatal("DisableBackend: not found") } - if b.Health.State != health.StateRemoved { - t.Errorf("after disable: state=%s, want removed", b.Health.State) + if b.Health.State != health.StateDisabled { + t.Errorf("after disable: state=%s, want disabled", b.Health.State) } if b.Config.Enabled { t.Error("after disable: Enabled should be false") diff --git a/internal/grpcapi/server_test.go b/internal/grpcapi/server_test.go index 4ca0007..b0cdcfb 100644 --- a/internal/grpcapi/server_test.go +++ b/internal/grpcapi/server_test.go @@ -268,8 +268,8 @@ func TestEnableDisableBackend(t *testing.T) { if err != nil { t.Fatalf("DisableBackend: %v", err) } - if info.State != "removed" { - t.Errorf("after disable: got %q, want removed", info.State) + if info.State != "disabled" { + t.Errorf("after disable: got %q, want disabled", info.State) } if info.Enabled { t.Error("after disable: Enabled should be false") diff --git a/internal/health/state.go b/internal/health/state.go index bd0ecc7..871fedd 100644 --- a/internal/health/state.go +++ b/internal/health/state.go @@ -29,11 +29,12 @@ type ProbeResult struct { type State int const ( - StateUnknown State = iota // initial state before first probe - StateUp - StateDown - StatePaused - StateRemoved // backend was removed from configuration + StateUnknown State = iota // initial state before first probe + StateUp // backend is healthy + StateDown // backend has failed enough probes + StatePaused // operator paused health checking + StateDisabled // operator disabled the backend + StateRemoved // backend removed from configuration by reload ) func (s State) String() string { @@ -46,6 +47,8 @@ func (s State) String() string { return "down" case StatePaused: return "paused" + case StateDisabled: + return "disabled" case StateRemoved: return "removed" default: @@ -123,7 +126,7 @@ func New(name string, addr net.IP, rise, fall int) *Backend { // failure means the backend is not yet confirmed reachable), and to StateUp // once the counter reaches Rise consecutive passes. func (b *Backend) Record(r ProbeResult, maxHistory int) bool { - if b.State == StatePaused || b.State == StateRemoved { + if b.State == StatePaused || b.State == StateDisabled || b.State == StateRemoved { return false } if r.OK { @@ -196,6 +199,13 @@ func (b *Backend) Start(maxHistory int) Transition { return b.Transitions[0] } +// Disable transitions the backend to StateDisabled. Returns the transition. +// After this call no further probe results are accepted. +func (b *Backend) Disable(maxHistory int) Transition { + b.transition(StateDisabled, ProbeResult{Code: "disabled"}, maxHistory) + return b.Transitions[0] +} + // Remove transitions the backend to StateRemoved. Returns the transition. // After this call no further probe results are accepted. func (b *Backend) Remove(maxHistory int) Transition { diff --git a/internal/health/state_test.go b/internal/health/state_test.go index ce5717c..38f2822 100644 --- a/internal/health/state_test.go +++ b/internal/health/state_test.go @@ -333,6 +333,7 @@ func TestStateString(t *testing.T) { {StateUp, "up"}, {StateDown, "down"}, {StatePaused, "paused"}, + {StateDisabled, "disabled"}, {StateRemoved, "removed"}, } for _, c := range cases { diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index b2e5a92..8c0119e 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -112,6 +112,7 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) { health.StateUp, health.StateDown, health.StatePaused, + health.StateDisabled, health.StateRemoved, } @@ -173,4 +174,3 @@ func Register(reg prometheus.Registerer, src StateSource) *Collector { reg.MustRegister(TransitionTotal) return coll } - diff --git a/tests/01-maglevd/01-healthcheck.robot b/tests/01-maglevd/01-healthcheck.robot index befa367..f368beb 100644 --- a/tests/01-maglevd/01-healthcheck.robot +++ b/tests/01-maglevd/01-healthcheck.robot @@ -59,7 +59,7 @@ Resume backend restarts probing Disable backend stops probing Maglevc set backend nginx2 disable - Backend Should Have State nginx2 removed + Backend Should Have State nginx2 disabled Backend Should Be Disabled nginx2 Sleep 1s ${before} = Get Probe Count nginx2