From 9a3c5c5dc0bd25d1ca451e8eca6b1f665e3e90fd Mon Sep 17 00:00:00 2001 From: Pim van Pelt Date: Sun, 19 Apr 2026 20:39:07 +0200 Subject: [PATCH] checker: fix ResumeBackend leaking goroutine on non-paused backend; v1.0.2 Calling ResumeBackend on a backend that wasn't actually paused (state != StatePaused) would overwrite w.cancel and spawn a fresh probe goroutine without cancelling the old one, leaving two probe loops running for the same backend until process exit. The guard now mirrors EnableBackend's early-return on a non-target state. --- Makefile | 2 +- internal/checker/checker.go | 8 ++++++ internal/checker/checker_test.go | 42 ++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4e31388..1548b2d 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ FRONTEND_WEB_SRC := $(shell find cmd/frontend/web/src -type f 2>/dev/null) \ FRONTEND_WEB_DIST := cmd/frontend/web/dist/index.html NATIVE_ARCH := $(shell go env GOARCH) -VERSION := 1.0.1 +VERSION := 1.0.2 COMMIT_HASH := $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE := $(shell date -u +%Y-%m-%dT%H:%M:%SZ) LDFLAGS := -X '$(MODULE)/cmd.version=$(VERSION)' \ diff --git a/internal/checker/checker.go b/internal/checker/checker.go index de89fb8..5871ca0 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -416,6 +416,11 @@ func (c *Checker) PauseBackend(name string) (BackendSnapshot, error) { // goroutine is started and the backend re-enters StateUnknown. The existing // transition history is preserved. // Returns an error if the backend is not found or is disabled. +// +// No-ops when the backend isn't paused: PauseBackend is what cancels the +// probe goroutine, so if we started a fresh one here unconditionally we'd +// overwrite w.cancel with a new cancel func and orphan the old goroutine — +// two probe loops would then run for the same backend until process exit. func (c *Checker) ResumeBackend(name string) (BackendSnapshot, error) { c.mu.Lock() defer c.mu.Unlock() @@ -426,6 +431,9 @@ func (c *Checker) ResumeBackend(name string) (BackendSnapshot, error) { if !w.entry.Enabled { return BackendSnapshot{}, fmt.Errorf("backend %q is disabled; enable it first", name) } + if w.backend.State != health.StatePaused { + return BackendSnapshot{Health: w.backend, Config: w.entry}, nil + } maxHistory := c.cfg.HealthChecker.TransitionHistory if w.backend.Resume(maxHistory) { t := w.backend.Transitions[0] diff --git a/internal/checker/checker_test.go b/internal/checker/checker_test.go index ed22a2f..677972a 100644 --- a/internal/checker/checker_test.go +++ b/internal/checker/checker_test.go @@ -385,3 +385,45 @@ func TestPauseResume(t *testing.T) { t.Error("ResumeBackend on disabled backend: expected error") } } + +// TestResumeNonPausedIsNoOp guards against a regression where ResumeBackend +// on a running (non-paused) backend would overwrite w.cancel and spawn an +// extra probe goroutine, orphaning the original one. The worker's wakeCh +// must stay identical — a new one would indicate the goroutine-start branch +// ran. +func TestResumeNonPausedIsNoOp(t *testing.T) { + cfg := makeTestConfig(time.Hour, 3, 2) + c := New(cfg) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go c.fanOut(ctx) + + origWakeCh := make(chan struct{}, 1) + c.mu.Lock() + c.runCtx = ctx + _, wCancel := context.WithCancel(ctx) + c.workers["be0"] = &worker{ + backend: health.New("be0", net.ParseIP("10.0.0.2"), 2, 3), + hc: cfg.HealthChecks["icmp"], + entry: cfg.Backends["be0"], + cancel: wCancel, + wakeCh: origWakeCh, + } + c.mu.Unlock() + + b, err := c.ResumeBackend("be0") + if err != nil { + t.Fatalf("ResumeBackend: %v", err) + } + if b.Health.State != health.StateUnknown { + t.Errorf("state=%s, want unknown (resume on non-paused should not transition)", b.Health.State) + } + + c.mu.RLock() + gotWakeCh := c.workers["be0"].wakeCh + c.mu.RUnlock() + if gotWakeCh != origWakeCh { + t.Error("wakeCh was replaced — ResumeBackend spawned a spurious goroutine") + } +}