checker: fix ResumeBackend leaking goroutine on non-paused backend; v1.0.2
Calling ResumeBackend on a backend that wasn't actually paused (state != StatePaused) would overwrite w.cancel and spawn a fresh probe goroutine without cancelling the old one, leaving two probe loops running for the same backend until process exit. The guard now mirrors EnableBackend's early-return on a non-target state.
This commit is contained in:
2
Makefile
2
Makefile
@@ -15,7 +15,7 @@ FRONTEND_WEB_SRC := $(shell find cmd/frontend/web/src -type f 2>/dev/null) \
|
|||||||
FRONTEND_WEB_DIST := cmd/frontend/web/dist/index.html
|
FRONTEND_WEB_DIST := cmd/frontend/web/dist/index.html
|
||||||
|
|
||||||
NATIVE_ARCH := $(shell go env GOARCH)
|
NATIVE_ARCH := $(shell go env GOARCH)
|
||||||
VERSION := 1.0.1
|
VERSION := 1.0.2
|
||||||
COMMIT_HASH := $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
|
COMMIT_HASH := $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
|
||||||
DATE := $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
DATE := $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||||
LDFLAGS := -X '$(MODULE)/cmd.version=$(VERSION)' \
|
LDFLAGS := -X '$(MODULE)/cmd.version=$(VERSION)' \
|
||||||
|
|||||||
@@ -416,6 +416,11 @@ func (c *Checker) PauseBackend(name string) (BackendSnapshot, error) {
|
|||||||
// goroutine is started and the backend re-enters StateUnknown. The existing
|
// goroutine is started and the backend re-enters StateUnknown. The existing
|
||||||
// transition history is preserved.
|
// transition history is preserved.
|
||||||
// Returns an error if the backend is not found or is disabled.
|
// Returns an error if the backend is not found or is disabled.
|
||||||
|
//
|
||||||
|
// No-ops when the backend isn't paused: PauseBackend is what cancels the
|
||||||
|
// probe goroutine, so if we started a fresh one here unconditionally we'd
|
||||||
|
// overwrite w.cancel with a new cancel func and orphan the old goroutine —
|
||||||
|
// two probe loops would then run for the same backend until process exit.
|
||||||
func (c *Checker) ResumeBackend(name string) (BackendSnapshot, error) {
|
func (c *Checker) ResumeBackend(name string) (BackendSnapshot, error) {
|
||||||
c.mu.Lock()
|
c.mu.Lock()
|
||||||
defer c.mu.Unlock()
|
defer c.mu.Unlock()
|
||||||
@@ -426,6 +431,9 @@ func (c *Checker) ResumeBackend(name string) (BackendSnapshot, error) {
|
|||||||
if !w.entry.Enabled {
|
if !w.entry.Enabled {
|
||||||
return BackendSnapshot{}, fmt.Errorf("backend %q is disabled; enable it first", name)
|
return BackendSnapshot{}, fmt.Errorf("backend %q is disabled; enable it first", name)
|
||||||
}
|
}
|
||||||
|
if w.backend.State != health.StatePaused {
|
||||||
|
return BackendSnapshot{Health: w.backend, Config: w.entry}, nil
|
||||||
|
}
|
||||||
maxHistory := c.cfg.HealthChecker.TransitionHistory
|
maxHistory := c.cfg.HealthChecker.TransitionHistory
|
||||||
if w.backend.Resume(maxHistory) {
|
if w.backend.Resume(maxHistory) {
|
||||||
t := w.backend.Transitions[0]
|
t := w.backend.Transitions[0]
|
||||||
|
|||||||
@@ -385,3 +385,45 @@ func TestPauseResume(t *testing.T) {
|
|||||||
t.Error("ResumeBackend on disabled backend: expected error")
|
t.Error("ResumeBackend on disabled backend: expected error")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestResumeNonPausedIsNoOp guards against a regression where ResumeBackend
|
||||||
|
// on a running (non-paused) backend would overwrite w.cancel and spawn an
|
||||||
|
// extra probe goroutine, orphaning the original one. The worker's wakeCh
|
||||||
|
// must stay identical — a new one would indicate the goroutine-start branch
|
||||||
|
// ran.
|
||||||
|
func TestResumeNonPausedIsNoOp(t *testing.T) {
|
||||||
|
cfg := makeTestConfig(time.Hour, 3, 2)
|
||||||
|
c := New(cfg)
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
go c.fanOut(ctx)
|
||||||
|
|
||||||
|
origWakeCh := make(chan struct{}, 1)
|
||||||
|
c.mu.Lock()
|
||||||
|
c.runCtx = ctx
|
||||||
|
_, wCancel := context.WithCancel(ctx)
|
||||||
|
c.workers["be0"] = &worker{
|
||||||
|
backend: health.New("be0", net.ParseIP("10.0.0.2"), 2, 3),
|
||||||
|
hc: cfg.HealthChecks["icmp"],
|
||||||
|
entry: cfg.Backends["be0"],
|
||||||
|
cancel: wCancel,
|
||||||
|
wakeCh: origWakeCh,
|
||||||
|
}
|
||||||
|
c.mu.Unlock()
|
||||||
|
|
||||||
|
b, err := c.ResumeBackend("be0")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ResumeBackend: %v", err)
|
||||||
|
}
|
||||||
|
if b.Health.State != health.StateUnknown {
|
||||||
|
t.Errorf("state=%s, want unknown (resume on non-paused should not transition)", b.Health.State)
|
||||||
|
}
|
||||||
|
|
||||||
|
c.mu.RLock()
|
||||||
|
gotWakeCh := c.workers["be0"].wakeCh
|
||||||
|
c.mu.RUnlock()
|
||||||
|
if gotWakeCh != origWakeCh {
|
||||||
|
t.Error("wakeCh was replaced — ResumeBackend spawned a spurious goroutine")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user