Fix pause to cancel probe goroutine; add Robot Framework integration tests

Pause semantics
- PauseBackend now cancels the probe goroutine so no HTTP/TCP/ICMP
  traffic is sent while the backend is paused. Previously the goroutine
  kept running and results were silently discarded.
- ResumeBackend launches a fresh probe goroutine on the existing worker,
  preserving transition history. The backend re-enters the unknown
  state (StateUnknown).

Integration tests (tests/01-maglevd/)
- Containerlab topology with 3 nginx:alpine backends on a dedicated
  management network (172.20.30.0/24) with static IPs.
- maglevd config with a 200ms HTTP health-check interval (rise=2,
  fall=2) for fast test convergence.
- 8 test cases: deploy lab, start maglevd, all backends reach up,
  nginx logs confirm probes arriving, pause stops probes (probe count
  stable), resume restarts probes, disable stops probes, enable
  restarts probes.

VPP dataplane test (tests/02-vpp-lb/)
- Rewrite 01-e2e-lab.robot to match the actual single-VPP topology:
  test client-to-server ping through VPP bridge domains and verify
  nginx is serving on all app servers. The previous version referenced
  a non-existent topology file and tested OSPF/BFD between two VPP
  nodes that don't exist in this lab.

Build infrastructure
- Add 'make robot-test' target with TEST= for suite selection.
- Add tests/.venv target for Robot Framework virtualenv.
- Make IMAGE optional in rf-run.sh.
- Add .gitignore entries for test output, venv, logs, and clab state.
This commit is contained in:
2026-04-11 20:16:22 +02:00
parent 3bd30b69f4
commit 8bde00eb61
20 changed files with 519 additions and 7 deletions

View File

@@ -267,7 +267,10 @@ func (c *Checker) GetBackend(name string) (BackendSnapshot, bool) {
return BackendSnapshot{Health: w.backend, Config: w.entry}, true
}
// PauseBackend pauses health checking for a backend by name.
// PauseBackend pauses health checking for a backend by name. The probe
// goroutine is cancelled so no further traffic is sent to the backend. The
// backend's state is set to paused and remains frozen until ResumeBackend is
// called (which starts a fresh probe goroutine).
func (c *Checker) PauseBackend(name string) (BackendSnapshot, bool) {
c.mu.Lock()
defer c.mu.Unlock()
@@ -284,10 +287,13 @@ func (c *Checker) PauseBackend(name string) (BackendSnapshot, bool) {
)
c.emitForBackend(name, w.backend.Address, t, c.cfg.Frontends)
}
w.cancel()
return BackendSnapshot{Health: w.backend, Config: w.entry}, true
}
// ResumeBackend resumes health checking for a backend by name.
// ResumeBackend resumes health checking for a backend by name. A fresh probe
// goroutine is started and the backend re-enters StateUnknown. The existing
// transition history is preserved.
func (c *Checker) ResumeBackend(name string) (BackendSnapshot, bool) {
c.mu.Lock()
defer c.mu.Unlock()
@@ -303,11 +309,13 @@ func (c *Checker) ResumeBackend(name string) (BackendSnapshot, bool) {
"to", t.To.String(),
)
c.emitForBackend(name, w.backend.Address, t, c.cfg.Frontends)
select {
case w.wakeCh <- struct{}{}:
default:
}
}
// Launch a fresh probe goroutine with a new cancellable context,
// keeping the existing worker and its transition history.
wCtx, cancel := context.WithCancel(c.runCtx)
w.cancel = cancel
w.wakeCh = make(chan struct{}, 1)
go c.runProbe(wCtx, name, 0, 1)
return BackendSnapshot{Health: w.backend, Config: w.entry}, true
}

View File

@@ -347,12 +347,14 @@ func TestPauseResume(t *testing.T) {
go c.fanOut(ctx)
c.mu.Lock()
c.runCtx = ctx
_, wCancel := context.WithCancel(ctx)
c.workers["be0"] = &worker{
backend: health.New("be0", net.ParseIP("10.0.0.2"), 2, 3),
hc: cfg.HealthChecks["icmp"],
entry: cfg.Backends["be0"],
cancel: wCancel,
wakeCh: make(chan struct{}, 1),
}
c.mu.Unlock()