Initial revision of healthchecker, inspired by HAProxy

This commit is contained in:
2026-04-10 17:30:44 +02:00
commit b84b3274b1
24 changed files with 4400 additions and 0 deletions

View File

@@ -0,0 +1,307 @@
package health
import (
"net"
"testing"
"time"
)
// newBackend builds the default backend used by most tests in this file:
// "web4" at 10.0.0.1 with rise=2 and fall=3.
func newBackend() *Backend {
	const (
		rise = 2
		fall = 3
	)
	addr := net.ParseIP("10.0.0.1")
	return New("web4", addr, rise, fall)
}
// pass returns a successful probe result tagged with layer L7 and code "L7OK".
func pass() ProbeResult {
	return ProbeResult{
		OK:    true,
		Layer: LayerL7,
		Code:  "L7OK",
	}
}
// fail returns a failed probe result tagged with layer L4 and code "L4CON".
func fail() ProbeResult {
	return ProbeResult{
		OK:    false,
		Layer: LayerL4,
		Code:  "L4CON",
	}
}
// TestInitialState: a freshly constructed backend is Unknown, has an empty
// transition log, and a zero health counter.
func TestInitialState(t *testing.T) {
	b := newBackend()

	if got := b.State; got != StateUnknown {
		t.Errorf("initial state: got %s, want unknown", got)
	}
	if n := len(b.Transitions); n != 0 {
		t.Errorf("initial transitions: got %d, want 0", n)
	}
	if h := b.Counter.Health; h != 0 {
		t.Errorf("initial counter health: got %d, want 0", h)
	}
}
// TestRiseToUp: with rise=2, two consecutive passes take a backend that starts
// in Unknown or Down to Up; the first pass alone must not change the state.
func TestRiseToUp(t *testing.T) {
	cases := []struct {
		name  string
		start State
	}{
		{name: "from unknown", start: StateUnknown},
		{name: "from down", start: StateDown},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			b := newBackend()
			b.State = tc.start

			// Pass #1: counter=1, below rise=2, so no transition yet.
			transitioned := b.Record(pass(), 5)
			if transitioned {
				t.Error("should not transition after 1 pass (rise=2)")
			}
			if got := b.State; got != tc.start {
				t.Errorf("state changed early: got %s", got)
			}

			// Pass #2: counter reaches rise=2, which flips the backend to Up.
			transitioned = b.Record(pass(), 5)
			if !transitioned {
				t.Error("should transition to Up after 2 passes")
			}
			if got := b.State; got != StateUp {
				t.Errorf("state: got %s, want up", got)
			}
		})
	}
}
// TestFallToDown: starting from a fully healthy backend, it takes fall=3
// consecutive failures to reach Down.
//
// The fall guarantee is measured from counter=max (fully healthy). A backend
// that has only just become Up sits at counter=rise, the bottom of the UP
// range, so one failure is enough to push it back into the DOWN range. That is
// intended: hysteresis shields a server with a long healthy streak, not one
// that barely cleared the rise threshold.
func TestFallToDown(t *testing.T) {
	b := newBackend() // rise=2, fall=3, max=4
	full := b.Counter.Max()

	// Reaching fully healthy takes max passes in total (rise to get Up, then
	// max-rise more to fill the counter).
	for n := 0; n < full; n++ {
		b.Record(pass(), 5)
	}
	if got := b.State; got != StateUp {
		t.Fatalf("precondition: want up, got %s", got)
	}
	if got := b.Counter.Health; got != full {
		t.Fatalf("precondition: want counter=%d, got %d", full, got)
	}

	// The first fall-1=2 failures move the counter 4→3→2, which is still
	// inside the UP range (>= rise=2), so neither may report a transition.
	for _, msg := range []string{
		"should not transition after 1 fail from fully healthy",
		"should not transition after 2 fails from fully healthy",
	} {
		if b.Record(fail(), 5) {
			t.Error(msg)
		}
	}
	if got := b.State; got != StateUp {
		t.Errorf("state after 2 fails: got %s, want up", got)
	}

	// Failure #3 takes the counter 2→1, below rise=2, so the backend goes Down.
	transitioned := b.Record(fail(), 5)
	if !transitioned {
		t.Error("should transition to Down after fall=3 failures from fully healthy")
	}
	if got := b.State; got != StateDown {
		t.Errorf("state: got %s, want down", got)
	}
}
// TestUnknownToDownOnFirstFail: a backend that is still Unknown goes Down on
// its very first failed probe, and that probe is reported as a transition.
func TestUnknownToDownOnFirstFail(t *testing.T) {
	b := newBackend()

	transitioned := b.Record(fail(), 5)
	if !transitioned {
		t.Error("first fail from Unknown should transition to Down")
	}
	if got := b.State; got != StateDown {
		t.Errorf("state: got %s, want down", got)
	}
}
// TestHysteresis: alternating pass/fail keeps backend in degraded range without
// bouncing between Up and Down. This is the key HAProxy counter property.
func TestHysteresis(t *testing.T) {
b := newBackend()
// Drive to Up.
b.Record(pass(), 5)
b.Record(pass(), 5) // counter=2, state=Up
// Alternate pass/fail. Counter oscillates 3↔2 (both in UP range for rise=2),
// or 2↔1 (crossing the boundary). Let's trace:
// Start: counter=2 (just became Up, was at 2=rise after second pass, then RecordPass incremented to 3... wait)
// Actually: after first pass from Unknown (counter=0), counter=1. After second pass, counter=2=rise, RecordPass returns true → Up.
// But RecordPass increments BEFORE checking: wasUp=false, counter becomes 2, IsUp()=true → returns true.
// So after second pass: counter=2, state=Up.
// max = rise+fall-1 = 2+3-1 = 4.
// fail: counter=1 < rise=2 → RecordFail: wasDown=false (counter was 2=rise, IsUp=true),
// counter becomes 1, IsUp()=false → returns true → Down!
// Hmm, so one fail from counter=2 (barely Up) → Down? That's with rise=2.
// The hysteresis is more visible with rise=2, fall=5: max=6.
// Let's use a backend with more headroom.
b2 := New("test", net.ParseIP("10.0.0.2"), 2, 5) // rise=2, fall=5, max=6
// Drive to fully healthy.
b2.Record(pass(), 5) // counter=1
b2.Record(pass(), 5) // counter=2=rise → Up
b2.Record(pass(), 5) // counter=3
b2.Record(pass(), 5) // counter=4
b2.Record(pass(), 5) // counter=5
b2.Record(pass(), 5) // counter=6=max
// Now alternate: fail drops from 6, pass brings back up.
// Should not transition since counter stays in UP range (>=2).
for i := 0; i < 4; i++ {
transitioned := b2.Record(fail(), 5) // 6→5→4→3→2 (all >=rise=2)
if transitioned {
t.Errorf("fail %d: should not transition (counter in UP range)", i+1)
}
if !b2.Counter.IsUp() {
t.Errorf("fail %d: should still be up", i+1)
}
if b2.Record(pass(), 5) { // re-increment
t.Errorf("pass %d: should not transition (already Up)", i+1)
}
}
if b2.State != StateUp {
t.Errorf("after alternating: want up, got %s", b2.State)
}
}
// TestNextInterval: NextInterval picks the probe interval from the backend's
// state and counter: the regular interval while Unknown or fully healthy, the
// down interval when Down at counter=0, the fast interval when degraded, and
// the regular interval whenever the fast/down interval is passed as 0.
func TestNextInterval(t *testing.T) {
	const (
		regular = 2 * time.Second
		quick   = 500 * time.Millisecond
		slow    = 30 * time.Second
	)
	b := New("test", net.ParseIP("10.0.0.1"), 2, 3) // max=4

	// Unknown (nothing probed yet): regular interval, never the down interval.
	got := b.NextInterval(regular, quick, slow)
	if got != regular {
		t.Errorf("StateUnknown: got %v, want %v (interval)", got, regular)
	}

	// One failure: counter=0 and state=Down, so the down interval applies.
	probe := ProbeResult{OK: false, Code: "L4CON"}
	b.Record(probe, 5)
	if b.State != StateDown {
		t.Fatalf("expected StateDown after first fail, got %s", b.State)
	}
	got = b.NextInterval(regular, quick, slow)
	if got != slow {
		t.Errorf("StateDown/counter=0: got %v, want %v (down)", got, slow)
	}

	// Counter forced to max (fully healthy): back to the regular interval.
	b.Counter.Health = b.Counter.Max()
	got = b.NextInterval(regular, quick, slow)
	if got != regular {
		t.Errorf("counter=max: got %v, want %v (interval)", got, regular)
	}

	// Degraded (0 < counter < max): fast interval.
	b.Counter.Health = 1
	got = b.NextInterval(regular, quick, slow)
	if got != quick {
		t.Errorf("counter=1 (degraded): got %v, want %v (fast)", got, quick)
	}

	// Degraded with no fast interval configured: regular interval.
	got = b.NextInterval(regular, 0, slow)
	if got != regular {
		t.Errorf("degraded, no fast: got %v, want %v (interval)", got, regular)
	}

	// Counter back at 0 with no down interval configured: regular interval.
	b.Counter.Health = 0
	got = b.NextInterval(regular, quick, 0)
	if got != regular {
		t.Errorf("down, no downInterval: got %v, want %v (interval)", got, regular)
	}
}
// TestPauseResume: Pause moves an Up backend to Paused, probe results are
// ignored while paused, Resume returns it to Unknown, and repeating either
// call is a no-op that reports false.
func TestPauseResume(t *testing.T) {
	b := newBackend()
	b.State = StateUp

	if !b.Pause(5) {
		t.Error("Pause should return true")
	}
	if got := b.State; got != StatePaused {
		t.Errorf("after Pause: got %s, want paused", got)
	}

	// While paused, neither a pass nor a fail may cause a transition.
	if transitioned := b.Record(pass(), 5); transitioned {
		t.Error("Record(pass) should not transition while paused")
	}
	if transitioned := b.Record(fail(), 5); transitioned {
		t.Error("Record(fail) should not transition while paused")
	}
	if got := b.State; got != StatePaused {
		t.Errorf("state changed while paused: %s", got)
	}

	// Pausing an already paused backend changes nothing.
	if again := b.Pause(5); again {
		t.Error("second Pause should return false")
	}

	if !b.Resume(5) {
		t.Error("Resume should return true")
	}
	if got := b.State; got != StateUnknown {
		t.Errorf("after Resume: got %s, want unknown", got)
	}

	// Resuming a backend that is not paused changes nothing.
	if again := b.Resume(5); again {
		t.Error("Resume on non-paused should return false")
	}
}
// TestTransitionHistory: the transition log is capped at maxHistory entries,
// is ordered newest first, and each entry carries the probe result that
// triggered it.
func TestTransitionHistory(t *testing.T) {
	b := newBackend() // rise=2, fall=3
	maxHistory := 3

	// Drive more state changes than maxHistory. Expected trace, given that a
	// backend which has just become Up sits at counter=rise and so goes Down
	// again on its first failure:
	b.Record(fail(), maxHistory) // Unknown→Down
	b.Record(pass(), maxHistory) // counter++
	b.Record(pass(), maxHistory) // Down→Up
	b.Record(fail(), maxHistory) // Up→Down (counter falls below rise)
	b.Record(fail(), maxHistory) // already Down
	b.Record(fail(), maxHistory) // already Down
	b.Record(pass(), maxHistory) // counter++
	b.Record(pass(), maxHistory) // Down→Up

	// Fatal rather than Error: the checks below index Transitions[0] and would
	// panic on an empty log instead of reporting a clean failure.
	if len(b.Transitions) != maxHistory {
		t.Fatalf("transitions capped at %d, got %d", maxHistory, len(b.Transitions))
	}

	// Newest first: the last transition was →Up.
	newest := b.Transitions[0]
	if newest.To != StateUp {
		t.Errorf("newest transition: got %s, want up", newest.To)
	}
	// Transitions carry the ProbeResult that caused them.
	if newest.Result.Code == "" {
		t.Error("transition result code should not be empty")
	}
}
// TestTransitionTimestamp: the newest transition's At timestamp falls between
// the wall-clock readings taken immediately before and after the Record call.
func TestTransitionTimestamp(t *testing.T) {
	b := newBackend()

	start := time.Now()
	b.Record(fail(), 5)
	end := time.Now()

	if len(b.Transitions) == 0 {
		t.Fatal("expected a transition")
	}

	at := b.Transitions[0].At
	inWindow := !at.Before(start) && !at.After(end)
	if !inWindow {
		t.Errorf("transition timestamp %v outside [%v, %v]", at, start, end)
	}
}
// TestStateString: each State value renders as its lowercase name.
func TestStateString(t *testing.T) {
	table := []struct {
		state State
		want  string
	}{
		{state: StateUnknown, want: "unknown"},
		{state: StateUp, want: "up"},
		{state: StateDown, want: "down"},
		{state: StatePaused, want: "paused"},
	}

	for _, tc := range table {
		got := tc.state.String()
		if got == tc.want {
			continue
		}
		t.Errorf("State(%d).String() = %q, want %q", tc.state, got, tc.want)
	}
}