Initial revision of healthchecker, inspired by HAProxy

This commit is contained in:
2026-04-10 17:30:44 +02:00
commit b84b3274b1
24 changed files with 4400 additions and 0 deletions

189
internal/health/state.go Normal file
View File

@@ -0,0 +1,189 @@
package health
import (
"net"
"time"
)
// CheckLayer indicates at which network layer a probe stopped.
type CheckLayer int
// Layers a probe can stop at. LayerUnknown is the zero value of CheckLayer,
// so a ProbeResult that never sets Layer reports LayerUnknown.
const (
LayerUnknown CheckLayer = iota // zero value: no layer recorded
LayerL4 // TCP connect
LayerL6 // TLS handshake
LayerL7 // Application (HTTP response, ICMP reply)
)
// ProbeResult is the outcome of a single probe execution.
//
// Backend.Record only inspects OK to decide between a pass and a failure; the
// whole value is stored on the Transition when the probe causes a state change.
type ProbeResult struct {
// OK is true when the probe succeeded.
OK bool
// Layer is the network layer at which the probe stopped.
Layer CheckLayer
Code string // short status code: "L4OK", "L4TOUT", "L4CON", "L7OK", "L7TOUT", "L7RSP", "L7STS"
Detail string // human-readable, e.g. "HTTP 503", "connection refused"
}
// State is the health state a backend is currently in.
type State int

// Known backend states. StateUnknown is the zero value, so a freshly
// constructed value starts out as "unknown".
const (
	StateUnknown State = iota // initial state before first probe
	StateUp
	StateDown
	StatePaused
)

// String returns the lowercase name of the state. Any value that is not one of
// the declared constants is reported as "unknown".
func (s State) String() string {
	names := [...]string{
		StateUnknown: "unknown",
		StateUp:      "up",
		StateDown:    "down",
		StatePaused:  "paused",
	}
	if s < 0 || int(s) >= len(names) {
		return "unknown"
	}
	return names[s]
}
// Transition records a single state change event.
type Transition struct {
From State // state the backend was in before the change
To State // state the backend moved to
At time.Time // wall-clock time at which the change was recorded
Result ProbeResult // probe that caused the change; zero value for Pause/Resume
}
// HealthCounter implements HAProxy's single-integer rise/fall model.
//
// Health lives in the range [0, Rise+Fall-1]. Values at or above Rise count
// as UP, values below Rise count as DOWN. A successful probe moves Health up
// by one (never past Rise+Fall-1) and a failed probe moves it down by one
// (never below 0). Because a fully healthy server sits Fall steps above the
// DOWN range, isolated failures do not immediately flip it to DOWN.
type HealthCounter struct {
	Health int
	Rise   int
	Fall   int
}

// Max returns the highest value Health may reach, Rise+Fall-1.
func (h *HealthCounter) Max() int {
	return h.Fall + h.Rise - 1
}

// IsUp reports whether Health is in the UP range (Health >= Rise).
func (h *HealthCounter) IsUp() bool {
	return !(h.Health < h.Rise)
}

// IsDegraded reports whether Health is strictly between 0 and Max, i.e.
// neither fully down nor fully healthy.
func (h *HealthCounter) IsDegraded() bool {
	if h.Health <= 0 {
		return false
	}
	return h.Health < h.Max()
}

// RecordPass increments the counter, saturating at Max. It returns true only
// when this pass moved the counter from the DOWN range into the UP range.
func (h *HealthCounter) RecordPass() bool {
	upBefore := h.IsUp()
	if ceiling := h.Max(); h.Health < ceiling {
		h.Health += 1
	}
	upAfter := h.IsUp()
	return upAfter && !upBefore
}

// RecordFail decrements the counter, saturating at 0. It returns true only
// when this failure moved the counter from the UP range into the DOWN range.
func (h *HealthCounter) RecordFail() bool {
	upBefore := h.IsUp()
	if h.Health > 0 {
		h.Health -= 1
	}
	return upBefore && !h.IsUp()
}
// Backend tracks the health state of one VIP:backend tuple.
//
// State is the externally visible state; Counter is the rise/fall counter
// that Record uses to decide when State should change.
type Backend struct {
VIPName string // name of the VIP this backend belongs to
Address net.IP // address of the backend
State State // current state; StateUnknown until a probe decides otherwise
Counter HealthCounter // rise/fall counter driven by Record
Transitions []Transition // newest first, capped at maxHistory
}
// New creates a Backend in StateUnknown with its health counter at 0.
//
// rise is the counter value at which the backend is considered up and fall is
// the number of failures a fully healthy backend tolerates before going down.
// Both are raised to 1 if a smaller value is passed: with rise <= 0 the
// counter would report "up" from the very first moment, RecordPass could
// never observe a down-to-up edge, and the backend would never reach StateUp.
func New(vipName string, addr net.IP, rise, fall int) *Backend {
	if rise < 1 {
		rise = 1
	}
	if fall < 1 {
		fall = 1
	}
	return &Backend{
		VIPName: vipName,
		Address: addr,
		State:   StateUnknown,
		Counter: HealthCounter{Rise: rise, Fall: fall},
	}
}
// Record feeds one probe result into the health counter and changes State
// when the counter crosses the up/down boundary. It reports whether State
// changed.
//
// While the backend is paused the result is ignored entirely. A successful
// probe moves the backend to StateUp once the counter reaches Rise. A failed
// probe moves it to StateDown when the counter leaves the UP range, and also
// whenever the backend is still in StateUnknown: any failure before the
// backend has been confirmed reachable is treated as down.
func (b *Backend) Record(r ProbeResult, maxHistory int) bool {
	if b.State == StatePaused {
		return false
	}

	var next State
	if r.OK {
		if !b.Counter.RecordPass() {
			return false
		}
		next = StateUp
	} else {
		// The counter is always updated first, even when the Unknown rule
		// alone would already force the transition.
		wentDown := b.Counter.RecordFail()
		if !wentDown && b.State != StateUnknown {
			return false
		}
		next = StateDown
	}

	b.transition(next, r, maxHistory)
	return true
}
// Pause moves the backend into StatePaused and clears the health counter.
// It reports whether the state changed; pausing an already paused backend is
// a no-op that returns false.
func (b *Backend) Pause(maxHistory int) bool {
	alreadyPaused := b.State == StatePaused
	if !alreadyPaused {
		// No probe caused this change, so the transition carries an empty result.
		var noProbe ProbeResult
		b.Counter.Health = 0
		b.transition(StatePaused, noProbe, maxHistory)
	}
	return !alreadyPaused
}
// Resume takes a paused backend back to StateUnknown with the health counter
// cleared, so it has to earn its state again from fresh probes. It reports
// whether the state changed; calling it on a backend that is not paused does
// nothing and returns false.
func (b *Backend) Resume(maxHistory int) bool {
	if b.State == StatePaused {
		// No probe caused this change, so the transition carries an empty result.
		var noProbe ProbeResult
		b.transition(StateUnknown, noProbe, maxHistory)
		b.Counter.Health = 0
		return true
	}
	return false
}
// NextInterval picks the delay before the next probe from the backend's state
// and counter:
//   - StateUnknown: interval, so the initial state is established promptly
//   - counter at max (fully healthy): interval
//   - counter at 0 (fully down): downInterval
//   - anything in between (degraded): fastInterval
//
// A downInterval or fastInterval that is zero or negative counts as "not
// configured" and interval is used instead.
func (b *Backend) NextInterval(interval, fastInterval, downInterval time.Duration) time.Duration {
	health := b.Counter.Health

	// Start from the degraded choice and override it for the special cases.
	preferred := fastInterval
	switch {
	case b.State == StateUnknown, health == b.Counter.Max():
		return interval
	case health == 0:
		preferred = downInterval
	}

	if preferred > 0 {
		return preferred
	}
	return interval
}
// transition records a state change and updates State.
//
// The new Transition is placed at the front of b.Transitions (newest first)
// and the history is trimmed to at most maxHistory entries. A maxHistory of
// zero or less keeps no history at all; previously a negative value caused a
// slice-bounds panic. The state is updated regardless of the history cap.
func (b *Backend) transition(to State, r ProbeResult, maxHistory int) {
	entry := Transition{From: b.State, To: to, At: time.Now(), Result: r}
	b.State = to

	if maxHistory <= 0 {
		b.Transitions = []Transition{}
		return
	}

	// Number of existing entries that still fit behind the new one.
	keep := len(b.Transitions)
	if keep > maxHistory-1 {
		keep = maxHistory - 1
	}

	// Build the capped history in a single, exactly sized allocation so that
	// evicted entries are not kept alive by a shared backing array.
	history := make([]Transition, 0, keep+1)
	history = append(history, entry)
	b.Transitions = append(history, b.Transitions[:keep]...)
}

View File

@@ -0,0 +1,307 @@
package health
import (
"net"
"testing"
"time"
)
// newBackend returns the backend most tests start from: VIP "web4" at
// 10.0.0.1 with rise=2 and fall=3 (so the counter maximum is 4).
func newBackend() *Backend {
	const (
		rise = 2
		fall = 3
	)
	addr := net.ParseIP("10.0.0.1")
	return New("web4", addr, rise, fall)
}
// pass returns a successful layer-7 probe result with code "L7OK".
func pass() ProbeResult {
	var r ProbeResult
	r.OK, r.Layer, r.Code = true, LayerL7, "L7OK"
	return r
}
// fail returns a failed layer-4 probe result with code "L4CON".
func fail() ProbeResult {
	var r ProbeResult
	r.OK, r.Layer, r.Code = false, LayerL4, "L4CON"
	return r
}
// TestInitialState checks that a new backend starts in StateUnknown with no
// recorded transitions and a health counter of 0.
func TestInitialState(t *testing.T) {
	b := newBackend()

	if got := b.State; got != StateUnknown {
		t.Errorf("initial state: got %s, want unknown", got)
	}
	if n := len(b.Transitions); n != 0 {
		t.Errorf("initial transitions: got %d, want 0", n)
	}
	if health := b.Counter.Health; health != 0 {
		t.Errorf("initial counter health: got %d, want 0", health)
	}
}
// TestRiseToUp: with rise=2, two passes take a backend from Unknown or Down
// to Up, and the first of them must not change the state yet.
func TestRiseToUp(t *testing.T) {
	for _, tc := range []struct {
		name         string
		initialState State
	}{
		{name: "from unknown", initialState: StateUnknown},
		{name: "from down", initialState: StateDown},
	} {
		start := tc.initialState
		t.Run(tc.name, func(t *testing.T) {
			b := newBackend()
			b.State = start

			// Pass 1: counter=1, below rise=2, so the state must stay put.
			changed := b.Record(pass(), 5)
			if changed {
				t.Error("should not transition after 1 pass (rise=2)")
			}
			if b.State != start {
				t.Errorf("state changed early: got %s", b.State)
			}

			// Pass 2: counter=2=rise, so the backend becomes Up.
			changed = b.Record(pass(), 5)
			if !changed {
				t.Error("should transition to Up after 2 passes")
			}
			if got := b.State; got != StateUp {
				t.Errorf("state: got %s, want up", got)
			}
		})
	}
}
// TestFallToDown: a fully healthy backend needs fall=3 failures to go Down.
//
// The fall guarantee is measured from counter=max. A backend that only just
// became Up sits at counter=rise, the bottom of the UP range, and a single
// failure puts it back in the DOWN range. That is intended: the hysteresis
// protects servers with a long healthy record, not ones that barely passed
// the rise threshold.
func TestFallToDown(t *testing.T) {
	b := newBackend() // rise=2, fall=3, max=4
	ceiling := b.Counter.Max()

	// One pass per counter step takes the backend from 0 to fully healthy.
	for remaining := ceiling; remaining > 0; remaining-- {
		b.Record(pass(), 5)
	}
	if b.State != StateUp {
		t.Fatalf("precondition: want up, got %s", b.State)
	}
	if got := b.Counter.Health; got != ceiling {
		t.Fatalf("precondition: want counter=%d, got %d", ceiling, got)
	}

	// The first fall-1=2 failures move the counter 4→3→2, still >= rise.
	for _, msg := range []string{
		"should not transition after 1 fail from fully healthy",
		"should not transition after 2 fails from fully healthy",
	} {
		if b.Record(fail(), 5) {
			t.Error(msg)
		}
	}
	if got := b.State; got != StateUp {
		t.Errorf("state after 2 fails: got %s, want up", got)
	}

	// The third failure moves the counter 2→1, below rise, so Down.
	if wentDown := b.Record(fail(), 5); !wentDown {
		t.Error("should transition to Down after fall=3 failures from fully healthy")
	}
	if got := b.State; got != StateDown {
		t.Errorf("state: got %s, want down", got)
	}
}
// TestUnknownToDownOnFirstFail: a backend that is still Unknown goes Down on
// its very first failed probe.
func TestUnknownToDownOnFirstFail(t *testing.T) {
	b := newBackend()

	changed := b.Record(fail(), 5)
	if !changed {
		t.Error("first fail from Unknown should transition to Down")
	}
	if got := b.State; got != StateDown {
		t.Errorf("state: got %s, want down", got)
	}
}
// TestHysteresis: once a backend is fully healthy, alternating fail/pass
// probes move the counter between max-1 and max without ever leaving the UP
// range, so no Up/Down transition is emitted. This is the key HAProxy counter
// property.
//
// The protection comes from headroom above rise. A backend that has only just
// reached counter=rise has none, so the test first pins that case: a single
// failure takes it straight back to Down.
func TestHysteresis(t *testing.T) {
	// No headroom: rise=2, fall=3. Two passes leave the counter at 2=rise (Up);
	// one failure drops it to 1 < rise, which is a Down transition.
	barely := newBackend()
	barely.Record(pass(), 5) // counter=1
	barely.Record(pass(), 5) // counter=2=rise → Up
	if barely.State != StateUp {
		t.Fatalf("precondition: want up after rise passes, got %s", barely.State)
	}
	if !barely.Record(fail(), 5) {
		t.Error("one fail from counter=rise should transition to Down")
	}
	if barely.State != StateDown {
		t.Errorf("barely-up backend after one fail: want down, got %s", barely.State)
	}

	// Headroom: rise=2, fall=5, max=6. Drive the counter to max.
	b := New("test", net.ParseIP("10.0.0.2"), 2, 5)
	for i := 0; i < b.Counter.Max(); i++ {
		b.Record(pass(), 5)
	}
	if b.Counter.Health != b.Counter.Max() {
		t.Fatalf("precondition: want counter=%d, got %d", b.Counter.Max(), b.Counter.Health)
	}

	// Each round fails once (6→5) and passes once (5→6). The counter never
	// gets near rise=2, so neither probe may report a transition.
	for i := 0; i < 4; i++ {
		if b.Record(fail(), 5) {
			t.Errorf("fail %d: should not transition (counter in UP range)", i+1)
		}
		if !b.Counter.IsUp() {
			t.Errorf("fail %d: should still be up", i+1)
		}
		if b.Record(pass(), 5) {
			t.Errorf("pass %d: should not transition (already Up)", i+1)
		}
	}
	if b.State != StateUp {
		t.Errorf("after alternating: want up, got %s", b.State)
	}
}
// TestNextInterval checks which of the three configured intervals is chosen
// for each combination of state and counter value, including the fallbacks
// to the regular interval when fast or down intervals are not configured.
func TestNextInterval(t *testing.T) {
	const (
		interval = 2 * time.Second
		fast     = 500 * time.Millisecond
		down     = 30 * time.Second
	)
	b := New("test", net.ParseIP("10.0.0.1"), 2, 3) // max=4

	// next asks for the interval with the given fast/down settings.
	next := func(fastArg, downArg time.Duration) time.Duration {
		return b.NextInterval(interval, fastArg, downArg)
	}

	// Still Unknown, nothing probed: regular interval, never downInterval.
	if got := next(fast, down); got != interval {
		t.Errorf("StateUnknown: got %v, want %v (interval)", got, interval)
	}

	// One failure: counter=0 and state=Down, so downInterval applies.
	b.Record(ProbeResult{OK: false, Code: "L4CON"}, 5)
	if b.State != StateDown {
		t.Fatalf("expected StateDown after first fail, got %s", b.State)
	}
	if got := next(fast, down); got != down {
		t.Errorf("StateDown/counter=0: got %v, want %v (down)", got, down)
	}

	// Counter forced to max (fully healthy): regular interval.
	b.Counter.Health = b.Counter.Max()
	if got := next(fast, down); got != interval {
		t.Errorf("counter=max: got %v, want %v (interval)", got, interval)
	}

	// Counter strictly between 0 and max (degraded): fastInterval.
	b.Counter.Health = 1
	if got := next(fast, down); got != fast {
		t.Errorf("counter=1 (degraded): got %v, want %v (fast)", got, fast)
	}

	// Degraded without a fastInterval: regular interval.
	if got := next(0, down); got != interval {
		t.Errorf("degraded, no fast: got %v, want %v (interval)", got, interval)
	}

	// Fully down without a downInterval: regular interval.
	b.Counter.Health = 0
	if got := next(fast, 0); got != interval {
		t.Errorf("down, no downInterval: got %v, want %v (interval)", got, interval)
	}
}
// TestPauseResume walks a backend through Pause and Resume: pausing changes
// the state once, probes are ignored while paused, and resuming returns the
// backend to StateUnknown exactly once.
func TestPauseResume(t *testing.T) {
	b := newBackend()
	b.State = StateUp

	if paused := b.Pause(5); !paused {
		t.Error("Pause should return true")
	}
	if got := b.State; got != StatePaused {
		t.Errorf("after Pause: got %s, want paused", got)
	}

	// While paused, neither kind of probe result may change anything.
	if transitioned := b.Record(pass(), 5); transitioned {
		t.Error("Record(pass) should not transition while paused")
	}
	if transitioned := b.Record(fail(), 5); transitioned {
		t.Error("Record(fail) should not transition while paused")
	}
	if got := b.State; got != StatePaused {
		t.Errorf("state changed while paused: %s", got)
	}

	// Pausing again does nothing.
	if pausedAgain := b.Pause(5); pausedAgain {
		t.Error("second Pause should return false")
	}

	if resumed := b.Resume(5); !resumed {
		t.Error("Resume should return true")
	}
	if got := b.State; got != StateUnknown {
		t.Errorf("after Resume: got %s, want unknown", got)
	}

	// Resuming a backend that is not paused does nothing.
	if resumedAgain := b.Resume(5); resumedAgain {
		t.Error("Resume on non-paused should return false")
	}
}
// TestTransitionHistory checks that the transition history is capped at
// maxHistory, ordered newest first, and carries the probe result that caused
// each change.
func TestTransitionHistory(t *testing.T) {
	b := newBackend() // rise=2, fall=3
	maxHistory := 3

	// Produce four state changes, one more than the cap. A backend that has
	// only just reached counter=rise goes Down again on its first failure.
	b.Record(fail(), maxHistory) // counter=0: Unknown→Down
	b.Record(pass(), maxHistory) // counter=1
	b.Record(pass(), maxHistory) // counter=2=rise: Down→Up
	b.Record(fail(), maxHistory) // counter=1 < rise: Up→Down
	b.Record(fail(), maxHistory) // counter=0, already Down
	b.Record(fail(), maxHistory) // counter=0, already Down
	b.Record(pass(), maxHistory) // counter=1
	b.Record(pass(), maxHistory) // counter=2=rise: Down→Up

	// Fatal rather than Error: the checks below index into the history.
	if len(b.Transitions) != maxHistory {
		t.Fatalf("transitions capped at %d, got %d", maxHistory, len(b.Transitions))
	}

	// Newest first: the last transition was Down→Up.
	newest := b.Transitions[0]
	if newest.To != StateUp {
		t.Errorf("newest transition: got %s, want up", newest.To)
	}
	if newest.From != StateDown {
		t.Errorf("newest transition from: got %s, want down", newest.From)
	}

	// The initial Unknown→Down entry must have been evicted, leaving the
	// first Down→Up as the oldest retained entry.
	oldest := b.Transitions[maxHistory-1]
	if oldest.From != StateDown || oldest.To != StateUp {
		t.Errorf("oldest retained transition: got %s→%s, want down→up", oldest.From, oldest.To)
	}

	// Transitions carry the ProbeResult that triggered them.
	if newest.Result.Code == "" {
		t.Error("transition result code should not be empty")
	}
}
// TestTransitionTimestamp checks that a recorded transition is stamped with a
// time that lies between the moments just before and just after Record ran.
func TestTransitionTimestamp(t *testing.T) {
	b := newBackend()

	lo := time.Now()
	b.Record(fail(), 5)
	hi := time.Now()

	if len(b.Transitions) == 0 {
		t.Fatal("expected a transition")
	}
	at := b.Transitions[0].At
	if inWindow := !at.Before(lo) && !at.After(hi); !inWindow {
		t.Errorf("transition timestamp %v outside [%v, %v]", at, lo, hi)
	}
}
// TestStateString checks the textual name of every declared State constant.
func TestStateString(t *testing.T) {
	states := []State{StateUnknown, StateUp, StateDown, StatePaused}
	wants := []string{"unknown", "up", "down", "paused"}

	for i, s := range states {
		got, want := s.String(), wants[i]
		if got != want {
			t.Errorf("State(%d).String() = %q, want %q", s, got, want)
		}
	}
}