vpp-maglev/internal/health/state.go

// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>

package health

import (
	"net"
	"time"
)

// CheckLayer indicates at which network layer a probe stopped.
type CheckLayer int

const (
	LayerUnknown CheckLayer = iota
	LayerL4                 // TCP connect
	LayerL6                 // TLS handshake
	LayerL7                 // Application (HTTP response, ICMP reply)
)

// ProbeResult is the outcome of a single probe execution.
type ProbeResult struct {
	OK     bool
	Layer  CheckLayer
	Code   string // "L4OK", "L4TOUT", "L4CON", "L7OK", "L7TOUT", "L7RSP", "L7STS"
	Detail string // human-readable, e.g. "HTTP 503", "connection refused"
}

// State represents the health state of a backend.
type State int

const (
	StateUnknown  State = iota // initial state before first probe
	StateUp                    // backend is healthy
	StateDown                  // backend has failed enough probes
	StatePaused                // operator paused health checking
	StateDisabled              // operator disabled the backend
	StateRemoved               // backend removed from configuration by reload
)

func (s State) String() string {
	switch s {
	case StateUnknown:
		return "unknown"
	case StateUp:
		return "up"
	case StateDown:
		return "down"
	case StatePaused:
		return "paused"
	case StateDisabled:
		return "disabled"
	case StateRemoved:
		return "removed"
	default:
		return "unknown"
	}
}

// Transition records a single state change event.
type Transition struct {
	From   State
	To     State
	At     time.Time
	Result ProbeResult
}

// HealthCounter is HAProxy's single-integer rise/fall model.
//
// Health ∈ [0, Rise+Fall-1]. Server is UP when Health >= Rise, DOWN when
// Health < Rise. On success Health increments (ceiling Rise+Fall-1); on
// failure Health decrements (floor 0). This gives hysteresis: a flapping
// backend stays in the degraded range without bouncing between UP and DOWN.
type HealthCounter struct {
	Health int
	Rise   int
	Fall   int
}

func (h *HealthCounter) Max() int         { return h.Rise + h.Fall - 1 }
func (h *HealthCounter) IsUp() bool       { return h.Health >= h.Rise }
func (h *HealthCounter) IsDegraded() bool { return h.Health > 0 && h.Health < h.Max() }

// RecordPass increments the counter. Returns true if the server just became UP.
func (h *HealthCounter) RecordPass() bool {
	wasUp := h.IsUp()
	if h.Health < h.Max() {
		h.Health++
	}
	return !wasUp && h.IsUp()
}

// RecordFail decrements the counter. Returns true if the server just went DOWN.
func (h *HealthCounter) RecordFail() bool {
	wasDown := !h.IsUp()
	if h.Health > 0 {
		h.Health--
	}
	return !wasDown && !h.IsUp()
}

// Backend tracks the health state of a named backend.
type Backend struct {
	Name        string
	Address     net.IP
	State       State
	Counter     HealthCounter
	Transitions []Transition // newest first, capped at maxHistory
}

// New creates a Backend in StateUnknown with the health counter pre-loaded to
// Rise-1, so the very first probe resolves the state: one pass → Up, any
// fail → Down (via the StateUnknown shortcut in Record).
func New(name string, addr net.IP, rise, fall int) *Backend {
	return &Backend{
		Name:    name,
		Address: addr,
		State:   StateUnknown,
		Counter: HealthCounter{Rise: rise, Fall: fall, Health: rise - 1},
	}
}

// Record applies a probe result to the health counter and transitions state if
// needed. Returns true if the state changed.
//
// StateUnknown transitions to StateDown on the first failure (any evidence of
// failure means the backend is not yet confirmed reachable), and to StateUp
// once the counter reaches Rise consecutive passes.
func (b *Backend) Record(r ProbeResult, maxHistory int) bool {
	if b.State == StatePaused || b.State == StateDisabled || b.State == StateRemoved {
		return false
	}
	if r.OK {
		if b.Counter.RecordPass() {
			b.transition(StateUp, r, maxHistory)
			return true
		}
	} else {
		if b.Counter.RecordFail() || b.State == StateUnknown {
			b.transition(StateDown, r, maxHistory)
			return true
		}
	}
	return false
}

// Pause transitions the backend to StatePaused. Returns true if the state changed.
func (b *Backend) Pause(maxHistory int) bool {
	if b.State == StatePaused {
		return false
	}
	b.transition(StatePaused, ProbeResult{}, maxHistory)
	b.Counter.Health = 0
	return true
}

// Resume transitions a paused backend back to StateUnknown, resetting the
// counter. Returns true if the state changed.
func (b *Backend) Resume(maxHistory int) bool {
	if b.State != StatePaused {
		return false
	}
	b.transition(StateUnknown, ProbeResult{}, maxHistory)
	b.Counter.Health = b.Counter.Rise - 1
	return true
}

// NextInterval returns the appropriate probe interval based on state and counter:
//   - Unknown (initial / post-resume): fastInterval (falls back to interval) — probe quickly to establish state
//   - Fully healthy (counter at max):  interval
//   - Fully down (counter at 0):       downInterval (falls back to interval)
//   - Degraded (anywhere in between):  fastInterval (falls back to interval)
func (b *Backend) NextInterval(interval, fastInterval, downInterval time.Duration) time.Duration {
	if b.State == StateUnknown {
		if fastInterval > 0 {
			return fastInterval
		}
		return interval
	}
	if b.Counter.Health == b.Counter.Max() {
		return interval
	}
	if b.Counter.Health == 0 {
		if downInterval > 0 {
			return downInterval
		}
		return interval
	}
	if fastInterval > 0 {
		return fastInterval
	}
	return interval
}

// Start records the initial StateUnknown transition when a backend is first
// created or restarted. It exists solely to populate the transition history
// and fire a reload event; the state does not change.
func (b *Backend) Start(maxHistory int) Transition {
	b.transition(StateUnknown, ProbeResult{Code: "start"}, maxHistory)
	return b.Transitions[0]
}

// Disable transitions the backend to StateDisabled. Returns the transition.
// After this call no further probe results are accepted.
func (b *Backend) Disable(maxHistory int) Transition {
	b.transition(StateDisabled, ProbeResult{Code: "disabled"}, maxHistory)
	return b.Transitions[0]
}

// Enable transitions a disabled backend back to StateUnknown, resetting the
// counter so the first probe result resolves state (rise-1 preload gives
// 1-pass → Up, 1-fail → Down). Returns the transition.
func (b *Backend) Enable(maxHistory int) Transition {
	b.transition(StateUnknown, ProbeResult{Code: "enabled"}, maxHistory)
	b.Counter.Health = b.Counter.Rise - 1
	return b.Transitions[0]
}

// Remove transitions the backend to StateRemoved. Returns the transition.
// After this call no further probe results are accepted.
func (b *Backend) Remove(maxHistory int) Transition {
	b.transition(StateRemoved, ProbeResult{Code: "removed"}, maxHistory)
	return b.Transitions[0]
}

// transition appends a new Transition and updates State.
func (b *Backend) transition(to State, r ProbeResult, maxHistory int) {
	t := Transition{From: b.State, To: to, At: time.Now(), Result: r}
	b.Transitions = append([]Transition{t}, b.Transitions...)
	if len(b.Transitions) > maxHistory {
		b.Transitions = b.Transitions[:maxHistory]
	}
	b.State = to
}