Three reliability fixes bundled with docs updates.

Restart-neutral VPP LB sync via a startup warmup window (internal/vpp/warmup.go). Before this change, a maglevd restart would immediately issue SyncLBStateAll with every backend still in StateUnknown — mapped through BackendEffectiveWeight to weight 0 — and VPP would black-hole all new flows until the checker's rise counters caught up, several seconds later. The new warmup tracker owns a process-wide state machine gated by two config knobs: vpp.lb.startup-min-delay (default 5s) is an absolute hands-off window during which neither the periodic sync loop nor the per-transition reconciler touches VPP; vpp.lb.startup-max-delay (default 30s) is the watchdog for a per-VIP release phase that runs between the two, releasing each frontend as soon as every backend it references reaches a non-Unknown state. At max-delay a final SyncLBStateAll runs for any stragglers still in Unknown. Config reload does not reset the clock. Both delays can be set to 0 to disable the warmup entirely. The reconciler's suppressed-during-warmup events log at DEBUG so operators can still see them with --log-level debug. Unit tests cover the tracker state machine, the allBackendsKnown precondition, and the zero-delay escape hatch.

Deterministic AS iteration in VPP LB sync. reconcileVIP and recreateVIP now issue their lb_as_add_del / lb_as_set_weight calls in numeric IP order (IPv4 before IPv6, ascending within each family) via a new sortedIPKeys helper, instead of in Go map iteration order. VPP's LB plugin breaks per-bucket ties in the Maglev lookup table by insertion position in its internal AS vec, so without a stable call order two maglevd instances on the same config could push identical AS sets into VPP in different orders and produce divergent new-flow tables. Numeric sort is used in preference to lexicographic so the sync log stays human-readable: string order would place 10.0.0.10 before 10.0.0.2, and likewise for v6. Unit tests cover empty and single-key maps, v4/v6 numeric vs lexicographic order, v4-before-v6 grouping, a 1000-iteration stability loop against Go's randomised map iteration, insertion-order invariance, and the desiredAS call-site type.

maglevt interval fix. runProbeLoop used to sleep the full jittered interval after every probe, so a 100ms --interval with a 30ms probe actually produced a 130ms period. The sleep now subtracts result.Duration so the cadence matches the flag. Probes that overrun clamp the sleep to zero and fire the next probe immediately, without trying to catch up on missed cycles — a slow backend doesn't get flooded with back-to-back probes at the moment it is already struggling.

Docs. config-guide now documents flush-on-down and the new startup-min-delay / startup-max-delay knobs; user-guide's maglevd section explains the restart-neutrality property, the three warmup phases, and the relevant slog lines operators should watch for during a bounce.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
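A minimal sketch of the corrected cadence arithmetic described above (illustrative only — the loop shape is assumed; `interval` stands for the already-jittered interval and result.Duration for the probe's measured wall time):

	sleep := interval - result.Duration
	if sleep < 0 {
		sleep = 0 // overrun: fire the next probe immediately, don't replay missed cycles
	}
	time.Sleep(sleep)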
// Copyright (c) 2026, Pim van Pelt <pim@ipng.ch>

package vpp

import (
	"bytes"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"go.fd.io/govpp/binapi/vlib"

	"git.ipng.ch/ipng/vpp-maglev/internal/config"
	"git.ipng.ch/ipng/vpp-maglev/internal/health"
	"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
	ip_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/ip_types"
	lb "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb"
	lb_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb_types"
)

// sortedIPKeys returns the keys of m in ascending numeric IP order. Used
// by the AS iteration sites in reconcileVIP and recreateVIP to make the
// sequence of lb_as_add_del calls deterministic across maglevd instances
// on the same config.
//
// Why this matters for Maglev: VPP's LB plugin stores ASes in a vec in
// insertion order and rebuilds the new-flow lookup table by walking that
// vec. The per-AS permutation is a pure function of the AS address (so it
// matches across instances), but when two ASes want the same bucket on
// the same pass the tie is broken by whichever one comes first in the
// vec. If maglevd issues its add calls in Go map iteration order — which
// is randomised on every run — two independent maglevd instances with
// bit-identical configs can push the same ASes into VPP in different
// orders, and their lookup tables end up differing in tie-broken buckets.
// Sorting here is the first half of the fix; a matching sort inside VPP's
// lb_vip_update_new_flow_table closes the flap-history hole (where a
// remove+re-add after steady state reuses freed slots in the as_pool in
// locally-visited order) and is landing in a separate commit.
//
// The keys at every call site are IP literals from net.IP.String(). Sort
// order is numeric, not lexicographic: lexicographic would put 10.0.0.10
// before 10.0.0.2 (and 2001:db8::10 before 2001:db8::2), which is
// correct for determinism but confusing to operators reading the sync
// log. We parse each key back into a net.IP once, compare the canonical
// 16-byte form, and group IPv4 before IPv6 so mixed-family frontends
// read as "v4 block, v6 block" top-to-bottom. ParseIP always succeeds
// here because the caller built the map key via net.IP.String() in the
// first place.
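//
// Illustrative ordering (not taken from the test suite):
//
//	m := map[string]int{"10.0.0.10": 0, "10.0.0.2": 0, "2001:db8::2": 0}
//	sortedIPKeys(m) // → ["10.0.0.2", "10.0.0.10", "2001:db8::2"]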
func sortedIPKeys[V any](m map[string]V) []string {
	type kv struct {
		key string
		ip  net.IP
	}
	pairs := make([]kv, 0, len(m))
	for k := range m {
		pairs = append(pairs, kv{k, net.ParseIP(k)})
	}
	sort.Slice(pairs, func(i, j int) bool {
		return compareIPNumeric(pairs[i].ip, pairs[j].ip) < 0
	})
	out := make([]string, len(pairs))
	for i, p := range pairs {
		out[i] = p.key
	}
	return out
}

// compareIPNumeric returns <0, 0, >0 in the three-way convention. IPv4
// sorts before IPv6. Within each family the comparison runs against the
// canonical fixed-width byte form (4 bytes for v4, 16 bytes for v6),
// which makes byte ordering match numeric ordering. A nil input — which
// should not happen given sortedIPKeys's call contract — sorts before
// any non-nil address to keep the comparator total.
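//
// Illustrative results:
//
//	compareIPNumeric(net.ParseIP("10.0.0.2"), net.ParseIP("10.0.0.10"))    // < 0 (numeric, not string, order)
//	compareIPNumeric(net.ParseIP("192.0.2.1"), net.ParseIP("2001:db8::1")) // < 0 (v4 sorts before v6)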
func compareIPNumeric(a, b net.IP) int {
	a4 := a.To4()
	b4 := b.To4()
	switch {
	case a == nil && b == nil:
		return 0
	case a == nil:
		return -1
	case b == nil:
		return 1
	case a4 != nil && b4 == nil:
		return -1
	case a4 == nil && b4 != nil:
		return 1
	case a4 != nil: // both v4
		return bytes.Compare(a4, b4)
	default: // both v6
		return bytes.Compare(a.To16(), b.To16())
	}
}

// ErrFrontendNotFound is returned by SyncLBStateVIP when the caller asks for
// a frontend name that does not exist in the config.
var ErrFrontendNotFound = errors.New("frontend not found in config")

// vipKey uniquely identifies a VPP LB VIP by its prefix, protocol, and port.
type vipKey struct {
	prefix   string // canonical CIDR form
	protocol uint8
	port     uint16
}

// desiredVIP is the sync's view of one VIP derived from the maglev config.
type desiredVIP struct {
	Prefix      *net.IPNet
	Protocol    uint8 // 6=TCP, 17=UDP, 255=any
	Port        uint16
	SrcIPSticky bool                 // lb_add_del_vip_v2.src_ip_sticky
	Encap       lb_types.LbEncapType // GRE4 / GRE6; matches the backend family, not the VIP's
	ASes        map[string]desiredAS // keyed by AS IP string
}

// desiredAS is one application server to be installed under a VIP.
type desiredAS struct {
	Address net.IP
	Weight  uint8 // 0-100
	Flush   bool  // if true, drop existing flows when transitioning to weight 0
}

// syncStats counts changes made to the dataplane during a sync run.
type syncStats struct {
	vipAdd   int
	vipDel   int
	asAdd    int
	asDel    int
	asWeight int
}

// recordSyncStats increments the Prometheus lbsync counters for one sync
// run. scope is "all" for SyncLBStateAll and "vip" for SyncLBStateVIP.
// Zero-valued kinds emit no increment (the counter stays at its previous
// value for that label set).
func recordSyncStats(scope string, st *syncStats) {
	if st.vipAdd > 0 {
		metrics.LBSyncTotal.WithLabelValues(scope, "vip_added").Add(float64(st.vipAdd))
	}
	if st.vipDel > 0 {
		metrics.LBSyncTotal.WithLabelValues(scope, "vip_removed").Add(float64(st.vipDel))
	}
	if st.asAdd > 0 {
		metrics.LBSyncTotal.WithLabelValues(scope, "as_added").Add(float64(st.asAdd))
	}
	if st.asDel > 0 {
		metrics.LBSyncTotal.WithLabelValues(scope, "as_removed").Add(float64(st.asDel))
	}
	if st.asWeight > 0 {
		metrics.LBSyncTotal.WithLabelValues(scope, "as_weight_updated").Add(float64(st.asWeight))
	}
}

// SyncLBStateAll reconciles the full VPP load-balancer state with the given
// config. For every frontend in cfg:
//   - if the VIP does not exist in VPP, create it;
//   - for every pool backend, add the application server if missing, or
//     update its weight if different.
//
// VIPs and ASes present in VPP but absent from the config are removed.
// Returns an error if any VPP API call fails.
func (c *Client) SyncLBStateAll(cfg *config.Config) error {
	if !c.IsConnected() {
		return errNotConnected
	}
	src := c.getStateSource()
	if src == nil {
		return fmt.Errorf("no state source configured")
	}

	cur, err := c.GetLBStateAll()
	if err != nil {
		return fmt.Errorf("read VPP LB state: %w", err)
	}
	desired := desiredFromConfig(cfg, src)

	ch, err := c.apiChannel()
	if err != nil {
		return err
	}
	defer ch.Close()

	slog.Info("vpp-lb-sync-start",
		"scope", "all",
		"vips-desired", len(desired),
		"vips-current", len(cur.VIPs))

	// Index both sides by (prefix, protocol, port).
	curByKey := make(map[vipKey]LBVIP, len(cur.VIPs))
	for _, v := range cur.VIPs {
		curByKey[makeVIPKey(v.Prefix, v.Protocol, v.Port)] = v
	}
	desByKey := make(map[vipKey]desiredVIP, len(desired))
	for _, d := range desired {
		desByKey[makeVIPKey(d.Prefix, d.Protocol, d.Port)] = d
	}

	var st syncStats

	// ---- pass 1: remove VIPs that are in VPP but not in config ----
	for k, v := range curByKey {
		if _, keep := desByKey[k]; keep {
			continue
		}
		if err := removeVIP(ch, v, &st); err != nil {
			return err
		}
	}

	// ---- pass 2: add/update VIPs that are in config ----
	for k, d := range desByKey {
		cur, existing := curByKey[k]
		var curPtr *LBVIP
		var curSticky bool
		if existing {
			curPtr = &cur
			curSticky = cur.SrcIPSticky
		}
		if err := reconcileVIP(ch, d, curPtr, curSticky, "", &st); err != nil {
			return err
		}
	}

	recordSyncStats("all", &st)
	slog.Info("vpp-lb-sync-done",
		"scope", "all",
		"vip-added", st.vipAdd,
		"vip-removed", st.vipDel,
		"as-added", st.asAdd,
		"as-removed", st.asDel,
		"as-weight-updated", st.asWeight)
	return nil
}

// SyncLBStateVIP reconciles a single VIP (identified by frontend name) with
// the given config. Unlike SyncLBStateAll, it never removes VIPs: if the
// frontend is missing from cfg, SyncLBStateVIP returns ErrFrontendNotFound.
// This is the right tool for targeted updates on a busy load-balancer with
// many VIPs — only one VIP is read from VPP and only its ASes are modified.
//
// flushAddress, when non-empty, is the IP of an application server whose
// weight change (if any) should be pushed with IsFlush=true regardless of
// the usual "only flush on non-zero → zero" heuristic. This is how the
// SetFrontendPoolBackendWeight RPC exposes an explicit "drop flows now"
// knob: the server handler resolves the backend's config address and
// passes it here. Callers that don't need forced flushing pass "".
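//
// Illustrative call sites (frontend name and backend address made up):
//
//	err := c.SyncLBStateVIP(cfg, "www", "")          // plain targeted reconcile
//	err = c.SyncLBStateVIP(cfg, "www", "192.0.2.10") // additionally force-flush that backend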
func (c *Client) SyncLBStateVIP(cfg *config.Config, feName, flushAddress string) error {
	if !c.IsConnected() {
		return errNotConnected
	}
	src := c.getStateSource()
	if src == nil {
		return fmt.Errorf("no state source configured")
	}
	fe, ok := cfg.Frontends[feName]
	if !ok {
		return fmt.Errorf("%q: %w", feName, ErrFrontendNotFound)
	}
	d := desiredFromFrontend(cfg, fe, src)

	cur, err := c.GetLBStateVIP(d.Prefix, d.Protocol, d.Port)
	if err != nil {
		return fmt.Errorf("read VPP VIP state: %w", err)
	}

	ch, err := c.apiChannel()
	if err != nil {
		return err
	}
	defer ch.Close()

	slog.Info("vpp-lb-sync-start",
		"scope", "vip",
		"frontend", feName,
		"vip", d.Prefix.IP.String(),
		"protocol", protocolName(d.Protocol),
		"port", d.Port)

	var curSticky bool
	if cur != nil {
		curSticky = cur.SrcIPSticky
	}

	var st syncStats
	if err := reconcileVIP(ch, d, cur, curSticky, flushAddress, &st); err != nil {
		return err
	}
	recordSyncStats("vip", &st)
	slog.Info("vpp-lb-sync-done",
		"scope", "vip",
		"frontend", feName,
		"vip-added", st.vipAdd,
		"as-added", st.asAdd,
		"as-removed", st.asDel,
		"as-weight-updated", st.asWeight)
	return nil
}

// reconcileVIP brings one VIP's state in VPP into alignment with the desired
// state. If cur is nil the VIP is added from scratch; otherwise ASes are
// added, removed, and reweighted individually. Stats are accumulated into st.
//
// curSticky is the src_ip_sticky flag VPP currently has programmed for this
// VIP, as scraped by queryLBSticky. Callers only consult it when cur != nil,
// and the scrape reads the same live lb_main.vips pool as lb_vip_dump, so a
// matching entry is always present. When the flag differs from the desired
// value, the VIP is torn down (ASes del+flushed, VIP deleted) and recreated
// — VPP has no API to mutate src_ip_sticky on an existing VIP.
func reconcileVIP(ch *loggedChannel, d desiredVIP, cur *LBVIP, curSticky bool, flushAddress string, st *syncStats) error {
	if cur == nil {
		if err := addVIP(ch, d); err != nil {
			return err
		}
		st.vipAdd++
		for _, addr := range sortedIPKeys(d.ASes) {
			if err := addAS(ch, d.Prefix, d.Protocol, d.Port, d.ASes[addr]); err != nil {
				return err
			}
			st.asAdd++
		}
		return nil
	}

	if curSticky != d.SrcIPSticky {
		return recreateVIP(ch, d, *cur, st, "src-ip-sticky-changed",
			"from", curSticky, "to", d.SrcIPSticky)
	}

	// Encap mismatch: every backend flipped address family (e.g. all
	// IPv6 → all IPv4 after a config edit). VPP's encap is a VIP-level
	// attribute set at lb_add_del_vip time with no mutation API, so
	// adding new-family ASes under the old encap wedges the
	// reconciler: packets would be wrapped for the wrong family and
	// the new backends never see traffic. The only recovery is a VIP
	// recreate. The old-family ASes end up orphaned in VPP's pool and
	// are GC'd on the plugin's own ~40s "USED-flag=false" schedule;
	// the next regular sync tick confirms steady state. A recreate
	// does tear down existing flows to the VIP, which is why we gate
	// on an explicit encap difference rather than reconciling every
	// sync cycle.
	if desiredEncap := encapString(d.Encap); desiredEncap != cur.Encap {
		return recreateVIP(ch, d, *cur, st, "encap-changed",
			"from", cur.Encap, "to", desiredEncap)
	}

	// VIP exists in both — reconcile ASes.
	curASes := make(map[string]LBAS, len(cur.ASes))
	for _, a := range cur.ASes {
		curASes[a.Address.String()] = a
	}

	// Remove ASes that are in VPP but not desired.
	for _, addr := range sortedIPKeys(curASes) {
		a := curASes[addr]
		if _, keep := d.ASes[addr]; keep {
			continue
		}
		if err := delAS(ch, cur.Prefix, cur.Protocol, cur.Port, a.Address, a.Weight); err != nil {
			return err
		}
		st.asDel++
	}

	// Add new ASes, update weights on existing ones.
	for _, addr := range sortedIPKeys(d.ASes) {
		a := d.ASes[addr]
		c, hit := curASes[addr]
		if !hit {
			if err := addAS(ch, d.Prefix, d.Protocol, d.Port, a); err != nil {
				return err
			}
			st.asAdd++
			continue
		}
		// setASWeight is issued whenever the weight changes OR whenever
		// the state machine asks for a flush (a.Flush=true, currently
		// emitted only for StateDisabled). The a.Flush path has to
		// fire even on a no-op weight change, because a backend can
		// reach StateDisabled via a pool-failover that already drained
		// its VPP weight to 0 on an earlier tick — at that moment
		// c.Weight == a.Weight == 0, and a gate keyed solely on the
		// weight diff would silently drop the flush intent and leave
		// stale sticky-cache entries pointing at the now-disabled AS
		// (see the "disable nlams0 after fallback deactivation" trace
		// in the bug investigation).
		//
		// Firing unconditionally on a.Flush is idempotent at VPP's
		// side: lb_as_set_weight with an unchanged weight is a no-op
		// on the Maglev table (lb_vip_update_new_flow_table rebuilds
		// the same table), and a redundant lb_flush_vip_as is bounded
		// — it walks each per-worker sticky_ht once. The trade-off is
		// that disabled backends re-issue the flush on every periodic
		// SyncLBStateAll tick, and any sticky entries that happened to
		// land in the meantime get cleared; both are acceptable for
		// the "correctness over churn" semantics we want here.
		weightChanged := c.Weight != a.Weight
		flush := a.Flush
		// Caller-forced flush: used by SetFrontendPoolBackendWeight
		// with flush=true to explicitly drop live sessions for a
		// single backend. The address match is exact — no other
		// AS's weight change is affected, even if several happen
		// in the same reconcile pass.
		if flushAddress != "" && addr == flushAddress {
			flush = true
		}
		if weightChanged || flush {
			if err := setASWeight(ch, d.Prefix, d.Protocol, d.Port, a, c.Weight, flush); err != nil {
				return err
			}
			st.asWeight++
		}
	}
	return nil
}

// recreateVIP tears down an existing VIP and rebuilds it with the
// desired configuration and ASes. Used when a VIP attribute that VPP
// can't mutate in place has changed — today src_ip_sticky and the
// encap family. reason is logged as an operator-facing explanation;
// extra is appended to the slog call as additional fields (typically
// "from", <oldvalue>, "to", <newvalue>).
func recreateVIP(ch *loggedChannel, d desiredVIP, cur LBVIP, st *syncStats, reason string, extra ...any) error {
	logAttrs := []any{
		"vip", d.Prefix.IP.String(),
		"protocol", protocolName(d.Protocol),
		"port", d.Port,
		"reason", reason,
	}
	logAttrs = append(logAttrs, extra...)
	slog.Info("vpp-lb-sync-vip-recreate", logAttrs...)
	if err := removeVIP(ch, cur, st); err != nil {
		return err
	}
	if err := addVIP(ch, d); err != nil {
		return err
	}
	st.vipAdd++
	for _, addr := range sortedIPKeys(d.ASes) {
		if err := addAS(ch, d.Prefix, d.Protocol, d.Port, d.ASes[addr]); err != nil {
			return err
		}
		st.asAdd++
	}
	return nil
}

// removeVIP flushes all ASes from a VIP and then deletes the VIP itself.
func removeVIP(ch *loggedChannel, v LBVIP, st *syncStats) error {
	for _, as := range v.ASes {
		if err := delAS(ch, v.Prefix, v.Protocol, v.Port, as.Address, as.Weight); err != nil {
			return err
		}
		st.asDel++
	}
	if err := delVIP(ch, v.Prefix, v.Protocol, v.Port); err != nil {
		return err
	}
	st.vipDel++
	return nil
}

// desiredFromConfig flattens every frontend in cfg into a desired VIP set.
// src provides the per-backend health state so weights and flush hints
// reflect the current runtime state, not just the static config.
func desiredFromConfig(cfg *config.Config, src StateSource) []desiredVIP {
	out := make([]desiredVIP, 0, len(cfg.Frontends))
	for _, fe := range cfg.Frontends {
		out = append(out, desiredFromFrontend(cfg, fe, src))
	}
	return out
}

// desiredFromFrontend builds the desired VIP for a single frontend.
//
// All backends across all pools of a frontend are merged into a single
// application-server list so VPP knows about every backend that could ever
// receive traffic. The per-AS weight and flush hint are computed by
// asFromBackend from three inputs: (pool index, backend health state,
// configured pool weight).
//
// When the same backend appears in multiple pools, the first pool it
// appears in wins.
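//
// Illustrative: with backend "be1" in pool 0 at weight 10 and again in
// pool 1 at weight 50, the desired AS for be1 comes from pool 0; the pool 1
// entry is skipped by the `already` check in the loop body.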
func desiredFromFrontend(cfg *config.Config, fe config.Frontend, src StateSource) desiredVIP {
	bits := 32
	if fe.Address.To4() == nil {
		bits = 128
	}
	// Start with an encap derived from the VIP's own family as a
	// fallback. This only applies when the frontend has zero valid
	// backends (e.g. every referenced backend is missing from
	// cfg.Backends); any real backend below overrides it to the
	// backend family, which is the correct choice because the GRE
	// encap carries backend traffic, not VIP traffic. Config
	// validation already guarantees every backend in a frontend
	// shares the same family, so the first valid backend we see is
	// authoritative.
	d := desiredVIP{
		Prefix:      &net.IPNet{IP: fe.Address, Mask: net.CIDRMask(bits, bits)},
		Protocol:    protocolFromConfig(fe.Protocol),
		Port:        fe.Port,
		SrcIPSticky: fe.SrcIPSticky,
		Encap:       encapForIP(fe.Address),
		ASes:        make(map[string]desiredAS),
	}
	encapSet := false

	states := snapshotStates(fe, src)
	activePool := health.ActivePoolIndex(fe, states)

	for poolIdx, pool := range fe.Pools {
		for bName, pb := range pool.Backends {
			b, ok := cfg.Backends[bName]
			if !ok || b.Address == nil {
				continue
			}
			if !encapSet {
				d.Encap = encapForIP(b.Address)
				encapSet = true
			}
			// Disabled backends (either via operator action or config) are
			// kept in the desired set so they stay installed in VPP with
			// weight=0 — they must not be deleted, otherwise a subsequent
			// enable has to re-add them and existing flow-table state (if
			// any) is lost. The state machine drives what weight to set
			// via health.BackendEffectiveWeight; we never filter on
			// b.Enabled here.
			addr := b.Address.String()
			if _, already := d.ASes[addr]; already {
				continue
			}
			w, flush := health.BackendEffectiveWeight(poolIdx, activePool, states[bName], pb.Weight, fe.FlushOnDown)
			d.ASes[addr] = desiredAS{
				Address: b.Address,
				Weight:  w,
				Flush:   flush,
			}
		}
	}
	return d
}

// EffectiveWeights returns the current effective VPP weight for every backend
// in every pool of fe, keyed by poolIdx and backend name. Intended for
// observability (e.g. the GetFrontend gRPC handler) — the sync path and the
// checker's frontend-state logic use health.EffectiveWeights directly.
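//
// Shape of the returned map for a two-pool frontend (illustrative values):
//
//	map[int]map[string]uint8{
//		0: {"be1": 100, "be2": 0},
//		1: {"be3": 0},
//	}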
func EffectiveWeights(fe config.Frontend, src StateSource) map[int]map[string]uint8 {
	return health.EffectiveWeights(fe, snapshotStates(fe, src))
}

// snapshotStates builds a state map for every backend referenced by fe. It
// takes one read-lock per backend via the StateSource interface, so the
// caller gets a consistent view to feed into the pure health helpers.
func snapshotStates(fe config.Frontend, src StateSource) map[string]health.State {
	states := make(map[string]health.State)
	for _, pool := range fe.Pools {
		for bName := range pool.Backends {
			if s, ok := src.BackendState(bName); ok {
				states[bName] = s
			} else {
				states[bName] = health.StateUnknown
			}
		}
	}
	return states
}

// ---- API call helpers ------------------------------------------------------

// defaultFlowsTableLength is sent as NewFlowsTableLength in lb_add_del_vip_v2.
// The .api file declares default=1024 but that default is only applied by VAT/
// the CLI parser, not when a raw message is marshalled over the socket. If we
// send 0, the plugin's vec_validate explodes (OOM / panic). Must be a power of
// two — 1024 matches the default that would have been applied via CLI.
const defaultFlowsTableLength = 1024

func addVIP(ch *loggedChannel, d desiredVIP) error {
	req := &lb.LbAddDelVipV2{
		Pfx:                 ip_types.NewAddressWithPrefix(*d.Prefix),
		Protocol:            d.Protocol,
		Port:                d.Port,
		Encap:               d.Encap,
		Type:                lb_types.LB_API_SRV_TYPE_CLUSTERIP,
		NewFlowsTableLength: defaultFlowsTableLength,
		SrcIPSticky:         d.SrcIPSticky,
		IsDel:               false,
	}
	reply := &lb.LbAddDelVipV2Reply{}
	if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
		return fmt.Errorf("lb_add_del_vip_v2 add %s: %w", d.Prefix, err)
	}
	if reply.Retval != 0 {
		return fmt.Errorf("lb_add_del_vip_v2 add %s: retval=%d", d.Prefix, reply.Retval)
	}
	slog.Info("vpp-lb-sync-vip-added",
		"vip", d.Prefix.IP.String(),
		"protocol", protocolName(d.Protocol),
		"port", d.Port,
		"encap", encapName(d.Encap),
		"src-ip-sticky", d.SrcIPSticky)
	return nil
}

func delVIP(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16) error {
	req := &lb.LbAddDelVipV2{
		Pfx:      ip_types.NewAddressWithPrefix(*prefix),
		Protocol: protocol,
		Port:     port,
		IsDel:    true,
	}
	reply := &lb.LbAddDelVipV2Reply{}
	if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
		return fmt.Errorf("lb_add_del_vip_v2 del %s: %w", prefix, err)
	}
	if reply.Retval != 0 {
		return fmt.Errorf("lb_add_del_vip_v2 del %s: retval=%d", prefix, reply.Retval)
	}
	slog.Info("vpp-lb-sync-vip-removed",
		"vip", prefix.IP.String(),
		"protocol", protocolName(protocol),
		"port", port)
	return nil
}

func addAS(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16, a desiredAS) error {
	req := &lb.LbAddDelAsV2{
		Pfx:       ip_types.NewAddressWithPrefix(*prefix),
		Protocol:  protocol,
		Port:      port,
		AsAddress: ip_types.NewAddress(a.Address),
		Weight:    a.Weight,
		IsDel:     false,
	}
	reply := &lb.LbAddDelAsV2Reply{}
	if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
		return fmt.Errorf("lb_add_del_as_v2 add %s@%s: %w", a.Address, prefix, err)
	}
	if reply.Retval != 0 {
		return fmt.Errorf("lb_add_del_as_v2 add %s@%s: retval=%d", a.Address, prefix, reply.Retval)
	}
	slog.Info("vpp-lb-sync-as-added",
		"vip", prefix.IP.String(),
		"protocol", protocolName(protocol),
		"port", port,
		"address", a.Address.String(),
		"weight", a.Weight)
	return nil
}

func delAS(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16, addr net.IP, fromWeight uint8) error {
	req := &lb.LbAddDelAsV2{
		Pfx:       ip_types.NewAddressWithPrefix(*prefix),
		Protocol:  protocol,
		Port:      port,
		AsAddress: ip_types.NewAddress(addr),
		IsDel:     true,
		IsFlush:   true,
	}
	reply := &lb.LbAddDelAsV2Reply{}
	if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
		return fmt.Errorf("lb_add_del_as_v2 del %s@%s: %w", addr, prefix, err)
	}
	if reply.Retval != 0 {
		return fmt.Errorf("lb_add_del_as_v2 del %s@%s: retval=%d", addr, prefix, reply.Retval)
	}
	slog.Info("vpp-lb-sync-as-removed",
		"vip", prefix.IP.String(),
		"protocol", protocolName(protocol),
		"port", port,
		"address", addr.String(),
		"weight", fromWeight)
	return nil
}

func setASWeight(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16, a desiredAS, fromWeight uint8, flush bool) error {
	req := &lb.LbAsSetWeight{
		Pfx:       ip_types.NewAddressWithPrefix(*prefix),
		Protocol:  protocol,
		Port:      port,
		AsAddress: ip_types.NewAddress(a.Address),
		Weight:    a.Weight,
		IsFlush:   flush,
	}
	reply := &lb.LbAsSetWeightReply{}
	if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
		return fmt.Errorf("lb_as_set_weight %s@%s: %w", a.Address, prefix, err)
	}
	if reply.Retval != 0 {
		return fmt.Errorf("lb_as_set_weight %s@%s: retval=%d", a.Address, prefix, reply.Retval)
	}
	slog.Info("vpp-lb-sync-as-weight-updated",
		"vip", prefix.IP.String(),
		"protocol", protocolName(protocol),
		"port", port,
		"address", a.Address.String(),
		"from", fromWeight,
		"to", a.Weight,
		"flush", flush)
	return nil
}

// ---- VIP snapshot scrape ---------------------------------------------------

// TEMPORARY WORKAROUND: VPP's lb_vip_dump / lb_vip_details message does not
// return the src_ip_sticky flag, and lb_vip_details also doesn't expose the
// LB plugin's pool index — which is what the stats segment uses to key
// per-VIP counters. We work around both by running `show lb vips verbose`
// via the cli_inband API and parsing the human-readable output. This is
// slow and fragile (the format is not a stable API) and must be replaced
// once VPP ships an lb_vip_v2_dump that includes these fields.

// lbVIPSnapshot holds the per-VIP facts that the scrape recovers. `index`
// is the LB pool index (`vip - lbm->vips` in lb.c) used as the second
// dimension of the vip_counters SimpleCounterStat in the stats segment.
type lbVIPSnapshot struct {
	index  uint32
	sticky bool
}

// queryLBVIPSnapshot runs `show lb vips verbose` and parses it into a map
// keyed by vipKey. The returned error is non-nil only when the cli_inband
// RPC itself fails.
func queryLBVIPSnapshot(ch *loggedChannel) (map[vipKey]lbVIPSnapshot, error) {
	req := &vlib.CliInband{Cmd: "show lb vips verbose"}
	reply := &vlib.CliInbandReply{}
	if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
		return nil, fmt.Errorf("cli_inband %q: %w", req.Cmd, err)
	}
	if reply.Retval != 0 {
		return nil, fmt.Errorf("cli_inband %q: retval=%d", req.Cmd, reply.Retval)
	}
	return parseLBVIPSnapshot(reply.Reply), nil
}

// queryLBSticky is a thin projection of queryLBVIPSnapshot for callers that
// only care about the src_ip_sticky flag (the sync path).
func queryLBSticky(ch *loggedChannel) (map[vipKey]bool, error) {
	snap, err := queryLBVIPSnapshot(ch)
	if err != nil {
		return nil, err
	}
	out := make(map[vipKey]bool, len(snap))
	for k, v := range snap {
		out[k] = v.sticky
	}
	return out, nil
}

// lbVIPHeaderRe matches the first line of each VIP block in the output of
// `show lb vips verbose`. VPP produces lines like:
//
//	ip4-gre4 [1] 192.0.2.1/32 src_ip_sticky
//	ip6-gre6 [2] 2001:db8::1/128
//
// Capture groups: 1 = pool index, 2 = prefix (CIDR), 3 = " src_ip_sticky" when present.
var lbVIPHeaderRe = regexp.MustCompile(
	`\b(?:ip4-gre4|ip6-gre6|ip4-gre6|ip6-gre4|ip4-l3dsr|ip4-nat4|ip6-nat6)\s+\[(\d+)\]\s+(\S+)(\s+src_ip_sticky)?`,
)

// lbVIPProtoRe matches the `protocol:<n> port:<n>` sub-line that appears
// under each non-all-port VIP block.
var lbVIPProtoRe = regexp.MustCompile(`protocol:(\d+)\s+port:(\d+)`)

// parseLBVIPSnapshot turns the reply text of `show lb vips verbose` into a
// map of vipKey → lbVIPSnapshot. It walks lines sequentially: each header
// line starts a new VIP, and the following `protocol:<p> port:<port>` line
// (if any) refines the key. All-port VIPs have no protocol/port sub-line
// and are recorded with protocol=255 and port=0 — the same key format used
// by the rest of the sync path (see protocolFromConfig).
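//
// Illustrative input (format per lbVIPHeaderRe / lbVIPProtoRe above):
//
//	ip4-gre4 [3] 192.0.2.1/32
//	  protocol:6 port:443
//
// yields out[vipKey{"192.0.2.1/32", 6, 443}] = lbVIPSnapshot{index: 3, sticky: false}.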
func parseLBVIPSnapshot(text string) map[vipKey]lbVIPSnapshot {
	out := make(map[vipKey]lbVIPSnapshot)
	var havePending bool
	var curPrefix string
	var curIndex uint32
	var curSticky bool
	var curProto uint8 = 255
	var curPort uint16

	flush := func() {
		if !havePending || curPrefix == "" {
			return
		}
		out[vipKey{prefix: curPrefix, protocol: curProto, port: curPort}] = lbVIPSnapshot{
			index:  curIndex,
			sticky: curSticky,
		}
	}

	for _, line := range strings.Split(text, "\n") {
		if m := lbVIPHeaderRe.FindStringSubmatch(line); m != nil {
			flush()
			havePending = true
			if idx, err := strconv.ParseUint(m[1], 10, 32); err == nil {
				curIndex = uint32(idx)
			} else {
				curIndex = 0
			}
			curPrefix = canonicalCIDR(m[2])
			curSticky = m[3] != ""
			curProto = 255
			curPort = 0
			continue
		}
		if !havePending {
			continue
		}
		if m := lbVIPProtoRe.FindStringSubmatch(line); m != nil {
			if p, err := strconv.Atoi(m[1]); err == nil {
				curProto = uint8(p)
			}
			if p, err := strconv.Atoi(m[2]); err == nil {
				curPort = uint16(p)
			}
		}
	}
	flush()
	return out
}

// canonicalCIDR parses a CIDR string and returns it in Go's canonical
// net.IPNet.String() form so it matches the keys produced by makeVIPKey.
// Returns the input unchanged if parsing fails — the header regex should
// have validated it, but a mismatch shouldn't panic the sync path.
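//
// For example, net.ParseCIDR masks host bits, so canonicalCIDR("192.0.2.1/24")
// returns "192.0.2.0/24", while an already-canonical "192.0.2.1/32" comes
// back unchanged.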
func canonicalCIDR(s string) string {
	_, ipnet, err := net.ParseCIDR(s)
	if err != nil {
		return s
	}
	return ipnet.String()
}

// ---- utility ---------------------------------------------------------------

func makeVIPKey(prefix *net.IPNet, protocol uint8, port uint16) vipKey {
	return vipKey{prefix: prefix.String(), protocol: protocol, port: port}
}

func protocolFromConfig(s string) uint8 {
	switch s {
	case "tcp":
		return 6
	case "udp":
		return 17
	}
	return 255 // any
}

func protocolName(p uint8) string {
	switch p {
	case 6:
		return "tcp"
	case 17:
		return "udp"
	case 255:
		return "any"
	}
	return fmt.Sprintf("%d", p)
}

func encapForIP(ip net.IP) lb_types.LbEncapType {
	if ip.To4() != nil {
		return lb_types.LB_API_ENCAP_TYPE_GRE4
	}
	return lb_types.LB_API_ENCAP_TYPE_GRE6
}

func encapName(e lb_types.LbEncapType) string {
	switch e {
	case lb_types.LB_API_ENCAP_TYPE_GRE4:
		return "gre4"
	case lb_types.LB_API_ENCAP_TYPE_GRE6:
		return "gre6"
	}
	return fmt.Sprintf("%d", e)
}