880 lines
29 KiB
Go
880 lines
29 KiB
Go
// SPDX-License-Identifier: Apache-2.0
|
|
|
|
package vpp
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"net"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"go.fd.io/govpp/binapi/vlib"
|
|
|
|
"git.ipng.ch/ipng/vpp-maglev/internal/config"
|
|
"git.ipng.ch/ipng/vpp-maglev/internal/health"
|
|
"git.ipng.ch/ipng/vpp-maglev/internal/metrics"
|
|
ip_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/ip_types"
|
|
lb "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb"
|
|
lb_types "git.ipng.ch/ipng/vpp-maglev/internal/vpp/binapi/lb_types"
|
|
)
|
|
|
|
// sortedIPKeys returns the keys of m in ascending numeric IP order. Used
|
|
// by the AS iteration sites in reconcileVIP and recreateVIP to make the
|
|
// sequence of lb_as_add_del calls deterministic across maglevd instances
|
|
// on the same config.
|
|
//
|
|
// Why this matters for Maglev: VPP's LB plugin stores ASes in a vec in
|
|
// insertion order and rebuilds the new-flow lookup table by walking that
|
|
// vec. The per-AS permutation is a pure function of the AS address (so it
|
|
// matches across instances), but when two ASes want the same bucket on
|
|
// the same pass the tie is broken by whichever one comes first in the
|
|
// vec. If maglevd issues its add calls in Go map iteration order — which
|
|
// is randomised on every run — two independent maglevd instances with
|
|
// bit-identical configs can push the same ASes into VPP in different
|
|
// orders, and their lookup tables end up differing in tie-broken buckets.
|
|
// Sorting here is the first half of the fix; a matching sort inside VPP's
|
|
// lb_vip_update_new_flow_table closes the flap-history hole (where a
|
|
// remove+re-add after steady state reuses freed slots in the as_pool in
|
|
// locally-visited order) and is landing in a separate commit.
|
|
//
|
|
// The keys at every call site are IP literals from net.IP.String(). Sort
|
|
// order is numeric, not lexicographic: lexicographic would put 10.0.0.10
|
|
// before 10.0.0.2 (and 2001:db8::10 before 2001:db8::2), which is
|
|
// correct for determinism but confusing to operators reading the sync
|
|
// log. We parse each key back into a net.IP once, compare the canonical
|
|
// 16-byte form, and group IPv4 before IPv6 so mixed-family frontends
|
|
// read as "v4 block, v6 block" top-to-bottom. ParseIP always succeeds
|
|
// here because the caller built the map key via net.IP.String() in the
|
|
// first place.
|
|
func sortedIPKeys[V any](m map[string]V) []string {
|
|
type kv struct {
|
|
key string
|
|
ip net.IP
|
|
}
|
|
pairs := make([]kv, 0, len(m))
|
|
for k := range m {
|
|
pairs = append(pairs, kv{k, net.ParseIP(k)})
|
|
}
|
|
sort.Slice(pairs, func(i, j int) bool {
|
|
return compareIPNumeric(pairs[i].ip, pairs[j].ip) < 0
|
|
})
|
|
out := make([]string, len(pairs))
|
|
for i, p := range pairs {
|
|
out[i] = p.key
|
|
}
|
|
return out
|
|
}
|
|
|
|
// compareIPNumeric returns <0, 0, >0 in the three-way convention. IPv4
|
|
// sorts before IPv6. Within each family the comparison runs against the
|
|
// canonical fixed-width byte form (4 bytes for v4, 16 bytes for v6),
|
|
// which makes byte ordering match numeric ordering. A nil input — which
|
|
// should not happen given sortedIPKeys's call contract — sorts before
|
|
// any non-nil address to keep the comparator total.
|
|
func compareIPNumeric(a, b net.IP) int {
|
|
a4 := a.To4()
|
|
b4 := b.To4()
|
|
switch {
|
|
case a == nil && b == nil:
|
|
return 0
|
|
case a == nil:
|
|
return -1
|
|
case b == nil:
|
|
return 1
|
|
case a4 != nil && b4 == nil:
|
|
return -1
|
|
case a4 == nil && b4 != nil:
|
|
return 1
|
|
case a4 != nil: // both v4
|
|
return bytes.Compare(a4, b4)
|
|
default: // both v6
|
|
return bytes.Compare(a.To16(), b.To16())
|
|
}
|
|
}
|
|
|
|
// ErrFrontendNotFound is returned by SyncLBStateVIP when the caller asks for
// a frontend name that does not exist in the config. Callers should match it
// with errors.Is; the wrapping error carries the offending frontend name.
var ErrFrontendNotFound = errors.New("frontend not found in config")
|
|
|
|
// vipKey uniquely identifies a VPP LB VIP by its prefix, protocol, and port.
// It serves as a comparable map key on both the current and desired side of
// the sync, which is why the prefix is kept as a string rather than a
// *net.IPNet.
type vipKey struct {
	prefix   string // canonical CIDR form (net.IPNet.String())
	protocol uint8  // 6=TCP, 17=UDP, 255=any — same encoding as protocolFromConfig
	port     uint16 // 0 for all-port VIPs (see parseLBVIPSnapshot)
}
|
|
|
|
// desiredVIP is the sync's view of one VIP derived from the maglev config.
// Built by desiredFromFrontend and consumed by reconcileVIP/recreateVIP.
type desiredVIP struct {
	Prefix      *net.IPNet           // host prefix (/32 or /128) around the frontend address
	Protocol    uint8                // 6=TCP, 17=UDP, 255=any
	Port        uint16
	SrcIPSticky bool                 // lb_add_del_vip_v2.src_ip_sticky
	Encap       lb_types.LbEncapType // GRE4 / GRE6; matches the backend family, not the VIP's
	ASes        map[string]desiredAS // keyed by AS IP string (net.IP.String())
}
|
|
|
|
// desiredAS is one application server to be installed under a VIP.
type desiredAS struct {
	Address net.IP
	Weight  uint8 // 0-100; effective weight computed by health.BackendEffectiveWeight
	Flush   bool  // if true, drop existing flows when transitioning to weight 0
}
|
|
|
|
// syncStats counts changes made to the dataplane during a sync run. The
// counters feed both the end-of-sync slog lines and the Prometheus lbsync
// counters via recordSyncStats.
type syncStats struct {
	vipAdd   int // VIPs created (including the re-add half of a recreate)
	vipDel   int // VIPs deleted (pruning, or the teardown half of a recreate)
	asAdd    int // application servers added
	asDel    int // application servers removed
	asWeight int // weight updates pushed (including flush-only pushes with an unchanged weight)
}
|
|
|
|
// recordSyncStats increments the Prometheus lbsync counters for one sync
|
|
// run. scope is "all" for SyncLBStateAll and "vip" for SyncLBStateVIP.
|
|
// Zero-valued kinds emit no increment (the counter stays at its previous
|
|
// value for that label set).
|
|
func recordSyncStats(scope string, st *syncStats) {
|
|
if st.vipAdd > 0 {
|
|
metrics.LBSyncTotal.WithLabelValues(scope, "vip_added").Add(float64(st.vipAdd))
|
|
}
|
|
if st.vipDel > 0 {
|
|
metrics.LBSyncTotal.WithLabelValues(scope, "vip_removed").Add(float64(st.vipDel))
|
|
}
|
|
if st.asAdd > 0 {
|
|
metrics.LBSyncTotal.WithLabelValues(scope, "as_added").Add(float64(st.asAdd))
|
|
}
|
|
if st.asDel > 0 {
|
|
metrics.LBSyncTotal.WithLabelValues(scope, "as_removed").Add(float64(st.asDel))
|
|
}
|
|
if st.asWeight > 0 {
|
|
metrics.LBSyncTotal.WithLabelValues(scope, "as_weight_updated").Add(float64(st.asWeight))
|
|
}
|
|
}
|
|
|
|
// SyncLBStateAll reconciles the full VPP load-balancer state with the given
|
|
// config. For every frontend in cfg:
|
|
// - if the VIP does not exist in VPP, create it;
|
|
// - for every pool backend, add the application server if missing, or
|
|
// update its weight if different.
|
|
//
|
|
// VIPs and ASes present in VPP but absent from the config are removed.
|
|
// Returns an error if any VPP API call fails.
|
|
func (c *Client) SyncLBStateAll(cfg *config.Config) error {
|
|
if !c.IsConnected() {
|
|
return errNotConnected
|
|
}
|
|
src := c.getStateSource()
|
|
if src == nil {
|
|
return fmt.Errorf("no state source configured")
|
|
}
|
|
|
|
cur, err := c.GetLBStateAll()
|
|
if err != nil {
|
|
return fmt.Errorf("read VPP LB state: %w", err)
|
|
}
|
|
desired := desiredFromConfig(cfg, src)
|
|
|
|
ch, err := c.apiChannel()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer ch.Close()
|
|
|
|
slog.Info("vpp-lb-sync-start",
|
|
"scope", "all",
|
|
"vips-desired", len(desired),
|
|
"vips-current", len(cur.VIPs))
|
|
|
|
// Index both sides by (prefix, protocol, port).
|
|
curByKey := make(map[vipKey]LBVIP, len(cur.VIPs))
|
|
for _, v := range cur.VIPs {
|
|
curByKey[makeVIPKey(v.Prefix, v.Protocol, v.Port)] = v
|
|
}
|
|
desByKey := make(map[vipKey]desiredVIP, len(desired))
|
|
for _, d := range desired {
|
|
desByKey[makeVIPKey(d.Prefix, d.Protocol, d.Port)] = d
|
|
}
|
|
|
|
var st syncStats
|
|
|
|
// ---- pass 1: remove VIPs that are in VPP but not in config ----
|
|
for k, v := range curByKey {
|
|
if _, keep := desByKey[k]; keep {
|
|
continue
|
|
}
|
|
if err := removeVIP(ch, v, &st); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// ---- pass 2: add/update VIPs that are in config ----
|
|
for k, d := range desByKey {
|
|
cur, existing := curByKey[k]
|
|
var curPtr *LBVIP
|
|
var curSticky bool
|
|
if existing {
|
|
curPtr = &cur
|
|
curSticky = cur.SrcIPSticky
|
|
}
|
|
if err := reconcileVIP(ch, d, curPtr, curSticky, "", &st); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
recordSyncStats("all", &st)
|
|
slog.Info("vpp-lb-sync-done",
|
|
"scope", "all",
|
|
"vip-added", st.vipAdd,
|
|
"vip-removed", st.vipDel,
|
|
"as-added", st.asAdd,
|
|
"as-removed", st.asDel,
|
|
"as-weight-updated", st.asWeight)
|
|
return nil
|
|
}
|
|
|
|
// SyncLBStateVIP reconciles a single VIP (identified by frontend name) with
|
|
// the given config. Unlike SyncLBStateAll, it never removes VIPs: if the
|
|
// frontend is missing from cfg, SyncLBStateVIP returns ErrFrontendNotFound.
|
|
// This is the right tool for targeted updates on a busy load-balancer with
|
|
// many VIPs — only one VIP is read from VPP and only its ASes are modified.
|
|
//
|
|
// flushAddress, when non-empty, is the IP of an application server whose
|
|
// weight change (if any) should be pushed with IsFlush=true regardless of
|
|
// the usual "only flush on non-zero → zero" heuristic. This is how the
|
|
// SetFrontendPoolBackendWeight RPC exposes an explicit "drop flows now"
|
|
// knob: the server handler resolves the backend's config address and
|
|
// passes it here. Callers that don't need forced flushing pass "".
|
|
func (c *Client) SyncLBStateVIP(cfg *config.Config, feName, flushAddress string) error {
|
|
if !c.IsConnected() {
|
|
return errNotConnected
|
|
}
|
|
src := c.getStateSource()
|
|
if src == nil {
|
|
return fmt.Errorf("no state source configured")
|
|
}
|
|
fe, ok := cfg.Frontends[feName]
|
|
if !ok {
|
|
return fmt.Errorf("%q: %w", feName, ErrFrontendNotFound)
|
|
}
|
|
d := desiredFromFrontend(cfg, fe, src)
|
|
|
|
cur, err := c.GetLBStateVIP(d.Prefix, d.Protocol, d.Port)
|
|
if err != nil {
|
|
return fmt.Errorf("read VPP VIP state: %w", err)
|
|
}
|
|
|
|
ch, err := c.apiChannel()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer ch.Close()
|
|
|
|
slog.Info("vpp-lb-sync-start",
|
|
"scope", "vip",
|
|
"frontend", feName,
|
|
"vip", d.Prefix.IP.String(),
|
|
"protocol", protocolName(d.Protocol),
|
|
"port", d.Port)
|
|
|
|
var curSticky bool
|
|
if cur != nil {
|
|
curSticky = cur.SrcIPSticky
|
|
}
|
|
|
|
var st syncStats
|
|
if err := reconcileVIP(ch, d, cur, curSticky, flushAddress, &st); err != nil {
|
|
return err
|
|
}
|
|
recordSyncStats("vip", &st)
|
|
slog.Info("vpp-lb-sync-done",
|
|
"scope", "vip",
|
|
"frontend", feName,
|
|
"vip-added", st.vipAdd,
|
|
"as-added", st.asAdd,
|
|
"as-removed", st.asDel,
|
|
"as-weight-updated", st.asWeight)
|
|
return nil
|
|
}
|
|
|
|
// reconcileVIP brings one VIP's state in VPP into alignment with the desired
// state. If cur is nil the VIP is added from scratch; otherwise ASes are
// added, removed, and reweighted individually. Stats are accumulated into st.
// All per-AS iteration goes through sortedIPKeys so the order of
// lb_as_add_del calls is deterministic (see sortedIPKeys for why).
//
// curSticky is the src_ip_sticky flag VPP currently has programmed for this
// VIP, as scraped by queryLBSticky. Callers only consult it when cur != nil,
// and the scrape reads the same live lb_main.vips pool as lb_vip_dump, so a
// matching entry is always present. When the flag differs from the desired
// value, the VIP is torn down (ASes del+flushed, VIP deleted) and recreated
// — VPP has no API to mutate src_ip_sticky on an existing VIP.
func reconcileVIP(ch *loggedChannel, d desiredVIP, cur *LBVIP, curSticky bool, flushAddress string, st *syncStats) error {
	// Fresh VIP: create it, then install every desired AS.
	if cur == nil {
		if err := addVIP(ch, d); err != nil {
			return err
		}
		st.vipAdd++
		for _, addr := range sortedIPKeys(d.ASes) {
			if err := addAS(ch, d.Prefix, d.Protocol, d.Port, d.ASes[addr]); err != nil {
				return err
			}
			st.asAdd++
		}
		return nil
	}

	// Immutable attribute changed: full teardown + rebuild.
	if curSticky != d.SrcIPSticky {
		return recreateVIP(ch, d, *cur, st, "src-ip-sticky-changed",
			"from", curSticky, "to", d.SrcIPSticky)
	}

	// Encap mismatch: every backend flipped address family (e.g. all
	// IPv6 → all IPv4 after a config edit). VPP's encap is a VIP-level
	// attribute set at lb_add_del_vip time with no mutation API, so
	// adding new-family ASes under the old encap wedges the
	// reconciler: packets would be wrapped for the wrong family and
	// the new backends never see traffic. The only recovery is a VIP
	// recreate. The old-family ASes end up orphaned in VPP's pool and
	// are GC'd on the plugin's own ~40s "USED-flag=false" schedule;
	// the next regular sync tick confirms steady state. A recreate
	// does tear down existing flows to the VIP, which is why we gate
	// on an explicit encap difference rather than reconciling every
	// sync cycle.
	if desiredEncap := encapString(d.Encap); desiredEncap != cur.Encap {
		return recreateVIP(ch, d, *cur, st, "encap-changed",
			"from", cur.Encap, "to", desiredEncap)
	}

	// VIP exists in both — reconcile ASes. Index the current side by
	// IP string so both directions of the diff are O(1) lookups.
	curASes := make(map[string]LBAS, len(cur.ASes))
	for _, a := range cur.ASes {
		curASes[a.Address.String()] = a
	}

	// Remove ASes that are in VPP but not desired.
	for _, addr := range sortedIPKeys(curASes) {
		a := curASes[addr]
		if _, keep := d.ASes[addr]; keep {
			continue
		}
		if err := delAS(ch, cur.Prefix, cur.Protocol, cur.Port, a.Address, a.Weight); err != nil {
			return err
		}
		st.asDel++
	}

	// Add new ASes, update weights on existing ones.
	for _, addr := range sortedIPKeys(d.ASes) {
		a := d.ASes[addr]
		c, hit := curASes[addr]
		if !hit {
			if err := addAS(ch, d.Prefix, d.Protocol, d.Port, a); err != nil {
				return err
			}
			st.asAdd++
			continue
		}
		// setASWeight is issued whenever the weight changes OR whenever
		// the state machine asks for a flush (a.Flush=true, currently
		// emitted only for StateDisabled). The a.Flush path has to
		// fire even on a no-op weight change, because a backend can
		// reach StateDisabled via a pool-failover that already drained
		// its VPP weight to 0 on an earlier tick — at that moment
		// c.Weight == a.Weight == 0, and a gate keyed solely on the
		// weight diff would silently drop the flush intent and leave
		// stale sticky-cache entries pointing at the now-disabled AS
		// (see the "disable nlams0 after fallback deactivation" trace
		// in the bug investigation).
		//
		// Firing unconditionally on a.Flush is idempotent at VPP's
		// side: lb_as_set_weight with an unchanged weight is a no-op
		// on the Maglev table (lb_vip_update_new_flow_table rebuilds
		// the same table), and a redundant lb_flush_vip_as is bounded
		// — it walks each per-worker sticky_ht once. The trade-off is
		// that disabled backends re-issue the flush on every periodic
		// SyncLBStateAll tick, and any sticky entries that happened to
		// land in the meantime get cleared; both are acceptable for
		// the "correctness over churn" semantics we want here.
		weightChanged := c.Weight != a.Weight
		flush := a.Flush
		// Caller-forced flush: used by SetFrontendPoolBackendWeight
		// with flush=true to explicitly drop live sessions for a
		// single backend. The address match is exact — no other
		// AS's weight change is affected, even if several happen
		// in the same reconcile pass.
		if flushAddress != "" && addr == flushAddress {
			flush = true
		}
		if weightChanged || flush {
			if err := setASWeight(ch, d.Prefix, d.Protocol, d.Port, a, c.Weight, flush); err != nil {
				return err
			}
			st.asWeight++
		}
	}
	return nil
}
|
|
|
|
// recreateVIP tears down an existing VIP and rebuilds it with the
|
|
// desired configuration and ASes. Used when a VIP attribute that VPP
|
|
// can't mutate in place has changed — today src_ip_sticky and the
|
|
// encap family. reason is logged as an operator-facing explanation;
|
|
// extra is appended to the slog call as additional fields (typically
|
|
// "from", <oldvalue>, "to", <newvalue>).
|
|
func recreateVIP(ch *loggedChannel, d desiredVIP, cur LBVIP, st *syncStats, reason string, extra ...any) error {
|
|
logAttrs := []any{
|
|
"vip", d.Prefix.IP.String(),
|
|
"protocol", protocolName(d.Protocol),
|
|
"port", d.Port,
|
|
"reason", reason,
|
|
}
|
|
logAttrs = append(logAttrs, extra...)
|
|
slog.Info("vpp-lb-sync-vip-recreate", logAttrs...)
|
|
if err := removeVIP(ch, cur, st); err != nil {
|
|
return err
|
|
}
|
|
if err := addVIP(ch, d); err != nil {
|
|
return err
|
|
}
|
|
st.vipAdd++
|
|
for _, addr := range sortedIPKeys(d.ASes) {
|
|
if err := addAS(ch, d.Prefix, d.Protocol, d.Port, d.ASes[addr]); err != nil {
|
|
return err
|
|
}
|
|
st.asAdd++
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// removeVIP flushes all ASes from a VIP and then deletes the VIP itself.
|
|
func removeVIP(ch *loggedChannel, v LBVIP, st *syncStats) error {
|
|
for _, as := range v.ASes {
|
|
if err := delAS(ch, v.Prefix, v.Protocol, v.Port, as.Address, as.Weight); err != nil {
|
|
return err
|
|
}
|
|
st.asDel++
|
|
}
|
|
if err := delVIP(ch, v.Prefix, v.Protocol, v.Port); err != nil {
|
|
return err
|
|
}
|
|
st.vipDel++
|
|
return nil
|
|
}
|
|
|
|
// desiredFromConfig flattens every frontend in cfg into a desired VIP set.
|
|
// src provides the per-backend health state so weights and flush hints
|
|
// reflect the current runtime state, not just the static config.
|
|
func desiredFromConfig(cfg *config.Config, src StateSource) []desiredVIP {
|
|
out := make([]desiredVIP, 0, len(cfg.Frontends))
|
|
for _, fe := range cfg.Frontends {
|
|
out = append(out, desiredFromFrontend(cfg, fe, src))
|
|
}
|
|
return out
|
|
}
|
|
|
|
// desiredFromFrontend builds the desired VIP for a single frontend.
//
// All backends across all pools of a frontend are merged into a single
// application-server list so VPP knows about every backend that could ever
// receive traffic. The per-AS weight and flush hint are computed by
// health.BackendEffectiveWeight from (pool index, backend health state,
// configured pool weight, FlushOnDown).
//
// When the same backend appears in multiple pools, the first pool it
// appears in wins (pool order is the slice order of fe.Pools, so this is
// deterministic across runs).
func desiredFromFrontend(cfg *config.Config, fe config.Frontend, src StateSource) desiredVIP {
	// Host prefix width follows the VIP's own address family.
	bits := 32
	if fe.Address.To4() == nil {
		bits = 128
	}
	// Start with an encap derived from the VIP's own family as a
	// fallback. This only applies when the frontend has zero valid
	// backends (e.g. every referenced backend is missing from
	// cfg.Backends); any real backend below overrides it to the
	// backend family, which is the correct choice because the GRE
	// encap carries backend traffic, not VIP traffic. Config
	// validation already guarantees every backend in a frontend
	// shares the same family, so the first valid backend we see is
	// authoritative (and map iteration order within a pool is
	// therefore harmless here).
	d := desiredVIP{
		Prefix:      &net.IPNet{IP: fe.Address, Mask: net.CIDRMask(bits, bits)},
		Protocol:    protocolFromConfig(fe.Protocol),
		Port:        fe.Port,
		SrcIPSticky: fe.SrcIPSticky,
		Encap:       encapForIP(fe.Address),
		ASes:        make(map[string]desiredAS),
	}
	encapSet := false

	states := snapshotStates(fe, src)
	activePool := health.ActivePoolIndex(fe, states)

	for poolIdx, pool := range fe.Pools {
		for bName, pb := range pool.Backends {
			b, ok := cfg.Backends[bName]
			if !ok || b.Address == nil {
				continue
			}
			if !encapSet {
				d.Encap = encapForIP(b.Address)
				encapSet = true
			}
			// Disabled backends (either via operator action or config) are
			// kept in the desired set so they stay installed in VPP with
			// weight=0 — they must not be deleted, otherwise a subsequent
			// enable has to re-add them and existing flow-table state (if
			// any) is lost. The state machine drives what weight to set
			// via health.BackendEffectiveWeight; we never filter on
			// b.Enabled here.
			addr := b.Address.String()
			if _, already := d.ASes[addr]; already {
				continue
			}
			w, flush := health.BackendEffectiveWeight(poolIdx, activePool, states[bName], pb.Weight, fe.FlushOnDown)
			d.ASes[addr] = desiredAS{
				Address: b.Address,
				Weight:  w,
				Flush:   flush,
			}
		}
	}
	return d
}
|
|
|
|
// EffectiveWeights returns the current effective VPP weight for every backend
|
|
// in every pool of fe, keyed by poolIdx and backend name. Intended for
|
|
// observability (e.g. the GetFrontend gRPC handler) — the sync path and the
|
|
// checker's frontend-state logic use health.EffectiveWeights directly.
|
|
func EffectiveWeights(fe config.Frontend, src StateSource) map[int]map[string]uint8 {
|
|
return health.EffectiveWeights(fe, snapshotStates(fe, src))
|
|
}
|
|
|
|
// snapshotStates builds a state map for every backend referenced by fe. It
|
|
// takes one read-lock per backend via the StateSource interface, so the
|
|
// caller gets a consistent view to feed into the pure health helpers.
|
|
func snapshotStates(fe config.Frontend, src StateSource) map[string]health.State {
|
|
states := make(map[string]health.State)
|
|
for _, pool := range fe.Pools {
|
|
for bName := range pool.Backends {
|
|
if s, ok := src.BackendState(bName); ok {
|
|
states[bName] = s
|
|
} else {
|
|
states[bName] = health.StateUnknown
|
|
}
|
|
}
|
|
}
|
|
return states
|
|
}
|
|
|
|
// ---- API call helpers ------------------------------------------------------
|
|
|
|
// defaultFlowsTableLength is sent as NewFlowsTableLength in lb_add_del_vip_v2
// (see addVIP, its only user). The .api file declares default=1024 but that
// default is only applied by VAT/the CLI parser, not when a raw message is
// marshalled over the socket. If we send 0, the plugin's vec_validate
// explodes (OOM / panic). Must be a power of two — 1024 matches the default
// that would have been applied via CLI.
const defaultFlowsTableLength = 1024
|
|
|
|
func addVIP(ch *loggedChannel, d desiredVIP) error {
|
|
req := &lb.LbAddDelVipV2{
|
|
Pfx: ip_types.NewAddressWithPrefix(*d.Prefix),
|
|
Protocol: d.Protocol,
|
|
Port: d.Port,
|
|
Encap: d.Encap,
|
|
Type: lb_types.LB_API_SRV_TYPE_CLUSTERIP,
|
|
NewFlowsTableLength: defaultFlowsTableLength,
|
|
SrcIPSticky: d.SrcIPSticky,
|
|
IsDel: false,
|
|
}
|
|
reply := &lb.LbAddDelVipV2Reply{}
|
|
if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
|
|
return fmt.Errorf("lb_add_del_vip_v2 add %s: %w", d.Prefix, err)
|
|
}
|
|
if reply.Retval != 0 {
|
|
return fmt.Errorf("lb_add_del_vip_v2 add %s: retval=%d", d.Prefix, reply.Retval)
|
|
}
|
|
slog.Info("vpp-lb-sync-vip-added",
|
|
"vip", d.Prefix.IP.String(),
|
|
"protocol", protocolName(d.Protocol),
|
|
"port", d.Port,
|
|
"encap", encapName(d.Encap),
|
|
"src-ip-sticky", d.SrcIPSticky)
|
|
return nil
|
|
}
|
|
|
|
func delVIP(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16) error {
|
|
req := &lb.LbAddDelVipV2{
|
|
Pfx: ip_types.NewAddressWithPrefix(*prefix),
|
|
Protocol: protocol,
|
|
Port: port,
|
|
IsDel: true,
|
|
}
|
|
reply := &lb.LbAddDelVipV2Reply{}
|
|
if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
|
|
return fmt.Errorf("lb_add_del_vip_v2 del %s: %w", prefix, err)
|
|
}
|
|
if reply.Retval != 0 {
|
|
return fmt.Errorf("lb_add_del_vip_v2 del %s: retval=%d", prefix, reply.Retval)
|
|
}
|
|
slog.Info("vpp-lb-sync-vip-removed",
|
|
"vip", prefix.IP.String(),
|
|
"protocol", protocolName(protocol),
|
|
"port", port)
|
|
return nil
|
|
}
|
|
|
|
func addAS(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16, a desiredAS) error {
|
|
req := &lb.LbAddDelAsV2{
|
|
Pfx: ip_types.NewAddressWithPrefix(*prefix),
|
|
Protocol: protocol,
|
|
Port: port,
|
|
AsAddress: ip_types.NewAddress(a.Address),
|
|
Weight: a.Weight,
|
|
IsDel: false,
|
|
}
|
|
reply := &lb.LbAddDelAsV2Reply{}
|
|
if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
|
|
return fmt.Errorf("lb_add_del_as_v2 add %s@%s: %w", a.Address, prefix, err)
|
|
}
|
|
if reply.Retval != 0 {
|
|
return fmt.Errorf("lb_add_del_as_v2 add %s@%s: retval=%d", a.Address, prefix, reply.Retval)
|
|
}
|
|
slog.Info("vpp-lb-sync-as-added",
|
|
"vip", prefix.IP.String(),
|
|
"protocol", protocolName(protocol),
|
|
"port", port,
|
|
"address", a.Address.String(),
|
|
"weight", a.Weight)
|
|
return nil
|
|
}
|
|
|
|
func delAS(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16, addr net.IP, fromWeight uint8) error {
|
|
req := &lb.LbAddDelAsV2{
|
|
Pfx: ip_types.NewAddressWithPrefix(*prefix),
|
|
Protocol: protocol,
|
|
Port: port,
|
|
AsAddress: ip_types.NewAddress(addr),
|
|
IsDel: true,
|
|
IsFlush: true,
|
|
}
|
|
reply := &lb.LbAddDelAsV2Reply{}
|
|
if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
|
|
return fmt.Errorf("lb_add_del_as_v2 del %s@%s: %w", addr, prefix, err)
|
|
}
|
|
if reply.Retval != 0 {
|
|
return fmt.Errorf("lb_add_del_as_v2 del %s@%s: retval=%d", addr, prefix, reply.Retval)
|
|
}
|
|
slog.Info("vpp-lb-sync-as-removed",
|
|
"vip", prefix.IP.String(),
|
|
"protocol", protocolName(protocol),
|
|
"port", port,
|
|
"address", addr.String(),
|
|
"weight", fromWeight)
|
|
return nil
|
|
}
|
|
|
|
func setASWeight(ch *loggedChannel, prefix *net.IPNet, protocol uint8, port uint16, a desiredAS, fromWeight uint8, flush bool) error {
|
|
req := &lb.LbAsSetWeight{
|
|
Pfx: ip_types.NewAddressWithPrefix(*prefix),
|
|
Protocol: protocol,
|
|
Port: port,
|
|
AsAddress: ip_types.NewAddress(a.Address),
|
|
Weight: a.Weight,
|
|
IsFlush: flush,
|
|
}
|
|
reply := &lb.LbAsSetWeightReply{}
|
|
if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
|
|
return fmt.Errorf("lb_as_set_weight %s@%s: %w", a.Address, prefix, err)
|
|
}
|
|
if reply.Retval != 0 {
|
|
return fmt.Errorf("lb_as_set_weight %s@%s: retval=%d", a.Address, prefix, reply.Retval)
|
|
}
|
|
slog.Info("vpp-lb-sync-as-weight-updated",
|
|
"vip", prefix.IP.String(),
|
|
"protocol", protocolName(protocol),
|
|
"port", port,
|
|
"address", a.Address.String(),
|
|
"from", fromWeight,
|
|
"to", a.Weight,
|
|
"flush", flush)
|
|
return nil
|
|
}
|
|
|
|
// ---- VIP snapshot scrape ---------------------------------------------------
|
|
|
|
// TEMPORARY WORKAROUND: VPP's lb_vip_dump / lb_vip_details message does not
|
|
// return the src_ip_sticky flag, and lb_vip_details also doesn't expose the
|
|
// LB plugin's pool index — which is what the stats segment uses to key
|
|
// per-VIP counters. We work around both by running `show lb vips verbose`
|
|
// via the cli_inband API and parsing the human-readable output. This is
|
|
// slow and fragile (the format is not a stable API) and must be replaced
|
|
// once VPP ships an lb_vip_v2_dump that includes these fields.
|
|
|
|
// lbVIPSnapshot holds the per-VIP facts that the CLI scrape recovers.
// index is the LB pool index (`vip - lbm->vips` in lb.c) used as the
// second dimension of the vip_counters SimpleCounterStat in the stats
// segment; sticky is the src_ip_sticky flag shown on the header line.
type lbVIPSnapshot struct {
	index  uint32
	sticky bool
}
|
|
|
|
// queryLBVIPSnapshot runs `show lb vips verbose` and parses it into a map
|
|
// keyed by vipKey. The returned error is non-nil only when the cli_inband
|
|
// RPC itself fails.
|
|
func queryLBVIPSnapshot(ch *loggedChannel) (map[vipKey]lbVIPSnapshot, error) {
|
|
req := &vlib.CliInband{Cmd: "show lb vips verbose"}
|
|
reply := &vlib.CliInbandReply{}
|
|
if err := ch.SendRequest(req).ReceiveReply(reply); err != nil {
|
|
return nil, fmt.Errorf("cli_inband %q: %w", req.Cmd, err)
|
|
}
|
|
if reply.Retval != 0 {
|
|
return nil, fmt.Errorf("cli_inband %q: retval=%d", req.Cmd, reply.Retval)
|
|
}
|
|
return parseLBVIPSnapshot(reply.Reply), nil
|
|
}
|
|
|
|
// queryLBSticky is a thin projection of queryLBVIPSnapshot for callers that
|
|
// only care about the src_ip_sticky flag (the sync path).
|
|
func queryLBSticky(ch *loggedChannel) (map[vipKey]bool, error) {
|
|
snap, err := queryLBVIPSnapshot(ch)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
out := make(map[vipKey]bool, len(snap))
|
|
for k, v := range snap {
|
|
out[k] = v.sticky
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// lbVIPHeaderRe matches the first line of each VIP block in the output of
// `show lb vips verbose`. VPP produces lines like:
//
//	ip4-gre4 [1] 192.0.2.1/32 src_ip_sticky
//	ip6-gre6 [2] 2001:db8::1/128
//
// The leading alternation lists the encap node names the plugin can print;
// a VIP whose header names an encap outside this list would not match and
// would be skipped by parseLBVIPSnapshot.
//
// Capture groups: 1 = pool index, 2 = prefix (CIDR), 3 = " src_ip_sticky" when present.
var lbVIPHeaderRe = regexp.MustCompile(
	`\b(?:ip4-gre4|ip6-gre6|ip4-gre6|ip6-gre4|ip4-l3dsr|ip4-nat4|ip6-nat6)\s+\[(\d+)\]\s+(\S+)(\s+src_ip_sticky)?`,
)
|
|
|
|
// lbVIPProtoRe matches the `protocol:<n> port:<n>` sub-line that appears
// under each non-all-port VIP block. All-port VIPs have no such line and
// keep the parser's protocol=255 / port=0 defaults.
var lbVIPProtoRe = regexp.MustCompile(`protocol:(\d+)\s+port:(\d+)`)
|
|
|
|
// parseLBVIPSnapshot turns the reply text of `show lb vips verbose` into a
|
|
// map of vipKey → lbVIPSnapshot. It walks lines sequentially: each header
|
|
// line starts a new VIP, and the following `protocol:<p> port:<port>` line
|
|
// (if any) refines the key. All-port VIPs have no protocol/port sub-line
|
|
// and are recorded with protocol=255 and port=0 — the same key format used
|
|
// by the rest of the sync path (see protocolFromConfig).
|
|
func parseLBVIPSnapshot(text string) map[vipKey]lbVIPSnapshot {
|
|
out := make(map[vipKey]lbVIPSnapshot)
|
|
var havePending bool
|
|
var curPrefix string
|
|
var curIndex uint32
|
|
var curSticky bool
|
|
var curProto uint8 = 255
|
|
var curPort uint16
|
|
|
|
flush := func() {
|
|
if !havePending || curPrefix == "" {
|
|
return
|
|
}
|
|
out[vipKey{prefix: curPrefix, protocol: curProto, port: curPort}] = lbVIPSnapshot{
|
|
index: curIndex,
|
|
sticky: curSticky,
|
|
}
|
|
}
|
|
|
|
for _, line := range strings.Split(text, "\n") {
|
|
if m := lbVIPHeaderRe.FindStringSubmatch(line); m != nil {
|
|
flush()
|
|
havePending = true
|
|
if idx, err := strconv.ParseUint(m[1], 10, 32); err == nil {
|
|
curIndex = uint32(idx)
|
|
} else {
|
|
curIndex = 0
|
|
}
|
|
curPrefix = canonicalCIDR(m[2])
|
|
curSticky = m[3] != ""
|
|
curProto = 255
|
|
curPort = 0
|
|
continue
|
|
}
|
|
if !havePending {
|
|
continue
|
|
}
|
|
if m := lbVIPProtoRe.FindStringSubmatch(line); m != nil {
|
|
if p, err := strconv.Atoi(m[1]); err == nil {
|
|
curProto = uint8(p)
|
|
}
|
|
if p, err := strconv.Atoi(m[2]); err == nil {
|
|
curPort = uint16(p)
|
|
}
|
|
}
|
|
}
|
|
flush()
|
|
return out
|
|
}
|
|
|
|
// canonicalCIDR parses a CIDR string and returns it in Go's canonical
// net.IPNet.String() form so it matches the keys produced by makeVIPKey.
// Returns the input unchanged if parsing fails — the header regex should
// have validated it, but a mismatch shouldn't panic the sync path.
func canonicalCIDR(s string) string {
	if _, network, err := net.ParseCIDR(s); err == nil {
		return network.String()
	}
	return s
}
|
|
|
|
// ---- utility ---------------------------------------------------------------
|
|
|
|
func makeVIPKey(prefix *net.IPNet, protocol uint8, port uint16) vipKey {
|
|
return vipKey{prefix: prefix.String(), protocol: protocol, port: port}
|
|
}
|
|
|
|
// protocolFromConfig maps the config-file protocol string to the numeric
// encoding used across the sync path: "tcp" → 6, "udp" → 17, anything
// else → 255 ("any").
func protocolFromConfig(s string) uint8 {
	if s == "tcp" {
		return 6
	}
	if s == "udp" {
		return 17
	}
	return 255 // any
}
|
|
|
|
// protocolName renders a numeric LB protocol as the operator-facing string
// used in log lines: 6 → "tcp", 17 → "udp", 255 → "any"; any other value
// is printed in decimal.
func protocolName(p uint8) string {
	switch p {
	case 6:
		return "tcp"
	case 17:
		return "udp"
	case 255:
		return "any"
	}
	// strconv.Itoa is the direct integer→string conversion;
	// fmt.Sprintf("%d", …) would box the argument and go through
	// reflection for the same result.
	return strconv.Itoa(int(p))
}
|
|
|
|
func encapForIP(ip net.IP) lb_types.LbEncapType {
|
|
if ip.To4() != nil {
|
|
return lb_types.LB_API_ENCAP_TYPE_GRE4
|
|
}
|
|
return lb_types.LB_API_ENCAP_TYPE_GRE6
|
|
}
|
|
|
|
func encapName(e lb_types.LbEncapType) string {
|
|
switch e {
|
|
case lb_types.LB_API_ENCAP_TYPE_GRE4:
|
|
return "gre4"
|
|
case lb_types.LB_API_ENCAP_TYPE_GRE6:
|
|
return "gre6"
|
|
}
|
|
return fmt.Sprintf("%d", e)
|
|
}
|