Files
vpp-snmp-agent/vpp-snmp-agent.py
Pim van Pelt 95d96d5e61 bugfix: add a control_ping() before each update
If VPP were to disconnect either the Stats Segment or the API endpoint,
for example if it crashes and restarts, vpp-snmp-agent will not detect
this. In such a situation, it will hold on to the stale stats and no
longer receive interface updates.

Before each update, send a control_ping() API request. If that fails
(for example with Broken Pipe or Connection Refused), disconnect both the
API and the Stats Segment (the vpp.disconnect() call also invalidates the
interface and LCP caches), and then fail the update. The Agent runner will then
retry once per second until the connection (and control_ping()) succeeds.

TESTED:
- Start vpp-snmp-agent; it connects and starts up as normal.
- Exit / Kill vpp
- Upon the next update(), the control_ping() call will fail, causing the
  agent to disconnect
- The agent will now loop:
[ERROR   ]      agentx.agent - update         : VPP API: [Errno 1] Sendall error: BrokenPipeError(32, 'Broken pipe'), retrying
[WARNING ]      agentx.agent - run            : Update failed, last successful update was 1673345631.7658572
[INFO    ]     agentx.vppapi - connect        : Connecting to VPP
[ERROR   ]      agentx.agent - update         : VPP API: Not connected, api definitions not available, retrying

- Start VPP again, when its API endpoint is ready:
[INFO    ]     agentx.vppapi - connect        : Connecting to VPP
[INFO    ]     agentx.vppapi - connect        : VPP version is 23.02-rc0~199-gcfaf44020
[INFO    ]     agentx.vppapi - connect        : Enabling VPP API interface events
[DEBUG   ]      agentx.agent - update         : VPP API: control_ping_reply(_0=24, context=12, retval=0, client_index=0, vpe_pid=705326)
[INFO    ]     agentx.vppapi - get_ifaces     : Requesting interfaces from VPP API
[INFO    ]     agentx.vppapi - get_lcp        : Requesting LCPs from VPP API

- The agent resumes where it left off
2023-01-10 11:24:44 +01:00

412 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from vppstats import VPPStats
from vppapi import VPPApi
import sys
import yaml
import agentx
try:
import argparse
except ImportError:
print("ERROR: install argparse manually: sudo pip install argparse")
sys.exit(2)
def get_phy_by_sw_if_index(ifaces, sw_if_index):
    """Return the interface entry whose ``sw_if_index`` matches, or None.

    Args:
        ifaces: mapping whose values expose a ``sw_if_index`` attribute.
        sw_if_index: the index to search for.

    Returns:
        The first matching value of *ifaces*, or None when nothing matches
        or *ifaces* cannot be searched (e.g. it is None).
    """
    try:
        for _name, iface in ifaces.items():
            if iface.sw_if_index == sw_if_index:
                return iface
    except (AttributeError, TypeError):
        # Best-effort lookup: a missing/malformed interface cache means
        # "not found". Unlike a bare except, this does not swallow
        # KeyboardInterrupt or SystemExit.
        pass
    return None
def get_lcp_by_host_sw_if_index(lcp, host_sw_if_index):
    """Return the LCP entry whose ``host_sw_if_index`` matches, or None.

    Args:
        lcp: mapping whose values expose a ``host_sw_if_index`` attribute.
        host_sw_if_index: the host-side index to search for.

    Returns:
        The first matching value of *lcp*, or None when nothing matches or
        *lcp* cannot be searched (e.g. it is None).
    """
    try:
        for _name, pair in lcp.items():
            if pair.host_sw_if_index == host_sw_if_index:
                return pair
    except (AttributeError, TypeError):
        # Best-effort lookup: a missing/malformed LCP cache means
        # "not found". Unlike a bare except, this does not swallow
        # KeyboardInterrupt or SystemExit.
        pass
    return None
def get_description_by_ifname(config, name):
    """Look up the configured description of interface *name*.

    The ``interfaces`` section is searched first (each interface, then its
    ``sub-interfaces``, whose names are formed as ``<ifname>.<sub_id>``),
    followed by the ``loopbacks``, ``taps`` and ``vxlan_tunnels`` sections.

    Args:
        config: parsed configuration mapping.
        name: interface name to look up.

    Returns:
        The ``description`` value of the first matching entry, or None when
        there is no match, the matching entry has no description, or the
        configuration does not have the expected shape.
    """
    try:
        if "interfaces" in config:
            for ifname, iface in config["interfaces"].items():
                if ifname == name:
                    return iface["description"]
                if "sub-interfaces" in iface:
                    for sub_id, sub_iface in iface["sub-interfaces"].items():
                        if name == "%s.%d" % (ifname, sub_id):
                            return sub_iface["description"]
        # These sections share one flat layout: {ifname: {"description": ...}}
        for section in ("loopbacks", "taps", "vxlan_tunnels"):
            if section in config:
                for ifname, iface in config[section].items():
                    if ifname == name:
                        return iface["description"]
    except (AttributeError, KeyError, TypeError, ValueError):
        # Best-effort lookup: a malformed or incomplete config simply yields
        # no description. KeyboardInterrupt/SystemExit still propagate.
        pass
    return None
class MyAgent(agentx.Agent):
    """AgentX agent that exports VPP interface state and counters.

    Interface names and counters are read from the VPP Stats Segment,
    interface details and LCP pairs from the VPP API. For every interface
    one row is written under the two registered OID subtrees
    (1.3.6.1.2.1.2.2.1 and 1.3.6.1.2.1.31.1.1.1), using the index
    ``1000 + <position in /if/names>``.
    """

    _IF_TABLE = "1.3.6.1.2.1.2.2.1"
    _IFX_TABLE = "1.3.6.1.2.1.31.1.1.1"

    # (column, stats segment path, accessor method) -- written as "u32",
    # wrapped modulo 2**32. Column names per IF-MIB (RFC 2863).
    _IF_COUNTERS32 = (
        (10, "/if/rx", "sum_octets"),  # ifInOctets
        (11, "/if/rx", "sum_packets"),  # ifInUcastPkts
        (12, "/if/rx-multicast", "sum_packets"),  # ifInNUcastPkts
        (13, "/if/rx-no-buf", "sum"),  # ifInDiscards
        (14, "/if/rx-error", "sum"),  # ifInErrors
        (16, "/if/tx", "sum_octets"),  # ifOutOctets
        (17, "/if/tx", "sum_packets"),  # ifOutUcastPkts
        (18, "/if/tx-multicast", "sum_packets"),  # ifOutNUcastPkts
        (19, "/if/drops", "sum"),  # ifOutDiscards
        (20, "/if/tx-error", "sum"),  # ifOutErrors
    )
    _IFX_COUNTERS32 = (
        (2, "/if/rx-multicast", "sum_packets"),  # ifInMulticastPkts
        (3, "/if/rx-broadcast", "sum_packets"),  # ifInBroadcastPkts
        (4, "/if/tx-multicast", "sum_packets"),  # ifOutMulticastPkts
        (5, "/if/tx-broadcast", "sum_packets"),  # ifOutBroadcastPkts
    )
    # Same layout, but written as "u64" without wrapping.
    _IFX_COUNTERS64 = (
        (6, "/if/rx", "sum_octets"),  # ifHCInOctets
        (7, "/if/rx", "sum_packets"),  # ifHCInUcastPkts
        (8, "/if/rx-multicast", "sum_packets"),  # ifHCInMulticastPkts
        (9, "/if/rx-broadcast", "sum_packets"),  # ifHCInBroadcastPkts
        (10, "/if/tx", "sum_octets"),  # ifHCOutOctets
        (11, "/if/tx", "sum_packets"),  # ifHCOutUcastPkts
        (12, "/if/tx-multicast", "sum_packets"),  # ifHCOutMulticastPkts
        (13, "/if/tx-broadcast", "sum_packets"),  # ifHCOutBroadcastPkts
    )

    def setup(self):
        """Load the optional config, connect to VPP and register the OIDs.

        A config file that cannot be read is logged and ignored (the agent
        then runs without descriptions). Failure to connect to either the
        Stats Segment or the API aborts the setup.

        Returns:
            True on success, False if a VPP connection could not be made.
        """
        self.config = None
        if self._args.config:
            try:
                with open(self._args.config, "r") as f:
                    self.logger.info("Loading configfile %s" % self._args.config)
                    # NOTE(review): FullLoader accepts more than plain data
                    # tags; consider yaml.safe_load if this file could ever
                    # come from an untrusted source.
                    self.config = yaml.load(f, Loader=yaml.FullLoader)
                    self.logger.debug("Config: %s" % self.config)
            except Exception:
                self.logger.error("Couldn't read config from %s" % self._args.config)

        try:
            self.logger.info("Connecting to VPP Stats Segment")
            self.vppstat = VPPStats(socketname="/run/vpp/stats.sock", timeout=2)
            self.vppstat.connect()
        except Exception:
            self.logger.error("Could not connect to VPPStats segment")
            return False

        try:
            self.vpp = VPPApi(clientname="vpp-snmp-agent")
            self.vpp.connect()
        except Exception:
            self.logger.error("Could not connect to VPP API")
            return False

        self.register(self._IF_TABLE)
        self.register(self._IFX_TABLE)
        return True

    def update(self):
        """Build a fresh DataSet describing all VPP interfaces.

        The API connection is probed with control_ping() first; if that or
        either (re)connect fails, both the Stats Segment and the API are
        disconnected and the update is reported as failed.

        Returns:
            The populated agentx.DataSet, or False when VPP is unreachable.
        """
        try:
            self.vpp.connect()
            r = self.vpp.vpp.api.control_ping()
            self.logger.debug(f"VPP API: {r}")
            self.vppstat.connect()
        except Exception as e:
            self.logger.error(f"VPP API: {e}, retrying")
            self.vppstat.disconnect()
            self.vpp.disconnect()
            return False

        ds = agentx.DataSet()
        ifaces = self.vpp.get_ifaces()
        lcp = self.vpp.get_lcp()
        # Read the name list once so the count check and the loop below
        # operate on the same snapshot.
        names = self.vppstat["/if/names"]

        num_ifaces = len(ifaces)
        num_vppstat = len(names)
        num_lcp = len(lcp)
        self.logger.debug(
            "Retrieved Interfaces: vppapi=%d vppstat=%d lcp=%d"
            % (num_ifaces, num_vppstat, num_lcp)
        )
        if num_ifaces != num_vppstat:
            self.logger.warning(
                "Interfaces count mismatch: vppapi=%d vppstat=%d"
                % (num_ifaces, num_vppstat)
            )

        for i, ifname in enumerate(names):
            idx = 1000 + i
            # An interface can be present in the Stats Segment but absent
            # from the API view; placeholders are then reported.
            iface = ifaces[ifname] if ifname in ifaces else None
            ifName, ifAlias = self._lcp_names(ifname, ifaces, lcp)
            self._set_if_table(ds, i, idx, ifname, ifName, iface)
            self._set_ifx_table(ds, i, idx, ifname, ifName, ifAlias, iface)
        return ds

    def _lcp_names(self, ifname, ifaces, lcp):
        """Return (ifName, ifAlias) for *ifname*, using its LCP pair if any.

        Only applied when a config is loaded and the name starts with
        "tap"; otherwise, or when no LCP pair is found, the result is
        (ifname, None).
        """
        if_name, if_alias = ifname, None
        if not (self.config and ifname.startswith("tap")):
            return if_name, if_alias
        try:
            host_sw_if_index = ifaces[ifname].sw_if_index
            lip = get_lcp_by_host_sw_if_index(lcp, host_sw_if_index)
            if lip:
                phy = get_phy_by_sw_if_index(ifaces, lip.phy_sw_if_index)
                if_name = lip.host_if_name
                self.logger.debug("Setting ifName of %s to '%s'" % (ifname, if_name))
                if phy:
                    if_alias = "LCP %s (%s)" % (phy.interface_name, ifname)
                    self.logger.debug(
                        "Setting ifAlias of %s to '%s'" % (ifname, if_alias)
                    )
        except Exception:
            # Best effort: keep whatever was resolved before the failure.
            self.logger.debug("No config entry found for ifname %s" % (ifname))
        return if_name, if_alias

    def _set_if_table(self, ds, i, idx, ifname, ifName, iface):
        """Write the 1.3.6.1.2.1.2.2.1 (ifTable) columns of one interface.

        *i* is the interface position in the Stats Segment vectors, *idx*
        the SNMP index, *iface* the API details or None when unavailable.
        """
        oid = self._IF_TABLE
        ds.set(f"{oid}.1.{idx}", "int", idx)
        ds.set(f"{oid}.2.{idx}", "str", ifName)
        # ifType: softwareLoopback(24) for loopbacks, else ethernetCsmacd(6)
        ds.set(f"{oid}.3.{idx}", "int", 24 if ifname.startswith("loop") else 6)

        mtu = 0
        if iface is None:
            self.logger.warning("Could not get MTU for interface %s", ifname)
        else:
            mtu = iface.mtu[0]
        ds.set(f"{oid}.4.{idx}", "int", mtu)

        speed = 0
        if ifname.startswith(("loop", "tap")):
            speed = 1000000000
        elif iface is None:
            self.logger.warning("Could not get link speed for interface %s", ifname)
        else:
            # presumably link_speed is in kbit/s (x1000 here, /1000 for the
            # Mbit/s column in the other table) -- TODO confirm
            speed = iface.link_speed * 1000
        # The value is written as gauge32, so clamp it to the 32-bit maximum.
        speed = min(speed, 2**32 - 1)
        ds.set(f"{oid}.5.{idx}", "gauge32", speed)

        mac = "00:00:00:00:00:00"
        if iface is None:
            self.logger.warning("Could not get PhysAddress for interface %s", ifname)
        else:
            mac = str(iface.l2_address)
        ds.set(f"{oid}.6.{idx}", "str", mac)

        admin_status = 3  # testing
        if iface is None:
            self.logger.warning("Could not get AdminStatus for interface %s", ifname)
        elif int(iface.flags) & 1:
            admin_status = 1  # up
        else:
            admin_status = 2  # down
        ds.set(f"{oid}.7.{idx}", "int", admin_status)

        oper_status = 3  # testing
        if iface is None:
            self.logger.warning("Could not get OperStatus for interface %s", ifname)
        elif int(iface.flags) & 2:
            oper_status = 1  # up
        else:
            oper_status = 2  # down
        ds.set(f"{oid}.8.{idx}", "int", oper_status)

        ds.set(f"{oid}.9.{idx}", "ticks", 0)

        for column, path, accessor in self._IF_COUNTERS32:
            value = getattr(self.vppstat[path][:, i], accessor)()
            ds.set(f"{oid}.{column}.{idx}", "u32", value % 2**32)

    def _set_ifx_table(self, ds, i, idx, ifname, ifName, ifAlias, iface):
        """Write the 1.3.6.1.2.1.31.1.1.1 (ifXTable) columns of one interface.

        *ifAlias* may be None, in which case the configured description is
        used if there is one, and the VPP interface name otherwise.
        """
        oid = self._IFX_TABLE
        ds.set(f"{oid}.1.{idx}", "str", ifName)

        for column, path, accessor in self._IFX_COUNTERS32:
            value = getattr(self.vppstat[path][:, i], accessor)()
            ds.set(f"{oid}.{column}.{idx}", "u32", value % 2**32)
        for column, path, accessor in self._IFX_COUNTERS64:
            value = getattr(self.vppstat[path][:, i], accessor)()
            ds.set(f"{oid}.{column}.{idx}", "u64", value)

        high_speed = 0
        if ifname.startswith(("loop", "tap")):
            high_speed = 1000
        elif iface is None:
            self.logger.warning("Could not get link speed for interface %s", ifname)
        else:
            high_speed = int(iface.link_speed / 1000)
        ds.set(f"{oid}.15.{idx}", "gauge32", high_speed)

        ds.set(f"{oid}.16.{idx}", "int", 2)  # Hardcode to false(2)
        ds.set(f"{oid}.17.{idx}", "int", 1)  # Hardcode to true(1)

        if self.config and not ifAlias:
            try:
                descr = get_description_by_ifname(self.config, ifname)
                if descr:
                    self.logger.debug(
                        "Setting ifAlias of %s to config description '%s'"
                        % (ifname, descr)
                    )
                    ifAlias = descr
            except Exception as e:
                # Best effort: fall through to the interface name below.
                self.logger.debug(
                    "Could not look up description of %s: %s" % (ifname, e)
                )
        if not ifAlias:
            self.logger.debug("Setting ifAlias of %s to ifname %s" % (ifname, ifname))
            ifAlias = ifname
        ds.set(f"{oid}.18.{idx}", "str", ifAlias)

        ds.set(f"{oid}.19.{idx}", "ticks", 0)  # Hardcode to Timeticks: (0) 0:00:00.00
def main():
    """Parse the command line, start the agent and exit when it stops.

    The agent is stopped on any unhandled exception or on Ctrl-C. The
    process always terminates through ``sys.exit(-1)`` once the run ends.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "-a",
        dest="address",
        default="localhost:705",
        type=str,
        help="""Location of the SNMPd agent (unix-path or host:port), default localhost:705""",
    )
    parser.add_argument(
        "-p",
        dest="period",
        type=int,
        default=30,
        help="""Period to poll VPP, default 30 (seconds)""",
    )
    parser.add_argument(
        "-c",
        dest="config",
        type=str,
        help="""Optional vppcfg YAML configuration file, default empty""",
    )
    parser.add_argument(
        "-d", dest="debug", action="store_true", help="""Enable debug, default False"""
    )
    args = parser.parse_args()
    if args.debug:
        print("Arguments:", args)

    agentx.setup_logging(debug=args.debug)

    # Bind the name before the try block: if the constructor itself raises,
    # the handlers below must not fail with a NameError on `a.stop()` and
    # hide the original error.
    a = None
    try:
        a = MyAgent(server_address=args.address, period=args.period, args=args)
        a.run()
    except Exception as e:
        print("Unhandled exception:", e)
        if a is not None:
            a.stop()
    except KeyboardInterrupt:
        if a is not None:
            a.stop()

    sys.exit(-1)


if __name__ == "__main__":
    main()