From 7c864ed099821f62c5be8cbe9ed3f4dd34000a42 Mon Sep 17 00:00:00 2001 From: Pim van Pelt Date: Sun, 29 Aug 2021 17:07:40 +0200 Subject: [PATCH] Temporary fix Stop adding paths with add_special(); there is a scenario with Bird2 that makes this crash: - Assume a VPP which has its fib fully synced - Kill VPP - Bird will see network devices removed, and mark all routes 'unreach' - Start VPP - Bird will see the devices come back, and issue netlink messages for each route that is unreach - these become add_special() because they have no nexthop and are of type UNREACHABLE - adding these to the FIB sometimes crashes in dpo handling To avoid this, no longer add_special() -- as a caveat, manually inserted routes to unreach/blackhole will not be explicitly added, however most will be caught by fib-entry for default-route (which is a 'drop'). This behavior should be fixed, but it's at the moment not obvious to me how and I'd prefer this behavior over SIGABRT/SIGSEGV deeper in the code. --- lcpng_nl_sync.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/lcpng_nl_sync.c b/lcpng_nl_sync.c index 3597f9d..afeb5a3 100644 --- a/lcpng_nl_sync.c +++ b/lcpng_nl_sync.c @@ -470,13 +470,21 @@ lcp_nl_route_add (struct rtnl_route *rr) lcp_nl_mk_route_prefix (rr, &pfx); entry_flags = lcp_nl_mk_route_entry_flags (rtype, table_id, rproto); + /* Connected is already inserted by ip[46]_add_del_interface_address() */ + if (entry_flags & FIB_ENTRY_FLAG_CONNECTED) + { + NL_DBG ("route_add: skip connected table %d prefix %U flags %U", + rtnl_route_get_table (rr), format_fib_prefix, &pfx, + format_fib_entry_flags, entry_flags); + return; + } /* link local IPv6 */ if (FIB_PROTOCOL_IP6 == pfx.fp_proto && (ip6_address_is_multicast (&pfx.fp_addr.ip6) || ip6_address_is_link_local_unicast (&pfx.fp_addr.ip6))) { - NL_DBG ("route_add: skip table %d prefix %U flags %U", + NL_DBG ("route_add: skip linklocal table %d prefix %U flags %U", rtnl_route_get_table 
(rr), format_fib_prefix, &pfx, format_fib_entry_flags, entry_flags); return; @@ -489,8 +497,11 @@ lcp_nl_route_add (struct rtnl_route *rr) }; rtnl_route_foreach_nexthop (rr, lcp_nl_route_path_parse, &np); - - lcp_nl_route_path_add_special (rr, &np); + // TODO(pim) - figure out why we have spurious crashes when + // adding a route w/ nexthops {} or nexthops { idx 1 } on an + // empty FIB. + // + // lcp_nl_route_path_add_special (rr, &np); if (0 != vec_len (np.paths)) { @@ -530,9 +541,16 @@ lcp_nl_route_add (struct rtnl_route *rr) } } else - NL_ERROR ("route_add: no paths table %d prefix %U flags %U", - rtnl_route_get_table (rr), format_fib_prefix, &pfx, - format_fib_entry_flags, entry_flags); + // TODO(pim) - while the above add_special() is commented out, any + // route inserted with unreach/prohibit/blackhole, for example when VPP + // is restarted, Bird will flip all routes in the RIB to 'unreach', and + // when it rediscovers devices and their connecteds, it will send a whole + // bunch of them back. Ignore for now. When add_special() is fixed, this + // should become a WARN again. + NL_INFO ("route_add: no paths table %d prefix %U flags %U netlink %U", + rtnl_route_get_table (rr), format_fib_prefix, &pfx, + format_fib_entry_flags, entry_flags, format_nl_object, rr); + vec_free (np.paths); }