diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f2149a..4e47d1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ add_vpp_library(lcpng lcpng_interface.c lcpng_adj.c lcpng_if_sync.c + lcpng_mpls_sync.c lcpng.c LINK_LIBRARIES diff --git a/lcpng_if_node.c b/lcpng_if_node.c index 80417e6..e1d674b 100644 --- a/lcpng_if_node.c +++ b/lcpng_if_node.c @@ -438,6 +438,103 @@ VNET_FEATURE_INIT (lcp_xc_ip6_mcast_node, static) = { .node_name = "linux-cp-xc-ip6", }; +typedef enum +{ + LCP_XC_MPLS_NEXT_DROP, + LCP_XC_MPLS_NEXT_IO, + LCP_XC_MPLS_N_NEXT, +} lcp_xc_mpls_next_t; + +static_always_inline uword +lcp_xc_mpls_inline (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame) +{ + u32 n_left_from, *from, *to_next, n_left_to_next; + lcp_xc_next_t next_index; + + next_index = 0; + n_left_from = frame->n_vectors; + from = vlib_frame_vector_args (frame); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + const ethernet_header_t *eth; + const lcp_itf_pair_t *lip; + u32 next0, bi0, lipi, ai; + vlib_buffer_t *b0; + // const ip_adjacency_t *adj; + + bi0 = to_next[0] = from[0]; + + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + lipi = + lcp_itf_pair_find_by_host (vnet_buffer (b0)->sw_if_index[VLIB_RX]); + lip = lcp_itf_pair_get (lipi); + + vnet_buffer (b0)->sw_if_index[VLIB_TX] = lip->lip_phy_sw_if_index; + vlib_buffer_advance (b0, -lip->lip_rewrite_len); + eth = vlib_buffer_get_current (b0); + + ai = ADJ_INDEX_INVALID; + next0 = LCP_XC_MPLS_NEXT_DROP; + if (!ethernet_address_cast (eth->dst_address)) + ai = lcp_adj_lkup ((u8 *) eth, lip->lip_rewrite_len, + vnet_buffer (b0)->sw_if_index[VLIB_TX]); + if (ai != ADJ_INDEX_INVALID) + { + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = ai; + next0 = LCP_XC_MPLS_NEXT_IO; + } + + if (PREDICT_FALSE ((b0->flags & VLIB_BUFFER_IS_TRACED))) + { + lcp_xc_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->phy_sw_if_index = lip->lip_phy_sw_if_index; + t->adj_index = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +VLIB_NODE_FN (lcp_xc_mpls) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return (lcp_xc_mpls_inline (vm, node, frame)); +} + +VLIB_REGISTER_NODE ( + lcp_xc_mpls) = { .name = "linux-cp-xc-mpls", + .vector_size = sizeof (u32), + .format_trace = format_lcp_xc_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_next_nodes = LCP_XC_MPLS_N_NEXT, + .next_nodes = { + [LCP_XC_MPLS_NEXT_DROP] = "error-drop", + [LCP_XC_MPLS_NEXT_IO] = "interface-output", + } }; + +VNET_FEATURE_INIT (lcp_xc_mpls_node, static) = { + .arc_name = "mpls-input", + .node_name = "linux-cp-xc-mpls", +}; + typedef enum { LCP_XC_L3_NEXT_XC, diff --git a/lcpng_interface.c b/lcpng_interface.c index dd9cba6..a93d770 100644 --- a/lcpng_interface.c +++ b/lcpng_interface.c @@ -282,6 +282,10 @@ lcp_itf_pair_add (u32 host_sw_if_index, u32 phy_sw_if_index, u8 *host_name, lcp_itf_l3_feat_names[lip->lip_host_type][af], lip->lip_host_sw_if_index, 1, NULL, 0); + /* Enable MPLS */ + vnet_feature_enable_disable ("mpls-input", "linux-cp-xc-mpls", + lip->lip_host_sw_if_index, 1, NULL, 0); + /* * Configure passive punt to the host interface. */ @@ -430,6 +434,9 @@ lcp_itf_pair_del (u32 phy_sw_if_index) lcp_itf_l3_feat_names[lip->lip_host_type][af], lip->lip_host_sw_if_index, 0, NULL, 0); + vnet_feature_enable_disable ("mpls-input", "linux-cp-xc-mpls", + lip->lip_host_sw_if_index, 0, NULL, 0); + lcp_itf_unset_adjs (lip); ip4_punt_redirect_del (lip->lip_phy_sw_if_index); diff --git a/lcpng_mpls_sync.c b/lcpng_mpls_sync.c new file mode 100644 index 0000000..47fed5e --- /dev/null +++ b/lcpng_mpls_sync.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define _GNU_SOURCE + +#include +#include +#include + +#include + +#include + +void +lcp_mpls_sync_pair_add_cb (lcp_itf_pair_t *lip) +{ + u8 phy_is_enabled = mpls_sw_interface_is_enabled (lip->lip_phy_sw_if_index); + LCP_IF_DBG ("mpls_pair_add_cb: mpls enabled %u, parent %U", phy_is_enabled, + format_lcp_itf_pair, lip); + if (phy_is_enabled) + mpls_sw_interface_enable_disable (&mpls_main, lip->lip_host_sw_if_index, + 1); +} + +void +lcp_mpls_sync_state_cb (struct mpls_main_t *mm, uword opaque, u32 sw_if_index, + u32 is_enable) +{ + lcp_itf_pair_t *lip; + index_t lipi; + int curr_ns_fd = -1; + int vif_ns_fd = -1; + int ctl_fd = -1; + u8 *ctl_path = NULL; + + LCP_IF_DBG ("mpls_sync_state_cb: called for sw_if_index %u", sw_if_index); + + // If device is LCP PHY, sync state to host tap. + lipi = lcp_itf_pair_find_by_phy (sw_if_index); + if (INDEX_INVALID != lipi) + { + lip = lcp_itf_pair_get (lipi); + LCP_IF_DBG ("mpls_sync_state_cb: mpls enabled %u parent %U", is_enable, + format_lcp_itf_pair, lip); + mpls_sw_interface_enable_disable (&mpls_main, lip->lip_host_sw_if_index, + is_enable); + return; + } + + // If device is LCP host, sync state to linux. + lipi = lcp_itf_pair_find_by_host (sw_if_index); + if (INDEX_INVALID == lipi) + return; + + lip = lcp_itf_pair_get (lipi); + if (lip->lip_namespace) + { + curr_ns_fd = clib_netns_open (NULL /* self */); + vif_ns_fd = clib_netns_open (lip->lip_namespace); + if (vif_ns_fd != -1) + clib_setns (vif_ns_fd); + } + + ctl_path = format (NULL, "/proc/sys/net/mpls/conf/%s/input%c", + lip->lip_host_name, NULL); + if (NULL == ctl_path) + { + LCP_IF_DBG ("mpls_sync_state_cb: failed to format sysctl"); + goto SYNC_CLEANUP; + } + + ctl_fd = open ((char *) ctl_path, O_WRONLY); + if (ctl_fd < 0) + { + LCP_IF_DBG ("mpls_sync_state_cb: failed to open %s for writing", + ctl_path); + goto SYNC_CLEANUP; + } + + if (fdformat (ctl_fd, "%u", is_enable) < 1) + { + LCP_IF_DBG ("mpls_sync_state_cb: failed to write to %s", ctl_path); + goto SYNC_CLEANUP; + } + + LCP_IF_DBG ("mpls_sync_state_cb: set mpls input for %s", lip->lip_host_name); + +SYNC_CLEANUP: + if (ctl_fd < 0) + close (ctl_fd); + + if (NULL != ctl_path) + vec_free (ctl_path); + + if (vif_ns_fd != -1) + close (vif_ns_fd); + + if (curr_ns_fd != -1) + { + clib_setns (curr_ns_fd); + close (curr_ns_fd); + } +} + +static clib_error_t * +lcp_mpls_sync_init (vlib_main_t *vm) +{ + lcp_itf_pair_vft_t mpls_sync_itf_pair_vft = { + .pair_add_fn = lcp_mpls_sync_pair_add_cb, + }; + lcp_itf_pair_register_vft (&mpls_sync_itf_pair_vft); + + mpls_interface_state_change_add_callback (lcp_mpls_sync_state_cb, 0); + + return NULL; +} + +VLIB_INIT_FUNCTION (lcp_mpls_sync_init) = { + .runs_after = VLIB_INITS ("lcp_itf_pair_init", "mpls_init"), +}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/lcpng_netlink.h b/lcpng_netlink.h index c0efc2d..ab368af 100644 --- a/lcpng_netlink.h +++ b/lcpng_netlink.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/lcpng_nl_sync.c b/lcpng_nl_sync.c index 6496e24..0dcbfa1 100644 --- a/lcpng_nl_sync.c +++ b/lcpng_nl_sync.c @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -146,13 +147,31 @@ vnet_sw_interface_get_available_subid (vnet_main_t *vnm, u32 sw_if_index, return 1; } +static fib_protocol_t +lcp_nl_proto_k2f (uint32_t k) +{ + switch (k) + { + case AF_INET6: + return FIB_PROTOCOL_IP6; + case AF_INET: + return FIB_PROTOCOL_IP4; + case AF_MPLS: + return FIB_PROTOCOL_MPLS; + default: + ASSERT (0); + return FIB_PROTOCOL_NONE; + } +} + static fib_protocol_t lcp_nl_mk_addr46 (const struct nl_addr *rna, ip46_address_t *ia) { fib_protocol_t fproto; - fproto = - nl_addr_get_family (rna) == AF_INET6 ? FIB_PROTOCOL_IP6 : FIB_PROTOCOL_IP4; + fproto = lcp_nl_proto_k2f (nl_addr_get_family (rna)); + ASSERT (FIB_PROTOCOL_MPLS != fproto); + ip46_address_reset (ia); if (FIB_PROTOCOL_IP4 == fproto) memcpy (&ia->ip4, nl_addr_get_binary_addr (rna), nl_addr_get_len (rna)); @@ -166,9 +185,31 @@ static void lcp_nl_mk_route_prefix (struct rtnl_route *r, fib_prefix_t *p) { const struct nl_addr *addr = rtnl_route_get_dst (r); + u32 *baddr = nl_addr_get_binary_addr (addr); + u32 blen = nl_addr_get_len (addr); + ip46_address_t *paddr = &p->fp_addr; + u32 entry; + + p->fp_proto = lcp_nl_proto_k2f (nl_addr_get_family (addr)); + + switch (p->fp_proto) + { + case FIB_PROTOCOL_MPLS: + entry = ntohl (*baddr); + p->fp_label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + p->fp_len = 21; + p->fp_eos = MPLS_NON_EOS; + return; + case FIB_PROTOCOL_IP4: + ip46_address_reset (paddr); + memcpy (&paddr->ip4, baddr, blen); + break; + case FIB_PROTOCOL_IP6: + memcpy (&paddr->ip6, baddr, blen); + break; + } p->fp_len = nl_addr_get_prefixlen (addr); - p->fp_proto = lcp_nl_mk_addr46 (addr, &p->fp_addr); } static void @@ -208,6 +249,37 @@ lcp_nl_mk_route_entry_flags (uint8_t rtype, int table_id, uint8_t rproto) return (fef); } +static int +lcp_router_mpls_nladdr_to_path (fib_route_path_t *path, struct nl_addr *addr) +{ + if (!addr) + return 0; + + struct mpls_label *stack = nl_addr_get_binary_addr (addr); + u32 entry, label; + u8 exp, ttl; + int label_count = 0; + + while (1) + { + entry = ntohl (stack[label_count++].entry); + label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + exp = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + + fib_mpls_label_t fml = { + .fml_value = label, + .fml_exp = exp, + .fml_ttl = ttl, + }; + vec_add1 (path->frp_label_stack, fml); + + if (entry & MPLS_LS_S_MASK) + break; + } + return label_count; +} + static void lcp_nl_route_path_parse (struct rtnl_nexthop *rnh, void *arg) { @@ -216,6 +288,7 @@ lcp_nl_route_path_parse (struct rtnl_nexthop *rnh, void *arg) lcp_itf_pair_t *lip; fib_protocol_t fproto; struct nl_addr *addr; + int label_count = 0; /* We do not log a warning/error here, because some routes (like * blackhole/unreach) don't have an interface associated with them. @@ -230,9 +303,16 @@ lcp_nl_route_path_parse (struct rtnl_nexthop *rnh, void *arg) path->frp_flags = FIB_ROUTE_PATH_FLAG_NONE | ctx->type_flags; path->frp_sw_if_index = lip->lip_phy_sw_if_index; - path->frp_weight = rtnl_route_nh_get_weight (rnh); path->frp_preference = ctx->preference; + /* + * FIB Path Weight of 0 is meaningless and replaced with 1 further along. + * See fib_path_create. fib_path_cmp_w_route_path would fail to match + * such a fib_route_path_t with any fib_path_t, because a fib_path_t's + * fp_weight can never be 0. + */ + path->frp_weight = clib_max (1, rtnl_route_nh_get_weight (rnh)); + addr = rtnl_route_nh_get_gateway (rnh); if (!addr) addr = rtnl_route_nh_get_via (rnh); @@ -244,6 +324,32 @@ lcp_nl_route_path_parse (struct rtnl_nexthop *rnh, void *arg) path->frp_proto = fib_proto_to_dpo (fproto); + if (ctx->route_proto == FIB_PROTOCOL_MPLS) + { + addr = rtnl_route_nh_get_newdst (rnh); + label_count = lcp_router_mpls_nladdr_to_path (path, addr); + if (label_count) + { + LCP_NL_DBG ("router_path_parse: is label swap to %u", + path->frp_label_stack[0].fml_value); + } + else + { + fib_mpls_label_t fml = { + .fml_value = MPLS_LABEL_POP, + }; + vec_add1 (path->frp_label_stack, fml); + LCP_NL_DBG ("router_path_parse: is label pop"); + } + } + +#ifdef NL_CAPABILITY_VERSION_3_6_0 + addr = rtnl_route_nh_get_encap_mpls_dst (rnh); + label_count = lcp_router_mpls_nladdr_to_path (path, addr); + if (label_count) + LCP_NL_DBG ("router_path_parse: has encap mpls, %d labels", label_count); +#endif + if (ctx->is_mcast) path->frp_mitf_flags = MFIB_ITF_FLAG_FORWARD; @@ -439,11 +545,21 @@ lcp_nl_route_del (struct rtnl_route *rr) LCP_NL_DBG ("route_del: table %d prefix %U flags %U", rtnl_route_get_table (rr), format_fib_prefix, &pfx, format_fib_entry_flags, entry_flags); - if (pfx.fp_proto == FIB_PROTOCOL_IP6) - fib_table_entry_delete (nlt->nlt_fib_index, &pfx, fib_src); - else - fib_table_entry_path_remove2 (nlt->nlt_fib_index, &pfx, fib_src, - np.paths); + + switch (pfx.fp_proto) + { + case FIB_PROTOCOL_IP6: + fib_table_entry_delete (nlt->nlt_fib_index, &pfx, fib_src); + break; + case FIB_PROTOCOL_MPLS: + fib_table_entry_path_remove2 (nlt->nlt_fib_index, &pfx, fib_src, + np.paths); + /* delete the EOS route in addition to NEOS - fallthrough */ + pfx.fp_eos = MPLS_EOS; + default: + fib_table_entry_path_remove2 (nlt->nlt_fib_index, &pfx, fib_src, + np.paths); + } } vec_free (np.paths); @@ -451,6 +567,26 @@ lcp_nl_route_del (struct rtnl_route *rr) lcp_nl_table_unlock (nlt); } +static fib_route_path_t * +lcp_router_fib_route_path_dup (fib_route_path_t *old) +{ + int idx; + fib_route_path_t *p; + + fib_route_path_t *new = vec_dup (old); + if (!new) + return NULL; + + for (idx = 0; idx < vec_len (new); idx++) + { + p = &new[idx]; + if (p->frp_label_stack) + p->frp_label_stack = vec_dup (p->frp_label_stack); + } + + return new; +} + void lcp_nl_route_add (struct rtnl_route *rr, int is_replace) { @@ -536,6 +672,24 @@ lcp_nl_route_add (struct rtnl_route *rr, int is_replace) rtnl_route_get_table (rr), format_fib_prefix, &pfx, format_fib_entry_flags, entry_flags); + if (pfx.fp_proto == FIB_PROTOCOL_MPLS) + { + /* in order to avoid double-frees, we duplicate the paths. */ + fib_route_path_t *pathdup = + lcp_router_fib_route_path_dup (np.paths); + if (is_replace) + fib_table_entry_update (nlt->nlt_fib_index, &pfx, fib_src, + entry_flags, pathdup); + else + fib_table_entry_path_add2 (nlt->nlt_fib_index, &pfx, fib_src, + entry_flags, pathdup); + vec_free (pathdup); + + /* install EOS route in addition to NEOS */ + pfx.fp_eos = MPLS_EOS; + pfx.fp_payload_proto = np.paths[0].frp_proto; + } + if (is_replace) fib_table_entry_update (nlt->nlt_fib_index, &pfx, fib_src, entry_flags, np.paths);