From 79a395b3c9f0dae9a23e6fbf10c5f284b1facb85 Mon Sep 17 00:00:00 2001
From: Pim van Pelt <pim@ipng.nl>
Date: Fri, 13 Aug 2021 19:59:55 +0200
Subject: [PATCH] Clamp sub-int MTU to parent's MTU

In VPP, a sub-interface can have a larger MTU than its parent.
In Linux, this cannot happen, so the following is problematic:

```
DBGvpp# create sub TenGigabitEthernet3/0/0 10
DBGvpp# set int mtu packet 1500 TenGigabitEthernet3/0/0
DBGvpp# set int mtu packet 9000 TenGigabitEthernet3/0/0.10
694: e0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UNKNOWN mode DEFAULT group default qlen 1000
695: e0.10@e0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
```

The best way to ensure this works is to clamp the sub-int to a maximum MTU
of that of its parent, and override the user request to change the VPP
sub-int to anything higher than that, perhaps logging an error explaining
why. This means two things:

1.  Any change in VPP of a child MTU to larger than its parent, should be
    reverted.
1   Any change in VPP of a parent MTU should ensure all children are clamped
    to at most that.
---
 lcpng_if_sync.c | 45 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/lcpng_if_sync.c b/lcpng_if_sync.c
index 23fd73b..8f91b11 100644
--- a/lcpng_if_sync.c
+++ b/lcpng_if_sync.c
@@ -34,33 +34,53 @@
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 
-/* walk function to copy forward all sw interface link state flags into
- * their counterpart LIP link state.
+/* walk function to copy forward all sw interface link state flags
+ * MTU, and IP addresses into their counterpart LIP interface.
+ *
+ * This is called upon MTU changes and state changes.
  */
 static walk_rc_t
 lcp_itf_pair_walk_sync_state_cb (index_t lipi, void *ctx)
 {
   lcp_itf_pair_t *lip;
-  vnet_sw_interface_t *phy;
+  vnet_sw_interface_t *sw;
+  vnet_sw_interface_t *sup_sw;
 
   lip = lcp_itf_pair_get (lipi);
   if (!lip)
     return WALK_CONTINUE;
 
-  phy =
+  sw =
     vnet_get_sw_interface_or_null (vnet_get_main (), lip->lip_phy_sw_if_index);
-  if (!phy)
+  if (!sw)
     return WALK_CONTINUE;
+  sup_sw =
+    vnet_get_sw_interface_or_null (vnet_get_main (), sw->sup_sw_if_index);
 
-  LCP_ITF_PAIR_DBG ("walk_sync_state: lip %U flags %u", format_lcp_itf_pair,
-		    lip, phy->flags);
-  lcp_itf_set_link_state (lip, (phy->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP));
+  LCP_ITF_PAIR_DBG ("walk_sync_state: lip %U flags %u mtu %u sup-mtu %u",
+		    format_lcp_itf_pair, lip, sw->flags, sw->mtu[VNET_MTU_L3],
+		    sup_sw->mtu[VNET_MTU_L3]);
+  lcp_itf_set_link_state (lip, (sw->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP));
+
+  /* Linux will clamp MTU of children when the parent is lower. VPP is fine
+   * with differing MTUs. Reconcile any differences
+   */
+  if (sup_sw->mtu[VNET_MTU_L3] < sw->mtu[VNET_MTU_L3])
+    {
+      LCP_ITF_PAIR_ERR ("walk_sync_state: lip %U flags %u mtu %u sup-mtu %u: "
+			"clamping to sup-mtu to satisfy netlink",
+			format_lcp_itf_pair, lip, sw->flags,
+			sw->mtu[VNET_MTU_L3], sup_sw->mtu[VNET_MTU_L3]);
+      vnet_sw_interface_set_mtu (vnet_get_main (), sw->sw_if_index,
+				 sup_sw->mtu[VNET_MTU_L3]);
+      vnet_netlink_set_link_mtu (lip->lip_vif_index, sup_sw->mtu[VNET_MTU_L3]);
+    }
 
   /* Linux will remove IPv6 addresses on children when the master state
    * goes down, so we ensure all IPv4/IPv6 addresses are set when the phy
    * comes back up.
    */
-  if (phy->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+  if (sw->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
     {
       lcp_itf_set_interface_addr (lip);
     }
@@ -129,6 +149,13 @@ lcp_itf_mtu_change (vnet_main_t *vnm, u32 sw_if_index, u32 flags)
 		     si->mtu[VNET_MTU_L3]);
   vnet_netlink_set_link_mtu (lip->lip_vif_index, si->mtu[VNET_MTU_L3]);
 
+  // When Linux changes MTU on a master interface, all of its children that
+  // have a higher MTU are clamped to this value. This is not true in VPP,
+  // so we are forced to undo that change by walking the sub-interfaces of
+  // a phy and syncing their state back into linux.
+  // For simplicity, just walk all interfaces.
+  lcp_itf_pair_walk (lcp_itf_pair_walk_sync_state_cb, 0);
+
   return NULL;
 }