Compare commits

84 Commits: 4aa5745d06 ... main

| SHA1 | Author | Date | |
|---|---|---|---|
| 867e811d37 | |||
| 76d61f91bc | |||
| 021b522abe | |||
| a09a9da87c | |||
| 078720cfac | |||
| c3a7249bcc | |||
| 04ef0c83be | |||
| 9aa742a377 | |||
| 318c62c289 | |||
| 561c0fa320 | |||
| 01f46097f0 | |||
| aa904f7272 | |||
| 089997ef84 | |||
| ff7281c027 | |||
| e76b55d96c | |||
| fce5164172 | |||
| f75d80d15c | |||
| c645550081 | |||
| 4d8f3a42e8 | |||
| 8ace5cf9ff | |||
| 532ec4fcd7 | |||
| 770ac9ce71 | |||
| 95146624c2 | |||
| af3be584b6 | |||
| 021fdfec4d | |||
| 81324c403c | |||
| d47d5a2bed | |||
| db5dc37d84 | |||
| bf7de1181c | |||
| 3244f49998 | |||
| b42c043205 | |||
| 0e6d142e4c | |||
| dbda2b2665 | |||
| 7bbd0d28d0 | |||
| 6ce64bf139 | |||
| c7856b3446 | |||
| 1bd4be47a3 | |||
| e99f826243 | |||
| 603f787fd3 | |||
| 1b95e25331 | |||
| 512cfd75dc | |||
| 8683d570a1 | |||
| a1a98ad3c6 | |||
| 26ae98d977 | |||
| 619a1dfdf2 | |||
| a9e978effb | |||
| 825335cef9 | |||
| a97115593c | |||
| 3dd0d8a656 | |||
| f137326339 | |||
| 51098ed43c | |||
| 6b337e1167 | |||
| bbf36f5a4e | |||
| b324d71b3f | |||
| 2681861e4b | |||
| 4f0188abeb | |||
| f4ed332b18 | |||
| d9066aa241 | |||
| c68799703b | |||
| c32d1779f8 | |||
| eda80e7e66 | |||
| d13da5608d | |||
| d47261a3b7 | |||
| 383a598fc7 | |||
| 8afa2ff944 | |||
| fe1207ee78 | |||
| 6a59b7d7e6 | |||
| bc2a9bb352 | |||
| 5d02b6466c | |||
| b6b419471d | |||
| 85b41ba4e0 | |||
| ebbb0f8e24 | |||
| 218ee84d5f | |||
| c476fa56fb | |||
| a76abc331f | |||
| 44deb34685 | |||
| ca46bcf6d5 | |||
| 5042f822ef | |||
| fdb77838b8 | |||
| 6d3f4ac206 | |||
| baa3e78045 | |||
| 0972cf4aa1 | |||
| 4f81d377a0 | |||
| 153048eda4 |
-34
@@ -1,34 +0,0 @@
-kind: pipeline
-name: default
-
-steps:
-- name: git-lfs
-  image: alpine/git
-  commands:
-  - git lfs install
-  - git lfs pull
-- name: build
-  image: git.ipng.ch/ipng/drone-hugo:release-0.145.1
-  settings:
-    hugo_version: 0.145.0
-    extended: true
-- name: rsync
-  image: drillster/drone-rsync
-  settings:
-    user: drone
-    key:
-      from_secret: drone_sshkey
-    hosts:
-      - nginx0.chrma0.net.ipng.ch
-      - nginx0.chplo0.net.ipng.ch
-      - nginx0.nlams1.net.ipng.ch
-      - nginx0.nlams2.net.ipng.ch
-    port: 22
-    args: '-6u --delete-after'
-    source: public/
-    target: /var/www/ipng.ch/
-    recursive: true
-  secrets: [ drone_sshkey ]
-
-image_pull_secrets:
-- git_ipng_ch_docker
@@ -0,0 +1,44 @@
+name: Build and Deploy
+
+on:
+  push:
+    branches: [main]
+
+jobs:
+  deploy:
+    runs-on: debian-slim
+    container:
+      image: hugomods/hugo:debian-dart-sass-node-git-0.161.1
+    steps:
+      - name: Install packages
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: apt-get update -qq && apt-get install -qq -y git-lfs rsync > /dev/null
+
+      - name: Checkout with LFS
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Build Hugo site
+        run: hugo --minify
+
+      - name: Deploy via rsync
+        env:
+          SSH_KEY: ${{ secrets.drone_sshkey }}
+        run: |
+          mkdir -p ~/.ssh
+          echo "$SSH_KEY" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          for host in \
+            nginx0.chrma0.net.ipng.ch \
+            nginx0.chplo0.net.ipng.ch \
+            nginx0.chlzn0.net.ipng.ch \
+            nginx0.frggh0.net.ipng.ch \
+            nginx0.nlams0.net.ipng.ch \
+            nginx0.nlams1.net.ipng.ch \
+            nginx0.nlams2.net.ipng.ch; do
+            cmd="rsync -6a --delete-after -e \"ssh -i ~/.ssh/id_ed25519 -o StrictHostKeyChecking=no -o LogLevel=ERROR\" public/ drone@${host}:/nginx/sites/ipng.ch/"
+            echo "${cmd}"
+            eval "${cmd}"
+          done
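The deploy step can be rehearsed by hand before trusting the new action with the webservers. Below is a minimal sketch, not part of the diff: it assumes a locally built `public/` tree and a local copy of the deploy key, and it adds rsync's `--dry-run` and `--itemize-changes` so nothing is actually transferred or deleted:

```
#!/usr/bin/env bash
# Rehearse the workflow's rsync invocation against a single webserver.
# Assumptions: ./public/ exists (output of `hugo --minify`) and SSH_KEY_FILE
# points at a key that is authorized for the drone user on the target.
SSH_KEY_FILE="$HOME/.ssh/id_ed25519"
HOST="nginx0.chrma0.net.ipng.ch"   # any one of the hosts listed in the workflow

rsync -6a --dry-run --itemize-changes --delete-after \
  -e "ssh -i ${SSH_KEY_FILE} -o StrictHostKeyChecking=no -o LogLevel=ERROR" \
  public/ "drone@${HOST}:/nginx/sites/ipng.ch/"
```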
+3 -2
@@ -17,13 +17,14 @@ to be connected to the industry both physically, in terms of software defined ne
 and software companies, and socially, to the Swiss and European networking community.

 IPng Networks GmbH provides networking consultancy, hosting, colocation, internet connectivity
-options primarily tailored for the Zurich metropolitan area.
+options primarily tailored for the Zurich metropolitan area. We are experts in self-hosting, and
+on principle only use fully open sourced components to build and run our business.

 Rather than dazzle you with pictures of clouds, grandiose projections of our "global IP backbone",
 and other claims that small businesses make to appear larger than they are, we're happy to show what
 we know, what we own, and how we can help you accomplish your goals if you want to work with us.

-### Keywords: SDN, WDM, IP, Network Design and Consultancy, Hosting, and Colocation.
+### Keywords: VPP/FD.io, Network Design and Consultancy, (Self-)Hosting, and Colocation.

 We are proud of our network and the services we operate, because they allow us to provide
 predictable and reliable performance. We maintain and grow the network judiciously and with the
+3 -3
@@ -47,8 +47,8 @@ started his career as a network engineer in the Netherlands, where he worked
 for Intouch, Freeler, and BIT. He helped raise awareness for IPv6, for example
 by launching it at AMS-IX back in 2001. He also operated
 [[SixXS](https://www.sixxs.net/)], a global IPv6 tunnel broker, from 2001 through
-to its sunset in 2017. Since 2006, Pim works as a Distinguished SRE at Google
+to its sunset in 2017. Since 2006, Pim works as a Distinguished Software Engineer at Google
 in Zurich, Switzerland. In his free time, he goes [[Geocaching](https://geocaching.com)],
-contributes to [[open source](https://github.com/pimvanpelt)] projects, and flies
-model helicopters.
+contributes to [[open source](https://git.ipng.ch/ipng/)] projects, and occasionally
+flies model helicopters.

@@ -8,7 +8,7 @@ Historical context - todo, but notes for now

 1. started with stack.nl (when it was still stack.urc.tue.nl), 6bone and watching NASA multicast video in 1997.
 2. founded ipng.nl project, first IPv6 in NL that was usable outside of NREN.
-3. attacted attention of the first few IPv6 partitipants in Amsterdam, organized the AIAD - AMS-IX IPv6 Awareness Day
+3. attracted attention of the first few IPv6 participants in Amsterdam, organized the AIAD - AMS-IX IPv6 Awareness Day
 4. launched IPv6 at AMS-IX, first IXP prefix allocated 2001:768:1::/48
 > My Brilliant Idea Of The Day -- encode AS number in leetspeak: `::AS01:2859:1`, because who would've thought we would ever run out of 16 bit AS numbers :)
 5. IPng rearchitected to SixXS, and became a very large scale deployment of IPv6 tunnelbroker; our main central provisioning system moved around a few times between ISPs (Intouch, Concepts ICT, BIT, IP Man)
@@ -185,7 +185,7 @@ function is_coloclue_beacon()
 }
 ```

-Then, I ran the configuration again with one IPv4 beacon set on dcg-1, and still all the bird configs on both IPv4 and IPv6 for all routers parsed correctly, and the generated function on the dcg-1 IPv4 filters file was popupated:
+Then, I ran the configuration again with one IPv4 beacon set on dcg-1, and still all the bird configs on both IPv4 and IPv6 for all routers parsed correctly, and the generated function on the dcg-1 IPv4 filters file was populated:
 ```
 function is_coloclue_beacon()
 {
@@ -89,7 +89,7 @@ lcp lcp-sync off
 ```

 The prep work for the rest of the interface syncer starts with this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/2d00de080bd26d80ce69441b1043de37e0326e0a)], and
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/2d00de080bd26d80ce69441b1043de37e0326e0a)], and
 for the rest of this blog post, the behavior will be in the 'on' position.

 ### Change interface: state
@@ -120,7 +120,7 @@ the state it was. I did notice that you can't bring up a sub-interface if its pa
 is down, which I found counterintuitive, but that's neither here nor there.

 All of this is to say that we have to be careful when copying state forward, because as
-this [[commit](https://github.com/pimvanpelt/lcpng/commit/7c15c84f6c4739860a85c599779c199cb9efef03)]
+this [[commit](https://git.ipng.ch/ipng/lcpng/commit/7c15c84f6c4739860a85c599779c199cb9efef03)]
 shows, issuing `set int state ... up` on an interface, won't touch its sub-interfaces in VPP, but
 the subsequent netlink message to bring the _LIP_ for that interface up, **will** update the
 children, thus desynchronising Linux and VPP: Linux will have interface **and all its
@@ -128,7 +128,7 @@ sub-interfaces** up unconditionally; VPP will have the interface up and its sub-
 whatever state they were before.

 To address this, a second
-[[commit](https://github.com/pimvanpelt/lcpng/commit/a3dc56c01461bdffcac8193ead654ae79225220f)] was
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/a3dc56c01461bdffcac8193ead654ae79225220f)] was
 needed. I'm not too sure I want to keep this behavior, but for now, it results in an intuitive
 end-state, which is that all interfaces states are exactly the same between Linux and VPP.

@@ -157,7 +157,7 @@ DBGvpp# set int state TenGigabitEthernet3/0/0 up
 ### Change interface: MTU

 Finally, a straight forward
-[[commit](https://github.com/pimvanpelt/lcpng/commit/39bfa1615fd1cafe5df6d8fc9d34528e8d3906e2)], or
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/39bfa1615fd1cafe5df6d8fc9d34528e8d3906e2)], or
 so I thought. When the MTU changes in VPP (with `set interface mtu packet N <int>`), there is
 callback that can be registered which copies this into the _LIP_. I did notice a specific corner
 case: In VPP, a sub-interface can have a larger MTU than its parent. In Linux, this cannot happen,
@@ -179,7 +179,7 @@ higher than that, perhaps logging an error explaining why. This means two things
 1. Any change in VPP of a parent MTU should ensure all children are clamped to at most that.

 I addressed the issue in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/79a395b3c9f0dae9a23e6fbf10c5f284b1facb85)].
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/79a395b3c9f0dae9a23e6fbf10c5f284b1facb85)].

 ### Change interface: IP Addresses

@@ -199,7 +199,7 @@ VPP into the companion Linux devices:
 _LIP_ with `lcp_itf_set_interface_addr()`.

 This means with this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/f7e1bb951d648a63dfa27d04ded0b6261b9e39fe)], at
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/f7e1bb951d648a63dfa27d04ded0b6261b9e39fe)], at
 any time a new _LIP_ is created, the IPv4 and IPv6 address on the VPP interface are fully copied
 over by the third change, while at runtime, new addresses can be set/removed as well by the first
 and second change.
@@ -100,7 +100,7 @@ linux-cp {

 Based on this config, I set the startup default in `lcp_set_lcp_auto_subint()`, but I realize that
 an administrator may want to turn it on/off at runtime, too, so I add a CLI getter/setter that
-interacts with the flag in this [[commit](https://github.com/pimvanpelt/lcpng/commit/d23aab2d95aabcf24efb9f7aecaf15b513633ab7)]:
+interacts with the flag in this [[commit](https://git.ipng.ch/ipng/lcpng/commit/d23aab2d95aabcf24efb9f7aecaf15b513633ab7)]:

 ```
 DBGvpp# show lcp
@@ -116,11 +116,11 @@ lcp lcp-sync off
 ```

 The prep work for the rest of the interface syncer starts with this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/2d00de080bd26d80ce69441b1043de37e0326e0a)], and
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/2d00de080bd26d80ce69441b1043de37e0326e0a)], and
 for the rest of this blog post, the behavior will be in the 'on' position.

 The code for the configuration toggle is in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/934446dcd97f51c82ddf133ad45b61b3aae14b2d)].
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/934446dcd97f51c82ddf133ad45b61b3aae14b2d)].

 ### Auto create/delete sub-interfaces

@@ -145,7 +145,7 @@ I noticed that interface deletion had a bug (one that I fell victim to as well:
 remove the netlink device in the correct network namespace), which I fixed.

 The code for the auto create/delete and the bugfix is in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/934446dcd97f51c82ddf133ad45b61b3aae14b2d)].
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/934446dcd97f51c82ddf133ad45b61b3aae14b2d)].

 ### Further Work

@@ -154,7 +154,7 @@ For now, `lcp_nl_dispatch()` just throws the message away after logging it with
 a function that will come in very useful as I start to explore all the different Netlink message types.

 The code that forms the basis of our Netlink Listener lives in [[this
-commit](https://github.com/pimvanpelt/lcpng/commit/c4e3043ea143d703915239b2390c55f7b6a9b0b1)] and
+commit](https://git.ipng.ch/ipng/lcpng/commit/c4e3043ea143d703915239b2390c55f7b6a9b0b1)] and
 specifically, here I want to call out I was not the primary author, I worked off of Matt and Neale's
 awesome work in this pending [Gerrit](https://gerrit.fd.io/r/c/vpp/+/31122).

@@ -182,7 +182,7 @@ Linux interface VPP is not aware of. But, if I can find the _LIP_, I can convert
 add or remove the ip4/ip6 neighbor adjacency.

 The code for this first Netlink message handler lives in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/30bab1d3f9ab06670fbef2c7c6a658e7b77f7738)]. An
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/30bab1d3f9ab06670fbef2c7c6a658e7b77f7738)]. An
 ironic insight is that after writing the code, I don't think any of it will be necessary, because
 the interface plugin will already copy ARP and IPv6 ND packets back and forth and itself update its
 neighbor adjacency tables; but I'm leaving the code in for now.
@@ -197,7 +197,7 @@ it or remove it, and if there are no link-local addresses left, disable IPv6 on
 There's also a few multicast routes to add (notably 224.0.0.0/24 and ff00::/8, all-local-subnet).

 The code for IP address handling is in this
-[[commit]](https://github.com/pimvanpelt/lcpng/commit/87742b4f541d389e745f0297d134e34f17b5b485), but
+[[commit]](https://git.ipng.ch/ipng/lcpng/commit/87742b4f541d389e745f0297d134e34f17b5b485), but
 when I took it out for a spin, I noticed something curious, looking at the log lines that are
 generated for the following sequence:

@@ -236,7 +236,7 @@ interface and directly connected route addition/deletion is slightly different i
 So, I decide to take a little shortcut -- if an addition returns "already there", or a deletion returns
 "no such entry", I'll just consider it a successful addition and deletion respectively, saving my eyes
 from being screamed at by this red error message. I changed that in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/d63fbd8a9a612d038aa385e79a57198785d409ca)],
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/d63fbd8a9a612d038aa385e79a57198785d409ca)],
 turning this situation in a friendly green notice instead.

 ### Netlink: Link (existing)
@@ -267,7 +267,7 @@ To avoid this loop, I temporarily turn off `lcp-sync` just before handling a bat
 turn it back to its original state when I'm done with that.

 The code for all/del of existing links is in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/e604dd34784e029b41a47baa3179296d15b0632e)].
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/e604dd34784e029b41a47baa3179296d15b0632e)].

 ### Netlink: Link (new)

@@ -276,7 +276,7 @@ doesn't have a _LIP_ for, but specifically describes a VLAN interface? Well, th
 is trying to create a new sub-interface. And supporting that operation would be super cool, so let's go!

 Using the earlier placeholder hint in `lcp_nl_link_add()` (see the previous
-[[commit](https://github.com/pimvanpelt/lcpng/commit/e604dd34784e029b41a47baa3179296d15b0632e)]),
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/e604dd34784e029b41a47baa3179296d15b0632e)]),
 I know that I've gotten a NEWLINK request but the Linux ifindex doesn't have a _LIP_. This could be
 because the interface is entirely foreign to VPP, for example somebody created a dummy interface or
 a VLAN sub-interface on one:
@@ -331,7 +331,7 @@ a boring `<phy>.<subid>` name.

 Alright, without further ado, the code for the main innovation here, the implementation of
 `lcp_nl_link_add_vlan()`, is in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/45f408865688eb7ea0cdbf23aa6f8a973be49d1a)].
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/45f408865688eb7ea0cdbf23aa6f8a973be49d1a)].

 ## Results

@@ -118,7 +118,7 @@ or Virtual Routing/Forwarding domains). So first, I need to add these:

 All of this code was heavily inspired by the pending [[Gerrit](https://gerrit.fd.io/r/c/vpp/+/31122)]
 but a few finishing touches were added, and wrapped up in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/7a76498277edc43beaa680e91e3a0c1787319106)].
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/7a76498277edc43beaa680e91e3a0c1787319106)].

 ### Deletion

@@ -459,7 +459,7 @@ it as 'unreachable' rather than deleting it. These are *additions* which have a
 but with an interface index of 1 (which, in Netlink, is 'lo'). This makes VPP intermittently crash, so I
 currently commented this out, while I gain better understanding. Result: blackhole/unreachable/prohibit
 specials can not be set using the plugin. Beware!
-(disabled in this [[commit](https://github.com/pimvanpelt/lcpng/commit/7c864ed099821f62c5be8cbe9ed3f4dd34000a42)]).
+(disabled in this [[commit](https://git.ipng.ch/ipng/lcpng/commit/7c864ed099821f62c5be8cbe9ed3f4dd34000a42)]).

 ## Credits

@@ -88,7 +88,7 @@ stat['/if/rx-miss'][:, 1].sum() - returns the sum of packet counters for
 ```

 Alright, so let's grab that file and refactor it into a small library for me to use, I do
-this in [[this commit](https://github.com/pimvanpelt/vpp-snmp-agent/commit/51eee915bf0f6267911da596b41a4475feaf212e)].
+this in [[this commit](https://git.ipng.ch/ipng/vpp-snmp-agent/commit/51eee915bf0f6267911da596b41a4475feaf212e)].

 ### VPP's API

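As a cross-check on what that small Python library returns, the same stats segment can be poked at from the shell. This is a hedged sketch, assuming the `vpp_get_stats` example client (built from `src/vpp/app/vpp_get_stats.c`, which comes up again further down this compare) and the default stats socket; the exact option spelling may differ between VPP releases:

```
# List the per-interface counter paths, then dump the rx-miss counters that the
# Python snippet above sums up. /run/vpp/stats.sock is the default stats socket.
vpp_get_stats socket-name /run/vpp/stats.sock ls /if/
vpp_get_stats socket-name /run/vpp/stats.sock dump /if/rx-miss
```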
@@ -159,7 +159,7 @@ idx=19 name=tap4 mac=02:fe:17:06:fc:af mtu=9000 flags=3

 So I added a little abstration with some error handling and one main function
 to return interfaces as a Python dictionary of those `sw_interface_details`
-tuples in [[this commit](https://github.com/pimvanpelt/vpp-snmp-agent/commit/51eee915bf0f6267911da596b41a4475feaf212e)].
+tuples in [[this commit](https://git.ipng.ch/ipng/vpp-snmp-agent/commit/51eee915bf0f6267911da596b41a4475feaf212e)].

 ### AgentX

@@ -207,9 +207,9 @@ once asked with `GetPDU` or `GetNextPDU` requests, by issuing a corresponding `R
 to the SNMP server -- it takes care of all the rest!

 The resulting code is in [[this
-commit](https://github.com/pimvanpelt/vpp-snmp-agent/commit/8c9c1e2b4aa1d40a981f17581f92bba133dd2c29)]
+commit](https://git.ipng.ch/ipng/vpp-snmp-agent/commit/8c9c1e2b4aa1d40a981f17581f92bba133dd2c29)]
 but you can also check out the whole thing on
-[[Github](https://github.com/pimvanpelt/vpp-snmp-agent)].
+[[Github](https://git.ipng.ch/ipng/vpp-snmp-agent)].

 ### Building

@@ -480,7 +480,7 @@ is to say, those packets which were destined to any IP address configured on the
 plane. Any traffic going _through_ VPP will never be seen by Linux! So, I'll have to be
 clever and count this traffic by polling VPP instead. This was the topic of my previous
 [VPP Part 6]({{< ref "2021-09-10-vpp-6" >}}) about the SNMP Agent. All of that code
-was released to [Github](https://github.com/pimvanpelt/vpp-snmp-agent), notably there's
+was released to [Github](https://git.ipng.ch/ipng/vpp-snmp-agent), notably there's
 a hint there for an `snmpd-dataplane.service` and a `vpp-snmp-agent.service`, including
 the compiled binary that reads from VPP and feeds this to SNMP.

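Once `snmpd-dataplane` and `vpp-snmp-agent` are running, the dataplane counters appear as a normal IF-MIB table and can be spot-checked with the net-snmp tools. An illustrative example, with the hostname and community string as placeholders rather than values from this repository:

```
# 64-bit interface octet counters as exposed via the AgentX connection.
snmpwalk -v2c -c COMMUNITY router.example.net IF-MIB::ifHCInOctets
snmpwalk -v2c -c COMMUNITY router.example.net IF-MIB::ifHCOutOctets
```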
@@ -30,9 +30,9 @@ virtual machine running in Qemu/KVM into a working setup with both [Free Range R
 and [Bird](https://bird.network.cz/) installed side by side.

 **NOTE**: If you're just interested in the resulting image, here's the most pertinent information:
-> * ***vpp-proto.qcow2.lrz [[Download](https://ipng.ch/media/vpp-proto/vpp-proto-bookworm-20231015.qcow2.lrz)]***
-> * ***SHA256*** `bff03a80ccd1c0094d867d1eb1b669720a1838330c0a5a526439ecb1a2457309`
-> * ***Debian Bookworm (12.4)*** and ***VPP 24.02-rc0~46-ga16463610e***
+> * ***vpp-proto.qcow2.lrz*** [[Download](https://ipng.ch/media/vpp-proto/vpp-proto-bookworm-20250607.qcow2.lrz)]
+> * ***SHA256*** `a5fdf157c03f2d202dcccdf6ed97db49c8aa5fdb6b9ca83a1da958a8a24780ab`
+> * ***Debian Bookworm (12.11)*** and ***VPP 25.10-rc0~49-g90d92196***
 > * ***CPU*** Make sure the (virtualized) CPU supports AVX
 > * ***RAM*** The image needs at least 4GB of RAM, and the hypervisor should support hugepages and AVX
 > * ***Username***: `ipng` with ***password***: `ipng loves vpp` and is sudo-enabled
@@ -62,7 +62,7 @@ plugins:
 or route, or the system receiving ARP or IPv6 neighbor request/reply from neighbors), and applying
 these events to the VPP dataplane.

-I've published the code on [Github](https://github.com/pimvanpelt/lcpng/) and I am targeting a release
+I've published the code on [Github](https://git.ipng.ch/ipng/lcpng/) and I am targeting a release
 in upstream VPP, hoping to make the upcoming 22.02 release in February 2022. I have a lot of ground to
 cover, but I will note that the plugin has been running in production in [AS8298]({{< ref "2021-02-27-network" >}})
 since Sep'21 and no crashes related to LinuxCP have been observed.
@@ -195,7 +195,7 @@ So grab a cup of tea, while we let Rhino stretch its legs, ehh, CPUs ...
 pim@rhino:~$ mkdir -p ~/src
 pim@rhino:~$ cd ~/src
 pim@rhino:~/src$ sudo apt install libmnl-dev
-pim@rhino:~/src$ git clone https://github.com/pimvanpelt/lcpng.git
+pim@rhino:~/src$ git clone https://git.ipng.ch/ipng/lcpng.git
 pim@rhino:~/src$ git clone https://gerrit.fd.io/r/vpp
 pim@rhino:~/src$ ln -s ~/src/lcpng ~/src/vpp/src/plugins/lcpng
 pim@rhino:~/src$ cd ~/src/vpp
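For context, after symlinking `lcpng` into the plugin tree the build usually proceeds with VPP's own make targets. This continuation is a sketch and not part of the diff; the target names come from the upstream VPP `Makefile`:

```
pim@rhino:~/src/vpp$ make install-ext-deps   # build/install the vpp-ext-deps package
pim@rhino:~/src/vpp$ make build-release      # compile VPP including the symlinked lcpng plugin
pim@rhino:~/src/vpp$ make pkg-deb            # produce the .deb packages to copy to the router
```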
@@ -33,7 +33,7 @@ In this first post, let's take a look at tablestakes: writing a YAML specificati
 configuration elements of VPP, and then ensures that the YAML file is both syntactically as well as
 semantically correct.

-**Note**: Code is on [my Github](https://github.com/pimvanpelt/vppcfg), but it's not quite ready for
+**Note**: Code is on [my Github](https://git.ipng.ch/ipng/vppcfg), but it's not quite ready for
 prime-time yet. Take a look, and engage with us on GitHub (pull requests preferred over issues themselves)
 or reach out by [contacting us](/s/contact/).

@@ -348,7 +348,7 @@ to mess up my (or your!) VPP router by feeding it garbage, so the lions' share o
 has been to assert the YAML file is both syntactically and semantically valid.


-In the mean time, you can take a look at my code on [GitHub](https://github.com/pimvanpelt/vppcfg), but to
+In the mean time, you can take a look at my code on [GitHub](https://git.ipng.ch/ipng/vppcfg), but to
 whet your appetite, here's a hefty configuration that demonstrates all implemented types:

 ```
@@ -32,7 +32,7 @@ the configuration to the dataplane. Welcome to `vppcfg`!
 In this second post of the series, I want to talk a little bit about how planning a path from a running
 configuration to a desired new configuration might look like.

-**Note**: Code is on [my Github](https://github.com/pimvanpelt/vppcfg), but it's not quite ready for
+**Note**: Code is on [my Github](https://git.ipng.ch/ipng/vppcfg), but it's not quite ready for
 prime-time yet. Take a look, and engage with us on GitHub (pull requests preferred over issues themselves)
 or reach out by [contacting us](/s/contact/).

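To make the planning idea concrete: `vppcfg` first validates a target YAML file and can then emit the CLI calls needed to move a running dataplane towards it. A hedged sketch of a typical invocation, with subcommand names as used in the vppcfg documentation and file names as placeholders:

```
# Validate the candidate configuration (syntax and semantics, no dataplane access needed).
vppcfg check -c router.yaml

# Plan a path from the running dataplane to the candidate configuration and
# write the resulting vppctl commands to a file for review before applying them.
vppcfg plan -c router.yaml -o router.exec
```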
@@ -171,12 +171,12 @@ GigabitEthernet1/0/0 1 up GigabitEthernet1/0/0

 After this exploratory exercise, I have learned enough about the hardware to be able to take the
 Fitlet2 out for a spin. To configure the VPP instance, I turn to
-[[vppcfg](https://github.com/pimvanpelt/vppcfg)], which can take a YAML configuration file
+[[vppcfg](https://git.ipng.ch/ipng/vppcfg)], which can take a YAML configuration file
 describing the desired VPP configuration, and apply it safely to the running dataplane using the VPP
 API. I've written a few more posts on how it does that, notably on its [[syntax]({{< ref "2022-03-27-vppcfg-1" >}})]
 and its [[planner]({{< ref "2022-04-02-vppcfg-2" >}})]. A complete
 configuration guide on vppcfg can be found
-[[here](https://github.com/pimvanpelt/vppcfg/blob/main/docs/config-guide.md)].
+[[here](https://git.ipng.ch/ipng/vppcfg/blob/main/docs/config-guide.md)].

 ```
 pim@fitlet:~$ sudo dpkg -i {lib,}vpp*23.06*deb
@@ -185,7 +185,7 @@ forgetful chipmunk-sized brain!), so here, I'll only recap what's already writte

 **1. BUILD:** For the first step, the build is straight forward, and yields a VPP instance based on
 `vpp-ext-deps_23.06-1` at version `23.06-rc0~71-g182d2b466`, which contains my
-[[LCPng](https://github.com/pimvanpelt/lcpng.git)] plugin. I then copy the packages to the router.
+[[LCPng](https://git.ipng.ch/ipng/lcpng.git)] plugin. I then copy the packages to the router.
 The router has an E-2286G CPU @ 4.00GHz with 6 cores and 6 hyperthreads. There's a really handy tool
 called `likwid-topology` that can show how the L1, L2 and L3 cache lines up with respect to CPU
 cores. Here I learn that CPU (0+6) and (1+7) share L1 and L2 cache -- so I can conclude that 0-5 are
@@ -351,7 +351,7 @@ in `vppcfg`:
 * When I create the initial `--novpp` config, there's a bug in `vppcfg` where I incorrectly
 reference a dataplane object which I haven't initialized (because with `--novpp` the tool
 will not contact the dataplane at all. That one was easy to fix, which I did in [[this
-commit](https://github.com/pimvanpelt/vppcfg/commit/0a0413927a0be6ed3a292a8c336deab8b86f5eee)]).
+commit](https://git.ipng.ch/ipng/vppcfg/commit/0a0413927a0be6ed3a292a8c336deab8b86f5eee)]).

 After that small detour, I can now proceed to configure the dataplane by offering the resulting
 VPP commands, like so:
@@ -573,7 +573,7 @@ see is that which is destined to the controlplane (eg, to one of the IPv4 or IPv
 multicast/broadcast groups that they are participating in), so things like tcpdump or SNMP won't
 really work.

-However, due to my [[vpp-snmp-agent](https://github.com/pimvanpelt/vpp-snmp-agent.git)], which is
+However, due to my [[vpp-snmp-agent](https://git.ipng.ch/ipng/vpp-snmp-agent.git)], which is
 feeding as an AgentX behind an snmpd that in turn is running in the `dataplane` namespace, SNMP scrapes
 work as they did before, albeit with a few different interface names.

@@ -14,7 +14,7 @@ performance and versatility. For those of us who have used Cisco IOS/XR devices,
 _ASR_ (aggregation service router), VPP will look and feel quite familiar as many of the approaches
 are shared between the two.

-I've been working on the Linux Control Plane [[ref](https://github.com/pimvanpelt/lcpng)], which you
+I've been working on the Linux Control Plane [[ref](https://git.ipng.ch/ipng/lcpng)], which you
 can read all about in my series on VPP back in 2021:

 [{: style="width:300px; float: right; margin-left: 1em;"}](https://video.ipng.ch/w/erc9sAofrSZ22qjPwmv6H4)
@@ -70,7 +70,7 @@ answered by a Response PDU.

 Using parts of a Python Agentx library written by GitHub user hosthvo
 [[ref](https://github.com/hosthvo/pyagentx)], I tried my hands at writing one of these AgentX's.
-The resulting source code is on [[GitHub](https://github.com/pimvanpelt/vpp-snmp-agent)]. That's the
+The resulting source code is on [[GitHub](https://git.ipng.ch/ipng/vpp-snmp-agent)]. That's the
 one that's running in production ever since I started running VPP routers at IPng Networks AS8298.
 After the _AgentX_ exposes the dataplane interfaces and their statistics into _SNMP_, an open source
 monitoring tool such as LibreNMS [[ref](https://librenms.org/)] can discover the routers and draw
@@ -126,7 +126,7 @@ for any interface created in the dataplane.

 I wish I were good at Go, but I never really took to the language. I'm pretty good at Python, but
 sorting through the stats segment isn't super quick as I've already noticed in the Python3 based
-[[VPP SNMP Agent](https://github.com/pimvanpelt/vpp-snmp-agent)]. I'm probably the world's least
+[[VPP SNMP Agent](https://git.ipng.ch/ipng/vpp-snmp-agent)]. I'm probably the world's least
 terrible C programmer, so maybe I can take a look at the VPP Stats Client and make sense of it. Luckily,
 there's an example already in `src/vpp/app/vpp_get_stats.c` and it reveals the following pattern:

@@ -19,7 +19,7 @@ same time keep an IPng Site Local network with IPv4 and IPv6 that is separate fr
 based on hardware/silicon based forwarding at line rate and high availability. You can read all
 about my Centec MPLS shenanigans in [[this article]({{< ref "2023-03-11-mpls-core" >}})].

-Ever since the release of the Linux Control Plane [[ref](https://github.com/pimvanpelt/lcpng)]
+Ever since the release of the Linux Control Plane [[ref](https://git.ipng.ch/ipng/lcpng)]
 plugin in VPP, folks have asked "What about MPLS?" -- I have never really felt the need to go this
 rabbit hole, because I figured that in this day and age, higher level IP protocols that do tunneling
 are just as performant, and a little bit less of an 'art' to get right. For example, the Centec
@@ -459,6 +459,6 @@ and VPP, and the overall implementation before attempting to use in production.
 we got at least some of this right, but testing and runtime experience will tell.

 I will be silently porting the change into my own copy of the Linux Controlplane called lcpng on
-[[GitHub](https://github.com/pimvanpelt/lcpng.git)]. If you'd like to test this - reach out to the VPP
+[[GitHub](https://git.ipng.ch/ipng/lcpng.git)]. If you'd like to test this - reach out to the VPP
 Developer [[mailinglist](mailto:vpp-dev@lists.fd.io)] any time!

@@ -385,5 +385,5 @@ and VPP, and the overall implementation before attempting to use in production.
 we got at least some of this right, but testing and runtime experience will tell.

 I will be silently porting the change into my own copy of the Linux Controlplane called lcpng on
-[[GitHub](https://github.com/pimvanpelt/lcpng.git)]. If you'd like to test this - reach out to the VPP
+[[GitHub](https://git.ipng.ch/ipng/lcpng.git)]. If you'd like to test this - reach out to the VPP
 Developer [[mailinglist](mailto:vpp-dev@lists.fd.io)] any time!
@@ -304,7 +304,7 @@ Gateway, just to show a few of the more advanced features of VPP. For me, this t
 line of thinking: classifiers. This extract/match/act pattern can be used in policers, ACLs and
 arbitrary traffic redirection through VPP's directed graph (eg. selecting a next node for
 processing). I'm going to deep-dive into this classifier behavior in an upcoming article, and see
-how I might add this to [[vppcfg](https://github.com/pimvanpelt/vppcfg.git)], because I think it
+how I might add this to [[vppcfg](https://git.ipng.ch/ipng/vppcfg.git)], because I think it
 would be super powerful to abstract away the rather complex underlying API into something a little
 bit more ... user friendly. Stay tuned! :)

@@ -359,7 +359,7 @@ does not have an IPv4 address. Except -- I'm bending the rules a little bit by d
 There's an internal function `ip4_sw_interface_enable_disable()` which is called to enable IPv4
 processing on an interface once the first IPv4 address is added. So my first fix is to force this to
 be enabled for any interface that is exposed via Linux Control Plane, notably in `lcp_itf_pair_create()`
-[[here](https://github.com/pimvanpelt/lcpng/blob/main/lcpng_interface.c#L777)].
+[[here](https://git.ipng.ch/ipng/lcpng/blob/main/lcpng_interface.c#L777)].

 This approach is partially effective:

@@ -500,7 +500,7 @@ which is unnumbered. Because I don't know for sure if everybody would find this
 I make sure to guard the behavior behind a backwards compatible configuration option.

 If you're curious, please take a look at the change in my [[GitHub
-repo](https://github.com/pimvanpelt/lcpng/commit/a960d64a87849d312b32d9432ffb722672c14878)], in
+repo](https://git.ipng.ch/ipng/lcpng/commit/a960d64a87849d312b32d9432ffb722672c14878)], in
 which I:
 1. add a new configuration option, `lcp-sync-unnumbered`, which defaults to `on`. That would be
 what the plugin would do in the normal case: copy forward these borrowed IP addresses to Linux.
@@ -147,7 +147,7 @@ With all of that, I am ready to demonstrate two working solutions now. I first c
 Ondrej's [[commit](https://gitlab.nic.cz/labs/bird/-/commit/280daed57d061eb1ebc89013637c683fe23465e8)].
 Then, I compile VPP with my pending [[gerrit](https://gerrit.fd.io/r/c/vpp/+/40482)]. Finally,
 to demonstrate how `update_loopback_addr()` might work, I compile `lcpng` with my previous
-[[commit](https://github.com/pimvanpelt/lcpng/commit/a960d64a87849d312b32d9432ffb722672c14878)],
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/a960d64a87849d312b32d9432ffb722672c14878)],
 which allows me to inhibit copying forward addresses from VPP to Linux, when using _unnumbered_
 interfaces.

@@ -250,10 +250,10 @@ remove the IPv4 and IPv6 addresses from the <span style='color:red;font-weight:b
 routers in Brüttisellen. They are directly connected, and if anything goes wrong, I can walk
 over and rescue them. Sounds like a safe way to start!

-I quickly add the ability for [[vppcfg](https://github.com/pimvanpelt/vppcfg)] to configure
+I quickly add the ability for [[vppcfg](https://git.ipng.ch/ipng/vppcfg)] to configure
 _unnumbered_ interfaces. In VPP, these are interfaces that don't have an IPv4 or IPv6 address of
 their own, but they borrow one from another interface. If you're curious, you can take a look at the
-[[User Guide](https://github.com/pimvanpelt/vppcfg/blob/main/docs/config-guide.md#interfaces)] on
+[[User Guide](https://git.ipng.ch/ipng/vppcfg/blob/main/docs/config-guide.md#interfaces)] on
 GitHub.

 Looking at their `vppcfg` files, the change is actually very easy, taking as an example the
@@ -291,7 +291,7 @@ interface.

 In the article, you'll see that discussed as _Solution 2_, and it includes a bit of rationale why I
 find this better. I implemented it in this
-[[commit](https://github.com/pimvanpelt/lcpng/commit/a960d64a87849d312b32d9432ffb722672c14878)], in
+[[commit](https://git.ipng.ch/ipng/lcpng/commit/a960d64a87849d312b32d9432ffb722672c14878)], in
 case you're curious, and the commandline keyword is `lcp lcp-sync-unnumbered off` (the default is
 _on_).

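For readers who have not used them: an unnumbered interface in VPP borrows its addresses from another interface, and the `lcp-sync-unnumbered` knob above decides whether that borrowed address is mirrored into Linux. An illustrative `vppctl` session, with interface names as placeholders rather than anything taken from this diff:

```
# Assumes loop0 already exists (create loopback interface); give it an address,
# then let an uplink borrow that address instead of having one of its own.
vppctl set interface ip address loop0 192.0.2.1/32
vppctl set interface unnumbered GigabitEthernet3/0/0 use loop0

# Inspect the result: the uplink shows up as unnumbered, using loop0's address.
vppctl show interface addr
```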
@@ -0,0 +1,238 @@
|
|||||||
|
---
|
||||||
|
date: "2024-09-03T13:07:54Z"
|
||||||
|
title: Loadtest notes, ASR9001
|
||||||
|
draft: true
|
||||||
|
---
|
||||||
|
|
||||||
|
### L2 point-to-point (L2XC) config
|
||||||
|
|
||||||
|
```
|
||||||
|
interface TenGigE0/0/0/0
|
||||||
|
mtu 9216
|
||||||
|
load-interval 30
|
||||||
|
l2transport
|
||||||
|
!
|
||||||
|
!
|
||||||
|
interface TenGigE0/0/0/1
|
||||||
|
mtu 9216
|
||||||
|
load-interval 30
|
||||||
|
l2transport
|
||||||
|
!
|
||||||
|
!
|
||||||
|
interface TenGigE0/0/0/2
|
||||||
|
mtu 9216
|
||||||
|
load-interval 30
|
||||||
|
l2transport
|
||||||
|
!
|
||||||
|
!
|
||||||
|
interface TenGigE0/0/0/3
|
||||||
|
mtu 9216
|
||||||
|
load-interval 30
|
||||||
|
l2transport
|
||||||
|
!
|
||||||
|
!
|
||||||
|
|
||||||
|
|
||||||
|
...
|
||||||
|
l2vpn
|
||||||
|
load-balancing flow src-dst-ip
|
||||||
|
logging
|
||||||
|
bridge-domain
|
||||||
|
pseudowire
|
||||||
|
!
|
||||||
|
xconnect group LoadTest
|
||||||
|
p2p pair0
|
||||||
|
interface TenGigE0/0/2/0
|
||||||
|
interface TenGigE0/0/2/1
|
||||||
|
!
|
||||||
|
p2p pair1
|
||||||
|
interface TenGigE0/0/2/2
|
||||||
|
interface TenGigE0/0/2/3
|
||||||
|
!
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### L2 Bridge-Domain
|
||||||
|
|
||||||
|
```
|
||||||
|
l2vpn
|
||||||
|
bridge group LoadTestp
|
||||||
|
bridge-domain bd0
|
||||||
|
interface TenGigE0/0/0/0
|
||||||
|
!
|
||||||
|
interface TenGigE0/0/0/1
|
||||||
|
!
|
||||||
|
!
|
||||||
|
bridge-domain bd1
|
||||||
|
interface TenGigE0/0/0/2
|
||||||
|
!
|
||||||
|
interface TenGigE0/0/0/3
|
||||||
|
!
|
||||||
|
!
|
||||||
|
...
|
||||||
|
```
|
||||||
|
RP/0/RSP0/CPU0:micro-fridge#show l2vpn forwarding bridge-domain mac-address location 0/0/CPU0
|
||||||
|
Sat Aug 31 12:09:08.957 UTC
|
||||||
|
Mac Address Type Learned from/Filtered on LC learned Resync Age Mapped to
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
9c69.b461.fcf2 dynamic Te0/0/0/0 0/0/CPU0 0d 0h 0m 14s N/A
|
||||||
|
9c69.b461.fcf3 dynamic Te0/0/0/1 0/0/CPU0 0d 0h 0m 2s N/A
|
||||||
|
001b.2155.1f11 dynamic Te0/0/0/2 0/0/CPU0 0d 0h 0m 0s N/A
|
||||||
|
001b.2155.1f10 dynamic Te0/0/0/3 0/0/CPU0 0d 0h 0m 15s N/A
|
||||||
|
001b.21bc.47a4 dynamic Te0/0/1/0 0/0/CPU0 0d 0h 0m 6s N/A
|
||||||
|
001b.21bc.47a5 dynamic Te0/0/1/1 0/0/CPU0 0d 0h 0m 21s N/A
|
||||||
|
9c69.b461.ff41 dynamic Te0/0/1/2 0/0/CPU0 0d 0h 0m 16s N/A
|
||||||
|
9c69.b461.ff40 dynamic Te0/0/1/3 0/0/CPU0 0d 0h 0m 10s N/A
|
||||||
|
001b.2155.1d1d dynamic Te0/0/2/0 0/0/CPU0 0d 0h 0m 9s N/A
|
||||||
|
001b.2155.1d1c dynamic Te0/0/2/1 0/0/CPU0 0d 0h 0m 16s N/A
|
||||||
|
001b.2155.1e08 dynamic Te0/0/2/2 0/0/CPU0 0d 0h 0m 4s N/A
|
||||||
|
001b.2155.1e09 dynamic Te0/0/2/3 0/0/CPU0 0d 0h 0m 11s N/A
|
||||||
|
```

Interesting finding: after a bridge-domain overload occurs, forwarding pretty much stops:
```
Te0/0/0/0:
30 second input rate 6931755000 bits/sec, 14441158 packets/sec
30 second output rate 0 bits/sec, 0 packets/sec
Te0/0/0/1:
30 second input rate 0 bits/sec, 0 packets/sec
30 second output rate 19492000 bits/sec, 40609 packets/sec

Te0/0/0/2:
30 second input rate 0 bits/sec, 0 packets/sec
30 second output rate 19720000 bits/sec, 41084 packets/sec
Te0/0/0/3:
30 second input rate 6931728000 bits/sec, 14441100 packets/sec
30 second output rate 0 bits/sec, 0 packets/sec

... and so on

30 second input rate 6931558000 bits/sec, 14440748 packets/sec
30 second output rate 0 bits/sec, 0 packets/sec
30 second input rate 0 bits/sec, 0 packets/sec
30 second output rate 12627000 bits/sec, 26307 packets/sec
30 second input rate 0 bits/sec, 0 packets/sec
30 second output rate 12710000 bits/sec, 26479 packets/sec
30 second input rate 6931542000 bits/sec, 14440712 packets/sec
30 second output rate 0 bits/sec, 0 packets/sec
30 second input rate 0 bits/sec, 0 packets/sec
30 second output rate 19196000 bits/sec, 39992 packets/sec
30 second input rate 6931651000 bits/sec, 14440938 packets/sec
30 second output rate 0 bits/sec, 0 packets/sec
30 second input rate 6931658000 bits/sec, 14440958 packets/sec
30 second output rate 0 bits/sec, 0 packets/sec
30 second input rate 0 bits/sec, 0 packets/sec
30 second output rate 13167000 bits/sec, 27431 packets/sec
```

MPLS enabled test:

```
arp vrf default 100.64.0.2 001b.2155.1e08 ARPA
arp vrf default 100.64.1.2 001b.2155.1e09 ARPA
arp vrf default 100.64.2.2 001b.2155.1d1c ARPA
arp vrf default 100.64.3.2 001b.2155.1d1d ARPA
arp vrf default 100.64.4.2 001b.21bc.47a4 ARPA
arp vrf default 100.64.5.2 001b.21bc.47a5 ARPA
arp vrf default 100.64.6.2 9c69.b461.fcf2 ARPA
arp vrf default 100.64.7.2 9c69.b461.fcf3 ARPA
arp vrf default 100.64.8.2 001b.2155.1f10 ARPA
arp vrf default 100.64.9.2 001b.2155.1f11 ARPA
arp vrf default 100.64.10.2 9c69.b461.ff40 ARPA
arp vrf default 100.64.11.2 9c69.b461.ff41 ARPA

router static
address-family ipv4 unicast
0.0.0.0/0 198.19.5.1
16.0.0.0/24 100.64.0.2
16.0.1.0/24 100.64.2.2
16.0.2.0/24 100.64.4.2
16.0.3.0/24 100.64.6.2
16.0.4.0/24 100.64.8.2
16.0.5.0/24 100.64.10.2
48.0.0.0/24 100.64.1.2
48.0.1.0/24 100.64.3.2
48.0.2.0/24 100.64.5.2
48.0.3.0/24 100.64.7.2
48.0.4.0/24 100.64.9.2
48.0.5.0/24 100.64.11.2
!
!

mpls static
interface TenGigE0/0/0/0
interface TenGigE0/0/0/1
interface TenGigE0/0/0/2
interface TenGigE0/0/0/3
interface TenGigE0/0/1/0
interface TenGigE0/0/1/1
interface TenGigE0/0/1/2
interface TenGigE0/0/1/3
interface TenGigE0/0/2/0
interface TenGigE0/0/2/1
interface TenGigE0/0/2/2
interface TenGigE0/0/2/3
address-family ipv4 unicast
local-label 16 allocate
forward
path 1 nexthop TenGigE0/0/2/3 100.64.1.2 out-label 17
!
!
local-label 17 allocate
forward
path 1 nexthop TenGigE0/0/2/2 100.64.0.2 out-label 16
!
!
local-label 18 allocate
forward
path 1 nexthop TenGigE0/0/2/0 100.64.3.2 out-label 19
!
!
local-label 19 allocate
forward
path 1 nexthop TenGigE0/0/2/1 100.64.2.2 out-label 18
!
!
local-label 20 allocate
forward
path 1 nexthop TenGigE0/0/1/1 100.64.5.2 out-label 21
!
!
local-label 21 allocate
forward
path 1 nexthop TenGigE0/0/1/0 100.64.4.2 out-label 20
!
!
local-label 22 allocate
forward
path 1 nexthop TenGigE0/0/0/1 100.64.7.2 out-label 23
!
!
local-label 23 allocate
forward
path 1 nexthop TenGigE0/0/0/0 100.64.6.2 out-label 22
!
!
local-label 24 allocate
forward
path 1 nexthop TenGigE0/0/0/2 100.64.9.2 out-label 25
!
!
local-label 25 allocate
forward
path 1 nexthop TenGigE0/0/0/3 100.64.8.2 out-label 24
!
!
local-label 26 allocate
forward
path 1 nexthop TenGigE0/0/1/2 100.64.11.2 out-label 27
!
!
local-label 27 allocate
forward
path 1 nexthop TenGigE0/0/1/3 100.64.10.2 out-label 26
!
!
!
!
```

@@ -230,7 +230,7 @@ does not have any form of configuration persistence and that's deliberate. VPP's
programmable dataplane, and explicitly has left the programming and configuration as an exercise for
integrators. I have written a Python project that takes a YAML file as input and uses it to
configure (and reconfigure, on the fly) the dataplane automatically, called
[[VPPcfg](https://github.com/pimvanpelt/vppcfg.git)]. Previously, I wrote some implementation thoughts
[[VPPcfg](https://git.ipng.ch/ipng/vppcfg.git)]. Previously, I wrote some implementation thoughts
on its [[datamodel]({{< ref 2022-03-27-vppcfg-1 >}})] and its [[operations]({{< ref 2022-04-02-vppcfg-2
>}})] so I won't repeat that here. Instead, I will just show the configuration:

@@ -430,7 +430,7 @@ Boom. I could not be more pleased.
This was a nice exercise for me! I'm going this direction because the
[[Containerlab](https://containerlab.dev)] framework will start containers with given NOS images,
not too dissimilar from the one I just made, and then attaches `veth` pairs between the containers.
I started dabbling with a [[pull-request](https://github.com/srl-labs/containerlab/pull/2569)], but
I started dabbling with a [[pull-request](https://github.com/srl-labs/containerlab/pull/2571)], but
I got stuck with a part of the Containerlab code that pre-deploys config files into the containers.
You see, I will need to generate two files:

@@ -448,7 +448,7 @@ will connect a few VPP containers together with an SR Linux node in a lab. Stand

Once we have that, there's still quite some work for me to do. Notably:
* Configuration persistence. `clab` allows you to save the running config. For that, I'll need to
introduce [[vppcfg](https://github.com/pimvanpelt/vppcfg.git)] and a means to invoke it when
introduce [[vppcfg](https://git.ipng.ch/ipng/vppcfg)] and a means to invoke it when
  the lab operator wants to save their config, and then reconfigure VPP when the container
  restarts.
* I'll need to have a few files from `clab` shared with the host, notably the `startup.conf` and

@@ -0,0 +1,373 @@
---
date: "2025-05-04T15:07:23Z"
title: 'VPP in Containerlab - Part 2'
params:
  asciinema: true
---

{{< image float="right" src="/assets/containerlab/containerlab.svg" alt="Containerlab Logo" width="12em" >}}

# Introduction

From time to time the subject of containerized VPP instances comes up. At IPng, I run the routers in
AS8298 on bare metal (Supermicro and Dell hardware), as it allows me to maximize performance.
However, VPP is quite friendly in virtualization. Notably, it runs really well on virtual machines
like Qemu/KVM or VMWare. I can pass through PCI devices directly to the virtual machine, and use CPU
pinning to allow the guest virtual machine access to the underlying physical hardware. In such a mode,
VPP performs almost the same as on bare metal. But did you know that VPP can also run in Docker?

The other day I joined the [[ZANOG'25](https://nog.net.za/event1/zanog25/)] in Durban, South Africa.
One of the presenters was Nardus le Roux of Nokia, and he showed off a project called
[[Containerlab](https://containerlab.dev/)], which provides a CLI for orchestrating and managing
container-based networking labs. It starts the containers, builds virtual wiring between them to
create lab topologies of users' choice and manages the lab lifecycle.

Quite regularly I am asked 'when will you add VPP to Containerlab?', but at ZANOG I made a promise
to actually add it. In my previous [[article]({{< ref 2025-05-03-containerlab-1.md >}})], I took
a good look at VPP as a dockerized container. In this article, I'll explore how to make such a
container run in Containerlab!

## Completing the Docker container

Just having VPP running by itself in a container is not super useful (although it _is_ cool!). I
decide first to add a few bits and bobs that will come in handy in the `Dockerfile`:

```
FROM debian:bookworm
ARG DEBIAN_FRONTEND=noninteractive
ARG VPP_INSTALL_SKIP_SYSCTL=true
ARG REPO=release
EXPOSE 22/tcp
RUN apt-get update && apt-get -y install curl procps tcpdump iproute2 iptables \
    iputils-ping net-tools git python3 python3-pip vim-tiny openssh-server bird2 \
    mtr-tiny traceroute && apt-get clean

# Install VPP
RUN mkdir -p /var/log/vpp /root/.ssh/
RUN curl -s https://packagecloud.io/install/repositories/fdio/${REPO}/script.deb.sh | bash
RUN apt-get update && apt-get -y install vpp vpp-plugin-core && apt-get clean

# Build vppcfg
RUN pip install --break-system-packages build netaddr yamale argparse pyyaml ipaddress
RUN git clone https://git.ipng.ch/ipng/vppcfg.git && cd vppcfg && python3 -m build && \
    pip install --break-system-packages dist/vppcfg-*-py3-none-any.whl

# Config files
COPY files/etc/vpp/* /etc/vpp/
COPY files/etc/bird/* /etc/bird/
COPY files/init-container.sh /sbin/
RUN chmod 755 /sbin/init-container.sh
CMD ["/sbin/init-container.sh"]
```

A few notable additions:
* ***vppcfg*** is a handy utility I wrote and discussed in a previous [[article]({{< ref
  2022-04-02-vppcfg-2 >}})]. Its purpose is to take a YAML file that describes the configuration of
  the dataplane (like which interfaces, sub-interfaces, MTU, IP addresses and so on), and then
  apply this safely to a running dataplane. You can check it out in my
  [[vppcfg](https://git.ipng.ch/ipng/vppcfg)] git repository.
* ***openssh-server*** will come in handy to log in to the container, in addition to the already
  available `docker exec`.
* ***bird2*** which will be my controlplane of choice. At a future date, I might also add FRR,
  which may be a good alternative for some. VPP works well with both. You can check out Bird on
  the nic.cz [[website](https://bird.network.cz/?get_doc&f=bird.html&v=20)].

I'll add a couple of default config files for Bird and VPP, and replace the CMD with a generic
`/sbin/init-container.sh` in which I can do any late binding stuff before launching VPP.
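
Before involving Containerlab at all, the image can be test-driven on its own. A minimal sketch of
that, assuming a locally chosen tag `vpp-clab:local` and the blunt `--privileged` flag rather than
a fine-grained capability set (both are my own shortcuts, not what the repo prescribes):

```
$ docker build -t vpp-clab:local .
$ docker run -d --name vpp-test --privileged vpp-clab:local
$ docker exec -it vpp-test birdc show status
$ docker exec -it vpp-test vppctl show version     # once VPP has come up inside the container
```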

### Initializing the Container

#### VPP Containerlab: NetNS

VPP's Linux Control Plane plugin wants to run in its own network namespace. So the first order of
business of `/sbin/init-container.sh` is to create it:

```
NETNS=${NETNS:="dataplane"}

echo "Creating dataplane namespace"
/usr/bin/mkdir -p /etc/netns/$NETNS
/usr/bin/touch /etc/netns/$NETNS/resolv.conf
/usr/sbin/ip netns add $NETNS
```
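
The accompanying `startup.conf` isn't shown here, but VPP's Linux Control Plane plugin also needs
to be pointed at this namespace. My assumption is a stanza along these lines:

```
cat << EOF >> /etc/vpp/startup.conf
linux-cp {
  default netns dataplane
}
EOF
```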

#### VPP Containerlab: SSH

Then, I'll set the root password (which is `vpp` by the way), and start an SSH daemon which allows
for password-less logins:

```
echo "Starting SSH, with credentials root:vpp"
sed -i -e 's,^#PermitRootLogin prohibit-password,PermitRootLogin yes,' /etc/ssh/sshd_config
sed -i -e 's,^root:.*,root:$y$j9T$kG8pyZEVmwLXEtXekQCRK.$9iJxq/bEx5buni1hrC8VmvkDHRy7ZMsw9wYvwrzexID:20211::::::,' /etc/shadow
/etc/init.d/ssh start
```

#### VPP Containerlab: Bird2

I can already predict that Bird2 won't be the only option for a controlplane, even though I'm a huge
fan of it. Therefore, I'll make it configurable to leave the door open for other controlplane
implementations in the future:

```
BIRD_ENABLED=${BIRD_ENABLED:="true"}

if [ "$BIRD_ENABLED" == "true" ]; then
  echo "Starting Bird in $NETNS"
  mkdir -p /run/bird /var/log/bird
  chown bird:bird /var/log/bird
  ROUTERID=$(ip -br a show eth0 | awk '{ print $3 }' | cut -f1 -d/)
  sed -i -e "s,.*router id .*,router id $ROUTERID; # Set by container-init.sh," /etc/bird/bird.conf
  /usr/bin/nsenter --net=/var/run/netns/$NETNS /usr/sbin/bird -u bird -g bird
fi
```

I am reminded that Bird won't start if it cannot determine its _router id_. When I start it in the
`dataplane` namespace, it will immediately exit, because there will be no IP addresses configured
yet. But luckily, it logs its complaint and it's easily addressed. I decide to take the management
IPv4 address from `eth0` and write that into the `bird.conf` file, which otherwise does some basic
initialization that I described in a previous [[article]({{< ref 2021-09-02-vpp-5 >}})], so I'll
skip that here. However, I do include an empty file called `/etc/bird/bird-local.conf` for users to
further configure Bird2.

#### VPP Containerlab: Binding veth pairs

When Containerlab starts the VPP container, it'll offer it a set of `veth` ports that connect this
container to other nodes in the lab. This is done by the `links` list in the topology file
[[ref](https://containerlab.dev/manual/network/)]. It's my goal to take all of the interfaces
that are of type `veth`, and generate a little snippet to grab them and bind them into VPP while
setting their MTU to 9216 to allow for jumbo frames:

```
CLAB_VPP_FILE=${CLAB_VPP_FILE:=/etc/vpp/clab.vpp}

echo "Generating $CLAB_VPP_FILE"
: > $CLAB_VPP_FILE
MTU=9216
for IFNAME in $(ip -br link show type veth | cut -f1 -d@ | grep -v '^eth0$' | sort); do
  MAC=$(ip -br link show dev $IFNAME | awk '{ print $3 }')
  echo " * $IFNAME hw-addr $MAC mtu $MTU"
  ip link set $IFNAME up mtu $MTU
  cat << EOF >> $CLAB_VPP_FILE
create host-interface name $IFNAME hw-addr $MAC
set interface name host-$IFNAME $IFNAME
set interface mtu $MTU $IFNAME
set interface state $IFNAME up

EOF
done
```

{{< image width="5em" float="left" src="/assets/shared/warning.png" alt="Warning" >}}

One thing I realized is that VPP will assign a random MAC address on its copy of the `veth` port,
which is not great. I'll explicitly configure it with the same MAC address as the `veth` interface
itself, otherwise I'd have to put the interface into promiscuous mode.

#### VPP Containerlab: VPPcfg

I'm almost ready, but I have one more detail. The user will be able to offer a
[[vppcfg](https://git.ipng.ch/ipng/vppcfg)] YAML file to configure the interfaces and so on. If such
a file exists, I'll apply it to the dataplane upon startup:

```
VPPCFG_VPP_FILE=${VPPCFG_VPP_FILE:=/etc/vpp/vppcfg.vpp}

echo "Generating $VPPCFG_VPP_FILE"
: > $VPPCFG_VPP_FILE
if [ -r /etc/vpp/vppcfg.yaml ]; then
  vppcfg plan --novpp -c /etc/vpp/vppcfg.yaml -o $VPPCFG_VPP_FILE
fi
```

Once the VPP process starts, it'll execute `/etc/vpp/bootstrap.vpp`, which in turn executes the
newly generated `/etc/vpp/clab.vpp` to grab the `veth` interfaces, and then `/etc/vpp/vppcfg.vpp` to
further configure the dataplane. Easy peasy!
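
The `bootstrap.vpp` file itself isn't shown in this article; my guess is that it is little more
than two `exec` statements chaining the generated snippets, roughly:

```
cat << EOF > /etc/vpp/bootstrap.vpp
exec /etc/vpp/clab.vpp
exec /etc/vpp/vppcfg.vpp
EOF
```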

### Adding VPP to Containerlab

Roman points out a previous integration for the 6WIND VSR in
[[PR#2540](https://github.com/srl-labs/containerlab/pull/2540)]. This serves as a useful guide to
get me started. I fork the repo, create a branch so that Roman can also add a few commits, and
together we start hacking in [[PR#2571](https://github.com/srl-labs/containerlab/pull/2571)].

First, I add the documentation skeleton in `docs/manual/kinds/fdio_vpp.md`, which links in from a
few other places, and will be where the end-user facing documentation will live. That's about half
the contributed LOC, right there!

Next, I'll create a Go module in `nodes/fdio_vpp/fdio_vpp.go` which doesn't do much other than
creating the `struct`, and its required `Register` and `Init` functions. The `Init` function ensures
the right capabilities are set in Docker, and the right devices are bound for the container.

I notice that Containerlab rewrites the Dockerfile `CMD` string and prepends an `if-wait.sh` script
to it. This is because when Containerlab starts the container, it'll still be busy adding these
`link` interfaces to it, and if a container starts too quickly, it may not see all the interfaces.
So, Containerlab informs the container using an environment variable called `CLAB_INTFS`, and this
script simply sleeps for a while until that exact number of interfaces is present. Ok, cool beans.
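
For illustration only, the core of such a wait loop boils down to something like this sketch (not
the actual `if-wait.sh` that Containerlab ships):

```
EXPECTED=${CLAB_INTFS:-0}
# eth0 is the management interface, so wait until the lab's data interfaces appear on top of it
while [ "$(ls /sys/class/net | grep -c '^eth')" -le "$EXPECTED" ]; do
  sleep 1
done
```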

Roman helps me a bit with Go templating. You see, I think it'll be slick to have the CLI prompt for
the VPP containers to reflect their hostname, because normally, VPP will assign `vpp# `. I add the
template in `nodes/fdio_vpp/vpp_startup_config.go.tpl` and it only has one variable expansion: `unix
{ cli-prompt {{ .ShortName }}# }`. But I totally think it's worth it, because when running many VPP
containers in the lab, it could otherwise get confusing.

Roman also shows me a trick in the function `PostDeploy()`, which will write the user's SSH pubkeys
to `/root/.ssh/authorized_keys`. This allows users to log in without having to use password
authentication.

Collectively, we decide to punt on the `SaveConfig` function until we're a bit further along. I have
an idea how this would work, basically along the lines of calling `vppcfg dump` and bind-mounting
that file into the lab directory somewhere. This way, upon restarting, the YAML file can be re-read
and the dataplane initialized. But it'll be for another day.

After the main module is finished, all I have to do is add it to `clab/register.go` and that's just
about it. In about 170 lines of code, 50 lines of Go template, and 170 lines of Markdown, this
contribution is about ready to ship!

### Containerlab: Demo

After I finish writing the documentation, I decide to include a demo with a quickstart to help folks
along. A simple lab showing two VPP instances and two Alpine Linux clients can be found on
[[git.ipng.ch/ipng/vpp-containerlab](https://git.ipng.ch/ipng/vpp-containerlab)]. Simply check out the
repo and start the lab, like so:

```
$ git clone https://git.ipng.ch/ipng/vpp-containerlab.git
$ cd vpp-containerlab
$ containerlab deploy --topo vpp.clab.yml
```

#### Containerlab: configs

The file `vpp.clab.yml` contains an example topology consisting of two VPP instances, each connected
to one Alpine Linux container, in the following topology:

{{< image src="/assets/containerlab/learn-vpp.png" alt="Containerlab Topo" width="100%" >}}

Two relevant files for each VPP router are included in this
[[repository](https://git.ipng.ch/ipng/vpp-containerlab)]:
1. `config/vpp*/vppcfg.yaml` configures the dataplane interfaces, including a loopback address.
1. `config/vpp*/bird-local.conf` configures the controlplane to enable BFD and OSPF.

To illustrate these files, let me take a closer look at node `vpp1`. Its VPP dataplane
configuration looks like this:
```
pim@summer:~/src/vpp-containerlab$ cat config/vpp1/vppcfg.yaml
interfaces:
  eth1:
    description: 'To client1'
    mtu: 1500
    lcp: eth1
    addresses: [ 10.82.98.65/28, 2001:db8:8298:101::1/64 ]
  eth2:
    description: 'To vpp2'
    mtu: 9216
    lcp: eth2
    addresses: [ 10.82.98.16/31, 2001:db8:8298:1::1/64 ]
loopbacks:
  loop0:
    description: 'vpp1'
    lcp: loop0
    addresses: [ 10.82.98.0/32, 2001:db8:8298::/128 ]
```

Then, I enable BFD, OSPF and OSPFv3 on `eth2` and `loop0` on both of the VPP routers:
```
pim@summer:~/src/vpp-containerlab$ cat config/vpp1/bird-local.conf
protocol bfd bfd1 {
  interface "eth2" { interval 100 ms; multiplier 30; };
}

protocol ospf v2 ospf4 {
  ipv4 { import all; export all; };
  area 0 {
    interface "loop0" { stub yes; };
    interface "eth2" { type pointopoint; cost 10; bfd on; };
  };
}

protocol ospf v3 ospf6 {
  ipv6 { import all; export all; };
  area 0 {
    interface "loop0" { stub yes; };
    interface "eth2" { type pointopoint; cost 10; bfd on; };
  };
}
```

#### Containerlab: playtime!

Once the lab comes up, I can SSH to the VPP containers (`vpp1` and `vpp2`) which have my SSH pubkeys
installed thanks to Roman's work. Barring that, I could still log in as user `root` using
password `vpp`. VPP runs its own network namespace called `dataplane`, which is very similar to SR
Linux's default `network-instance`. I can join that namespace to take a closer look:

```
pim@summer:~/src/vpp-containerlab$ ssh root@vpp1
root@vpp1:~# nsenter --net=/var/run/netns/dataplane
root@vpp1:~# ip -br a
lo DOWN
loop0 UP 10.82.98.0/32 2001:db8:8298::/128 fe80::dcad:ff:fe00:0/64
eth1 UNKNOWN 10.82.98.65/28 2001:db8:8298:101::1/64 fe80::a8c1:abff:fe77:acb9/64
eth2 UNKNOWN 10.82.98.16/31 2001:db8:8298:1::1/64 fe80::a8c1:abff:fef0:7125/64

root@vpp1:~# ping 10.82.98.1
PING 10.82.98.1 (10.82.98.1) 56(84) bytes of data.
64 bytes from 10.82.98.1: icmp_seq=1 ttl=64 time=9.53 ms
64 bytes from 10.82.98.1: icmp_seq=2 ttl=64 time=15.9 ms
^C
--- 10.82.98.1 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1002ms
rtt min/avg/max/mdev = 9.530/12.735/15.941/3.205 ms
```

From `vpp1`, I can tell that Bird2's OSPF adjacency has formed, because I can ping the `loop0`
address of the `vpp2` router on 10.82.98.1. Nice! The two client nodes are running a minimalistic
Alpine Linux container, which doesn't ship with SSH by default. But of course I can still enter the
containers using `docker exec`, like so:

```
pim@summer:~/src/vpp-containerlab$ docker exec -it client1 sh
/ # ip addr show dev eth1
531235: eth1@if531234: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 9500 qdisc noqueue state UP
    link/ether 00:c1:ab:00:00:01 brd ff:ff:ff:ff:ff:ff
    inet 10.82.98.66/28 scope global eth1
       valid_lft forever preferred_lft forever
    inet6 2001:db8:8298:101::2/64 scope global
       valid_lft forever preferred_lft forever
    inet6 fe80::2c1:abff:fe00:1/64 scope link
       valid_lft forever preferred_lft forever
/ # traceroute 10.82.98.82
traceroute to 10.82.98.82 (10.82.98.82), 30 hops max, 46 byte packets
 1  10.82.98.65 (10.82.98.65)  5.906 ms  7.086 ms  7.868 ms
 2  10.82.98.17 (10.82.98.17)  24.007 ms  23.349 ms  15.933 ms
 3  10.82.98.82 (10.82.98.82)  39.978 ms  31.127 ms  31.854 ms

/ # traceroute 2001:db8:8298:102::2
traceroute to 2001:db8:8298:102::2 (2001:db8:8298:102::2), 30 hops max, 72 byte packets
 1  2001:db8:8298:101::1 (2001:db8:8298:101::1)  0.701 ms  7.144 ms  7.900 ms
 2  2001:db8:8298:1::2 (2001:db8:8298:1::2)  23.909 ms  22.943 ms  23.893 ms
 3  2001:db8:8298:102::2 (2001:db8:8298:102::2)  31.964 ms  30.814 ms  32.000 ms
```

From the vantage point of `client1`, the first hop represents the `vpp1` node, which forwards to
`vpp2`, which finally forwards to `client2`, which shows that both VPP routers are passing traffic.
Dope!

## Results

After all of this deep-diving, all that's left is for me to demonstrate the lab by means of
this little screencast [[asciinema](/assets/containerlab/vpp-containerlab.cast)]. I hope you enjoy
it as much as I enjoyed creating it:

{{< asciinema src="/assets/containerlab/vpp-containerlab.cast" >}}

## Acknowledgements

I wanted to give a shout-out to Roman Dodin for his help getting the Containerlab parts squared away
when I got a little bit stuck. He took the time to explain the internals and idioms of the
Containerlab project, which really saved me a tonne of time. He also pair-programmed the
[[PR#2571](https://github.com/srl-labs/containerlab/pull/2571)] with me over the span of two
evenings.

Collaborative open source rocks!

@@ -0,0 +1,713 @@
---
date: "2025-05-28T22:07:23Z"
title: 'Case Study: Minio S3 - Part 1'
---

{{< image float="right" src="/assets/minio/minio-logo.png" alt="MinIO Logo" width="6em" >}}

# Introduction

Amazon Simple Storage Service (Amazon S3) is an object storage service offering industry-leading
scalability, data availability, security, and performance. Millions of customers of all sizes and
industries store, manage, analyze, and protect any amount of data for virtually any use case, such
as data lakes, cloud-native applications, and mobile apps. With cost-effective storage classes and
easy-to-use management features, you can optimize costs, organize and analyze data, and configure
fine-tuned access controls to meet specific business and compliance requirements.

Amazon's S3 became the _de facto_ standard object storage system, and there exist several fully open
source implementations of the protocol. One of them is MinIO: designed to allow enterprises to
consolidate all of their data on a single, private cloud namespace. Architected using the same
principles as the hyperscalers, AIStor delivers performance at scale at a fraction of the cost
compared to the public cloud.

IPng Networks is an Internet Service Provider, but I also dabble in self-hosting things, for
example [[PeerTube](https://video.ipng.ch/)], [[Mastodon](https://ublog.tech/)],
[[Immich](https://photos.ipng.ch/)], [[Pixelfed](https://pix.ublog.tech/)] and of course
[[Hugo](https://ipng.ch/)]. These services all have one thing in common: they tend to use lots of
storage when they grow. At IPng Networks, all hypervisors ship with enterprise SAS flash drives,
mostly 1.92TB and 3.84TB. Scaling up each of these services, and backing them up safely, can be
quite the headache.

This article is for the storage buffs. I'll set up a set of distributed MinIO nodes from scratch.

## Physical

{{< image float="right" src="/assets/minio/disks.png" alt="MinIO Disks" width="16em" >}}

I'll start with the basics. I still have a few Dell R720 servers lying around; they are getting a
bit older but still have 24 cores and 64GB of memory. First I need to get me some disks. I order
36pcs of 16TB SATA enterprise disks, a mixture of Seagate EXOS and Toshiba MG series disks. I once
learned (the hard way) that buying a big stack of disks from one production run is a risk - so I'll
mix and match the drives.

Three trays of caddies and a melted credit card later, I have 576TB of SATA disks safely in hand.
Each machine will carry 192TB of raw storage. The nice thing about this chassis is that Dell can
ship them with 12x 3.5" SAS slots in the front, and 2x 2.5" SAS slots in the rear of the chassis.

So I'll install Debian Bookworm on one small 480G SSD in software RAID1.

### Cloning an install

I have three identical machines so in total I'll want six of these SSDs. I temporarily screw the
other five in 3.5" drive caddies and plug them into the first installed Dell, which I've called
`minio-proto`:

```
pim@minio-proto:~$ for i in b c d e f; do
    sudo dd if=/dev/sda of=/dev/sd${i} bs=512 count=1;
    sudo mdadm --manage /dev/md0 --add /dev/sd${i}1
  done
pim@minio-proto:~$ sudo mdadm --grow /dev/md0 -n 6
pim@minio-proto:~$ watch cat /proc/mdstat
pim@minio-proto:~$ for i in a b c d e f; do
    sudo grub-install /dev/sd$i
  done
```

{{< image float="right" src="/assets/minio/rack.png" alt="MinIO Rack" width="16em" >}}

The first command takes my installed disk, `/dev/sda`, and copies the first sector over to the other
five. This will give them the same partition table. Next, I'll add the first partition of each disk
to the raidset. Then, I'll expand the raidset to have six members, after which the kernel starts a
recovery process that syncs the newly added partitions to `/dev/md0` (by copying from `/dev/sda` to
all other disks at once). Finally, I'll watch this exciting movie and grab a cup of tea.

Once the disks are fully copied, I'll shut down the machine and distribute the disks to their
respective Dell R720, two each. Once they boot they will all be identical. I'll need to make sure
their hostnames and machine/host-id are unique, otherwise things like bridges will have overlapping
MAC addresses - ask me how I know:

```
pim@minio-proto:~$ sudo mdadm --grow /dev/md0 -n 2
pim@minio-proto:~$ sudo rm /etc/ssh/ssh_host*
pim@minio-proto:~$ sudo hostname minio0-chbtl0
pim@minio-proto:~$ sudo dpkg-reconfigure openssh-server
pim@minio-proto:~$ sudo dd if=/dev/random of=/etc/hostid bs=4 count=1
pim@minio-proto:~$ /usr/bin/dbus-uuidgen | sudo tee /etc/machine-id
pim@minio-proto:~$ sudo reboot
```

After which I have three beautiful and unique machines:
* `minio0.chbtl0.net.ipng.ch`: which will go into my server rack at the IPng office.
* `minio0.ddln0.net.ipng.ch`: which will go to [[Daedalean]({{< ref
  2022-02-24-colo >}})], doing AI since before it was all about vibe coding.
* `minio0.chrma0.net.ipng.ch`: which will go to [[IP-Max](https://ip-max.net/)], one of the best
  ISPs on the planet. 🥰

## Deploying Minio

The user guide that MinIO provides
[[ref](https://min.io/docs/minio/linux/operations/installation.html)] is super good, arguably one of
the best documented open source projects I've ever seen. It shows me that I can do three types of
install: a 'Standalone' with one disk, a 'Standalone Multi-Drive', and a 'Distributed' deployment.
I decide to make three independent standalone multi-drive installs. This way, I have less shared
fate, and will be immune to network partitions (as these are going to be in three different
physical locations). I've also read about per-bucket _replication_, which will be an excellent way
to get geographical distribution and active/active instances to work together.

I feel good about the single-machine multi-drive decision. I follow the install guide
[[ref](https://min.io/docs/minio/linux/operations/install-deploy-manage/deploy-minio-single-node-multi-drive.html#minio-snmd)]
for this deployment type.

### IPng Frontends

At IPng I use a private IPv4/IPv6/MPLS network that is not connected to the internet. I call this
network [[IPng Site Local]({{< ref 2023-03-11-mpls-core.md >}})]. But how will users reach my Minio
install? I have four redundantly and geographically deployed frontends, two in the Netherlands and
two in Switzerland. I've described the frontend setup in a [[previous article]({{< ref
2023-03-17-ipng-frontends >}})] and the certificate management in [[this article]({{< ref
2023-03-24-lego-dns01 >}})].

I've decided to run the service on these three regionalized endpoints:
1. `s3.chbtl0.ipng.ch` which will back into `minio0.chbtl0.net.ipng.ch`
1. `s3.ddln0.ipng.ch` which will back into `minio0.ddln0.net.ipng.ch`
1. `s3.chrma0.ipng.ch` which will back into `minio0.chrma0.net.ipng.ch`

The first thing I take note of is that S3 buckets can be either addressed _by path_, in other words
something like `s3.chbtl0.ipng.ch/my-bucket/README.md`, but they can also be addressed by virtual
host, like so: `my-bucket.s3.chbtl0.ipng.ch/README.md`. A subtle difference, but from the docs I
understand that Minio needs to have control of the whole space under its main domain.
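
To make that difference concrete: once everything is wired up, these two requests would address the
same object, with `my-bucket` being just an example name:

```
$ curl -sI https://s3.chbtl0.ipng.ch/my-bucket/README.md        # path-style
$ curl -sI https://my-bucket.s3.chbtl0.ipng.ch/README.md        # virtual-host style
```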

There's a small implication to this requirement -- the Web Console that ships with MinIO (eh, well,
maybe that's going to change, more on that later), will want to have its own domain-name, so I
choose something simple: `cons0-s3.chbtl0.ipng.ch` and so on. This way, somebody might still be able
to have a bucket name called `cons0` :)

#### Let's Encrypt Certificates

Alright, so I will be kneading nine domains into this new certificate which I'll simply call
`s3.ipng.ch`. I configure it in Ansible:

```
certbot:
  certs:
    ...
    s3.ipng.ch:
      groups: [ 'nginx', 'minio' ]
      altnames:
        - 's3.chbtl0.ipng.ch'
        - 'cons0-s3.chbtl0.ipng.ch'
        - '*.s3.chbtl0.ipng.ch'
        - 's3.ddln0.ipng.ch'
        - 'cons0-s3.ddln0.ipng.ch'
        - '*.s3.ddln0.ipng.ch'
        - 's3.chrma0.ipng.ch'
        - 'cons0-s3.chrma0.ipng.ch'
        - '*.s3.chrma0.ipng.ch'
```

I run the `certbot` playbook and it does two things:
1. On the machines from group `nginx` and `minio`, it will ensure there exists a user `lego` with
   an SSH key and write permissions to `/etc/lego/`; this is where the automation will write (and
   update) the certificate keys.
1. On the `lego` machine, it'll create two files. One is the certificate requestor, and the other
   is a certificate distribution script that will copy the cert to the right machine(s) when it
   renews.

On the `lego` machine, I'll run the cert request for the first time:

```
lego@lego:~$ bin/certbot:s3.ipng.ch
lego@lego:~$ RENEWED_LINEAGE=/home/lego/acme-dns/live/s3.ipng.ch bin/certbot-distribute
```

The first script asks me to add the `_acme-challenge` DNS entries, which I'll do, for example on the
`s3.chbtl0.ipng.ch` instance (and similar for the `ddln0` and `chrma0` ones):

```
$ORIGIN chbtl0.ipng.ch.
_acme-challenge.s3 CNAME 51f16fd0-8eb6-455c-b5cd-96fad12ef8fd.auth.ipng.ch.
_acme-challenge.cons0-s3 CNAME 450477b8-74c9-4b9e-bbeb-de49c3f95379.auth.ipng.ch.
s3 CNAME nginx0.ipng.ch.
*.s3 CNAME nginx0.ipng.ch.
cons0-s3 CNAME nginx0.ipng.ch.
```

I push and reload the `ipng.ch` zonefile with these changes after which the certificate gets
requested and a cronjob added to check for renewals. The second script will copy the newly created
cert to all three `minio` machines, and all four `nginx` machines. From now on, every 90 days, a new
cert will be automatically generated and distributed. Slick!
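
As a quick sanity check of my own (not part of the playbook), the frontends can be asked which
names the served certificate actually carries:

```
$ echo | openssl s_client -connect s3.chbtl0.ipng.ch:443 -servername s3.chbtl0.ipng.ch 2>/dev/null \
    | openssl x509 -noout -ext subjectAltName
```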

#### NGINX Configs

With the LE wildcard certs in hand, I can create an NGINX frontend for these MinIO deployments.

First, a simple redirector service that punts people on port 80 to port 443:

```
server {
    listen [::]:80;
    listen 0.0.0.0:80;

    server_name cons0-s3.chbtl0.ipng.ch s3.chbtl0.ipng.ch *.s3.chbtl0.ipng.ch;
    access_log /var/log/nginx/s3.chbtl0.ipng.ch-access.log;
    include /etc/nginx/conf.d/ipng-headers.inc;

    location / {
        return 301 https://$server_name$request_uri;
    }
}
```

Next, the Minio API service itself which runs on port 9000, with a configuration snippet inspired by
the MinIO [[docs](https://min.io/docs/minio/linux/integrations/setup-nginx-proxy-with-minio.html)]:

```
server {
    listen [::]:443 ssl http2;
    listen 0.0.0.0:443 ssl http2;
    ssl_certificate /etc/certs/s3.ipng.ch/fullchain.pem;
    ssl_certificate_key /etc/certs/s3.ipng.ch/privkey.pem;
    include /etc/nginx/conf.d/options-ssl-nginx.inc;
    ssl_dhparam /etc/nginx/conf.d/ssl-dhparams.inc;

    server_name s3.chbtl0.ipng.ch *.s3.chbtl0.ipng.ch;
    access_log /var/log/nginx/s3.chbtl0.ipng.ch-access.log upstream;
    include /etc/nginx/conf.d/ipng-headers.inc;

    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;

    ignore_invalid_headers off;
    client_max_body_size 0;
    # Disable buffering
    proxy_buffering off;
    proxy_request_buffering off;

    location / {
        proxy_set_header Host $http_host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        proxy_connect_timeout 300;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        chunked_transfer_encoding off;

        proxy_pass http://minio0.chbtl0.net.ipng.ch:9000;
    }
}
```

Finally, the Minio Console service which runs on port 9090:

```
include /etc/nginx/conf.d/geo-ipng-trusted.inc;

server {
    listen [::]:443 ssl http2;
    listen 0.0.0.0:443 ssl http2;
    ssl_certificate /etc/certs/s3.ipng.ch/fullchain.pem;
    ssl_certificate_key /etc/certs/s3.ipng.ch/privkey.pem;
    include /etc/nginx/conf.d/options-ssl-nginx.inc;
    ssl_dhparam /etc/nginx/conf.d/ssl-dhparams.inc;

    server_name cons0-s3.chbtl0.ipng.ch;
    access_log /var/log/nginx/cons0-s3.chbtl0.ipng.ch-access.log upstream;
    include /etc/nginx/conf.d/ipng-headers.inc;

    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;

    ignore_invalid_headers off;
    client_max_body_size 0;
    # Disable buffering
    proxy_buffering off;
    proxy_request_buffering off;

    location / {
        if ($geo_ipng_trusted = 0) { rewrite ^ https://ipng.ch/ break; }
        proxy_set_header Host $http_host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header X-NginX-Proxy true;

        real_ip_header X-Real-IP;
        proxy_connect_timeout 300;
        chunked_transfer_encoding off;

        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";

        proxy_pass http://minio0.chbtl0.net.ipng.ch:9090;
    }
}
```

This last one has an NGINX trick. It will only allow users in if they are in the map called
`geo_ipng_trusted`, which contains a set of IPv4 and IPv6 prefixes. Visitors who are not in this map
will receive an HTTP redirect back to the [[IPng.ch](https://ipng.ch/)] homepage instead.
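
The include itself isn't shown in this article; conceptually it is just NGINX's `geo` module mapping
trusted prefixes to `1`. A sketch, with documentation prefixes standing in for IPng's real ranges:

```
cat << 'EOF' > /etc/nginx/conf.d/geo-ipng-trusted.inc
geo $geo_ipng_trusted {
    default        0;
    192.0.2.0/24   1;
    2001:db8::/32  1;
}
EOF
```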

I run the Ansible playbook which contains the NGINX changes on all frontends, but of course nothing
runs yet, because I haven't yet started the MinIO backends.

### MinIO Backends

The first thing I need to do is get those disks mounted. MinIO likes using XFS, so I'll install that
and prepare the disks as follows:

```
pim@minio0-chbtl0:~$ sudo apt install xfsprogs
pim@minio0-chbtl0:~$ sudo modprobe xfs
pim@minio0-chbtl0:~$ echo xfs | sudo tee -a /etc/modules
pim@minio0-chbtl0:~$ sudo update-initramfs -k all -u
pim@minio0-chbtl0:~$ for i in a b c d e f g h i j k l; do sudo mkfs.xfs /dev/sd$i; done
pim@minio0-chbtl0:~$ blkid | awk 'BEGIN {i=1} /TYPE="xfs"/ {
    printf "%s /minio/disk%d xfs defaults 0 2\n",$2,i; i++;
  }' | sudo tee -a /etc/fstab
pim@minio0-chbtl0:~$ for i in `seq 1 12`; do sudo mkdir -p /minio/disk$i; done
pim@minio0-chbtl0:~$ sudo mount -t xfs -a
pim@minio0-chbtl0:~$ sudo chown -R minio-user: /minio/
```

From the top: I'll install `xfsprogs` which contains the things I need to manipulate XFS filesystems
in Debian. Then I'll install the `xfs` kernel module, and make sure it gets inserted upon subsequent
startup by adding it to `/etc/modules` and regenerating the initrd for the installed kernels.

Next, I'll format all twelve 16TB disks (which are `/dev/sda` - `/dev/sdl` on these machines), and
add their resulting blockdevice id's to `/etc/fstab` so they get persistently mounted on reboot.

Finally, I'll create their mountpoints, mount all XFS filesystems, and chown them to the user that
MinIO is running as. End result:

```
pim@minio0-chbtl0:~$ df -T
Filesystem Type 1K-blocks Used Available Use% Mounted on
udev devtmpfs 32950856 0 32950856 0% /dev
tmpfs tmpfs 6595340 1508 6593832 1% /run
/dev/md0 ext4 114695308 5423976 103398948 5% /
tmpfs tmpfs 32976680 0 32976680 0% /dev/shm
tmpfs tmpfs 5120 4 5116 1% /run/lock
/dev/sda xfs 15623792640 121505936 15502286704 1% /minio/disk1
/dev/sde xfs 15623792640 121505968 15502286672 1% /minio/disk12
/dev/sdi xfs 15623792640 121505968 15502286672 1% /minio/disk11
/dev/sdl xfs 15623792640 121505904 15502286736 1% /minio/disk10
/dev/sdd xfs 15623792640 121505936 15502286704 1% /minio/disk4
/dev/sdb xfs 15623792640 121505968 15502286672 1% /minio/disk3
/dev/sdk xfs 15623792640 121505936 15502286704 1% /minio/disk5
/dev/sdc xfs 15623792640 121505936 15502286704 1% /minio/disk9
/dev/sdf xfs 15623792640 121506000 15502286640 1% /minio/disk2
/dev/sdj xfs 15623792640 121505968 15502286672 1% /minio/disk7
/dev/sdg xfs 15623792640 121506000 15502286640 1% /minio/disk8
/dev/sdh xfs 15623792640 121505968 15502286672 1% /minio/disk6
tmpfs tmpfs 6595336 0 6595336 0% /run/user/0
```

MinIO likes to be configured using environment variables - and this is likely because it's a popular
thing to run in a containerized environment like Kubernetes. The maintainers ship it also as a
Debian package, which will read its environment from `/etc/default/minio`, and I'll prepare that
file as follows:

```
pim@minio0-chbtl0:~$ cat << EOF | sudo tee /etc/default/minio
MINIO_DOMAIN="s3.chbtl0.ipng.ch,minio0.chbtl0.net.ipng.ch"
MINIO_ROOT_USER="XXX"
MINIO_ROOT_PASSWORD="YYY"
MINIO_VOLUMES="/minio/disk{1...12}"
MINIO_OPTS="--console-address :9001"
EOF
pim@minio0-chbtl0:~$ sudo systemctl enable --now minio
pim@minio0-chbtl0:~$ sudo journalctl -u minio
May 31 10:44:11 minio0-chbtl0 minio[690420]: MinIO Object Storage Server
May 31 10:44:11 minio0-chbtl0 minio[690420]: Copyright: 2015-2025 MinIO, Inc.
May 31 10:44:11 minio0-chbtl0 minio[690420]: License: GNU AGPLv3 - https://www.gnu.org/licenses/agpl-3.0.html
May 31 10:44:11 minio0-chbtl0 minio[690420]: Version: RELEASE.2025-05-24T17-08-30Z (go1.24.3 linux/amd64)
May 31 10:44:11 minio0-chbtl0 minio[690420]: API: http://198.19.4.11:9000 http://127.0.0.1:9000
May 31 10:44:11 minio0-chbtl0 minio[690420]: WebUI: https://cons0-s3.chbtl0.ipng.ch/
May 31 10:44:11 minio0-chbtl0 minio[690420]: Docs: https://docs.min.io

pim@minio0-chbtl0:~$ sudo ipmitool sensor | grep Watts
Pwr Consumption | 154.000 | Watts
```

Incidentally - I am pretty pleased with this 192TB disk tank, sporting 24 cores, 64GB memory and
2x10G network, casually hanging out at 154 Watts of power all up. Slick!

{{< image float="right" src="/assets/minio/minio-ec.svg" alt="MinIO Erasure Coding" width="22em" >}}

MinIO implements _erasure coding_ as a core component in providing availability and resiliency
during drive or node-level failure events. MinIO partitions each object into data and parity shards
and distributes those shards across a single so-called _erasure set_. Under the hood, it uses a
[[Reed-Solomon](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction)] erasure coding
implementation and partitions the object for distribution. From the MinIO website, I'll borrow a
diagram to show what it looks like on a single node like mine, to the right.

Anyway, MinIO detects 12 disks and installs an erasure set with 8 data disks and 4 parity disks,
which it calls `EC:4` encoding, also known in the industry as `RS8.4`.
Just like that, the thing shoots to life. Awesome!
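
A quick back-of-the-envelope check of what `EC:4` costs in capacity: with 8 data and 4 parity
shards, a third of the raw 192TB goes to parity, which lines up with the usable total that
`mc admin info` reports further down:

```
$ echo '12 * 16 * 8 / 12' | bc -l       # usable capacity in TB: 8 of every 12 shards carry data
128.00000000000000000000
$ echo '128 * 10^12 / 2^40' | bc -l     # the same, in TiB -- compare the ~116 TiB reported below
116.41532182693481445312
```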

### MinIO Client

On Summer, I'll install the MinIO Client called `mc`. This is easy because the maintainers ship a
Linux binary which I can just download. On OpenBSD, they don't do that. Not a problem though, on
Squanchy, Pencilvester and Glootie, I will just `go install` the client. Using the `mc` commandline,
I can call any of the S3 APIs on my new MinIO instance:

```
pim@summer:~$ set +o history
pim@summer:~$ mc alias set chbtl0 https://s3.chbtl0.ipng.ch/ <rootuser> <rootpass>
pim@summer:~$ set -o history
pim@summer:~$ mc admin info chbtl0/
●  s3.chbtl0.ipng.ch
Uptime: 22 hours
Version: 2025-05-24T17:08:30Z
Network: 1/1 OK
Drives: 12/12 OK
Pool: 1

┌──────┬───────────────────────┬─────────────────────┬──────────────┐
│ Pool │ Drives Usage          │ Erasure stripe size │ Erasure sets │
│ 1st  │ 0.8% (total: 116 TiB) │ 12                  │ 1            │
└──────┴───────────────────────┴─────────────────────┴──────────────┘

95 GiB Used, 5 Buckets, 5,859 Objects, 318 Versions, 1 Delete Marker
12 drives online, 0 drives offline, EC:4

```

Cool beans. I think I should get rid of this root account though. I've installed those credentials
into the `/etc/default/minio` environment file, but I don't want to keep them out in the open. So
I'll make an account for myself and assign me reasonable privileges, called `consoleAdmin` in the
default install:

```
pim@summer:~$ set +o history
pim@summer:~$ mc admin user add chbtl0/ <someuser> <somepass>
pim@summer:~$ mc admin policy info chbtl0 consoleAdmin
pim@summer:~$ mc admin policy attach chbtl0 consoleAdmin --user=<someuser>
pim@summer:~$ mc alias set chbtl0 https://s3.chbtl0.ipng.ch/ <someuser> <somepass>
pim@summer:~$ set -o history
```

OK, I feel less gross now that I'm not operating as root on the MinIO deployment. Using my new
user-powers, let me set some metadata on my new MinIO server:

```
pim@summer:~$ mc admin config set chbtl0/ site name=chbtl0 region=switzerland
Successfully applied new settings.
Please restart your server 'mc admin service restart chbtl0/'.
pim@summer:~$ mc admin service restart chbtl0/
Service status: ▰▰▱ [DONE]
Summary:
┌───────────────┬─────────────────────────────┐
│ Servers:      │ 1 online, 0 offline, 0 hung │
│ Restart Time: │ 61.322886ms                 │
└───────────────┴─────────────────────────────┘
pim@summer:~$ mc admin config get chbtl0/ site
site name=chbtl0 region=switzerland
```

By the way, what's really cool about these open standards is that both the Amazon `aws` client works
with MinIO, but `mc` also works with AWS!
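
For example, pointing the stock `aws` CLI at this deployment is just a matter of overriding the
endpoint (with credentials configured the usual way via `aws configure`; the bucket name here is
just an example):

```
$ aws --endpoint-url https://s3.chbtl0.ipng.ch s3 mb s3://test-bucket
$ aws --endpoint-url https://s3.chbtl0.ipng.ch s3 ls
```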
|
||||||
|
### MinIO Console
|
||||||
|
|
||||||
|
Although I'm pretty good with APIs and command line tools, there's some benefit also in using a
|
||||||
|
Graphical User Interface. MinIO ships with one, but there was a bit of a kerfuffle in the MinIO
|
||||||
|
community. Unfortunately, these are pretty common -- Redis (an open source key/value storage system)
|
||||||
|
changed their offering abruptly. Terraform (an open source infrastructure-as-code tool) changed
|
||||||
|
their licensing at some point. Ansible (an open source machine management tool) changed their
|
||||||
|
offering also. MinIO developers decided to strip their console of ~all features recently. The gnarly
|
||||||
|
bits are discussed on
|
||||||
|
[[reddit](https://www.reddit.com/r/selfhosted/comments/1kva3pw/avoid_minio_developers_introduce_trojan_horse/)].
|
||||||
|
but suffice to say: the same thing that happened in literally 100% of the other cases, also happened
|
||||||
|
here. Somebody decided to simply fork the code from before it was changed.
|
||||||
|
|
||||||
|
Enter OpenMaxIO. A cringe worthy name, but it gets the job done. Reading up on the
|
||||||
|
[[GitHub](https://github.com/OpenMaxIO/openmaxio-object-browser/issues/5)], reviving the fully
|
||||||
|
working console is pretty straight forward -- that is, once somebody spent a few days figuring it
|
||||||
|
out. Thank you `icesvz` for this excellent pointer. With this, I can create a systemd service for
|
||||||
|
the console and start it:
|
||||||
|
|
||||||
|
```
pim@minio0-chbtl0:~$ cat << EOF | sudo tee -a /etc/default/minio
## NOTE(pim): For openmaxio console service
CONSOLE_MINIO_SERVER="http://localhost:9000"
MINIO_BROWSER_REDIRECT_URL="https://cons0-s3.chbtl0.ipng.ch/"
EOF
pim@minio0-chbtl0:~$ cat << EOF | sudo tee /lib/systemd/system/minio-console.service
[Unit]
Description=OpenMaxIO Console Service
Wants=network-online.target
After=network-online.target
AssertFileIsExecutable=/usr/local/bin/minio-console

[Service]
Type=simple

WorkingDirectory=/usr/local

User=minio-user
Group=minio-user
ProtectProc=invisible

EnvironmentFile=-/etc/default/minio
ExecStart=/usr/local/bin/minio-console server
Restart=always
LimitNOFILE=1048576
MemoryAccounting=no
TasksMax=infinity
TimeoutSec=infinity
OOMScoreAdjust=-1000
SendSIGKILL=no

[Install]
WantedBy=multi-user.target
EOF
pim@minio0-chbtl0:~$ sudo systemctl enable --now minio-console
pim@minio0-chbtl0:~$ sudo systemctl restart minio
```

The first snippet is an update to the MinIO configuration that instructs it to redirect users who
are not trying to use the API to the console endpoint on `cons0-s3.chbtl0.ipng.ch`, and then the
console-server needs to know where to find the API, which from its vantage point is running on
`localhost:9000`. Hello, beautiful fully featured console:

{{< image src="/assets/minio/console-1.png" alt="MinIO Console" >}}

### MinIO Prometheus

MinIO ships with a prometheus metrics endpoint, and I notice on its console that it has a nice
metrics tab, which is fully greyed out. This is most likely because, well, I don't have a Prometheus
install here yet. I decide to keep the storage nodes self-contained and start a Prometheus server on
the local machine. I can always plumb that to IPng's Grafana instance later.

For now, I'll install Prometheus as follows:

```
pim@minio0-chbtl0:~$ cat << EOF | sudo tee -a /etc/default/minio
## NOTE(pim): Metrics for minio-console
MINIO_PROMETHEUS_AUTH_TYPE="public"
CONSOLE_PROMETHEUS_URL="http://localhost:19090/"
CONSOLE_PROMETHEUS_JOB_ID="minio-job"
EOF

pim@minio0-chbtl0:~$ sudo apt install prometheus
pim@minio0-chbtl0:~$ cat << EOF | sudo tee /etc/default/prometheus
ARGS="--web.listen-address='[::]:19090' --storage.tsdb.retention.size=16GB"
EOF
pim@minio0-chbtl0:~$ cat << EOF | sudo tee /etc/prometheus/prometheus.yml
global:
  scrape_interval: 60s

scrape_configs:
  - job_name: minio-job
    metrics_path: /minio/v2/metrics/cluster
    static_configs:
      - targets: ['localhost:9000']
        labels:
          cluster: minio0-chbtl0

  - job_name: minio-job-node
    metrics_path: /minio/v2/metrics/node
    static_configs:
      - targets: ['localhost:9000']
        labels:
          cluster: minio0-chbtl0

  - job_name: minio-job-bucket
    metrics_path: /minio/v2/metrics/bucket
    static_configs:
      - targets: ['localhost:9000']
        labels:
          cluster: minio0-chbtl0

  - job_name: minio-job-resource
    metrics_path: /minio/v2/metrics/resource
    static_configs:
      - targets: ['localhost:9000']
        labels:
          cluster: minio0-chbtl0

  - job_name: node
    static_configs:
      - targets: ['localhost:9100']
        labels:
          cluster: minio0-chbtl0
EOF
pim@minio0-chbtl0:~$ sudo systemctl restart minio prometheus
```

In the first snippet, I'll tell MinIO where it should find its Prometheus instance. Since the MinIO
console service is running on port 9090, and this is also the default port for Prometheus, I will
run Prometheus on port 19090 instead. From reading the MinIO docs, I can see that normally MinIO will
want prometheus to authenticate to it before it'll allow the endpoints to be scraped. I'll turn that
off by making these public. On the IPng Frontends, I can always remove access to /minio/v2 and
simply use the IPng Site Local access for local Prometheus scrapers instead.

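As a quick sanity check -- my own addition, not part of the original runbook -- the cluster metrics
endpoint should now answer unauthenticated scrapes locally:

```
pim@minio0-chbtl0:~$ curl -s http://localhost:9000/minio/v2/metrics/cluster | head
```
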
After telling Prometheus its runtime arguments (in `/etc/default/prometheus`) and its scraping
endpoints (in `/etc/prometheus/prometheus.yml`), I can restart minio and prometheus. A few minutes
later, I can see the _Metrics_ tab in the console come to life.

But now that I have this prometheus running on the MinIO node, I can also add it to IPng's Grafana
configuration, by adding a new data source on `minio0.chbtl0.net.ipng.ch:19090` and pointing the
default Grafana [[Dashboard](https://grafana.com/grafana/dashboards/13502-minio-dashboard/)] at it:

{{< image src="/assets/minio/console-2.png" alt="Grafana Dashboard" >}}

A two-for-one: I can see metrics directly in the console, and I can also hook these per-node
prometheus instances into IPng's alertmanager. I've read some
[[docs](https://min.io/docs/minio/linux/operations/monitoring/collect-minio-metrics-using-prometheus.html)]
on the concepts. I'm really liking the experience so far!

### MinIO Nagios

Prometheus is fancy and all, but at IPng Networks, I've been doing monitoring for a while now. As a
dinosaur, I still have an active [[Nagios](https://www.nagios.org/)] install, which autogenerates
all of its configuration using the Ansible repository I have. So for the new Ansible group called
`minio`, I will autogenerate the following snippet:

```
define command {
  command_name    ipng_check_minio
  command_line    $USER1$/check_http -E -H $HOSTALIAS$ -I $ARG1$ -p $ARG2$ -u $ARG3$ -r '$ARG4$'
}

define service {
  hostgroup_name          ipng:minio:ipv6
  service_description     minio6:api
  check_command           ipng_check_minio!$_HOSTADDRESS6$!9000!/minio/health/cluster!
  use                     ipng-service-fast
  notification_interval   0 ; set > 0 if you want to be renotified
}

define service {
  hostgroup_name          ipng:minio:ipv6
  service_description     minio6:prom
  check_command           ipng_check_minio!$_HOSTADDRESS6$!19090!/classic/targets!minio-job
  use                     ipng-service-fast
  notification_interval   0 ; set > 0 if you want to be renotified
}

define service {
  hostgroup_name          ipng:minio:ipv6
  service_description     minio6:console
  check_command           ipng_check_minio!$_HOSTADDRESS6$!9090!/!MinIO Console
  use                     ipng-service-fast
  notification_interval   0 ; set > 0 if you want to be renotified
}
```

I've shown the snippet for IPv6 but I also have three services defined for legacy IP in the
hostgroup `ipng:minio:ipv4`. The check command here uses `-I` which has the IPv4 or IPv6 address to
talk to, `-p` for the port to consult, `-u` for the URI to hit and an option `-r` for a regular
expression to expect in the output. For the Nagios aficionados out there: my Ansible `groups`
correspond one to one with autogenerated Nagios `hostgroups`. This allows me to add arbitrary checks
by group-type, like above in the `ipng:minio` group for IPv4 and IPv6.

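To see what one of these checks boils down to, here's a hedged example of running the plugin by
hand. The plugin path is my assumption and the address is a placeholder; the flags are the ones
from the command definition above:

```
pim@summer:~$ /usr/lib/nagios/plugins/check_http -E -H minio0-chbtl0 \
    -I <v6-address> -p 9000 -u /minio/health/cluster
```
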
In the MinIO [[docs](https://min.io/docs/minio/linux/operations/monitoring/healthcheck-probe.html)]
I read up on the Healthcheck API. I choose to monitor the _Cluster Write Quorum_ on my minio
deployments. For Prometheus, I decide to hit the `targets` endpoint and expect the `minio-job` to be
among them. Finally, for the MinIO Console, I expect to see a login screen with the words `MinIO
Console` in the returned page. I guessed right, because Nagios is all green:

{{< image src="/assets/minio/nagios.png" alt="Nagios Dashboard" >}}

## My First Bucket

The IPng website is a statically generated Hugo site, and whenever I submit a change to my Git
repo, a CI/CD runner (called [[Drone](https://www.drone.io/)]) picks up the change. It re-builds
the static website, and copies it to four redundant NGINX servers.

But IPng's website has amassed quite a few extra files (like VM images and VPP packages that I
publish), which are copied separately using a simple push script I have in my home directory. This
keeps all those big media files from cluttering the Git repository. I decide to move this stuff
into S3:

```
pim@summer:~/src/ipng-web-assets$ echo 'Gruezi World.' > ipng.ch/media/README.md
pim@summer:~/src/ipng-web-assets$ mc mb chbtl0/ipng-web-assets
pim@summer:~/src/ipng-web-assets$ mc mirror . chbtl0/ipng-web-assets/
...ch/media/README.md: 6.50 GiB / 6.50 GiB ┃▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓┃ 236.38 MiB/s 28s
pim@summer:~/src/ipng-web-assets$ mc anonymous set download chbtl0/ipng-web-assets/
```

OK, two things that immediately jump out at me. This stuff is **fast**: Summer is connected with a
2.5GbE network card, and she's running hard, copying the 6.5GB of data that are in these web assets
essentially at line rate. It doesn't really surprise me because Summer is running off of Gen4 NVME,
while MinIO has 12 spinning disks which each can write about 160MB/s or so sustained
[[ref](https://www.seagate.com/www-content/datasheets/pdfs/exos-x16-DS2011-1-1904US-en_US.pdf)],
with 24 CPUs to tend to the NIC (2x10G) and disks (2x SSD, 12x LFF). Should be plenty!

The second is that MinIO allows for buckets to be publicly shared in three ways: 1) read-only by
setting `download`; 2) write-only by setting `upload`, and 3) read-write by setting `public`.
I set `download` here, which means I should be able to fetch an asset now publicly:

```
pim@summer:~$ curl https://s3.chbtl0.ipng.ch/ipng-web-assets/ipng.ch/media/README.md
Gruezi World.
pim@summer:~$ curl https://ipng-web-assets.s3.chbtl0.ipng.ch/ipng.ch/media/README.md
Gruezi World.
```

The first `curl` here shows the path-based access, while the second one shows an equivalent
virtual-host based access. Both retrieve the file I just pushed via the public Internet. Whoot!

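And since the bucket allows anonymous `download`, the `aws` CLI can fetch the same object when
pointed at MinIO. This is my own sketch, not from the original session:

```
pim@summer:~$ aws --no-sign-request --endpoint-url https://s3.chbtl0.ipng.ch \
    s3 cp s3://ipng-web-assets/ipng.ch/media/README.md -
```
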
# What's Next

I'm going to be moving [[Restic](https://restic.net/)] backups from IPng's ZFS storage pool to this
S3 service over the next few days. I'll also migrate PeerTube and possibly Mastodon from NVME based
storage to replicated S3 buckets as well. Finally, the IPng website media that I mentioned above,
should make for a nice followup article. Stay tuned!

---
date: "2025-06-01T10:07:23Z"
title: 'Case Study: Minio S3 - Part 2'
---

{{< image float="right" src="/assets/minio/minio-logo.png" alt="MinIO Logo" width="6em" >}}

# Introduction

Amazon Simple Storage Service (Amazon S3) is an object storage service offering industry-leading
scalability, data availability, security, and performance. Millions of customers of all sizes and
industries store, manage, analyze, and protect any amount of data for virtually any use case, such
as data lakes, cloud-native applications, and mobile apps. With cost-effective storage classes and
easy-to-use management features, you can optimize costs, organize and analyze data, and configure
fine-tuned access controls to meet specific business and compliance requirements.

Amazon's S3 became the _de facto_ standard object storage system, and there exist several fully open
source implementations of the protocol. One of them is MinIO: designed to allow enterprises to
consolidate all of their data on a single, private cloud namespace. Architected using the same
principles as the hyperscalers, AIStor delivers performance at scale at a fraction of the cost
compared to the public cloud.

IPng Networks is an Internet Service Provider, but I also dabble in self-hosting things, for
example [[PeerTube](https://video.ipng.ch/)], [[Mastodon](https://ublog.tech/)],
[[Immich](https://photos.ipng.ch/)], [[Pixelfed](https://pix.ublog.tech/)] and of course
[[Hugo](https://ipng.ch/)]. These services all have one thing in common: they tend to use lots of
storage when they grow. At IPng Networks, all hypervisors ship with enterprise SAS flash drives,
mostly 1.92TB and 3.84TB. Scaling up each of these services, and backing them up safely, can be
quite the headache.

In a [[previous article]({{< ref 2025-05-28-minio-1 >}})], I talked through the install of a
redundant set of three Minio machines. In this article, I'll start putting them to good use.

## Use Case: Restic

{{< image float="right" src="/assets/minio/restic-logo.png" alt="Restic Logo" width="12em" >}}

[[Restic](https://restic.org/)] is a modern backup program that can back up your files from multiple
host OS, to many different storage types, easily, effectively, securely, verifiably and freely. With
a sales pitch like that, what's not to love? Actually, I am a long-time
[[BorgBackup](https://www.borgbackup.org/)] user, and I think I'll keep that running. However, for
resilience, and because I've heard only good things about Restic, I'll make a second backup of the
routers, hypervisors, and virtual machines using Restic.

Restic can use S3 buckets out of the box (incidentally, so can BorgBackup). To configure it, I use
a mixture of environment variables and flags. But first, let me create a bucket for the backups.

```
pim@glootie:~$ mc mb chbtl0/ipng-restic
pim@glootie:~$ mc admin user add chbtl0/ <key> <secret>
pim@glootie:~$ cat << EOF | tee ipng-restic-access.json
{
  "PolicyName": "ipng-restic-access",
  "Policy": {
    "Version": "2012-10-17",
    "Statement": [
      {
        "Effect": "Allow",
        "Action": [ "s3:DeleteObject", "s3:GetObject", "s3:ListBucket", "s3:PutObject" ],
        "Resource": [ "arn:aws:s3:::ipng-restic", "arn:aws:s3:::ipng-restic/*" ]
      }
    ]
  }
}
EOF
pim@glootie:~$ mc admin policy create chbtl0/ ipng-restic-access.json
pim@glootie:~$ mc admin policy attach chbtl0/ ipng-restic-access --user <key>
```

First, I'll create a bucket called `ipng-restic`. Then, I'll create a _user_ with a given secret
_key_. To protect the innocent, and my backups, I'll not disclose them. Next, I'll create an
IAM policy that allows for Get/List/Put/Delete to be performed on the bucket and its contents, and
finally I'll attach this policy to the user I just created.

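To double-check that work -- my own addition -- the policy document and its attachment can be read
back:

```
pim@glootie:~$ mc admin policy info chbtl0/ ipng-restic-access
pim@glootie:~$ mc admin user info chbtl0/ <key>
```
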
To run a Restic backup, I'll first have to create a so-called _repository_. The repository has a
location and a password, which Restic uses to encrypt the data. Because I'm using S3, I'll also need
to specify the key and secret:

```
root@glootie:~# RESTIC_PASSWORD="changeme"
root@glootie:~# RESTIC_REPOSITORY="s3:https://s3.chbtl0.ipng.ch/ipng-restic/$(hostname)/"
root@glootie:~# AWS_ACCESS_KEY_ID="<key>"
root@glootie:~# AWS_SECRET_ACCESS_KEY="<secret>"
root@glootie:~# export RESTIC_PASSWORD RESTIC_REPOSITORY AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY
root@glootie:~# restic init
created restic repository 807cf25e85 at s3:https://s3.chbtl0.ipng.ch/ipng-restic/glootie.ipng.ch/
```

Restic prints out the repository fingerprint of the latest 'snapshot' it just created. Taking a
look on the MinIO install:

```
pim@glootie:~$ mc stat chbtl0/ipng-restic/glootie.ipng.ch/
Name      : config
Date      : 2025-06-01 12:01:43 UTC
Size      : 155 B
ETag      : 661a43f72c43080649712e45da14da3a
Type      : file
Metadata  :
  Content-Type: application/octet-stream

Name      : keys/
Date      : 2025-06-01 12:03:33 UTC
Type      : folder
```

Cool. Now I'm ready to make my first full backup:

```
root@glootie:~# ARGS="--exclude /proc --exclude /sys --exclude /dev --exclude /run"
root@glootie:~# ARGS="$ARGS --exclude-if-present .nobackup"
root@glootie:~# restic backup $ARGS /
...
processed 1141426 files, 131.111 GiB in 15:12
snapshot 34476c74 saved
```

Once the backup completes, the Restic authors advise me to also do a check of the repository, and to
prune it so that it keeps a finite number of daily, weekly and monthly backups. My further journey
for Restic looks a bit like this:

```
root@glootie:~# restic check
using temporary cache in /tmp/restic-check-cache-2712250731
create exclusive lock for repository
load indexes
check all packs
check snapshots, trees and blobs
[0:04] 100.00%  1 / 1 snapshots

no errors were found

root@glootie:~# restic forget --prune --keep-daily 8 --keep-weekly 5 --keep-monthly 6
repository 34476c74 opened (version 2, compression level auto)
Applying Policy: keep 8 daily, 5 weekly, 6 monthly snapshots
keep 1 snapshots:
ID        Time                 Host             Tags        Reasons           Paths
------------------------------------------------------------------------------------
34476c74  2025-06-01 12:18:54  glootie.ipng.ch              daily snapshot    /
                                                            weekly snapshot
                                                            monthly snapshot
------------------------------------------------------------------------------------
1 snapshots
```

Right on! I proceed to update the Ansible configs at IPng to roll this out against the entire fleet
of 152 hosts at IPng Networks. I do this in a little tool called `bitcron`, which I wrote for a
previous company I worked at: [[BIT](https://bit.nl)] in the Netherlands. Bitcron allows me to
create relatively elegant cronjobs that can raise warnings, errors and fatal issues. If no issues
are found, an e-mail can be sent to a bitbucket address, but if warnings or errors are found, a
different _monitored_ address will be used. Bitcron is kind of cool, and I wrote it in 2001. Maybe
I'll write about it, for old time's sake. I wonder if the folks at BIT still use it?

## Use Case: NGINX

{{< image float="right" src="/assets/minio/nginx-logo.png" alt="NGINX Logo" width="11em" >}}

OK, with the first use case out of the way, I turn my attention to a second - in my opinion more
interesting - use case. In the [[previous article]({{< ref 2025-05-28-minio-1 >}})], I created a
public bucket called `ipng-web-assets` in which I stored 6.50GB of website data belonging to the
IPng website, and some material I posted when I was on my
[[Sabbatical](https://sabbatical.ipng.nl/)] last year.

### MinIO: Bucket Replication

First things first: redundancy. These web assets are currently pushed to all four nginx machines,
and statically served. If I were to replace them with a single S3 bucket, I would create a single
point of failure, and that's _no bueno_!

Off I go, creating a replicated bucket using two MinIO instances (`chbtl0` and `ddln0`):

```
pim@glootie:~$ mc mb ddln0/ipng-web-assets
pim@glootie:~$ mc anonymous set download ddln0/ipng-web-assets
pim@glootie:~$ mc admin user add ddln0/ <replkey> <replsecret>
pim@glootie:~$ cat << EOF | tee ipng-web-assets-access.json
{
  "PolicyName": "ipng-web-assets-access",
  "Policy": {
    "Version": "2012-10-17",
    "Statement": [
      {
        "Effect": "Allow",
        "Action": [ "s3:DeleteObject", "s3:GetObject", "s3:ListBucket", "s3:PutObject" ],
        "Resource": [ "arn:aws:s3:::ipng-web-assets", "arn:aws:s3:::ipng-web-assets/*" ]
      }
    ]
  }
}
EOF
pim@glootie:~$ mc admin policy create ddln0/ ipng-web-assets-access.json
pim@glootie:~$ mc admin policy attach ddln0/ ipng-web-assets-access --user <replkey>
pim@glootie:~$ mc replicate add chbtl0/ipng-web-assets \
   --remote-bucket https://<key>:<secret>@s3.ddln0.ipng.ch/ipng-web-assets
```

What happens next is pure magic. I've told `chbtl0` that I want it to replicate all existing and
future changes to that bucket to its neighbor `ddln0`. Only minutes later, I check the replication
status, just to see that it's _already done_:

```
pim@glootie:~$ mc replicate status chbtl0/ipng-web-assets
Replication status since 1 hour
s3.ddln0.ipng.ch
Replicated:    142 objects (6.5 GiB)
Queued:        ● 0 objects, 0 B (avg: 4 objects, 915 MiB ; max: 0 objects, 0 B)
Workers:       0 (avg: 0; max: 0)
Transfer Rate: 15 kB/s (avg: 88 MB/s; max: 719 MB/s)
Latency:       3ms (avg: 3ms; max: 7ms)
Link:          ● online (total downtime: 0 milliseconds)
Errors:        0 in last 1 minute; 0 in last 1hr; 0 since uptime
Configured Max Bandwidth (Bps): 644 GB/s   Current Bandwidth (Bps): 975 B/s
pim@summer:~/src/ipng-web-assets$ mc ls ddln0/ipng-web-assets/
[2025-06-01 12:42:22 CEST]     0B ipng.ch/
[2025-06-01 12:42:22 CEST]     0B sabbatical.ipng.nl/
```

MinIO has pumped the data from bucket `ipng-web-assets` to the other machine at an average of 88MB/s
with a peak throughput of 719MB/s (probably for the larger VM images). And indeed, looking at the
remote machine, it is fully caught up within only a minute or so of the push, holding a completely
fresh copy. Nice!

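The replication rule itself can be read back as well; a small addition of mine for completeness:

```
pim@glootie:~$ mc replicate ls chbtl0/ipng-web-assets
```
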
### MinIO: Missing directory index

I take a look at what I just built, on the following URL:
* [https://ipng-web-assets.s3.ddln0.ipng.ch/sabbatical.ipng.nl/media/vdo/IMG_0406_0.mp4](https://ipng-web-assets.s3.ddln0.ipng.ch/sabbatical.ipng.nl/media/vdo/IMG_0406_0.mp4)

That checks out, and I can see the mess that was my room when I first went on sabbatical. By the
way, I totally cleaned it up, see
[[here](https://sabbatical.ipng.nl/blog/2024/08/01/thursday-basement-done/)] for proof. I can't,
however, see the directory listing:

```
pim@glootie:~$ curl https://ipng-web-assets.s3.ddln0.ipng.ch/sabbatical.ipng.nl/media/vdo/
<?xml version="1.0" encoding="UTF-8"?>
<Error>
  <Code>NoSuchKey</Code>
  <Message>The specified key does not exist.</Message>
  <Key>sabbatical.ipng.nl/media/vdo/</Key>
  <BucketName>ipng-web-assets</BucketName>
  <Resource>/sabbatical.ipng.nl/media/vdo/</Resource>
  <RequestId>1844EC0CFEBF3C5F</RequestId>
  <HostId>dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8</HostId>
</Error>
```

That's unfortunate, because some of the IPng articles link to a directory full of files, which I'd
like to be shown so that my readers can navigate through the directories. Surely I'm not the first
to encounter this? And sure enough, I'm not: there's a handy
[[ref](https://github.com/glowinthedark/index-html-generator)] by user `glowinthedark`, who wrote a
little python script that generates `index.html` files for their Caddy file server. I'll take me
some of that Python, thank you!

With the following little script, my setup is complete:

```
pim@glootie:~/src/ipng-web-assets$ cat push.sh
#!/usr/bin/env bash

echo "Generating index.html files ..."
for D in */media; do
  echo "* Directory $D"
  ./genindex.py -r $D
done
echo "Done (genindex)"
echo ""

echo "Mirroring directory to S3 Bucket"
mc mirror --remove --overwrite . chbtl0/ipng-web-assets/
echo "Done (mc mirror)"
echo ""
pim@glootie:~/src/ipng-web-assets$ ./push.sh
```

Only a few seconds after I run `./push.sh`, the replication is complete and I have two identical
copies of my media:

1. [https://ipng-web-assets.s3.chbtl0.ipng.ch/ipng.ch/media/](https://ipng-web-assets.s3.chbtl0.ipng.ch/ipng.ch/media/index.html)
1. [https://ipng-web-assets.s3.ddln0.ipng.ch/ipng.ch/media/](https://ipng-web-assets.s3.ddln0.ipng.ch/ipng.ch/media/index.html)

### NGINX: Proxy to Minio

Before moving to S3 storage, my NGINX frontends all kept a copy of the IPng media on local NVME
disk. That's great for reliability, as each NGINX instance is completely hermetic and standalone.
However, it's not great for scaling: the current NGINX instances only have 16GB of local storage,
and I'd rather not have my static web asset data outgrow that filesystem. From before, I already had
an NGINX config that served the Hugo static data from `/var/www/ipng.ch/` and the `/media`
subdirectory from a different directory in `/var/www/ipng-web-assets/ipng.ch/media`.

Moving to redundant S3 storage backends is straight forward:

```
upstream minio_ipng {
  least_conn;
  server minio0.chbtl0.net.ipng.ch:9000;
  server minio0.ddln0.net.ipng.ch:9000;
}

server {
  ...
  location / {
    root /var/www/ipng.ch/;
  }

  location /media {
    proxy_set_header Host $http_host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    proxy_connect_timeout 300;
    proxy_http_version 1.1;
    proxy_set_header Connection "";
    chunked_transfer_encoding off;

    rewrite (.*)/$ $1/index.html;

    proxy_pass http://minio_ipng/ipng-web-assets/ipng.ch/media;
  }
}
```

I want to make note of a few things:
1. The `upstream` definition here uses IPng Site Local entrypoints, considering the NGINX servers
   all have direct MTU=9000 access to the MinIO instances. I'll put both in there, in a
   round-robin configuration favoring the replica with _least connections_.
1. Deeplinking to directory names without the trailing `/index.html` would serve a 404 from the
   backend, so I'll intercept these and rewrite directory URLs to always include the `/index.html`.
1. The used upstream endpoint is _path-based_, that is to say it has the bucketname and website name
   included. This whole location used to be simply `root /var/www/ipng-web-assets/ipng.ch/media/`
   so the mental change is quite small.

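As a quick check of the second point -- my own addition, and the URL is just an illustration -- a
directory deeplink should now come back from the upstream as the rewritten `index.html` rather than
a 404:

```
pim@nginx0-nlams1:~$ curl -sI https://ipng.ch/media/ | head -1
```
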
### NGINX: Caching

After deploying the S3 upstream on all IPng websites, I can delete the old
`/var/www/ipng-web-assets/` directory and reclaim about 7GB of diskspace. This gives me an idea ...

{{< image width="8em" float="left" src="/assets/shared/brain.png" alt="brain" >}}

On the one hand it's great that I will pull these assets from Minio and all, but at the same time,
it's a tad inefficient to retrieve them from, say, Zurich to Amsterdam just to serve them onto the
internet again. If at any time something on the IPng website goes viral, it'd be nice to be able to
serve them directly from the edge, right?

A webcache. What could _possibly_ go wrong :)

NGINX is really really good at caching content. It has a powerful engine to store, scan, revalidate
and match any content and upstream headers. It's also very well documented, so I take a look at the
proxy module's documentation [[here](https://nginx.org/en/docs/http/ngx_http_proxy_module.html)] and
in particular a useful [[blog](https://blog.nginx.org/blog/nginx-caching-guide)] on their website.

The first thing I need to do is create what is called a _key zone_, which is a region of memory in
which URL keys are stored with some metadata. Having a copy of the keys in memory enables NGINX to
quickly determine if a request is a HIT or a MISS without having to go to disk, greatly speeding up
the check.

In `/etc/nginx/conf.d/ipng-cache.conf` I add the following NGINX cache:

```
proxy_cache_path /var/www/nginx-cache levels=1:2 keys_zone=ipng_cache:10m max_size=8g
                 inactive=24h use_temp_path=off;
```

With this statement, I'll create a 2-level subdirectory, and allocate 10MB of space, which should
hold on the order of 100K entries. The maximum size I'll allow the cache to grow to is 8GB, and I'll
mark any object inactive if it's not been referenced for 24 hours. I learn that inactive is
different to expired content. If a cache element has expired, but NGINX can't reach the upstream
for a new copy, it can be configured to serve an inactive (stale) copy from the cache. That's dope,
as it serves as an extra layer of defence in case the network or all available S3 replicas take the
day off. I'll ask NGINX to avoid writing objects first to a tmp directory and then moving them into
the `/var/www/nginx-cache` directory. These are recommendations I grab from the manual.

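Later on, a couple of one-liners -- mine, not from the original -- make it easy to watch the cache
fill up on disk:

```
pim@nginx0-nlams1:~$ sudo du -sh /var/www/nginx-cache
pim@nginx0-nlams1:~$ sudo find /var/www/nginx-cache -type f | wc -l
```
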
Within the `location` block I configured above, I'm now ready to enable this cache. I'll do that by
adding a few include files, which I'll reference in all sites that I want to have make use of this
cache:

First, to enable the cache, I write the following snippet:
```
pim@nginx0-nlams1:~$ cat /etc/nginx/conf.d/ipng-cache.inc
proxy_cache ipng_cache;
proxy_ignore_headers Cache-Control;
proxy_cache_valid any 1h;
proxy_cache_revalidate on;
proxy_cache_use_stale error timeout updating http_500 http_502 http_503 http_504;
proxy_cache_background_update on;
```

Then, I find it useful to emit a few debugging HTTP headers, and at the same time I see that Minio
emits a bunch of HTTP headers that may not be safe for me to propagate, so I pen two more snippets:

```
pim@nginx0-nlams1:~$ cat /etc/nginx/conf.d/ipng-strip-minio-headers.inc
proxy_hide_header x-minio-deployment-id;
proxy_hide_header x-amz-request-id;
proxy_hide_header x-amz-id-2;
proxy_hide_header x-amz-replication-status;
proxy_hide_header x-amz-version-id;

pim@nginx0-nlams1:~$ cat /etc/nginx/conf.d/ipng-add-upstream-headers.inc
add_header X-IPng-Frontend $hostname always;
add_header X-IPng-Upstream $upstream_addr always;
add_header X-IPng-Upstream-Status $upstream_status always;
add_header X-IPng-Cache-Status $upstream_cache_status;
```

With that, I am ready to enable caching of the IPng `/media` location:

```
location /media {
  ...
  include /etc/nginx/conf.d/ipng-strip-minio-headers.inc;
  include /etc/nginx/conf.d/ipng-add-upstream-headers.inc;
  include /etc/nginx/conf.d/ipng-cache.inc;
  ...
}
```

## Results

I run the Ansible playbook for the NGINX cluster and take a look at the replica at Coloclue in
Amsterdam, called `nginx0.nlams1.ipng.ch`. Notably, it'll have to retrieve the file from a MinIO
replica in Zurich (12ms away), so it's expected to take a little while.

The first attempt:

```
pim@nginx0-nlams1:~$ curl -v -o /dev/null --connect-to ipng.ch:443:localhost:443 \
    https://ipng.ch/media/vpp-proto/vpp-proto-bookworm.qcow2.lrz
...
< last-modified: Sun, 01 Jun 2025 12:37:52 GMT
< x-ipng-frontend: nginx0-nlams1
< x-ipng-cache-status: MISS
< x-ipng-upstream: [2001:678:d78:503::b]:9000
< x-ipng-upstream-status: 200

100  711M  100  711M    0     0  26.2M      0  0:00:27  0:00:27 --:--:-- 26.6M
```

OK, that's respectable, I've read the file at 26MB/s. Of course I just turned on the cache, so
NGINX fetches the file from Zurich while handing it over to my `curl` here. It notifies me by means
of an HTTP header that the cache was a `MISS`, and then which upstream server it contacted to
retrieve the object.

But look at what happens the _second_ time I run the same command:

```
pim@nginx0-nlams1:~$ curl -v -o /dev/null --connect-to ipng.ch:443:localhost:443 \
    https://ipng.ch/media/vpp-proto/vpp-proto-bookworm.qcow2.lrz
< last-modified: Sun, 01 Jun 2025 12:37:52 GMT
< x-ipng-frontend: nginx0-nlams1
< x-ipng-cache-status: HIT

100  711M  100  711M    0     0   436M      0  0:00:01  0:00:01 --:--:--  437M
```

Holy moly! First I see the object has the same _Last-Modified_ header, but I now also see that the
_Cache-Status_ was a `HIT`, and there is no mention of any upstream server. I do however see the
file come in at a whopping 437MB/s which is 16x faster than over the network!! Nice work, NGINX!

{{< image float="right" src="/assets/minio/rack-2.png" alt="Rack-o-Minio" width="12em" >}}

# What's Next

I'm going to deploy the third MinIO replica in Rümlang once the disks arrive. I'll release the
~4TB of disk used currently in Restic backups for the fleet, and put that ZFS capacity to other use.
Now, creating services like PeerTube, Mastodon, Pixelfed, Loops, NextCloud and what-have-you, will
become much easier for me. And with the per-bucket replication between MinIO deployments, I also
think this is a great way to auto-backup important data. First off, it'll be RS8.4 on the MinIO node
itself, and secondly, user data will be copied automatically to a neighboring facility.

I've convinced myself that S3 storage is a great service to operate, and that MinIO is awesome.

---
date: "2025-07-12T08:07:23Z"
title: 'VPP and eVPN/VxLAN - Part 1'
---

{{< image width="6em" float="right" src="/assets/vpp/fdio-color.svg" alt="VPP" >}}

# Introduction

You know what would be really cool? If VPP could be an eVPN/VxLAN speaker! Sometimes I feel like I'm
the very last on the planet to learn about something cool. My latest "A-Ha!"-moment was when I was
configuring the eVPN fabric for [[Frys-IX](https://frys-ix.net/)], and I wrote up an article about
it [[here]({{< ref 2025-04-09-frysix-evpn >}})] back in April.

I can build the equivalent of Virtual Private Wires (VPWS), also called L2VPN or Virtual Leased
Lines, and these are straight forward because they typically only have two endpoints. A "regular"
VxLAN tunnel which is L2 cross connected with another interface already does that just fine. Take a
look at an article on [[L2 Gymnastics]({{< ref 2022-01-12-vpp-l2 >}})] for that. But the real kicker
is that I can also create multi-site L2 domains like Virtual Private LAN Services (VPLS), also
called Virtual Private Ethernet, L2VPN or Ethernet LAN Service (E-LAN). And *that* is a whole other
level of awesome.

## Recap: VPP today

### VPP: VxLAN

The current VPP VxLAN tunnel plugin does point to point tunnels, that is they are configured with a
source address, destination address, destination port and VNI. As I mentioned, a point to point
ethernet transport is configured very easily:

```
vpp0# create vxlan tunnel src 192.0.2.1 dst 192.0.2.254 vni 8298 instance 0
vpp0# set int l2 xconnect vxlan_tunnel0 HundredGigabitEthernet10/0/0
vpp0# set int l2 xconnect HundredGigabitEthernet10/0/0 vxlan_tunnel0
vpp0# set int state vxlan_tunnel0 up
vpp0# set int state HundredGigabitEthernet10/0/0 up

vpp1# create vxlan tunnel src 192.0.2.254 dst 192.0.2.1 vni 8298 instance 0
vpp1# set int l2 xconnect vxlan_tunnel0 HundredGigabitEthernet10/0/1
vpp1# set int l2 xconnect HundredGigabitEthernet10/0/1 vxlan_tunnel0
vpp1# set int state vxlan_tunnel0 up
vpp1# set int state HundredGigabitEthernet10/0/1 up
```

And with that, `vpp0:Hu10/0/0` is cross connected with `vpp1:Hu10/0/1` and ethernet flows between
the two.

### VPP: Bridge Domains

Now consider a VPLS with five different routers. While it's possible to create a bridge-domain and add
some local ports and four other VxLAN tunnels:

```
vpp0# create bridge-domain 8298
vpp0# set int l2 bridge HundredGigabitEthernet10/0/1 8298
vpp0# create vxlan tunnel src 192.0.2.1 dst 192.0.2.2 vni 8298 instance 0
vpp0# create vxlan tunnel src 192.0.2.1 dst 192.0.2.3 vni 8298 instance 1
vpp0# create vxlan tunnel src 192.0.2.1 dst 192.0.2.4 vni 8298 instance 2
vpp0# create vxlan tunnel src 192.0.2.1 dst 192.0.2.5 vni 8298 instance 3
vpp0# set int l2 bridge vxlan_tunnel0 8298
vpp0# set int l2 bridge vxlan_tunnel1 8298
vpp0# set int l2 bridge vxlan_tunnel2 8298
vpp0# set int l2 bridge vxlan_tunnel3 8298
```

To make this work, I will have to replicate this configuration to all other `vpp1`-`vpp4` routers.
While it does work, it's really not very practical. When other VPP instances get added to a VPLS,
every other router will have to have a new VxLAN tunnel created and added to its local bridge
domain. Consider 1000s of VPLS instances on 100s of routers: that would yield ~100'000 VxLAN tunnels
on every router. Yikes!

Such a configuration reminds me in a way of iBGP in a large network: the naive approach is to have a
full mesh of all routers speaking to all other routers, but that quickly becomes a maintenance
headache. The canonical solution for this is to create iBGP _Route Reflectors_ to which every router
connects, and their job is to redistribute routing information between the fleet of routers. This
turns the iBGP problem from an O(N^2) to an O(N) problem: all 1'000 routers connect to, say, three
regional route reflectors for a total of 3'000 BGP connections, which is much better than ~1'000'000
BGP connections in the naive approach.

## Recap: eVPN Moving parts

The reason why I got so enthusiastic when I was playing with Arista and Nokia's eVPN stuff, is
because it requires very little dataplane configuration, and a relatively intuitive controlplane
configuration:

1. **Dataplane**: For each L2 broadcast domain (be it a L2XC or a Bridge Domain), really all I
   need is a single VxLAN interface with a given VNI, which should be able to send encapsulated
   ethernet frames to one or more other speakers in the same domain.
1. **Controlplane**: I will need to learn MAC addresses locally, and inform some BGP eVPN
   implementation of who-lives-where. Other VxLAN speakers learn of the MAC addresses I own, and
   will send me encapsulated ethernet for those addresses.
1. **Dataplane**: For unknown layer2 destinations, like _Broadcast_, _Unknown Unicast_, and
   _Multicast_ (BUM) traffic, I will want to keep track of which other VxLAN speakers these
   packets should be flooded to. I make note that this is not that different to flooding the packets
   to local interfaces, except here it'd be flooding them to remote VxLAN endpoints.
1. **ControlPlane**: Flooding L2 traffic across wide area networks is typically considered icky,
   so a few tricks might be optionally deployed. Since the controlplane already knows which MAC
   lives where, it may as well also make note of any local IPv4 ARP and IPv6 neighbor discovery
   replies and teach its peers which IPv4/IPv6 addresses live where: a distributed neighbor table.

{{< image width="6em" float="left" src="/assets/shared/brain.png" alt="brain" >}}

For the controlplane parts, [[FRRouting](https://frrouting.org/)] has a working implementation for
L2 (MAC-VRF) and L3 (IP-VRF). My favorite, [[Bird](https://bird.nic.cz/)], is slowly catching up, and
has a few of these controlplane parts already working (mostly MAC-VRF). Commercial vendors like Arista,
Nokia, Juniper, Cisco are ready to go. If we want VPP to inter-operate, we may need to make a few
changes.

## VPP: Changes needed

### Dynamic VxLAN

I propose two changes to the VxLAN plugin, or perhaps, a new plugin that changes the behavior so that
we don't have to break any performance or functional promises to existing users. This new VxLAN
interface behavior changes in the following ways:

1. Each VxLAN interface has a local L2FIB attached to it, where the keys are MAC addresses and the
   values are remote VTEPs. In its simplest form, the values would be just IPv4 or IPv6 addresses,
   because I can re-use the VNI and port information from the tunnel definition itself.

1. Each VxLAN interface has a local flood-list attached to it. This list contains remote VTEPs
   that I am supposed to send 'flood' packets to. Similar to the Bridge Domain, when packets are marked
   for flooding, I will need to prepare and replicate them, sending them to each VTEP.

A set of APIs will be needed to manipulate these:
* ***Interface***: I will need to have an interface create, delete and list call, which will
  be able to maintain the interfaces, their metadata like source address, source/destination port,
  VNI and such.
* ***L2FIB***: I will need to add, replace, delete, and list which MAC addresses go where.
  With such a table, each time a packet is handled for a given Dynamic VxLAN interface, the
  dst_addr can be written into the packet.
* ***Flooding***: For those packets that are not unicast (BUM), I will need to be able to add,
  remove and list which VTEPs should receive this packet.

It would be pretty dope if the configuration looked something like this:
```
vpp# create evpn-vxlan src <v46address> dst-port <port> vni <vni> instance <id>
vpp# evpn-vxlan l2fib <iface> mac <mac> dst <v46address> [del]
vpp# evpn-vxlan flood <iface> dst <v46address> [del]
```

The VxLAN underlay transport can be either IPv4 or IPv6. Of course manipulating L2FIB or Flood
destinations must match the address family of an interface of type evpn-vxlan. A practical example
might be:

```
vpp# create evpn-vxlan src 2001:db8::1 dst-port 4789 vni 8298 instance 6
vpp# evpn-vxlan l2fib evpn-vxlan0 mac 00:01:02:82:98:02 dst 2001:db8::2
vpp# evpn-vxlan l2fib evpn-vxlan0 mac 00:01:02:82:98:03 dst 2001:db8::3
vpp# evpn-vxlan flood evpn-vxlan0 dst 2001:db8::2
vpp# evpn-vxlan flood evpn-vxlan0 dst 2001:db8::3
vpp# evpn-vxlan flood evpn-vxlan0 dst 2001:db8::4
```

By the way, while this _could_ be a new plugin, it could also just be added to the existing VxLAN
plugin. One way in which I might do this when creating a normal vxlan tunnel is to allow for its
destination address to be either 0.0.0.0 for IPv4 or :: for IPv6. That would signal 'dynamic'
tunneling, upon which the L2FIB and Flood lists are used. It would slow down each VxLAN packet by
the time it takes to call `ip46_address_is_zero()` which is only a handful of clocks.

### Bridge Domain

{{< image width="6em" float="left" src="/assets/shared/warning.png" alt="Warning" >}}

It's important to understand that L2 learning is **required** for eVPN to function. Each router
needs to be able to tell the iBGP eVPN session which MAC addresses should be forwarded to it. This
rules out the simple case of L2XC because there, no learning is performed. The corollary is that a
bridge-domain is required for any form of eVPN.

The L2 code in VPP already does most of what I'd need. It maintains an L2FIB in `vnet/l2/l2_fib.c`,
which is keyed by bridge-id and MAC address, and its values are a 64 bit structure that points
essentially to a `sw_if_index` output interface. The L2FIB of the eVPN needs a bit more information
though, notably an `ip46address` struct to know which VTEP to send to. It's tempting to add this
extra data to the bridge domain code. I would recommend against it, because other implementations,
for example MPLS, GENEVE or Carrier Pigeon IP may need more than just the destination address. Even
the VxLAN implementation I'm thinking about might want to be able to override other things like the
destination port for a given VTEP, or even the VNI. Putting all of this stuff in the bridge-domain
code will just clutter it, for all users, not just those users who might want eVPN.

Similarly, one might argue it is tempting to re-use/extend the behavior in `vnet/l2/l2_flood.c`,
because if it's already replicating BUM traffic, why not replicate it many times over the flood list
for any member interface that happens to be a dynamic VxLAN interface? This would be a bad idea
for a few reasons. Firstly, it is not guaranteed that the VxLAN plugin is loaded, and in
doing this, I would leak internal details of VxLAN into the bridge-domain code. Secondly, the
`l2_flood.c` code would potentially get messy if other types were added (like the MPLS and GENEVE
above).

A reasonable request is to mark such BUM frames once in the existing L2 code and when handing the
replicated packet into the VxLAN node, to see the `is_bum` marker and once again replicate -- in the
vxlan plugin -- these packets to the VTEPs in our local flood-list. Although a bit more work, this
approach only requires a tiny amount of work in the `l2_flood.c` code (the marking), and will keep
all the logic tucked away where it is relevant, derisking the VPP vnet codebase.

Fundamentally, I think the cleanest design is to keep the dynamic VxLAN interface fully
self-contained and it would therefore maintain its own L2FIB and Flooding logic. The only thing I
would add to the L2 codebase is some form of BUM marker to allow for efficient flooding.

### Control Plane

There's a few things the control plane has to do. Some external agent, like FRR or Bird, will be
receiving a few types of eVPN messages. The ones I'm interested in are:

* ***Type 2***: MAC/IP Advertisement Route
  - On the way in, these should be fed to the VxLAN L2FIB belonging to the bridge-domain.
  - On the way out, learned addresses should be advertised to peers.
  - Regarding IPv4/IPv6 addresses, that is the ARP / ND tables: we can talk about those later.
* ***Type 3***: Inclusive Multicast Ethernet Tag Route
  - On the way in, these will populate the VxLAN Flood list belonging to the bridge-domain.
  - On the way out, each bridge-domain should advertise itself as IMET to peers.
* ***Type 5***: IP Prefix Route
  - Similar to IP information in Type 2, we can talk about those later once L3VPN/eVPN is
    needed.

The 'on the way in' stuff can be easily done with my proposed APIs in the Dynamic VxLAN (or a new
eVPN VxLAN) plugin. Adding, removing, listing L2FIB and Flood lists is easy as far as VPP is
concerned. It's just that the controlplane implementation needs to somehow _feed_ the API, so an
external program may be needed, or alternatively the Linux Control Plane netlink plugin might be used
to consume this information.

The 'on the way out' stuff is a bit trickier. I will need to listen to creation of new broadcast
domains and associate them with the right IMET announcements, and for each MAC address learned, pick
them up and advertise them into eVPN. Later, if ever ARP and ND proxying becomes important, I'll
have to revisit the bridge-domain feature to do IPv4 ARP and IPv6 Neighbor Discovery, and replace it
with some code that populates the IPv4/IPv6 parts of the Type2 messages on the way out, and
similarly on the way in, populates an L3 neighbor cache for the bridge domain, so ARP and ND replies
can be synthesized based on what we've learned in eVPN.

# Demonstration

### VPP: Current VxLAN

I'll build a small demo environment on Summer to show how the interaction of VxLAN and Bridge
Domain works today:

```
vpp# create tap host-if-name dummy0 host-mtu-size 9216 host-ip4-addr 192.0.2.1/24
vpp# set int state tap0 up
vpp# set int ip address tap0 192.0.2.1/24
vpp# set ip neighbor tap0 192.0.2.254 01:02:03:82:98:fe static
vpp# set ip neighbor tap0 192.0.2.2 01:02:03:82:98:02 static
vpp# set ip neighbor tap0 192.0.2.3 01:02:03:82:98:03 static

vpp# create vxlan tunnel src 192.0.2.1 dst 192.0.2.254 vni 8298
vpp# set int state vxlan_tunnel0 up

vpp# create tap host-if-name vpptap0 host-mtu-size 9216 hw-addr 02:fe:64:dc:1b:82
vpp# set int state tap1 up

vpp# create bridge-domain 8298
vpp# set int l2 bridge tap1 8298
vpp# set int l2 bridge vxlan_tunnel0 8298
```

I've created a tap device called `dummy0` and given it an IPv4 address. Normally, I would use some
DPDK or RDMA interface like `TenGigabitEthernet10/0/0`. Then I'll populate some static ARP entries.
Again, normally this would just be 'use normal routing'. However, for the purposes of this
demonstration, it helps to use a TAP device, as any packets I make VPP send to those 192.0.2.254 and
so on, can be captured with `tcpdump` in Linux in addition to `trace add` in VPP.

Then, I create a VxLAN tunnel with a default destination of 192.0.2.254 and the given VNI.
Next, I create a TAP interface called `vpptap0` with the given MAC address.
Finally, I bind these two interfaces together in a bridge-domain.

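As an aside, and purely to illustrate the `trace add` mention above: packet traces can also be
driven from the Linux side with `vppctl`. This is a hedged sketch of mine; the input node name
`virtio-input` is my assumption for a TAP interface and may differ on your build:

```
pim@summer:~$ sudo vppctl trace add virtio-input 8
pim@summer:~$ sudo vppctl show trace
pim@summer:~$ sudo vppctl clear trace
```
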
I proceed to write a small ScaPY program:

```python
#!/usr/bin/env python3

from scapy.all import Ether, IP, UDP, Raw, sendp

pkt = Ether(dst="01:02:03:04:05:02", src="02:fe:64:dc:1b:82", type=0x0800) \
    / IP(src="192.168.1.1", dst="192.168.1.2") \
    / UDP(sport=8298, dport=7) / Raw(load=b"ping")
print(pkt)
sendp(pkt, iface="vpptap0")

pkt = Ether(dst="01:02:03:04:05:03", src="02:fe:64:dc:1b:82", type=0x0800) \
    / IP(src="192.168.1.1", dst="192.168.1.3") \
    / UDP(sport=8298, dport=7) / Raw(load=b"ping")
print(pkt)
sendp(pkt, iface="vpptap0")
```

What will happen is, the ScaPY program will emit these frames into device `vpptap0` which is in
bridge-domain 8298. The bridge will learn our src MAC `02:fe:64:dc:1b:82`, and look up the dst MAC
`01:02:03:04:05:02`, and because there hasn't been traffic yet, it'll flood to all member ports, one
of which is the VxLAN tunnel. VxLAN will then encapsulate the packets to the other side of the
tunnel.

```
pim@summer:~$ sudo ./vxlan-test.py
Ether / IP / UDP 192.168.1.1:8298 > 192.168.1.2:echo / Raw
Ether / IP / UDP 192.168.1.1:8298 > 192.168.1.3:echo / Raw

pim@summer:~$ sudo tcpdump -evni dummy0
10:50:35.310620 02:fe:72:52:38:53 > 01:02:03:82:98:fe, ethertype IPv4 (0x0800), length 96:
    (tos 0x0, ttl 253, id 0, offset 0, flags [none], proto UDP (17), length 82)
    192.0.2.1.6345 > 192.0.2.254.4789: VXLAN, flags [I] (0x08), vni 8298
    02:fe:64:dc:1b:82 > 01:02:03:04:05:02, ethertype IPv4 (0x0800), length 46:
    (tos 0x0, ttl 64, id 1, offset 0, flags [none], proto UDP (17), length 32)
    192.168.1.1.8298 > 192.168.1.2.7: UDP, length 4
10:50:35.362552 02:fe:72:52:38:53 > 01:02:03:82:98:fe, ethertype IPv4 (0x0800), length 96:
    (tos 0x0, ttl 253, id 0, offset 0, flags [none], proto UDP (17), length 82)
    192.0.2.1.23916 > 192.0.2.254.4789: VXLAN, flags [I] (0x08), vni 8298
    02:fe:64:dc:1b:82 > 01:02:03:04:05:03, ethertype IPv4 (0x0800), length 46:
    (tos 0x0, ttl 64, id 1, offset 0, flags [none], proto UDP (17), length 32)
    192.168.1.1.8298 > 192.168.1.3.7: UDP, length 4
```

I want to point out that nothing, so far, is special. All of this works with upstream VPP just fine.
|
||||||
|
I can see two VxLAN encapsulated packets, both destined to `192.0.2.254:4789`. Cool.
|
||||||
|
|
||||||
|
### Dynamic VPP VxLAN
|
||||||
|
|
||||||
|
I wrote a prototype for a Dynamic VxLAN tunnel in [[43433](https://gerrit.fd.io/r/c/vpp/+/43433)].
|
||||||
|
The good news is, this works. The bad news is, I think I'll want to discuss my proposal (this
|
||||||
|
article) with the community before going further down a potential rabbit hole.
|
||||||
|
|
||||||
|
With my gerrit patched in, I can do the following:
|
||||||
|
|
||||||
|
```
vpp# vxlan l2fib vxlan_tunnel0 mac 01:02:03:04:05:02 dst 192.0.2.2
Added VXLAN dynamic destination for 01:02:03:04:05:02 on vxlan_tunnel0 dst 192.0.2.2
vpp# vxlan l2fib vxlan_tunnel0 mac 01:02:03:04:05:03 dst 192.0.2.3
Added VXLAN dynamic destination for 01:02:03:04:05:03 on vxlan_tunnel0 dst 192.0.2.3

vpp# show vxlan l2fib
VXLAN Dynamic L2FIB entries:
 MAC                Interface      Destination     Port   VNI
 01:02:03:04:05:02  vxlan_tunnel0  192.0.2.2       4789   8298
 01:02:03:04:05:03  vxlan_tunnel0  192.0.2.3       4789   8298
Dynamic L2FIB entries: 2
```

I've instructed the VxLAN tunnel to change the tunnel destination based on the destination MAC.

I run the script and tcpdump again:

```
pim@summer:~$ sudo tcpdump -evni dummy0
11:16:53.834619 02:fe:fe:ae:0d:a3 > 01:02:03:82:98:fe, ethertype IPv4 (0x0800), length 96:
(tos 0x0, ttl 253, id 0, offset 0, flags [none], proto UDP (17), length 82, bad cksum 3945 (->3997)!)
192.0.2.1.6345 > 192.0.2.2.4789: VXLAN, flags [I] (0x08), vni 8298
02:fe:64:dc:1b:82 > 01:02:03:04:05:02, ethertype IPv4 (0x0800), length 46:
(tos 0x0, ttl 64, id 1, offset 0, flags [none], proto UDP (17), length 32)
192.168.1.1.8298 > 192.168.1.2.7: UDP, length 4
11:16:53.882554 02:fe:fe:ae:0d:a3 > 01:02:03:82:98:fe, ethertype IPv4 (0x0800), length 96:
(tos 0x0, ttl 253, id 0, offset 0, flags [none], proto UDP (17), length 82, bad cksum 3944 (->3996)!)
192.0.2.1.23916 > 192.0.2.3.4789: VXLAN, flags [I] (0x08), vni 8298
02:fe:64:dc:1b:82 > 01:02:03:04:05:03, ethertype IPv4 (0x0800), length 46:
(tos 0x0, ttl 64, id 1, offset 0, flags [none], proto UDP (17), length 32)
192.168.1.1.8298 > 192.168.1.3.7: UDP, length 4
```

Two important notes: Firstly, this works! For the MAC address ending in `:02`, the packet is now sent to
`192.0.2.2` instead of the default of `192.0.2.254`. The same holds for the `:03` MAC, which now goes to
`192.0.2.3`. Nice! But secondly, the IPv4 header of the VxLAN packets was changed, so there needs to
be a call to `ip4_header_checksum()` inserted somewhere. That's an easy fix.

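To make that concrete, here is a small standalone Python sketch (my own illustration, not VPP code) of the ones'-complement header checksum (RFC 1071) that such a call recomputes once the underlay destination address has been rewritten. The header fields below are taken from the tcpdump output above, and the result matches the `-> 3997` value that tcpdump says it expected:

```python
# Illustration only: recompute the IPv4 header checksum after the VxLAN
# underlay destination has been rewritten to 192.0.2.2.
def ipv4_checksum(header: bytes) -> int:
    total = 0
    for i in range(0, len(header), 2):
        total += (header[i] << 8) | header[i + 1]   # sum all 16-bit words
    while total >> 16:
        total = (total & 0xFFFF) + (total >> 16)    # fold the carries back in
    return ~total & 0xFFFF

# ver/ihl+tos, len=82, id=0, flags/frag=0, ttl=253 proto=17, cksum=0, src, dst
hdr = bytes.fromhex("4500" "0052" "0000" "0000" "fd11" "0000" "c0000201" "c0000202")
print(hex(ipv4_checksum(hdr)))  # 0x3997
```
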
# What's next

I want to discuss a few things, perhaps at an upcoming VPP Community meeting. Notably:
1. Is the VPP Developer community supportive of adding eVPN support? Does anybody want to help write it with me?
1. Is changing the existing VxLAN plugin appropriate, or should I make a new plugin which adds dynamic endpoints, L2FIB and Flood lists for BUM traffic?
1. Is it acceptable for me to add a BUM marker in `l2_flood.c` so that I can reuse all the logic from bridge-domain flooding as I extend to also do VTEP flooding?
1. (perhaps later) VxLAN is the canonical underlay, but is there an appetite to extend also to, say, GENEVE or MPLS?
1. (perhaps later) What's a good way to tie in a controlplane like FRRouting or Bird2 into the dataplane (perhaps using a sidecar controller, or perhaps using Linux CP Netlink messages)?

@@ -0,0 +1,701 @@
---
date: "2025-07-26T22:07:23Z"
title: 'Certificate Transparency - Part 1 - TesseraCT'
aliases:
- /s/articles/2025/07/26/certificate-transparency-part-1/
---

{{< image width="10em" float="right" src="/assets/ctlog/ctlog-logo-ipng.png" alt="ctlog logo" >}}

# Introduction

There once was a Dutch company called [[DigiNotar](https://en.wikipedia.org/wiki/DigiNotar)]; as the
name suggests, it was a form of _digital notary_, and it was in the business of issuing security
certificates. Unfortunately, in June of 2011, their IT infrastructure was compromised and
subsequently it issued hundreds of fraudulent SSL certificates, some of which were used for
man-in-the-middle attacks on Iranian Gmail users. Not cool.

Google launched a project called **Certificate Transparency**, because it was becoming clear
that the root of trust given to _Certification Authorities_ could no longer be unilaterally trusted.
These attacks showed that the lack of transparency in the way CAs operated was a significant risk to
the Web Public Key Infrastructure. It led to the creation of this ambitious
[[project](https://certificate.transparency.dev/)] to improve security online by bringing
accountability to the system that protects our online services with _SSL_ (Secure Socket Layer)
and _TLS_ (Transport Layer Security).

In 2013, [[RFC 6962](https://datatracker.ietf.org/doc/html/rfc6962)] was published by the IETF. It
describes an experimental protocol for publicly logging the existence of Transport Layer Security
(TLS) certificates as they are issued or observed, in a manner that allows anyone to audit
certificate authority (CA) activity and notice the issuance of suspect certificates, as well as to
audit the certificate logs themselves. The intent is that eventually clients would refuse to honor
certificates that do not appear in a log, effectively forcing CAs to add all issued certificates to
the logs.

This series explores and documents how IPng Networks will be running two Static CT _Logs_ with two
different implementations. One will be [[Sunlight](https://sunlight.dev/)], and the other will be
[[TesseraCT](https://github.com/transparency-dev/tesseract)].

## Static Certificate Transparency

In this context, _Logs_ are network services that implement the protocol operations for submissions
and queries that are defined in a specification that builds on the previous RFC. A few years ago,
my buddy Antonis asked me if I would be willing to run a log, but operationally they were very
complex and expensive to run. However, over the years, the concept of _Static Logs_ has put running
one within reach. This [[Static CT API](https://github.com/C2SP/C2SP/blob/main/static-ct-api.md)] defines a
read-path HTTP static asset hierarchy (for monitoring) to be implemented alongside the write-path
RFC 6962 endpoints (for submission).

Aside from the different read endpoints, a log that implements the Static API is a regular CT log
that can work alongside RFC 6962 logs and that fulfills the same purpose. In particular, it requires
no modification to submitters and TLS clients.

If you only read one document about Static CT, read Filippo Valsorda's excellent
[[paper](https://filippo.io/a-different-CT-log)]. It describes a radically cheaper and easier to
operate [[Certificate Transparency](https://certificate.transparency.dev/)] log that is backed by
consistent object storage, and can scale to 30x the current issuance rate for 2-10% of the costs
with no merge delay.

## Scalable, Cheap, Reliable: choose two

{{< image width="18em" float="right" src="/assets/ctlog/MPLS Backbone - CTLog.svg" alt="ctlog at ipng" >}}

In the diagram, I've drawn an overview of IPng's network. In {{< boldcolor color="red" >}}red{{< /boldcolor >}},
a European backbone network is provided by a [[BGP Free Core
network]({{< ref 2022-12-09-oem-switch-2 >}})]. It operates a private IPv4, IPv6, and MPLS network, called
_IPng Site Local_, which is not connected to the internet. On top of that, IPng offers L2 and L3
services, for example using [[VPP]({{< ref 2021-02-27-network >}})].

In {{< boldcolor color="lightgreen" >}}green{{< /boldcolor >}} I built a cluster of replicated
NGINX frontends. They connect into _IPng Site Local_ and can reach all hypervisors, VMs, and storage
systems. They also connect to the Internet with a single IPv4 and IPv6 address. One might say that
SSL is _added and removed here :-)_ [[ref](/assets/ctlog/nsa_slide.jpg)].

Then in {{< boldcolor color="orange" >}}orange{{< /boldcolor >}} I built a set of [[MinIO]({{< ref
2025-05-28-minio-1 >}})] S3 storage pools. Amongst others, I serve the static content from the IPng
website from these pools, providing fancy redundancy and caching. I wrote about its design in [[this
article]({{< ref 2025-06-01-minio-2 >}})].

Finally, I turn my attention to the {{< boldcolor color="blue" >}}blue{{< /boldcolor >}}, which is
two hypervisors, one run by [[IPng](https://ipng.ch/)] and the other by [[Massar](https://massars.net/)]. Each
of them will be running one of the _Log_ implementations. IPng provides two large ZFS storage tanks
for offsite backup, in case a hypervisor decides to check out, and daily backups to an S3 bucket
using Restic.

Having explained all of this, I am well aware that end-to-end reliability will be coming from the
fact that there are many independent _Log_ operators, and folks wanting to validate certificates can
simply monitor many. If there is a gap in coverage, say due to any given _Log_'s downtime, this will
not necessarily be problematic. It does mean that I may have to suppress the SRE in me...

## MinIO

My first instinct is to leverage the distributed storage IPng has, but as I'll show in the rest of
this article, maybe a simpler, more elegant design could be superior, precisely because individual
log reliability is not _as important_ as having many available log _instances_ to choose from.

From operators in the field I understand that the world-wide generation of certificates is roughly
17M/day, which amounts to some 200-250qps of writes. Antonis explains that certs with a validity
of 180 days or less will need two CT log entries, while certs with a validity of more than 180 days
will need three CT log entries. So the write rate is roughly 2.2x that, as an upper bound.

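A quick back-of-the-envelope check of those numbers:

```python
# Rough estimate of the expected CT log write rate (assumption: ~17M certs/day
# world-wide and ~2.2 log entries per certificate on average).
certs_per_day = 17_000_000
issuance_qps = certs_per_day / 86_400   # ~197 submissions/second
write_qps = issuance_qps * 2.2          # ~433 log writes/second as an upper bound
print(f"{issuance_qps:.0f} qps issuance -> ~{write_qps:.0f} qps of log writes")
```
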
My first thought is to see how fast my open source S3 machines can go, really. I'm curious also as
to the difference between SSD and spinning disks.

I boot two Dell R630s in the Lab. These machines have two Xeon E5-2640 v4 CPUs for a total of 20
cores and 40 threads, and 512GB of DDR4 memory. They also sport a SAS controller. In one machine I
place 6pcs 1.2TB SAS3 disks (HPE part number EG1200JEHMC), and in the second machine I place 6pcs
of 1.92TB enterprise storage (Samsung part number P1633N19).

I spin up a 6-device MinIO cluster on both and take them out for a spin using [[S3
Benchmark](https://github.com/wasabi-tech/s3-benchmark.git)] from Wasabi Tech.

```
pim@ctlog-test:~/src/s3-benchmark$ for dev in disk ssd; do \
    for t in 1 8 32; do \
      for z in 4M 1M 8k 4k; do \
        ./s3-benchmark -a $KEY -s $SECRET -u http://minio-$dev:9000 -t $t -z $z \
          | tee -a minio-results.txt; \
      done; \
    done; \
  done
```

The loadtest above does a bunch of runs with varying parameters: for both the disk-based and the
SSD-based machine, it runs with 1, 8 and 32 threads, and for each thread count it reads and writes
object sizes of 4MB, 1MB, 8kB and 4kB. The loadtest runs from a third machine, so that the Dell R630
disk tanks can stay completely dedicated to their task of running MinIO.

{{< image width="100%" src="/assets/ctlog/minio_8kb_performance.png" alt="MinIO 8kb disk vs SSD" >}}

The left-hand side graph feels pretty natural to me. With one thread, uploading 8kB objects will
quickly hit the IOPS rate of the disks, each of which has to participate in the write due to EC:3
encoding when using six disks, and it tops out at ~56 PUT/s. The single thread hitting SSDs will not
hit that limit, and has ~371 PUT/s, which I found a bit underwhelming. But, when performing the
loadtest with either 8 or 32 write threads, the hard disks become only marginally faster (topping
out at 240 PUT/s), while the SSDs really start to shine, with 3850 PUT/s. Pretty good performance.

On the read-side, I am pleasantly surprised that there's not really that much of a difference
between disks and SSDs. This is likely because the host filesystem cache is playing a large role, so
the 1-thread performance is equivalent (765 GET/s for disks, 677 GET/s for SSDs), and the 32-thread
performance is also equivalent (at 7624 GET/s for disks with 7261 GET/s for SSDs). I do wonder why
the hard disks consistently outperform the SSDs with all the other variables (OS, MinIO version,
hardware) the same.

## Sidequest: SeaweedFS

Something that has long caught my attention is the way in which
[[SeaweedFS](https://github.com/seaweedfs/seaweedfs)] approaches blob storage. Many operators report
great success with small-file writes in SeaweedFS compared to MinIO and even AWS S3 storage.
This is because writes in SeaweedFS are not broken into erasure-sets, which would require every disk
to write a small part or checksum of the data; rather, files are replicated within the cluster in
their entirety on different disks, racks or datacenters. I won't bore you with the details of
SeaweedFS, but I'll tack on a docker [[compose file](/assets/ctlog/seaweedfs.docker-compose.yml)]
that I used at the end of this article, if you're curious.

{{< image width="100%" src="/assets/ctlog/size_comparison_8t.png" alt="MinIO vs SeaWeedFS" >}}

In the write-path, SeaweedFS dominates in all cases, due to its different way of achieving durable
storage (per-file replication in SeaweedFS versus all-disk erasure-sets in MinIO):
* 4k: 3,384 ops/sec vs MinIO's 111 ops/sec (30x faster!)
* 8k: 3,332 ops/sec vs MinIO's 111 ops/sec (30x faster!)
* 1M: 383 ops/sec vs MinIO's 44 ops/sec (9x faster)
* 4M: 104 ops/sec vs MinIO's 32 ops/sec (4x faster)

In the read-path, MinIO is better at small objects in GET operations, and really dominates at
large objects:
* 4k: 7,411 ops/sec vs SeaweedFS 5,014 ops/sec
* 8k: 7,666 ops/sec vs SeaweedFS 5,165 ops/sec
* 1M: 5,466 ops/sec vs SeaweedFS 2,212 ops/sec
* 4M: 3,084 ops/sec vs SeaweedFS 646 ops/sec

This makes me draw an interesting conclusion: seeing as CT Logs are read/write heavy (every couple
of seconds, the Merkle tree is recomputed, which is reasonably disk-intensive), SeaweedFS might be a
slightly better choice. IPng Networks has three MinIO deployments, but no SeaweedFS deployments. Yet.

# Tessera

[[Tessera](https://github.com/transparency-dev/tessera.git)] is a Go library for building tile-based
transparency logs (tlogs) [[ref](https://github.com/C2SP/C2SP/blob/main/tlog-tiles.md)]. It is the
logical successor to the approach that Google took when building and operating _Logs_ using its
predecessor called [[Trillian](https://github.com/google/trillian)]. The implementation and its APIs
bake in current best practices based on the lessons learned over the past decade of building and
operating transparency logs in production environments and at scale.

Tessera was introduced at the Transparency.Dev summit in October 2024. I first watched Al and Martin
[[introduce](https://www.youtube.com/watch?v=9j_8FbQ9qSc)] it at last year's summit. At a high
level, it wraps what used to be a whole Kubernetes cluster full of components into a single library
that can be used with cloud services, such as AWS S3 with an RDS database, or GCP's GCS storage
with a Spanner database. However, Google also made it easy to use a regular POSIX filesystem
implementation.

## TesseraCT

{{< image width="10em" float="right" src="/assets/ctlog/tesseract-logo.png" alt="tesseract logo" >}}

While Tessera is a library, a CT log implementation comes from its sibling GitHub repository called
[[TesseraCT](https://github.com/transparency-dev/tesseract)]. Because it leverages Tessera under the
hood, TesseraCT can run on GCP, on AWS, on POSIX-compliant filesystems, or on S3-compatible systems
alongside a MySQL database. In order to provide ecosystem agility and to control the growth of CT Log
sizes, new CT Logs must be temporally sharded, defining a certificate expiry range denoted in the form
of two dates: `[rangeBegin, rangeEnd)`. The certificate expiry range allows a Log to reject otherwise
valid logging submissions for certificates that expire before or after this defined range, thus
partitioning the set of publicly-trusted certificates that each Log will accept. I will be expected
to keep logs for an extended period of time, say 3-5 years.

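To illustrate that `[rangeBegin, rangeEnd)` acceptance rule, here is a tiny sketch of my own (the shard dates are made up; this is not TesseraCT code):

```python
from datetime import datetime, timezone

# Hypothetical temporal shard covering the first half of 2025.
range_begin = datetime(2025, 1, 1, tzinfo=timezone.utc)  # inclusive
range_end = datetime(2025, 7, 1, tzinfo=timezone.utc)    # exclusive

def shard_accepts(not_after: datetime) -> bool:
    """The shard only accepts certs whose NotAfter falls in [rangeBegin, rangeEnd)."""
    return range_begin <= not_after < range_end

print(shard_accepts(datetime(2025, 3, 14, tzinfo=timezone.utc)))  # True
print(shard_accepts(datetime(2025, 7, 1, tzinfo=timezone.utc)))   # False: rangeEnd is exclusive
```
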
It's time for me to figure out what this TesseraCT thing can do .. are you ready? Let's go!

### TesseraCT: S3 and SQL

TesseraCT comes with a few so-called _personalities_. These are implementations of the underlying
storage infrastructure in an opinionated way. The first personality I look at is the `aws` one in
`cmd/tesseract/aws`. I notice that this personality does make hard assumptions about the use of AWS,
which is unfortunate as the documentation says '.. or self-hosted S3 and MySQL database'. However,
the `aws` personality assumes the AWS Secrets Manager in order to fetch its signing key. Before I
can be successful, I need to untangle that.

#### TesseraCT: AWS and Local Signer

First, I change `cmd/tesseract/aws/main.go` to add two new flags:

* ***-signer_public_key_file***: a path to the public key for checkpoints and SCT signer
* ***-signer_private_key_file***: a path to the private key for checkpoints and SCT signer

I then change the program to assume that, if these flags are both set, the user wants a
_NewLocalSigner_ instead of a _NewSecretsManagerSigner_. Now all I have to do is implement the
signer interface in a file `local_signer.go`. There, the function _NewLocalSigner()_ will read the
public and private PEM from file, decode them, and create an _ECDSAWithSHA256Signer_ with them. A
simple example to show what I mean:

```
// NewLocalSigner creates a new signer that uses the ECDSA P-256 key pair from
// local disk files for signing digests.
func NewLocalSigner(publicKeyFile, privateKeyFile string) (*ECDSAWithSHA256Signer, error) {
    // Read public key
    publicKeyPEM, err := os.ReadFile(publicKeyFile)
    publicPemBlock, rest := pem.Decode(publicKeyPEM)

    var publicKey crypto.PublicKey
    publicKey, err = x509.ParsePKIXPublicKey(publicPemBlock.Bytes)
    ecdsaPublicKey, ok := publicKey.(*ecdsa.PublicKey)

    // Read private key
    privateKeyPEM, err := os.ReadFile(privateKeyFile)
    privatePemBlock, rest := pem.Decode(privateKeyPEM)

    var ecdsaPrivateKey *ecdsa.PrivateKey
    ecdsaPrivateKey, err = x509.ParseECPrivateKey(privatePemBlock.Bytes)

    // Verify the correctness of the signer key pair
    if !ecdsaPrivateKey.PublicKey.Equal(ecdsaPublicKey) {
        return nil, errors.New("signer key pair doesn't match")
    }

    return &ECDSAWithSHA256Signer{
        publicKey:  ecdsaPublicKey,
        privateKey: ecdsaPrivateKey,
    }, nil
}
```

In the snippet above I omitted all of the error handling, but the local signer logic itself is
hopefully clear. And with that, I am liberated from Amazon's Cloud offering and can run this thing
all by myself!

#### TesseraCT: Running with S3, MySQL, and Local Signer

First, I need to create a suitable ECDSA key:
```
pim@ctlog-test:~$ openssl ecparam -name prime256v1 -genkey -noout -out /tmp/private_key.pem
pim@ctlog-test:~$ openssl ec -in /tmp/private_key.pem -pubout -out /tmp/public_key.pem
```

Then, I'll install the MySQL server and create the databases:

```
pim@ctlog-test:~$ sudo apt install default-mysql-server
pim@ctlog-test:~$ sudo mysql -u root

CREATE USER 'tesseract'@'localhost' IDENTIFIED BY '<db_passwd>';
CREATE DATABASE tesseract;
CREATE DATABASE tesseract_antispam;
GRANT ALL PRIVILEGES ON tesseract.* TO 'tesseract'@'localhost';
GRANT ALL PRIVILEGES ON tesseract_antispam.* TO 'tesseract'@'localhost';
```

Finally, I use the SSD MinIO lab-machine that I just loadtested to create an S3 bucket.

```
pim@ctlog-test:~$ mc mb minio-ssd/tesseract-test
pim@ctlog-test:~$ cat << EOF > /tmp/minio-access.json
{ "Version": "2012-10-17", "Statement": [ {
    "Effect": "Allow",
    "Action": [ "s3:ListBucket", "s3:PutObject", "s3:GetObject", "s3:DeleteObject" ],
    "Resource": [ "arn:aws:s3:::tesseract-test/*", "arn:aws:s3:::tesseract-test" ]
  } ]
}
EOF
pim@ctlog-test:~$ mc admin user add minio-ssd <user> <secret>
pim@ctlog-test:~$ mc admin policy create minio-ssd tesseract-test-access /tmp/minio-access.json
pim@ctlog-test:~$ mc admin policy attach minio-ssd tesseract-test-access --user <user>
pim@ctlog-test:~$ mc anonymous set public minio-ssd/tesseract-test
```

{{< image width="6em" float="left" src="/assets/shared/brain.png" alt="brain" >}}

After some fiddling, I understand that the AWS software development kit makes some assumptions that
you'll be using .. _quelle surprise_ .. AWS services. But you can also use local S3 services by
setting a few key environment variables. I had heard of the S3 access and secret key environment
variables before, but I now need to also use a different S3 endpoint. That little detour into the
codebase only took me .. several hours.

Armed with that knowledge, I can build and finally start my TesseraCT instance:
```
pim@ctlog-test:~/src/tesseract/cmd/tesseract/aws$ go build -o ~/aws .
pim@ctlog-test:~$ export AWS_DEFAULT_REGION="us-east-1"
pim@ctlog-test:~$ export AWS_ACCESS_KEY_ID="<user>"
pim@ctlog-test:~$ export AWS_SECRET_ACCESS_KEY="<secret>"
pim@ctlog-test:~$ export AWS_ENDPOINT_URL_S3="http://minio-ssd.lab.ipng.ch:9000/"
pim@ctlog-test:~$ ./aws --http_endpoint='[::]:6962' \
    --origin=ctlog-test.lab.ipng.ch/test-ecdsa \
    --bucket=tesseract-test \
    --db_host=ctlog-test.lab.ipng.ch \
    --db_user=tesseract \
    --db_password=<db_passwd> \
    --db_name=tesseract \
    --antispam_db_name=tesseract_antispam \
    --signer_public_key_file=/tmp/public_key.pem \
    --signer_private_key_file=/tmp/private_key.pem \
    --roots_pem_file=internal/hammer/testdata/test_root_ca_cert.pem

I0727 15:13:04.666056 337461 main.go:128] **** CT HTTP Server Starting ****
```

Hah! I think most of the command line flags and environment variables should make sense, but I was
struggling for a while with the `--roots_pem_file` and the `--origin` flags, so I phoned a friend
(Al Cutter, Googler extraordinaire and an expert in Tessera/CT). He explained to me that the Log is
actually an open endpoint to which anybody might POST data. However, to avoid folks abusing the log
infrastructure, each POST is expected to come from one of the certificate authorities listed in the
`--roots_pem_file`. OK, that makes sense.

Then, the `--origin` flag designates how my log calls itself. The resulting `checkpoint` file
enumerates a hash of the latest merged and published Merkle tree. In case a server serves
multiple logs, the `--origin` flag distinguishes which checkpoint belongs to which log.

```
pim@ctlog-test:~/src/tesseract$ curl http://tesseract-test.minio-ssd.lab.ipng.ch:9000/checkpoint
ctlog-test.lab.ipng.ch/test-ecdsa
0
JGPitKWWI0aGuCfC2k1n/p9xdWAYPm5RZPNDXkCEVUU=

— ctlog-test.lab.ipng.ch/test-ecdsa L+IHdQAAAZhMCONUBAMARjBEAiA/nc9dig6U//vPg7SoTHjt9bxP5K+x3w4MYKpIRn4ULQIgUY5zijRK8qyuJGvZaItDEmP1gohCt+wI+sESBnhkuqo=
```

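A `checkpoint` is a signed note: the first line is the `--origin`, the second line is the tree size, the third is the base64-encoded root hash, and after a blank line come one or more signature lines. A small parsing sketch of my own (not part of TesseraCT), using the body shown above:

```python
import base64

def parse_checkpoint(text: str):
    """Split a checkpoint note into origin, tree size and root hash (signatures ignored)."""
    body, _, _signatures = text.partition("\n\n")
    origin, size, root_hash = body.splitlines()[:3]
    return origin, int(size), base64.b64decode(root_hash)

example = """ctlog-test.lab.ipng.ch/test-ecdsa
0
JGPitKWWI0aGuCfC2k1n/p9xdWAYPm5RZPNDXkCEVUU=

— ctlog-test.lab.ipng.ch/test-ecdsa <signature elided>
"""
origin, size, root = parse_checkpoint(example)
print(origin, size, root.hex())
```
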
When creating the bucket above, I used `mc anonymous set public`, which made the S3 bucket
world-readable. I can now execute the whole read-path simply by hitting the S3 service. Check.

#### TesseraCT: Loadtesting S3/MySQL

{{< image width="12em" float="right" src="/assets/ctlog/stop-hammer-time.jpg" alt="Stop, hammer time" >}}

The write path is a server on `[::]:6962`. I should be able to write log entries to it, but how? Here's
where I am grateful to find a tool in the TesseraCT GitHub repository called `hammer`. This hammer
sets up read and write traffic to a Static CT API log to test correctness and performance under
load. The traffic is sent according to the [[Static CT API](https://c2sp.org/static-ct-api)] spec.
Slick!

The tool starts a text-based UI (my favorite! also when using the Cisco T-Rex loadtester) in the terminal
that shows the current status and logs, and supports increasing/decreasing read and write traffic. This
TUI allows for a level of interactivity when probing a new configuration of a log in order to find
any cliffs where performance degrades. For real load-testing applications, especially headless runs
as part of a CI pipeline, it is recommended to run the tool with `-show_ui=false` in order to disable
the UI.

I'm a bit lost in the somewhat terse
[[README.md](https://github.com/transparency-dev/tesseract/tree/main/internal/hammer)], but my buddy
Al comes to my rescue and explains the flags to me. First of all, the loadtester wants to hit the
same `--origin` that I configured the write-path to accept. In my case this is
`ctlog-test.lab.ipng.ch/test-ecdsa`. Then, it needs the public key for that _Log_, which I can find
in `/tmp/public_key.pem`. The text there is the _DER_ (Distinguished Encoding Rules) encoding of the
key, stored as a base64 encoded string. What follows next was the most difficult for me to understand,
as I was thinking the hammer would read some log from the internet somewhere and replay it locally. Al
explains that actually, the `hammer` tool synthetically creates all of these entries itself, and it
regularly reads the `checkpoint` from the `--log_url` location, while it writes its certificates to
`--write_log_url`. The last few flags just inform the `hammer` how many read and write ops/sec it
should generate, and with that explanation my brain plays _tadaa.wav_ and I am ready to go.

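As an aside: the value passed to `--log_public_key` below is just the body of `/tmp/public_key.pem` with its PEM armor removed. A hypothetical helper (my own, not part of the hammer) that does the same thing:

```python
def pem_to_base64_der(path: str) -> str:
    """Strip the -----BEGIN/END----- armor lines, leaving the base64-encoded DER."""
    with open(path) as f:
        return "".join(line.strip() for line in f if not line.startswith("-----"))

print(pem_to_base64_der("/tmp/public_key.pem"))
```
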
```
pim@ctlog-test:~/src/tesseract$ go run ./internal/hammer \
    --origin=ctlog-test.lab.ipng.ch/test-ecdsa \
    --log_public_key=MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEucHtDWe9GYNicPnuGWbEX8rJg/VnDcXs8z40KdoNidBKy6/ZXw2u+NW1XAUnGpXcZozxufsgOMhijsWb25r7jw== \
    --log_url=http://tesseract-test.minio-ssd.lab.ipng.ch:9000/ \
    --write_log_url=http://localhost:6962/ctlog-test.lab.ipng.ch/test-ecdsa/ \
    --max_read_ops=0 \
    --num_writers=5000 \
    --max_write_ops=100
```

{{< image width="30em" float="right" src="/assets/ctlog/ctlog-loadtest1.png" alt="S3/MySQL Loadtest 100qps" >}}

Cool! It seems that the loadtest is happily chugging along at 100qps. The log is consuming them in
the HTTP write-path by accepting POST requests to
`/ctlog-test.lab.ipng.ch/test-ecdsa/ct/v1/add-chain`, where hammer is offering them at a rate of
100qps, with a configured probability of duplicates set at 10%. What that means is that every now
and again, it'll repeat a previous request. The purpose of this is to stress test the so-called
`antispam` implementation. When `hammer` sends its requests, it signs them with a certificate that
was issued by the CA described in `internal/hammer/testdata/test_root_ca_cert.pem`, which is why
TesseraCT accepts them.

I raise the write load by using the '>' key a few times. I notice things are great at 500qps, which
is nice because that's double what we expect to need. But I start seeing a bit more noise at 600qps.
When I raise the write-rate to 1000qps, all hell breaks loose in the logs of the server (and similar
logs in the `hammer` loadtester):

```
W0727 15:54:33.419881 348475 handlers.go:168] ctlog-test.lab.ipng.ch/test-ecdsa: AddChain handler error: couldn't store the leaf: failed to fetch entry bundle at index 0: failed to fetch resource: getObject: failed to create reader for object "tile/data/000" in bucket "tesseract-test": operation error S3: GetObject, context deadline exceeded
W0727 15:55:02.727962 348475 aws.go:345] GarbageCollect failed: failed to delete one or more objects: failed to delete objects: operation error S3: DeleteObjects, https response error StatusCode: 400, RequestID: 1856202CA3C4B83F, HostID: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8, api error MalformedXML: The XML you provided was not well-formed or did not validate against our published schema.
E0727 15:55:10.448973 348475 append_lifecycle.go:293] followerStats: follower "AWS antispam" EntriesProcessed(): failed to read follow coordination info: Error 1040: Too many connections
```

I see on the MinIO instance that it's doing about 150/s of GETs and 15/s of PUTs, which is totally
reasonable:

```
pim@ctlog-test:~/src/tesseract$ mc admin trace --stats ssd
Duration: 6m9s ▰▱▱
RX Rate:↑ 34 MiB/m
TX Rate:↓ 2.3 GiB/m
RPM : 10588.1
-------------
Call                      Count          RPM     Avg Time  Min Time  Max Time  Avg TTFB  Max TTFB  Avg Size     Rate /min
s3.GetObject              60558 (92.9%)  9837.2  4.3ms     708µs     48.1ms    3.9ms     47.8ms    ↑144B ↓246K  ↑1.4M ↓2.3G
s3.PutObject              2199 (3.4%)    357.2   5.3ms     2.4ms     32.7ms    5.3ms     32.7ms    ↑92K         ↑32M
s3.DeleteMultipleObjects  1212 (1.9%)    196.9   877µs     290µs     41.1ms    850µs     41.1ms    ↑230B ↓369B  ↑44K ↓71K
s3.ListObjectsV2          1212 (1.9%)    196.9   18.4ms    999µs     52.8ms    18.3ms    52.7ms    ↑131B ↓261B  ↑25K ↓50K
```

Another nice way to see what makes it through is this oneliner, which reads the `checkpoint` every
second, and once it changes, shows the delta in seconds and how many certs were written:

```
pim@ctlog-test:~/src/tesseract$ T=0; O=0; while :; do \
    N=$(curl -sS http://tesseract-test.minio-ssd.lab.ipng.ch:9000/checkpoint | grep -E '^[0-9]+$'); \
    if [ "$N" -eq "$O" ]; then \
      echo -n .; \
    else \
      echo " $T seconds $((N-O)) certs"; O=$N; T=0; echo -n $N\ ; \
    fi; \
    T=$((T+1)); sleep 1; done
1012905 .... 5 seconds 2081 certs
1014986 .... 5 seconds 2126 certs
1017112 .... 5 seconds 1913 certs
1019025 .... 5 seconds 2588 certs
1021613 .... 5 seconds 2591 certs
1024204 .... 5 seconds 2197 certs
```

So I can see that the checkpoint is refreshed every 5 seconds and between 1913 and 2591 certs are
written each time. And indeed, at 400/s there are no errors or warnings at all. At this write rate,
TesseraCT is using about 2.9 CPUs/s, with MariaDB using 0.3 CPUs/s, but the hammer is using 6.0
CPUs/s. Overall, the machine is perfectly happy serving for a few hours under this load test.

***Conclusion: a write-rate of 400/s should be safe with S3+MySQL***

### TesseraCT: POSIX

I have been playing with this idea of having a reliable read-path by having the S3 cluster be
redundant, or by replicating the S3 bucket. But Al asks: why not use our experimental POSIX backend?
We discuss two very important benefits, but also two drawbacks:

* On the plus side:
  1. There is no need for S3 storage, reading/writing to a local ZFS raidz2 pool instead.
  1. There is no need for MySQL, as the POSIX implementation can use a local badger instance, also on the local filesystem.
* On the drawbacks:
  1. There is a SPOF in the read-path, as the single VM must handle both. The write-path always has a SPOF on the TesseraCT VM.
  1. Local storage is more expensive than S3 storage, and can be used only for the purposes of one application (and at best, shared with other VMs on the same hypervisor).

Come to think of it, this is maybe not such a bad tradeoff. I do kind of like having a single VM
with a single binary and no other moving parts. It greatly simplifies the architecture, and for the
read-path I can (and will) still use multiple upstream NGINX machines in IPng's network.

I consider myself nerd-sniped, and take a look at the POSIX variant. I have a few SAS3
solid state drives (NetAPP part number X447_S1633800AMD), which I plug into the `ctlog-test`
machine.

```
pim@ctlog-test:~$ sudo zpool create -o ashift=12 -o autotrim=on ssd-vol0 mirror \
    /dev/disk/by-id/wwn-0x5002538a0???????
pim@ctlog-test:~$ sudo zfs create ssd-vol0/tesseract-test
pim@ctlog-test:~$ sudo chown pim:pim /ssd-vol0/tesseract-test
pim@ctlog-test:~/src/tesseract$ go run ./cmd/experimental/posix --http_endpoint='[::]:6962' \
    --origin=ctlog-test.lab.ipng.ch/test-ecdsa \
    --private_key=/tmp/private_key.pem \
    --storage_dir=/ssd-vol0/tesseract-test \
    --roots_pem_file=internal/hammer/testdata/test_root_ca_cert.pem
badger 2025/07/27 16:29:15 INFO: All 0 tables opened in 0s
badger 2025/07/27 16:29:15 INFO: Discard stats nextEmptySlot: 0
badger 2025/07/27 16:29:15 INFO: Set nextTxnTs to 0
I0727 16:29:15.032845 363156 files.go:502] Initializing directory for POSIX log at "/ssd-vol0/tesseract-test" (this should only happen ONCE per log!)
I0727 16:29:15.034101 363156 main.go:97] **** CT HTTP Server Starting ****

pim@ctlog-test:~/src/tesseract$ cat /ssd-vol0/tesseract-test/checkpoint
ctlog-test.lab.ipng.ch/test-ecdsa
0
47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=

— ctlog-test.lab.ipng.ch/test-ecdsa L+IHdQAAAZhMSgC8BAMARzBFAiBjT5zdkniKlryqlUlx/gLHOtVK26zuWwrc4BlyTVzCWgIhAJ0GIrlrP7YGzRaHjzdB5tnS5rpP3LeOsPbpLateaiFc
```

Alright, I can see the log started and created an empty checkpoint file. Nice!

Before I can loadtest it, I will need to get the read-path to become visible. The `hammer` can read
a checkpoint from local `file:///` prefixes, but I'll have to serve them over the network eventually
anyway, so I create the following NGINX config for it:

```
server {
    listen 80 default_server backlog=4096;
    listen [::]:80 default_server backlog=4096;
    root /ssd-vol0/tesseract-test/;
    index index.html index.htm index.nginx-debian.html;

    server_name _;

    access_log /var/log/nginx/access.log combined buffer=512k flush=5s;

    location / {
        try_files $uri $uri/ =404;
        tcp_nopush on;
        sendfile on;
        tcp_nodelay on;
        keepalive_timeout 65;
        keepalive_requests 1000;
    }
}
```

Just a couple of small thoughts on this configuration. I'm using buffered access logs, to avoid
excessive disk writes in the read-path. Then, I'm using kernel `sendfile()`, which will instruct the
kernel to serve the static objects directly, so that NGINX can move on. Further, I'll allow for a
long keepalive in HTTP/1.1, so that future requests can use the same TCP connection, and I'll set
the flags `tcp_nodelay` and `tcp_nopush` to just blast the data out without waiting.

Without much ado:

```
pim@ctlog-test:~/src/tesseract$ curl -sS ctlog-test.lab.ipng.ch/checkpoint
ctlog-test.lab.ipng.ch/test-ecdsa
0
47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=

— ctlog-test.lab.ipng.ch/test-ecdsa L+IHdQAAAZhMTfksBAMASDBGAiEAqADLH0P/SRVloF6G1ezlWG3Exf+sTzPIY5u6VjAKLqACIQCkJO2N0dZQuDHvkbnzL8Hd91oyU41bVqfD3vs5EwUouA==
```

#### TesseraCT: Loadtesting POSIX

The loadtesting is roughly the same. I start the `hammer` with the same 500qps of write rate, which
was roughly where the S3+MySQL variant topped out. My checkpoint tracker shows the following:

```
pim@ctlog-test:~/src/tesseract$ T=0; O=0; while :; do \
    N=$(curl -sS http://localhost/checkpoint | grep -E '^[0-9]+$'); \
    if [ "$N" -eq "$O" ]; then \
      echo -n .; \
    else \
      echo " $T seconds $((N-O)) certs"; O=$N; T=0; echo -n $N\ ; \
    fi; \
    T=$((T+1)); sleep 1; done
59250 ......... 10 seconds 5244 certs
64494 ......... 10 seconds 5000 certs
69494 ......... 10 seconds 5000 certs
74494 ......... 10 seconds 5000 certs
79494 ......... 10 seconds 5256 certs
79494 ......... 10 seconds 5256 certs
84750 ......... 10 seconds 5244 certs
89994 ......... 10 seconds 5256 certs
95250 ......... 10 seconds 5000 certs
100250 ......... 10 seconds 5000 certs
105250 ......... 10 seconds 5000 certs
```

I learn two things. First, the checkpoint interval in this `posix` variant is 10 seconds, compared
to the 5 seconds of the `aws` variant I tested before. I dive into the code, because there doesn't
seem to be a `--checkpoint_interval` flag. In the `tessera` library, I find
`DefaultCheckpointInterval` which is set to 10 seconds. I change it to be 2 seconds instead, and
restart the `posix` binary:

```
238250 . 2 seconds 1000 certs
239250 . 2 seconds 1000 certs
240250 . 2 seconds 1000 certs
241250 . 2 seconds 1000 certs
242250 . 2 seconds 1000 certs
243250 . 2 seconds 1000 certs
244250 . 2 seconds 1000 certs
```

{{< image width="30em" float="right" src="/assets/ctlog/ctlog-loadtest2.png" alt="Posix Loadtest 5000qps" >}}

Very nice! Maybe I can write a few more certs? I restart the `hammer` with 5000/s, which, somewhat to
my surprise, ends up serving!

```
642608 . 2 seconds 6155 certs
648763 . 2 seconds 10256 certs
659019 . 2 seconds 9237 certs
668256 . 2 seconds 8800 certs
677056 . 2 seconds 8729 certs
685785 . 2 seconds 8237 certs
694022 . 2 seconds 7487 certs
701509 . 2 seconds 8572 certs
710081 . 2 seconds 7413 certs
```

The throughput is highly variable though, seemingly between 3700/sec and 5100/sec, and I quickly
find out that the `hammer` is completely saturating the CPU on the machine, leaving very little room
for the `posix` TesseraCT to serve. I'm going to need more machines!

So I start a `hammer` loadtester on the two now-idle MinIO servers, and run them at about 6000qps
**each**, for a total of 12000 certs/sec. And my little `posix` binary is keeping up like a champ:

```
2987169 . 2 seconds 23040 certs
3010209 . 2 seconds 23040 certs
3033249 . 2 seconds 21760 certs
3055009 . 2 seconds 21504 certs
3076513 . 2 seconds 23808 certs
3100321 . 2 seconds 22528 certs
```

One thing is reasonably clear: the `posix` TesseraCT is CPU bound, not disk bound. The CPU is now
running at about 18.5 CPUs/s (with 20 cores), which is pretty much all this Dell has to offer. The
NetAPP enterprise solid state drives are not impressed:

```
pim@ctlog-test:~/src/tesseract$ zpool iostat -v ssd-vol0 10 100
                              capacity     operations     bandwidth
pool                        alloc   free   read  write   read  write
--------------------------  -----  -----  -----  -----  -----  -----
ssd-vol0                    11.4G   733G      0  3.13K      0   117M
  mirror-0                  11.4G   733G      0  3.13K      0   117M
    wwn-0x5002538a05302930      -      -      0  1.04K      0  39.1M
    wwn-0x5002538a053069f0      -      -      0  1.06K      0  39.1M
    wwn-0x5002538a06313ed0      -      -      0  1.02K      0  39.1M
--------------------------  -----  -----  -----  -----  -----  -----

pim@ctlog-test:~/src/tesseract$ zpool iostat -l ssd-vol0 10
            capacity     operations     bandwidth    total_wait     disk_wait    syncq_wait    asyncq_wait  scrub   trim
pool        alloc   free   read  write   read  write   read  write   read  write   read  write   read  write   wait   wait
----------  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----
ssd-vol0    14.0G   730G      0  1.48K      0  35.4M      -    2ms      -  535us      -    1us      -    3ms      -   50ms
ssd-vol0    14.0G   730G      0  1.12K      0  23.0M      -    1ms      -  733us      -    2us      -    1ms      -   44ms
ssd-vol0    14.1G   730G      0  1.42K      0  45.3M      -  508us      -  122us      -  914ns      -    2ms      -   41ms
ssd-vol0    14.2G   730G      0    678      0  21.0M      -  863us      -  144us      -    2us      -    2ms      -      -
```

## Results

OK, that kind of seals the deal for me. The write path needs about 250 certs/sec and I'm hammering
now with 12'000 certs/sec, with room to spare. But what about the read path? The cool thing about
the static log is that reads are all entirely done by NGINX. The only file that isn't cacheable is
the `checkpoint` file which gets updated every two seconds (or ten seconds in the default `tessera`
settings).

So I start yet another `hammer` whose job it is to read back from the static filesystem:

```
pim@ctlog-test:~/src/tesseract$ curl localhost/nginx_status; sleep 60; curl localhost/nginx_status
Active connections: 10556
server accepts handled requests
25302 25302 1492918
Reading: 0 Writing: 1 Waiting: 10555
Active connections: 7791
server accepts handled requests
25764 25764 1727631
Reading: 0 Writing: 1 Waiting: 7790
```

And I can see that it's keeping up quite nicely. In one minute, it handled (1727631-1492918) or
234713 requests, which is a cool 3911 requests/sec. All these read/write hammers are kind of
saturating the `ctlog-test` machine though:

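The same arithmetic, spelled out:

```python
# Requests handled by NGINX between the two nginx_status snapshots above.
before, after, seconds = 1_492_918, 1_727_631, 60
print(f"{(after - before) // seconds} requests/sec")  # 3911
```
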
{{< image width="100%" src="/assets/ctlog/ctlog-loadtest3.png" alt="Posix Loadtest 8000qps write, 4000qps read" >}}

But after a little bit of fiddling, I can assert my conclusion:

***Conclusion: a write-rate of 8'000/s alongside a read-rate of 4'000/s should be safe with POSIX***

## What's Next

I am going to offer such a machine in production together with Antonis Chariton and Jeroen Massar.
I plan to do a few additional things:

* Test Sunlight as well on the same hardware. It would be nice to see a comparison between write rates of the two implementations.
* Work with Al Cutter and the Transparency Dev team to close a few small gaps (like the `local_signer.go` work and some Prometheus monitoring of the `posix` binary).
* Install and launch both under `*.ct.ipng.ch`, which in itself deserves its own report, showing how I intend to do log cycling and care/feeding, as well as report on the real production experience running these CT Logs.
@@ -0,0 +1,666 @@
---
date: "2025-08-10T12:07:23Z"
title: 'Certificate Transparency - Part 2 - Sunlight'
---

{{< image width="10em" float="right" src="/assets/ctlog/ctlog-logo-ipng.png" alt="ctlog logo" >}}

# Introduction

There once was a Dutch company called [[DigiNotar](https://en.wikipedia.org/wiki/DigiNotar)]; as the
name suggests, it was a form of _digital notary_, and it was in the business of issuing security
certificates. Unfortunately, in June of 2011, their IT infrastructure was compromised and
subsequently it issued hundreds of fraudulent SSL certificates, some of which were used for
man-in-the-middle attacks on Iranian Gmail users. Not cool.

Google launched a project called **Certificate Transparency**, because it was becoming clear
that the root of trust given to _Certification Authorities_ could no longer be unilaterally trusted.
These attacks showed that the lack of transparency in the way CAs operated was a significant risk to
the Web Public Key Infrastructure. It led to the creation of this ambitious
[[project](https://certificate.transparency.dev/)] to improve security online by bringing
accountability to the system that protects our online services with _SSL_ (Secure Socket Layer)
and _TLS_ (Transport Layer Security).

In 2013, [[RFC 6962](https://datatracker.ietf.org/doc/html/rfc6962)] was published by the IETF. It
describes an experimental protocol for publicly logging the existence of Transport Layer Security
(TLS) certificates as they are issued or observed, in a manner that allows anyone to audit
certificate authority (CA) activity and notice the issuance of suspect certificates, as well as to
audit the certificate logs themselves. The intent is that eventually clients would refuse to honor
certificates that do not appear in a log, effectively forcing CAs to add all issued certificates to
the logs.

In a [[previous article]({{< ref 2025-07-26-ctlog-1 >}})], I took a deep dive into a new
open source implementation of Static CT Logs made by Google. There is however a very competent
alternative called [[Sunlight](https://sunlight.dev/)], which deserves some attention to get to know
its look and feel, as well as its performance characteristics.

## Sunlight

I start by reading up on the project website, and learn:

> _Sunlight is a [[Certificate Transparency](https://certificate.transparency.dev/)] log implementation
> and monitoring API designed for scalability, ease of operation, and reduced cost. What started as
> the Sunlight API is now the [[Static CT API](https://c2sp.org/static-ct-api)] and is allowed by the
> CT log policies of the major browsers._
>
> _Sunlight was designed by Filippo Valsorda for the needs of the WebPKI community, through the
> feedback of many of its members, and in particular of the Sigsum, Google TrustFabric, and ISRG
> teams. It is partially based on the Go Checksum Database. Sunlight's development was sponsored by
> Let's Encrypt._

I have a chat with Filippo, and address the elephant in the room by asking him which of the two
implementations, TesseraCT or Sunlight, he thinks would be a good fit. One thing he says really sticks
with me: "The community needs _any_ static log operator, so if Google thinks TesseraCT is ready, by
all means use that. The diversity will do us good!".

Whether one or the other is 'ready' is partly about the software, but importantly also about the
operator. So I carefully take Sunlight out of its cardboard box, and put it onto the same Dell R630
that I used in my previous tests: two Xeon E5-2640 v4 CPUs for a total of 20 cores and 40 threads,
and 512GB of DDR4 memory. They also sport a SAS controller. In one machine I place 6 pcs of 1.2TB SAS3
drives (HPE part number EG1200JEHMC), and in the second machine I place 6 pcs of 1.92TB enterprise
storage (Samsung part number P1633N19).

### Sunlight: setup

I download the source from GitHub, which, one of these days, will have an IPv6 address. Building the
tools is easy enough; there are three main tools:
1. ***sunlight***: Which serves the write-path. Certification authorities add their certs here.
1. ***sunlight-keygen***: A helper tool to create the so-called `seed` file (key material) for a log.
1. ***skylight***: Which serves the read-path. `/checkpoint` and things like `/tile` and `/issuer` are served here in a spec-compliant way.

The YAML configuration file is straightforward, and can define and handle multiple logs in one
instance, which sets it apart from TesseraCT, which can only handle one log per instance. There's a
`submissionprefix` which `sunlight` will use to accept writes, and a `monitoringprefix` which
`skylight` will use for reads.

I stumble across a small issue - I haven't created multiple DNS hostnames for the test machine. So I
decide to use a different port for one versus the other. The write path will use TLS on port 1443
while the read path will use plain HTTP on port 1080. And considering I don't have a certificate for
`*.lab.ipng.ch`, I will use a self-signed one instead:

```
pim@ctlog-test:/etc/sunlight$ openssl genrsa -out ca.key 2048
pim@ctlog-test:/etc/sunlight$ openssl req -new -x509 -days 365 -key ca.key \
    -subj "/C=CH/ST=ZH/L=Bruttisellen/O=IPng Networks GmbH/CN=IPng Root CA" -out ca.crt
pim@ctlog-test:/etc/sunlight$ openssl req -newkey rsa:2048 -nodes -keyout sunlight-key.pem \
    -subj "/C=CH/ST=ZH/L=Bruttisellen/O=IPng Networks GmbH/CN=*.lab.ipng.ch" -out sunlight.csr
pim@ctlog-test:/etc/sunlight# openssl x509 -req -extfile \
    <(printf "subjectAltName=DNS:ctlog-test.lab.ipng.ch,DNS:ctlog-test.lab.ipng.ch") -days 365 \
    -in sunlight.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out sunlight.pem
ln -s sunlight.pem skylight.pem
ln -s sunlight-key.pem skylight-key.pem
```

This little snippet yields `sunlight.pem` (the certificate) and `sunlight-key.pem` (the private
key), and symlinks them to `skylight.pem` and `skylight-key.pem` for simplicity. With these in hand,
I can start the rest of the show. First I will prepare the NVME storage with a few datasets in
which Sunlight will store its data:

```
pim@ctlog-test:~$ sudo zfs create ssd-vol0/sunlight-test
pim@ctlog-test:~$ sudo zfs create ssd-vol0/sunlight-test/shared
pim@ctlog-test:~$ sudo zfs create ssd-vol0/sunlight-test/logs
pim@ctlog-test:~$ sudo zfs create ssd-vol0/sunlight-test/logs/sunlight-test
pim@ctlog-test:~$ sudo chown -R pim:pim /ssd-vol0/sunlight-test
```

Then I'll create the key material for the log:

```
pim@ctlog-test:/etc/sunlight$ sunlight-keygen -f sunlight-test.seed.bin
Log ID: IPngJcHCHWi+s37vfFqpY9ouk+if78wAY2kl/sh3c8E=
ECDSA public key:
-----BEGIN PUBLIC KEY-----
MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAE6Hg60YncYt/V69kLmg4LlTO9RmHR
wRllfa2cjURBJIKPpCUbgiiMX/jLQqmfzYrtveUws4SG8eT7+ICoa8xdAQ==
-----END PUBLIC KEY-----
Ed25519 public key:
-----BEGIN PUBLIC KEY-----
0pHg7KptAxmb4o67m9xNM1Ku3YH4bjjXbyIgXn2R2bk=
-----END PUBLIC KEY-----
```

The first block creates key material for the log, and I get a fun surprise: the Log ID starts
precisely with the string IPng... what are the odds that that would happen!? I should tell Antonis
about this, it's dope!

As a safety precaution, Sunlight requires the operator to make the `checkpoints.db` by hand, which
I'll also do:
```
pim@ctlog-test:/etc/sunlight$ sqlite3 /ssd-vol0/sunlight-test/shared/checkpoints.db \
    "CREATE TABLE checkpoints (logID BLOB PRIMARY KEY, body TEXT)"
```

And with that, I'm ready to create my first log!

### Sunlight: Setting up S3
|
||||||
|
|
||||||
|
When learning about [[Tessera]({{< ref 2025-07-26-ctlog-1 >}})], I already kind of drew the
conclusion that, for our case at IPng at least, running the fully cloud-native version with S3
storage and MySQL database gave both poorer performance and more operational complexity. But
I find it interesting to compare behavior and performance, so I'll start by creating a Sunlight log
backed by MinIO SSD storage.

I'll first create the bucket and a user account to access it:

```
pim@ctlog-test:~$ export AWS_ACCESS_KEY_ID="<some user>"
pim@ctlog-test:~$ export AWS_SECRET_ACCESS_KEY="<some password>"
pim@ctlog-test:~$ export S3_BUCKET=sunlight-test

pim@ctlog-test:~$ mc mb ssd/${S3_BUCKET}
pim@ctlog-test:~$ cat << EOF > /tmp/minio-access.json
{ "Version": "2012-10-17", "Statement": [ {
    "Effect": "Allow",
    "Action": [ "s3:ListBucket", "s3:PutObject", "s3:GetObject", "s3:DeleteObject" ],
    "Resource": [ "arn:aws:s3:::${S3_BUCKET}/*", "arn:aws:s3:::${S3_BUCKET}" ]
  } ]
}
EOF
pim@ctlog-test:~$ mc admin user add ssd ${AWS_ACCESS_KEY_ID} ${AWS_SECRET_ACCESS_KEY}
pim@ctlog-test:~$ mc admin policy create ssd ${S3_BUCKET}-access /tmp/minio-access.json
pim@ctlog-test:~$ mc admin policy attach ssd ${S3_BUCKET}-access --user ${AWS_ACCESS_KEY_ID}
pim@ctlog-test:~$ mc anonymous set public ssd/${S3_BUCKET}
```

After setting up the S3 environment, all I must do is wire it up to the Sunlight configuration
file:

```
pim@ctlog-test:/etc/sunlight$ cat << EOF > sunlight-s3.yaml
listen:
  - "[::]:1443"
checkpoints: /ssd-vol0/sunlight-test/shared/checkpoints.db
logs:
  - shortname: sunlight-test
    inception: 2025-08-10
    submissionprefix: https://ctlog-test.lab.ipng.ch:1443/
    monitoringprefix: http://sunlight-test.minio-ssd.lab.ipng.ch:9000/
    secret: /etc/sunlight/sunlight-test.seed.bin
    cache: /ssd-vol0/sunlight-test/logs/sunlight-test/cache.db
    s3region: eu-schweiz-1
    s3bucket: sunlight-test
    s3endpoint: http://minio-ssd.lab.ipng.ch:9000/
    roots: /etc/sunlight/roots.pem
    period: 200
    poolsize: 15000
    notafterstart: 2024-01-01T00:00:00Z
    notafterlimit: 2025-01-01T00:00:00Z
EOF
```

The one thing of note here is the `roots:` file, which contains the Root CA for the TesseraCT
loadtester I'll be using. In production, Sunlight can grab the approved roots from the so-called
_Common CA Database_ or CCADB. You can either specify all roots yourself using the `roots` field,
or use the `ccadbroots` field and layer additional roots on top of it with the `extraroots` field.
That's a handy trick! You can find more info on the [[CCADB](https://www.ccadb.org/)] homepage.

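As a minimal sketch of that CCADB-based alternative - and this is an assumption on my part, going
by the `ccadbroots: testing` value I end up using for the production configuration later in this
series, not something I ran in this lab - the relevant log stanza would swap the `roots` field for
something like:

```
pim@ctlog-test:/etc/sunlight$ cat << EOF > sunlight-ccadb-snippet.yaml
logs:
  - shortname: sunlight-test
    # Assumption: fetch the CCADB-approved roots automatically ...
    ccadbroots: testing
    # ... and accept a few extra roots on top, e.g. the loadtester's test CA.
    extraroots: /etc/sunlight/extra-roots.pem
EOF
```
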
I can then start Sunlight just like this:

```
pim@ctlog-test:/etc/sunlight$ sunlight -testcert -c /etc/sunlight/sunlight-s3.yaml
{"time":"2025-08-10T13:49:36.091384532+02:00","level":"INFO","source":{"function":"main.main.func1","file":"/home/pim/src/sunlight/cmd/sunlight/sunlight.go","line":341},"msg":"debug server listening","addr":{"IP":"127.0.0.1","Port":37477,"Zone":""}}
time=2025-08-10T13:49:36.091+02:00 level=INFO msg="debug server listening" addr=127.0.0.1:37477
{"time":"2025-08-10T13:49:36.100471647+02:00","level":"INFO","source":{"function":"main.main","file":"/home/pim/src/sunlight/cmd/sunlight/sunlight.go","line":542},"msg":"today is the Inception date, creating log","log":"sunlight-test"}
time=2025-08-10T13:49:36.100+02:00 level=INFO msg="today is the Inception date, creating log" log=sunlight-test
{"time":"2025-08-10T13:49:36.119529208+02:00","level":"INFO","source":{"function":"filippo.io/sunlight/internal/ctlog.CreateLog","file":"/home/pim/src/sunlight/internal/ctlog/ctlog.go","line":159},"msg":"created log","log":"sunlight-test","timestamp":1754826576111,"logID":"IPngJcHCHWi+s37vfFqpY9ouk+if78wAY2kl/sh3c8E="}
time=2025-08-10T13:49:36.119+02:00 level=INFO msg="created log" log=sunlight-test timestamp=1754826576111 logID="IPngJcHCHWi+s37vfFqpY9ouk+if78wAY2kl/sh3c8E="
{"time":"2025-08-10T13:49:36.127702166+02:00","level":"WARN","source":{"function":"filippo.io/sunlight/internal/ctlog.LoadLog","file":"/home/pim/src/sunlight/internal/ctlog/ctlog.go","line":296},"msg":"failed to parse previously trusted roots","log":"sunlight-test","roots":""}
time=2025-08-10T13:49:36.127+02:00 level=WARN msg="failed to parse previously trusted roots" log=sunlight-test roots=""
{"time":"2025-08-10T13:49:36.127766452+02:00","level":"INFO","source":{"function":"filippo.io/sunlight/internal/ctlog.LoadLog","file":"/home/pim/src/sunlight/internal/ctlog/ctlog.go","line":301},"msg":"loaded log","log":"sunlight-test","logID":"IPngJcHCHWi+s37vfFqpY9ouk+if78wAY2kl/sh3c8E=","size":0,"timestamp":1754826576111}
time=2025-08-10T13:49:36.127+02:00 level=INFO msg="loaded log" log=sunlight-test logID="IPngJcHCHWi+s37vfFqpY9ouk+if78wAY2kl/sh3c8E=" size=0 timestamp=1754826576111
{"time":"2025-08-10T13:49:36.540297532+02:00","level":"INFO","source":{"function":"filippo.io/sunlight/internal/ctlog.(*Log).sequencePool","file":"/home/pim/src/sunlight/internal/ctlog/ctlog.go","line":972},"msg":"sequenced pool","log":"sunlight-test","old_tree_size":0,"entries":0,"start":"2025-08-10T13:49:36.534500633+02:00","tree_size":0,"tiles":0,"timestamp":1754826576534,"elapsed":5788099}
time=2025-08-10T13:49:36.540+02:00 level=INFO msg="sequenced pool" log=sunlight-test old_tree_size=0 entries=0 start=2025-08-10T13:49:36.534+02:00 tree_size=0 tiles=0 timestamp=1754826576534 elapsed=5.788099ms
...
```

Although that looks pretty good, I see that something is not quite right. When Sunlight comes up, it shares
with me a few links, in the `get-roots` and `json` fields on the homepage, but neither of them work:

```
pim@ctlog-test:~$ curl -k https://ctlog-test.lab.ipng.ch:1443/ct/v1/get-roots
404 page not found
pim@ctlog-test:~$ curl -k https://ctlog-test.lab.ipng.ch:1443/log.v3.json
404 page not found
```

I'm starting to think that using a non-standard listen port won't work, or more precisely, adding
a port in the `monitoringprefix` won't work. I notice that the logname is called
`ctlog-test.lab.ipng.ch:1443` which I don't think is supposed to have a port number in it. So instead,
I make Sunlight `listen` on port 443 and omit the port in the `submissionprefix`, and give it and
its companion Skylight the needed privileges to bind the privileged port like so:

```
pim@ctlog-test:~$ sudo setcap 'cap_net_bind_service=+ep' /usr/local/bin/sunlight
pim@ctlog-test:~$ sudo setcap 'cap_net_bind_service=+ep' /usr/local/bin/skylight
pim@ctlog-test:~$ sunlight -testcert -c /etc/sunlight/sunlight-s3.yaml
```

{{< image width="60%" src="/assets/ctlog/sunlight-test-s3.png" alt="Sunlight testlog / S3" >}}

And with that, Sunlight reports for duty and the links work. Hoi!

#### Sunlight: Loadtesting S3

I have some good experience loadtesting from the [[TesseraCT article]({{< ref 2025-07-26-ctlog-1
>}})]. One important difference is that Sunlight wants to use SSL for the submission and monitoring
paths, and I've created a snakeoil self-signed cert. CT Hammer does not accept that out of the box,
so I need to make a tiny change to the Hammer:

```
pim@ctlog-test:~/src/tesseract$ git diff
diff --git a/internal/hammer/hammer.go b/internal/hammer/hammer.go
index 3828fbd..1dfd895 100644
--- a/internal/hammer/hammer.go
+++ b/internal/hammer/hammer.go
@@ -104,6 +104,9 @@ func main() {
 			MaxIdleConns:        *numWriters + *numReadersFull + *numReadersRandom,
 			MaxIdleConnsPerHost: *numWriters + *numReadersFull + *numReadersRandom,
 			DisableKeepAlives:   false,
+			TLSClientConfig: &tls.Config{
+				InsecureSkipVerify: true,
+			},
 		},
 		Timeout: *httpTimeout,
 	}
```

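One caveat with this diff: the `tls` package also needs to be imported at the top of `hammer.go`.
If your checkout doesn't already import it, the extra hunk would look roughly like this (a sketch;
the surrounding import lines are illustrative, not copied from the actual file):

```
--- a/internal/hammer/hammer.go
+++ b/internal/hammer/hammer.go
@@ ... @@
 import (
+	"crypto/tls"
 	"net/http"
 )
```
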
With that small bit of insecurity out of the way, Sunlight makes it otherwise pretty easy for me to
construct the CT Hammer commandline:

```
pim@ctlog-test:~/src/tesseract$ go run ./internal/hammer --origin=ctlog-test.lab.ipng.ch \
  --log_public_key=MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAE6Hg60YncYt/V69kLmg4LlTO9RmHRwRllfa2cjURBJIKPpCUbgiiMX/jLQqmfzYrtveUws4SG8eT7+ICoa8xdAQ== \
  --log_url=http://sunlight-test.minio-ssd.lab.ipng.ch:9000/ --write_log_url=https://ctlog-test.lab.ipng.ch/ \
  --max_read_ops=0 --num_writers=5000 --max_write_ops=100

pim@ctlog-test:/etc/sunlight$ T=0; O=0; while :; do \
  N=$(curl -sS http://sunlight-test.minio-ssd.lab.ipng.ch:9000/checkpoint | grep -E '^[0-9]+$'); \
  if [ "$N" -eq "$O" ]; then \
    echo -n .; \
  else \
    echo " $T seconds $((N-O)) certs"; O=$N; T=0; echo -n $N\ ;
  fi; \
  T=$((T+1)); sleep 1; done
24915 1 seconds 96 certs
25011 1 seconds 92 certs
25103 1 seconds 93 certs
25196 1 seconds 87 certs
```

On the first commandline I'll start the loadtest at 100 writes/sec with the standard duplication
probability of 10%, which allows me to test Sunlight's ability to avoid writing duplicates. This
means I should see on average a growth of the tree at about 90/s. Check. I raise the write-load to
500/s:

```
39421 1 seconds 443 certs
39864 1 seconds 442 certs
40306 1 seconds 441 certs
40747 1 seconds 447 certs
41194 1 seconds 448 certs
```

.. and to 1'000/s:
```
57941 1 seconds 945 certs
58886 1 seconds 970 certs
59856 1 seconds 948 certs
60804 1 seconds 965 certs
61769 1 seconds 955 certs
```

After a few minutes I see a few errors from CT Hammer:
```
W0810 14:55:29.660710 1398779 analysis.go:134] (1 x) failed to create request: failed to write leaf: Post "https://ctlog-test.lab.ipng.ch/ct/v1/add-chain": EOF
W0810 14:55:30.496603 1398779 analysis.go:124] (1 x) failed to create request: write leaf was not OK. Status code: 500. Body: "failed to read body: read tcp 127.0.1.1:443->127.0.0.1:44908: i/o timeout\n"
```

I raise the Hammer load to 5'000/sec (which means 4'500/s unique certs and 500 duplicates), and find
the max committed writes/sec to max out at around 4'200/s:
```
879637 1 seconds 4213 certs
883850 1 seconds 4207 certs
888057 1 seconds 4211 certs
892268 1 seconds 4249 certs
896517 1 seconds 4216 certs
```

The error rate is a steady stream of errors like the one before:
```
W0810 14:59:48.499274 1398779 analysis.go:124] (1 x) failed to create request: failed to write leaf: Post "https://ctlog-test.lab.ipng.ch/ct/v1/add-chain": EOF
W0810 14:59:49.034194 1398779 analysis.go:124] (1 x) failed to create request: failed to write leaf: Post "https://ctlog-test.lab.ipng.ch/ct/v1/add-chain": EOF
W0810 15:00:05.496459 1398779 analysis.go:124] (1 x) failed to create request: failed to write leaf: Post "https://ctlog-test.lab.ipng.ch/ct/v1/add-chain": EOF
W0810 15:00:07.187181 1398779 analysis.go:124] (1 x) failed to create request: failed to write leaf: Post "https://ctlog-test.lab.ipng.ch/ct/v1/add-chain": EOF
```

At this load of 4'200/s, MinIO is not very impressed. Remember in the [[other article]({{< ref
2025-07-26-ctlog-1 >}})] I loadtested it to about 7'500 ops/sec and the statistics below are about
50 ops/sec (2'800/min). I conclude that MinIO is, in fact, bored of this whole activity:

```
pim@ctlog-test:/etc/sunlight$ mc admin trace --stats ssd
Duration: 18m58s ▱▱▱
RX Rate:↑ 115 MiB/m
TX Rate:↓ 2.4 MiB/m
RPM    : 2821.3
-------------
Call           Count           RPM     Avg Time  Min Time  Max Time  Avg TTFB  Max TTFB  Avg Size    Rate /min      Errors
s3.PutObject   37602 (70.3%)   1982.2  6.2ms     785µs     86.7ms    6.1ms     86.6ms    ↑59K ↓0B    ↑115M ↓1.4K    0
s3.GetObject   15918 (29.7%)   839.1   996µs     670µs     51.3ms    912µs     51.2ms    ↑46B ↓3.0K  ↑38K ↓2.4M     0
```

Sunlight still keeps its certificate cache on local disk. At a rate of 4'200/s, each disk in the
ZFS pool sees a write rate of about 105MB/s with about 877 ZFS writes per second.

```
pim@ctlog-test:/etc/sunlight$ zpool iostat -v ssd-vol0 10
                              capacity     operations     bandwidth
pool                        alloc   free   read  write   read  write
--------------------------  -----  -----  -----  -----  -----  -----
ssd-vol0                    59.1G   685G      0  2.55K      0   312M
  mirror-0                  59.1G   685G      0  2.55K      0   312M
    wwn-0x5002538a05302930      -      -      0    877      0   104M
    wwn-0x5002538a053069f0      -      -      0    871      0   104M
    wwn-0x5002538a06313ed0      -      -      0    866      0   104M
--------------------------  -----  -----  -----  -----  -----  -----

pim@ctlog-test:/etc/sunlight$ zpool iostat -l ssd-vol0 10
            capacity     operations     bandwidth    total_wait     disk_wait    syncq_wait    asyncq_wait  scrub   trim
pool        alloc   free   read  write   read  write   read  write   read  write   read  write   read  write   wait   wait
----------  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----  -----
ssd-vol0    59.0G   685G      0  3.19K      0   388M      -    8ms      -  628us      -  990us      -   10ms      -   88ms
ssd-vol0    59.2G   685G      0  2.49K      0   296M      -    5ms      -  557us      -  163us      -    8ms      -      -
ssd-vol0    59.6G   684G      0  2.04K      0   253M      -    2ms      -  704us      -  296us      -    4ms      -      -
ssd-vol0    58.8G   685G      0  2.72K      0   328M      -    6ms      -  783us      -  701us      -    9ms      -   68ms

```

A few interesting observations:
* Sunlight still uses a local sqlite3 database for the certificate tracking, which is more
  efficient than MariaDB/MySQL, let alone AWS RDS, so it has one less runtime dependency.
* The write rate to ZFS is significantly higher with Sunlight than TesseraCT (about 8:1). This is
  likely explained because the sqlite3 database lives on ZFS here, while TesseraCT uses MariaDB
  running on a different filesystem.
* The MinIO usage is a lot lighter. As I reduce the load to 1'000/s, as was the case in the TesseraCT
  test, I can see the ratio of Get:Put was 93:4 in TesseraCT, while it's 70:30 here. TesseraCT was
  also consuming more IOPS, running at about 10.5k requests/minute, while Sunlight is
  significantly calmer at 2.8k requests/minute (almost 4x less!)
* The burst capacity of Sunlight is a fair bit higher than TesseraCT, likely due to its more
  efficient use of S3 backends.

***Conclusion***: Sunlight S3+MinIO can handle 1'000/s reliably, and can spike to 4'200/s with only
a few errors.

#### Sunlight: Loadtesting POSIX

When I took a closer look at TesseraCT a few weeks ago, it struck me that while a cloud-native
setup with S3 storage allows for a cool way to enable storage scaling and read-path redundancy,
by creating synchronously replicated buckets, it does come at a significant operational overhead
and complexity. My main concern is the number of different moving parts, and Sunlight really has
one very appealing property: it can run entirely on one machine without the need for any other
moving parts - even the SQL database is linked in. That's pretty slick.

```
pim@ctlog-test:/etc/sunlight$ cat << EOF > sunlight.yaml
listen:
  - "[::]:443"
checkpoints: /ssd-vol0/sunlight-test/shared/checkpoints.db
logs:
  - shortname: sunlight-test
    inception: 2025-08-10
    submissionprefix: https://ctlog-test.lab.ipng.ch/
    monitoringprefix: https://ctlog-test.lab.ipng.ch:1443/
    secret: /etc/sunlight/sunlight-test.seed.bin
    cache: /ssd-vol0/sunlight-test/logs/sunlight-test/cache.db
    localdirectory: /ssd-vol0/sunlight-test/logs/sunlight-test/data
    roots: /etc/sunlight/roots.pem
    period: 200
    poolsize: 15000
    notafterstart: 2024-01-01T00:00:00Z
    notafterlimit: 2025-01-01T00:00:00Z
EOF
pim@ctlog-test:/etc/sunlight$ sunlight -testcert -c sunlight.yaml
pim@ctlog-test:/etc/sunlight$ skylight -testcert -c skylight.yaml
```

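I haven't shown `skylight.yaml` here. Going by the fields I end up using for the production
Skylight configuration later in this series, a minimal sketch for this lab setup would look
something like the following - the exact key set is an assumption on my part, not a verified
config:

```
pim@ctlog-test:/etc/sunlight$ cat << EOF > skylight.yaml
listen:
  - "[::]:1443"
logs:
  - shortname: sunlight-test
    monitoringprefix: https://ctlog-test.lab.ipng.ch:1443/
    localdirectory: /ssd-vol0/sunlight-test/logs/sunlight-test/data
EOF
```
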
First I'll start a hello-world loadtest at 100/s and take a look at the number of leaves in the
checkpoint after a few minutes. I would expect about three minutes' worth at 100/s with a duplicate
probability of 10% to yield about 16'200 unique certificates in total.

```
pim@ctlog-test:/etc/sunlight$ while :; do curl -ksS https://ctlog-test.lab.ipng.ch:1443/checkpoint | grep -E '^[0-9]+$'; sleep 60; done
10086
15518
20920
26339
```

And would you look at that? `(26339-10086)` is right on the dot! One thing that I find particularly
cool about Sunlight is its baked-in Prometheus metrics. This allows me some pretty solid insight on
its performance. Take a look for example at the write path latency tail (99th ptile):

```
pim@ctlog-test:/etc/sunlight$ curl -ksS https://ctlog-test.lab.ipng.ch/metrics | egrep 'seconds.*quantile=\"0.99\"'
sunlight_addchain_wait_seconds{log="sunlight-test",quantile="0.99"} 0.207285993
sunlight_cache_get_duration_seconds{log="sunlight-test",quantile="0.99"} 0.001409719
sunlight_cache_put_duration_seconds{log="sunlight-test",quantile="0.99"} 0.002227985
sunlight_fs_op_duration_seconds{log="sunlight-test",method="discard",quantile="0.99"} 0.000224969
sunlight_fs_op_duration_seconds{log="sunlight-test",method="fetch",quantile="0.99"} 8.3003e-05
sunlight_fs_op_duration_seconds{log="sunlight-test",method="upload",quantile="0.99"} 0.042118751
sunlight_http_request_duration_seconds{endpoint="add-chain",log="sunlight-test",quantile="0.99"} 0.2259605
sunlight_sequencing_duration_seconds{log="sunlight-test",quantile="0.99"} 0.108987393
sunlight_sqlite_update_duration_seconds{quantile="0.99"} 0.014922489
```

I'm seeing here that at a load of 100/s (with 90/s of unique certificates), the 99th percentile
add-chain latency is 207ms, which makes sense because the `period` configuration field is set to
200ms. The filesystem operations (discard, fetch, upload) are _de minimis_ and the sequencing
duration is at 109ms. Excellent!

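To keep an eye on these quantiles over time rather than with ad-hoc curls, a small Prometheus
scrape job pointed at the Sunlight endpoint does the trick. A sketch, assuming Prometheus runs
somewhere in the lab and is allowed to skip verification of my snakeoil certificate (in a real
config this belongs merged into the existing `scrape_configs` section, not appended):

```
pim@ctlog-test:~$ cat << EOF >> /etc/prometheus/prometheus.yml
scrape_configs:
  - job_name: 'sunlight'
    scheme: https
    tls_config:
      insecure_skip_verify: true
    static_configs:
      - targets: ['ctlog-test.lab.ipng.ch:443']
EOF
```
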
But can this thing go really fast? I do remember that the CT Hammer uses more CPU than TesseraCT,
and I've seen above, when running my 5'000/s loadtest, that that's about all the hammer can take on
a single Dell R630. So, as I did with the TesseraCT test, I'll use the MinIO SSD and MinIO Disk
machines to generate the load.

I boot them, so that I can hammer, or shall I say jackhammer away:

```
pim@ctlog-test:~/src/tesseract$ go run ./internal/hammer --origin=ctlog-test.lab.ipng.ch \
  --log_public_key=MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAE6Hg60YncYt/V69kLmg4LlTO9RmHRwRllfa2cjURBJIKPpCUbgiiMX/jLQqmfzYrtveUws4SG8eT7+ICoa8xdAQ== \
  --log_url=https://ctlog-test.lab.ipng.ch:1443/ --write_log_url=https://ctlog-test.lab.ipng.ch/ \
  --max_read_ops=0 --num_writers=5000 --max_write_ops=5000

pim@minio-ssd:~/src/tesseract$ go run ./internal/hammer --origin=ctlog-test.lab.ipng.ch \
  --log_public_key=MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAE6Hg60YncYt/V69kLmg4LlTO9RmHRwRllfa2cjURBJIKPpCUbgiiMX/jLQqmfzYrtveUws4SG8eT7+ICoa8xdAQ== \
  --log_url=https://ctlog-test.lab.ipng.ch:1443/ --write_log_url=https://ctlog-test.lab.ipng.ch/ \
  --max_read_ops=0 --num_writers=5000 --max_write_ops=5000 --serial_offset=1000000

pim@minio-disk:~/src/tesseract$ go run ./internal/hammer --origin=ctlog-test.lab.ipng.ch \
  --log_public_key=MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAE6Hg60YncYt/V69kLmg4LlTO9RmHRwRllfa2cjURBJIKPpCUbgiiMX/jLQqmfzYrtveUws4SG8eT7+ICoa8xdAQ== \
  --log_url=https://ctlog-test.lab.ipng.ch:1443/ --write_log_url=https://ctlog-test.lab.ipng.ch/ \
  --max_read_ops=0 --num_writers=5000 --max_write_ops=5000 --serial_offset=2000000
```

This will generate 15'000/s of load, which I note does bring Sunlight to its knees, although it does
remain stable (yaay!) with a somewhat more bursty checkpoint interval:

```
5504780 1 seconds 4039 certs
5508819 1 seconds 10000 certs
5518819 . 2 seconds 7976 certs
5526795 1 seconds 2022 certs
5528817 1 seconds 9782 certs
5538599 1 seconds 217 certs
5538816 1 seconds 3114 certs
5541930 1 seconds 6818 certs
```

So what I do instead is a somewhat simpler measurement of certificates per minute:
```
pim@ctlog-test:/etc/sunlight$ while :; do curl -ksS https://ctlog-test.lab.ipng.ch:1443/checkpoint | grep -E '^[0-9]+$'; sleep 60; done
6008831
6296255
6576712
```

This rate boils down to `(6576712-6008831)/120` or 4'700/s of written certs, which at a duplication
ratio of 10% means approximately 5'200/s of total accepted certs. At this rate, Sunlight is consuming
about 10.3 CPUs, while Skylight is at 0.1 CPUs and the CT Hammer is at 11.1 CPUs. Given the 40
threads on this machine, I am not saturating the CPU, but I'm curious, as this rate is significantly
lower than TesseraCT. I briefly turn off the hammer on `ctlog-test` to allow Sunlight to monopolize
the entire machine. The CPU use does reduce to about 9.3 CPUs, suggesting that indeed, the bottleneck
is not strictly CPU:

{{< image width="90%" src="/assets/ctlog/btop-sunlight.png" alt="Sunlight btop" >}}

When using only two CT Hammers (on `minio-ssd.lab.ipng.ch` and `minio-disk.lab.ipng.ch`), the CPU
use on the `ctlog-test.lab.ipng.ch` machine definitely goes down (CT Hammer is kind of a CPU hog....),
but the resulting throughput doesn't change that much:

```
pim@ctlog-test:/etc/sunlight$ while :; do curl -ksS https://ctlog-test.lab.ipng.ch:1443/checkpoint | grep -E '^[0-9]+$'; sleep 60; done
7985648
8302421
8528122
8772758
```

What I find particularly interesting is that the total rate stays approximately 4'400/s
(`(8772758-7985648)/180`), while the checkpoint latency varies considerably. One really cool thing I
learned earlier is that Sunlight comes with baked-in Prometheus metrics, which I can take a look at
while keeping it under this load of ~10'000/sec:

```
pim@ctlog-test:/etc/sunlight$ curl -ksS https://ctlog-test.lab.ipng.ch/metrics | egrep 'seconds.*quantile=\"0.99\"'
sunlight_addchain_wait_seconds{log="sunlight-test",quantile="0.99"} 1.889983538
sunlight_cache_get_duration_seconds{log="sunlight-test",quantile="0.99"} 0.000148819
sunlight_cache_put_duration_seconds{log="sunlight-test",quantile="0.99"} 0.837981208
sunlight_fs_op_duration_seconds{log="sunlight-test",method="discard",quantile="0.99"} 0.000433179
sunlight_fs_op_duration_seconds{log="sunlight-test",method="fetch",quantile="0.99"} NaN
sunlight_fs_op_duration_seconds{log="sunlight-test",method="upload",quantile="0.99"} 0.067494558
sunlight_http_request_duration_seconds{endpoint="add-chain",log="sunlight-test",quantile="0.99"} 1.86894666
sunlight_sequencing_duration_seconds{log="sunlight-test",quantile="0.99"} 1.111400223
sunlight_sqlite_update_duration_seconds{quantile="0.99"} 0.016859223
```

Comparing the throughput at 4'400/s with that first test of 100/s, I expect and can confirm a
significant increase in all of these metrics. The 99th percentile addchain is now 1889ms (up from
207ms) and the sequencing duration is now 1111ms (up from 109ms).

#### Sunlight: Effect of period

I fiddle a little bit with Sunlight's configuration file, notably the `period` and `poolsize`.
First I set `period:2000` and `poolsize:15000`, which yields pretty much the same throughput:

```
pim@ctlog-test:/etc/sunlight$ while :; do curl -ksS https://ctlog-test.lab.ipng.ch:1443/checkpoint | grep -E '^[0-9]+$'; sleep 60; done
701850
1001424
1295508
1575789
```

With a generated load of 10'000/sec with a 10% duplication rate, I am offering roughly 9'000/sec of
unique certificates, and I'm seeing `(1575789 - 701850)/180` or about 4'855/sec come through. Just
for reference, at this rate and with `period:2000`, the latency tail looks like this:

```
pim@ctlog-test:/etc/sunlight$ curl -ksS https://ctlog-test.lab.ipng.ch/metrics | egrep 'seconds.*quantile=\"0.99\"'
sunlight_addchain_wait_seconds{log="sunlight-test",quantile="0.99"} 3.203510079
sunlight_cache_get_duration_seconds{log="sunlight-test",quantile="0.99"} 0.000108613
sunlight_cache_put_duration_seconds{log="sunlight-test",quantile="0.99"} 0.950453973
sunlight_fs_op_duration_seconds{log="sunlight-test",method="discard",quantile="0.99"} 0.00046192
sunlight_fs_op_duration_seconds{log="sunlight-test",method="fetch",quantile="0.99"} NaN
sunlight_fs_op_duration_seconds{log="sunlight-test",method="upload",quantile="0.99"} 0.049007693
sunlight_http_request_duration_seconds{endpoint="add-chain",log="sunlight-test",quantile="0.99"} 3.570709413
sunlight_sequencing_duration_seconds{log="sunlight-test",quantile="0.99"} 1.5968609040000001
sunlight_sqlite_update_duration_seconds{quantile="0.99"} 0.010847308
```

Then I set `period:100` with `poolsize:15000`, which does improve things a bit:

```
pim@ctlog-test:/etc/sunlight$ while :; do curl -ksS https://ctlog-test.lab.ipng.ch:1443/checkpoint | grep -E '^[0-9]+$'; sleep 60; done
560654
950524
1324645
1720362
```

With the same generated load of 10'000/sec with a 10% duplication rate, I am still offering roughly
9'000/sec of unique certificates, and I'm seeing `(1720362 - 560654)/180` or about 6'440/sec come
through, which is a fair bit better, at the expense of more disk activity. At this rate and with
`period:100`, the latency tail looks like this:

```
pim@ctlog-test:/etc/sunlight$ curl -ksS https://ctlog-test.lab.ipng.ch/metrics | egrep 'seconds.*quantile=\"0.99\"'
sunlight_addchain_wait_seconds{log="sunlight-test",quantile="0.99"} 1.616046445
sunlight_cache_get_duration_seconds{log="sunlight-test",quantile="0.99"} 7.5123e-05
sunlight_cache_put_duration_seconds{log="sunlight-test",quantile="0.99"} 0.534935803
sunlight_fs_op_duration_seconds{log="sunlight-test",method="discard",quantile="0.99"} 0.000377273
sunlight_fs_op_duration_seconds{log="sunlight-test",method="fetch",quantile="0.99"} 4.8893e-05
sunlight_fs_op_duration_seconds{log="sunlight-test",method="upload",quantile="0.99"} 0.054685991
sunlight_http_request_duration_seconds{endpoint="add-chain",log="sunlight-test",quantile="0.99"} 1.946445877
sunlight_sequencing_duration_seconds{log="sunlight-test",quantile="0.99"} 0.980602185
sunlight_sqlite_update_duration_seconds{quantile="0.99"} 0.018385831
```

***Conclusion***: Sunlight on POSIX can reliably handle 4'400/s (with a duplicate rate of 10%) on
this setup.

## Wrapup - Observations

From an operator's point of view, TesseraCT and Sunlight handle quite differently. Both are easily up
to the task of serving the current write-load (which is about 250/s).

* ***S3***: When using the S3 backend, TesseraCT became quite unhappy above 800/s while Sunlight
  went all the way up to 4'200/s and sent significantly fewer requests to MinIO (about 4x less),
  while showing good telemetry on the use of S3 backends. In this mode, TesseraCT uses MySQL (in
  my case, MariaDB) which was not on the ZFS pool, but on the boot-disk.

* ***POSIX***: When using a normal filesystem, Sunlight seems to peak at 4'800/s while TesseraCT
  went all the way to 12'000/s. When doing so, disk I/O was quite similar between the two
  solutions, taking into account that TesseraCT runs BadgerDB while Sunlight uses sqlite3; both
  use their respective ZFS pool.

***Notable***: Sunlight POSIX and S3 performance is roughly identical (both handle about
5'000/sec), while TesseraCT POSIX performance (12'000/s) is significantly better than its S3
performance (800/s). Some other observations:

* Sunlight has a very opinionated configuration, and can run multiple logs with one configuration
  file and one binary. Its configuration was a bit constraining though, as I could not manage to
  use `monitoringprefix` or `submissionprefix` with an `http://` prefix - a likely security
  precaution - but also using ports in those prefixes (other than the standard 443) rendered
  Sunlight and Skylight unusable for me.

* Skylight only serves from a local directory; it does not have support for S3. For operators
  using S3, an alternative could be to use NGINX in the serving path, similar to TesseraCT.
  Skylight does have a few things to teach me though, notably on proper compression, content type
  and other headers.

* TesseraCT does not have a configuration file, and will run exactly one log per binary
  instance. It uses flags to construct the environment, and is much more forgiving for creative
  `origin` (log name), and submission- and monitoring URLs. It's happy to use regular 'http://'
  for both, which comes in handy in those architectures where the system is serving behind a
  reverse proxy.

* The TesseraCT Hammer tool, then again, does not like self-signed certificates, and needs to be
  told to skip certificate validation when loadtesting a Sunlight instance that is running with
  the `-testcert` commandline flag.

I consider all of these small and mostly cosmetic issues, because in production there will be proper
TLS certificates issued and normal https:// serving ports with unique monitoring and submission
hostnames.

## What's Next

Together with Antonis Chariton and Jeroen Massar, IPng Networks will be offering both TesseraCT and
Sunlight logs on the public internet. One final step is to productionize both logs, and file the
paperwork for them in the community. Although at this point our Sunlight log is already running,
I'll wait a few weeks to gather any additional intel, before wrapping up in a final article.

---
date: "2025-08-24T12:07:23Z"
title: 'Certificate Transparency - Part 3 - Operations'
---

{{< image width="10em" float="right" src="/assets/ctlog/ctlog-logo-ipng.png" alt="ctlog logo" >}}

# Introduction

There once was a Dutch company called [[DigiNotar](https://en.wikipedia.org/wiki/DigiNotar)]; as the
name suggests, it was a form of _digital notary_, in the business of issuing security
certificates. Unfortunately, in June of 2011, their IT infrastructure was compromised and
subsequently it issued hundreds of fraudulent SSL certificates, some of which were used for
man-in-the-middle attacks on Iranian Gmail users. Not cool.

Google launched a project called **Certificate Transparency**, because it was becoming more common
that the root of trust given to _Certification Authorities_ could no longer be unilaterally trusted.
These attacks showed that the lack of transparency in the way CAs operated was a significant risk to
the Web Public Key Infrastructure. It led to the creation of this ambitious
[[project](https://certificate.transparency.dev/)] to improve security online by bringing
accountability to the system that protects our online services with _SSL_ (Secure Socket Layer)
and _TLS_ (Transport Layer Security).

In 2013, [[RFC 6962](https://datatracker.ietf.org/doc/html/rfc6962)] was published by the IETF. It
describes an experimental protocol for publicly logging the existence of Transport Layer Security
(TLS) certificates as they are issued or observed, in a manner that allows anyone to audit
certificate authority (CA) activity and notice the issuance of suspect certificates as well as to
audit the certificate logs themselves. The intent is that eventually clients would refuse to honor
certificates that do not appear in a log, effectively forcing CAs to add all issued certificates to
the logs.

In the first two articles of this series, I explored [[TesseraCT]({{< ref 2025-07-26-ctlog-1 >}})]
and [[Sunlight]({{< ref 2025-08-10-ctlog-2 >}})], two open source implementations of the Static CT
protocol. In this final article, I'll share the details on how I created the environment and
production instances for four logs that IPng will be providing: Rennet and Lipase are two
ingredients to make cheese and will serve as our staging/testing logs. Gouda and Halloumi are two
delicious cheeses that pay homage to our heritage, Jeroen and I being Dutch and Antonis being
Greek.

## Hardware

At IPng Networks, all hypervisors are from the same brand: Dell's PowerEdge line. In this project,
Jeroen is also contributing a server, and it so happens that he also has a Dell PowerEdge. We're
both running Debian on our hypervisor, so we install a fresh VM with Debian 13.0, codenamed
_Trixie_, and give the machine 16GB of memory, 8 vCPU and a 16GB boot disk. Boot disks are placed on
the hypervisor's ZFS pool, and a blockdevice snapshot is taken every 6hrs. This allows the boot disk
to be rolled back to a last known good point in case an upgrade goes south. If you haven't seen it
yet, take a look at [[zrepl](https://zrepl.github.io/)], a one-stop, integrated solution for ZFS
replication. This tool is incredibly powerful, and can do snapshot management, sourcing / sinking
to remote hosts, of course using incremental snapshots as they are native to ZFS.

Once the machine is up, we pass through four enterprise-class storage drives, in our case 3.84TB
Kioxia NVMe, model _KXD51RUE3T84_, which use four PCIe 3.1 lanes and the NVMe 1.2.1 specification,
with good durability and reasonable (albeit not stellar) read throughput of ~2700MB/s, write
throughput of ~800MB/s, 240 kIOPS random read and 21 kIOPS random write. My attention is also drawn
to a specific specification point: these drives allow for 1.0 DWPD, which stands for _Drive Writes
Per Day_. In other words, they are not going to run themselves off a cliff after a few petabytes of
writes, and I am reminded that a CT Log wants to write to disk a lot during normal operation.

The point of these logs is to **keep them safe**, and the most important aspects of the compute
environment are the use of ECC memory to detect single bit errors, and dependable storage. Toshiba
makes a great product.

```
ctlog1:~$ sudo zpool create -f -o ashift=12 -o autotrim=on -O atime=off -O xattr=sa \
  ssd-vol0 raidz2 /dev/disk/by-id/nvme-KXD51RUE3T84_TOSHIBA_*M
ctlog1:~$ sudo zfs create -o encryption=on -o keyformat=passphrase ssd-vol0/enc
ctlog1:~$ sudo zfs create ssd-vol0/logs
ctlog1:~$ for log in lipase; do \
    for shard in 2025h2 2026h1 2026h2 2027h1 2027h2; do \
      sudo zfs create ssd-vol0/logs/${log}${shard}; \
    done; \
  done
```

The hypervisor will use PCI passthrough for the NVMe drives, and we'll handle ZFS directly on the
VM. The first command creates a ZFS raidz2 pool using 4kB blocks, turns off _atime_ (which avoids one
metadata write for each read!), and turns on SSD trimming in ZFS, a very useful feature.

Then I'll create an encrypted volume for the configuration and key material. This way, if the
machine is ever physically transported, the keys will be safe in transit. Finally, I'll create the
temporal log shards starting at 2025h2, all the way through to 2027h2 for our testing log called
_Lipase_ and our production log called _Halloumi_ on Jeroen's machine. On my own machine, it'll be
_Rennet_ for the testing log and _Gouda_ for the production log.

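One operational consequence of the encrypted dataset is worth noting: after a reboot, ZFS does not
load the passphrase-based key by itself, so the configuration and key material stay inaccessible
until an operator intervenes. A sketch of the manual unlock step, using standard ZFS commands with
the passphrase typed at the console:

```
ctlog1:~$ sudo zfs load-key ssd-vol0/enc
Enter passphrase for 'ssd-vol0/enc':
ctlog1:~$ sudo zfs mount ssd-vol0/enc
ctlog1:~$ zfs get -r keystatus ssd-vol0/enc
```
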
## Sunlight

{{< image width="10em" float="right" src="/assets/ctlog/sunlight-logo.png" alt="Sunlight logo" >}}

I set up Sunlight first, as its authors have extensive operational notes both in terms of the
[[config](https://config.sunlight.geomys.org/)] of Geomys' _Tuscolo_ log, as well as on the
[[Sunlight](https://sunlight.dev)] homepage. I really appreciate that Filippo added some
[[Gists](https://gist.github.com/FiloSottile/989338e6ba8e03f2c699590ce83f537b)] and
[[Doc](https://docs.google.com/document/d/1ID8dX5VuvvrgJrM0Re-jt6Wjhx1eZp-trbpSIYtOhRE/edit?tab=t.0#heading=h.y3yghdo4mdij)]
with pretty much all I need to know to run one too. Our Rennet and Gouda logs use a very similar
approach for their configuration, with one notable exception: the VMs do not have a public IP
address, and are tucked away in a private network called IPng Site Local. I'll get back to that
later.

```
ctlog@ctlog0:/ssd-vol0/enc/sunlight$ cat << EOF | tee sunlight-staging.yaml
listen:
  - "[::]:16420"
checkpoints: /ssd-vol0/shared/checkpoints.db
logs:
  - shortname: rennet2025h2
    inception: 2025-07-28
    period: 200
    poolsize: 750
    submissionprefix: https://rennet2025h2.log.ct.ipng.ch
    monitoringprefix: https://rennet2025h2.mon.ct.ipng.ch
    ccadbroots: testing
    extraroots: /ssd-vol0/enc/sunlight/extra-roots-staging.pem
    secret: /ssd-vol0/enc/sunlight/keys/rennet2025h2.seed.bin
    cache: /ssd-vol0/logs/rennet2025h2/cache.db
    localdirectory: /ssd-vol0/logs/rennet2025h2/data
    notafterstart: 2025-07-01T00:00:00Z
    notafterlimit: 2026-01-01T00:00:00Z
...
EOF
ctlog@ctlog0:/ssd-vol0/enc/sunlight$ cat << EOF | tee skylight-staging.yaml
listen:
  - "[::]:16421"
homeredirect: https://ipng.ch/s/ct/
logs:
  - shortname: rennet2025h2
    monitoringprefix: https://rennet2025h2.mon.ct.ipng.ch
    localdirectory: /ssd-vol0/logs/rennet2025h2/data
    staging: true
...
```

In the first configuration file, I'll tell _Sunlight_ (the write path component) to listen on port
`:16420`, and in the second I'll tell _Skylight_ (the read path component) to listen on port
`:16421`. I've disabled the automatic certificate renewals, and will handle SSL upstream. A few
notes on this:

1. Most importantly, I will be using a common frontend pool with a wildcard certificate for
`*.ct.ipng.ch`. I wrote about [[DNS-01]({{< ref 2023-03-24-lego-dns01 >}})] before; it's a very
convenient way for IPng to do certificate pool management. I will be sharing a certificate for all log
types.
1. ACME/HTTP-01 could be made to work with a bit of effort; plumbing through the `/.well-known/`
URIs on the frontend and pointing them to these instances. But then the cert would have to be copied
from Sunlight back to the frontends.

I've noticed that when the log doesn't exist yet, I can start Sunlight and it'll create the bits and
pieces on the local filesystem and start writing checkpoints. But if the log already exists, I am
required to have the _monitoringprefix_ active, otherwise Sunlight won't start up. It's a small
thing, as I will have the read path operational in a few simple steps. Anyway, all five logshards
for Rennet, and a few days later, for Gouda, are operational this way.

Skylight provides all the things I need to serve the data back, which is a huge help. The [[Static
Log Spec](https://github.com/C2SP/C2SP/blob/main/static-ct-api.md)] is very clear on things like
compression, content-type, cache-control and other headers. Skylight makes this a breeze, as it reads
a configuration file very similar to the Sunlight write-path one, and takes care of it all for me.

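A quick way to convince myself that the read path really does set the headers the spec asks for is
to probe a couple of the monitoring URLs; a small sketch - the header values you will see come from
Skylight and the spec, not from me:

```
ctlog@ctlog0:~$ curl -sI https://rennet2025h2.mon.ct.ipng.ch/checkpoint \
  | egrep -i 'content-type|cache-control'
ctlog@ctlog0:~$ curl -sI https://rennet2025h2.mon.ct.ipng.ch/log.v3.json \
  | egrep -i 'content-type|cache-control'
```
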
## TesseraCT

{{< image width="10em" float="right" src="/assets/ctlog/tesseract-logo.png" alt="TesseraCT logo" >}}

Good news came to our community on August 14th, when Google's TrustFabric team announced their Alpha
milestone of [[TesseraCT](https://blog.transparency.dev/introducing-tesseract)]. This release
also moved the POSIX variant out of experimental, alongside the already further-along GCP and AWS
personalities. After playing around with it with Al and the team, I think I've learned enough to get
us going in a public `tesseract-posix` instance.

One thing I liked about Sunlight is its compact YAML file that described the pertinent bits of the
system, and that I can serve any number of logs with the same process. On the other hand, TesseraCT
can serve only one log per process. Both have pros and cons: notably, if a poisonous submission
were offered, Sunlight might take down all logs, while TesseraCT would only take down the log
receiving the offensive submission. On the other hand, maintaining separate processes is cumbersome,
and all log instances need to be meticulously configured.

### TesseraCT genconf

I decide to automate this by vibing a little tool called `tesseract-genconf`, which I've published on
[[Gitea](https://git.ipng.ch/certificate-transparency/cheese)]. What it does is take a YAML file
describing the logs, and outputs the bits and pieces needed to operate multiple separate processes
that together form the sharded static log. I've attempted to stay mostly compatible with the
Sunlight YAML configuration, and came up with a variant like this one:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ cat << EOF | tee tesseract-staging.yaml
listen:
  - "[::]:8080"
roots: /ssd-vol0/enc/tesseract/roots.pem
logs:
  - shortname: lipase2025h2
    listen: "[::]:16900"
    submissionprefix: https://lipase2025h2.log.ct.ipng.ch
    monitoringprefix: https://lipase2025h2.mon.ct.ipng.ch
    extraroots: /ssd-vol0/enc/tesseract/extra-roots-staging.pem
    secret: /ssd-vol0/enc/tesseract/keys/lipase2025h2.pem
    localdirectory: /ssd-vol0/logs/lipase2025h2/data
    notafterstart: 2025-07-01T00:00:00Z
    notafterlimit: 2026-01-01T00:00:00Z
...
EOF
```

With this snippet, I have all the information I need. Here are the steps I take to construct the log
itself:

***1. Generate keys***

The keys are `prime256v1` and the format that TesseraCT accepts did change since I wrote up my first
[[deep dive]({{< ref 2025-07-26-ctlog-1 >}})] a few weeks ago. Now, the tool accepts a `PEM` format
private key, from which the _Log ID_ and _Public Key_ can be derived. So off I go:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ tesseract-genconf -c tesseract-staging.yaml gen-key
Creating /ssd-vol0/enc/tesseract/keys/lipase2025h2.pem
Creating /ssd-vol0/enc/tesseract/keys/lipase2026h1.pem
Creating /ssd-vol0/enc/tesseract/keys/lipase2026h2.pem
Creating /ssd-vol0/enc/tesseract/keys/lipase2027h1.pem
Creating /ssd-vol0/enc/tesseract/keys/lipase2027h2.pem
```

Of course, if a file already exists at that location, it'll just print a warning like:
```
Key already exists: /ssd-vol0/enc/tesseract/keys/lipase2025h2.pem (skipped)
```

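Since the format is a standard `prime256v1` private key in PEM, an equivalent key could in
principle also be produced with plain openssl if you'd rather not use the helper tool. A sketch,
with the caveat that I haven't verified that every TesseraCT version accepts both SEC1 and PKCS#8
encodings:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ openssl ecparam -name prime256v1 -genkey -noout \
  -out keys/lipase2025h2.pem
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ openssl ec -in keys/lipase2025h2.pem -pubout
```
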
***2. Generate JSON/HTML***

I will be operating the read-path with NGINX. Log operators have started speaking about their log
metadata in terms of a small JSON file called `log.v3.json`, and Skylight does a good job of
exposing that one, alongside all the other pertinent metadata. So I'll generate these files for each
of the logs:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ tesseract-genconf -c tesseract-staging.yaml gen-html
Creating /ssd-vol0/logs/lipase2025h2/data/index.html
Creating /ssd-vol0/logs/lipase2025h2/data/log.v3.json
Creating /ssd-vol0/logs/lipase2026h1/data/index.html
Creating /ssd-vol0/logs/lipase2026h1/data/log.v3.json
Creating /ssd-vol0/logs/lipase2026h2/data/index.html
Creating /ssd-vol0/logs/lipase2026h2/data/log.v3.json
Creating /ssd-vol0/logs/lipase2027h1/data/index.html
Creating /ssd-vol0/logs/lipase2027h1/data/log.v3.json
Creating /ssd-vol0/logs/lipase2027h2/data/index.html
Creating /ssd-vol0/logs/lipase2027h2/data/log.v3.json
```

{{< image width="60%" src="/assets/ctlog/lipase.png" alt="TesseraCT Lipase Log" >}}

It's nice to see a familiar look-and-feel for these logs appear in those `index.html` files (which
all cross-link to each other within the logs specified in `tesseract-staging.yaml`), which is dope.

***3. Generate Roots***

Antonis had seen this before (thanks for the explanation!) but TesseraCT does not natively implement
fetching of the [[CCADB](https://www.ccadb.org/)] roots. But, he points out, you can just get them
from any other running log instance, so I'll implement a `gen-roots` command:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ tesseract-genconf gen-roots \
  --source https://tuscolo2027h1.sunlight.geomys.org --output production-roots.pem
Fetching roots from: https://tuscolo2027h1.sunlight.geomys.org/ct/v1/get-roots
2025/08/25 08:24:58 Warning: Failed to parse certificate,carefully skipping: x509: negative serial number
Creating production-roots.pem
Successfully wrote 248 certificates to tusc.pem (out of 249 total)

ctlog@ctlog1:/ssd-vol0/enc/tesseract$ tesseract-genconf gen-roots \
  --source https://navigli2027h1.sunlight.geomys.org --output testing-roots.pem
Fetching roots from: https://navigli2027h1.sunlight.geomys.org/ct/v1/get-roots
Creating testing-roots.pem
Successfully wrote 82 certificates to tusc.pem (out of 82 total)
```

I can run this regularly, say daily, in a cronjob, and if the files change, restart the
TesseraCT processes. It's not ideal (because the restart might be briefly disruptive), but it's a
reasonable option for the time being; a sketch of such a cronjob follows below.

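Here's a minimal sketch of what that daily refresh could look like on my setup - the script name,
cron schedule and restart loop are mine, not part of TesseraCT:

```
ctlog@ctlog1:~$ cat << 'EOF' > ~/bin/refresh-roots.sh
#!/bin/sh
# Fetch the current roots from a running Sunlight log, and restart the
# TesseraCT instances only if the resulting file actually changed.
set -e
cd /ssd-vol0/enc/tesseract
tesseract-genconf gen-roots --source https://navigli2027h1.sunlight.geomys.org \
  --output testing-roots.pem.new
if ! cmp -s testing-roots.pem testing-roots.pem.new; then
  mv testing-roots.pem.new testing-roots.pem
  tesseract-genconf -c tesseract-staging.yaml gen-env
  for shard in 2025h2 2026h1 2026h2 2027h1 2027h2; do
    sudo systemctl restart tesseract@lipase${shard}
  done
else
  rm -f testing-roots.pem.new
fi
EOF
ctlog@ctlog1:~$ chmod +x ~/bin/refresh-roots.sh
ctlog@ctlog1:~$ ( crontab -l; echo "30 4 * * * $HOME/bin/refresh-roots.sh" ) | crontab -
```
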
***4. Generate TesseraCT cmdline***

I will be running TesseraCT as a _templated unit_ in systemd. These are systemd unit files that
take an argument; they have an `@` in their name, like so:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ cat << 'EOF' | sudo tee /lib/systemd/system/tesseract@.service
[Unit]
Description=Tesseract CT Log service for %i
ConditionFileExists=/ssd-vol0/logs/%i/data/.env
After=network.target

[Service]
# The %i here refers to the instance name, e.g., "lipase2025h2"
# This path should point to where your instance-specific .env files are located
EnvironmentFile=/ssd-vol0/logs/%i/data/.env
ExecStart=/home/ctlog/bin/tesseract-posix $TESSERACT_ARGS
User=ctlog
Group=ctlog
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
EOF
```

I can now implement a `gen-env` command for my tool:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ tesseract-genconf -c tesseract-staging.yaml gen-env
Creating /ssd-vol0/logs/lipase2025h2/data/roots.pem
Creating /ssd-vol0/logs/lipase2025h2/data/.env
Creating /ssd-vol0/logs/lipase2026h1/data/roots.pem
Creating /ssd-vol0/logs/lipase2026h1/data/.env
Creating /ssd-vol0/logs/lipase2026h2/data/roots.pem
Creating /ssd-vol0/logs/lipase2026h2/data/.env
Creating /ssd-vol0/logs/lipase2027h1/data/roots.pem
Creating /ssd-vol0/logs/lipase2027h1/data/.env
Creating /ssd-vol0/logs/lipase2027h2/data/roots.pem
Creating /ssd-vol0/logs/lipase2027h2/data/.env
```

Looking at one of those .env files, I can show the exact commandline I'll be feeding to the
`tesseract-posix` binary:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ cat /ssd-vol0/logs/lipase2025h2/data/.env
TESSERACT_ARGS="--private_key=/ssd-vol0/enc/tesseract/keys/lipase2025h2.pem
  --origin=lipase2025h2.log.ct.ipng.ch --storage_dir=/ssd-vol0/logs/lipase2025h2/data
  --roots_pem_file=/ssd-vol0/logs/lipase2025h2/data/roots.pem --http_endpoint=[::]:16900
  --not_after_start=2025-07-01T00:00:00Z --not_after_limit=2026-01-01T00:00:00Z"
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
```

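With the unit file and the per-instance `.env` files in place, bringing a shard up (and having it
come back after a reboot) is the usual systemd templated-unit dance:

```
ctlog@ctlog1:~$ sudo systemctl daemon-reload
ctlog@ctlog1:~$ sudo systemctl enable --now tesseract@lipase2025h2
ctlog@ctlog1:~$ systemctl status tesseract@lipase2025h2 --no-pager
ctlog@ctlog1:~$ journalctl -u tesseract@lipase2025h2 -f
```
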
{{< image width="7em" float="left" src="/assets/shared/warning.png" alt="Warning" >}}
|
||||||
|
A quick operational note on OpenTelemetry (also often referred to as Otel): Al and the TrustFabric
|
||||||
|
team added open telemetry to the TesseraCT personalities, as it was mostly already implemented in
|
||||||
|
the underlying Tessera library. By default, it'll try to send its telemetry to localhost using
|
||||||
|
`https`, which makes sense in those cases where the collector is on a different machine. In my case,
|
||||||
|
I'll keep `otelcol` (the collector) on the same machine. Its job is to consume the Otel telemetry
|
||||||
|
stream, and turn those back into Prometheus `/metrics` endpoint on port `:9464`.
|
||||||
|
|
||||||
|
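For reference, a minimal `otelcol` configuration along these lines should do the OTLP-to-Prometheus
conversion described above. This is a sketch: it assumes the stock `otlp` receiver and `prometheus`
exporter that ship with the collector build you installed (the contrib distribution certainly has
both), and the `/etc/otelcol/config.yaml` path of the Debian package:

```
ctlog@ctlog1:~$ cat << EOF | sudo tee /etc/otelcol/config.yaml
receivers:
  otlp:
    protocols:
      http:
        endpoint: 127.0.0.1:4318
exporters:
  prometheus:
    endpoint: 127.0.0.1:9464
service:
  pipelines:
    metrics:
      receivers: [otlp]
      exporters: [prometheus]
EOF
```
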
The `gen-env` command also assembles the per-instance `roots.pem` file. For staging logs, it'll take
the file pointed to by the `roots:` key, and append any per-log `extraroots:` files. For me, these
extraroots are empty and the main roots file points at either the testing roots that came from
_Rennet_ (our Sunlight staging log), or the production roots that came from _Gouda_. A job well done!

***5. Generate NGINX***

When I first ran my tests, I noticed that the log check tool called `ct-fsck` threw errors on my
read path. Filippo explained that the HTTP headers matter in the Static CT specification. Tiles,
Issuers, and Checkpoint must all have specific caching and content type headers set. This is what
makes Skylight such a gem - I get to read it (and the spec!) to see what I'm supposed to be serving.

And thus, the `gen-nginx` command is born; the NGINX config it generates listens on port `:8080`
for requests:

```
ctlog@ctlog1:/ssd-vol0/enc/tesseract$ tesseract-genconf -c tesseract-staging.yaml gen-nginx
Creating nginx config: /ssd-vol0/logs/lipase2025h2/data/lipase2025h2.mon.ct.ipng.ch.conf
Creating nginx config: /ssd-vol0/logs/lipase2026h1/data/lipase2026h1.mon.ct.ipng.ch.conf
Creating nginx config: /ssd-vol0/logs/lipase2026h2/data/lipase2026h2.mon.ct.ipng.ch.conf
Creating nginx config: /ssd-vol0/logs/lipase2027h1/data/lipase2027h1.mon.ct.ipng.ch.conf
Creating nginx config: /ssd-vol0/logs/lipase2027h2/data/lipase2027h2.mon.ct.ipng.ch.conf
```

All that's left for me to do is symlink these from `/etc/nginx/sites-enabled/` (sketched below) and
the read-path is off to the races. With these commands in the `tesseract-genconf` tool, I am hoping
that future travelers have an easy time setting up their static log. Please let me know if you'd
like to use, or contribute to, the tool. You can find me in the Transparency Dev Slack, in #ct and
also #cheese.

## IPng Frontends
|
||||||
|
|
||||||
|
{{< image width="18em" float="right" src="/assets/ctlog/MPLS Backbone - CTLog.svg" alt="ctlog at ipng" >}}
|
||||||
|
|
||||||
|
IPng Networks has a private internal network called [[IPng Site Local]({{< ref 2023-03-11-mpls-core
|
||||||
|
>}})], which is not routed on the internet. Our [[Frontends]({{< ref 2023-03-17-ipng-frontends >}})]
|
||||||
|
are the only things that have public IPv4 and IPv6 addresses. It allows for things like anycasted
|
||||||
|
webservers and loadbalancing with
|
||||||
|
[[Maglev](https://research.google/pubs/maglev-a-fast-and-reliable-software-network-load-balancer/)].
|
||||||
|
|
||||||
|
The IPng Site Local network kind of looks like the picture to the right. The hypervisors running the
|
||||||
|
Sunlight and TesseraCT logs are at NTT Zurich1 in Rümlang, Switzerland. The IPng frontends are
|
||||||
|
in green, and the sweet thing is, some of them run in IPng's own ISP network (AS8298), while others
|
||||||
|
run in partner networks (like IP-Max AS25091, and Coloclue AS8283). This means that I will benefit
|
||||||
|
from some pretty solid connectivity redundancy.
|
||||||
|
|
||||||
|
The frontends are provisioned with Ansible. There are two aspects to them - firstly, a _certbot_
|
||||||
|
instance maintains the Let's Encrypt wildcard certificates for `*.ct.ipng.ch`. There's a machine
|
||||||
|
tucked away somewhere called `lego.net.ipng.ch` -- again, not exposed on the internet -- and its job
|
||||||
|
is to renew certificates and copy them to the machines that need them. Next, a cluster of NGINX
|
||||||
|
servers uses these certificates to expose IPng and customer services to the Internet.
|
||||||
|
|
||||||
|
I can tie it all together with a snippet like so, for which I apologize in advance - it's quite a
|
||||||
|
wall of text:
|
||||||
|
|
||||||
|
```
|
||||||
|
map $http_user_agent $no_cache_ctlog_lipase {
|
||||||
|
"~*TesseraCT fsck" 1;
|
||||||
|
default 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen [::]:443 ssl http2;
|
||||||
|
listen 0.0.0.0:443 ssl http2;
|
||||||
|
ssl_certificate /etc/certs/ct.ipng.ch/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/certs/ct.ipng.ch/privkey.pem;
|
||||||
|
include /etc/nginx/conf.d/options-ssl-nginx.inc;
|
||||||
|
ssl_dhparam /etc/nginx/conf.d/ssl-dhparams.inc;
|
||||||
|
|
||||||
|
server_name lipase2025h2.log.ct.ipng.ch;
|
||||||
|
access_log /nginx/logs/lipase2025h2.log.ct.ipng.ch-access.log upstream buffer=512k flush=5s;
|
||||||
|
include /etc/nginx/conf.d/ipng-headers.inc;
|
||||||
|
|
||||||
|
location = / {
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host lipase2025h2.mon.ct.ipng.ch;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
proxy_pass http://ctlog1.net.ipng.ch:8080/index.html;
|
||||||
|
}
|
||||||
|
|
||||||
|
location = /metrics {
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
proxy_pass http://ctlog1.net.ipng.ch:9464;
|
||||||
|
}
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
proxy_pass http://ctlog1.net.ipng.ch:16900;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen [::]:443 ssl http2;
|
||||||
|
listen 0.0.0.0:443 ssl http2;
|
||||||
|
ssl_certificate /etc/certs/ct.ipng.ch/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/certs/ct.ipng.ch/privkey.pem;
|
||||||
|
include /etc/nginx/conf.d/options-ssl-nginx.inc;
|
||||||
|
ssl_dhparam /etc/nginx/conf.d/ssl-dhparams.inc;
|
||||||
|
|
||||||
|
server_name lipase2025h2.mon.ct.ipng.ch;
|
||||||
|
access_log /nginx/logs/lipase2025h2.mon.ct.ipng.ch-access.log upstream buffer=512k flush=5s;
|
||||||
|
include /etc/nginx/conf.d/ipng-headers.inc;
|
||||||
|
|
||||||
|
location = /checkpoint {
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
proxy_pass http://ctlog1.net.ipng.ch:8080;
|
||||||
|
}
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
include /etc/nginx/conf.d/ipng-upstream-headers.inc;
|
||||||
|
proxy_cache ipng_cache;
|
||||||
|
proxy_cache_key "$scheme://$host$request_uri";
|
||||||
|
proxy_cache_valid 200 24h;
|
||||||
|
proxy_cache_revalidate off;
|
||||||
|
proxy_cache_bypass $no_cache_ctlog_lipase;
|
||||||
|
proxy_no_cache $no_cache_ctlog_lipase;
|
||||||
|
|
||||||
|
proxy_pass http://ctlog1.net.ipng.ch:8080;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Taking _Lipase_ shard 2025h2 as an example, the submission path (on `*.log.ct.ipng.ch`) will show
the same `index.html` as the monitoring path (on `*.mon.ct.ipng.ch`), to provide some consistency
with Sunlight logs. Otherwise, the `/metrics` endpoint is forwarded to the `otelcol` running on port
`:9464`, and the rest (`/ct/v1/` and so on) is sent to port `:16900`, the first TesseraCT port.

Then the read-path makes a special case of the `/checkpoint` endpoint, which it does not cache. That
request (like all others) is forwarded to port `:8080`, which is where NGINX is running. Other
requests (notably `/tile` and `/issuer`) are cacheable, so I'll cache these on the upstream NGINX
servers, both for resilience and for performance. Having four of these NGINX upstreams will
allow the Static CT logs (regardless of being Sunlight or TesseraCT) to serve very high read-rates.
|
||||||
|
|
||||||
|
## What's Next
|
||||||
|
|
||||||
|
I need to spend a little bit of time thinking about rate limits, specifically write-ratelimits. I
think I'll use a request limiter in the upstream NGINX, allowing each IP, /24 or /48 subnet to
send only a fixed number of requests/sec. I'll probably keep that part private though, as it's a
good rule of thumb to never offer information to attackers.
|
||||||
|
|
||||||
|
Together with Antonis Chariton and Jeroen Massar, IPng Networks will be offering both TesseraCT and
|
||||||
|
Sunlight logs on the public internet. One final step is to productionize both logs, and file the
|
||||||
|
paperwork for them in the community. At this point our Sunlight log has been running for a month or
|
||||||
|
so, and we've filed the paperwork for it to be included at Apple and Google.
|
||||||
|
|
||||||
|
I'm going to have folks poke at _Lipase_ as well, after which I'll try to run a few `ct-fsck` to
|
||||||
|
make sure the logs are sane, before offering them into the inclusion program as well. Wish us luck!
|
||||||
---
|
||||||
|
date: "2026-02-14T11:35:14Z"
|
||||||
|
title: VPP Policers
|
||||||
|
---
|
||||||
|
|
||||||
|
{{< image width="200px" float="right" src="/assets/vpp/fdio-color.svg" alt="VPP" >}}
|
||||||
|
|
||||||
|
# About this series
|
||||||
|
|
||||||
|
Ever since I first saw VPP - the Vector Packet Processor - I have been deeply impressed with its
|
||||||
|
performance and versatility. For those of us who have used Cisco IOS/XR devices, like the classic
|
||||||
|
_ASR_ (aggregation service router), VPP will look and feel quite familiar as many of the approaches
|
||||||
|
are shared between the two.
|
||||||
|
|
||||||
|
There are some really fantastic features in VPP, some of which are less well known, and not always
|
||||||
|
very well documented. In this article, I will describe a unique use case in which I think VPP will
|
||||||
|
excel, notably acting as a gateway for Internet Exchange Points.
|
||||||
|
|
||||||
|
A few years ago, I toyed with the idea to use VPP as an _IXP Reseller_ concentrator, allowing
|
||||||
|
several carriers to connect with say 10G or 25G ports, and carry sub-customers on tagged interfaces
|
||||||
|
with safety (like MAC address ACLs) and rate limiting (say any given customer limited to 1Gbps on a
|
||||||
|
10G or 100G trunk), all provided by VPP. You can take a look at my [[VPP IXP Gateway]({{< ref
|
||||||
|
2023-10-21-vpp-ixp-gateway-1 >}})] article for details. I never ended up deploying it.
|
||||||
|
|
||||||
|
In this article, I follow up and fix a few shortcomings in VPP's policer framework.
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
Consider the following policer in VPP:
|
||||||
|
|
||||||
|
```
|
||||||
|
vpp# policer add name client-a rate kbps cir 150000 cb 15000000 conform-action transmit
|
||||||
|
vpp# policer input name client-a GigabitEthernet10/0/1
|
||||||
|
vpp# policer output name client-a GigabitEthernet10/0/1
|
||||||
|
```
|
||||||
|
|
||||||
|
The idea is to give a _committed information rate_ of 150Mbps with a _committed burst_ of 15MB.
The _CIR_ represents the average bandwidth allowed for the interface, while the _CB_ represents the
maximum amount of data (in bytes) that can be sent at line speed in a single burst before the _CIR_
kicks in to throttle the traffic.
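
To make the CIR/CB semantics concrete, here is a minimal single-rate token bucket in C - a sketch of
the concept, not VPP's implementation; all names and the simplistic clock handling are mine:

```c
#include <stdint.h>
#include <stdio.h>

typedef struct {
  double cir_bytes_per_sec; /* committed information rate, in bytes/s */
  double cb_bytes;          /* committed burst: bucket depth, in bytes */
  double tokens;            /* current bucket fill, in bytes */
  double last_update;       /* timestamp of last refill, in seconds */
} simple_policer_t;

/* Returns 1 if the packet conforms (transmit), 0 if it violates. */
static int
policer_conform (simple_policer_t *p, double now, uint32_t pkt_bytes)
{
  /* Refill the bucket for the elapsed time, capped at the burst depth. */
  p->tokens += (now - p->last_update) * p->cir_bytes_per_sec;
  if (p->tokens > p->cb_bytes)
    p->tokens = p->cb_bytes;
  p->last_update = now;

  if (p->tokens >= pkt_bytes)
    {
      p->tokens -= pkt_bytes;
      return 1;
    }
  return 0;
}

int
main (void)
{
  /* 150 Mbps CIR, 15 MB CB, bucket starts full. */
  simple_policer_t p = { 150e6 / 8, 15e6, 15e6, 0.0 };
  int conform = 0, violate = 0;
  /* A back-to-back burst of 20'000 x 1500-byte packets at t=0: the first
   * 15 MB worth conforms, the rest violates until the bucket refills. */
  for (int i = 0; i < 20000; i++)
    policer_conform (&p, 0.0, 1500) ? conform++ : violate++;
  printf ("conform=%d violate=%d\n", conform, violate);
  return 0;
}
```

At 150Mbps it takes roughly 0.8 seconds to earn back a full 15MB bucket, which is why such a burst
is a one-off until the traffic calms down again.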
|
||||||
|
|
||||||
|
Back in October of 2023, I reached the conclusion that the policer works in the following modes:
* ***On input***, the policer is applied on `device-input`, which means it takes frames directly
from the Phy. It will not work on any sub-interfaces, which explains why the policer worked on the
untagged parent (`Gi10/0/1`) but not on tagged sub-interfaces (`Gi10/0/1.100`).
* ***On output***, the policer is applied on `ip4-output` and `ip6-output`, which works only for
L3-enabled interfaces, not for L2 interfaces such as those used in bridge domains or L2
cross-connects.
|
||||||
|
|
||||||
|
## VPP Infra: L2 Feature Maps
|
||||||
|
|
||||||
|
The benefit of using the `device-input` arc is that it's efficient: every packet that comes from the
device (`Gi10/0/1`), regardless of tagging, is handed off to the policer plugin. It means all
traffic (L2, L3, sub-interface, tagged, untagged) goes through the same policer.

In `src/vnet/l2/` there are two nodes called `l2-input` and `l2-output`. I can configure VPP to call
these nodes before `ip[46]-unicast` and before `ip[46]-output` respectively. These L2 nodes have a
feature bitmap with 32 entries. The `l2-input` / `l2-output` nodes use a bitmap walk: they find the
highest set bit and dispatch the packet to a pre-configured graph node. Upon return,
`feat-bitmap-next` checks the next bit, and if that one is set, dispatches the packet to the next
pre-configured graph node. This continues until all bits have been checked and the packet has
visited the graph node for every bit that was set.
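
In pseudo-C, that walk looks roughly like the following - a self-contained sketch of the mechanism
rather than VPP's actual helpers, with the bit numbers taken from the `l2-input` chain listed below:

```c
#include <stdint.h>
#include <stdio.h>

/* A few of the l2-input feature bits and a hypothetical node per bit. */
enum { FEAT_DROP = 0, FEAT_XCONNECT = 1, FEAT_VTR = 10, FEAT_POLICER = 14 };

static const char *feat_node_name[32] = {
  [FEAT_DROP] = "feature-bitmap-drop",
  [FEAT_XCONNECT] = "l2-output",
  [FEAT_VTR] = "l2-input-vtr",
  [FEAT_POLICER] = "l2-policer-input",
};

/* Find the highest set bit, clear it, and return the node to dispatch to.
 * Each feature node does this again when it is done, so the packet walks
 * the bitmap from high bits to low bits. */
static const char *
feat_bitmap_next (uint32_t *bitmap)
{
  if (*bitmap == 0)
    return NULL;
  int bit = 31 - __builtin_clz (*bitmap); /* highest set bit */
  *bitmap &= ~(1u << bit);                /* this feature is now consumed */
  return feat_node_name[bit];
}

int
main (void)
{
  /* An interface with POLICER, VTR and XCONNECT enabled (DROP is always set). */
  uint32_t bm = (1u << FEAT_POLICER) | (1u << FEAT_VTR) |
                (1u << FEAT_XCONNECT) | (1u << FEAT_DROP);
  const char *n;
  /* In the real dataplane a forwarding feature like XCONNECT consumes the
   * packet and the walk stops there; here I just print the whole chain. */
  while ((n = feat_bitmap_next (&bm)) != NULL)
    printf ("-> %s\n", n);
  return 0;
}
```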
|
||||||
|
|
||||||
|
To show what I can do with these nodes, let me dive into an example. When a packet arrives on an
|
||||||
|
interface configured in L2 mode, either because it's a bridge-domain or an L2XC, `ethernet-input`
|
||||||
|
will send it to `l2-input`. This node does three things:
|
||||||
|
|
||||||
|
1. It will classify the packet, by reading the interface configuration (`l2input_main.configs`) for
|
||||||
|
the sw_if_index, which contains the mode of the interface (`bridge-domain`, `l2xc`, or `bvi`). It
|
||||||
|
also contains the feature bitmap: a statically configured set of features for this interface.
|
||||||
|
|
||||||
|
1. It will store the effective feature bitmap for each individual packet in the packet buffer. For
|
||||||
|
bridge mode, depending on the packet being unicast or multicast, some features are disabled. For
|
||||||
|
example, flooding for unicast packets is not performed, so those bits are cleared. The result is
|
||||||
|
stored in a per-packet working copy that downstream nodes can be triggered on, in turn.
|
||||||
|
|
||||||
|
1. For each of the bits set in the packet buffer's `l2.feature_bitmap`, starting from highest bit
|
||||||
|
set, `l2-input` will set the next node, for example `l2-input-vtr` to do VLAN Tag Rewriting. Once
|
||||||
|
that node is finished, it'll clear its own bit, and search for the next one set, in order to set a
|
||||||
|
new node.
|
||||||
|
|
||||||
|
I note that processing order is HIGH to LOW bits. By reading `l2_input.h`, I can see that the full
|
||||||
|
`l2-input` chain looks like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
l2-input
|
||||||
|
→ SPAN(17) → INPUT_CLASSIFY(16) → INPUT_FEAT_ARC(15) → POLICER_CLAS(14)
|
||||||
|
→ ACL(13) → VPATH(12) → L2_IP_QOS_RECORD(11) → VTR(10) → LEARN(9) → RW(8)
|
||||||
|
→ FWD(7) → UU_FWD(6) → UU_FLOOD(5) → ARP_TERM(4) → ARP_UFWD(3) → FLOOD(2)
|
||||||
|
→ XCONNECT(1) → DROP(0)
|
||||||
|
|
||||||
|
l2-output
|
||||||
|
→ XCRW(12) → OUTPUT_FEAT_ARC(11) → OUTPUT_CLASSIFY(10) → LINESTATUS_DOWN(9)
|
||||||
|
→ STP_BLOCKED(8) → IPIW(7) → EFP_FILTER(6) → L2PT(5) → ACL(4) → QOS(3)
|
||||||
|
→ CFM(2) → SPAN(1) → OUTPUT(0)
|
||||||
|
```
|
||||||
|
|
||||||
|
If none of the L2 processing nodes set the next node, ultimately `feature-bitmap-drop` gently takes
|
||||||
|
the packet behind the shed and drops it. On the way out, ultimately the last `OUTPUT` bit sends the
|
||||||
|
packet to `interface-output`, which hands off to the driver's TX node.
|
||||||
|
|
||||||
|
### Enabling L2 features
|
||||||
|
|
||||||
|
There are lots of places in VPP where L2 feature bitmaps are set or cleared. Here are a few examples:
|
||||||
|
|
||||||
|
```
|
||||||
|
# VTR: sets L2INPUT_FEAT_VTR + configures output VTR (VLAN Tag Rewriting)
|
||||||
|
vpp# set interface l2 tag-rewrite GigE0/0/0.100 pop 1
|
||||||
|
|
||||||
|
# ACL: sets L2INPUT_FEAT_ACL / L2OUTPUT_FEAT_ACL
|
||||||
|
vpp# set interface l2 input acl intfc GigE0/0/0 ip4-table 0
|
||||||
|
vpp# set interface l2 output acl intfc GigE0/0/0 ip4-table 0
|
||||||
|
|
||||||
|
# SPAN: sets L2INPUT_FEAT_SPAN / L2OUTPUT_FEAT_SPAN
|
||||||
|
vpp# set interface span GigE0/0/0 l2 destination GigE0/0/1
|
||||||
|
|
||||||
|
# Bridge domain level (affects bd_feature_bitmap, applied to all bridge members)
|
||||||
|
vpp# set bridge-domain learn 1 # enable/disable LEARN in BD
|
||||||
|
vpp# set bridge-domain forward 1 # enable/disable FWD in BD
|
||||||
|
vpp# set bridge-domain flood 1 # enable/disable FLOOD in BD
|
||||||
|
```
|
||||||
|
|
||||||
|
I'm starting to see how these L2 feature bitmaps are super powerful, yet flexible. I'm ready to add one!
|
||||||
|
|
||||||
|
### Creating L2 features
|
||||||
|
|
||||||
|
First, I need to insert my new `POLICER` bit in `l2_input.h` and `l2_output.h`. Then, I can call
|
||||||
|
`l2input_intf_bitmap_enable()` and its companion `l2output_intf_bitmap_enable()` to enable or
|
||||||
|
disable the L2 feature, and point it at a new graph node.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
/* Enable policer both on L2 feature bitmap, and L3 feature arcs */
|
||||||
|
if (dir == VLIB_RX) {
|
||||||
|
l2input_intf_bitmap_enable (sw_if_index, L2INPUT_FEAT_POLICER, apply);
|
||||||
|
vnet_feature_enable_disable ("ip4-unicast", "policer-input", sw_if_index, apply, 0, 0);
|
||||||
|
vnet_feature_enable_disable ("ip6-unicast", "policer-input", sw_if_index, apply, 0, 0);
|
||||||
|
} else {
|
||||||
|
l2output_intf_bitmap_enable (sw_if_index, L2OUTPUT_FEAT_POLICER, apply);
|
||||||
|
vnet_feature_enable_disable ("ip4-output", "policer-output", sw_if_index, apply, 0, 0);
|
||||||
|
vnet_feature_enable_disable ("ip6-output", "policer-output", sw_if_index, apply, 0, 0);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
What this means is that if the interface happens to be in L2 mode, in other words when it is a
`bridge-domain` member or part of an L2XC, I will enable the L2 features. However, for L3 packets,
I still enable the existing `policer-input` node by calling `vnet_feature_enable_disable()` on the
IPv4 and IPv6 input arcs. I make a mental note that MPLS and other non-IP traffic will not be
policed in this way.
|
||||||
|
|
||||||
|
### Updating Policer graph node
|
||||||
|
|
||||||
|
The policer framework has an existing dataplane node called `vnet_policer_inline()` which I extend
|
||||||
|
to take a flag `is_l2`. Using this flag, I can either set the next graph node to be
|
||||||
|
`vnet_l2_feature_next()`, or, in the pre-existing L3 case, set `vnet_feature_next()` on the packets
|
||||||
|
that move through the node. The nodes now look like this:
|
||||||
|
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
VLIB_NODE_FN (policer_l2_input_node)
|
||||||
|
(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
|
||||||
|
{
|
||||||
|
return vnet_policer_inline (vm, node, frame, VLIB_RX, 1 /* is_l2 */);
|
||||||
|
}
|
||||||
|
|
||||||
|
VLIB_REGISTER_NODE (policer_l2_input_node) = {
|
||||||
|
.name = "l2-policer-input",
|
||||||
|
.vector_size = sizeof (u32),
|
||||||
|
.format_trace = format_policer_trace,
|
||||||
|
.type = VLIB_NODE_TYPE_INTERNAL,
|
||||||
|
.n_errors = ARRAY_LEN(vnet_policer_error_strings),
|
||||||
|
.error_strings = vnet_policer_error_strings,
|
||||||
|
.n_next_nodes = VNET_POLICER_N_NEXT,
|
||||||
|
.next_nodes = {
|
||||||
|
[VNET_POLICER_NEXT_DROP] = "error-drop",
|
||||||
|
[VNET_POLICER_NEXT_HANDOFF] = "policer-input-handoff",
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Register on IP unicast arcs for L3 routed sub-interfaces */
|
||||||
|
VNET_FEATURE_INIT (policer_ip4_unicast, static) = {
|
||||||
|
.arc_name = "ip4-unicast",
|
||||||
|
.node_name = "policer-input",
|
||||||
|
.runs_before = VNET_FEATURES ("ip4-lookup"),
|
||||||
|
};
|
||||||
|
|
||||||
|
VNET_FEATURE_INIT (policer_ip6_unicast, static) = {
|
||||||
|
.arc_name = "ip6-unicast",
|
||||||
|
.node_name = "policer-input",
|
||||||
|
.runs_before = VNET_FEATURES ("ip6-lookup"),
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
Here, I install the L3 feature before `ip[46]-lookup`, and hook up the L2 feature with a new node
|
||||||
|
that really just calls the existing node but with `is_l2` set to true. I do something very similar
|
||||||
|
for the output direction, except there I'll hook the L3 feature before `ip[46]-output`.
|
||||||
|
|
||||||
|
## Tests!
|
||||||
|
|
||||||
|
I think writing unit- and integration tests is a great idea. I add a new file
|
||||||
|
`test/test_policer_subif.py` which actually tests all four new cases:
|
||||||
|
1. **L3 Input**: on a routed sub-interface
|
||||||
|
1. **L3 Output**: on a routed sub-interface
|
||||||
|
1. **L2 Input**: on a bridge-domain sub-interface
|
||||||
|
1. **L2 Output**: on a bridge-domain sub-interface
|
||||||
|
|
||||||
|
The existing `test/test_policer.py` should also cover existing cases, and of course it's important
|
||||||
|
that my work does not break existing functionality. Lucky me, the existing tests all still pass :)
|
||||||
|
|
||||||
|
### Test: L3 in/output
|
||||||
|
|
||||||
|
The tests use a VPP feature called `packet-generator`, which creates virtual devices upon which I
can emit packets using Scapy, and receive them using pcap. For the input case, I first create the
sub-interface and apply a new policer to it:
|
||||||
|
|
||||||
|
```python
|
||||||
|
sub_if0 = VppDot1QSubint(self, self.pg0, 10)
|
||||||
|
sub_if0.admin_up()
|
||||||
|
sub_if0.config_ip4()
|
||||||
|
sub_if0.resolve_arp()
|
||||||
|
|
||||||
|
# Create policer
|
||||||
|
action_tx = PolicerAction(VppEnum.vl_api_sse2_qos_action_type_t.SSE2_QOS_ACTION_API_TRANSMIT, 0)
|
||||||
|
policer = VppPolicer(self, "subif_l3_pol", 80, 0, 1000, 0,
|
||||||
|
conform_action=action_tx, exceed_action=action_tx, violate_action=action_tx,
|
||||||
|
)
|
||||||
|
policer.add_vpp_config()
|
||||||
|
|
||||||
|
# Apply policer to sub-interface input on pg0
|
||||||
|
policer.apply_vpp_config(sub_if0.sw_if_index, Dir.RX, True)
|
||||||
|
```
|
||||||
|
|
||||||
|
The policer named `subif_l3_pol` has a _CIR_ of 80 kbps, an _EIR_ of 0 kbps, a _CB_ of 1000 bytes
and an _EB_ of 0 bytes, and otherwise always accepts packets. I do this so that I can later see how
many packets were seen, and how many bytes were passed in the conform and violate actions.
|
||||||
|
|
||||||
|
Next, I can generate a few packets and send them out from `pg0`, and wait to receive them on `pg1`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Send packets with VLAN tag from sub_if0 to sub_if1
|
||||||
|
pkts = []
|
||||||
|
for i in range(NUM_PKTS): # NUM_PKTS = 67
|
||||||
|
pkt = (
|
||||||
|
Ether(src=self.pg0.remote_mac, dst=self.pg0.local_mac) / Dot1Q(vlan=10)
|
||||||
|
/ IP(src=sub_if0.remote_ip4, dst=sub_if1.remote_ip4) / UDP(sport=1234, dport=1234)
|
||||||
|
/ Raw(b"\xa5" * 100)
|
||||||
|
)
|
||||||
|
pkts.append(pkt)
|
||||||
|
|
||||||
|
# Send and verify packets are policed and forwarded
|
||||||
|
rx = self.send_and_expect(self.pg0, pkts, self.pg1)
|
||||||
|
|
||||||
|
stats = policer.get_stats()
|
||||||
|
# Verify policing happened
|
||||||
|
self.assertGreater(stats["conform_packets"], 0)
|
||||||
|
self.assertEqual(stats["exceed_packets"], 0)
|
||||||
|
self.assertGreater(stats["violate_packets"], 0)
|
||||||
|
|
||||||
|
self.logger.info(f"L3 sub-interface input policer stats: {stats}")
|
||||||
|
```
|
||||||
|
|
||||||
|
Similar to the L3 sub-interface input policer, I also write a test for L3 sub-interface output
|
||||||
|
policer. The only difference between the two is that in the output case, the policer is applied to
|
||||||
|
`pg1` in the `Dir.TX` direction, while in the input case, it's applied to `pg0` in the `Dir.RX`
|
||||||
|
direction.
|
||||||
|
|
||||||
|
I can predict the outcome. Every packet is exactly 146 bytes:
|
||||||
|
* 14 bytes src/dst MAC in `Ether()`
|
||||||
|
* 4 bytes VLAN tag (10) in `Dot1Q()`
|
||||||
|
* 20 bytes IPv4 header in `IP()`
|
||||||
|
* 8 bytes UDP header in `UDP()`
|
||||||
|
* 100 bytes of additional payload.
|
||||||
|
|
||||||
|
When allowing a burst of 1000 bytes, that means 6 packets should make it through (876 bytes) in the
|
||||||
|
`conform` bucket while the other 61 should be in the `violate` bucket. I won't see any packets in
|
||||||
|
the `exceed` bucket, because the policer I created is a simple one-rate, two-color `1R2C` policer
|
||||||
|
with `EB` set to 0, so every non-conforming packet goes straight to violate as there is no extra
|
||||||
|
budget in the exceed bucket. However they are all sent, because the action was set to transmit in
|
||||||
|
all cases.
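
The same prediction in a few lines of C, just to double-check the arithmetic:

```c
#include <stdio.h>

int
main (void)
{
  int pkt_bytes = 14 + 4 + 20 + 8 + 100; /* Ether + Dot1Q + IPv4 + UDP + payload = 146 */
  int cb_bytes = 1000, num_pkts = 67;
  int conform = cb_bytes / pkt_bytes;    /* full packets that fit in the burst */
  printf ("conform=%d (%d bytes), violate=%d\n",
          conform, conform * pkt_bytes, num_pkts - conform);
  /* -> conform=6 (876 bytes), violate=61 */
  return 0;
}
```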
|
||||||
|
|
||||||
|
```
|
||||||
|
pim@summer:~/src/vpp$ make test-debug TEST=test_policer_subif V=2
|
||||||
|
15:21:46,868 L3 sub-interface input policer stats: {'conform_packets': 7, 'conform_bytes': 896,
|
||||||
|
'exceed_packets': 0, 'exceed_bytes': 0, 'violate_packets': 60, 'violate_bytes': 7680}
|
||||||
|
15:21:47,919 L3 sub-interface output policer stats: {'conform_packets': 6, 'conform_bytes': 876,
|
||||||
|
'exceed_packets': 0, 'exceed_bytes': 0, 'violate_packets': 61, 'violate_bytes': 8906}
|
||||||
|
```
|
||||||
|
|
||||||
|
{{< image width="5em" float="left" src="/assets/shared/warning.png" alt="Warning" >}}
|
||||||
|
|
||||||
|
**Whoops!** So much for predicting the outcome! I see that 7 packets (896 bytes) make it through on input
|
||||||
|
while 6 packets (876 bytes) make it through on output. In the input case, the packet size is
|
||||||
|
`896/7 = 128` bytes, which is 18 bytes short. What's going on?
|
||||||
|
|
||||||
|
### Side Quest: Policer Accounting
|
||||||
|
|
||||||
|
On the vpp-dev mailing list, Ben points out that the accounting changes when moving from
`device-input` to `ip[46]-input`, because after device-input the packet buffer is advanced to the
L3 portion and starts at the IPv4 or IPv6 header. Considering I was using dot1q tagged
sub-interfaces, that means I will be short exactly 18 bytes. The reason this does not happen on
the way out is that the `ip[46]-rewrite` nodes have already wound back the buffer to be able to
insert the ethernet frame and encapsulation, so no adjustment is needed there.

Ben also points out that when applying the policer to the interface, I can detect at creation time
whether it's a PHY, a single-tagged or a double-tagged interface, and store some information to help
correct the accounting. We discuss it a little on the mailing list, and agree that it's best for all
four cases (L2 input/output and L3 input/output) to use the full L2 frame bytes in the accounting,
which, as an added benefit, also remains backwards compatible with the `device-input` accounting.
Chapeau, Ben, you're so clever!
|
||||||
|
|
||||||
|
I add a little helper function:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
static u8 vnet_policer_compute_l2_overhead (vnet_main_t *vnm, u32 sw_if_index, vlib_dir_t dir)
|
||||||
|
{
|
||||||
|
if (dir == VLIB_TX) return 0;
|
||||||
|
|
||||||
|
vnet_hw_interface_t *hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
|
||||||
|
if (PREDICT_FALSE (hi->hw_class_index != ethernet_hw_interface_class.index))
|
||||||
|
return 0; /* Not Ethernet */
|
||||||
|
|
||||||
|
vnet_sw_interface_t *si = vnet_get_sw_interface (vnm, sw_if_index);
|
||||||
|
if (si->type == VNET_SW_INTERFACE_TYPE_SUB) {
|
||||||
|
if (si->sub.eth.flags.one_tag) return 18; /* Ethernet + single VLAN */
|
||||||
|
if (si->sub.eth.flags.two_tags) return 22; /* Ethernet + QinQ */
|
||||||
|
}
|
||||||
|
|
||||||
|
return 14; /* Untagged Ethernet */
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
And in the policer struct, I also add a `l2_overhead_by_sw_if_index[dir][sw_if_index]` to store
|
||||||
|
these values. That way, I do not need to do this calculation for every packet in the dataplane, but
|
||||||
|
just blindly add the value I pre-computed at creation time. This is safe, because sub-interfaces
|
||||||
|
cannot change their encapsulation after being created.
|
||||||
|
|
||||||
|
In the `vnet_policer_police()` dataplane function, I add an `l2_overhead` argument, and then call it
|
||||||
|
like so:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
u16 l2_overhead0 = (is_l2) ? 0 : pm->l2_overhead_by_sw_if_index[dir][sw_if_index0];
|
||||||
|
act0 = vnet_policer_police (vm, b0, pi0, ..., l2_overhead0);
|
||||||
|
```
|
||||||
|
|
||||||
|
And with that, my two tests give the same results:
|
||||||
|
|
||||||
|
```
|
||||||
|
pim@summer:~/src/vpp$ make test-debug TEST=test_policer_subif V=2 | grep 'policer stats'
|
||||||
|
15:38:39,720 L3 sub-interface input policer stats: {'conform_packets': 6, 'conform_bytes': 876,
|
||||||
|
'exceed_packets': 0, 'exceed_bytes': 0, 'violate_packets': 61, 'violate_bytes': 8906}
|
||||||
|
15:38:40,715 L3 sub-interface output policer stats: {'conform_packets': 6, 'conform_bytes': 876,
|
||||||
|
'exceed_packets': 0, 'exceed_bytes': 0, 'violate_packets': 61, 'violate_bytes': 8906}
|
||||||
|
```
|
||||||
|
|
||||||
|
Yay, great success!
|
||||||
|
|
||||||
|
### Test: L2 in/output
|
||||||
|
|
||||||
|
The tests for the L2 input and output case are not radically different. In the setup, rather than
|
||||||
|
giving the VLAN sub-interfaces an IPv4 address, I'll just add them to a bridge-domain:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Create VLAN sub-interfaces on pg0 and pg1
|
||||||
|
sub_if0 = VppDot1QSubint(self, self.pg0, 30)
|
||||||
|
sub_if0.admin_up()
|
||||||
|
sub_if1 = VppDot1QSubint(self, self.pg1, 30)
|
||||||
|
sub_if1.admin_up()
|
||||||
|
|
||||||
|
# Add both sub-interfaces to bridge domain 1
|
||||||
|
self.vapi.sw_interface_set_l2_bridge(sub_if0.sw_if_index, bd_id=1)
|
||||||
|
self.vapi.sw_interface_set_l2_bridge(sub_if1.sw_if_index, bd_id=1)
|
||||||
|
```
|
||||||
|
|
||||||
|
This puts the sub-interfaces in L2 mode, after which the `l2-input` and `l2-output` feature bitmaps
|
||||||
|
kick in. Without further ado:
|
||||||
|
|
||||||
|
```
|
||||||
|
pim@summer:~/src/vpp$ make test-debug TEST=test_policer_subif V=2 | grep 'L2.*policer stats'
|
||||||
|
15:50:15,217 L2 sub-interface input policer stats: {'conform_packets': 6, 'conform_bytes': 876,
|
||||||
|
'exceed_packets': 0, 'exceed_bytes': 0, 'violate_packets': 61, 'violate_bytes': 8906}
|
||||||
|
15:50:16,217 L2 sub-interface output policer stats: {'conform_packets': 6, 'conform_bytes': 876,
|
||||||
|
'exceed_packets': 0, 'exceed_bytes': 0, 'violate_packets': 61, 'violate_bytes': 8906}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|
The policer works in all sorts of cool scenarios now. Let me give a concrete example, where I
|
||||||
|
create an L2XC with VTR and then apply a policer. I've written about VTR, which stands for _VLAN Tag
|
||||||
|
Rewriting_ before, in an old article lovingly called [[VPP VLAN Gymnastics]({{< ref
|
||||||
|
"2022-02-14-vpp-vlan-gym" >}})]. It all looks like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
vpp# create sub Gi10/0/0 100
|
||||||
|
vpp# create sub Gi10/0/1 200
|
||||||
|
vpp# set interface l2 xconnect Gi10/0/0.100 Gi10/0/1.200
|
||||||
|
vpp# set interface l2 xconnect Gi10/0/1.200 Gi10/0/0.100
|
||||||
|
vpp# set interface l2 tag-rewrite Gi10/0/0.100 pop 1
|
||||||
|
vpp# set interface l2 tag-rewrite Gi10/0/1.200 pop 1
|
||||||
|
vpp# policer add name pol-test rate kbps cir 150000 cb 15000000 conform-action transmit
|
||||||
|
vpp# policer input name pol-test Gi10/0/0.100
|
||||||
|
```
|
||||||
|
|
||||||
|
After applying this configuration, the input bitmap on Gi10/0/0.100 becomes `POLICER(14) | VTR(10) |
|
||||||
|
XCONNECT(1) | DROP(0)`. Packets now take the following path through the dataplane:
|
||||||
|
|
||||||
|
```
|
||||||
|
ethernet-input
|
||||||
|
→ l2-input (computes bitmap, dispatches to bit 14)
|
||||||
|
→ l2-policer-input (clears bit 14, polices, dispatches to bit 10)
|
||||||
|
→ l2-input-vtr (clears bit 10, pops 1 tag, dispatches to bit 1)
|
||||||
|
→ l2-output (XCONNECT: sw_if_index[TX]=Gi10/0/1.200)
|
||||||
|
→ inline output VTR (pushes 1 tag for .200)
|
||||||
|
→ interface-output
|
||||||
|
→ Gi10/0/1-tx
|
||||||
|
```
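
For completeness, the resulting input feature bitmap on `Gi10/0/0.100` is easy to compute from the
bit positions listed earlier (a trivial check, nothing VPP-specific):

```c
#include <stdio.h>

int
main (void)
{
  /* POLICER(14) | VTR(10) | XCONNECT(1) | DROP(0) */
  unsigned bitmap = (1u << 14) | (1u << 10) | (1u << 1) | (1u << 0);
  printf ("l2 input feature bitmap: 0x%04x\n", bitmap); /* 0x4403 */
  return 0;
}
```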
|
||||||
|
|
||||||
|
## What's Next
|
||||||
|
|
||||||
|
I've sent the change, which was only about ~300 LOC, off for review. You can follow along on the
|
||||||
|
gerrit on [[44654](https://gerrit.fd.io/r/c/vpp/+/44654)]. I don't think the policer got much slower
|
||||||
|
after adding the l2 path, and one might argue it doesn't matter because policing didn't work on
|
||||||
|
sub-interfaces or L2 output at all before this change. However, for the L3 input/output case, and
|
||||||
|
for the PHY input case, there are a few CPU cycles added now to address the L2 and sub-int use
|
||||||
|
cases. Perhaps I should do a side by side comparison of packets/sec throughput on the bench some
|
||||||
|
time.
|
||||||
|
|
||||||
|
It would be great if VPP supported FQ-CoDel (Flow Queue Controlled Delay), an algorithm and packet
scheduler designed to eliminate bufferbloat - the high latency caused by excessive buffering in
network equipment - while ensuring fair bandwidth distribution among competing traffic flows. I know
that Dave Täht - may he rest in peace - always wanted that.
|
||||||
|
|
||||||
|
For me, I've set my sights on eVPN VxLAN, and I also started toying with SRv6 L2 transport. I hope
|
||||||
|
that in the spring I'll have a bit more time to contribute to VPP and write about it. Stay tuned!
|
||||||
---
|
||||||
|
date: "2026-02-21T11:35:14Z"
|
||||||
|
title: VPP SRv6 L2VPN
|
||||||
|
---
|
||||||
|
|
||||||
|
{{< image width="200px" float="right" src="/assets/vpp/fdio-color.svg" alt="VPP" >}}
|
||||||
|
|
||||||
|
# About this series
|
||||||
|
|
||||||
|
Ever since I first saw VPP - the Vector Packet Processor - I have been deeply impressed with its
|
||||||
|
performance and versatility. For those of us who have used Cisco IOS/XR devices, like the classic
|
||||||
|
_ASR_ (aggregation service router), VPP will look and feel quite familiar as many of the approaches
|
||||||
|
are shared between the two.
|
||||||
|
|
||||||
|
Segment Routing is a lesser known technique that allows network operators to determine a path
|
||||||
|
through their network by encoding the path inside headers in the packet itself, rather than relying
|
||||||
|
on the IGP to determine the path. Originally created to help traffic engineering of MPLS packets,
|
||||||
|
the concepts were carried forward for IPv6 as well.
|
||||||
|
|
||||||
|
In this article I take SRv6 out for a spin, implement some missing features in VPP, and stumble
across - and manage to fix - a nasty bug in its implementation.
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
SRv6 - Segment Routing for IPv6 - is defined in a number of RFCs.
|
||||||
|
|
||||||
|
1. [[RFC 8402](https://datatracker.ietf.org/doc/html/rfc8402)]: Segment Routing Architecture. This
|
||||||
|
document describes the fundamentals. It defines the general concepts of Segment Routing (nodes,
|
||||||
|
segments, and steering) for both MPLS and IPv6.
|
||||||
|
1. [[RFC 8754](https://datatracker.ietf.org/doc/html/rfc8754)]: IPv6 Segment Routing Header (SRH).
|
||||||
|
This RFC defines the specific IPv6 Extension Header used for SRv6. It explains how segments are listed
|
||||||
|
and how the Segments Left field works.
|
||||||
|
1. [[RFC 8986](https://datatracker.ietf.org/doc/html/rfc8986)]: SRv6 Network Programming.
|
||||||
|
This one describes the so-called "behaviors" associated with a Segment ID (SID). It defines functions
|
||||||
|
like End (Endpoint), End.X (Layer-3 cross-connect), and End.DT4/6 (VRF decapsulation).
|
||||||
|
|
||||||
|
While reading these RFCs, I learn that I can configure an SRv6 path through the network that picks
|
||||||
|
up an ethernet packet on the ingress, and decapsulates and cross connects that ethernet packet to an
|
||||||
|
interface on the egress: an L2VPN using Ethernet-over-IPv6. That sounds dope to me!
|
||||||
|
|
||||||
|
### SRv6 in VPP - Segment Routing Header
|
||||||
|
|
||||||
|
For the dataplane, there are two parts of note. Firstly, when an IPv6 packet arrives with an IPv6
|
||||||
|
extension header, the so-called _Segment Routing Header_ or SRH, any router supporting SRv6 needs to
|
||||||
|
inspect it. The presence of an SRH changes the forwarding logic from a simple "look at the
|
||||||
|
destination, do a FIB lookup for next hop, and send the packet on its merry way" to a more
|
||||||
|
customized "process the instruction and update the IPv6 headers" kind of thing.
|
||||||
|
|
||||||
|
In IPv6, an (almost) arbitrary number of headers can be chained from the base IPv6 packet header to
the ultimate layer-4 protocol header like ICMP, TCP or UDP. In IPv4 this is not the case: there is
only the L3 header (IPv4) and the L4 header (TCP/UDP/ICMP etc). The chained headers of interest here
are the Routing extension headers, and the SRH is the one with routing type 4.
|
||||||
|
|
||||||
|
The fields in this header are (a sketch of the layout in C follows this list):
|
||||||
|
|
||||||
|
* ***Next Header***: Identifies the type of header following the SRH. It can be another routing
|
||||||
|
extension header or it might be the Layer4 protocol header like TCP, UDP or ICMP.
|
||||||
|
* ***Flags***: IANA loves reserving optionality for the future. The authors of SRv6 added an 8-bit
|
||||||
|
flags field, but none of them have been assigned yet.
|
||||||
|
* ***Tag***: Moar optionality! This 16-bit tag is not defined in the RFC, simply stating that _The
|
||||||
|
allocation and use of tag is outside the scope of this document_. OK then!
|
||||||
|
* ***Segments Left (SL)***: A counter indicating how many intermediate nodes still need to be
|
||||||
|
visited.
|
||||||
|
* ***Last Entry***: The index (starting from 0) of the last element in the Segment List.
|
||||||
|
* ***Segment List***: This is an array of 128-bit IPv6 addresses, listed in reverse order of the
|
||||||
|
path. The first segment to be visited is at the highest index.
|
||||||
|
* (optional) ***TLVs***: These Type-Length-Value objects can encode other information, like HMAC
|
||||||
|
signatures, operational and performance monitoring data, and so on.
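
Putting these fields together, the fixed part of the SRH can be sketched as a C struct straight from
RFC 8754 - note this mirrors the wire format and is not copied from VPP's own headers:

```c
#include <stdint.h>

/* IPv6 Segment Routing Header (Routing Type 4), fixed part, per RFC 8754.
 * The segment list follows as 'last_entry + 1' 128-bit addresses, and any
 * TLVs come after that, padding out to (hdr_ext_len + 1) * 8 bytes. */
typedef struct __attribute__ ((packed))
{
  uint8_t  next_header;     /* protocol after the SRH, e.g. 143 = Ethernet */
  uint8_t  hdr_ext_len;     /* length in 8-byte units, not counting the first 8 */
  uint8_t  routing_type;    /* 4 = Segment Routing Header */
  uint8_t  segments_left;   /* segments still to be visited */
  uint8_t  last_entry;      /* index of the last element of the segment list */
  uint8_t  flags;           /* none assigned yet */
  uint16_t tag;             /* usage out of scope of RFC 8754 */
  uint8_t  segments[][16];  /* segment list, in reverse order of the path */
} ip6_srh_t;

_Static_assert (sizeof (ip6_srh_t) == 8, "fixed SRH part is 8 bytes");
```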
|
||||||
|
|
||||||
|
### SRv6: Anatomy
|
||||||
|
|
||||||
|
{{< image width="14em" float="right" src="/assets/vpp-srv6/magnets.jpg" alt="Insane Clown" >}}
|
||||||
|
|
||||||
|
Much like magnets, you might be wondering _SRv6 Routers: How do they work?_. There are really only
|
||||||
|
three relevant things: SR Policy (they determine how packets are steered into the SRv6 routing
|
||||||
|
domain), SRv6 Source nodes (they handle the ingress part), and SRv6 Segment Endpoint Nodes (they
|
||||||
|
handle both the intermediate routers that participate in SRv6, and also the egress part where the
|
||||||
|
packet leaves the SRv6 routing domain).
|
||||||
|
|
||||||
|
#### SRv6: Policies
|
||||||
|
|
||||||
|
A _Segment Routing Policy_ is the same for MPLS and SRv6. They are represented by either a stack of
|
||||||
|
MPLS labels, or by a stack of IPv6 addresses, and they are uniquely identified by either an MPLS
|
||||||
|
label or an IPv6 address as well. The identifier is called a _Binding Segment ID_ or BSID, and the
|
||||||
|
elements of the list are called _Segment IDs_ or SIDs.
|
||||||
|
|
||||||
|
```
|
||||||
|
BSID := SID [, SID] [, SID] ...
|
||||||
|
8298::1 := 2001:db8::1 , 2001:db8::2 , 2001:db8::3
|
||||||
|
```
|
||||||
|
|
||||||
|
These policies are written to the FIB in the router. I can now do a lookup for `8298::1`, and find
|
||||||
|
that it points to this _SR Policy_ object with the list of three IPv6 addresses. In the case of
|
||||||
|
MPLS, the _BSID_ will be in the MPLS FIB and point at a list of three MPLS labels, but I'm going to
|
||||||
|
stop talking about MPLS now :)
|
||||||
|
|
||||||
|
#### SRv6: Source Node
|
||||||
|
|
||||||
|
An _SR Source Node_ originates an IPv6 packet with a Segment in the destination address, and it
|
||||||
|
optionally adds an SRH with a list of instructions for the network. The _SR Source Node_ is the
|
||||||
|
ingress point and enables SRv6 processing in the network, which is called _steering_. Instead of
|
||||||
|
setting the destination address to the final destination, the source node will set it to the first
|
||||||
|
Segment, which is the first router that needs to be visited.
|
||||||
|
|
||||||
|
#### SRv6: Transit Node
|
||||||
|
|
||||||
|
Spoiler alert! This node type doesn't have to do anything special for SRv6. SRv6 packets really do
look like normal IPv6 packets: the source address is the _SR Source Node_ and the destination
address is the next _Segment Endpoint_. A _Transit Node_ just forwards them like any other packet
using its routing table. Notably, these routers do not actively participate in SRv6 and they don't
need to know anything about it.
|
||||||
|
|
||||||
|
#### SRv6: Segment Endpoint Node
|
||||||
|
|
||||||
|
The _Segment Endpoint Node_ is a router that is SRv6 capable. A packet may arrive with a locally
|
||||||
|
configured address in the IPv6 destination. The magic happens here - one of two things can occur:
|
||||||
|
|
||||||
|
1. The _Segment Routing Header_ is inspected. If _Segments Left_ is 0, then the next header
|
||||||
|
(typically UDP, TCP, ICMP) is processed. Otherwise, the next segment is read from the _Segment
|
||||||
|
List_, and the IPv6 destination address is overwritten with it. The _Segments Left_ field is
|
||||||
|
decremented. In this case the packet is routed normally through a bunch of potential transit
|
||||||
|
routers, who are blissfully ignorant of what is happening, and onto a next _Segment Endpoint_
|
||||||
|
router (see the sketch of this _End_ processing after the list).
|
||||||
|
|
||||||
|
1. The IPv6 destination address might have an entry in the forwarding table which points at a
|
||||||
|
specific local meaning, called a _Local Segment ID_ or _LocalSID_. The LocalSID tells this router
|
||||||
|
what to do, for example decapsulate the packet and do a next-hop lookup in a specific routing table,
|
||||||
|
useful for L3VPNs; or perhaps an instruction to decapsulate the packet and cross connect it to a
|
||||||
|
local interface, useful for L2VPN. The key insight here is, that the local FIB entry can carry any
|
||||||
|
type of further instruction.
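
Here is a minimal sketch of that first case, the plain _End_ behavior from RFC 8986: advance the
destination address to the next segment and decrement _Segments Left_. This is an illustration
only, not VPP's `end_srh_processing()`:

```c
#include <stdint.h>
#include <stdio.h>

typedef struct { uint8_t addr[16]; } ip6_addr_t;

/* Just the SRH fields this behavior needs. */
typedef struct {
  uint8_t    segments_left;   /* segments still to be visited */
  uint8_t    last_entry;      /* index of the last element in the list */
  ip6_addr_t segment_list[3]; /* reverse order: [0] is the final segment */
} srh_t;

/* RFC 8986 "End" behavior at a Segment Endpoint Node, heavily simplified:
 * if segments remain, overwrite the IPv6 destination with the next segment
 * and decrement Segments Left; otherwise hand the packet to whatever local
 * behavior the SID encodes (End.DX2, End.DT4/6, ...). */
static int
srv6_end (srh_t *srh, ip6_addr_t *ip6_dst)
{
  if (srh->segments_left == 0)
    return 0;                 /* final segment: local behavior takes over */
  srh->segments_left--;
  *ip6_dst = srh->segment_list[srh->segments_left];
  return 1;                   /* re-forward towards the new destination */
}

int
main (void)
{
  srh_t srh = { .segments_left = 2, .last_entry = 2 };
  ip6_addr_t dst = { { 0 } };
  while (srv6_end (&srh, &dst))
    printf ("advance: segments_left is now %u\n", srh.segments_left);
  return 0;
}
```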
|
||||||
|
|
||||||
|
## VPP: IPng LAB
|
||||||
|
|
||||||
|
At this point I'm pretty sure I've bored you to tears with all the RFC stuff and theory. I do think
|
||||||
|
that segment routing (both the MPLS and the SRv6 variant) are sufficiently complex that taking a
|
||||||
|
read of the main RFCs at least once is useful. But for me, the fun part is seeing it work in
|
||||||
|
practice. So I boot the [[IPng Lab]({{< ref 2022-10-14-lab-1 >}})], which looks a bit like this.
|
||||||
|
|
||||||
|
{{< image width="100%" src="/assets/lab/LAB v2.svg" alt="Logical" >}}
|
||||||
|
|
||||||
|
In this environment, each of the VPP routers is running Bird2 with OSPF and OSPFv3. They are
|
||||||
|
connected in a string, and each VPP router has an interface (`Gi10/0/2`) connected to a debian host
|
||||||
|
called `host0-0` (at the bottom), as well as an interface (`Gi10/0/3`) connected to a host called
|
||||||
|
`host0-1` (at the top). One really cool feature of the LAB is that all links are on an OpenVSwitch
|
||||||
|
which is mirroring all traffic to a tap host called `tap0-0`, so I can see traffic clearly:
|
||||||
|
|
||||||
|
```
|
||||||
|
root@vpp0-0:/etc/bird# ping -n 2001:678:d78:200::3 -c1
|
||||||
|
PING 2001:678:d78:200::3 (2001:678:d78:200::3) 56 data bytes
|
||||||
|
64 bytes from 2001:678:d78:200::3: icmp_seq=1 ttl=62 time=3.24 ms
|
||||||
|
|
||||||
|
--- 2001:678:d78:200::3 ping statistics ---
|
||||||
|
1 packets transmitted, 1 received, 0% packet loss, time 0ms
|
||||||
|
rtt min/avg/max/mdev = 3.240/3.240/3.240/0.000 ms
|
||||||
|
|
||||||
|
root@tap0-0:~# tcpdump -eni enp16s0f0
|
||||||
|
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
|
||||||
|
listening on enp16s0f0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
|
||||||
|
10:39:23.558942 52:54:00:f0:11:01 > 52:54:00:f0:11:10, ethertype 802.1Q (0x8100), length 122: vlan 20, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), 2001:678:d78:201::1:0 > 2001:678:d78:200::3: ICMP6, echo request, id 12, seq 1, length 64
|
||||||
|
10:39:23.558942 52:54:00:f0:11:11 > 52:54:00:f0:11:20, ethertype 802.1Q (0x8100), length 122: vlan 21, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), 2001:678:d78:201::1:0 > 2001:678:d78:200::3: ICMP6, echo request, id 12, seq 1, length 64
|
||||||
|
10:39:23.559993 52:54:00:f0:11:21 > 52:54:00:f0:11:30, ethertype 802.1Q (0x8100), length 122: vlan 22, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), 2001:678:d78:201::1:0 > 2001:678:d78:200::3: ICMP6, echo request, id 12, seq 1, length 64
|
||||||
|
|
||||||
|
10:39:23.560179 52:54:00:f0:11:30 > 52:54:00:f0:11:21, ethertype 802.1Q (0x8100), length 122: vlan 22, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), 2001:678:d78:200::3 > 2001:678:d78:201::1:0: ICMP6, echo reply, id 12, seq 1, length 64
|
||||||
|
10:39:23.561070 52:54:00:f0:11:20 > 52:54:00:f0:11:11, ethertype 802.1Q (0x8100), length 122: vlan 21, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), 2001:678:d78:200::3 > 2001:678:d78:201::1:0: ICMP6, echo reply, id 12, seq 1, length 64
|
||||||
|
10:39:23.561248 52:54:00:f0:11:10 > 52:54:00:f0:11:01, ethertype 802.1Q (0x8100), length 122: vlan 20, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), 2001:678:d78:200::3 > 2001:678:d78:201::1:0: ICMP6, echo reply, id 12, seq 1, length 64
|
||||||
|
```
|
||||||
|
|
||||||
|
Here you can see the packet path from `vpp0-0` sending one ICMPv6 echo request to `vpp0-3`, which
|
||||||
|
responded with one ICMPv6 echo reply. I can see the packet on vlan 20, 21, 22 on the way out, and
|
||||||
|
back again on vlan 22, 21 and 20.
|
||||||
|
|
||||||
|
### VPP: SRv6 Example
|
||||||
|
|
||||||
|
Alright, here I go! With the following short snippet, I can sum up all of the theory above in a
|
||||||
|
practical first example:
|
||||||
|
|
||||||
|
```
|
||||||
|
vpp0-0# set sr encaps source addr 2001:678:d78:200::
|
||||||
|
vpp0-0# sr policy add bsid 8298::2:1 next 2001:678:d78:20F::3:1 encap
|
||||||
|
vpp0-0# sr steer l2 GigabitEthernet10/0/2 via bsid 8298::2:1
|
||||||
|
vpp0-0# sr localsid address 2001:678:d78:20f::0:1 behavior end.dx2 GigabitEthernet10/0/2
|
||||||
|
vpp0-0# set int state GigabitEthernet10/0/2 up
|
||||||
|
```
|
||||||
|
|
||||||
|
Looking at what I typed on `vpp0-0`, first I tell the system that its encapsulation source address
|
||||||
|
is its IPv6 loopback address. Then I add a _Binding SID_ with one _Segment ID_ and I instruct this
|
||||||
|
policy to encapsulate the packet. Then, I add an L2 steering from interface `Gi10/0/2` via this
|
||||||
|
_BSID_. At this point, `vpp0-0` knows that if an ethernet frame comes in on that interface, it needs
|
||||||
|
to encapsulate it in SRv6 from `2001:678:d78:200::` and send it to `2001:678:d78:20F::3:1`. Finally,
|
||||||
|
I tell the system that if an IPv6 packet arrives with destination address `2001:678:d78:20f::0:1`,
|
||||||
|
that it needs to decapsulate it and send the resulting L2 datagram out on Gi10/0/2.
|
||||||
|
|
||||||
|
There is one last thing I have to do, and that's somehow attract this `2001:678:d78:20F::0:0/112` prefix
|
||||||
|
to `vpp0-0` and `2001:678:d78:20F::3:0/112` prefix to `vpp0-3`. I can do this by adding the prefix
|
||||||
|
to `loop0`, like so:
|
||||||
|
|
||||||
|
```
|
||||||
|
vpp0-0# create loopback interface instance 0
|
||||||
|
vpp0-0# set interface state loop0 up
|
||||||
|
vpp0-0# set interface ip address loop0 192.168.10.0/32
|
||||||
|
vpp0-0# set interface ip address loop0 2001:678:d78:200::0/128
|
||||||
|
vpp0-0# set interface ip address loop0 2001:678:d78:20F::0:0/112
|
||||||
|
```
|
||||||
|
|
||||||
|
This will be picked up in OSPFv3, and all routers will install a FIB entry pointing at `vpp0-0` for
|
||||||
|
the /112. Did it work?
|
||||||
|
|
||||||
|
```
|
||||||
|
root@host0-0:~# ping6 ff02::1%enp16s0f0
|
||||||
|
PING ff02::1%enp16s0f0 (ff02::1%enp16s0f0) 56 data bytes
|
||||||
|
64 bytes from fe80::5054:ff:fef0:1000%enp16s0f0: icmp_seq=1 ttl=64 time=0.156 ms
|
||||||
|
64 bytes from fe80::5054:ff:fef0:1013%enp16s0f0: icmp_seq=1 ttl=64 time=4.03 ms
|
||||||
|
^C
|
||||||
|
--- ff02::1%enp16s0f0 ping statistics ---
|
||||||
|
1 packets transmitted, 1 received, +1 duplicates, 0% packet loss, time 0ms
|
||||||
|
rtt min/avg/max/mdev = 0.156/2.092/4.029/1.936 ms
|
||||||
|
```
|
||||||
|
|
||||||
|
{{< image width="12em" float="right" src="/assets/vpp-srv6/hannibal-plan.png" alt="Hannibal Smith loves it" >}}
|
||||||
|
|
||||||
|
Yes, it worked! I love it when a plan comes together! This IPv6 address that I pinged, `ff02::1` is
|
||||||
|
called `all-hosts`, and I can see one reply from `fe80::5054:ff:fef0:1000` which is host0-0's own
|
||||||
|
link-local address, and a second reply from `fe80::5054:ff:fef0:1013` which is host0-1's address.
|
||||||
|
I have created a point to point L2VPN or _Virtual Leased Line_ between `vpp0-0:Gi10/0/2` and
|
||||||
|
`vpp0-3:Gi10/0/3` and any ethernet traffic between these two ports is passed through the network as
|
||||||
|
IPv6 packets including segment routing. Nice going!
|
||||||
|
|
||||||
|
### SRv6 on the Wire
|
||||||
|
|
||||||
|
I learn something curious. I configure an IPv4 address on both hosts:
|
||||||
|
|
||||||
|
```
|
||||||
|
root@host0-0:~# ip addr add 192.0.2.0/31 dev enp16s0f0
|
||||||
|
|
||||||
|
root@host0-1:~# ip addr add 192.0.2.1/31 dev enp16s0f3
|
||||||
|
root@host0-1:~# ping 192.0.2.0
|
||||||
|
PING 192.0.2.0 (192.0.2.0) 56(84) bytes of data.
|
||||||
|
64 bytes from 192.0.2.0: icmp_seq=1 ttl=64 time=5.27 ms
|
||||||
|
^C
|
||||||
|
--- 192.0.2.0 ping statistics ---
|
||||||
|
1 packets transmitted, 1 received, 0% packet loss, time 0ms
|
||||||
|
rtt min/avg/max/mdev = 5.274/5.274/5.274/0.000 ms
|
||||||
|
```
|
||||||
|
|
||||||
|
And then I take a look at this IPv4 ICMP packet on the wire:
|
||||||
|
|
||||||
|
```
|
||||||
|
11:03:22.118770 52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype 802.1Q-QinQ (0x88a8), length 102: vlan 30, p 0,
|
||||||
|
ethertype IPv4 (0x0800), (tos 0x0, ttl 64, id 35014, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 50, seq 1, length 64
|
||||||
|
|
||||||
|
11:03:22.119078 52:54:00:f0:11:01 > 52:54:00:f0:11:10, ethertype 802.1Q (0x8100), length 156: vlan 20, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), (flowlabel 0x09d8f, hlim 63, next-header Ethernet (143) payload length: 98) 2001:678:d78:200:: > 2001:678:d78:20f::3:1:
|
||||||
|
52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 35014, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 50, seq 1, length 64
|
||||||
|
```
|
||||||
|
|
||||||
|
The first packet is coming in on vlan 30 (`host0-0:enp16s0f0` to `vpp0-0:Gi10/0/2`). I then see it
|
||||||
|
go out on vlan 20 (from `vpp0-0` to `vpp0-1`). I see it is an IPv6 packet from `2001:678:d78:200::`
|
||||||
|
(the encapsulation address I configured), and to `2001:678:d78:20f::3:1` (the _BSID_ resolves to an
|
||||||
|
_SR Policy_ with a single segment: this address), and then I see the Ethernet inner payload with the
|
||||||
|
ICMP echo packet. But where's the _Segment Routing Header_??
|
||||||
|
|
||||||
|
It is here that I learn why the RFC says that SRH are optional. This packet has everything it needs
|
||||||
|
to have using the destination address, `2001:678:d78:20f::3:1`, which is routed towards the loopback
|
||||||
|
interface of `vpp0-3`. There, it is looked up in the FIB and the _Local Segment ID_ or LocalSID
|
||||||
|
determines that packets to this address must be decapsulated and forwarded out on `vpp0-3:Gi10/0/3`.
|
||||||
|
|
||||||
|
### VPP: Let's ZigZag
|
||||||
|
|
||||||
|
So how do I get these elusive SRH headers? Easy: make more than one segment in the BSID, because
|
||||||
|
then, the _SR Source Node_ will have to encode it in the _Segment List_, for which it needs to
|
||||||
|
construct an SRH.
|
||||||
|
|
||||||
|
I want to tell `vpp0-0` to do some scenic routing. I want it to send the packet first to `vpp0-2`,
|
||||||
|
then `vpp0-1` and then `vpp0-3`. I struggle a little bit, because how should I construct the
|
||||||
|
_Segment List_ ? If I put `vpp0-2`'s loopback address in there, the packet will be seen as local,
|
||||||
|
and sent for local processing, in VPP's `ip6-receive` node. I don't want that to happen, instead I
|
||||||
|
want VPP to inspect the SRH in this case. After reading a little bit in
|
||||||
|
`src/vnet/srv6/sr_localsid.c`, I realize the trick is simple (once you know it, of course): I need
|
||||||
|
to tell all routers to handle a specific localsid as _End_ behavior, which will make the
|
||||||
|
intermediate routers run `end_srh_processing()` which processes the SRH and does the destination
|
||||||
|
swap.
|
||||||
|
|
||||||
|
```
|
||||||
|
vpp0-3# sr localsid address 2001:678:d78:20F::3:ffff behavior end
|
||||||
|
vpp0-2# sr localsid address 2001:678:d78:20F::2:ffff behavior end
|
||||||
|
vpp0-1# sr localsid address 2001:678:d78:20F::1:ffff behavior end
|
||||||
|
vpp0-0# sr localsid address 2001:678:d78:20F::0:ffff behavior end
|
||||||
|
vpp0-0# sr policy add bsid 8298::2:2 next 2001:678:d78:20F::2:ffff next 2001:678:d78:20F::1:ffff
|
||||||
|
next 2001:678:d78:20f::3:1 encap
|
||||||
|
```
|
||||||
|
|
||||||
|
Now each router knows that if an IPv6 packet is destined to its `:ffff` address, it needs to "End"
the segment by inspecting the SRH. And the _SR Policy_ on `vpp0-0` is to send the packet first to
`::2:ffff`, which is `vpp0-2`, which then inspects the SRH and advances the _Segment List_.
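
To connect this back to the SRH fields: the source node writes the three segments in reverse order
and starts _Segments Left_ at 2, so the destination address walks `::2:ffff`, then `::1:ffff`, and
finally `::3:1` - exactly what the tcpdump below shows. A tiny standalone illustration of that
indexing (addresses as plain strings, purely for clarity):

```c
#include <stdio.h>

int
main (void)
{
  /* Segment list as encoded on the wire: reverse order of the path. */
  const char *segment_list[] = {
    "2001:678:d78:20f::3:1",    /* [0] final segment: End.DX2 on vpp0-3 */
    "2001:678:d78:20f::1:ffff", /* [1] End on vpp0-1 */
    "2001:678:d78:20f::2:ffff", /* [2] first segment: End on vpp0-2 */
  };
  int last_entry = 2;

  /* The source node sets the destination to the first segment, and every
   * End localsid then decrements Segments Left and overwrites the
   * destination with the next entry, until segments_left reaches 0. */
  for (int segments_left = last_entry; segments_left >= 0; segments_left--)
    printf ("segments_left=%d destination=%s\n", segments_left,
            segment_list[segments_left]);
  return 0;
}
```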
|
||||||
|
|
||||||
|
|
||||||
|
The proof is in the tcpdump pudding, and it makes me smile to see the icmp-echo packet bounce back
|
||||||
|
and forward on its scenic route:
|
||||||
|
|
||||||
|
```
|
||||||
|
root@tap0-0:~# tcpdump -veni enp16s0f0 src 2001:678:d78:200::
|
||||||
|
tcpdump: listening on enp16s0f0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
|
||||||
|
|
||||||
|
12:15:39.442587 52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype 802.1Q-QinQ (0x88a8), length 102: vlan 30, p 0,
|
||||||
|
ethertype IPv4 (0x0800), (tos 0x0, ttl 64, id 5534, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 51, seq 561, length 64
|
||||||
|
|
||||||
|
12:15:39.501353 52:54:00:f0:11:01 > 52:54:00:f0:11:10, ethertype 802.1Q (0x8100), length 212: vlan 20, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), (flowlabel 0x09d8f, hlim 63, next-header Routing (43) payload length: 154)
|
||||||
|
2001:678:d78:200:: > 2001:678:d78:20f::2:ffff: RT6 (len=6, type=4, segleft=2, last-entry=2, flags=0x0, tag=0, [0]2001:678:d78:20f::3:1, [1]2001:678:d78:20f::1:ffff, [2]2001:678:d78:20f::2:ffff)
|
||||||
|
52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 64406, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 51, seq 6, length 64
|
||||||
|
|
||||||
|
12:15:39.501902 52:54:00:f0:11:11 > 52:54:00:f0:11:20, ethertype 802.1Q (0x8100), length 212: vlan 21, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), (flowlabel 0x09d8f, hlim 62, next-header Routing (43) payload length: 154)
|
||||||
|
2001:678:d78:200:: > 2001:678:d78:20f::2:ffff: RT6 (len=6, type=4, segleft=2, last-entry=2, flags=0x0, tag=0, [0]2001:678:d78:20f::3:1, [1]2001:678:d78:20f::1:ffff, [2]2001:678:d78:20f::2:ffff)
|
||||||
|
52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 64406, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 51, seq 6, length 64
|
||||||
|
|
||||||
|
12:15:39.502658 52:54:00:f0:11:20 > 52:54:00:f0:11:11, ethertype 802.1Q (0x8100), length 212: vlan 21, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), (flowlabel 0x09d8f, hlim 61, next-header Routing (43) payload length: 154)
|
||||||
|
2001:678:d78:200:: > 2001:678:d78:20f::1:ffff: RT6 (len=6, type=4, segleft=1, last-entry=2, flags=0x0, tag=0, [0]2001:678:d78:20f::3:1, [1]2001:678:d78:20f::1:ffff, [2]2001:678:d78:20f::2:ffff)
|
||||||
|
52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 64406, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 51, seq 6, length 64
|
||||||
|
|
||||||
|
12:15:39.502990 52:54:00:f0:11:11 > 52:54:00:f0:11:20, ethertype 802.1Q (0x8100), length 212: vlan 21, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), (flowlabel 0x09d8f, hlim 60, next-header Routing (43) payload length: 154)
|
||||||
|
2001:678:d78:200:: > 2001:678:d78:20f::3:1: RT6 (len=6, type=4, segleft=0, last-entry=2, flags=0x0, tag=0, [0]2001:678:d78:20f::3:1, [1]2001:678:d78:20f::1:ffff, [2]2001:678:d78:20f::2:ffff)
|
||||||
|
52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 64406, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 51, seq 6, length 64
|
||||||
|
|
||||||
|
12:15:39.503813 52:54:00:f0:11:21 > 52:54:00:f0:11:30, ethertype 802.1Q (0x8100), length 212: vlan 22, p 0,
|
||||||
|
ethertype IPv6 (0x86dd), (flowlabel 0x09d8f, hlim 59, next-header Routing (43) payload length: 154)
|
||||||
|
2001:678:d78:200:: > 2001:678:d78:20f::3:1: RT6 (len=6, type=4, segleft=0, last-entry=2, flags=0x0, tag=0, [0]2001:678:d78:20f::3:1, [1]2001:678:d78:20f::1:ffff, [2]2001:678:d78:20f::2:ffff)
|
||||||
|
52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 64406, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 51, seq 6, length 64
|
||||||
|
|
||||||
|
12:15:39.525605 52:54:00:f0:10:00 > 52:54:00:f0:10:13, ethertype 802.1Q-QinQ (0x88a8), length 102: vlan 43, p 0,
|
||||||
|
ethertype IPv4 (0x0800), (tos 0x0, ttl 64, id 5534, offset 0, flags [DF], proto ICMP (1), length 84)
|
||||||
|
192.0.2.0 > 192.0.2.1: ICMP echo request, id 51, seq 561, length 64
|
||||||
|
```

The echo-request packet can be observed seven times:

1. coming in on vlan 30 (between `host0-0` and `vpp0-0:Gi10/0/2`), here it is simply an IPv4
packet.
1. on vlan 20, encapsulated in an IPv6 packet, this time _including_ the SRH header showing where it
is expected to go.
1. on vlan 21, because the first segment wants the packet to go to `vpp0-2`, and `vpp0-1` is
acting as a transit router (just normally using an IPv6 FIB lookup to pass it along).
1. on vlan 21 again, because when `vpp0-2` got it, it decremented the SRH _Segments Left_ from 2
to 1, and sent it to the second segment, which is onwards to `vpp0-1`.
1. on vlan 21 yet again, because when `vpp0-1` got it, it decremented the SRH _Segments Left_ from
1 to 0, and sent it to the third and final segment, which is onwards to `vpp0-3`.
1. on vlan 22, because `vpp0-2` is acting as a transit router here (the destination is now `vpp0-3`,
not its own localsid), using its FIB to pass it along to `vpp0-3`, which decapsulates it with End.DX2
and sends it as an L2 packet on Gi10/0/3.
1. coming out of vlan 43 (between `vpp0-3:Gi10/0/3` and `host0-1`), where it is simply an IPv4
packet again.

Some folks find it easier to visualize packets by looking at Wireshark output. I grabbed one of the
packets from the wire, and here's what it looks like:

{{< image width="100%" src="/assets/vpp-srv6/wireshark.png" alt="Wireshark SRv6 packet with SRH" >}}

The screenshot shows the packet observed on step 4 above - it is coming from `vpp0-0`'s loopback address and
destined to the End localsid on `vpp0-1`, and I can see that the SRH has the list of 3 Segments in
reversed order, where `Address[0]` is the final destination: a _LocalSID_ on `vpp0-3` configured as End.DX2. I
can also see that _Segments Left_ is set to 1.

VPP has a few relevant dataplane nodes:

1. ***sr-pl-rewrite-encaps-l2***: This node encapsulates ethernet at the ingress point by steering
packets into an _SR Policy_ named by its _Binding Segment ID_
1. ***sr-localsid***: This node implements End behavior, in this case sending to the next Segment
Router by looking up its _Local Segment ID_ in the FIB
1. ***sr-localsid-d***: This node decapsulates the ethernet, on an `End.DX2` behavior, by looking
up its _Local Segment ID_ in the FIB

## VPP: Adding SRv6 encap/decap on sub-interface

A few years ago, I thought maybe it'd be cool to use SRv6 for L2VPN at IPng. But I was quickly
disappointed, because SRv6 encap and decap are only implemented on the `device-input` path, which means
they will not work with sub-interfaces.

A few weeks ago, I worked on Gerrit [[44654](https://gerrit.fd.io/r/c/vpp/+/44654)], which
implements policers on sub-interfaces. I wrote about it in a [[policer article]({{< ref
2026-02-14-vpp-policers >}})], but since my brain's instruction cache is still warm with the code I
wrote to enable L2 features on input and output, I thought I'd give it another go. If you're not
interested in the software engineering parts, you can stop reading now :-)

***0. Remove vlan_index everywhere***

The original author followed the RFC, where there is an `End.DX2V` behavior that allows decapsulating
to a VLAN tag on an interface, but they never implemented it and added a note to the
code to that effect. I can see why: DX2V is not idiomatic for VPP, but there's an alternative. It
would make more sense to decapsulate with `End.DX2` to a sub-interface. So I removed this from the
codebase in all places except the API functions, where I marked them as 'not implemented', which is
true at this point anyway.

***1. Add feature bitmap entries***

I added `L2INPUT_FEAT_SRV6` to `l2_input.h`. This allows me to turn on an SRv6 feature bit, and on
ingress, send L2 datagrams from the `l2-input` node directly to the `sr-pl-rewrite-encaps-l2` node,
regardless of the interface being a PHY like `Gi10/0/0` or a SUB like `Gi10/0/0.100`. It comes at a
small CPU cost though, because moving on the `device-input` arc directly to the encapsulation node
will skip a bunch of L2 processing, like L2 ACL, and VLAN TAG Rewriting (which doesn't make sense on
an untagged interface anyway). But, in return I can apply SRv6 encapsulation to any interface type.

***2. Precompute DX2 headerlen***

In the case of an `End.DX2` to a sub-interface, I need to add either 4 bytes (single tag) or 8 bytes
(QinQ or QinAD double tag) to the packet length. I know which at creation time, because I can look
that up from the to-be-DX2'd interface. I'll store this in the localsid structure as `ls->l2_len`
(either 14, 18, or 22 bytes).
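
To make the arithmetic concrete, here is a tiny sketch in Go - illustrative only, this is not the
VPP C code: an untagged ethernet header is 14 bytes, and every VLAN tag adds 4, which is where the
14, 18 and 22 above come from.

```go
package main

import "fmt"

// l2HeaderLen returns the ethernet header length End.DX2 has to account for:
// 14 bytes untagged, plus 4 bytes per VLAN tag (dot1q single, or QinQ/QinAD double).
func l2HeaderLen(numTags int) int {
	return 14 + 4*numTags
}

func main() {
	fmt.Println(l2HeaderLen(0), l2HeaderLen(1), l2HeaderLen(2)) // 14 18 22
}
```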

***3a. Connect to `l2-input` on ingress***

When enabling the `sr steer` with keyword `encap`, I need to change two things: first, I need to
allow `VNET_SW_INTERFACE_TYPE_SUB` in addition to the already present
`VNET_SW_INTERFACE_TYPE_HARDWARE`, and then if the steer policy is `SR_STEER_L2`, I remove the bits
which initialize the feature arc on `device-input`, and instead, call
`set_int_l2_mode()` in `MODE_L2_XC` (cross connect), but then I sneakily clear the feature bitmap
bit for `L2INPUT_FEAT_XCONNECT`, and instead set my new `L2INPUT_FEAT_SRV6` bit. This means that
from now on, any L2 frames will get sent to node `sr-pl-rewrite-encaps-l2` instead of `l2-output`
which is what the L2XC would've done. Finally, I initialize the L2 feature bitmap next-nodes for the
encapsulation node in function `sr_policy_rewrite_init()`.

***3b. Connect to `l2-output` on egress***

I call `l2output_create_output_node_mapping()` on the (sub)-interface, so that traffic into it will
go to `l2-output`, where I can inspect the feature bitmap to see if I need to send it to
decapsulation or not. I also need to update `sr_localsid_next` to remove `interface-output` and
replace it with `l2-output` so that egress traffic visits `l2-output`. In
`end_decaps_srh_processing()`, I need to set the `l2_len` on the buffer, and change the next node to
be `SR_LOCALSID_NEXT_L2_OUTPUT` instead of `SR_LOCALSID_NEXT_INTERFACE_OUTPUT`, so that
sub-interface processing can occur (eg, VLAN Tag Rewriting, ACLs, SPAN, and so on).

***4. Fix a bug in `sr_policy_rewrite_encaps_l2`***

I kind of thought I would be done, and it did work, but I had about 75% packet loss and iperf
performance was 20Mbps or so, while on the bench I usually expect 350+ Mbps. I scratched my head a
little bit, but then found a bug in the quad-loop processing of `sr_policy_rewrite_encaps_l2()`.
Maybe you can spot it too?

```
if (vec_len (sp0->segments_lists) == 1)
  vnet_buffer (b0)->ip.adj_index[VLIB_TX] = sp0->segments_lists[0];
else {
  vnet_buffer (b0)->ip.flow_hash = flow_label0;
  vnet_buffer (b0)->ip.adj_index[VLIB_TX] = sp0->segments_lists[(vnet_buffer (b0)->ip.flow_hash & (vec_len (sp0->segments_lists) - 1))];
}

if (vec_len (sp1->segments_lists) == 1)
  vnet_buffer (b1)->ip.adj_index[VLIB_TX] = sp1->segments_lists[1];
else {
  vnet_buffer (b1)->ip.flow_hash = flow_label1;
  vnet_buffer (b1)->ip.adj_index[VLIB_TX] = sp1->segments_lists[(vnet_buffer (b1)->ip.flow_hash & (vec_len (sp1->segments_lists) - 1))];
}
```

Once I found this, I became quite certain that nobody uses L2 encapsulation in VPP, because if 4+
packets are present in the vector, then for the second through fourth packet (`b1`-`b3`), if the
segment list has length 1, the segment list index is incorrectly set to the garbage
`segments_lists[1]` rather than the first and only segment `segments_lists[0]`. Yikes! But it explains
perfectly why I had roughly 75% packetloss, lots of TCP retransmits, and terrible throughput. I fix
this bug and SRv6 encap starts to work flawlessly.

***5. Add tests***

I decide to add four tests: for {PHY, SUB} x {Encap, Decap}. On the encap side, I create a _SR
Policy_ with BSID `a3::9999:1` which encapsulates from source `a3::` and sends to _Segment List_
[`a4::`, `a5::`, `a6::c7`]. I then _steer_ L2 traffic from interface `pg0` using this _BSID_. I'll
generate a packet and want to receive it from `pg1` encapsulated with the correct SRH and
destination address. On the decap side, I create an SRv6 packet and send it into `pg1`, and want to
see it decapsulated and exit on interface `pg0`.

I try to get consistency by adding a `send_and_verify_pkts()` which takes a validator function as an
argument, either `compare_rx_tx_packet_T_Encaps_L2()` or `compare_rx_tx_packet_End_DX2()`. These
four tests succeed, look at me!

```
==============================================================================
SRv6 L2 Sub-Interface Steering Test Case [main thread only]
==============================================================================
Test SRv6 End.DX2 decapsulation to a hardware (phy) interface. 1.53 OK
Test SRv6 End.DX2 decapsulation to a sub-interface (VLAN). 1.00 OK
Test SRv6 L2 encapsulation on a hardware (phy) interface. 1.97 OK
Test SRv6 L2 encapsulation on a sub-interface (VLAN). 1.93 OK

==============================================================================
TEST RESULTS:
Scheduled tests: 4
Executed tests: 4
Passed tests: 4
==============================================================================
```

## Results

With this change, it becomes possible to `sr steer` into a sub-interface, and to have an `sr
localsid` that outputs to a sub-interface, which I can demonstrate like so:

```
vpp0-0# create sub-interfaces GigabitEthernet10/0/2 100
vpp0-0# set int l2 tag-rewrite GigabitEthernet10/0/2.100 pop 1
vpp0-0# set int state GigabitEthernet10/0/2.100 up
vpp0-0# sr policy add bsid 8298::2:2 next 2001:678:d78:20f::3:2 encap
vpp0-0# sr steer l2 GigabitEthernet10/0/2.100 via bsid 8298::2:2
vpp0-0# sr localsid address 2001:678:d78:20f::0:2 behavior end.dx2 GigabitEthernet10/0/2.100

vpp0-3# create sub-interfaces GigabitEthernet10/0/3 200
vpp0-3# set int l2 tag-rewrite GigabitEthernet10/0/3.200 pop 1
vpp0-3# set int state GigabitEthernet10/0/3.200 up
vpp0-3# sr policy add bsid 8298::2:2 next 2001:678:d78:20f::2 encap
vpp0-3# sr steer l2 GigabitEthernet10/0/3.200 via bsid 8298::2:2
vpp0-3# sr localsid address 2001:678:d78:20f::3:2 behavior end.dx2 GigabitEthernet10/0/3.200
```

One thing to remember is that when sub-interfaces are created and used in L2 mode, they have to get
the [[VLAN Gymnastics]({{< ref 2022-02-14-vpp-vlan-gym >}})] applied to them. In VPP terminology, it
means applying the _VTR_ or _VLAN Tag Rewrite_ feature, where the tag is removed upon ingress, and
re-added on egress. That way, the ethernet frame that gets put into the SRv6 L2VPN is untagged. It
allows me to have different encapsulation on both sides.

Now, for the _moment suprême_, on the two hosts, I can now create this sub-interface and use the tagged L2VPN also:
```
root@host0-0:~# ip link add link enp16s0f0 name enp16s0f0.100 type vlan id 100
root@host0-0:~# ip link set enp16s0f0.100 up mtu 1500
root@host0-0:~# ip addr add 192.0.2.128/31 dev enp16s0f0.100

root@host0-1:~# ip link add link enp16s0f3 name enp16s0f3.200 type vlan id 200
root@host0-1:~# ip link set enp16s0f3.200 up mtu 1500
root@host0-1:~# ip addr add 192.0.2.129/31 dev enp16s0f3.200
root@host0-1:~# ping 192.0.2.128
PING 192.0.2.128 (192.0.2.128) 56(84) bytes of data.
64 bytes from 192.0.2.128: icmp_seq=1 ttl=64 time=9.88 ms
64 bytes from 192.0.2.128: icmp_seq=2 ttl=64 time=4.88 ms
64 bytes from 192.0.2.128: icmp_seq=3 ttl=64 time=7.07 ms
^C
--- 192.0.2.128 ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2003ms
rtt min/avg/max/mdev = 4.880/7.273/9.876/2.044 ms

root@host0-1:~# ip nei | grep 200
192.0.2.128 dev enp16s0f3.200 lladdr 52:54:00:f0:10:00 REACHABLE
fe80::5054:ff:fef0:1000 dev enp16s0f3.200 lladdr 52:54:00:f0:10:00 DELAY
```

## What's Next

I've sent the change, which is about ~850 LOC, off for review. You can follow along on Gerrit
[[44899](https://gerrit.fd.io/r/c/vpp/+/44899)]. I'm happy to have fixed the quad-loop
encap bug, but it does show me that SRv6 (at least in L2 transport mode) is not super common for
VPP, perhaps not common in the industry? I am not convinced that I want to use this in production on
AS8298, but if I did, the basic functionality would be to add an IPv6 prefix to each of the loopback
devices, in order to attract traffic to the router, add an 'End' localsid on every router so that
they can participate in multi-hop SRv6, and add some static config to
[[vppcfg](https://git.ipng.ch/ipng/vppcfg)] to do the encap/decap for L2VPN. By the way, there's a
whole world of encap and decap behaviors, including L3VPN for IPv4, IPv6, GTP-U, and so on.

For me, I've still set my sights on eVPN VxLAN as a destination, because that will give me a
multi-point ethernet mesh akin to VPLS. However, there's a lot of ground to cover for me,
considering IPng uses Bird2 as a routing controlplane. Bird2 is starting to get eVPN support, but
there's a lot for me to learn. Stay tuned!
@@ -0,0 +1,352 @@
---
date: "2026-04-30T06:35:14Z"
title: VPP with Maglev Loadbalancing - Part 1
---

{{< image width="200px" float="right" src="/assets/vpp/fdio-color.svg" alt="VPP" >}}

# About this series

Ever since I first saw VPP - the Vector Packet Processor - I have been deeply impressed with its
performance and versatility. For those of us who have used Cisco IOS/XR devices, like the classic
_ASR_ (aggregation service router), VPP will look and feel quite familiar as many of the approaches
are shared between the two.

Load balancing is one of those topics that sounds deceptively simple until you think about it for a
while. In this article I take the VPP load balancer plugin out for a spin, fix a handful of API bugs,
and add two small new features that make running it in production a little bit easier.

## Introduction

IPng runs services that want to be reachable via as few public IP addresses as possible. Let's say I
want to run a DNS resolver or authoritative nameserver or even the IPng website, but I want these to
be highly available and perhaps scale to more traffic than one backend server could provide. What
are my options?

My first option is to just put a bunch of servers online, give them each an A/AAAA record in DNS,
say 7 webservers, and then point `ipng.ch` at those. It's clumsy: notably, if one server is down for
maintenance or failure, one seventh of the traffic may still want to reach it. Also, removing a
server will leave lots of lingering traffic on that webserver, as clients are sometimes slow to pick
up the DNS changes, even if my TTL is low.

Let me show you an example:

{{< image width="100%" src="/assets/vpp-maglev/qps-before.png" alt="NGINX qps per instance" >}}

There are two main problems with this graph:

1. ***Load imbalance***: there are seven webservers in this graph, but somehow only three of them
are getting traffic; the others are not. One (`nginx0.chrma0`) is much more heavily loaded than the
others: it's receiving 1.2kqps while the others are receiving ~40qps. This poses a risk: if the clients
that are somehow attracted to this instance grow, they may overwhelm this little webserver, even if
there are six others that could help out!

2. ***Drains take _forever_***: The green graph was a drain of `nginx0.nlams2` due to a pending
maintenance window as the datacenter is closing and the server needs to be physically moved. I put
in the DNS change at around 16:15 UTC and the traffic finally dropped at 21:45, five and a half hours
(!) later. And believe it or not, the TTL was 15 minutes on these records. Some clients just don't
get the hint ...

### Load balancing 101

A naive load balancing solution is to simply round-robin: send each new packet to the next backend in the list.
That works reasonably well for stateless UDP traffic like DNS, although even with DNS there is a gotcha: some
DNS queries need TCP, for example those that are too big to fit in a single UDP packet, and they
will not be tolerant of naive packet round robin. For TCP this naive load balancing solution
quickly falls apart, because every packet in a connection needs to reach the *same* backend. Sending
a SYN to backend A and the subsequent ACK to backend B will not establish a TCP connection.

The classical answer is to keep per-session state on the load balancer: a table that maps a 5-tuple of
{source IP, destination IP, source port, destination port, protocol} to a chosen backend. That
works, but it introduces a stateful bottleneck. At line rate on a load balancer handling millions of
flows and packets/sec, maintaining and synchronising that table across multiple CPU threads is expensive. It also means that if the
load balancer restarts, every existing TCP session breaks.

What if there were some form of *consistent hashing*: given the 5-tuple of a packet, the load balancer
would always select the same backend deterministically, without storing any per-session state? If backends come and go, only
the flows that were assigned to the changed backend are affected — all other flows keep working.
Google solved this problem at scale and published their solution. They call it Maglev.

## Introducing Maglev

{{< image width="12em" float="right" src="/assets/vpp-maglev/maglev.png" alt="Icon of a maglev train" >}}

Google's Maglev load balancer has been running in production since 2008 and I happen to know several
of its authors - as a personal aside I was sad to learn that Cody Smith, with whom I shared an office
and a team for many years, passed away earlier this year. Rest in peace, Cody!

The Google team published their design at NSDI 2016 in the paper
[[Maglev: A Fast and Reliable Software Network Load Balancer](https://research.google/pubs/maglev-a-fast-and-reliable-software-network-load-balancer/)].
It is worth reading in full — the paper is well written and covers not only the hashing algorithm
but also the wider architecture of how Google handles frontend traffic at scale.

The key insight is that Maglev uses a pre-computed lookup table of size M (some large prime number,
65537 in the paper) filled with backend indices. To handle a packet, the forwarder computes a
hash over the 5-tuple modulo M, looks up the table, and forwards to whatever backend is stored
there. No per-session state is needed, which avoids session matching and large amounts of RAM, and
the flow lookup can be done very efficiently.

### The Maglev new flow table

The interesting part is _how_ that lookup table is filled. I learn that a simple approach might be
to divide M slots evenly among N backends. That would work, but removing a backend would shift every
remaining backend's range, disrupting all flows and resetting TCP connections all over the place.
Maglev uses a smarter fill algorithm:

1. For each backend _i_, derive two independent hash values from its identity (typically its IP
address): an offset and a skip value. These define a *preference list* — a permutation of all M
slots that this backend would like to occupy, in preference order.
1. Iterate over all backends round-robin. Each backend claims its next preferred slot if it is
still free. Continue until every slot is filled.

The result is a table where each backend occupies approximately M/N slots, the distribution is
uniform, and most importantly, adding or removing one backend only displaces approximately 1/N of
the flows. All other flows keep hashing to the same backend. Slick!
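
The fill procedure is compact enough to sketch in full. Here is a small Go program implementing it as
I read it from the paper; the names (`Backend`, `populate`) and the tiny table size are illustrative
and not taken from the VPP plugin.

```go
package main

import (
	"fmt"
	"hash/fnv"
)

const M = 13 // table size, a prime; tiny here for readability (the paper uses 65537)

// Backend carries the offset/skip pair that defines its preference list:
// permutation[j] = (offset + j*skip) mod M.
type Backend struct {
	Name         string
	offset, skip uint64
}

func hash64(s string, seed byte) uint64 {
	h := fnv.New64a()
	h.Write([]byte{seed})
	h.Write([]byte(s))
	return h.Sum64()
}

func NewBackend(name string) *Backend {
	return &Backend{
		Name:   name,
		offset: hash64(name, 0) % M,
		skip:   hash64(name, 1)%(M-1) + 1, // skip in [1, M-1]; M prime => full permutation
	}
}

// populate fills the lookup table: round-robin over the backends, each one
// claiming its next preferred slot that is still free, until the table is full.
func populate(backends []*Backend) []int {
	table := make([]int, M)
	for i := range table {
		table[i] = -1
	}
	next := make([]uint64, len(backends)) // per-backend position in its preference list
	for filled := 0; filled < M; {
		for i, b := range backends {
			for { // walk this backend's preference list until a free slot is found
				slot := (b.offset + next[i]*b.skip) % M
				next[i]++
				if table[slot] == -1 {
					table[slot] = i
					filled++
					break
				}
			}
			if filled == M {
				break
			}
		}
	}
	return table
}

func main() {
	backends := []*Backend{NewBackend("10.0.0.1"), NewBackend("10.0.0.2"), NewBackend("10.0.0.3")}
	fmt.Println(populate(backends)) // each backend ends up with roughly M/3 of the slots
}
```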

### The Maglev existing flow hash table

Consistent hashing handles the common case well, but there is one subtlety: the hashing
guarantees that the *same 5-tuple always maps to the same backend*, but only as long as the set
of backends does not change. If a backend is added mid-stream, a fraction of existing TCP
connections will start hashing to a different backend.

To protect long-lived connections, Maglev keeps a small per-CPU *flow hash table*: an LRU cache of
recently seen 5-tuple to backend mappings. For every packet:

1. Look up in the Maglev flow hash table. On a hit, forward to the cached backend (even if the Maglev
table would now say something different).
1. On a miss, look up the Maglev new-flow table, select the backend, and insert the mapping into the
flow hash table.

The flow hash table does not need to be exhaustive — it only needs to cover *active* connections. An
LRU eviction policy handles the rest. This means the load balancer is *mostly* stateless, as the
Maglev table is deterministic and identical on every CPU, with just enough per-connection state
to protect existing TCP sessions from transient backend changes.

### VPP LB: Plugin anatomy

The VPP load balancer plugin lives in `src/plugins/lb/`. Its core data structures map directly to
the Maglev design:

* ***VIP*** (Virtual IP): a prefix plus an optional {protocol, port} pair. This is the
public-facing address that clients connect to. A VIP can be protocol-agnostic and forward all
traffic to its backends, or it can be port-specific and forward only, for example, TCP/443 to its
backends.
* ***AS*** (Application Server): a backend endpoint associated with a VIP. The plugin
maintains a list of active ASes per VIP.
* ***New flow table***: the Maglev lookup table, computed from the active AS list whenever an
AS is added or removed. Size is configurable, defaulting to 1024 entries. It is filled by the
clever algorithm described above.
* ***Flow hash table***: per-worker LRU hash table of recent {5-tuple → AS} mappings. This is the
connection affinity cache described above.
* ***Encapsulation***: packets are forwarded to the AS by encapsulating them in either GRE
(GRE4 or GRE6), or via L3DSR (direct server return using DSCP remarking). The AS decapsulates and
responds directly to the client, bypassing the load balancer on the return path.

When a new flow arrives, VPP computes a hash over the 5-tuple modulo the length of its
`new_flow_table`, then looks up the backend that will serve this client, stores it in the
per-worker flow hash table, and encapsulates the packet towards the AS. Subsequent packets for the
same 5-tuple hit the flow hash table directly, skipping the Maglev lookup entirely.
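
As a sketch of that per-packet decision (illustrative types only: a plain map stands in for the
per-worker LRU, and the real dataplane is of course a VPP graph node in C, not Go):

```go
package main

import "fmt"

// FiveTuple identifies a flow; a plain struct so it can key a map.
type FiveTuple struct {
	SrcIP, DstIP     string
	SrcPort, DstPort uint16
	Proto            uint8
}

// Balancer holds the Maglev new-flow table and a stand-in for the per-worker
// LRU flow hash table.
type Balancer struct {
	newFlowTable []int             // length is a power of two (VPP defaults to 1024)
	flowCache    map[FiveTuple]int // {5-tuple -> AS index}
}

// pick returns the AS index for one packet.
func (lb *Balancer) pick(ft FiveTuple, hash uint32) int {
	// 1. Existing flow: stick with the cached backend, even if the Maglev
	//    table has changed since the flow started.
	if as, ok := lb.flowCache[ft]; ok {
		return as
	}
	// 2. New flow: consult the Maglev table, then remember the choice.
	as := lb.newFlowTable[int(hash)&(len(lb.newFlowTable)-1)]
	lb.flowCache[ft] = as
	return as
}

func main() {
	lb := &Balancer{newFlowTable: []int{0, 1, 2, 0, 1, 2, 0, 1}, flowCache: map[FiveTuple]int{}}
	ft := FiveTuple{SrcIP: "192.0.2.50", DstIP: "192.0.2.1", SrcPort: 51515, DstPort: 443, Proto: 6}
	fmt.Println(lb.pick(ft, 0xdeadbeef), lb.pick(ft, 0xdeadbeef)) // same backend both times
}
```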

A garbage collection timer periodically walks the flow table and removes entries for backends that
have become inactive, preventing stale flows from reaching a long-gone AS. Operators can also remove
these ASes and flush existing connections to them.

#### Observations

After reading the LB code in VPP, I am ready to make a few observations.

***1: Lameduck*** I have the choice of 'remove AS from VIP': by removing it from the Maglev new-flow
table it will not get new flows assigned, but if there are long-lived clients, the server will keep
connections open potentially indefinitely. A good example is a websocket that streams data between
a client and the webserver: it never disconnects!

My other choice is to 'Remove and flush AS from VIP', which will also remove it from being eligible
for new flows, but forcibly remove all existing flows from the flow hash table at the same time.
Yikes.

I want a middle ground, operationally:
1. Remove AS from VIP for _new connections_ while keeping existing ones for a grace period. This
is commonly referred to as _lameduck_ mode.
1. Remove AS from VIP for _all connections_, which will reset any lingering connections and move
them to another backend where they reconnect and continue on their journey.

***2: Slow undrain***: From my own experience, adding a new AS often needs to be done carefully, for
two reasons. First, sloshing traffic around can overwhelm a new / freshly started server which does
lazy initialization (for example, a Java binary). Second, a new server may have a different
configuration on purpose, for example a different version of the server binary, or different
parameters like caching flags and what-not. It may be good to ease in traffic and inspect it for a
little while before bringing full load onto the server. This is commonly referred to as a _canary_
backend. I'll come back to this later.

### VPP LB: Bugs

While playing around with the plugin's binary API, I ran into a collection of bugs that made the
plugin largely unusable via the API (as opposed to the CLI). I fixed those in Gerrit
[[45428](https://gerrit.fd.io/r/c/vpp/+/45428)].

* ***IPv4 VIP prefixlen offset bug***: `lb_add_del_vip()` was computing the prefix length
incorrectly for IPv4 addresses due to an off-by-one in the address family handling, producing
VIPs that silently matched no traffic.

* ***Wrong encap type on VIP create***: Both `lb_add_del_vip()` and `lb_add_del_vip_v2()`
were passing the encapsulation type through an incorrect enum mapping, so a VIP created with GRE4
encap via the API would actually end up configured with a different encap type internally.

* ***lb_vip_dump() returning wrong fields***: The dump handler was returning a stale encap
type and an incorrect protocol value, making it impossible to verify what was actually configured
via the API.

* ***lb_as_dump() port filter broken***: The AS dump call accepts an optional VIP filter. The
port comparison was being done against an uninitialized variable, causing the filter to miss
entries or match wrong ones depending on stack contents.

* ***Missing lb_conf_get()***: There was no API call to retrieve the global LB configuration
(flow table size, timeout values). I added `lb_conf_get()` so an operator or controlplane can verify
the running configuration without resorting to CLI parsing.

* ***'show lb vips' unformatting error***: The CLI handler dereferenced a pointer that
is only valid in verbose mode, causing unexpected output (and a possible crash!) on a plain `show lb
vips`.

* ***GC only triggered by CLI input***: The garbage collector for the flow table was only
invoked when the operator typed a CLI command. On a production load balancer, stale flow entries
would accumulate indefinitely. So I added a periodic GC timer that automatically cleans up the flow
hash table.

While discussing on the `vpp-dev` mailing list, my buddy Jerome Tollet independently found two of
these bugs (the encap type mismatch and the dump port filter) and reported them during review. Both
are addressed in the latest patchset.

### VPP LB: New Feature - Weights

My attempt to address the two observations above comes from an insight that they are actually the
same class of problem: I want to be able to set a variable amount of traffic anywhere from 100% all
the way down to 0% of load that a given backend is capable of handling, and I want to be able to
flush (remove existing flows from the flow hash table) independently of the new-flow assignment.
This is commonly referred to as _weights_ in a load balancer, and in Gerrit
[[45487](https://gerrit.fd.io/r/c/vpp/+/45487)] I add per-AS weights to the Maglev new flow table,
and decouple 'flush' from 'set weight' semantically.

The motivation comes from the two operational scenarios I kept running into while testing the plugin:

**1. Draining a backend without disrupting existing sessions.** When a backend needs to go down for
maintenance, the only option was `lb as del flush`, which both removes the AS *and* flushes the
flow table. Flushing the flow table is disruptive: all existing TCP sessions that were pinned to
any backend suddenly need to re-select, causing a brief spike of misdirected packets. What I
actually want is to stop sending *new* flows to the AS while letting existing sessions drain
naturally.

**2. Introducing a new backend gradually.** When adding a new AS to a busy VIP, the Maglev algorithm
immediately assigns it ~1/N of the new-flow table slots. On a VIP handling tens of thousands of
new connections per second, that is a lot of traffic hitting a backend that may not yet be fully
warmed up (think JVM JIT, filled caches, established database connections). It would be useful to
introduce the new AS slowly and ramp it up over time.

My solution for both is to allow each AS to carry a weight in the range 0–100, which controls what
fraction of the new flow table slots it is allowed to occupy:

* ***weight 100*** (default): the AS gets its full ~1/N share of slots. This is the existing
behavior, and remains the default.
* ***weight 1–99***: the AS gets a proportionally smaller share. Useful for gradual introduction
as well as gradual removal.
* ***weight 0***: the AS gets no slots in the new flow table — no new flows are sent to it. The
flow table entries for existing sessions remain intact, so those connections keep working until
they naturally expire.

The Maglev fill algorithm is made weight-aware by scaling each AS's preference list length
proportionally to its weight. The sort order is deterministic (sorted by `(replica, address)`)
so the resulting table is identical regardless of the order ASes were added, which also has a bonus
side effect of making anycast and ECMP VIPs work correctly.
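
One way to read 'scaling the preference list proportionally to the weight' is that each AS gets a
slot budget of roughly M × weight / Σweights, and an AS with weight 0 gets no slots at all. A small
sketch of just that budget arithmetic - my reading of the behavior, not necessarily the exact VPP
code:

```go
package main

import "fmt"

// slotBudgets divides the m slots of the new flow table over the ASes in
// proportion to their weights; weight 0 yields zero slots. Any remainder from
// the integer division would get handed out during the actual table fill.
func slotBudgets(weights []int, m int) []int {
	total := 0
	for _, w := range weights {
		total += w
	}
	budgets := make([]int, len(weights))
	if total == 0 {
		return budgets // everything drained: nobody gets slots
	}
	for i, w := range weights {
		budgets[i] = m * w / total
	}
	return budgets
}

func main() {
	// Three ASes: one at full weight, one being eased in, one drained.
	fmt.Println(slotBudgets([]int{100, 10, 0}, 1024)) // [930 93 0]
}
```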

Because VPP developers do not change API signatures once they are published, I added a few new API
calls instead:

* `lb_as_add_del_v2()` — creates or deletes an AS with an explicit weight, and optionally
flushes the flow table for that AS on deletion.
* `lb_as_dump_v2()` — returns the weight and the number of new-flow-table buckets currently
assigned to each AS, which is useful for verifying the distribution.
* `lb_as_set_weight()` — changes the weight of an existing AS in place, optionally flushing
the flow table, without needing to delete and recreate the AS.

From the CLI, the weight is set with:

```
vpp# lb as 192.0.2.0/32 10.0.0.1 weight 0
vpp# lb as 192.0.2.0/32 10.0.0.1 weight 1
vpp# lb as 192.0.2.0/32 10.0.0.1 weight 10
vpp# lb as 192.0.2.0/32 10.0.0.1 weight 100
vpp# lb as 192.0.2.0/32 10.0.0.1 weight 0
vpp# lb as 192.0.2.0/32 10.0.0.1 weight 0 flush
```

In the sequence above, backend AS `10.0.0.1` starts off fully drained, then gets a token amount
of traffic when I set it to weight 1, then 10, and finally 100. When the backend needs to be
removed, I can set `weight 0` which will put it in _lameduck_ mode but keep existing flows alive.
A few minutes later, I can set it to `weight 0 flush` which will remove the remaining existing
flows. The backend can then be safely removed, without having to wait 5+ hours like I did with the
uncontrolled DNS 'drain'.

### VPP LB: New Feature - Punt Unknown

I'm still on the fence about this feature, but since I wrote it ... Gerrit
[[45431](https://gerrit.fd.io/r/c/vpp/+/45431)] adds a `punt` flag to port-based VIPs.

By default, when a VIP is configured with a specific protocol and port (e.g. TCP/443), any packet
that arrives at that VIP's address but does *not* match the configured {protocol, port} pair is
sent by VPP to `error-drop`. This is the correct behavior for most cases: if I am load balancing
TCP/443, I do not want stray UDP packets forwarded anywhere.

The problem is that this also drops ICMP. If an operator runs `traceroute` towards the VIP, or
sends an ICMP echo, or a client receives an ICMP unreachable, all of that is silently discarded.
This makes the VIP opaque from the network's perspective and can complicate debugging.

When creating a port-based VIP, I decide to add a `punt` flag, so any traffic that does not match
the configured protocol/port pairs on the VIP will now be punted to the local IP stack
(`ip4-local` or `ip6-local`) instead of dropped. To make this work, I ask VPP to insert the VIP's
address into the FIB at a higher priority than device routes, so the punt path is actually
reachable. This allows the load balancer to handle TCP/443 (or whatever protocol/port combinations
are configured) while the local stack takes care of ICMP, traceroute, and anything else that arrives
at that address and is not part of the Maglev configuration.

The `punt` flag is only permitted on port-based VIPs — on a protocol-agnostic VIP there is
nothing left to punt, since all traffic is already matched and forwarded to application servers.

Enabling this from the CLI is straightforward, at creation time:

```
vpp# create loopback interface instance 0
vpp# lcp create loop0 host-if maglev0
vpp# set int state loop0 up
vpp# set int ip address loop0 192.0.2.0/32
vpp# lb vip 192.0.2.0/32 protocol tcp port 443 encap gre4 punt
```

In this configuration snippet, I first create a simple loopback device with a given IPv4 address,
and plumb it through to Linux using the [[Linux CP]({{< ref 2021-08-12-vpp-1 >}})] plugin. This makes
it reachable: I can ping it and traceroute to it just like any other Linux Interface Pair _LIP_.
Then, I _steal_ some traffic from it, by creating an LB VIP on this address. Without this feature,
the VIP would become unreachable, as the LB plugin would take all traffic destined to the IPv4
address. But with the `punt` keyword, any traffic not matching the LB VIP(s) on this address will
be sent onwards to the IP stack and end up in Linux. For those of us who like pinging their VIPs,
the `punt` feature flag on VIPs will come in handy.

For the same reason as with the other feature I wrote, I need to add new API calls rather than
changing existing ones, so here I go:

* `lb_add_del_vip_v3()` — adds an `is_punt` flag to the VIP creation call.
* `lb_vip_dump_v2()` — returns `is_punt` in the VIP details, so an operator or controlplane can
verify the configuration.

## What's Next

I am going to use Maglev at IPng Networks to load balance our services like SMTP, IMAP, HTTP, DNS and
what-not. But before I can do that, I'm going to want to write some sort of controlplane that can
manipulate the VIPs, AS weights, and do things like health checking. I'm inspired by
[[HAProxy](https://haproxy.org/)] which I used to use way back when. I find its health checking
algorithm particularly clever, so I will give that codebase a good read and, with what I learn,
create a health checking VPP Maglev controlplane which will give me much better insight into what
traffic goes where.

Stay tuned!
@@ -0,0 +1,650 @@
---
date: "2026-05-08T06:35:14Z"
title: VPP with Maglev Loadbalancing - Part 2
---

{{< image width="200px" float="right" src="/assets/vpp/fdio-color.svg" alt="VPP" >}}

# About this series

Ever since I first saw VPP - the Vector Packet Processor - I have been deeply impressed with its
performance and versatility. For those of us who have used Cisco IOS/XR devices, like the classic
_ASR_ (aggregation service router), VPP will look and feel quite familiar as many of the approaches
are shared between the two.

In a [[previous article]({{< ref "2026-04-30-vpp-maglev" >}})], I looked into the Maglev algorithm and
how it is implemented in VPP. I fixed a couple of bugs in the API and added features to set weights
for application server backends. In this article, I am going to describe an approach to a control
plane for VPP's Maglev plugin.

## Introduction

For the VPP Maglev plugin to be truly useful, some automation has to govern its use of backends:
which ones get how much traffic, which ones are unhealthy and need to be drained, and so on.
Ideally, this control loop is fully automatic: when backends go missing either because they are
down themselves or because the datacenter they are in decides to take the day off, it would be nice
if the load balancer notices this and avoids sending traffic there. However, the VPP Maglev plugin
does not offer any of these smarts. The plugin is a pure dataplane component that can sling packets
to backends at very high rates, and all the rest is left as an exercise for the reader.

## VPP Maglev: Controlplane

The core problem is that VPP's `lb` plugin is pure dataplane. It holds a table of VIPs, each with a
set of application servers and their weights using the feature I added. It then hashes new flows
deterministically onto those servers. That is cool, but it is all it does. If a backend stops
responding, VPP does not know and does not care - it will keep sending traffic to that address until
someone or something tells it otherwise. The result is a black hole: clients trying to establish new
connections time out while waiting for a backend that will never respond.

Before I decided to write `vpp-maglev`, the fix for missing/down backends was manual: watch your
monitoring dashboards, notice when a backend is down, SSH into the machine running VPP, and use
`vppctl lb as ... del flush` to remove the dead backend. That works, but it obviously requires a
human in the loop and introduces a window of failure between the backend going down and the operator
reacting. For a production load balancer that is supposed to be invisible to users, this is not good
enough.

What IPng needs, at a high level, is a controlplane that can:

1. Continuously probe each backend and maintain an accurate view of its health.
1. Translate health state changes into VPP API calls immediately, without human intervention.
1. Handle edge cases gracefully: what happens when `maglevd` itself restarts? When VPP restarts?
When a backend is briefly playing _Flappy Bird_?
1. Expose all of this state through a uniform API so that CLIs, dashboards, and monitoring scripts
can all read from (and write to) the same source of truth.

To address my needs, I decided to write **vpp-maglev**, which ships as four binaries: `maglevd` (the
controlplane daemon), `maglevc` (a CLI for it), `maglevd-frontend` (a web dashboard for it), and
`maglevt` (an out-of-band test utility). The rest of this article goes through each one in detail.

## Design Principles

Before blindly writing code, I wrote down a few of the constraints I wanted to hold true. Wait, a
design you say? Well, yes! And this design turned out to drive most of the architectural decisions:

**One source of truth.** Every component - CLI, web dashboard, alerting scripts - reads `maglevd`
through one typed gRPC interface. There is no secondary control plane. The CLI and the web dashboard
show exactly the same state as each other because they both ask the same controlplane daemon.

**Restart neutrality.** Restarting `maglevd` while VPP is serving live traffic must not cause user
interruption or traffic blackholing. A naive implementation would initialize an empty LB state upon
startup, because at that point the vpp-maglev daemon sees every backend in an initial `unknown` state. I
need to make sure I design for things like controlplane upgrades from the get-go, so they are safe.

**Diff-based reconciliation.** I want to create a VPP sync that computes a desired state from the
config and current observed health, then diffs it against what VPP already has, issuing only the
minimum set of API calls to converge. This is not too dissimilar from the approach I took in
[[vppcfg]({{< ref 2022-03-27-vppcfg-1 >}})], in that running the sync multiple times needs to
produce the same outcome as running it once.
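
A minimal sketch of what such a diff looks like, with maps standing in for the desired and actual
{AS → weight} sets on one VIP; the real sync issues VPP API calls instead of returning strings, and
the names here are illustrative:

```go
package main

import "fmt"

// reconcile diffs desired against actual {AS address -> weight} and returns
// the minimal set of operations; running it twice in a row yields no ops.
func reconcile(desired, actual map[string]int) []string {
	var ops []string
	for as, w := range desired {
		cur, ok := actual[as]
		switch {
		case !ok:
			ops = append(ops, fmt.Sprintf("add %s weight %d", as, w))
		case cur != w:
			ops = append(ops, fmt.Sprintf("set %s weight %d", as, w))
		}
	}
	for as := range actual {
		if _, ok := desired[as]; !ok {
			ops = append(ops, fmt.Sprintf("del %s", as))
		}
	}
	return ops
}

func main() {
	desired := map[string]int{"192.0.2.10": 100, "192.0.2.11": 10}
	actual := map[string]int{"192.0.2.10": 100, "192.0.2.12": 100}
	// Expect one 'add 192.0.2.11' and one 'del 192.0.2.12'; map order may vary.
	fmt.Println(reconcile(desired, actual))
}
```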

**Structured observability from the start.** Every state change needs to be accounted for in a
structured JSON log, a Prometheus counter increment, and a streaming gRPC event. All three, every
time. I find it very frustrating to debug production systems that have ad hoc log messages and no
metrics, and if there's one thing a lifetime career as an SRE has taught me, it is to set the
observability bar high early.
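
As a sketch of the 'all three, every time' shape, using `log/slog` and the Prometheus Go client -
the metric name, `Event` struct, and helper here are illustrative, not the actual maglevd types:

```go
package main

import (
	"log/slog"

	"github.com/prometheus/client_golang/prometheus"
)

// Event is a stand-in for the gRPC streaming event described later in the article.
type Event struct{ Backend, From, To string }

var transitions = prometheus.NewCounterVec(
	prometheus.CounterOpts{Name: "maglev_backend_transitions_total", Help: "Backend state transitions."},
	[]string{"backend", "to"},
)

// recordTransition emits all three signals for one backend state change:
// a structured log line, a counter increment, and a streaming event.
func recordTransition(events chan<- Event, backend, from, to string) {
	slog.Info("backend-transition", "backend", backend, "from", from, "to", to)
	transitions.WithLabelValues(backend, to).Inc()
	events <- Event{Backend: backend, From: from, To: to}
}

func main() {
	prometheus.MustRegister(transitions)
	events := make(chan Event, 1)
	recordTransition(events, "nginx0-ams", "down", "up")
	<-events
}
```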

## Health Checker: `maglevd`

`maglevd` is the long-running daemon at the center of everything. It needs some initial
configuration to be present on the machine, so that cold restarts do not have to
phone home to get a running config. My first decision is to let it read a YAML configuration
file that describes three named collections: `healthchecks`, `backends` that reference the health
checks, and `frontends` that reference the backends.

The configuration structure maps directly onto the internal runtime model, sort of like this:

```yaml
maglev:
  healthchecks:
    http-check:
      type: http
      port: 80
      params:
        path: /.well-known/ipng/healthz
        response-code: "200-204"
      interval: 5s

  backends:
    nginx0-ams:
      address: 192.0.2.10
      healthcheck: http-check
    nginx1-ams:
      address: 192.0.2.11
      healthcheck: http-check
    nginx0-fra:
      address: 192.0.2.12
      healthcheck: http-check

  frontends:
    http-vip:
      address: 192.0.2.1
      protocol: tcp
      port: 80
      pools:
        - name: primary
          backends:
            nginx0-ams: { weight: 100 }
            nginx1-ams: { weight: 10 }
        - name: fallback
          backends:
            nginx0-fra: {}
```

A **healthcheck** defines how to probe - the protocol, port, success criteria, timing parameters,
and so on. A **backend** is a named IP address bound to a healthcheck. A **frontend** is a VIP
address with one or more named **pools**, where each pool is an ordered list of `(backend, weight)`
tuples. At runtime, each backend gets exactly one probe loop (a goroutine),
regardless of how many frontends reference it, which greatly cuts down on probe traffic.

Probes run on the configured schedule and their results flow through a state machine. State
changes emit events that the reconciler picks up and translates into VPP API calls and gRPC
streaming events for subscribed clients. The frontend's aggregate state, be it `up`, `down`, or
`unknown`, is derived from the effective weights of its backends and needs to be updated on every
backend transition.

The Golang `slog` (structured log) package emits machine-consumable JSON directly:

```json
{"level":"INFO","msg":"backend-transition","backend":"nginx0-ams","from":"down","to":"up","code":"L7OK","detail":""}
{"level":"INFO","msg":"frontend-transition","frontend":"http-vip","from":"down","to":"up"}
```

I don't really have to think about all of this state checking stuff from scratch. There are a few
really good loadbalancers out there already! One of them is HAProxy, which I used a very long time
ago. It features a really good health checking approach, the principles of which I am grateful to
borrow for my own project.

### HAProxy: Learning from its Health Counter

The state machine is driven by a single integer borrowed from HAProxy's health model: given a `rise`
threshold and a `fall` threshold, define a counter `health` in the range `[0, rise + fall - 1]`. The
backend is considered `up` when `health >= rise` and `down` when `health < rise`.

On each probe, a pass increments the counter (ceiling at maximum); a failure decrements it (floor
at zero). This gives **hysteresis**: a fully healthy backend (counter at its ceiling) needs `fall`
consecutive failures before it transitions to down, and a fully-down backend needs `rise`
consecutive passes to come back up. A flapping backend that alternates between passing and failing
stays in the degraded zone without bouncing between states - which is exactly what I want to
avoid a storm of VPP API calls from a noisy backend.

In _pseudocode_, here's what that simple yet elegant approach looks like:

```go
// HealthCounter implements HAProxy-style rise/fall hysteresis.
type HealthCounter struct {
	Health int
	Rise   int
	Fall   int
}

// Max is the ceiling of the counter: rise + fall - 1.
func (h *HealthCounter) Max() int { return h.Rise + h.Fall - 1 }

func (h *HealthCounter) IsUp() bool { return h.Health >= h.Rise }

// RecordPass returns true if this probe caused a down -> up transition.
func (h *HealthCounter) RecordPass() bool {
	wasUp := h.IsUp()
	if h.Health < h.Max() {
		h.Health++
	}
	return !wasUp && h.IsUp()
}

// RecordFail returns true if this probe caused an up -> down transition.
func (h *HealthCounter) RecordFail() bool {
	wasDown := !h.IsUp()
	if h.Health > 0 {
		h.Health--
	}
	return !wasDown && !h.IsUp()
}
```

Taking an example of `rise=2, fall=3`, the health counter will span `[0, 4]`. The state boundary
sits between the 'down' side (health of 0 or 1), and the 'up' side (health of 2, 3 or 4). A fully
healthy backend sitting at health counter 4 will need three consecutive failures to go
down: 4->3->2->1.
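
A quick usage example with exactly these thresholds, assuming the `HealthCounter` type from the
block above lives in the same package:

```go
package main

import "fmt"

func main() {
	hc := &HealthCounter{Health: 4, Rise: 2, Fall: 3} // fully healthy backend
	for i := 1; i <= 3; i++ {
		transition := hc.RecordFail()
		fmt.Printf("fail %d: health=%d up=%v transition=%v\n", i, hc.Health, hc.IsUp(), transition)
	}
	// fail 1: health=3 up=true transition=false
	// fail 2: health=2 up=true transition=false
	// fail 3: health=1 up=false transition=true
}
```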

When a backend enters the `unknown` state, for example when the `vpp-maglev` daemon just started, or
after a backend was briefly paused or disabled, I try to be a bit more clever than HAProxy (famous
last words, I'm sure), by pre-setting the health counter to `rise - 1`. This means the very first
probe resolves the state immediately: one pass produces an _unknown_ transition to _up_, and one
fail produces an _unknown_ transition to _down_. The shortcut means any probe failure while the
state is `unknown` immediately marks the backend down. I argue that a backend that cannot pass even its
very first probe should not receive traffic, and we should not wait for its health to fall all the
way down to 0.

**Probe types.** `maglevd` starts off its life supporting four probe types:

- **`icmp`** - sends an ICMP echo request and waits for a reply, for which I do not need to run the
daemon with root privileges; instead I can assign it `CAP_NET_RAW` for this purpose. This healthcheck
type is useful for checking basic reachability without opening a TCP connection. Borrowing again
from HAProxy, this can result in probe codes: `L4OK` on reply, `L4TOUT` on timeout, `L4CON` on send
error.
- **`tcp`** - opens a TCP connection to the configured port and closes it cleanly. This healthcheck
can optionally wrap the connection in TLS with parameter `ssl: true`, with optional server name and
`insecure-skip-verify` to allow for self-signed certificates. The resulting probe codes are `L4OK`
on connect, `L4CON` on refused, `L4TOUT` on timeout, `L6OK`/`L6CON`/`L6TOUT` for TLS.
- **`http`** - opens a TCP connection, sends an HTTP/1.1 `GET` request to the configured path with
an optional `Host` header, and validates the response code against a configured range (e.g.
`"200-204"`). This healthcheck can optionally validate the body against a regular expression, making
it similar to how Nagios does its checks (a small sketch follows after this list). The probe return
codes are: `L7OK` on success, `L7STS` on unexpected status code, `L7RSP` on bad response, and
`L7TOUT` on timeout.
- **`https`** - This is a special case of the `http` healthcheck type, but over TLS. It supports
the use of an SNI `server-name` override and `insecure-skip-verify` as well, for backends with
self-signed certificates.
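
To illustrate what the `http` probe boils down to, here is a sketch using only the Go standard
library; the function name, parameters, and mapping to probe codes are modeled on the description
above, not lifted from the maglevd source:

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

// httpProbe does one HTTP/1.1 GET against a backend and classifies the result
// with HAProxy-style codes: L7OK, L7STS (unexpected status), L7RSP (bad
// request/response), L7TOUT (connect/read error or timeout).
func httpProbe(url, host string, minCode, maxCode int, timeout time.Duration) string {
	client := &http.Client{Timeout: timeout}
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return "L7RSP"
	}
	if host != "" {
		req.Host = host // optional Host: header override
	}
	resp, err := client.Do(req)
	if err != nil {
		return "L7TOUT"
	}
	defer resp.Body.Close()
	if resp.StatusCode < minCode || resp.StatusCode > maxCode {
		return "L7STS"
	}
	return "L7OK"
}

func main() {
	fmt.Println(httpProbe("http://192.0.2.10/.well-known/ipng/healthz", "", 200, 204, 3*time.Second))
}
```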

One other thing I noticed while reading the HAProxy docs is that its probe timing is not fixed,
but instead depends on the counter state. A fully healthy backend (counter at maximum) is probed at
the configured `interval`. A degraded or unknown backend is probed at the faster `fast-interval`, to
be able to mark it either up or down more quickly. And, a fully down backend is probed at the slower
`down-interval`. The result is that a recovering backend is re-evaluated quickly while one
that has been offline for a long time generates less probe traffic.

I add one additional detail (which I've learned the hard way when operating very large loadbalancer
pools with thousands of backends), namely jitter: every computed interval (fast, down, or normal)
is scaled by a uniformly random factor of 10% so that all probe goroutines do not phase-lock to the
same wall-clock tick after a restart, and do not hit the backend at exactly the same time either.
Good for `vpp-maglev` and good for the backends. We can all win, sometimes :)
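
A sketch of that jitter, assuming the 10% is two-sided (the prose above doesn't pin that down):

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// jittered scales a probe interval by a uniformly random factor of +/-10%,
// so that probe goroutines do not phase-lock after a restart.
func jittered(interval time.Duration) time.Duration {
	factor := 0.9 + 0.2*rand.Float64()
	return time.Duration(float64(interval) * factor)
}

func main() {
	fmt.Println(jittered(5 * time.Second)) // somewhere between 4.5s and 5.5s
}
```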
|
||||||
|
|
||||||
|
**Pool failover.** I've found it can be useful, mostly in smaller deployments like IPng's mail and
webserver cluster, to have primary traffic stay local to the Maglev loadbalancer (e.g. a VPP Maglev
instance in Amsterdam will select nginx backends in Amsterdam, not Paris or Zurich), but if those
are all down, to fall back to further-away backends in a different city.

This led me to give a frontend one or more pools, which act as priority tiers. The idea is that the
active pool is the first one that contains at least one backend in `up` state. Backends in inactive
pools have their weight effectively forced to zero and will therefore receive no traffic. If all
backends in the primary pool go down, the weight of the next-best pool needs to be re-evaluated,
and when the backends in the primary pool recover, demotion of the standby pool can be graceful
thanks to the `lb as ... weight` feature I added to VPP: existing flows to standby backends are
left to drain naturally. Only an operator `disable` call will trigger an immediate flow-table
flush.
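
The pool selection itself boils down to a small amount of logic. A minimal sketch, with
illustrative types rather than maglevd's actual ones:

```go
package main

import "fmt"

// Backend and Pool are illustrative types, not maglevd's actual ones.
type Backend struct {
	Name   string
	Weight uint32
	Up     bool
}

type Pool struct {
	Name     string
	Backends []Backend
}

// effectiveWeights picks the first pool that has at least one backend in the
// `up` state; backends in all other pools (and down backends in the active
// pool) get an effective weight of zero.
func effectiveWeights(pools []Pool) map[string]uint32 {
	out := map[string]uint32{}
	active := -1
	for i, p := range pools {
		for _, b := range p.Backends {
			if b.Up {
				active = i
				break
			}
		}
		if active >= 0 {
			break
		}
	}
	for i, p := range pools {
		for _, b := range p.Backends {
			w := uint32(0)
			if i == active && b.Up {
				w = b.Weight
			}
			out[b.Name] = w
		}
	}
	return out
}

func main() {
	pools := []Pool{
		{Name: "primary", Backends: []Backend{{"nginx0-chlzn0", 100, true}, {"nginx0-chplo0", 100, false}}},
		{Name: "secondary", Backends: []Backend{{"nginx0-nlams0", 100, true}, {"nginx0-frggh0", 100, true}}},
	}
	fmt.Println(effectiveWeights(pools))
}
```
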
## Controlplane API: gRPC Endpoint
I want all client-visible functionality to be exposed through a single gRPC service. Read-only
questions like 'how many frontends are there?' or 'what is the current health state of backend X?',
but also state-changing operations like 'set frontend F's backend B to weight W', need to be simple
RPCs.
The most powerful RPC I add is called `WatchEvents`. This one returns a streaming response, and a
client can initiate a `WatchRequest` which specifies which event types to include. The `vpp-maglev`
daemon then pushes events as they happen - there is no polling. The event envelope is a protobuf `oneof`:

```protobuf
message Event {
  oneof event {
    LogEvent log = 1;           // structured log record with key/value attrs
    BackendEvent backend = 2;   // backend state transition
    FrontendEvent frontend = 3; // frontend aggregate state change
  }
}
```
Using this approach allows the maglev daemon to send useful information to downstream consumers like
a CLI or WebUI in a simple yet extensible way. I imagine a CLI command like `watch events`, or a web
dashboard that shows health checks and state transitions in real time. Those will be super useful:
state changes can be observed within milliseconds without any busy-waiting or polling.
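
For example, such a consumer could look roughly like the sketch below. It assumes generated Go
stubs for the `maglev.Maglev` service; the import path, message names, and getters are illustrative
rather than the project's actual generated code:

```go
package main

import (
	"context"
	"log"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	pb "example.invalid/maglev/proto" // hypothetical path to generated stubs
)

func main() {
	conn, err := grpc.NewClient("localhost:9090",
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	client := pb.NewMaglevClient(conn)
	// An empty WatchRequest here stands in for "all event types".
	stream, err := client.WatchEvents(context.Background(), &pb.WatchRequest{})
	if err != nil {
		log.Fatal(err)
	}
	for {
		ev, err := stream.Recv() // blocks until the daemon pushes the next event
		if err != nil {
			log.Fatal(err)
		}
		switch e := ev.GetEvent().(type) {
		case *pb.Event_Backend:
			log.Printf("backend %s: %v", e.Backend.GetBackendName(), e.Backend.GetTransition())
		case *pb.Event_Log:
			log.Printf("log: %s", e.Log.GetMsg())
		}
	}
}
```
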
I didn't know this, but in the process of writing `vpp-maglev`, I learned about gRPC server
reflection, which I've enabled by default, so I can poke at the API without having the `.proto`
file, for example using `grpcurl` on the commandline:

```sh
pim@summer:~$ grpcurl -plaintext localhost:9090 list
pim@summer:~$ grpcurl -plaintext localhost:9090 maglev.Maglev/ListFrontends
pim@summer:~$ grpcurl -plaintext -d '{"name":"http-vip"}' localhost:9090 maglev.Maglev/GetFrontend
```
|
||||||
|
|
||||||
|
## Dataplane API: VPP Plugin Programming
|
||||||
|
|
||||||
|
There are two parts to programming the VPP dataplane state. First, a reconciler reacts to individual
|
||||||
|
backend state transitions, and then a VPP LB Sync module computes a minimal set of API calls to
|
||||||
|
perform to make the dataplane reflect the backend state as seen by the controlplane daemon.
|
||||||
|
|
||||||
|
I tried to keep the reconciler as simple as possible. It only subscribes to the healthchecker's
event channel and, for every backend transition, calls `SyncLBStateVIP` for the affected frontend.
To catch drift in the VPP dataplane, for example if VPP restarted, or if we re-connected to VPP, a
periodic `SyncLBStateAll` also runs and sweeps up any changes. This should not occur in general
operation, though; it's a belt-and-suspenders type of thing.
This isolated `SyncLBState*` stuff is also a future hook for divorcing the healthchecker and the LB
reconciler into two different binaries: think of a datacenter with 100 maglev frontends and 1000
local backends. In such a scenario, having three (N+2) healthcheckers should be sufficient; no need
to have every maglev check every backend!

Otherwise, the reconciler carries no state of its own. I put all the logic in `SyncLBStateVIP`,
which computes the full desired state from the config and current health, diffs it against what VPP
has, and issues only the necessary Binary API calls to bring the two in sync.
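
The shape of that diff step is simple. A minimal sketch, in which the "API calls" are just printed
strings rather than actual VPP binary API messages:

```go
package main

import "fmt"

// syncVIP compares the desired application-server weights (computed from
// config plus current health) with what the dataplane currently has, and
// emits only what is needed to converge.
func syncVIP(desired, actual map[string]uint32) {
	for as, want := range desired {
		if have, ok := actual[as]; !ok || have != want {
			fmt.Printf("program AS %s with weight %d\n", as, want)
		}
	}
	for as := range actual {
		if _, ok := desired[as]; !ok {
			fmt.Printf("remove AS %s\n", as)
		}
	}
}

func main() {
	desired := map[string]uint32{"2001:db8::1": 100, "2001:db8::2": 0}
	actual := map[string]uint32{"2001:db8::1": 100, "2001:db8::3": 100}
	syncVIP(desired, actual)
	// Emits (in some order): program AS 2001:db8::2 with weight 0
	//                        remove AS 2001:db8::3
}
```
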
### Dataplane API: Startup Warmup

{{< image width="7em" float="left" src="/assets/shared/warning.png" alt="Warning" >}}

During one of my tests, I noticed that restarting maglevd completely wipes the VPP loadbalancer
VIPs. In hindsight this makes total sense because when the healthchecker starts, all backends are
in `unknown` state, which causes the weights to be zero until the backends transition to the `up`
state. This causes thrashing in the dataplane, which is not what I intended. I think for a bit and
decide how I'm going to prevent that. My solution is a two-phase startup warmup controlled by
`startup-min-delay` (default 5s) and `startup-max-delay` (default 30s):
**Phase 1: hands-off window.** For the first `startup-min-delay` seconds after maglevd starts,
neither the reconciler nor the periodic sync loop can touch VPP at all. Probes run, the checker
accumulates state, but transitions are suppressed at the dataplane. VPP continues serving whatever
it was programmed with before the restart.

**Phase 2: per-VIP release.** Between `startup-min-delay` and `startup-max-delay`, each VIP is
released as soon as every backend it references has reached a non-`unknown` state. A background
poll running every 250 milliseconds checks for releasable VIPs, and the reconciler also checks
on every received transition. Whichever wins the race performs a single `SyncLBStateVIP` for that
VIP. It is free to live its life.

**Watchdog.** At `startup-max-delay`, any VIP whose backends are still `unknown` is swept by a
final `SyncLBStateAll`. Those stragglers are programmed with weight zero: something is still wrong
with them, but this is an unlikely situation, and one of those belt-and-suspenders things again.
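
Condensed to its essence, the gate that decides whether a VIP may be programmed looks something
like this sketch (names and structure are illustrative, not maglevd's actual code):

```go
package main

import (
	"fmt"
	"time"
)

type warmup struct {
	start    time.Time
	minDelay time.Duration // startup-min-delay, e.g. 5s
	maxDelay time.Duration // startup-max-delay, e.g. 30s
}

// mayProgram reports whether a VIP may be synced to VPP right now, given
// whether all of its backends have left the `unknown` state.
func (w warmup) mayProgram(allBackendsKnown bool) bool {
	elapsed := time.Since(w.start)
	switch {
	case elapsed < w.minDelay:
		return false // phase 1: hands off, VPP keeps serving its old state
	case elapsed < w.maxDelay:
		return allBackendsKnown // phase 2: release per VIP once health is known
	default:
		return true // watchdog: sweep everything, stragglers get weight zero
	}
}

func main() {
	w := warmup{start: time.Now(), minDelay: 5 * time.Second, maxDelay: 30 * time.Second}
	fmt.Println(w.mayProgram(false), w.mayProgram(true)) // both false right after start
}
```
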
## Controlplane CLI: `maglevc`
|
||||||
|
|
||||||
|
`maglevc` connects to a running `maglevd` over gRPC and either executes a single command or drops
|
||||||
|
into an interactive shell. The same command tree is available in both modes:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pim@summer:~$ maglevc show frontends
|
||||||
|
pim@summer:~$ maglevc show backends nginx0-nlams0
|
||||||
|
pim@summer:~$ maglevc --color=false show vpp lb state
|
||||||
|
pim@summer:~$ maglevc --server chbtl2.net.ipng.ch:9090 watch events log level debug backend
|
||||||
|
```
|
||||||
|
|
||||||
|
In interactive mode, the prompt is `maglev> `. I put real effort into the shell experience because
|
||||||
|
this is the tool I reach for constantly when I want to interact with the system. I'm inspired by
|
||||||
|
Bird and try to mimic its look and feel, which will come in handy as IPng Networks uses Bird in
|
||||||
|
our routing controlplane. Having these tools all look and feel the same really helps, especially
|
||||||
|
when fecal matter hits the fast-spinning cooling device.
|
||||||
|
|
||||||
|
### Command Tree and Completion
|
||||||
|
|
||||||
|
The CLI is built around a tree of command nodes. Each node carries a short description used for
|
||||||
|
inline help, a list of fixed keyword children, and optionally a live-completion function that
|
||||||
|
fetches candidates from the runtime state when the _tab_ key is pressed. For backend names, the
|
||||||
|
completion function calls `ListBackends` with a one-second timeout; for frontend names,
|
||||||
|
`ListFrontends`; and so on. Unambiguous prefixes complete in place; multiple matches are listed so
|
||||||
|
I know what to type next. I saw this trick first in the SR Linux command-line interface, and I like the
|
||||||
|
in-line completion logic a lot. As the Dutch would say, 'Beter goed gestolen dan slecht bedacht'.
|
||||||
|
|
||||||
|
**Prefix matching** means I never have to type the full command. `sh ba nginx0` is equivalent to
`show backends nginx0`, and `sh vpp l s` expands to `show vpp lb state`. This was important to me
because I am often working in a hurry and do not want to type long commands.
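
The matching rule is the classic one: an unambiguous prefix selects exactly one keyword at the
current level of the command tree. A toy version in Go:

```go
package main

import (
	"fmt"
	"strings"
)

// matchPrefix returns the single keyword that the token unambiguously
// abbreviates, or an error if it matches zero or several candidates.
func matchPrefix(candidates []string, token string) (string, error) {
	var hits []string
	for _, c := range candidates {
		if strings.HasPrefix(c, token) {
			hits = append(hits, c)
		}
	}
	switch len(hits) {
	case 1:
		return hits[0], nil
	case 0:
		return "", fmt.Errorf("unknown keyword %q", token)
	default:
		return "", fmt.Errorf("ambiguous %q: %s", token, strings.Join(hits, ", "))
	}
}

func main() {
	level := []string{"show", "set", "watch", "config"}
	fmt.Println(matchPrefix(level, "sh")) // show
	fmt.Println(matchPrefix(level, "s"))  // ambiguous: show, set
}
```
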
**Inline help** via `?` will print the available completions for the current cursor position with
a short description next to each keyword. The `?` character is not consumed - the input line is
unchanged after the help display, which is identical to how Bird handles `?`.
|
||||||
|
|
||||||
|
**Color mode** defaults to on in the interactive shell and off in one-shot mode, so piped output is
|
||||||
|
always clean. You can override either default with `--color=true` or `--color=false`. This is of
|
||||||
|
course not necessary, but sometimes is helpful to see the difference between static tokens and
|
||||||
|
variable nouns in the output. I like it, anyway :)
|
||||||
|
|
||||||
|
### Viewing State
|
||||||
|
|
||||||
|
The most frequently used commands are the `show` family. `show backends <name>` shows the current
|
||||||
|
state, the enabled flag, the healthcheck, and the recent transition history with timestamps:
|
||||||
|
|
||||||
|
```
|
||||||
|
maglev> show backends nginx0-chplo0
|
||||||
|
name nginx0-chplo0
|
||||||
|
address 2001:678:d78:7::2:0
|
||||||
|
state up for 5d19h23m35s
|
||||||
|
enabled true
|
||||||
|
healthcheck nginx
|
||||||
|
transitions down → up 2026-04-24 18:19:51.608 5d19h23m35s ago
|
||||||
|
up → down 2026-04-23 22:14:48.311 6d15h28m39s ago
|
||||||
|
unknown → up 2026-04-22 09:44:31.664 8d3h58m55s ago
|
||||||
|
disabled → unknown 2026-04-22 09:44:30.628 8d3h58m56s ago
|
||||||
|
up → disabled 2026-04-22 09:41:54.495 8d4h1m33s ago
|
||||||
|
```
|
||||||
|
|
||||||
|
`show frontends <name>` shows both the configured weight and the effective weight for every backend
|
||||||
|
in every pool. The effective weight is what was actually programmed into VPP after pool failover
|
||||||
|
logic:
|
||||||
|
|
||||||
|
```
|
||||||
|
maglev> show frontends nginx-ip6-https
|
||||||
|
name nginx-ip6-https
|
||||||
|
address 2001:678:d78::1:0:1
|
||||||
|
protocol tcp
|
||||||
|
port 443
|
||||||
|
src-ip-sticky false
|
||||||
|
flush-on-down true
|
||||||
|
description IPv6 HTTPS VIP - nginx backends
|
||||||
|
pools
|
||||||
|
name primary
|
||||||
|
backends nginx0-chlzn0 weight 100 effective 100
|
||||||
|
nginx0-chplo0 weight 100 effective 0 [disabled]
|
||||||
|
name secondary
|
||||||
|
backends nginx0-nlams0 weight 100 effective 0
|
||||||
|
nginx0-frggh0 weight 100 effective 0
|
||||||
|
```
|
||||||
|
|
||||||
|
Here, I disabled `nginx0-chplo0` so its effective weight is zero; the two instances
`nginx0-nlams0` and `nginx0-frggh0` are in the secondary pool, which is inactive because the primary
pool still has `nginx0-chlzn0` up and serving (all) the traffic.
|
||||||
|
|
||||||
|
### VPP State - A Separate Concern
|
||||||
|
|
||||||
|
One design decision I am happy with is keeping the `maglevd` view of the world (frontend and backend
|
||||||
|
state, health counters, effective weights) completely separate from the VPP view (what is actually
|
||||||
|
programmed in the dataplane). Both are visible through `maglevc`, but through different commands:
|
||||||
|
|
||||||
|
```
|
||||||
|
maglev> show frontends # maglevd's view: pools, backends, effective weights
|
||||||
|
maglev> show vpp lb state # VPP's view: VIPs, AS addresses, bucket counts
|
||||||
|
maglev> show vpp lb counters # VPP's view: per-VIP packet/byte counters
|
||||||
|
```
|
||||||
|
|
||||||
|
The `show vpp lb state` command shows the VPP load-balancer state as the plugin sees it: each VIP
|
||||||
|
with its application servers, their VPP-side weights, and how many of the 1024 Maglev hash buckets
|
||||||
|
are assigned to each AS. This is invaluable for confirming that a sync operation actually reached
|
||||||
|
VPP, and for debugging bucket distribution across backends with different weights.
|
||||||
|
|
||||||
|
### Operator Actions
|
||||||
|
|
||||||
|
The `set` commands drive mutations. `set backend <name> pause` stops the probe goroutine and drives
|
||||||
|
the effective weight to zero; `set backend <name> disable` does the same but also flushes existing
|
||||||
|
flows. `set backend <name> resume` and `set backend <name> enable` restart probing and recompute
|
||||||
|
effective weights when the backend is ready to serve again.
|
||||||
|
|
||||||
|
Weight changes are immediate:
|
||||||
|
|
||||||
|
```
|
||||||
|
maglev> set frontend nginx-ip6-https pool primary backend nginx0-chplo0 weight 0
|
||||||
|
maglev> set frontend nginx-ip6-https pool primary backend nginx0-chplo0 weight 0 flush
|
||||||
|
maglev> set backend nginx0-chplo0 disable
|
||||||
|
```
|
||||||
|
|
||||||
|
The first command gracefully drains `nginx0-chplo0` from the pool `primary` in frontend
|
||||||
|
`nginx-ip6-https`. When setting the weight to zero, new flows go elsewhere but existing ones finish.
|
||||||
|
The second flushes existing flows immediately. The third command then marks the backend as disabled,
|
||||||
|
which will remove it from serving in all pools it's a member of. This is useful when performing
|
||||||
|
maintenance on a backend, and it's the command I ran in the 'show frontend' output above.
|
||||||
|
|
||||||
|
Arguably the coolest idea, `maglevc watch events`, streams everything in real time. Combined with
|
||||||
|
`log level debug`, it shows every probe attempt and every VPP API call as they happen:
|
||||||
|
|
||||||
|
```
|
||||||
|
maglev> watch events log level debug backend
|
||||||
|
{"backend":{"backendName":"nginx0-chlzn0","transition":{"from":"up","to":"up"}}}
|
||||||
|
{"backend":{"backendName":"nginx0-chplo0","transition":{"from":"up","to":"up"}}}
|
||||||
|
{"backend":{"backendName":"nginx0-frggh0","transition":{"from":"up","to":"up"}}}
|
||||||
|
{"backend":{"backendName":"nginx0-nlams0","transition":{"from":"up","to":"up"}}}
|
||||||
|
{"log":{"atUnixNs":"1777558154335278835","level":"DEBUG","msg":"probe-start",
|
||||||
|
"attrs":[{"key":"backend","value":"nginx0-chplo0"},{"key":"type","value":"https"}]}}
|
||||||
|
{"log":{"atUnixNs":"1777558154371619020","level":"DEBUG","msg":"probe-done",
|
||||||
|
"attrs":[{"key":"backend","value":"nginx0-chplo0"},{"key":"type","value":"https"},
|
||||||
|
{"key":"ok","value":"true"},{"key":"code","value":"L7OK"},{"key":"detail"},
|
||||||
|
{"key":"elapsed","value":"36ms"}]}}
|
||||||
|
```
|
||||||
|
|
||||||
|
And finally, I mimic Bird's "reconfigure" with a set of two primitives `config check` and `config
|
||||||
|
reload` which let me validate and apply configuration changes without restarting the daemon. With
|
||||||
|
that, the maglev daemon, the main brains of the operation, is feature complete.
|
||||||
|
|
||||||
|
## Test Utility: `maglevt`
|
||||||
|
|
||||||
|
Once `maglevd` is running and `maglevc` shows everything healthy, the natural next question is: does
it actually work end-to-end? A healthcheck passing means the backend can accept a TCP connection
or return an HTTP 200, but it does not tell me whether a client hitting the VIP actually reaches the
right backend, or whether failover is visible at the application level.
|
||||||
|
|
||||||
|
I wanted a tool that could sit outside the control plane entirely - not talking gRPC, not reading
|
||||||
|
`maglevd` state - but just hitting the VIPs directly as a real client would, tallying which backend
|
||||||
|
served each request. The obvious approach is to configure each backend to include its own hostname
|
||||||
|
in an HTTP response header. On my nginx servers I add a header `X-IPng-Frontend` which returns the
|
||||||
|
local `$hostname` variable. Then a probe tool that reads `X-IPng-Frontend` from each response can
|
||||||
|
show the live distribution across backends, and a failover is immediately visible as a
|
||||||
|
redistribution of the tally.
|
||||||
|
|
||||||
|
That idea turns into `maglevt`, which reads one or more `maglev.yaml` files, enumerates the
HTTP/HTTPS frontends, and probes each VIP at a configurable interval (default 100ms per VIP, with
+/-10% jitter to prevent phase-locking). Each probe opens a fresh TCP connection - keep-alives are
off by default - so every request is independently hashed by VPP's Maglev algorithm. The tally
reshuffles the moment a backend goes down or a standby pool activates.
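
Stripped of the TUI, one probe cycle amounts to the following sketch: open a fresh connection to
the VIP, read the `X-IPng-Frontend` header, and keep a tally. The loop count here is arbitrary;
`maglevt` itself probes on a timer:

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	// Keep-alives disabled so every request opens a fresh TCP connection and
	// is independently hashed by VPP's Maglev algorithm.
	client := &http.Client{
		Timeout:   2 * time.Second,
		Transport: &http.Transport{DisableKeepAlives: true},
	}
	tally := map[string]int{}
	for i := 0; i < 10; i++ {
		resp, err := client.Get("https://vip0.l.ipng.ch/")
		if err != nil {
			tally["error"]++
			continue
		}
		resp.Body.Close()
		// Each nginx frontend sets X-IPng-Frontend to its own hostname.
		tally[resp.Header.Get("X-IPng-Frontend")]++
	}
	fmt.Println(tally)
}
```
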
The UI is a terminal dashboard built with [[Bubble Tea](https://github.com/charmbracelet/bubbletea)],
a Go TUI library. Each VIP gets a tile showing a rolling latency summary (min, max, average, p95),
running success and failure counts, the response header tally, and a list of recent errors, like so:
|
||||||
|
|
||||||
|
{{< image width="100%" src="/assets/vpp-maglev/maglevt.png" alt="VPP Maglev TUI client" >}}
|
||||||
|
|
||||||
|
There's a lot to see in this screenshot, so let me unpack it. I'm running `maglevt` on a machine at
|
||||||
|
AS12859, BIT in the Netherlands, called `nlede01.paphosting.net`. It's reaching the VIPs that are
|
||||||
|
announced in Amsterdam, the Netherlands (`vip0.l.ipng.ch`) and Lille, France (`vip1.l.ipng.ch`), and
|
||||||
|
it is doing so with both IPv4 and IPv6, and it is doing so on port 80 and 443, which yields eight
|
||||||
|
targets. The webservers are configured to respond with an empty HTTP 204 response, and I've replayed
|
||||||
|
about 1Mio requests to each VIP. A few of these failed, which was mostly me playing around with
|
||||||
|
backend drains/flushes, hostile shutdowns (rebooting an nginx), and VIP failover scenarios. Then,
|
||||||
|
each VIP shows its last 100 probes in terms of latency, latency tail, and success rate.
|
||||||
|
|
||||||
|
In the second section, the tool shows how many times a response carried each `X-IPng-Frontend`
header value. The greyed-out ones are values which have not been seen in the last five seconds, the
white ones are current: it shows that this client is consistently hashed to one frontend at a time
(because each row has exactly one bright white entry): this test is using HTTP keepalive.
|
||||||
|
|
||||||
|
In the bottom section, a list of recent events is shown - this is mostly when the latency ceiling is
|
||||||
|
hit. These are 'spikes' written in bright yellow, or if things like timeouts occur, they would be
|
||||||
|
written in bright red.
|
||||||
|
|
||||||
|
{{< image width="4em" float="left" src="/assets/vpp-maglev/Claude_AI.svg" alt="Claude Code" >}}
|
||||||
|
|
||||||
|
I have to be honest here: before this project I had never written a Terminal UI in my life. The
|
||||||
|
Bubble Tea documentation is good but the model - a pure functional message-passing loop - took me
|
||||||
|
a while to internalize. I ended up leaning on Claude quite a bit to get the layout right, especially
|
||||||
|
the live-updating cells and the latency histogram accumulation.
|
||||||
|
|
||||||
|
What I found was that I could describe what I wanted in plain language and the code that came back
|
||||||
|
was usually correct and idiomatic. I then spent time reading and understanding the code before
|
||||||
|
committing it. I learned a lot about how Go handles terminal output and about the Elm architecture
|
||||||
|
that Bubble Tea is based on - much faster than I would have on my own. Having an AI collaborator
|
||||||
|
that writes correct code does not mean I can stop learning; if anything, having working code in
|
||||||
|
front of me makes the learning faster!
|
||||||
|
|
||||||
|
## Frontend: GUI `maglevd-frontend`
|
||||||
|
|
||||||
|
Now that I'm in "yes, I vibe"-admission-mode, there's another type of component I've rarely if ever
|
||||||
|
worked on: web frontends! `maglevd-frontend` is a single Go binary with a
|
||||||
|
[[SolidJS](https://www.solidjs.com/)] single-page app embedded at build time via `//go:embed` - no
|
||||||
|
runtime file dependencies, no Node.js required after the build. Simple and standalone.
|
||||||
|
|
||||||
|
One design goal I set early was to be able to observe all my load balancer instances from a single
|
||||||
|
dashboard. `maglevd-frontend` connects to one or more `maglevd` instances by adding them to a `--server`
|
||||||
|
flag upon startup.
|
||||||
|
|
||||||
|
At the top of the page, I add a **scope selector**, one pill per configured `maglevd`, colored green when
|
||||||
|
the frontend's gRPC channel to that instance is alive and red when it cannot connect. Clicking a pill
|
||||||
|
switches the entire view to that site's frontends. I notice that reloading the page resets all of
|
||||||
|
it, so I add a cookie so that all selections can persist across page reloads.
|
||||||
|
|
||||||
|
### Frontend: Live Event Streaming
|
||||||
|
|
||||||
|
I learn about Server-Sent Events (SSE): `maglevd-frontend` subscribes to `WatchEvents` on each
configured `maglevd` and translates the gRPC stream into SSE events on the `/view/api/events`
endpoint. The browser's EventSource API reconnects automatically on disconnect, and the server
maintains a 30-second / 2000-event ring buffer so that a page reload replays recent events using
`Last-Event-ID`. I'm pleased with the result: a dashboard that stays current in real time with no
polling and visible catch-up after a brief disconnect, like a laptop lid close.
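
The streaming half of that bridge is small. A bare-bones sketch of an SSE handler fed from a
channel; the real frontend additionally keeps the ring buffer and honours `Last-Event-ID`, which I
leave out here:

```go
package main

import (
	"fmt"
	"log"
	"net/http"
)

// sseHandler writes events arriving on a channel in SSE wire format and
// flushes immediately, so the browser's EventSource sees them right away.
func sseHandler(events <-chan string) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")
		w.Header().Set("Cache-Control", "no-cache")
		flusher, ok := w.(http.Flusher)
		if !ok {
			http.Error(w, "streaming unsupported", http.StatusInternalServerError)
			return
		}
		id := 0
		for {
			select {
			case <-r.Context().Done():
				return
			case ev := <-events:
				id++
				fmt.Fprintf(w, "id: %d\ndata: %s\n\n", id, ev)
				flusher.Flush()
			}
		}
	}
}

func main() {
	events := make(chan string, 16)
	events <- `{"backend":{"backendName":"nginx0-chplo0","transition":{"from":"up","to":"down"}}}`
	http.Handle("/view/api/events", sseHandler(events))
	log.Fatal(http.ListenAndServe("127.0.0.1:8080", nil))
}
```
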
When a backend transitions from `up` to `down`, the badge in the frontend card updates within
|
||||||
|
milliseconds. A pool failover - where the primary pool empties and the fallback pool activates -
|
||||||
|
appears as a cascade of state changes followed by a re-rendering of the effective weight column. The
|
||||||
|
LB buckets column (showing VPP's actual hash table allocation for each AS) is refreshed via a
|
||||||
|
debounced `GetVPPLBState` scrape on every transition, at most once per second per `maglevd`. And
|
||||||
|
looking at this frontend, it may be clear to you why I designed the backend to have a subscribable
|
||||||
|
event stream:
|
||||||
|
|
||||||
|
{{< image width="100%" src="/assets/vpp-maglev/maglev-frontend.png" alt="VPP Maglev Frontend" >}}
|
||||||
|
|
||||||
|
The tech stack for the Single Page App is [[SolidJS](https://www.solidjs.com/)], a super cool reactive
|
||||||
|
framework that compiles away its virtual DOM and produces small, fast bundles. I chose it over React
|
||||||
|
partly because I was curious about it and partly because the bundle size matters when you are
|
||||||
|
embedding the whole thing in a Go binary. The event store is a simple Solid signal that the SSE
|
||||||
|
handler updates; every component that cares re-renders automatically without explicit subscription
|
||||||
|
management. It's slick and much easier to use than I had initially thought!
|
||||||
|
|
||||||
|
### Frontend: Admin Surface
|
||||||
|
|
||||||
|
When both `MAGLEV_FRONTEND_USER` and `MAGLEV_FRONTEND_PASSWORD` environment variables are set, the
|
||||||
|
admin surface is activated at `/admin/`. I make sure that without credentials, `/admin/` returns
|
||||||
|
404. In this case, the admin path is not just unprotected, it is entirely absent. Security matters,
|
||||||
|
at least a little bit, even if the frontend will not be exposed onto the Internet.
|
||||||
|
|
||||||
|
In admin mode, every backend row grows a `⋮` (kebab) menu with `pause`, `resume`, `enable`,
|
||||||
|
`disable`, and `set weight` entries. Lifecycle actions open a confirmation dialog that spells out the
|
||||||
|
dataplane consequence: `disable` explicitly warns that it will drop live sessions via the flow-table
|
||||||
|
flush. The weight dialog has a 0-100 slider and a `flush existing flows` checkbox - unchecked is the
|
||||||
|
graceful drain, checked is the immediate session-drop path.
|
||||||
|
|
||||||
|
Also in admin mode, a **Debug panel** at the bottom of the page tails every event the SPA has seen
|
||||||
|
across all `maglevd` instances: backend and frontend transitions, log lines, VPP LB sync events, and
|
||||||
|
connection status flips, all formatted for scanning. A scope filter narrows the tail to the current
|
||||||
|
`maglevd`; an `all maglevds` checkbox enables firehose mode; a `pause` button freezes the tail so
|
||||||
|
you can read back.
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|
I rolled this out at IPng Networks a few weeks ago, and it's been running rock solid ever since.
I've taken four VPP machines, connected them to the core routers, and started to announce two VIPs,
each announced in two cities. `vip0` is announced from Zurich (Switzerland) and Amsterdam (the
Netherlands), and `vip1` is announced from Lucerne (Switzerland) and Lille (France). I've moved over
most websites, as I find putting skin in the game is important:
|
||||||
|
|
||||||
|
```
|
||||||
|
pim@summer:~$ host ipng.ch
|
||||||
|
ipng.ch has address 194.1.163.31
|
||||||
|
ipng.ch has address 194.126.235.31
|
||||||
|
ipng.ch has IPv6 address 2001:678:d78::1:0:1
|
||||||
|
ipng.ch has IPv6 address 2a0b:dd80::1:0:1
|
||||||
|
```
|
||||||
|
|
||||||
|
The only service I'm a bit apprehensive about - even though I don't think I need to be - is the
|
||||||
|
[[Static CT Logs](/s/ct/)], which do about 2.5kqps of reads and 400qps of writes at the moment. The
|
||||||
|
plan is to let this marinate for a few weeks, and then move the read-path and later on, also the
|
||||||
|
write-path to this construction.
|
||||||
|
|
||||||
|
You can find the project at [[git.ipng.ch/ipng/vpp-maglev](https://git.ipng.ch/ipng/vpp-maglev.git)]
|
||||||
|
and debian packages are on [[deb.ipng.ch](https://deb.ipng.ch/)]. I wrote some reasonable
|
||||||
|
documentation for the project at:
|
||||||
|
* [[docs/design.md](https://git.ipng.ch/ipng/vpp-maglev/src/branch/main/docs/design.md)] on the
|
||||||
|
architecture, components, and numbered functional / non-functional requirements. Start here if
|
||||||
|
you want the big picture before diving into the code.
|
||||||
|
* [[docs/user-guide.md](https://git.ipng.ch/ipng/vpp-maglev/src/branch/main/docs/user-guide.md)]
|
||||||
|
describes the flags, signals, and maglevc command reference.
|
||||||
|
* [[docs/config-guide.md](https://git.ipng.ch/ipng/vpp-maglev/src/branch/main/docs/config-guide.md)]
|
||||||
|
shows the full YAML configuration file reference.
|
||||||
|
* [[docs/healthchecks.md](https://git.ipng.ch/ipng/vpp-maglev/src/branch/main/docs/healthchecks.md)]
|
||||||
|
is a deepdive on the health state machine, probe scheduling, and rise/fall semantics.
|
||||||
|
|
||||||
|
## What's Next
|
||||||
|
|
||||||
|
Using Maglev has a few significant benefits. Most importantly, I can drain (or weather an outage of)
|
||||||
|
any nginx frontend within seconds, and there is no more DNS propagation delay. Another key property
|
||||||
|
is that the loadbalanced VIPs themselves are now completely mobile, and anycasted. I can drain a VPP
|
||||||
|
loadbalancer by simply removing its announcement of the VIPs, and anycast routing will seamlessly
|
||||||
|
move the traffic to another live replica. This immunizes IPng from site / datacenter / machine
|
||||||
|
failures as well, as rerouting happens within only a few seconds.
|
||||||
|
|
||||||
|
However, there are also a few smaller downsides. Notably, this setup is more complex than merely
having "the webserver": there are now half a dozen webservers, and potentially half a dozen places
where traffic can enter the system, which poses a challenge with observability. In an upcoming
article, I'll spend some time thinking through how to make it as easy as possible, with Prometheus
and Grafana dashboards, as well as a clever trick to be able to see which Maglev loadbalancer sent
which request to which IPng nginx Frontend. If this type of thing is interesting to you, stay tuned!
---
|
||||||
|
date: "2026-05-15T18:22:11Z"
|
||||||
|
title: VPP with Maglev Loadbalancing - Part 3
|
||||||
|
---
|
||||||
|
|
||||||
|
{{< image width="200px" float="right" src="/assets/vpp/fdio-color.svg" alt="VPP" >}}
|
||||||
|
|
||||||
|
# About this series
|
||||||
|
|
||||||
|
Ever since I first saw VPP - the Vector Packet Processor - I have been deeply impressed with its
|
||||||
|
performance and versatility. For those of us who have used Cisco IOS/XR devices, like the classic
|
||||||
|
_ASR_ (aggregation service router), VPP will look and feel quite familiar as many of the approaches
|
||||||
|
are shared between the two.
|
||||||
|
|
||||||
|
In the [[first article]({{< ref "2026-04-30-vpp-maglev" >}})] of this series, I looked at the Maglev
|
||||||
|
algorithm and how it is implemented in VPP. Then, I wrote about a health checking controller called
|
||||||
|
[[vpp-maglev](https://git.ipng.ch/ipng/vpp-maglev)], and its architecture, server, client and
|
||||||
|
frontend in a [[second piece]({{< ref 2026-05-08-vpp-maglev-2 >}})]. The traffic flows in a somewhat
|
||||||
|
more complex way now from users to IPng's webservers, so this article dives into the observability
|
||||||
|
that makes this system manageable.
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
One might argue the Maglev delivery system is elegant. Traffic comes in over anycasted VIPs, VPP
|
||||||
|
hashes each new TCP flow onto an application backend using the Maglev algorithm. The backend
|
||||||
|
receives the packet via a GRE6 tunnel and responds directly to the client. Elegant, stateless, fast.
|
||||||
|
|
||||||
|
{{< image width="10em" float="left" src="/assets/smtp/pulling_hair.png" alt="Pulling Hair" >}}
|
||||||
|
|
||||||
|
I would argue, this is elegant right up until something explodes, and then all of a sudden the
|
||||||
|
system is just an opaque blackbox: VPP operates at the flow level and has no knowledge of individual
|
||||||
|
HTTP requests. GRE delivery means nginx sees the client IP but not which maglev sent it. Direct
|
||||||
|
Server Return means the response never passes through the original VPP Maglev instance again.
|
||||||
|
Nothing in this chain sees the full picture of a single request's journey, which will make me lose
|
||||||
|
sleep. And I love sleeping.
|
||||||
|
|
||||||
|
To close that gap, I need to build an observability layer on top of this system:
|
||||||
|
|
||||||
|
1. **vpp-maglev** itself exposes backend health state and VPP dataplane counters over Prometheus.
|
||||||
|
2. **nginx-ipng-stats-plugin** is an nginx module that attributes each request to the Maglev
|
||||||
|
frontend that delivered it, and exports per-VIP, per-frontend counters over Prometheus.
|
||||||
|
3. **nginx-logtail** is a Go pipeline that ingests per-request log lines from all nginx instances
|
||||||
|
and maintains globally ranked top-K tables across multiple time windows for high-cardinality
|
||||||
|
queries, exposed via gRPC and rollup stats, once again, over Prometheus.
|
||||||
|
|
||||||
|
Before explaining more, I need to get something off my chest. Some readers asked me, based on my
last article, why I would name the header `X-IPng-Frontend` even though it's clearly a maglev
backend. It turns out the words "frontend" and "backend" are overloaded, pretty much in any system
that has more than two components. In a chain (VPP Maglev <-> nginx <-> docker container), nginx is
both a backend (of the maglev) and a frontend (of the docker container), so I'll use the following
terms to try to disambiguate:
|
||||||
|
|
||||||
|
- **maglev frontend**: a VPP machine that announces BGP anycast VIPs and forwards traffic to nginx
|
||||||
|
machines via GRE6 tunnels.
|
||||||
|
- **nginx frontend**: an nginx machine that receives GRE-wrapped packets, unwraps them, and proxies
|
||||||
|
requests to application containers. From Maglev's perspective it is an application server (AS);
|
||||||
|
from the web service perspective it is the front door.
|
||||||
|
- **nginx backend**: a Docker container running an application like Hugo, Nextcloud, or Immich. This
|
||||||
|
is what the user's request ultimately reaches.
|
||||||
|
|
||||||
|
IPng currently announces two IPv4/IPv6 anycasted VIPs (`vip0.l.ipng.ch` and `vip1.l.ipng.ch`) from
|
||||||
|
four maglev frontends (`chbtl2`, `frggh1`, `chlzn1`, `nlams1`) and eight nginx frontends spread
|
||||||
|
across Switzerland, France, and the Netherlands. Adding/Removing instances of Maglev and instances
|
||||||
|
of nginx is non-intrusive and can be done in mere minutes with Ansible and Kees.
|
||||||
|
|
||||||
|
### Maglev: GRE Delivery
|
||||||
|
|
||||||
|
VPP delivers each packet to an nginx machine by wrapping it in a GRE6 tunnel. GRE6 uses IPv6 as the
outer header regardless of whether the inner packet is IPv4 or IPv6. The per-packet overhead is
fixed at 44 bytes: the outer IPv6 header takes 40 bytes and the GRE header, assuming no
key/checksum, weighs in at an additional 4 bytes. A standard 1500-byte client packet wrapped in GRE6
comes out at 1544 bytes regardless of address family:
|
||||||
|
|
||||||
|
| Inner packet | Breakdown | Inner L3 size | GRE6 overhead | Wrapped total |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| IPv4 TCP | 20B IPv4 + 20B TCP + 1460B payload | 1500 bytes | 40+4 bytes | **1544 bytes** |
|
||||||
|
| IPv6 TCP | 40B IPv6 + 20B TCP + 1440B payload | 1500 bytes | 40+4 bytes | **1544 bytes** |
|
||||||
|
|
||||||
|
I decide to configure the GRE tunnel interfaces on each nginx machine with an MTU of 2026 bytes,
|
||||||
|
precisely to accommodate those 1544-byte wrapped packets without fragmentation. The 44-byte
|
||||||
|
encapsulation overhead is an internal implementation detail; from the internet's perspective,
|
||||||
|
traffic wants to flow at the standard 1500-byte MTU end to end.
|
||||||
|
|
||||||
|
That last point is why [[MSS clamping](https://en.wikipedia.org/wiki/Maximum_segment_size)] is
|
||||||
|
necessary. When nginx sends its SYN-ACK in the three-way handshake, it would normally derive its
|
||||||
|
advertised MSS from its own interface MTU. An interface MTU of 2026 would yield an MSS of 1986 bytes
|
||||||
|
(IPv4) or 1966 bytes (IPv6), telling the client it can send segments of almost 2000 bytes. Those
|
||||||
|
segments would travel across the internet on a standard 1500-byte path and cause fragmentation or
|
||||||
|
black-holing before they even reached VPP. No bueno!
|
||||||
|
|
||||||
|
MSS clamping in the SYN-ACK overrides that interface-derived value and advertises the standard
|
||||||
|
internet MSS instead:
|
||||||
|
|
||||||
|
- IPv4 clients: 1500 - 20 (IPv4) - 20 (TCP) = **1460 bytes**
|
||||||
|
- IPv6 clients: 1500 - 40 (IPv6) - 20 (TCP) = **1440 bytes**
|
||||||
|
|
||||||
|
The client then sends standard-sized segments. Those arrive at VPP as 1500-byte packets, get
|
||||||
|
GRE6-wrapped to 1544 bytes, and arrive at the nginx GRE interface well within its 2026-byte MTU.
|
||||||
|
On the return path, nginx sends responses directly to the client via DSR at the standard 1500-byte
|
||||||
|
MTU.
|
||||||
|
|
||||||
|
On each nginx, I first allow GRE6 from the Maglev frontends, and apply the MSS clamping on the
|
||||||
|
(larger MTU) internet-facing interface `enp1s0f0`. Then, using netplan, I'll create an IP6GRE tunnel
|
||||||
|
to each Maglev frontend, using descriptive interface names which reveal who owns the remote side of
|
||||||
|
the tunnel:
|
||||||
|
|
||||||
|
```
|
||||||
|
pim@nginx0-chlzn0:~$ sudo ip6tables -A INPUT -p gre -s 2001:678:d78::/96 -j ACCEPT
pim@nginx0-chlzn0:~$ sudo ip6tables -t mangle -A POSTROUTING -o enp1s0f0 -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --set-mss 1440
pim@nginx0-chlzn0:~$ sudo iptables -t mangle -A POSTROUTING -o enp1s0f0 -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --set-mss 1460
|
||||||
|
pim@nginx0-chlzn0:~$ cat /etc/netplan/01-netcfg.yaml
|
||||||
|
network:
|
||||||
|
version: 2
|
||||||
|
renderer: networkd
|
||||||
|
ethernets:
|
||||||
|
...
|
||||||
|
tunnels:
|
||||||
|
chbtl2:
|
||||||
|
mode: ip6gre
|
||||||
|
mtu: 1544
|
||||||
|
local: 2001:678:d78:f::2:0
|
||||||
|
remote: 2001:678:d78::e
|
||||||
|
addresses: [ 194.1.163.31/32, 2001:678:d78::1:0:1/128, 194.126.235.31/32, 2a0b:dd80::1:0:1/128 ]
|
||||||
|
chlzn1:
|
||||||
|
mode: ip6gre
|
||||||
|
mtu: 1544
|
||||||
|
local: 2001:678:d78:f::2:0
|
||||||
|
remote: 2001:678:d78::10
|
||||||
|
addresses: [ 194.1.163.31/32, 2001:678:d78::1:0:1/128, 194.126.235.31/32, 2a0b:dd80::1:0:1/128 ]
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
With this, I know that traffic arriving on `chbtl2` was forwarded by the `chbtl2` maglev frontend.
|
||||||
|
That mapping is the key that makes per-source attribution possible, which is a property that I can
|
||||||
|
exploit in an nginx plugin. Clever!
|
||||||
|
|
||||||
|
### NGINX: Stats Plugin
|
||||||
|
|
||||||
|
I wrote a tiny nginx module on
|
||||||
|
[[nginx-ipng-stats-plugin](https://git.ipng.ch/ipng/nginx-ipng-stats-plugin)] that counts requests
|
||||||
|
per VIP, split by which interface (in my case, which GRE tunnel) delivered them. It is packaged as a
|
||||||
|
standard Debian `libnginx-mod-http-ipng-stats` package on [[deb.ipng.ch](https://deb.ipng.ch/)] and
|
||||||
|
loads into stock upstream nginx without recompiling nginx itself. The design document and user guide
|
||||||
|
are in the [[docs/](https://git.ipng.ch/ipng/nginx-ipng-stats-plugin/src/branch/main/docs)]
|
||||||
|
directory of the repo.
|
||||||
|
|
||||||
|
A tcpdump on the GRE tunnel interface confirms the handshake looks correct. Both sides settle on
|
||||||
|
the standard internet MSS values despite the larger internal MTU:
|
||||||
|
|
||||||
|
```
|
||||||
|
pim@nginx0-chlzn0:~$ sudo tcpdump -i any -n '(tcp[tcpflags] & tcp-syn) != 0 or (ip6 and ip6[6]=6 and (ip6[13+40] & 2) != 0)'
|
||||||
|
05:13:46.547826 chbtl2 In IP 162.19.252.246.39246 > 194.1.163.31.443: Flags [S], seq 2576867891,
|
||||||
|
win 64240, options [mss 1460,sackOK,TS val 767241235 ecr 0,nop,wscale 7], length 0
|
||||||
|
05:13:46.547860 enp1s0f0 Out IP 194.1.163.31.443 > 162.19.252.246.39246: Flags [S.], seq 3931956759,
|
||||||
|
ack 2576867892, win 65142, options [mss 1460,sackOK,TS val 681127624 ecr 767241235,nop,wscale 7], length 0
|
||||||
|
|
||||||
|
05:13:46.584236 chlzn1 In IP6 2a03:2880:f812:5e::.28858 > 2a0b:dd80::1:0:1.443: Flags [S], seq 36254033,
|
||||||
|
win 65535, options [mss 1380,sackOK,TS val 3022307959 ecr 0,nop,wscale 8], length 0
|
||||||
|
05:13:46.584300 enp1s0f0 Out IP6 2a0b:dd80::1:0:1.443 > 2a03:2880:f812:5e::.28858: Flags [S.], seq 3586034356,
|
||||||
|
ack 36254034, win 64482, options [mss 1440,sackOK,TS val 1977557327 ecr 3022307959,nop,wscale 7], length 0
|
||||||
|
```
|
||||||
|
|
||||||
|
The thing to look for here is that the IPv4 client at OVH was told, in the packet going out on
`enp1s0f0`, that we are happy to take an MSS of 1460 for this IPv4 TCP connection, despite us having
a larger MTU on the interface. Similarly, the IPv6 client at Meta was told we'll accept an MSS of
1440. That's MSS clamping at work!
|
||||||
|
|
||||||
|
#### A curious case of `SO_BINDTODEVICE` versus `IP_PKTINFO`
|
||||||
|
|
||||||
|
Attributing a connection to its ingress interface sounds simple: bind each listening socket to a
|
||||||
|
specific interface with `SO_BINDTODEVICE`. Traffic arriving on `chbtl2` goes to that socket;
|
||||||
|
traffic on `nlams1` goes to another socket; attribution falls out of which socket accepted the
|
||||||
|
connection. And this approach signaled my first failure :-)
|
||||||
|
|
||||||
|
The problem is that `SO_BINDTODEVICE` affects both ingress _and egress_. A socket bound to
|
||||||
|
`chbtl2` device will try to route its return traffic back through that GRE tunnel, back to the
|
||||||
|
Maglev machine. That completely breaks direct server return, which requires responses to leave via
|
||||||
|
the default gateway (the internet-facing NIC). `SO_BINDTODEVICE` on a GRE interface and DSR are
|
||||||
|
mutually exclusive. Yikes.
|
||||||
|
|
||||||
|
{{< image width="8em" float="left" src="/assets/shared/brain.png" alt="brain" >}}
|
||||||
|
|
||||||
|
But Linux has another trick up its kernelistic sleeve: `IP_PKTINFO` (for
|
||||||
|
[[IPv4](https://man7.org/linux/man-pages/man7/ip.7.html)]) and `IPV6_RECVPKTINFO` (for
|
||||||
|
[[IPv6](https://man7.org/linux/man-pages/man7/ipv6.7.html)]). These socket options tell the kernel
|
||||||
|
to attach a control message (cmsg) to each accepted connection containing the interface index on
|
||||||
|
which the packet arrived. The listening socket itself remains a wildcard bound to no specific
|
||||||
|
interface, so outgoing packets follow the normal routing table and leave via the default gateway.
|
||||||
|
Attribution comes from reading the cmsg at connection time, not from socket binding. Whoot!
|
||||||
|
|
||||||
|
In pseudocode, reading the ingress interface from the ancillary data looks like this:
|
||||||
|
|
||||||
|
```c
|
||||||
|
// Enable IP_PKTINFO on the socket at init time.
|
||||||
|
setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one));
|
||||||
|
setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
|
||||||
|
|
||||||
|
// At accept time, read the cmsg to find the ifindex.
|
||||||
|
struct msghdr msg = { ... };
|
||||||
|
recvmsg(fd, &msg, 0);
|
||||||
|
for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
|
||||||
|
if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_PKTINFO) {
|
||||||
|
struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cm);
|
||||||
|
ifindex = pki->ipi_ifindex; // <- the ingress interface
|
||||||
|
}
|
||||||
|
if (cm->cmsg_level == IPPROTO_IPV6 && cm->cmsg_type == IPV6_PKTINFO) {
|
||||||
|
struct in6_pktinfo *pki = (struct in6_pktinfo *)CMSG_DATA(cm);
|
||||||
|
ifindex = pki->ipi6_ifindex; // <- the ingress interface
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The module enables both socket options on every nginx listening socket and reads the ifindex from
|
||||||
|
the cmsg on each accepted connection. It then looks the ifindex up in a table built at
|
||||||
|
configuration time from the `device=<ifname>` parameters on the `listen` directives. The match
|
||||||
|
produces a short attribution tag. Connections that arrive on an interface with no registered
|
||||||
|
binding fall back to the configurable default tag (called `direct`), which handles unattributed
|
||||||
|
traffic like direct HTTPS connections that bypass Maglev entirely.
|
||||||
|
|
||||||
|
#### NGINX Plugin
|
||||||
|
|
||||||
|
Three things are needed in `nginx.conf`: a shared memory zone for counters, device-bound `listen`
|
||||||
|
directives that map each GRE interface to a source tag, and a scrape location:
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
http {
|
||||||
|
ipng_stats_zone ipng:4m;
|
||||||
|
|
||||||
|
server {
|
||||||
|
# One device-tagged listen per GRE interface per address family.
|
||||||
|
# Each 'listen' tells the module which ifname maps to which tag.
|
||||||
|
listen 80 device=chbtl2 ipng_source_tag=chbtl2;
|
||||||
|
listen [::]:80 device=chbtl2 ipng_source_tag=chbtl2;
|
||||||
|
listen 443 device=chbtl2 ipng_source_tag=chbtl2 ssl;
|
||||||
|
listen [::]:443 device=chbtl2 ipng_source_tag=chbtl2 ssl;
|
||||||
|
listen 80 device=nlams1 ipng_source_tag=nlams1;
|
||||||
|
listen [::]:80 device=nlams1 ipng_source_tag=nlams1;
|
||||||
|
listen 443 device=nlams1 ipng_source_tag=nlams1 ssl;
|
||||||
|
listen [::]:443 device=nlams1 ipng_source_tag=nlams1 ssl;
|
||||||
|
# ... repeat for frggh1, chlzn1 ...
|
||||||
|
}
|
||||||
|
|
||||||
|
# Scrape endpoint on a management port.
|
||||||
|
server {
|
||||||
|
listen 127.0.0.1:9113;
|
||||||
|
location = /.well-known/ipng/statsz {
|
||||||
|
ipng_stats;
|
||||||
|
allow 127.0.0.1;
|
||||||
|
deny all;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The cool trick is that multiple `device=` listens on the same port are not multiple kernel sockets;
under this `IP_PKTINFO` model they collapse to a single wildcard socket, and the module distinguishes
traffic by reading the ifindex from the cmsg. Adding a new VIP is a `server_name` change in nginx;
adding a new maglev frontend is a two-line append to the listens file. Neither requires a restart.
|
||||||
|
|
||||||
|
I kept the hot path intentionally minimal. On each request's log phase, the nginx worker increments
a counter in its own private table. There are no locks, no atomics, nothing fancy - just an integer
increment into memory that only this worker ever writes. A periodic timer (default one second) then
flushes the worker's private deltas into the shared memory zone using atomic adds. The scrape
handler called `ipng_stats` reads only from shared memory.
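
The module itself is plain C, but the accounting pattern is easy to show in a few lines of Go: a
private, unsynchronised per-worker counter on the hot path, and a periodic flush that folds the
delta into the shared counter with one atomic add. This is an illustration of the pattern, not the
module's code:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

type worker struct {
	private uint64 // only ever touched by this worker
}

func (w *worker) onRequest() { w.private++ } // hot path: plain increment, no locks

func (w *worker) flush(shared *atomic.Uint64) { // called by a periodic timer
	shared.Add(w.private)
	w.private = 0
}

func main() {
	var shared atomic.Uint64
	w := &worker{}
	for i := 0; i < 1000; i++ {
		w.onRequest()
	}
	w.flush(&shared)
	fmt.Println(shared.Load()) // 1000
}
```
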
The module also registers an nginx variable `$ipng_source_tag` that resolves to the attribution
|
||||||
|
tag for the current connection. That variable is available in `log_format`, `map`, `add_header`,
|
||||||
|
and any other directive that accepts nginx variables, which is how the logtail pipeline gets the
|
||||||
|
attribution.
|
||||||
|
|
||||||
|
Scraping the endpoint confirms attribution is working. With `Accept: text/plain`, the output is
|
||||||
|
Prometheus text format; with `Accept: application/json`, it is JSON. Both support `source_tag=`
|
||||||
|
and `vip=` query parameters to filter the output:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pim@nginx0-chlzn0:~$ curl -Ss http://localhost:9113/.well-known/ipng/statsz?source_tag=chbtl2
|
||||||
|
nginx_ipng_requests_total{source_tag="chbtl2",vip="194.1.163.31",code="4xx"} 100062
|
||||||
|
nginx_ipng_requests_total{source_tag="chbtl2",vip="194.1.163.31",code="2xx"} 14621209
|
||||||
|
nginx_ipng_requests_total{source_tag="chbtl2",vip="2001:678:d78::1:0:1",code="4xx"} 6340
|
||||||
|
nginx_ipng_requests_total{source_tag="chbtl2",vip="2001:678:d78::1:0:1",code="2xx"} 10339863
|
||||||
|
nginx_ipng_bytes_in_sum{source_tag="chbtl2",vip="194.1.163.31"} 1599408141.000
|
||||||
|
nginx_ipng_bytes_in_sum{source_tag="chbtl2",vip="2001:678:d78::1:0:1"} 2405616085.000
|
||||||
|
nginx_ipng_bytes_out_sum{source_tag="chbtl2",vip="194.1.163.31"} 418826340291.000
|
||||||
|
nginx_ipng_bytes_out_sum{source_tag="chbtl2",vip="2001:678:d78::1:0:1"} 47520361606.000
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
The `code` label is bucketed into six classes (`1xx`..`5xx` and `unknown`), which keeps
|
||||||
|
per-VIP cardinality bounded regardless of how many distinct HTTP status codes appear in the wild
|
||||||
|
(and trust me, they *all* occur in the wild). The module also exports request duration histograms
|
||||||
|
and upstream response time histograms, but the request/byte counters above are the day-to-day
|
||||||
|
workhorses.
|
||||||
|
|
||||||
|
### NGINX: Log Hook
|
||||||
|
|
||||||
|
Since I'm looking at all requests in the nginx log phase anyway, I thought perhaps I could go one
|
||||||
|
step further. Prometheus counters answer "how much traffic?" but not "from whom?" or "to which
|
||||||
|
URI?". Adding per-client-IP or per-URI dimensions to Prometheus would be a catastrophic idea during
|
||||||
|
a DDoS: a modest attack with one million source IPs would create one million Prometheus time series
|
||||||
|
and cause the monitoring system to be the first casualty of the incident. The C module is
|
||||||
|
deliberately narrow by design.
|
||||||
|
|
||||||
|
Instead, every request emits a structured log line that carries all the high-cardinality
|
||||||
|
dimensions. The format used by the stats plugin's logtail integration is:
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
log_format ipng_stats_logtail
|
||||||
|
'v1\t$host\t$remote_addr\t$request_method\t$request_uri\t$status'
|
||||||
|
'\t$body_bytes_sent\t$request_time\t$is_tor\t$asn\t$ipng_source_tag'
|
||||||
|
'\t$server_addr\t$scheme';
|
||||||
|
```
|
||||||
|
|
||||||
|
The `v1\t` prefix is a version tag. When the format needs to evolve, a new version (say `v2`)
|
||||||
|
can be added while old emitters are still running; a reader can route each packet to the
|
||||||
|
appropriate parser by looking at the version. In case you're curious, these variables like `$is_tor`
|
||||||
|
come from a map I maintain with TOR exit nodes, and `$asn` comes from Maxmind GeoIP. Check it out
|
||||||
|
[[here](https://www.maxmind.com/en/geoip-databases)].
|
||||||
|
|
||||||
|
{{< image width="6em" float="left" src="/assets/shared/warning.png" alt="Warning" >}}
|
||||||
|
|
||||||
|
Adding an `access_log` directive in every `server` or `location` block would be error-prone and
|
||||||
|
would miss any newly added vhost. It would also potentially cause a lot of disk activity to log both
|
||||||
|
the `logtail` and the regular `access_log`. I decide that my stats plugin will provide an
|
||||||
|
`ipng_stats_logtail` directive at the `http` level that fires globally for every request, regardless
|
||||||
|
of which server or location handled it. Because why not?
|
||||||
|
|
||||||
|
To exclude noisy requests like health probes from the logtail stream, I add an idiomatic `if=` parameter
|
||||||
|
which evaluates an nginx variable at log phase and suppresses emission when the value is empty or `0`.
|
||||||
|
A `map` block is the idiomatic way to build that variable:
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
map $request_uri $logtail_skip_uri {
|
||||||
|
~^/\.well-known/ipng 1;
|
||||||
|
default 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
map $host $logtail_skip_host {
|
||||||
|
maglev.ipng.ch 1;
|
||||||
|
default 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
map "$logtail_skip_uri:$logtail_skip_host" $logtail_enabled {
|
||||||
|
"0:0" 1;
|
||||||
|
default 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ipng_stats_logtail ipng_stats_logtail udp://127.0.0.1:9514 buffer=64k flush=1s if=$logtail_enabled;
|
||||||
|
```
|
||||||
|
|
||||||
|
It took me a while to get used to constructions like this. The first map matches a regular
expression against the `$request_uri` and yields a string (0 or 1); the second map does the same for
the `$host` (exact hostnames are an O(1) hash lookup, regexps are also possible). Then some funky
boolean math allows me to concatenate these two strings in a third map, which can yield "0:0"
(neither 'skip' matched), "1:0" (the URI skip matched but the host skip did not), "0:1" (the host
skip matched but the URI skip did not), or "1:1" (both matched), once again mapping those to a
string (0 or 1), which the `ipng_stats_logtail` directive can use in its `if=` argument.
|
||||||
|
|
||||||
|
The log lines themselves are buffered in a per-worker memory buffer (64 KB by default) and
transmitted as a single UDP datagram on flush. If no receiver is listening on `127.0.0.1:9514`, the
kernel will silently discard the datagram. No blocking, no error, no disk I/O. This fire-and-forget
design is just great: analytics should never slow down a request. A lost log line is acceptable; a
slow request is not.
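
On the receiving side, consuming these datagrams is straightforward. A minimal sketch of a listener
that splits a flushed buffer into lines and picks apart the `v1` fields in the order of the
`log_format` above:

```go
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"net"
	"strings"
)

func main() {
	conn, err := net.ListenPacket("udp", "127.0.0.1:9514")
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	buf := make([]byte, 65535)
	for {
		n, _, err := conn.ReadFrom(buf)
		if err != nil {
			continue
		}
		// One datagram may carry a whole flushed buffer, i.e. many lines.
		sc := bufio.NewScanner(bytes.NewReader(buf[:n]))
		for sc.Scan() {
			f := strings.Split(sc.Text(), "\t")
			if len(f) < 13 || f[0] != "v1" {
				continue // unknown version or truncated line
			}
			// v1, host, remote_addr, method, uri, status, bytes, time,
			// is_tor, asn, source_tag, server_addr, scheme
			host, client, status, tag := f[1], f[2], f[5], f[10]
			fmt.Printf("host=%s client=%s status=%s source_tag=%s\n", host, client, status, tag)
		}
	}
}
```
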
### NGINX: Logtail
|
||||||
|
|
||||||
|
But who or what consumes these UDP packets? Enter
|
||||||
|
[[nginx-logtail](https://git.ipng.ch/ipng/nginx-logtail)], a four-binary Go pipeline that ingests
|
||||||
|
those log lines and answers "which client prefix is being served 429s right now?" or "which ASN is
|
||||||
|
sending me most requests to `/ct/api` in the last 6hrs?" I'll just come right out and admit it: this
|
||||||
|
little program is 100% written and maintained by Claude Code, and I had no hesitation deploying it;
|
||||||
|
I reviewed every bit of the code before it went into production. The design document is in
|
||||||
|
[[docs/design.md](https://git.ipng.ch/ipng/nginx-logtail/src/branch/main/docs/design.md)].
|
||||||
|
|
||||||
|
The four components are:
|
||||||
|
|
||||||
|
- **collector** runs on each nginx host. It receives UDP datagrams from the stats plugin and
|
||||||
|
maintains in-memory ranked top-K counters across six time windows (1m, 5m, 15m, 60m, 6h, 24h).
|
||||||
|
It exposes a gRPC endpoint and rolls up its log counters into a Prometheus `/metrics` endpoint.
|
||||||
|
- **aggregator** runs on a central host. It subscribes to all collectors' snapshots via streaming
|
||||||
|
gRPC and serves a merged view using the same gRPC interface.
|
||||||
|
- **CLI** (`nginx-logtail`) allows one-off queries from the shell, against any collector or the
|
||||||
|
aggregator, and can output JSON or text.
|
||||||
|
- **frontend** is an HTTP dashboard with drilldown tables and SVG sparklines; server-rendered HTML,
|
||||||
|
zero JavaScript, because again, why not?
|
||||||
|
|
||||||
|
The data model is a 7-tuple: `(website, client_prefix, uri, status, is_tor, asn, source_tag)`,
mapped to a 64-bit request count. Client IPs are truncated to /24 (IPv4) or /48 (IPv6) prefixes
at ingest, which keeps cardinality bounded even during DDoS events with millions of source IPs. The
`source_tag` dimension is the attribution tag from `$ipng_source_tag`, which is how the logtail
data can be filtered by maglev frontend. Isn't that cool?!
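
The truncation step is tiny but it is what keeps the key space sane. A sketch using Go's
`net/netip`:

```go
package main

import (
	"fmt"
	"net/netip"
)

// clientPrefix reduces a client address to a /24 (IPv4) or /48 (IPv6) prefix
// so the key space stays bounded even with millions of distinct sources.
func clientPrefix(s string) (netip.Prefix, error) {
	addr, err := netip.ParseAddr(s)
	if err != nil {
		return netip.Prefix{}, err
	}
	bits := 24
	if addr.Is6() {
		bits = 48
	}
	return addr.Prefix(bits)
}

func main() {
	for _, s := range []string{"192.0.2.123", "2001:db8:1234:5678::1"} {
		p, _ := clientPrefix(s)
		fmt.Println(s, "->", p) // 192.0.2.0/24, 2001:db8:1234::/48
	}
}
```
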
#### Backfill on Aggregator Restart
|
||||||
|
|
||||||
|
While developing the logtail, I noticed that restarting the aggregator would (obviously) mean losing
|
||||||
|
24 hours of historical data. To avoid this, the aggregator calls `DumpSnapshots` on each collector
|
||||||
|
at startup. Each collector streams its entire fine (1-minute) and coarse (5-minute+) ring buffer
|
||||||
|
contents back to the aggregator, which merges them into its own rings. The backfill is concurrent
|
||||||
|
across all collectors and happens before the aggregator's HTTP endpoint starts serving. From a user
|
||||||
|
perspective, an aggregator restart is invisible: the dashboard shows historical data from the full
|
||||||
|
retention window immediately, all at the expense of a few gigs of network traffic on IPng's
|
||||||
|
backbone.
|
||||||
|
|
||||||
|
#### CLI Examples
|
||||||
|
|
||||||
|
The CLI is a quick tool for operational triage:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Top 20 client prefixes by request count in the last 5 minutes.
|
||||||
|
nginx-logtail topn --target agg:9091 --window 5m --group-by prefix --n 20
|
||||||
|
|
||||||
|
# Which client prefixes are receiving the most 429s right now?
|
||||||
|
nginx-logtail topn --target agg:9091 --window 1m --group-by prefix --status 429 --n 20
|
||||||
|
|
||||||
|
# Is traffic from a specific maglev frontend distributed normally across websites?
|
||||||
|
nginx-logtail topn --target agg:9091 --window 5m --group-by website --source-tag chbtl2
|
||||||
|
|
||||||
|
# Which URIs are generating the most 5xx responses in the last hour?
|
||||||
|
nginx-logtail topn --target agg:9091 --window 60m --group-by uri --status '>=500'
|
||||||
|
|
||||||
|
# Show a time-series trend for errors on one website.
|
||||||
|
nginx-logtail trend --target agg:9091 --window 5m --website ipng.ch --status '>=400'
|
||||||
|
```
|
||||||
|
|
||||||
|
The `--status` flag accepts expressions like `429`, `>=400`, `!=200`, or `<500`. The
|
||||||
|
`--website-re` and `--uri-re` flags accept RE2 regex patterns. `--json` emits NDJSON for
|
||||||
|
downstream processing with `jq`.
|
||||||
|
|
||||||
|
#### Frontend
|
||||||
|
|
||||||
|
But who needs CLIs when you can also ask Claude to make web-frontends? The nginx-logtail frontend is
|
||||||
|
a server-rendered dashboard with no JavaScript. It uses the gRPC endpoints on collector and
|
||||||
|
aggregator to render top-K tables with inline SVG sparklines showing the request count trend per
|
||||||
|
time bucket over the last 24 hours, with drill down and filtering based on the 7-tuple above.
|
||||||
|
|
||||||
|
{{< image width="100%" src="/assets/vpp-maglev/logtail-frontend.png" alt="nginx-logtail web frontend" >}}
|
||||||
|
|
||||||
|
Clicking any row in the table adds it as a filter and advances to the next dimension in the
|
||||||
|
hierarchy: website, client prefix, request URI, HTTP status, ASN, source tag. A breadcrumb strip
|
||||||
|
above the table shows all active filters; clicking the `x` on any token removes just that filter.
|
||||||
|
A filter expression box accepts direct text input for filters like `status!=200 AND
|
||||||
|
website~=mon.ct.ipng.ch`. The URL encodes the full query state so any view can be bookmarked
|
||||||
|
or shared. Requests are _quick_, averaging around 150ms. It has proven very useful for
|
||||||
|
finding out who is using which webservice, and from where.
|
||||||
|
|
||||||
|
### Prometheus
|
||||||
|
|
||||||
|
Three Prometheus sources cover the system from different angles. They are designed to be used
|
||||||
|
together; each answers questions the others cannot.
|
||||||
|
|
||||||
|
**Source 1: vpp-maglev** is the health and controlplane view. It exports backend states
|
||||||
|
(up / down / unknown / paused / disabled), effective weights per pool, and VPP API call outcomes.
|
||||||
|
This is the authoritative source for "which backends are healthy right now" and "what weight is
|
||||||
|
VPP actually using for each application server." Dashboards built here answer: _is the system
|
||||||
|
healthy?_ The [[vpp-maglev docs](https://git.ipng.ch/ipng/vpp-maglev/src/branch/main/docs)]
|
||||||
|
describe the full metric surface.
|
||||||
|
|
||||||
|
**Source 2: nginx-ipng-stats-plugin** is the traffic volume view. It exports per-`(source_tag,
|
||||||
|
vip)` request and byte counters from inside nginx. The key metrics are
|
||||||
|
`nginx_ipng_requests_total` and `nginx_ipng_bytes_out_total`, both labeled `source_tag` and
|
||||||
|
`vip`, with a `code` label for status class. This layer is deliberately terse and scoped: no
|
||||||
|
per-client, no per-URI dimensions. Dashboards built here answer: _which maglev frontend is
|
||||||
|
sending how much traffic to which VIP?_
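
As an example of how I can slice that in Prometheus, a recording rule along these lines captures per-frontend, per-VIP request rates; the rule name is my own, while the metric and labels come from the plugin.

```yaml
groups:
  - name: maglev-traffic
    rules:
      # Requests per second from each maglev frontend to each VIP, by status class.
      - record: maglev:requests:rate5m
        expr: sum by (source_tag, vip, code) (rate(nginx_ipng_requests_total[5m]))
```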
|
||||||
|
|
||||||
|
**Source 3: nginx-logtail** (collector) is the high-cardinality view. The collector's Prometheus
|
||||||
|
endpoint exports per-host request counters, body-size histograms, and request-time histograms,
|
||||||
|
plus per-`source_tag` rollup counters. The gRPC top-K service answers the "who and what" questions
|
||||||
|
that Prometheus alone cannot, without the cardinality risk.
|
||||||
|
|
||||||
|
The three sources complement each other for cross-layer diagnostics:
|
||||||
|
|
||||||
|
- If vpp-maglev shows all backends up but `nginx_ipng_requests_total` is zero for a specific
|
||||||
|
`source_tag`, the maglev frontend stopped forwarding. The BGP announcement may have been
|
||||||
|
withdrawn, or the GRE tunnel is down.
|
||||||
|
- If `nginx_ipng_requests_total` is healthy for a VIP but vpp-maglev shows a backend in down
|
||||||
|
state, the pool failover is working: traffic has moved to the standby pool, and the primary
|
||||||
|
pool is being drained.
|
||||||
|
- If vpp-maglev shows a backend as up and the stats plugin shows traffic, but error rates in
|
||||||
|
nginx-logtail are climbing, the application itself is struggling, not the load balancer.
|
||||||
|
|
||||||
|
{{< image width="100%" src="/assets/vpp-maglev/grafana-dashboard.png" alt="Grafana dashboard" >}}
|
||||||
|
|
||||||
|
The Grafana dashboard combines all three sources. The top panel shows per-maglev-frontend request
|
||||||
|
rates from `nginx_ipng_requests_total`, so I can see at a glance which of the Maglev frontends is
|
||||||
|
busiest and whether the distribution between them looks right. Backend health state from
|
||||||
|
vpp-maglev is overlaid as annotations: a backend going down appears as a vertical band on the
|
||||||
|
traffic panel at the exact moment the traffic redistributed.
|
||||||
|
|
||||||
|
Good observability consists of both metrics/analytics and alerting signals. Two alerting rules I find
|
||||||
|
particularly useful:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
groups:
|
||||||
|
- name: maglev
|
||||||
|
rules:
|
||||||
|
- alert: NoTrafficFromMaglevFrontend
|
||||||
|
expr: |
|
||||||
|
sum by (source_tag) (
|
||||||
|
rate(nginx_ipng_requests_total{source_tag!="direct"}[10m])
|
||||||
|
) < 1
|
||||||
|
for: 10m
|
||||||
|
annotations:
|
||||||
|
summary: "Maglev frontend {{ $labels.source_tag }} is sourcing de-minimis traffic"
|
||||||
|
description: "Check anycast announcements and GRE tunnel state for {{ $labels.source_tag }}"
|
||||||
|
|
||||||
|
- alert: NoTrafficToVIP
|
||||||
|
expr: |
|
||||||
|
sum by (vip) (
|
||||||
|
rate(nginx_ipng_requests_total[10m])
|
||||||
|
) < 1
|
||||||
|
for: 10m
|
||||||
|
annotations:
|
||||||
|
summary: "VIP {{ $labels.vip }} is receiving de-minimis traffic from any source"
|
||||||
|
description: "Check anycast announcements; no maglev frontend is forwarding to this VIP"
|
||||||
|
```
|
||||||
|
|
||||||
|
`NoTrafficFromMaglevFrontend` fires if a specific Maglev frontend goes silent for ten minutes, where
|
||||||
|
silent here means less than 1.0 qps of traffic coming from it. This is distinct from a backend
|
||||||
|
going down: it means the maglev machine itself has stopped forwarding, which is a network event
|
||||||
|
(remember, it's always BGP!) rather than an application event.
|
||||||
|
|
||||||
|
`NoTrafficToVIP` fires if a VIP receives no traffic from any Maglev frontend. This would be pretty
|
||||||
|
bad, as `l.ipng.ch` is advertising the VIP via A/AAAA records (remember, it's always DNS!), so if
|
||||||
|
the VIP is not receiving any traffic from any Maglev source at all, that would be a fairly serious
|
||||||
|
situation that warrants a page.
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|
The [[nginx-logtail](https://git.ipng.ch/ipng/nginx-logtail)] service has been running for about
|
||||||
|
three months now. Originally it literally tailed a logfile to watch the [[Static CT Logs](/s/ct/)] as they
|
||||||
|
were being scraped by an abusive user. Using logfiles had the nice benefit of not needing any
|
||||||
|
changes to nginx at all, just a bunch of repeated `access_log` statements referring to the custom
|
||||||
|
`log_format`.
|
||||||
|
|
||||||
|
Then I added the [[nginx-ipng-stats-plugin](https://git.ipng.ch/ipng/nginx-ipng-stats-plugin)]
|
||||||
|
which has been running now for about four weeks in production, and gives a lot of very useful stats
|
||||||
|
information. The [[vpp-maglev](https://git.ipng.ch/ipng/vpp-maglev)] project is in pretty good
|
||||||
|
shape, and has been running for about the same time (one month or so) as well.
|
||||||
|
|
||||||
|
On May 1st, we celebrated Labor Day here in Switzerland. It seemed like as good a day as any to
|
||||||
|
move all web properties at IPng Networks over to the `l.ipng.ch` VIPs, funneling all traffic through
|
||||||
|
redundantly announced Maglev instances into redundantly connected nginx frontends. For about two
|
||||||
|
weeks now, things have been completely fine - knock on wood - and it's safe to say IPng ❤️ Maglev.
|
||||||
|
|
||||||
|
## What's Next
|
||||||
|
|
||||||
|
The VPP routers in AS8298 and the nginx frontends all perform `sFlow` sampling, using the
|
||||||
|
[[sFlow]({{< ref 2025-02-08-sflow-3 >}})] implementation I worked on last year. I'm pretty confident
|
||||||
|
that, given the `sFlow` packet data and near-real-time `logtail` request data, I should be able to
|
||||||
|
detect abuse, DDoS and other failure scenarios. I think my next project will be to create some form
|
||||||
|
of nginx plugin that allows me to rate limit or drop abusive client IP addresses programmatically,
|
||||||
|
based on these signals. Similarly, being able to feed (very) abusive IP prefixes into BGP Flowspec
|
||||||
|
and having them simply dropped at the VPP Maglev frontend (rather than forwarded to the nginx
|
||||||
|
frontends), sounds like another fun thing to toy with.
|
||||||
|
|
||||||
|
But for now, I'm content with the progress in IPng's web serving infrastructure.
|
||||||
|
|
||||||
@@ -0,0 +1,91 @@
|
|||||||
|
---
|
||||||
|
title: 'Certificate Transparency'
|
||||||
|
date: 2025-07-30
|
||||||
|
url: /s/ct
|
||||||
|
---
|
||||||
|
|
||||||
|
{{< image width="10em" float="right" src="/assets/ctlog/ctlog-logo-ipng.png" alt="ctlog logo" >}}
|
||||||
|
|
||||||
|
Certificate Transparency logs are "append-only" and publicly-auditable ledgers of certificates being
|
||||||
|
created, updated, and expired. This is the homepage for IPng Networks' Certificate Transparency
|
||||||
|
project.
|
||||||
|
|
||||||
|
Certificate Transparency [[CT](https://certificate.transparency.dev)] is a system for logging and
|
||||||
|
monitoring certificate issuance. It greatly enhances everyone’s ability to monitor and study
|
||||||
|
certificate issuance, and these capabilities have led to numerous improvements to the CA ecosystem
|
||||||
|
and Web security. As a result, it is rapidly becoming critical Internet infrastructure. Originally
|
||||||
|
developed by Google, the concept is now being adopted by many _Certification Authorities_ who log
|
||||||
|
their certificates, and professional _Monitoring_ companies who observe the certificates and
|
||||||
|
report anomalies.
|
||||||
|
|
||||||
|
IPng Networks runs our logs under the domain `ct.ipng.ch`, split into a `*.log.ct.ipng.ch` for the
|
||||||
|
write-path, and `*.mon.ct.ipng.ch` for the read-path.
|
||||||
|
|
||||||
|
We are [[tracking](https://issues.chromium.org/issues/437003344)] our logs for inclusion in the
|
||||||
|
approved log lists for Google Chrome and Apple Safari. As of Oct'25, our logs have been added
|
||||||
|
to these trusted lists and that change will propagate to people’s browsers with subsequent browser
|
||||||
|
version releases.
|
||||||
|
|
||||||
|
We operate two popular implementations of Static Certificate Transparency software.
|
||||||
|
|
||||||
|
## Sunlight
|
||||||
|
|
||||||
|
{{< image width="10em" float="right" src="/assets/ctlog/sunlight-logo.png" alt="sunlight logo" >}}
|
||||||
|
|
||||||
|
[[Sunlight](https://sunlight.dev)] was designed by Filippo Valsorda for the needs of the WebPKI
|
||||||
|
community, through the feedback of many of its members, and in particular of the Sigsum, Google
|
||||||
|
TrustFabric, and ISRG teams. It is partially based on the Go Checksum Database. Sunlight's
|
||||||
|
development was sponsored by Let's Encrypt.
|
||||||
|
|
||||||
|
Our Sunlight logs:
|
||||||
|
* A staging log called [[Rennet](https://rennet2025h2.log.ct.ipng.ch/)], incepted 2025-07-28,
|
||||||
|
starting from temporal shard `rennet2025h2`.
|
||||||
|
* A production log called [[Gouda](https://gouda2025h2.log.ct.ipng.ch/)], incepted 2025-07-30,
|
||||||
|
starting from temporal shard `gouda2025h2`.
|
||||||
|
|
||||||
|
## TesseraCT
|
||||||
|
|
||||||
|
{{< image width="10em" float="right" src="/assets/ctlog/tesseract-logo.png" alt="tesseract logo" >}}
|
||||||
|
|
||||||
|
[[TesseraCT](https://github.com/transparency-dev/tesseract)] is a Certificate Transparency (CT) log
|
||||||
|
implementation by the TrustFabric team at Google. It was built to allow log operators to run
|
||||||
|
production static-ct-api CT logs starting with temporal shards covering 2026 onwards, as the
|
||||||
|
successor to Trillian's CTFE.
|
||||||
|
|
||||||
|
Our TesseraCT logs:
|
||||||
|
* A staging log called [[Lipase](https://lipase2025h2.log.ct.ipng.ch/)], incepted 2025-08-22,
|
||||||
|
starting from temporal shard `lipase2025h2`.
|
||||||
|
* A production log called [[Halloumi](https://halloumi2025h2.log.ct.ipng.ch/)], incepted 2025-08-24,
|
||||||
|
starting from temporal shard `halloumi2025h2`.
|
||||||
|
* Shard `halloumi2026h2` incorporated incorrect data into its Merkle Tree at entries 4357956 and
|
||||||
|
4552365, due to a [[TesseraCT bug](https://github.com/transparency-dev/tesseract/issues/553)]
|
||||||
|
and was retired on 2025-09-08, to be replaced by temporal shard `halloumi2026h2a`.
|
||||||
|
|
||||||
|
We also submit them to [[github.com/geomys/ct-archive](https://github.com/geomys/ct-archive)].
|
||||||
|
|
||||||
|
## Operational Details
|
||||||
|
|
||||||
|
You can read more details about our infrastructure on:
|
||||||
|
* **[[TesseraCT]({{< ref 2025-07-26-ctlog-1 >}})]** - published on 2025-07-26.
|
||||||
|
* **[[Sunlight]({{< ref 2025-08-10-ctlog-2 >}})]** - published on 2025-08-10.
|
||||||
|
* **[[Operations]({{< ref 2025-08-24-ctlog-3 >}})]** - published on 2025-08-24.
|
||||||
|
|
||||||
|
The operators of this infrastructure are **Antonis Chariton**, **Jeroen Massar** and **Pim van Pelt**. \
|
||||||
|
You can reach us via e-mail at [[<ct-ops@ipng.ch>](mailto:ct-ops@ipng.ch)].
|
||||||
|
|
||||||
|
## Archived logs
|
||||||
|
|
||||||
|
Logs are archived in the [[c2sp.org/static-ct-api@v1.0.0](https://c2sp.org/static-ct-api@v1.0.0)] format,
|
||||||
|
although if they were originally served through RFC 6962 APIs, leaves may lack the LeafIndex extension.
|
||||||
|
IPng archives its static log shards at least two weeks after the _notafterlimit_, and removes the DNS
|
||||||
|
entries at least two weeks after archiving.
|
||||||
|
|
||||||
|
We serve our archived logs from both S3 as well as
|
||||||
|
[[ct-archive-serve](https://github.com/colin-stubbs/ct-archive-serve)]:
|
||||||
|
|
||||||
|
* halloumi2026h2.log.ct.ipng.ch - [[S3](https://ct.ipng.ch/archive/halloumi2026h2/)] - [[log.v3.json](https://halloumi2026h2.mon.ct.ipng.ch/log.v3.json)]
|
||||||
|
* halloumi2025h2.log.ct.ipng.ch - [[S3](https://ct.ipng.ch/archive/halloumi2025h2/)] - [[log.v3.json](https://halloumi2025h2.mon.ct.ipng.ch/log.v3.json)]
|
||||||
|
* lipase2025h2.log.ct.ipng.ch - [[S3](https://ct.ipng.ch/archive/lipase2025h2/)] - [[log.v3.json](https://lipase2025h2.mon.ct.ipng.ch/log.v3.json)]
|
||||||
|
* gouda2025h2.log.ct.ipng.ch - [[S3](https://ct.ipng.ch/archive/gouda2025h2/)] - [[log.v3.json](https://gouda2025h2.mon.ct.ipng.ch/log.v3.json)]
|
||||||
|
* rennet2025h2.log.ct.ipng.ch - [[S3](https://ct.ipng.ch/archive/rennet2025h2/)] - [[log.v3.json](https://rennet2025h2.mon.ct.ipng.ch/log.v3.json)]
|
||||||
|
|
||||||
@@ -56,6 +56,14 @@ can help broker a deal that is tailored to your needs.
|
|||||||
You can read more about how we built our own colocation from scratch in this [[informative post](
|
You can read more about how we built our own colocation from scratch in this [[informative post](
|
||||||
{{< ref "2022-02-24-colo" >}})].
|
{{< ref "2022-02-24-colo" >}})].
|
||||||
|
|
||||||
|
### Self-Hosting
|
||||||
|
|
||||||
|
For IPng it's important to take back a little bit of responsibility for our online presence, away
|
||||||
|
from centrally hosted services and toward privately operated ones. We are experts at self-hosting, with
|
||||||
|
services such as [[Mastodon](https://ublog.tech)], [[Pixelfed](https://pix.ublog.tech/)],
|
||||||
|
[[Loops](https://flx.ublog.tech/)], [[PeerTube](https://video.ipng.ch/)], [[Mail]({{< ref
|
||||||
|
2024-05-17-smtp >}})] and myriad others.
|
||||||
|
|
||||||
## Project Design / Execution
|
## Project Design / Execution
|
||||||
|
|
||||||
{{< image width="15em" float="right" src="/assets/pdu19.png" alt="19 inch PDU" >}}
|
{{< image width="15em" float="right" src="/assets/pdu19.png" alt="19 inch PDU" >}}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
baseURL: 'https://ipng.ch/'
|
baseURL: 'https://ipng.ch/'
|
||||||
languageCode: 'en-us'
|
locale: 'en-us'
|
||||||
title: "IPng Networks"
|
title: "IPng Networks"
|
||||||
theme: 'hugo-theme-ipng'
|
theme: 'hugo-theme-ipng'
|
||||||
|
|
||||||
@@ -12,7 +12,7 @@ params:
|
|||||||
showBlogLatest: false
|
showBlogLatest: false
|
||||||
mainSections: ["articles"]
|
mainSections: ["articles"]
|
||||||
showTaxonomyLinks: false
|
showTaxonomyLinks: false
|
||||||
nBlogLatest: 14 # number of blog post om the home page
|
nBlogLatest: 20 # number of blog post om the home page
|
||||||
Paginate: 30
|
Paginate: 30
|
||||||
blogLatestHeading: "Latest Dabblings"
|
blogLatestHeading: "Latest Dabblings"
|
||||||
footer: "Copyright 2021- IPng Networks GmbH, all rights reserved"
|
footer: "Copyright 2021- IPng Networks GmbH, all rights reserved"
|
||||||
@@ -20,10 +20,8 @@ params:
|
|||||||
social:
|
social:
|
||||||
email: "info+www@ipng.ch"
|
email: "info+www@ipng.ch"
|
||||||
mastodon: "@IPngNetworks"
|
mastodon: "@IPngNetworks"
|
||||||
twitter: "IPngNetworks"
|
|
||||||
linkedin: "pimvanpelt"
|
linkedin: "pimvanpelt"
|
||||||
github: "pimvanpelt"
|
github: "pimvanpelt"
|
||||||
instagram: "IPngNetworks"
|
|
||||||
rss: true
|
rss: true
|
||||||
|
|
||||||
taxonomies:
|
taxonomies:
|
||||||
@@ -35,4 +33,7 @@ taxonomies:
|
|||||||
permalinks:
|
permalinks:
|
||||||
articles: "/s/articles/:year/:month/:day/:slug"
|
articles: "/s/articles/:year/:month/:day/:slug"
|
||||||
|
|
||||||
|
outputs:
|
||||||
|
home: ["HTML", "RSS", "JSON"]
|
||||||
|
|
||||||
ignoreLogs: [ "warning-goldmark-raw-html" ]
|
ignoreLogs: [ "warning-goldmark-raw-html" ]
|
||||||
|
|||||||
@@ -0,0 +1,11 @@
|
|||||||
|
{{- $index := slice -}}
|
||||||
|
{{- range .Site.RegularPages -}}
|
||||||
|
{{- $index = $index | append (dict
|
||||||
|
"title" .Title
|
||||||
|
"desc" (.Summary | plainify)
|
||||||
|
"contents" .Plain
|
||||||
|
"section" .Section
|
||||||
|
"date" (.Date.Format "2006-01-02")
|
||||||
|
"permalink" .Permalink) -}}
|
||||||
|
{{- end -}}
|
||||||
|
{{- $index | jsonify -}}
|
||||||
@@ -7,28 +7,95 @@
|
|||||||
<script type="text/JavaScript">
|
<script type="text/JavaScript">
|
||||||
|
|
||||||
const ntag_list = [
|
const ntag_list = [
|
||||||
|
"/s/articles/2016/10/13/fiber7-on-litexchange/",
|
||||||
|
"/s/articles/2017/03/14/sunsetting-sixxs/",
|
||||||
|
"/s/articles/2021/02/26/ipng-history/",
|
||||||
|
"/s/articles/2021/02/27/ipng-network/",
|
||||||
|
"/s/articles/2021/02/27/loadtesting-at-coloclue/",
|
||||||
|
"/s/articles/2021/03/27/case-study-vpp-at-coloclue-part-1/",
|
||||||
|
"/s/articles/2021/05/17/ipng-arrives-in-frankfurt/",
|
||||||
|
"/s/articles/2021/05/26/ipng-arrives-in-amsterdam/",
|
||||||
|
"/s/articles/2021/05/28/ipng-arrives-in-lille/",
|
||||||
|
"/s/articles/2021/06/01/ipng-arrives-in-paris/",
|
||||||
|
"/s/articles/2021/06/28/launch-of-as112/",
|
||||||
|
"/s/articles/2021/07/03/ipng-arrives-in-geneva/",
|
||||||
|
"/s/articles/2021/07/19/review-pcengines-apu6-with-sfp/",
|
||||||
|
"/s/articles/2021/07/26/a-story-of-a-bucketlist/",
|
||||||
|
"/s/articles/2021/08/07/review-fs-s5860-20sq-switch/",
|
||||||
|
"/s/articles/2021/08/12/vpp-linux-cp-part1/",
|
||||||
|
"/s/articles/2021/08/13/vpp-linux-cp-part2/",
|
||||||
|
"/s/articles/2021/08/15/vpp-linux-cp-part3/",
|
||||||
|
"/s/articles/2021/08/25/vpp-linux-cp-part4/",
|
||||||
|
"/s/articles/2021/08/26/fiber7-x-in-1790bre/",
|
||||||
|
"/s/articles/2021/09/02/vpp-linux-cp-part5/",
|
||||||
|
"/s/articles/2021/09/10/vpp-linux-cp-part6/",
|
||||||
"/s/articles/2021/09/21/vpp-linux-cp-part7/",
|
"/s/articles/2021/09/21/vpp-linux-cp-part7/",
|
||||||
|
"/s/articles/2021/10/24/ipng-acquires-as8298/",
|
||||||
|
"/s/articles/2021/11/14/case-study-bgp-routing-policy/",
|
||||||
|
"/s/articles/2021/11/26/review-netgate-6100/",
|
||||||
"/s/articles/2021/12/23/vpp-linux-cp-virtual-machine-playground/",
|
"/s/articles/2021/12/23/vpp-linux-cp-virtual-machine-playground/",
|
||||||
"/s/articles/2022/01/12/case-study-virtual-leased-line-vll-in-vpp/",
|
"/s/articles/2022/01/12/case-study-virtual-leased-line-vll-in-vpp/",
|
||||||
"/s/articles/2022/02/14/case-study-vlan-gymnastics-with-vpp/",
|
"/s/articles/2022/02/14/case-study-vlan-gymnastics-with-vpp/",
|
||||||
|
"/s/articles/2022/02/21/review-cisco-asr9006/rsp440-se/",
|
||||||
|
"/s/articles/2022/02/24/ipng-networks-colocation/",
|
||||||
|
"/s/articles/2022/03/03/syslog-to-telegram/",
|
||||||
"/s/articles/2022/03/27/vpp-configuration-part1/",
|
"/s/articles/2022/03/27/vpp-configuration-part1/",
|
||||||
|
"/s/articles/2022/04/02/vpp-configuration-part2/",
|
||||||
"/s/articles/2022/10/14/vpp-lab-setup/",
|
"/s/articles/2022/10/14/vpp-lab-setup/",
|
||||||
|
"/s/articles/2022/11/20/mastodon-part-1-installing/",
|
||||||
|
"/s/articles/2022/11/24/mastodon-part-2-monitoring/",
|
||||||
|
"/s/articles/2022/11/27/mastodon-part-3-statsd-and-prometheus/",
|
||||||
|
"/s/articles/2022/12/05/review-s5648x-2q4z-switch-part-1-vxlan/geneve/nvgre/",
|
||||||
|
"/s/articles/2022/12/09/review-s5648x-2q4z-switch-part-2-mpls/",
|
||||||
|
"/s/articles/2023/02/12/review-compulab-fitlet2/",
|
||||||
|
"/s/articles/2023/02/24/case-study-vpp-at-coloclue-part-2/",
|
||||||
"/s/articles/2023/03/11/case-study-centec-mpls-core/",
|
"/s/articles/2023/03/11/case-study-centec-mpls-core/",
|
||||||
|
"/s/articles/2023/03/17/case-study-site-local-nginx/",
|
||||||
|
"/s/articles/2023/03/24/case-study-lets-encrypt-dns-01/",
|
||||||
"/s/articles/2023/04/09/vpp-monitoring/",
|
"/s/articles/2023/04/09/vpp-monitoring/",
|
||||||
|
"/s/articles/2023/05/07/vpp-mpls-part-1/",
|
||||||
|
"/s/articles/2023/05/17/vpp-mpls-part-2/",
|
||||||
|
"/s/articles/2023/05/21/vpp-mpls-part-3/",
|
||||||
"/s/articles/2023/05/28/vpp-mpls-part-4/",
|
"/s/articles/2023/05/28/vpp-mpls-part-4/",
|
||||||
|
"/s/articles/2023/08/06/pixelfed-part-1-installing/",
|
||||||
|
"/s/articles/2023/08/27/case-study-nginx--certbot-with-ansible/",
|
||||||
|
"/s/articles/2023/10/21/vpp-ixp-gateway-part-1/",
|
||||||
"/s/articles/2023/11/11/debian-on-mellanox-sn2700-32x100g/",
|
"/s/articles/2023/11/11/debian-on-mellanox-sn2700-32x100g/",
|
||||||
"/s/articles/2023/12/17/debian-on-ipngs-vpp-routers/",
|
"/s/articles/2023/12/17/debian-on-ipngs-vpp-routers/",
|
||||||
"/s/articles/2024/01/27/vpp-python-api/",
|
"/s/articles/2024/01/27/vpp-python-api/",
|
||||||
"/s/articles/2024/02/10/vpp-on-freebsd-part-1/",
|
"/s/articles/2024/02/10/vpp-on-freebsd-part-1/",
|
||||||
|
"/s/articles/2024/02/17/vpp-on-freebsd-part-2/",
|
||||||
"/s/articles/2024/03/06/vpp-with-babel-part-1/",
|
"/s/articles/2024/03/06/vpp-with-babel-part-1/",
|
||||||
"/s/articles/2024/04/06/vpp-with-loopback-only-ospfv3-part-1/",
|
"/s/articles/2024/04/06/vpp-with-loopback-only-ospfv3-part-1/",
|
||||||
"/s/articles/2024/04/27/freeix-remote/"
|
"/s/articles/2024/04/27/freeix-remote-part-1/",
|
||||||
|
"/s/articles/2024/05/17/case-study-ipngs-mail-servers/",
|
||||||
|
"/s/articles/2024/05/25/case-study-nat64/",
|
||||||
|
"/s/articles/2024/06/22/vpp-with-loopback-only-ospfv3-part-2/",
|
||||||
|
"/s/articles/2024/06/29/case-study-ipng-at-coloclue/",
|
||||||
|
"/s/articles/2024/07/05/review-r86s-jasper-lake-n6005/",
|
||||||
|
"/s/articles/2024/08/03/review-gowin-1u-2x25g-alder-lake-n305/",
|
||||||
|
"/s/articles/2024/08/12/case-study-from-jekyll-to-hugo/",
|
||||||
|
"/s/articles/2024/09/08/vpp-with-sflow-part-1/",
|
||||||
|
"/s/articles/2024/10/06/vpp-with-sflow-part-2/",
|
||||||
|
"/s/articles/2024/10/21/freeix-remote-part-2/",
|
||||||
|
"/s/articles/2025/02/08/vpp-with-sflow-part-3/",
|
||||||
|
"/s/articles/2025/04/09/frysix-evpn-think-different/",
|
||||||
|
"/s/articles/2025/05/03/vpp-in-containerlab-part-1/",
|
||||||
|
"/s/articles/2025/05/04/vpp-in-containerlab-part-2/",
|
||||||
|
"/s/articles/2025/05/28/case-study-minio-s3-part-1/",
|
||||||
|
"/s/articles/2025/06/01/case-study-minio-s3-part-2/",
|
||||||
|
"/s/articles/2025/07/12/vpp-and-evpn/vxlan-part-1/",
|
||||||
|
"/s/articles/2025/07/26/certificate-transparency-part-1-tesseract/",
|
||||||
|
"/s/articles/2025/08/10/certificate-transparency-part-2-sunlight/",
|
||||||
|
"/s/articles/2025/08/24/certificate-transparency-part-3-operations/",
|
||||||
|
"/s/articles/2026/02/14/vpp-policers/",
|
||||||
|
"/s/articles/2026/02/21/vpp-srv6-l2vpn/",
|
||||||
];
|
];
|
||||||
|
|
||||||
var redir_url = "https://ipng.ch/";
|
var redir_url = "https://ipng.ch/";
|
||||||
var key = window.location.hash.slice(1);
|
var key = window.location.hash.slice(1);
|
||||||
if (key.startsWith("ntag")) {
|
if (key.startsWith("ntag")) {
|
||||||
let week = Math.round(new Date().getTime() / 1000 / (7*24*3400));
|
let week = Math.round(new Date().getTime() / 1000 / (7*24*3600));
|
||||||
let num = parseInt(key.slice(-2));
|
let num = parseInt(key.slice(-2));
|
||||||
let idx = (num + week) % ntag_list.length;
|
let idx = (num + week) % ntag_list.length;
|
||||||
console.log("(ntag " + num + " + week number " + week + ") % " + ntag_list.length + " = " + idx);
|
console.log("(ntag " + num + " + week number " + week + ") % " + ntag_list.length + " = " + idx);
|
||||||
|
|||||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 147 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,164 @@
|
|||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=4M
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 813, speed = 54.2MB/sec, 13.5 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 23168, speed = 1.5GB/sec, 386.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 2.2 secs, 371.2 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=1M
|
||||||
|
2025/07/20 16:07:25 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FACEBAC4D052, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 1221, speed = 20.3MB/sec, 20.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 31000, speed = 516.7MB/sec, 516.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 3.2 secs, 376.5 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=8k
|
||||||
|
2025/07/20 16:09:29 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FAEB70060604, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 3353, speed = 447KB/sec, 55.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 45913, speed = 6MB/sec, 765.2 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 9.3 secs, 361.6 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=4k
|
||||||
|
2025/07/20 16:11:38 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FB098B162788, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 3404, speed = 226.9KB/sec, 56.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 45230, speed = 2.9MB/sec, 753.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 9.4 secs, 362.6 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=4M
|
||||||
|
2025/07/20 16:13:47 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FB27AE890E75, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 1898, speed = 126.4MB/sec, 31.6 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 185034, speed = 12GB/sec, 3083.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 0.4 secs, 4267.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=1M
|
||||||
|
2025/07/20 16:15:48 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FB43C0386015, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.2 secs, objects = 2627, speed = 43.7MB/sec, 43.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 327959, speed = 5.3GB/sec, 5465.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 0.6 secs, 4045.6 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=8k
|
||||||
|
2025/07/20 16:17:49 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FB5FE2012590, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 6663, speed = 887.7KB/sec, 111.0 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 459962, speed = 59.9MB/sec, 7666.0 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 1.7 secs, 3890.9 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=4k
|
||||||
|
2025/07/20 16:19:50 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FB7C3CF0FFCA, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 6673, speed = 444.4KB/sec, 111.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 444637, speed = 28.9MB/sec, 7410.5 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 1.5 secs, 4411.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=4M
|
||||||
|
2025/07/20 16:21:52 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FB988DB60881, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.2 secs, objects = 3093, speed = 205.5MB/sec, 51.4 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 168750, speed = 11GB/sec, 2811.4 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 0.3 secs, 9112.2 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=1M
|
||||||
|
2025/07/20 16:23:53 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FBB4A1E534DE, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.2 secs, objects = 4652, speed = 77.2MB/sec, 77.2 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 351187, speed = 5.7GB/sec, 5852.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 0.6 secs, 8141.6 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=8k
|
||||||
|
2025/07/20 16:25:54 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FBD0C4764C64, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 14497, speed = 1.9MB/sec, 241.4 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 457437, speed = 59.6MB/sec, 7623.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 1.7 secs, 8353.6 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=4k
|
||||||
|
2025/07/20 16:27:55 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FBED210B0792, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 14459, speed = 962.6KB/sec, 240.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 466680, speed = 30.4MB/sec, 7777.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 1.7 secs, 8605.3 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=4M
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 1866, speed = 124.4MB/sec, 31.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 16400, speed = 1.1GB/sec, 273.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 5.1 secs, 369.3 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=1M
|
||||||
|
2025/07/20 16:32:02 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FC25AE815718, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 5459, speed = 91MB/sec, 91.0 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 25090, speed = 418.2MB/sec, 418.2 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 14.8 secs, 369.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=8k
|
||||||
|
2025/07/20 16:34:17 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FC4514A78873, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 22278, speed = 2.9MB/sec, 371.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 40626, speed = 5.3MB/sec, 677.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 61.6 secs, 361.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=4k
|
||||||
|
2025/07/20 16:37:19 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FC6F629ACFAC, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 23394, speed = 1.5MB/sec, 389.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 39249, speed = 2.6MB/sec, 654.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 64.5 secs, 363.0 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=4M
|
||||||
|
2025/07/20 16:40:23 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FC9A5D101971, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 10564, speed = 704.1MB/sec, 176.0 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 20682, speed = 1.3GB/sec, 344.6 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 2.5 secs, 4178.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=1M
|
||||||
|
2025/07/20 16:42:26 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FCB6EB0A45D9, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 26550, speed = 442.4MB/sec, 442.4 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 124810, speed = 2GB/sec, 2080.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 6.6 secs, 4049.2 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=8k
|
||||||
|
2025/07/20 16:44:32 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FCD4684A110E, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 129363, speed = 16.8MB/sec, 2155.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 423956, speed = 55.2MB/sec, 7065.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 32.4 secs, 3992.0 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=4k
|
||||||
|
2025/07/20 16:47:05 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FCF7EA4857CF, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 123067, speed = 8MB/sec, 2051.0 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 357694, speed = 23.3MB/sec, 5961.4 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 30.9 secs, 3986.0 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=4M
|
||||||
|
2025/07/20 16:49:36 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FD1B12EFDEBC, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 13131, speed = 873.3MB/sec, 218.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.1 secs, objects = 18630, speed = 1.2GB/sec, 310.2 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 1.7 secs, 7787.5 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=1M
|
||||||
|
2025/07/20 16:51:38 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FD3779E97644, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 40226, speed = 669.8MB/sec, 669.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 85692, speed = 1.4GB/sec, 1427.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 4.7 secs, 8610.2 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=8k
|
||||||
|
2025/07/20 16:53:42 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FD5489FB2F1F, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 230985, speed = 30.1MB/sec, 3849.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 435703, speed = 56.7MB/sec, 7261.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 25.8 secs, 8945.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:9000, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=4k
|
||||||
|
2025/07/20 16:56:08 WARNING: createBucket wasabi-benchmark-bucket error, ignoring BucketAlreadyOwnedByYou: Your previous request to create the named bucket succeeded and you already own it.
|
||||||
|
status code: 409, request id: 1853FD7683B9BB96, host id: dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 228647, speed = 14.9MB/sec, 3810.4 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 452412, speed = 29.5MB/sec, 7539.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 27.2 secs, 8418.0 deletes/sec. Slowdowns = 0
|
||||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,80 @@
|
|||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=1M
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 1994, speed = 33.2MB/sec, 33.2 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 29243, speed = 487.4MB/sec, 487.4 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 2.8 secs, 701.4 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=8k
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 13634, speed = 1.8MB/sec, 227.2 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 32284, speed = 4.2MB/sec, 538.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 18.7 secs, 727.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=1M
|
||||||
|
Loop 1: PUT time 62.0 secs, objects = 23733, speed = 382.8MB/sec, 382.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 132708, speed = 2.2GB/sec, 2211.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 3.7 secs, 6490.1 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=8k
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 199925, speed = 26MB/sec, 3331.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 309937, speed = 40.4MB/sec, 5165.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 31.2 secs, 6406.0 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=1M
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 1975, speed = 32.9MB/sec, 32.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 29898, speed = 498.3MB/sec, 498.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 2.7 secs, 726.6 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=1, loops=1, size=8k
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 13662, speed = 1.8MB/sec, 227.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 31865, speed = 4.1MB/sec, 531.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 18.8 secs, 726.9 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=1M
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 26622, speed = 443.6MB/sec, 443.6 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 117688, speed = 1.9GB/sec, 1961.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 4.1 secs, 6499.5 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=8k
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 198238, speed = 25.8MB/sec, 3303.9 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 312868, speed = 40.7MB/sec, 5214.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 30.8 secs, 6432.7 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=4M
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 6220, speed = 414.2MB/sec, 103.6 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 38773, speed = 2.5GB/sec, 646.1 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 0.9 secs, 6693.3 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=4k
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 203033, speed = 13.2MB/sec, 3383.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 300824, speed = 19.6MB/sec, 5013.6 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 31.1 secs, 6528.6 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=4M
|
||||||
|
Loop 1: PUT time 60.3 secs, objects = 13181, speed = 874.2MB/sec, 218.6 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.1 secs, objects = 18575, speed = 1.2GB/sec, 309.3 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 0.8 secs, 17547.2 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-disk:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=4k
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 495006, speed = 32.2MB/sec, 8249.5 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 465947, speed = 30.3MB/sec, 7765.4 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 41.4 secs, 11961.3 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=4M
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 7073, speed = 471MB/sec, 117.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 31248, speed = 2GB/sec, 520.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 1.1 secs, 6576.1 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=8, loops=1, size=4k
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 214387, speed = 14MB/sec, 3573.0 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 297586, speed = 19.4MB/sec, 4959.7 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 32.9 secs, 6519.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=4M
|
||||||
|
Loop 1: PUT time 60.1 secs, objects = 14365, speed = 956MB/sec, 239.0 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.1 secs, objects = 18113, speed = 1.2GB/sec, 301.6 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 0.8 secs, 18655.8 deletes/sec. Slowdowns = 0
|
||||||
|
Wasabi benchmark program v2.0
|
||||||
|
Parameters: url=http://minio-ssd:8333, bucket=wasabi-benchmark-bucket, region=us-east-1, duration=60, threads=32, loops=1, size=4k
|
||||||
|
Loop 1: PUT time 60.0 secs, objects = 489736, speed = 31.9MB/sec, 8161.8 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: GET time 60.0 secs, objects = 460296, speed = 30MB/sec, 7671.2 operations/sec. Slowdowns = 0
|
||||||
|
Loop 1: DELETE time 41.0 secs, 11957.6 deletes/sec. Slowdowns = 0
|
||||||
@@ -0,0 +1,116 @@
|
|||||||
|
# Test Setup for SeaweedFS with 6 disks, a Filer and an S3 API
|
||||||
|
#
|
||||||
|
# Use with the following .env file
|
||||||
|
# root@minio-ssd:~# cat /opt/seaweedfs/.env
|
||||||
|
# AWS_ACCESS_KEY_ID="hottentotten"
|
||||||
|
# AWS_SECRET_ACCESS_KEY="tentententoonstelling"
|
||||||
|
|
||||||
|
services:
|
||||||
|
# Master
|
||||||
|
master0:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
ports:
|
||||||
|
- 9333:9333
|
||||||
|
- 19333:19333
|
||||||
|
command: "-v=1 master -volumeSizeLimitMB 100 -resumeState=false -ip=master0 -ip.bind=0.0.0.0 -port=9333 -mdir=/var/lib/seaweedfs/master"
|
||||||
|
volumes:
|
||||||
|
- ./data/master0:/var/lib/seaweedfs/master
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Volume Server 1
|
||||||
|
volume1:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
command: 'volume -dataCenter=dc1 -rack=r1 -mserver="master0:9333" -port=8081 -preStopSeconds=1 -dir=/var/lib/seaweedfs/volume1'
|
||||||
|
volumes:
|
||||||
|
- /data/disk1:/var/lib/seaweedfs/volume1
|
||||||
|
depends_on:
|
||||||
|
- master0
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Volume Server 2
|
||||||
|
volume2:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
command: 'volume -dataCenter=dc1 -rack=r1 -mserver="master0:9333" -port=8082 -preStopSeconds=1 -dir=/var/lib/seaweedfs/volume2'
|
||||||
|
volumes:
|
||||||
|
- /data/disk2:/var/lib/seaweedfs/volume2
|
||||||
|
depends_on:
|
||||||
|
- master0
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Volume Server 3
|
||||||
|
volume3:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
command: 'volume -dataCenter=dc1 -rack=r1 -mserver="master0:9333" -port=8083 -preStopSeconds=1 -dir=/var/lib/seaweedfs/volume3'
|
||||||
|
volumes:
|
||||||
|
- /data/disk3:/var/lib/seaweedfs/volume3
|
||||||
|
depends_on:
|
||||||
|
- master0
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Volume Server 4
|
||||||
|
volume4:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
command: 'volume -dataCenter=dc1 -rack=r1 -mserver="master0:9333" -port=8084 -preStopSeconds=1 -dir=/var/lib/seaweedfs/volume4'
|
||||||
|
volumes:
|
||||||
|
- /data/disk4:/var/lib/seaweedfs/volume4
|
||||||
|
depends_on:
|
||||||
|
- master0
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Volume Server 5
|
||||||
|
volume5:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
command: 'volume -dataCenter=dc1 -rack=r1 -mserver="master0:9333" -port=8085 -preStopSeconds=1 -dir=/var/lib/seaweedfs/volume5'
|
||||||
|
volumes:
|
||||||
|
- /data/disk5:/var/lib/seaweedfs/volume5
|
||||||
|
depends_on:
|
||||||
|
- master0
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Volume Server 6
|
||||||
|
volume6:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
command: 'volume -dataCenter=dc1 -rack=r1 -mserver="master0:9333" -port=8086 -preStopSeconds=1 -dir=/var/lib/seaweedfs/volume6'
|
||||||
|
volumes:
|
||||||
|
- /data/disk6:/var/lib/seaweedfs/volume6
|
||||||
|
depends_on:
|
||||||
|
- master0
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Filer
|
||||||
|
filer:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
ports:
|
||||||
|
- 8888:8888
|
||||||
|
- 18888:18888
|
||||||
|
command: 'filer -defaultReplicaPlacement=002 -iam -master="master0:9333"'
|
||||||
|
volumes:
|
||||||
|
- ./data/filer:/data
|
||||||
|
depends_on:
|
||||||
|
- master0
|
||||||
|
- volume1
|
||||||
|
- volume2
|
||||||
|
- volume3
|
||||||
|
- volume4
|
||||||
|
- volume5
|
||||||
|
- volume6
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# S3 API
|
||||||
|
s3:
|
||||||
|
image: chrislusf/seaweedfs
|
||||||
|
ports:
|
||||||
|
- 8333:8333
|
||||||
|
command: 's3 -filer="filer:8888" -ip.bind=0.0.0.0'
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
|
depends_on:
|
||||||
|
- master0
|
||||||
|
- volume1
|
||||||
|
- volume2
|
||||||
|
- volume3
|
||||||
|
- volume4
|
||||||
|
- volume5
|
||||||
|
- volume6
|
||||||
|
- filer
|
||||||
|
restart: unless-stopped
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 124 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
|
After Width: | Height: | Size: 90 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" class="w-full" fill="hsl(14.8, 63.1%, 59.6%)"><path d="m19.6 66.5 19.7-11 .3-1-.3-.5h-1l-3.3-.2-11.2-.3L14 53l-9.5-.5-2.4-.5L0 49l.2-1.5 2-1.3 2.9.2 6.3.5 9.5.6 6.9.4L38 49.1h1.6l.2-.7-.5-.4-.4-.4L29 41l-10.6-7-5.6-4.1-3-2-1.5-2-.6-4.2 2.7-3 3.7.3.9.2 3.7 2.9 8 6.1L37 36l1.5 1.2.6-.4.1-.3-.7-1.1L33 25l-6-10.4-2.7-4.3-.7-2.6c-.3-1-.4-2-.4-3l3-4.2L28 0l4.2.6L33.8 2l2.6 6 4.1 9.3L47 29.9l2 3.8 1 3.4.3 1h.7v-.5l.5-7.2 1-8.7 1-11.2.3-3.2 1.6-3.8 3-2L61 2.6l2 2.9-.3 1.8-1.1 7.7L59 27.1l-1.5 8.2h.9l1-1.1 4.1-5.4 6.9-8.6 3-3.5L77 13l2.3-1.8h4.3l3.1 4.7-1.4 4.9-4.4 5.6-3.7 4.7-5.3 7.1-3.2 5.7.3.4h.7l12-2.6 6.4-1.1 7.6-1.3 3.5 1.6.4 1.6-1.4 3.4-8.2 2-9.6 2-14.3 3.3-.2.1.2.3 6.4.6 2.8.2h6.8l12.6 1 3.3 2 1.9 2.7-.3 2-5.1 2.6-6.8-1.6-16-3.8-5.4-1.3h-.8v.4l4.6 4.5 8.3 7.5L89 80.1l.5 2.4-1.3 2-1.4-.2-9.2-7-3.6-3-8-6.8h-.5v.7l1.8 2.7 9.8 14.7.5 4.5-.7 1.4-2.6 1-2.7-.6-5.8-8-6-9-4.7-8.2-.5.4-2.9 30.2-1.3 1.5-3 1.2-2.5-2-1.4-3 1.4-6.2 1.6-8 1.3-6.4 1.2-7.9.7-2.6v-.2H49L43 72l-9 12.3-7.2 7.6-1.7.7-3-1.5.3-2.8L24 86l10-12.8 6-7.9 4-4.6-.1-.5h-.3L17.2 77.4l-4.7.6-2-2 .2-3 1-1 8-5.5Z"></path></svg>

Binary files not shown.
@@ -0,0 +1,240 @@
/*
  Fast Search for Hugo — adapted from
  https://gist.github.com/cmod/5410eae147e4318164258742dd053993
  MIT License
*/

const DEFAULT_CONFIG = {
  shortcuts: {
    open: {
      key: '/',
      metaKey: false,
      altKey: false,
      ctrlKey: false,
      shiftKey: false
    }
  },
  search: {
    minChars: 2,
    maxResults: 8,
    fields: {
      title: true,
      description: true,
      section: true
    }
  }
};

function initSearch(userConfig = {}) {
  const CONFIG = mergeConfigs(DEFAULT_CONFIG, userConfig);

  const mainNav = document.getElementById('mainNav');
  const fastSearch = document.getElementById('fastSearch');
  const searchInput = document.getElementById('searchInput');
  const searchResults = document.getElementById('searchResults');

  let searchIndex = null;
  let searchVisible = false;
  let resultsAvailable = false;
  let firstRun = true;

  async function loadSearchIndex() {
    try {
      const response = await fetch('/index.json');
      if (!response.ok) throw new Error('Failed to load search index');
      const data = await response.json();
      searchIndex = data.map(item => ({
        ...item,
        searchableTitle: item.title?.toLowerCase() || '',
        searchableDesc: item.desc?.toLowerCase() || '',
        searchableSection: item.section?.toLowerCase() || '',
        searchableContents: item.contents?.toLowerCase() || ''
      }));
      if (searchInput.value) performSearch(searchInput.value);
    } catch (error) {
      console.error('Error loading search index:', error);
      searchResults.innerHTML = '<li class="search-message">Error loading search index.</li>';
    }
  }

  function simpleFuzzyMatch(text, term) {
    if (text.includes(term)) return true;
    if (term.length < 3) return false;
    let matches = 0, lastMatchIndex = -1;
    for (let i = 0; i < term.length; i++) {
      const found = text.indexOf(term[i], lastMatchIndex + 1);
      if (found > -1) { matches++; lastMatchIndex = found; }
    }
    return matches === term.length;
  }

  function matchesShortcut(event, sc) {
    return event.key === sc.key &&
           event.metaKey === sc.metaKey &&
           event.altKey === sc.altKey &&
           event.ctrlKey === sc.ctrlKey &&
           event.shiftKey === sc.shiftKey;
  }

  function openSearch() {
    searchVisible = true;
    mainNav.style.display = 'none';
    fastSearch.style.display = 'flex';
    if (firstRun) { loadSearchIndex(); firstRun = false; }
    searchInput.focus();
    searchInput.value = '';
    searchResults.innerHTML = '';
  }

  function closeSearch() {
    searchVisible = false;
    fastSearch.style.display = 'none';
    mainNav.style.display = '';
    searchInput.blur();
    searchInput.value = '';
    searchResults.innerHTML = '';
    resultsAvailable = false;
  }

  document.addEventListener('keydown', (event) => {
    const tag = event.target.tagName;
    const inOtherInput = tag === 'TEXTAREA' || (tag === 'INPUT' && event.target.id !== 'searchInput');
    const inSearchInput = tag === 'INPUT' && event.target.id === 'searchInput';

    // Never steal keystrokes from other inputs/textareas
    if (inOtherInput) return;

    // Toggle shortcut — not when the user is already typing in the search box
    if (!inSearchInput && matchesShortcut(event, CONFIG.shortcuts.open)) {
      event.preventDefault();
      searchVisible ? closeSearch() : openSearch();
      return;
    }

    // ESC closes from anywhere, including the search input
    if (event.key === 'Escape' && searchVisible) {
      closeSearch();
      return;
    }

    // Arrow navigation and Enter
    if (searchVisible && resultsAvailable) {
      const links = Array.from(searchResults.getElementsByTagName('a'));
      if (!links.length) return;
      const active = document.activeElement;
      const activeInResults = searchResults.contains(active) && active.tagName === 'A';
      const i = activeInResults ? links.indexOf(active) : -1;

      if (event.key === 'Enter') {
        // Follow the focused result, not always the first one
        const target = activeInResults ? active : links[0];
        event.preventDefault();
        window.location.href = target.href;
        return;
      }

      if (event.key === 'ArrowDown') {
        event.preventDefault();
        if (!activeInResults) { links[0].focus(); } // from input or stale focus → first result
        else if (i < links.length - 1) { links[i + 1].focus(); }
        return;
      }

      if (event.key === 'ArrowUp') {
        event.preventDefault();
        if (!activeInResults || i === 0) { searchInput.focus(); }
        else { links[i - 1].focus(); }
        return;
      }
    }

    // Enter with no results: do nothing but swallow it so the form doesn't submit
    if (event.key === 'Enter' && searchVisible) {
      event.preventDefault();
      return;
    }
  });

  function performSearch(term) {
    term = term.toLowerCase().trim();
    if (!term || !searchIndex) { searchResults.innerHTML = ''; resultsAvailable = false; return; }

    if (term.length < CONFIG.search.minChars) {
      searchResults.innerHTML = '<li class="search-message">Type at least 2 characters…</li>';
      resultsAvailable = false;
      return;
    }

    const searchTerms = term.split(/\s+/).filter(t => t.length > 0);

    const results = searchIndex
      .map(item => {
        let score = 0;
        const matchesAll = searchTerms.every(t => {
          let matched = false;
          if (CONFIG.search.fields.title) {
            if (item.searchableTitle.startsWith(t)) { score += 3; matched = true; }
            else if (simpleFuzzyMatch(item.searchableTitle, t)) { score += 2; matched = true; }
          }
          if (!matched && CONFIG.search.fields.description && item.searchableDesc.includes(t)) { score += 0.5; matched = true; }
          if (!matched && CONFIG.search.fields.section && item.searchableSection.includes(t)) { score += 0.5; matched = true; }
          if (!matched && item.searchableContents.includes(t)) { score += 0.1; matched = true; }
          return matched;
        });
        return { item, score: matchesAll ? score : 0 };
      })
      .filter(r => r.score > 0)
      .sort((a, b) => b.score - a.score)
      .slice(0, CONFIG.search.maxResults)
      .map(r => r.item);

    resultsAvailable = results.length > 0;

    if (!resultsAvailable) {
      searchResults.innerHTML = '<li class="search-message">No results found.</li>';
      return;
    }

    searchResults.innerHTML = results.map(item => `
      <li>
        <a href="${escapeHtml(item.permalink)}" tabindex="0">
          <span class="title">${escapeHtml(item.title)}</span>
          <span class="meta">${escapeHtml(item.section)} — ${escapeHtml(item.date)}</span>
          <span class="desc">${escapeHtml(item.desc)}</span>
        </a>
      </li>
    `).join('');
  }

  searchInput.addEventListener('input', function () {
    if (!searchIndex && !firstRun) {
      searchResults.innerHTML = '<li class="search-message">Loading…</li>';
      return;
    }
    performSearch(this.value);
  });
}

function mergeConfigs(defaultConfig, userConfig) {
  const merged = { ...defaultConfig };
  for (const [key, value] of Object.entries(userConfig)) {
    if (value && typeof value === 'object' && !Array.isArray(value)) {
      merged[key] = mergeConfigs(defaultConfig[key] || {}, value);
    } else {
      merged[key] = value;
    }
  }
  return merged;
}

function escapeHtml(unsafe) {
  if (!unsafe) return '';
  return unsafe
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

initSearch();
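For context, fastsearch.js expects Hugo to publish a JSON index at /index.json whose entries carry the fields read by loadSearchIndex() and interpolated in the result template: title, desc, section, contents, permalink, and date. A minimal sketch of one such entry (the values are made up, not part of this change set), together with the scoring that performSearch() applies per search term:

// Hypothetical /index.json entry; field names match what loadSearchIndex()
// lowercases and what the result template interpolates. Values are invented.
const exampleEntry = {
  title: "Case Study: SeaweedFS",
  desc: "Object storage for the IPng network",
  section: "articles",
  contents: "full body text, used only for low-weight matching ...",
  permalink: "/example-post/",
  date: "2025-01-01",
};

// Per-term scoring in performSearch():
//   title prefix match        +3
//   fuzzy title match         +2
//   description substring     +0.5
//   section substring         +0.5
//   page contents substring   +0.1
// An entry is kept only if every search term matches at least one field;
// results are sorted by score and capped at CONFIG.search.maxResults.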
@@ -1,3 +1,4 @@
+@use "sass:color";
 // Colors
 $midnight-blue: #07608f;
 $midnight-matte: #4c7e99;

@@ -265,7 +266,7 @@ nav {
   justify-content: flex-end;
   flex-flow: row wrap;
   overflow: hidden;
-  margin: auto 2% .6em auto;
+  margin: 0 2% 0 auto;
   padding-left: 2em;
 }

@@ -341,6 +342,7 @@ nav li:hover {
   text-align: center;
   display: flex;
   flex-flow: row wrap;
+  align-items: center;

   /* $mq-mini or smaller: */ margin: 0; width: 100%; font-size: $base-font-size * 0.8;
   @media #{$mq-small} { margin: 0 5%; width: 90%; font-size: $base-font-size * 0.9; }

@@ -763,7 +765,7 @@ table {
   border: 2px solid $ipng-orange;
   tr {
     &:nth-child(even) {
-      background-color: darken($table-cell-background, 10%);
+      background-color: color.adjust($table-cell-background, $lightness: -10%);
     }
   }
   th, td {

@@ -771,10 +773,103 @@
   }
   th {
     color: $ipng-orange;
-    background-color: lighten($table-header-background, 3%);
+    background-color: color.adjust($table-header-background, $lightness: 3%);
     border-bottom: 3px solid $ipng-orange;
     font-weight: bold;
   }
   td {
   }
 }
+
+// Search — sits in the header, replaces nav when active
+#fastSearch {
+  display: none; // toggled to flex by JS
+  flex: 1;
+  align-items: center;
+  align-self: center; // vertically align with the header text
+  position: relative; // anchor for the results dropdown
+  margin: 0;
+
+  input#searchInput {
+    flex: 1;
+    width: 100%;
+    padding: .25em .63em;
+    font-size: 1.1em; // matches nav a font-size
+    color: $text-dark;
+    background-color: transparent;
+    border: none; // header border-bottom already provides the bottom line
+    outline: none;
+    box-sizing: border-box;
+
+    &::placeholder { color: $text-very-light; font-style: italic; }
+  }
+
+  ul#searchResults {
+    position: absolute;
+    top: 100%;
+    right: 0;
+    left: 0;
+    list-style: none;
+    margin: 0;
+    padding: 0;
+    background-color: #f7f7f7; // matches header bg
+    border: 1px solid $tab-border-color;
+    border-top: 2px solid $ipng-orange;
+    box-shadow: 0 4px 12px rgba(0,0,0,0.15);
+    z-index: 200;
+
+    &:empty { display: none; }
+
+    li {
+      border-bottom: 1px solid $tab-border-color;
+
+      &:last-child { border-bottom: none; }
+
+      &.search-message {
+        padding: .5em .63em;
+        color: $text-normal;
+        font-style: italic;
+        font-size: .9em;
+      }
+
+      a {
+        display: block;
+        padding: .4em .63em;
+        text-decoration: none;
+        color: $text-dark;
+
+        &:hover, &:focus {
+          outline: none;
+          background-color: $ipng-orange;
+          color: #fff;
+
+          .meta, .desc { color: rgba(255,255,255,0.85); }
+        }
+
+        .title {
+          display: block;
+          font-size: 1em;
+          font-weight: bold;
+        }
+
+        .meta {
+          display: block;
+          font-size: .72em;
+          color: $text-normal;
+          text-transform: uppercase;
+          letter-spacing: .04em;
+          margin: .1em 0;
+        }
+
+        .desc {
+          display: block;
+          font-size: .85em;
+          color: $text-normal;
+          white-space: nowrap;
+          overflow: hidden;
+          text-overflow: ellipsis;
+        }
+      }
+    }
+  }
+}
@@ -7,5 +7,6 @@
 {{- block "main" . }}{{- end }}
 </div>
 {{- partial "footer.html" . -}}
+<script src="/js/fastsearch.js"></script>
 </body>
 </html>
@@ -5,18 +5,18 @@
     <link>{{ .Permalink }}</link>
     <description>Latest blog posts from {{ .Site.Title }}</description>

-    {{ with .Site.LanguageCode }}
+    {{ with .Site.Language.Locale }}
     <language>{{.}}</language>
     {{end}}

-    {{ with .Site.Author.email }}
+    {{ with .Site.Params.social.email }}
     <managingEditor>{{.}}
-    {{ with $.Site.Author.name }} ({{.}}){{end}}
+    {{ with $.Site.Params.author }} ({{.}}){{end}}
     </managingEditor>
     {{end}}

-    {{ with .Site.Author.email }}
+    {{ with .Site.Params.social.email }}
-    <webMaster>{{.}}{{ with $.Site.Author.name }} ({{.}}){{end}}
+    <webMaster>{{.}}{{ with $.Site.Params.author }} ({{.}}){{end}}
     </webMaster>
     {{end}}

@@ -38,7 +38,7 @@
       <title>{{ .Title }}</title>
       <link>{{ .Permalink }}</link>
       <pubDate>{{ .Date.Format "Mon, 02 Jan 2006 15:04:05 -0700" | safeHTML }}</pubDate>
-      {{ with .Site.Author.email }}<author>{{.}}{{ with $.Site.Author.name }} ({{.}}){{end}}</author>{{end}}
+      {{ with .Site.Params.social.email }}<author>{{.}}{{ with $.Site.Params.author }} ({{.}}){{end}}</author>{{end}}
       <guid>{{ .Permalink }}</guid>
       <description>{{ .Summary | markdownify }}</description>
     </item>
@@ -1,6 +1,6 @@

 <footer class='page-footer'>
-  {{- $socialMap := .Site.Data.notrack.social }}
+  {{- $socialMap := hugo.Data.notrack.social }}
   {{- with $.Site.Params.social }}
   <div class="social">
     <ul>
@@ -3,7 +3,7 @@
   <title>{{ .Site.Title }} {{ with .Title }}- {{ . }} {{ end }}</title>
   <link rel="stylesheet" type="text/css" href="{{ "css/fonts.css" | relURL }}">
   <link rel="stylesheet" type="text/css" href="{{ "css/fontawesome.css" | relURL }}">
-  {{ $options := dict "transpiler" "libsass" "targetPath" "css/styles.css" -}}
+  {{ $options := dict "transpiler" "dartsass" "targetPath" "css/styles.css" -}}
   {{ $style := resources.Get "styles.scss" | toCSS $options | minify | fingerprint -}}
   <link rel="stylesheet" type="text/css" href="{{ $style.RelPermalink }}">
   {{ with resources.Get "css/userstyles.css" }}
@@ -3,7 +3,7 @@
   <div class="myname">
     <h2><a href="{{ default .Site.Home .Site.BaseURL }}">{{ default .Site.Params.Author .Site.Params.siteHeading }}</a></h2>
   </div>
-  <nav>
+  <nav id="mainNav">
     <ul class="navbar">
       {{- /* info about current page */ -}}
       {{- $currentPage := . -}}

@@ -67,4 +67,8 @@
       {{- end }}
     </ul>
   </nav>
+  <div id="fastSearch">
+    <input id="searchInput" tabindex="0" placeholder="Search…" aria-label="Search" autocomplete="off">
+    <ul id="searchResults"></ul>
+  </div>
 </header>
@@ -0,0 +1 @@
<span style="color: {{ .Get "color" }}; font-weight: bold;">{{ .Inner }}</span>
@@ -1,5 +1,5 @@
 {{ with $.Page.Site.Params.social }}
-{{ $socialMap := $.Page.Site.Data.notrack.social }}
+{{ $socialMap := hugo.Data.notrack.social }}
 {{ $width := default "12em" (.Params.Get "width") }}
 {{ $height := default "auto" (.Params.Get "height") }}
 {{ $float := default "right" (.Params.Get "float") }}
@@ -1,4 +1,4 @@
-{{ $socialMap := .Site.Data.notrack.social }}
+{{ $socialMap := hugo.Data.notrack.social }}

 {{ with $.Site.Params.social }}
 <div class="social">