commit 66db9ee20cdee41a54dbeca8fda41276081d0575 Author: Stefan Bühler Date: Sun Apr 25 15:22:38 2021 +0200 initial diff --git a/README.md b/README.md new file mode 100644 index 0000000..dfd9476 --- /dev/null +++ b/README.md @@ -0,0 +1,127 @@ +# Carrier-grade NAT demo (work in progress) + +> **Current state**: cross-VRF routing is working, but NAT breaks it. +> +> conntrack log shows state is immediately destroyed after it gets created, +> and the packet is "lost" between `up` and `muplink`. + +The basic idea of `100.64.0.0/10` seems to be that a CGN-Router should be able to handle multiple interfaces using `100.64.0.0/10` (including an uplink), but keeping them separated. + +Now theoretically it should work moving each interface (apart from the uplink) into a different network namespace, connect all network namespaces with `veth` pairs to the main one (using some other IP addresses...), and enable SNAT when forwarding packets to the main namespace, and SNAT again when forwarding to the uplink. + +This demo tries to use VRFs; hopefully this results in having to NAT only once (and doesn't need additional local IP addresses). 
+ +To test yourself run `./cgnat-demo.sh` as root (doesn't need network, so feel free to use some isolated container/VM/...): +- spawns `tmux` with multiple windows after setup is done (`ip vrf/netns exec ...` and others) +- `tmux` is configured to use `ctrl-a` prefix (like screen) +- `tmux` shouldn't be detached; default detach keybind (`ctrl-a d`) is replaced to prompt for session destroy + +Dependencies: + +- `nftables` for NAT / trace +- `conntrack` to show conntrack events +- `tmux` to open shells in various contexts + +## Example pings + +- Working in `blue_c2`: + - `ping -I 192.0.2.2 192.0.2.1` - ping `uplink` "public" IP + - `ping 100.64.0.1` - ping `blue_c1` + - `ping 2001:db8:b:10::1` - ping `blue_c1` + - `ping 100.127.255.254` - ping gateway + - `ping 2001:db8:b:10::ffff` - ping gateway + - `ping 2001:db8:b:20::1` - ping `red_c1` + - `ping 2001:db8:a::ffff` - ping `uplink` + - `ping 2001:db8:a::1` - ping `main` (i.e. `up:muplink`) +- Broken everywhere but `uplink`: + - `ping 192.0.2.1` +- Broken in `up`: + - `ping 100.127.255.254` (works as soon as NAT gets disabled) + +## Basic design + +- Run everything in a separate network+mount+UTS namespace +- Explicit VRFs for everything, including the uplink + - Uplink VRF (`up`) with `muplink` interface + - Two client VRFs (`blue` and `red`), each with a bridge to connect clients to +- Simulate an uplink with one client (in namespace `uplink`) +- Simulate two clients in VRF `blue` (namespaces `blue_c1` and `blue_c2`) +- Simulate one client in VRF `red` (namespace `red_c1`) +- IPv4: NAT from client VRFs (`blue` and `red`) to uplink `up` +- IPv6: no NAT, proper routing +- Route `192.0.2.2` from uplink all the way through to `blue_c2` (test IPv4 cross-VRF connectivity without NAT) + +Topology: + +``` ++--------------------+ +-----------------------+ +--------------------+ +| uplink: | | main: | | blue_c1: | +| lo | | lo | | lo | +| | | up (vrf) | +--=---> cuplink (veth) | +| client1 (veth) <-=-=--> muplink (veth) | | 
+--------------------+ ++--------------------+ | blue (vrf) | | + | br-blue (bridge) | | +--------------------+ + | blue_c1 (veth) <-=--+ | blue_c2: | + | blue_c2 (veth) <-=--+ | lo | + | red (vrf) | +--=---> cuplink (veth) | + | br-red (bridge) | +--------------------+ + | red_c1 (veth) <--=--+ + +-----------------------+ | +--------------------+ + | | red_c1: | + | | lo | + +--=---> cuplink (veth) | + +--------------------+ +``` + +## Basic VRF setup + +Proper VRF `ip rule` setup with unreachables if VRF table didn't succeed: + +``` +1000: from all lookup [l3mdev-table] +2000: from all lookup [l3mdev-table] unreachable +32765: from all lookup local +32766: from all lookup main +``` + +(+ `lookup default` in IPv4) + +## `uplink` configuration + +- Address `192.0.2.1/32` on `lo` +- Addresses `100.127.255.254/10` and `2001:db8:a::ffff/64` on `client1` +- Route `2001:db8:b::/48 via 2001:db8:a::1 dev client1` +- Route `192.0.2.2 via 100.64.0.1 dev client1` + +## `main:up` configuration + +- Addresses `100.64.0.1/10` and `2001:db8:a::1/64` on `muplink` +- Route `default via 100.127.255.254 dev muplink` and `default via 2001:db8:a::ffff dev muplink` +- Route `2001:db8:b:10::/64 dev blue` (forward to VRF `blue`) +- Route `2001:db8:b:20::/64 dev red` (forward to VRF `red`) +- Route `192.0.2.2 dev blue` (forward to VRF `blue`) + +## `main:blue` configuration + +- Addresses `100.127.255.254/10` and `2001:db8:b:10::ffff/64` on `br-blue` +- Route `default dev up` (IPv4 + IPv6) - forward to VRF `up` +- Route `192.0.2.2 dev br-blue` (connected in `blue_c2`) + +## `main:red` configuration + +- Addresses `100.127.255.254/10` and `2001:db8:b:20::ffff/64` on `br-red` +- Route `default dev up` (IPv4 + IPv6) - forward to VRF `up` + +## client configuration + +- Addresses on `cuplink`: + - `blue_c1`: `100.64.0.1/10` and `2001:db8:b:10::1/64` + - `blue_c2`: `100.64.0.2/10` and `2001:db8:b:10::2/64`, also `192.0.2.2/32` + - `red_c1`: `100.64.0.1/10` and `2001:db8:b:20::1/64` +- Route 
`default via 100.127.255.254 dev cuplink` +- Route `default via 2001:db8:b:$$$$::ffff dev cuplink` (depending on `blue`/`red`) + +## TODO + +- get NAT working +- test whether one can route to `lo` instead of VRF `up` (and drop VRF `up`), or whether there are other ways for cross-VRF routing diff --git a/cgnat-demo.sh b/cgnat-demo.sh new file mode 100755 index 0000000..c4f9368 --- /dev/null +++ b/cgnat-demo.sh @@ -0,0 +1,152 @@ +#!/bin/bash + +set -e + +if [ "$1" != "--inner" ]; then + if [ ! -d "/run/netns" ]; then + mkdir /run/netns + chmod 0755 /run/netns + fi + + export tmpdir=$(mktemp -p /run -d netns-cgnat-demo-XXXXXXX) + trap 'rm -rf "${tmpdir}"' EXIT + export NAMESPACEDIR="${tmpdir}/netns" + mkdir "${NAMESPACEDIR}" + chmod 0755 "${NAMESPACEDIR}" + # Run actual demo in network+mount+UTS namespaces + unshare -m -n -u -- "$0" --inner + echo "Cleaning up" + # cleanup afterwards + exit 0 +fi + +show_failed_command() { + local rc=$? + if [ "${rc}" -ne 0 ]; then + printf 'Failed command: %s\n' "${BASH_COMMAND}" + fi + exit $rc +} + +trap show_failed_command EXIT + +cd "$(dirname "$(readlink -f "$0")")" + +cp tmux_base.conf "${tmpdir}/tmux.conf" +printf >>"${tmpdir}/tmux.conf" 'new-session -n main -s cgnat-demo "%s"\n' "${SHELL} -i" +printf >>"${tmpdir}/tmux.conf" 'new-window -d -n trace "nft monitor trace"\n' +printf >>"${tmpdir}/tmux.conf" 'new-window -d -n conntrack "conntrack -E -o timestamp"\n' + +# setup local ip-netns "namespace" (so ip-netns names don't conflict with other stuff) +mount -o bind "${NAMESPACEDIR}" /run/netns +mount --make-private /run/netns + +# gonna do routing +sysctl -q net.ipv4.ip_forward=1 +sysctl -q net.ipv6.conf.default.forwarding=1 +sysctl -q net.ipv6.conf.all.forwarding=1 + +# basic setup of our main network namespace +ip link set dev lo up +./fix-vrf-rules.sh + +netns() { + local name="$1" + shift + ip netns exec "${name}" "$@" +} + +create_netns() { + local name="$1" + + ip netns add "${name}" + # basic setup + ip -n "${name}" 
link set dev lo up + netns "${name}" ./fix-vrf-rules.sh +} + +# build explicit VRF to uplink (and route others through) +ip link add name "up" type vrf table "1" +ip link set dev "up" up +printf >>"${tmpdir}/tmux.conf" 'new-window -d -n up -e debian_chroot=up "%s"\n' "ip vrf exec up ${SHELL} -i" + +export UPLINK="100.127.255.254" # last usable ip in 100.64.0.0/10 +export UPLINK6="2001:db8:a::ffff" +export PUBLIC="192.0.2.1" + +# build "uplink": uplink has one client: the "main" netns +create_netns "uplink" +ip link add name muplink type veth peer client1 +ip link set dev client1 netns "uplink" +ip -n "uplink" address add "${PUBLIC}/32" dev lo +ip -n "uplink" link set dev client1 up +ip -n "uplink" address add "${UPLINK}/10" dev client1 +ip -n "uplink" address add "${UPLINK6}/64" dev client1 +ip -n "uplink" route add "2001:db8:b::/48" via 2001:db8:a::1 dev client1 +ip link set dev muplink vrf "up" up +ip address add 100.64.0.1/10 dev muplink +ip address add 2001:db8:a::1/64 dev muplink +ip route add default vrf "up" via "${UPLINK}" dev muplink +ip -6 route add default vrf "up" via "${UPLINK6}" dev muplink +printf >>"${tmpdir}/tmux.conf" 'new-window -d -n uplink -e debian_chroot=uplink "%s"\n' "ip netns exec uplink ${SHELL} -i" + +declare -A VRFIDS + +create_client_vrf() { + local vrfname="$1" + local vrfid=$2 + VRFIDS[${vrfname}]=${vrfid} + ip link add name "${vrfname}" type vrf table "${vrfid}" + ip link add name "br-${vrfname}" type bridge + ip link set dev "br-${vrfname}" master "${vrfname}" up + ip link set dev "${vrfname}" up + ip address add "${UPLINK}/10" dev "br-${vrfname}" + ip address add "2001:db8:b:${vrfid}::ffff/64" dev "br-${vrfname}" + ip route add "2001:db8:b:${vrfid}::/64" vrf "up" dev "${vrfname}" # route-leak IPv6 clients + ip route add default vrf "${vrfname}" dev "up" # route-leak uplink + ip -6 route add default vrf "${vrfname}" dev "up" # route-leak uplink + + printf >>"${tmpdir}/tmux.conf" 'new-window -d -n %s -e debian_chroot=%s "%s"\n' 
"${vrfname}" "${vrfname}" "ip vrf exec ${vrfname} ${SHELL} -i" +} + +create_client() { + local vrfname="$1" + local name="$2" + local id="$3" + local vrfid=${VRFIDS[$vrfname]} + local ip="100.64.0.${id}/10" + local ipv6="2001:db8:b:${vrfid}::${id}/64" + + create_netns "${name}" + ip link add name "${name}" type veth peer cuplink + ip link set dev cuplink netns "${name}" + ip -n "${name}" address add "${ip}" dev cuplink + ip -n "${name}" address add "${ipv6}" dev cuplink + ip -n "${name}" link set dev cuplink up + ip -n "${name}" route add default via "${UPLINK}" dev cuplink + ip -n "${name}" route add default via "2001:db8:b:${vrfid}::ffff" dev cuplink + sysctl -q "net.ipv6.conf.${name}.disable_ipv6=1" # disable ipv6 on bridge slave + ip link set dev "${name}" master "br-${vrfname}" up + + printf >>"${tmpdir}/tmux.conf" 'new-window -d -n %s -e debian_chroot=%s "%s"\n' "${name}" "${name}" "ip netns exec ${name} ${SHELL} -i" +} + +# setup firewall / NAT +/usr/sbin/nft -f nft.conf + +create_client_vrf "blue" 10 +create_client "blue" "blue_c1" 1 +create_client "blue" "blue_c2" 2 +create_client_vrf "red" 20 +create_client "red" "red_c1" 1 + +# without NAT ipv4 seems to be working: +ip -n "blue_c2" address add "192.0.2.2/32" dev cuplink +ip route add "192.0.2.2/32" vrf "blue" dev br-blue # on bridge in vrf blue +ip route add "192.0.2.2/32" vrf "up" dev blue # leak to vrf up +ip -n "uplink" route add "192.0.2.2/32" via "100.64.0.1" dev client1 # static route in uplink + +echo +echo "--- Have fun checking it out yourself (exit the shell to close the experiment)." 
+export debian_chroot=cgnat-demo +exec tmux -L "cgnat-demo-$$" -f "${tmpdir}/tmux.conf" attach diff --git a/fix-vrf-rules.sh b/fix-vrf-rules.sh new file mode 100755 index 0000000..3a1bb25 --- /dev/null +++ b/fix-vrf-rules.sh @@ -0,0 +1,61 @@ +#!/bin/sh + +# Creating a VRF on Linux (like `ip link add vrf_foobar type vrf table 10`) automatically inserts a +# `l3mdev` rule (both IPv4 and IPv6) with preference 1000 by default. +# +# Sadly this means that the `lookup local` with preference 0 (the table `local` containing your +# addresses in the "default VRF") is queried before that, which breaks routing of packets from a +# VRF to your non-VRF addresses. +# +# So you actually want the `l3mdev` rule before the `lookup local` rule, and this script helps with +# that. +# +# Your VRF routing table usually is contained completely in the table you specified when creating +# the VRF; this script also creates a "pref 2000 l3mdev unreachable" rule to make sure within VRFs +# no routes "outside" the VRF are used. (As an alternative you could add `unreachable default +# metric 4278198272` routes in both IPv4 and IPv6 VRF tables). +# +# This should still leave enough room to add policy-based routing rules if you need them. +# +# Also see `vrf_prepare()` and `vrf_create()` in Linux kernel +# source:tools/testing/selftests/net/forwarding/lib.sh + +set -e + +has_rule() { + if [ -n "$(ip $family rule list "$@")" ]; then + # echo "Have: ip $family rule $*" + return 0 + else + # echo "Have not: ip $family rule $*" + return 1 + fi +} + +rule() { + # echo "Running: ip $family rule $*" + ip $family rule "$@" +} + +run() { + # move lookup local to pref 32765 (from 0) + if ! has_rule pref 32765 lookup local; then + rule add pref 32765 lookup local + fi + if has_rule pref 0 lookup local; then + rule del pref 0 lookup local + fi + # make sure that in VRFs after failed lookup in the VRF specific table nothing else is reached + if ! 
has_rule pref 1000 l3mdev; then + # this should be added by the kernel when a VRF is created; add it here for completeness + rule add pref 1000 l3mdev protocol kernel + fi + if ! has_rule pref 2000 l3mdev; then # can't search for actions; so can't make sure this is actually using "unreachable" + rule add pref 2000 l3mdev unreachable + fi +} + +family=-4 +run +family=-6 +run diff --git a/nft.conf b/nft.conf new file mode 100644 index 0000000..a46998a --- /dev/null +++ b/nft.conf @@ -0,0 +1,66 @@ +#!/usr/sbin/nft -f + +flush ruleset + +# Counting IPv4 packets in `inet` tables: +# meta nfproto ipv4 counter accept + +# NAT when routing packets from some VRF to "up" VRF +table inet nat { + chain postrouting { + type nat hook postrouting priority srcnat; policy accept; + ip saddr 100.64.0.0/10 oif "up" counter masquerade + # 192.0.2.2 is statically routed: stops working as soon as NAT is enabled + # ip saddr 192.0.2.2/32 oif "up" counter masquerade + accept # less noise in trace + } + +# pre kernel 4.18 needs this: + chain prerouting { + type nat hook prerouting priority -100; policy accept; + accept # less noise in trace + } +} + +# Trace all IPv4: +# define filter hooks so we see packets tracing through them + +table inet main { + chain prerouting { + type filter hook prerouting priority filter; policy accept; + accept # less noise in trace + } + + chain input { + type filter hook input priority filter; policy accept; + accept # less noise in trace + } + + chain forward { + type filter hook forward priority filter; policy accept; + accept # less noise in trace + } + + chain output { + type filter hook output priority filter; policy accept; + accept # less noise in trace + } + + chain postrouting { + type filter hook postrouting priority filter; policy accept; + accept # less noise in trace + } +} + +# enable tracing for all IPv4 packets (either start in prerouting or output) +table ip traceall { + chain prerouting { + type filter hook prerouting priority -350; policy 
accept; + meta nftrace set 1 accept + } + + chain output { + type filter hook output priority -350; policy accept; + meta nftrace set 1 accept + } +} diff --git a/tmux_base.conf b/tmux_base.conf new file mode 100644 index 0000000..27d5d4d --- /dev/null +++ b/tmux_base.conf @@ -0,0 +1,44 @@ +# screen like prefix +set-option -g prefix C-a +unbind-key C-b +bind-key a send-prefix +bind-key C-a last-window + +# Ctrl-N for next window +# bind-key -T root ^N next-window +bind-key -n ^N next-window +# Ctrl-P for previous window +# bind-key -T root ^P previous-window +bind-key -n ^P previous-window + +# ctrl-arrow keys +set-window-option -g xterm-keys on + +# layout/colours +set-option -g status-bg black +set-option -g status-fg colour45 +set-option -g status-justify centre +set-option -g status-keys vi +set-option -g status-left "#[fg=green][ #H ]#[fg=red] [ #W ]" +set-option -g status-left-length 40 +set-option -g status-right "#[fg=colour5][ %H:%M %d-%b-%y ]" +#set-option -g status-utf8 on + +set-window-option -g monitor-activity on +set-window-option -g window-status-current-style bold +set-window-option -g window-status-current-format "#[fg=colour196](#[fg=default]#I#F #W#[fg=colour196])" +set-window-option -g window-status-format "[#I#F #W]" +#set-window-option -g window-status-alert-fg color226 + +set-option -g set-titles on +set-window-option -g automatic-rename off +set-window-option -g allow-rename on + +# destroy instead of detach +bind-key d confirm-before -p "kill session #S? (y/n)" kill-session +# vim style :quit / :q +set-option -s command-alias[200] quit='confirm-before -p "kill session #S? (y/n)" kill-session' +set-option -s command-alias[201] q='confirm-before -p "kill session #S? (y/n)" kill-session' + +# new -n bash "exec /bin/bash" +# ...