#!/bin/bash
-# Copyright (C) 2017 Ian Kelling
+# I, Ian Kelling, follow the GNU license recommendations at
+# https://www.gnu.org/licenses/license-recommendations.en.html. They
+# recommend that small programs, < 300 lines, be licensed under the
+# Apache License 2.0. This file contains or is part of one or more small
+# programs. If a small program grows beyond 300 lines, I plan to switch
+# its license to GPL.
+
+# Copyright 2024 Ian Kelling
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
[[ $EUID == 0 ]] || exec sudo -E "$BASH_SOURCE" "$@"
-if [[ ! $ERRHANDLE_PATH ]]; then
- ERRHANDLE_PATH=$(readlink -f "${BASH_SOURCE}")
- ERRHANDLE_PATH=$(readlink -f ${ERRHANDLE_PATH%/*}/../errhandle)
-fi
-err_sourced=true
-for p in $ERRHANDLE_PATH/{errcatch-function,bash-trace-function}; do
- if [[ -e $p ]]; then
- source $p
- else
- err_sourced=false
+# https://savannah.nongnu.org/projects/bash-bear-trap/
+set -e; . /usr/local/lib/bash-bear; set +e
+
+m() {
+ local out
+ printf "newns: %s\n" "$*"
+ if ! out=$("$@" 2>&1); then
+ echo "newns: WARNING: last command exit code: $?"
+ elif [[ ! $out ]]; then
+ echo "newns: WARNING: no output from last command"
fi
-done
-if $err_sourced; then
- errcatch
-else
- set -eE -o pipefail
- trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
-fi
+}
usage() {
cat <<EOF
-usage: ${0##*/} [OPTS] start|stop NS_NAME
+usage: ${0##*/} [OPTS] start|stop|show NS_NAME
Nat a network namespace. systemd friendly
Also creates a mount namespace with a cloned /run/resolvconf.
--c, --create Create a named network namespace. When running from
+Arguments:
+
+start|stop: these do what they say.
+
+show: Show the state we expected to be there or not there based on
+start/stop. This is useful for debugging.
+
+NS_NAME: We use this to name the interfaces we create, the mount
+namespace, and if we are creating a named network namespace, that too.
+
+-c, --create Create or destroy a named network namespace. When running from
the same network namespace as pid 1, this is set automatically.
- A systemd created private network is in a network namespace
- different than pid 1.
+ A systemd created private network is in an unnamed network namespace
+ different than pid 1. I haven't found a need for a named network
+ namespace in that case.
-n NETWORK x.x.x /24 private network to use. If not specified, uses
the first unused one starting at 10.173.1
-h, --help Show this help and exit.
-From within a systemd network namespace, nat it to the outside. This
+From a normal shell:
+
+If we do create the netns, to join it with a shell, we can do (as root)
+/usr/bin/nsenter --mount=/run/mount-namespaces/NAME --net=/var/run/netns/NAME bash
+
+If you don't care about the mount namespace, you can leave that option off.
+
+
+For systemd:
+
+From within a systemd network namespace, we nat it to the outside. This
would be called from ExecStartPre, and or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.
-Also create a named mount namespace under /root/mount_namespaces, so we
-can alter some system config for this namespace. Subsequent systemd
-command lines would be prefixed with:
+If resolvconf is installed, we create a named mount namespace under
+/run/mount-namespaces, so we can alter some system config for this
+namespace. systemd command lines would be prefixed with:
-/usr/bin/nsenter --mount=/root/mount_namespaces/NS_NAME
+/usr/bin/nsenter --mount=/run/mount-namespaces/NS_NAME
Note, this means that they can't run as unpriveledged users, but once
systemd 233 comes out, it will have a bind mount option from within unit
will update the script to that the mount namespace not created unless a
flag is passed in. Patch welcome to add that flag before then.
-A recommmended dependency of this script is my other repo named "errhandle",
-which prints stack trace on error, and calls a cleanup function:
-https://iankelling.org/git/?p=errhandle, set ERRHANDLE_PATH, or put it
-in a directory adjacent to the absolute, resolved directory this file is
-in.
-Background:
+This script depends on bash-bear-trap:
+https://savannah.nongnu.org/projects/bash-bear-trap/ . Search the script for
+"bash-bear" to see where it is loaded from, or to change the installed location.
+
+
+Background on this project (you can skip if you like):
If we aren't creating a named network namespace, to join the namespace
with a shell, I use:
Note: if I knew how to easily ask systemd what pid a unit has, i would
do that.
-If we do create the netns, to join it with a shell, we can do
-/usr/bin/nsenter --mount=/root/mount_namespaces/NAME --net=/var/run/netns/NAME bash
-
"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having it's own resolv.conf by using it's user script which
copy /run/resolvconf somehwere then bind mount it on top of
/run/resolvconf.
+
Note: for debugging, adding set -x is a pretty good option.
+TODO: make "start" be idempotent.
+
Please email me if you have a patches, bugs, feedback, or republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
#### begin arg parsing ####
create=false
-temp=$(getopt -l help,create hcn: "$@") || usage 1
+temp=$(getopt -l help,create hcdn: "$@") || usage 1
eval set -- "$temp"
while true; do
case $1 in
fi
#### end sanity checking ####
-
v0=veth0-$nn
v1=veth1-$nn
ip_base=10.173
-if ! $create && [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
- create=true
-fi
-# make the default network namespace be named
+### begin make the default network namespace be named "default" ###
+mkdir -p /run/netns
target=/run/netns/default
if [[ ! -e $target && ! -L $target ]]; then
- mkdir -p /run/netns
- ln -s /proc/1/ns/net $target
+ # -f to avoid a race condition with running twice
+ ln -sf /proc/1/ns/net $target
fi
+### end make the default network namespace be named "default" ###
-
-ipd() { ip -n default "$@"; }
+if ! $create && [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
+ create=true
+fi
+# otherwise we are already in the network namespace and it's unnamed.
if $create; then
- # run ip in the network namespace
- ipnn() { ip -n $nn "$@"; }
-else
- # we are already in the network namespace and it's unnamed.
- # run ip in the network namespace
- ipnn() { ip "$@"; }
+ ipnnargs="-n $nn"
fi
+
+
+ipd() { ip -n default "$@"; }
+
+# run ip in the network namespace
+ipnn() { ip $ipnnargs "$@"; }
+
# default network namespace exec
dexec() { ip netns exec default "$@"; }
# mount namespace exec
-mexec() { /usr/bin/nsenter --mount=/root/mount_namespaces/$nn "$@"; }
-
+mexec() { /usr/bin/nsenter --mount=/run/mount-namespaces/$nn "$@"; }
-# background: head -n1 is defensive. Not sure if there is some weird feature
-# for 2 routes to be 0/0.
-gateway_ifs=($(ipd route list exact 0/0 | head -n1| sed -r 's/.*dev\s+(\S+).*/\1/'))
-
-if [[ ! $gateway_ifs ]]; then
- cat >&2 <<EOF
-$0: error: failed to find gateway interface. No output from:
-ipd route list exact 0/0 | head -n1| sed -r 's/.*dev\s+(\S+).*/\1/'
-output from "ipd route list exact 0/0":
-$(ipd route list exact 0/0)
-EOF
- exit 1
-fi
nat() {
- for if in ${gateway_ifs[@]}; do
- dexec iptables -t nat $1 POSTROUTING -o $if -j MASQUERADE \
- -m comment --comment "systemd network namespace nat"
- done
+ # Note: duplicated in show()
+ # Note, in a previous commit i specified the output interface with -o,
+ # but that broke things when my gateway interface changed, and I can't
+ # see any advantage to it, so I removed it.
+ dexec iptables -t nat $1 POSTROUTING -s $network.0/24 -j MASQUERADE \
+ -m comment --comment "systemd network namespace nat"
}
# d = default
}
-find_network() {
+find-network() {
if [[ $network ]]; then
return
fi
found=false
- existing=false
ips="$(ipd addr show | awk '$1 == "inet" {print $2}')"
for ((i=1; i <= 254; i++)); do
network=$ip_base.$i
- if printf "%s\n" "$ips" | grep "^${network//./\\.}" >/dev/null; then
- existing=true
- else
+ if ! printf "%s\n" "$ips" | grep "^${network//./\\.}" >/dev/null; then
found=true
break
fi
done
-}
-
-start() {
- find_network
if ! $found; then
echo "$0: error: no open network found"
exit 1
fi
+}
+
+# ip-add CMD NET DEV
+# Idempotently add address NET (for example 10.173.1.1/24) to interface DEV.
+# CMD is the ip wrapper to run (ipd or ipnn in this script). The address is
+# only added when "CMD addr show dev DEV" does not already list it.
+ip-add() {
+  local cmd net dev
+  cmd=$1
+  net=$2
+  dev=$3
+  # Discard grep's matching line so that an already-present address does
+  # not add noise to our output. $cmd stays unquoted, as before.
+  if ! $cmd addr show dev "$dev" | sed 's/^ *//;s/ *$//' | grep -xF "inet $net scope global $dev" >/dev/null; then
+    $cmd addr add "$net" dev "$dev"
+  fi
+}
+
+start() {
+ find-network
#### begin mount namespace setup ####
- mkdir -p /root/mount_namespaces
- if ! mountpoint /root/mount_namespaces >/dev/null; then
- mount --bind /root/mount_namespaces /root/mount_namespaces
+ mkdir -p /run/mount-namespaces
+ if ! mountpoint /run/mount-namespaces >/dev/null; then
+ mount --bind /run/mount-namespaces /run/mount-namespaces
fi
# note: This is outside the mount condition because I've mysteriously
# had this become shared instead of private, perhaps it
# got remounted somehow and lost the setting.
- mount --make-private /root/mount_namespaces
- if [[ ! -e /root/mount_namespaces/$nn ]]; then
- touch /root/mount_namespaces/$nn
+ mount --make-private /run/mount-namespaces
+ if [[ ! -e /run/mount-namespaces/$nn ]]; then
+ touch /run/mount-namespaces/$nn
fi
- if ! mountpoint /root/mount_namespaces/$nn >/dev/null; then
+ if ! mountpoint /run/mount-namespaces/$nn >/dev/null; then
# Here, we specify that we only want mount changes changes under
# this mountpoint to be propagated into the bind, but changes
# from within the bind do not propagate to outside the bind.
# documentation on propagation is a bit weird because it
# confusingly talks about binds, namespaces, and mirrors (which
# seems to be just another name for bind), shared subtrees
- # (which seems to a term for binds and namespaces), and does not
+ # (which seems to be a term for binds and namespaces), and does not
# properly specify whether the documentation applies to binds,
# namespaces, or both. Notably, propagation for binds is marked
# on the original mount point, and propagation for a mount
# namespace is marked on mounts within the namespace.
- unshare --propagation slave --mount=/root/mount_namespaces/$nn /bin/true
+ unshare --propagation slave --mount=/run/mount-namespaces/$nn /bin/true
fi
#### end mount namespace setup ####
if $create; then
- ip netns add $nn
+ if ! ip netns | grep -xF $nn &>/dev/null; then
+ ip netns add $nn
+ fi
ip -n $nn link set dev lo up
fi
- echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null
+ echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward status=none
# docker helpfully changes the default FORWARD to drop...
diptables-add FORWARD -i $v0 -j ACCEPT
diptables-add FORWARD -o $v0 -j ACCEPT
- _errcatch_cleanup=stop
+ err-cleanup() { stop; }
ipnn link add $v0 type veth peer name $v1
ipnn link set $v0 netns default
- ipd addr add $network.1/24 dev $v0
+ ip-add ipd $network.1/24 $v0
ipd link set $v0 up
nat -C &>/dev/null || nat -A
- ipnn addr add $network.2/24 dev $v1
+ ip-add ipnn $network.2/24 $v1
ipnn link set $v1 up
- ipnn route add default via $network.1
+ cmd="ipnn route add default via $network.1"
+ $cmd
+ fails=0
+ max_fails=2
+ # I've had adding the default route mysteriously fail on boot, so
+ # here we check that it succeeded, do a sleep and a retry.
+ while true; do
+ default_route=$(ipnn route show default | sed -r 's,^[[:space:]]+|[[:space:]]+$,,')
+ if [[ $default_route != "default via $network.1 dev $v1" ]]; then
+ fails=$((fails + 1))
+ else
+ break
+ fi
+ if (( fails >= max_fails )); then
+ echo "$0: ERROR: default route added but not found, retried $max_fails. expected route: 'default via $network.1 dev $v1', found: '$default_route'"
+ # Note: for debugging, if you have a systemd unit which tears down
+ # the newns upon failure, you may want to uncomment the break so
+ # that we proceed and can inspect the system. break
+ exit 1
+ else
+ sleep 1
+ $cmd
+ fi
+ done
+ if (( fails >= 1 )); then
+ echo "$0: WARNING: route added but not found until retried $max_fails times: $cmd"
+ fi
- ###### begin setup resolvconf
- resolv_copy=/root/resolvconf-$nn
- # this condition should never happen, just coding defensively
- if mexec mountpoint /run/resolvconf &>/dev/null; then
- mexec umount /run/resolvconf
- fi
- cp -aT /run/resolvconf $resolv_copy
- if ! mexec mount -o bind $resolv_copy /run/resolvconf; then
- echo "error: resolv-conf bindmount failed"
- exit 1
- fi
- # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
- # in the network namespace, so adjust the address.
- if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
- mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
- mexec resolvconf -u
- fi
- # and in debian based distros at least, it runs with --local-service, and needs a restart
- # to know about the new local network
- if [[ $(systemctl --no-pager show -p ActiveState dnsmasq ) == ActiveState=active ]]; then
- systemctl restart dnsmasq
- fi
+ ###### begin setup resolvconf
+ if [[ -e /run/resolvconf ]]; then # resolvconf probably installed
+ resolv_copy=/root/resolvconf-$nn
- # background: if we did this in openvpn's resolv-conf script, we could guard it in
- # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
- # and we could get $nn by
- # config_basename=${config%%.*}
- # config_basename=${config_basename##*/}
- # but dnsmasq forces us to do it earlier.
- ###### end setup resolvconf
+ # this condition should never happen, just coding defensively
+ if mexec mountpoint /run/resolvconf &>/dev/null; then
+ mexec umount /run/resolvconf
+ fi
+ cp -aT /run/resolvconf $resolv_copy
+ if ! mexec mount -o bind $resolv_copy /run/resolvconf; then
+ echo "error: resolv-conf bindmount failed"
+ exit 1
+ fi
+ # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
+ # in the network namespace, so adjust the address.
+ if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
+ mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
+ mexec resolvconf -u
+ fi
+ # and in debian based distros at least, it runs with --local-service, and needs a restart
+ # to know about the new local network
+ if [[ $(systemctl --no-pager show -p ActiveState dnsmasq ) == ActiveState=active ]]; then
+ systemctl restart dnsmasq
+ fi
+ # background: if we did this in openvpn's resolv-conf script, we could guard it in
+ # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
+ # and we could get $nn by
+ # config_basename=${config%%.*}
+ # config_basename=${config_basename##*/}
+ # but dnsmasq forces us to do it earlier.
+ fi # end if [[ -e /run/resolvconf ]]
+ ###### end setup resolvconf
}
stop() {
+ if [[ ! $network ]]; then
+ network=$(ipd -f inet a show dev $v0 2>/dev/null | awk '/inet / {print $2}' | sed -r 's,\.[0-9]+/.*,,' ||:)
+ fi
if ipd link list $v0 &>/dev/null; then
# this also deletes $v1 and the route we added.
ipd link del $v0
fi
- find_network
- if ! $existing; then
- if nat -C &>/dev/null; then nat -D; fi
- fi
- dexec iptables -D FORWARD -i $v0 -j ACCEPT ||:
- if $create && [[ -e /var/run/netns/client ]]; then
+ if [[ $network ]] && nat -C &>/dev/null; then nat -D; fi
+ dexec iptables -D FORWARD -i $v0 -j ACCEPT &>/dev/null ||:
+ if $create && [[ -e /var/run/netns/$nn ]]; then
ip netns del $nn
fi
mexec umount /run/resolvconf
fi
- if mountpoint /root/mount_namespaces/$nn >/dev/null; then
- umount /root/mount_namespaces/$nn
+ if mountpoint /run/mount-namespaces/$nn >/dev/null; then
+ umount /run/mount-namespaces/$nn
fi
}
+show() {
+ m ipd link list $v0
+ m dexec iptables -t nat -C POSTROUTING -s $network.0/24 -j MASQUERADE \
+ -m comment --comment "systemd network namespace nat" ||:
+ m dexec iptables -C FORWARD -i $v0 -j ACCEPT
+ m mexec mountpoint /run/resolvconf
+ m mountpoint /run/mount-namespaces/$nn
+}
+
case $action in
- start|stop)
+ start|stop|show)
$action
;;
*)