#!/bin/bash
-# Copyright (C) 2017 Ian Kelling
+# I, Ian Kelling, follow the GNU license recommendations at
+# https://www.gnu.org/licenses/license-recommendations.en.html. They
+# recommend that small programs, < 300 lines, be licensed under the
+# Apache License 2.0. This file contains or is part of one or more small
+# programs. If a small program grows beyond 300 lines, I plan to switch
+# its license to GPL.
+
+# Copyright 2024 Ian Kelling
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
[[ $EUID == 0 ]] || exec sudo -E "$BASH_SOURCE" "$@"
-if [[ ! $ERRHANDLE_PATH ]]; then
- ERRHANDLE_PATH=$(readlink -f "${BASH_SOURCE}")
- ERRHANDLE_PATH=$(readlink -f ${ERRHANDLE_PATH%/*}/../errhandle)
-fi
-err_sourced=true
-for p in $ERRHANDLE_PATH/{errcatch-function,bash-trace-function}; do
- if [[ -e $p ]]; then
- source $p
- else
- err_sourced=false
- fi
-done
-if $err_sourced; then
- errcatch
-else
- set -eE -o pipefail
- trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
-fi
+# https://savannah.nongnu.org/projects/bash-bear-trap/
+set -e; . /usr/local/lib/bash-bear; set +e
+
+# Run a command on behalf of "show": print the command line, capture its
+# combined stdout/stderr, and warn when it fails or prints nothing. The
+# captured output is only tested for emptiness; it is not printed.
+# Arguments: the command to run followed by its arguments.
+m() {
+  local out
+  printf "newns: %s\n" "$*"
+  # Test the command itself rather than its negation: after "if ! cmd",
+  # $? holds the negated status (always 0 in the then-branch), whereas in
+  # the else-branch of "if cmd" it is still the command's own exit code.
+  if out=$("$@" 2>&1); then
+    if [[ ! $out ]]; then
+      echo "newns: WARNING: no output from last command"
+    fi
+  else
+    echo "newns: WARNING: last command exit code: $?"
+  fi
+}
usage() {
- cat <<EOF
-usage: ${0##*/} [OPTS] start|stop NETNS_NAME
+ cat <<EOF
+usage: ${0##*/} [OPTS] start|stop|show NS_NAME
+Nat a network namespace. systemd friendly
+
+Also creates a mount namespace with a cloned /run/resolvconf.
--c, --create Create network namespace. For running outside systemd private net.
+Arguments:
+
+start|stop: these do what they say.
+
+show: Show the state we expected to be there or not there based on
+start/stop. This is useful for debugging.
+
+NS_NAME: We use this to name the interfaces we create, the mount
+namespace, and if we are creating a named network space, that too.
+
+-c, --create Create or destroy a named network namespace. When running from
+ the same network namespace as pid 1, this is set automatically.
+ A systemd created private network is in an unnamed network namespace
+ different than pid 1. I haven't found a need for a named network
+ namespace in that case.
+-n NETWORK x.x.x /24 private network to use. If not specified, uses
+ the first unused one starting at 10.173.1
-h, --help Show this help and exit.
-From within systemd network namespace, nat it to the outside. If given
--c, or if in the default network namespace, create a named network
-namepace natted to the current netns.
+From a normal shell:
+
+If we do create the netns, to join it with a shell, we can do (as root)
+/usr/bin/nsenter --mount=/run/mount-namespaces/NAME --net=/var/run/netns/NAME bash
+
+If you don't care about the mount namespace, you can leave that option off.
+
+
+For systemd:
-Also create a named mount namespace under /root/mount_namespaces, so we
-can alter some system config for this namespace. Subsequent systemd
-command lines would be prefixed with:
+From within a systemd network namespace, we nat it to the outside. This
+would be called from ExecStartPre, and/or subsequent units called with
+JoinsNamespaceOf= and PrivateNetwork=true.
-/usr/bin/nsenter --mount=/root/mount_namespaces/NETNS_NAME
+If resolvconf is installed, we create a named mount namespace under
+/run/mount-namespaces, so we can alter some system config for this
+namespace. systemd command lines would be prefixed with:
+/usr/bin/nsenter --mount=/run/mount-namespaces/NS_NAME
-"ip netns new ..." also does a mount namespace, then bind mounts each
-thing in /etc/netns/NETNS_NAME to /etc/NETNS_NAME. Note, for openvpn having it's own
-resolv.conf, this doesn't help much. What we actually want to do is copy
-/run/resolvconf somehwere, then bind mount it on top of /run/resolvconf.
+Note, this means that they can't run as unprivileged users, but once
+systemd 233 comes out, it will have a bind mount option from within unit
+files, so the mount namespace won't be needed for most use cases, and I
+will update the script so that the mount namespace is not created unless a
+flag is passed in. Patches welcome to add that flag before then.
-Once systemd 233 comes out, it will have a bind mount option from within
-unit files, so the mount namespace won't be needed for this use case.
-Recommmended dependency of errhandle to print stack trace on error:
-https://iankelling.org/git/?p=errhandle, set ERRHANDLE_PATH, or put it
-in a directory adjacent to the absolute, resolved directory this file is
-in.
+This script depends on
+https://savannah.nongnu.org/projects/bash-bear-trap/ . Search the script
+for "bash-bear" to see where to install it or to change the installed location.
+
+Background on this project (you can skip if you like):
+
+If we aren't creating a named network namespace, to join the namespace
+with a shell, I use:
+nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash
+
+Note: if I knew how to easily ask systemd what pid a unit has, i would
+do that.
+
+"ip netns new ..." also does a mount namespace, then bind
+mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
+for openvpn having its own resolv.conf by using its user script which
+calls resolvconf, this doesn't help much. What we actually want to do is
+copy /run/resolvconf somewhere then bind mount it on top of
+/run/resolvconf.
+
+
+Note: for debugging, adding set -x is a pretty good option.
+
+TODO: make "start" be idempotent.
+
+Please email me if you have patches, bug reports, or feedback, or if you
+republish this somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
- exit ${1:-0}
+ exit ${1:-0}
}
-## begin arg parsing ##
+#### begin arg parsing ####
create=false
-temp=$(getopt -l help,create hc "$@") || usage 1
+# Only options handled in the case statement below are declared. An
+# undeclared option makes getopt fail, which prints the usage and exits 1.
+temp=$(getopt -o hcn: -l help,create -- "$@") || usage 1
eval set -- "$temp"
while true; do
- case $1 in
- -c|--create) create=true; shift ;;
- -h|--help) usage ;;
- --) shift; break ;;
- *) echo "$0: Internal error!" ; exit 1 ;;
- esac
+  case $1 in
+    -c|--create) create=true; shift ;;
+    -n) network=$2; shift 2 ;;
+    -h|--help) usage ;;
+    --) shift; break ;;
+    *) echo "$0: Internal error!" ; exit 1 ;;
+  esac
done
if (( $# != 2 )); then
- usage 1
+ usage 1
fi
action=$1
-nn=$2 # network namespace / namespace name
-## end arg parsing ##
-
-## begin sanity checking ##
+nn=$2 # namespace name
+#### end arg parsing ####
+#### begin sanity checking ####
install_error=false
if ! type -p ip &>/dev/null; then
- echo "please install the iproute2 package"
- install_error=true
+ echo "please install the iproute2 package"
+ install_error=true
fi
if ! type -p iptables &>/dev/null; then
- echo "please install the iptables package"
- install_error=true
+ echo "please install the iptables package"
+ install_error=true
fi
if $install_error; then
- exit 1
+ exit 1
fi
-
-## end sanity checking ##
-
+#### end sanity checking ####
v0=veth0-$nn
v1=veth1-$nn
ip_base=10.173
-if ! $create && [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
- create=true
-fi
+### begin make the default network namespace be named "default" ###
+mkdir -p /run/netns
target=/run/netns/default
if [[ ! -e $target && ! -L $target ]]; then
- mkdir -p /run/netns
- # make the default network namespace be named
- ln -s /proc/1/ns/net $target
+ # -f to avoid a race condition with running twice
+ ln -sf /proc/1/ns/net $target
fi
+### end make the default network namespace be named "default" ###
-
-ipd() { ip -n default "$@"; }
+if ! $create && [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
+ create=true
+fi
+# otherwise we are already in the network namespace and it's unnamed.
if $create; then
- ipnn() { ip -n $nn "$@"; }
-else
- # we are already in the network namespace and it's unnamed.
- ipnn() { ip "$@"; }
+ ipnnargs="-n $nn"
fi
+
+
+ipd() { ip -n default "$@"; }
+
+# run ip in the network namespace
+ipnn() { ip $ipnnargs "$@"; }
+
+# default network namespace exec
dexec() { ip netns exec default "$@"; }
+# mount namespace exec
+mexec() { /usr/bin/nsenter --mount=/run/mount-namespaces/$nn "$@"; }
-# head -n1 is defensive. Not sure if there is some weird feature
-# for 2 routes to be 0/0.
-gateway_if=$(ipd route list exact 0/0 | head -n1| sed -r 's/.*\s(\S+)\s*$/\1/')
-nat() { dexec iptables -t nat $1 POSTROUTING -o $gateway_if -j MASQUERADE \
- -m comment --comment "systemd network namespace nat"; }
-
-find_network() {
- found=false
- existing=false
- ips="$(ipd addr show | awk '$1 == "inet" {print $2}')"
- for ((i=0; i <= 254; i++)); do
- network=$ip_base.$i
- if printf "%s\n" "$ips" | grep "^${network//./\\.}" >/dev/null; then
- existing=true
- else
- found=true
- break
- fi
- done
+nat() {
+ # Note: duplicated in show()
+ # Note, in a previous commit i specified the output interface with -o,
+ # but that broke things when my gateway interface changed, and I can't
+ # see any advantage to it, so I removed it.
+ dexec iptables -t nat $1 POSTROUTING -s $network.0/24 -j MASQUERADE \
+ -m comment --comment "systemd network namespace nat"
}
-start() {
+# d = default
+diptables-add() {
+ if ! dexec iptables -C "$@" &>/dev/null; then
+ dexec iptables -I "$@"
+ fi
- find_network
- if ! $found; then
- echo "$0: error: no open network found"
- exit 1
- fi
+}
- mkdir -p /root/mount_namespaces
- if ! mountpoint /root/mount_namespaces >/dev/null; then
- mount --bind /root/mount_namespaces /root/mount_namespaces
- mount --make-private /root/mount_namespaces
- fi
- if [[ ! -e /root/mount_namespaces/$nn ]]; then
- touch /root/mount_namespaces/$nn
- fi
- if ! mountpoint /root/mount_namespaces/$nn >/dev/null; then
- unshare --mount=/root/mount_namespaces/$nn
+# Choose the /24 this namespace will use and store its first three octets
+# in the global $network. A value that is already set (for example by -n)
+# is kept. Otherwise take the first $ip_base.N, N in 1..254, that is not
+# the leading three octets of an inet address reported by "ip addr show"
+# in the default namespace. Exits 1 when every candidate is in use.
+find-network() {
+  local ips i found
+  if [[ $network ]]; then
+    return
+  fi
+  found=false
+  ips="$(ipd addr show | awk '$1 == "inet" {print $2}')"
+  for ((i=1; i <= 254; i++)); do
+    network=$ip_base.$i
+    # The trailing \. stops a candidate such as 10.173.1 from matching
+    # addresses in 10.173.10.x or 10.173.100.x.
+    if ! printf "%s\n" "$ips" | grep "^${network//./\\.}\\." >/dev/null; then
+      found=true
+      break
fi
+  done
+  if ! $found; then
+    echo "$0: error: no open network found"
+    exit 1
+  fi
+}
+# Idempotent "ip addr add": add the address only when it is not already
+# present on the device.
+# Arguments: $1 - function that runs ip in the wanted namespace (ipd or ipnn)
+#            $2 - address with prefix length, e.g. 10.173.1.1/24
+#            $3 - device name
+ip-add() {
+  local cmd net dev
+  cmd=$1
+  net=$2
+  dev=$3
+  # The match is discarded so the existence check stays silent, like the
+  # checks in find-network and diptables-add.
+  if ! "$cmd" addr show dev "$dev" | sed 's/^ *//;s/ *$//' \
+    | grep -xF "inet $net scope global $dev" >/dev/null; then
+    "$cmd" addr add "$net" dev "$dev"
+  fi
+}
- if $create; then
- ip netns add $nn
- ip -n $nn link set dev lo up
+start() {
+ find-network
+
+ #### begin mount namespace setup ####
+ mkdir -p /run/mount-namespaces
+ if ! mountpoint /run/mount-namespaces >/dev/null; then
+ mount --bind /run/mount-namespaces /run/mount-namespaces
+ fi
+ # note: This is outside the mount condition because I've mysteriously
+ # had this become shared instead of private, perhaps it
+ # got remounted somehow and lost the setting.
+ mount --make-private /run/mount-namespaces
+ if [[ ! -e /run/mount-namespaces/$nn ]]; then
+ touch /run/mount-namespaces/$nn
+ fi
+ if ! mountpoint /run/mount-namespaces/$nn >/dev/null; then
+ # Here, we specify that we only want mount changes under
+ # this mountpoint to be propagated into the bind, but changes
+ # from within the bind do not propagate to outside the bind.
+ #
+ # slave is documented in.
+ # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
+ # documentation on propagation is a bit weird because it
+ # confusingly talks about binds, namespaces, and mirrors (which
+ # seems to be just another name for bind), shared subtrees
+ # (which seems to be a term for binds and namespaces), and does not
+ # properly specify whether the documentation applies to binds,
+ # namespaces, or both. Notably, propagation for binds is marked
+ # on the original mount point, and propagation for a mount
+ # namespace is marked on mounts within the namespace.
+ unshare --propagation slave --mount=/run/mount-namespaces/$nn /bin/true
+ fi
+
+ #### end mount namespace setup ####
+
+
+ if $create; then
+ if ! ip netns | grep -xF $nn &>/dev/null; then
+ ip netns add $nn
+ fi
+ ip -n $nn link set dev lo up
+ fi
+
+ echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward status=none
+
+ # docker helpfully changes the default FORWARD to drop...
+ diptables-add FORWARD -i $v0 -j ACCEPT
+ diptables-add FORWARD -o $v0 -j ACCEPT
+
+
+ err-cleanup() { stop; }
+ ipnn link add $v0 type veth peer name $v1
+ ipnn link set $v0 netns default
+ ip-add ipd $network.1/24 $v0
+ ipd link set $v0 up
+ nat -C &>/dev/null || nat -A
+ ip-add ipnn $network.2/24 $v1
+ ipnn link set $v1 up
+ cmd="ipnn route add default via $network.1"
+ $cmd
+ fails=0
+ max_fails=2
+ # I've had adding the default route mysteriously fail on boot, so
+ # here we check that it succeeded, do a sleep and a retry.
+ while true; do
+ default_route=$(ipnn route show default | sed -r 's,^[[:space:]]+|[[:space:]]+$,,')
+ if [[ $default_route != "default via $network.1 dev $v1" ]]; then
+ fails=$((fails + 1))
+ else
+ break
+ fi
+ if (( fails >= max_fails )); then
+ echo "$0: ERROR: default route added but not found, retried $max_fails. expected route: 'default via $network.1 dev $v1', found: '$default_route'"
+ # Note: for debugging, if you have a systemd unit which tears down
+ # the newns upon failure, you may want to uncomment the break so
+ # that we proceed and can inspect the system. break
+ exit 1
+ else
+ sleep 1
+ $cmd
fi
+ done
+ if (( fails >= 1 )); then
+ echo "$0: WARNING: route added but not found until retried $max_fails times: $cmd"
+ fi
+ ###### begin setup resolvconf
+ if [[ -e /run/resolvconf ]]; then # resolvconf probably installed
+ resolv_copy=/root/resolvconf-$nn
- echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null
+ # this condition should never happen, just coding defensively
+ if mexec mountpoint /run/resolvconf &>/dev/null; then
+ mexec umount /run/resolvconf
+ fi
+ cp -aT /run/resolvconf $resolv_copy
+ if ! mexec mount -o bind $resolv_copy /run/resolvconf; then
+ echo "error: resolv-conf bindmount failed"
+ exit 1
+ fi
+ # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
+ # in the network namespace, so adjust the address.
+ if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
+ mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
+ mexec resolvconf -u
+ fi
+ # and in debian based distros at least, it runs with --local-service, and needs a restart
+ # to know about the new local network
+ if [[ $(systemctl --no-pager show -p ActiveState dnsmasq ) == ActiveState=active ]]; then
+ systemctl restart dnsmasq
+ fi
- _errcatch_cleanup=stop
- ipnn link add $v0 type veth peer name $v1
- ipnn link set $v0 netns default
- ipd addr add $network.1/24 dev $v0
- ipd link set $v0 up
- nat -C &>/dev/null || nat -A
- ipnn addr add $network.2/24 dev $v1
- ipnn link set $v1 up
- ipnn route add default via $network.1
+ # background: if we did this in openvpn's resolv-conf script, we could guard it in
+ # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
+ # and we could get $nn by
+ # config_basename=${config%%.*}
+ # config_basename=${config_basename##*/}
+ # but dnsmasq forces us to do it earlier.
+ fi # end if [[ -e /run/resolvconf ]]
+ ###### end setup resolvconf
}
stop() {
- if ipd link list $v0 &>/dev/null; then
- # this also deletes $v1 and the route we added.
- ipd link del $v0
- fi
- find_network
- if ! $existing; then
- if nat -C &>/dev/null; then nat -D; fi
- fi
- if $create; then
- ip netns del $nn
- fi
+  # Undo what start set up: the veth pair, the nat rule, both FORWARD
+  # rules, the named network namespace (when we created it), the
+  # resolvconf bind mount and the mount namespace.
+  if [[ ! $network ]]; then
+    # No -n given: recover the network from the address on $v0. This has
+    # to happen before the link is deleted below.
+    network=$(ipd -f inet a show dev "$v0" 2>/dev/null | awk '/inet / {print $2}' | sed -r 's,\.[0-9]+/.*,,' ||:)
+  fi
+  if ipd link list "$v0" &>/dev/null; then
+    # this also deletes $v1 and the route we added.
+    ipd link del "$v0"
+  fi
+  if [[ $network ]] && nat -C &>/dev/null; then nat -D; fi
+  # start inserts an ACCEPT rule for each direction, so remove both.
+  # Failures are ignored because the rules may already be gone.
+  dexec iptables -D FORWARD -i "$v0" -j ACCEPT &>/dev/null ||:
+  dexec iptables -D FORWARD -o "$v0" -j ACCEPT &>/dev/null ||:
+  if $create && [[ -e /var/run/netns/$nn ]]; then
+    ip netns del "$nn"
+  fi
+
+  # not sure this is necessary since we are tearing down the mount namespace
+  if mexec mountpoint /run/resolvconf &>/dev/null; then
+    mexec umount /run/resolvconf
+  fi
+
+  if mountpoint "/run/mount-namespaces/$nn" >/dev/null; then
+    umount "/run/mount-namespaces/$nn"
+  fi
+}
+
+# Report, through m, whether the state that start creates is present: the
+# veth link, the nat rule, both FORWARD rules, the resolvconf bind mount
+# and the mount namespace.
+show() {
+  # Without -n, $network is unset here. Recover it from the address on
+  # $v0 the same way stop does; otherwise the nat check would test the
+  # invalid source "-s .0/24".
+  if [[ ! $network ]]; then
+    network=$(ipd -f inet a show dev "$v0" 2>/dev/null | awk '/inet / {print $2}' | sed -r 's,\.[0-9]+/.*,,' ||:)
+  fi
+  m ipd link list "$v0"
+  if [[ $network ]]; then
+    m dexec iptables -t nat -C POSTROUTING -s "$network.0/24" -j MASQUERADE \
+      -m comment --comment "systemd network namespace nat" ||:
+  else
+    echo "newns: WARNING: no address found on $v0, cannot check the nat rule"
+  fi
+  m dexec iptables -C FORWARD -i "$v0" -j ACCEPT
+  m dexec iptables -C FORWARD -o "$v0" -j ACCEPT
+  m mexec mountpoint /run/resolvconf
+  m mountpoint "/run/mount-namespaces/$nn"
+}
case $action in
- start|stop)
- $action
- ;;
- *)
- echo "$0: error: unsupported action"
- exit 1
- ;;
+ start|stop|show)
+ $action
+ ;;
+ *)
+ echo "$0: error: unsupported action"
+ exit 1
+ ;;
esac