usage() {
cat <<EOF
usage: ${0##*/} [OPTS] start|stop NS_NAME
-Setup new or systemd created network namespace with nat and mount namespace
+Nat a network namespace. systemd friendly
--c, --create Create network namespace. For running outside systemd private net.
+Also creates a mount namespace with a cloned /run/resolvconf.
+
+-c, --create Create a named network namespace. When running from
+ the same network namespace as pid 1, this is set automatically.
+ A systemd created private network is in a network namespace
+ different than pid 1.
+-n NETWORK x.x.x /24 private network to use. If not specified, uses
+ the first one starting at 10.173.1
-h, --help Show this help and exit.
From within a systemd network namespace, nat it to the outside. This
would be called from ExecStartPre, and or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.
-If given -c, or if in the default network namespace, create a named
-network namepace natted to the current netns.
-
-Uses /24 network, finding the first locally unused one starting at
-10.173.0.
-
Also create a named mount namespace under /root/mount_namespaces, so we
can alter some system config for this namespace. Subsequent systemd
command lines would be prefixed with:
in a directory adjacent to the absolute, resolved directory this file is
in.
-Background: "ip netns new ..." also does a mount namespace, then bind
+Background:
+
+If we aren't creating a named network namespace, to join the namespace
+with a shell, I use:
+nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash
+
+Note: if I knew how to easily ask systemd what pid a unit has, I would
+do that.
+
+If we do create the netns, to join it with a shell, we can do
+/usr/bin/nsenter --mount=/root/mount_namespaces/NAME --net=/var/run/netns/NAME bash
+
+"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having its own resolv.conf by using its user script which
calls resolvconf, this doesn't help much. What we actually want to do is
copy /run/resolvconf somewhere then bind mount it on top of
/run/resolvconf.
+Note: for debugging, adding set -x is a pretty good option.
+
Please email me if you have any patches, bugs, or feedback, or if you republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
#### begin arg parsing ####
create=false
-temp=$(getopt -l help,create hc "$@") || usage 1
+temp=$(getopt -l help,create hcn: "$@") || usage 1
eval set -- "$temp"
while true; do
case $1 in
-c|--create) create=true; shift ;;
+ -n) network=$2; shift 2 ;;
-h|--help) usage ;;
--) shift; break ;;
*) echo "$0: Internal error!" ; exit 1 ;;
# Run ip against the network namespace named "default", forwarding every
# argument unchanged.
ipd() {
  ip -n default "$@"
}
if $create; then
+ # run ip in the network namespace
# Run ip against the named network namespace $nn, forwarding all arguments.
# $nn is quoted so the name always reaches ip as exactly one argument,
# without word splitting or glob expansion.
ipnn() {
  ip -n "$nn" "$@"
}
else
# we are already in the network namespace and it's unnamed.
+ # run ip in the network namespace
# No namespace switch is needed in this branch: hand the caller's arguments
# straight to ip.
ipnn() {
  ip "$@"
}
fi
+# default network namespace exec
# Execute the given command line through `ip netns exec` in the network
# namespace named "default".
dexec() {
  ip netns exec default "$@"
}
+# mount namespace exec: run a command via nsenter in the mount namespace
+# file /root/mount_namespaces/$nn, quoting the path so it stays one word
+mexec() { /usr/bin/nsenter --mount="/root/mount_namespaces/$nn" "$@"; }
# background: head -n1 is defensive. Not sure if there is some weird feature
# for 2 routes to be 0/0.
-gateway_if=$(ipd route list exact 0/0 | head -n1| sed -r 's/.*\s(\S+)\s*$/\1/')
+gateway_if=$(ipd route list exact 0/0 | head -n1| sed -r 's/.*dev\s+(\S+).*/\1/')
+
+if [[ ! $gateway_if ]]; then
+ cat >&2 <<EOF
+$0: error: failed to find gateway interface. No output from:
+ipd route list exact 0/0 | head -n1| sed -r 's/.*dev\s+(\S+).*/\1/'
+output from "ipd route list exact 0/0":
+$(ipd route list exact 0/0)
+EOF
+ exit 1
+fi
+
# Apply one iptables action to the nat-table POSTROUTING MASQUERADE rule for
# traffic leaving through $gateway_if. The command is run through dexec.
# Globals:   gateway_if (read) - outgoing interface name
# Arguments: $1 - iptables rule action flag, e.g. -C (check) or -D (delete)
# Returns:   exit status of the dexec/iptables invocation
# Both expansions are quoted so each reaches iptables as a single argument.
nat() {
  dexec iptables -t nat "$1" POSTROUTING -o "$gateway_if" -j MASQUERADE \
    -m comment --comment "systemd network namespace nat"
}
+# d = default
+# Add an iptables rule via dexec unless it already exists: the rule spec in
+# the arguments is first checked with -C and inserted with -I only when
+# that check fails.
+diptables-add() {
+ if ! dexec iptables -C "$@" &>/dev/null; then
+ dexec iptables -I "$@"
+ fi
+
+}
+
find_network() {
+ if [[ $network ]]; then
+ return
+ fi
found=false
existing=false
ips="$(ipd addr show | awk '$1 == "inet" {print $2}')"
- for ((i=0; i <= 254; i++)); do
+ for ((i=1; i <= 254; i++)); do
network=$ip_base.$i
if printf "%s\n" "$ips" | grep "^${network//./\\.}" >/dev/null; then
existing=true
mkdir -p /root/mount_namespaces
if ! mountpoint /root/mount_namespaces >/dev/null; then
mount --bind /root/mount_namespaces /root/mount_namespaces
- mount --make-private /root/mount_namespaces
fi
+ # note: This is outside the mount condition because I've mysteriously
+ # had this become shared instead of private, perhaps it
+ # got remounted somehow and lost the setting.
+ mount --make-private /root/mount_namespaces
if [[ ! -e /root/mount_namespaces/$nn ]]; then
touch /root/mount_namespaces/$nn
fi
if ! mountpoint /root/mount_namespaces/$nn >/dev/null; then
- unshare --mount=/root/mount_namespaces/$nn
+ # Here, we specify that we only want mount changes under
+ # this mountpoint to be propagated into the bind, but changes
+ # from within the bind do not propagate to outside the bind.
+ #
+ # slave is documented in.
+ # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
+ # documentation on propagation is a bit weird because it
+ # confusingly talks about binds, namespaces, and mirrors (which
+ # seems to be just another name for bind), shared subtrees
+ # (which seems to be a term for binds and namespaces), and does not
+ # properly specify whether the documentation applies to binds,
+ # namespaces, or both. Notably, propagation for binds is marked
+ # on the original mount point, and propagation for a mount
+ # namespace is marked on mounts within the namespace.
+ unshare --propagation slave --mount=/root/mount_namespaces/$nn /bin/true
fi
+
#### end mount namespace setup ####
echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null
+ # docker helpfully changes the default FORWARD to drop...
+ diptables-add FORWARD -i $v0 -j ACCEPT
+ diptables-add FORWARD -o $v0 -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT
+
+
_errcatch_cleanup=stop
ipnn link add $v0 type veth peer name $v1
ipnn link set $v0 netns default
ipnn link set $v1 up
ipnn route add default via $network.1
+ ###### begin setup resolvconf
+ resolv_copy=/root/resolvconf-$nn
+
+ # this condition should never happen, just coding defensively
+ if mexec mountpoint /run/resolvconf &>/dev/null; then
+ mexec umount /run/resolvconf
+ fi
+ cp -aT /run/resolvconf $resolv_copy
+ if ! mexec mount -o bind $resolv_copy /run/resolvconf; then
+ echo "error: resolv-conf bindmount failed"
+ exit 1
+ fi
+ # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
+ # in the network namespace, so adjust the address.
+ if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
+ mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
+ mexec resolvconf -u
+ fi
+ # background: if we did this in openvpn's resolv-conf script, we could guard it in
+ # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
+ # and we could get $nn by
+ # config_basename=${config%%.*}
+ # config_basename=${config_basename##*/}
+ # but dnsmasq forces us to do it earlier.
+ ###### end setup resolvconf
+
+
}
stop() {
if ! $existing; then
if nat -C &>/dev/null; then nat -D; fi
fi
- if $create; then
+ dexec iptables -D FORWARD -i $v0 -j ACCEPT ||:
+ if $create && [[ -e /var/run/netns/client ]]; then
ip netns del $nn
fi
+
+ # not sure this is necessary since we are tearing down the mount namespace
+ if mexec mountpoint /run/resolvconf &>/dev/null; then
+ mexec umount /run/resolvconf
+ fi
+
if mountpoint /root/mount_namespaces/$nn >/dev/null; then
umount /root/mount_namespaces/$nn
fi