fix docker nat rules properly

[newns] / newns
diff --git a/newns b/newns

index 31f1b528c80bdab325c17631adf5c6245c745895..cdf7e7dafc31cb0d8f266cdebafa15bf631fe779 100755 (executable)
--- a/newns
+++ b/newns
@@ -37,24 +37,28 @@ fi
  
  usage() {
      cat <<EOF
-usage: ${0##*/} [OPTS] start|stop NETNS_NAME
-Setup new or systemd created network namespace with nat and mount namespace
+usage: ${0##*/} [OPTS] start|stop NS_NAME
+Nat a network namespace. systemd friendly
  
--c, --create    Create network namespace. For running outside systemd private net.
--h, --help      Show this help and exit.
+Also creates a mount namespace with a cloned /run/resolvconf.
  
-From within systemd network namespace, nat it to the outside. If given
--c, or if in the default network namespace, create a named network
-namepace natted to the current netns.
+-c, --create    Create a named network namespace. When running from
+                the same network namespace as pid 1, this is set automatically.
+                A systemd created private network is in a network namespace
+                different than pid 1.
+-n NETWORK      x.x.x /24 private network to use. If not specified, uses
+                the first one starting at 10.173.1
+-h, --help      Show this help and exit.
  
-Uses /24 network, finding the first locally unused one starting at
-10.173.0.
+From within a systemd network namespace, nat it to the outside. This
+would be called from ExecStartPre, and/or from subsequent units that set
+JoinsNamespaceOf= and PrivateNetwork=true.
  
  Also create a named mount namespace under /root/mount_namespaces, so we
  can alter some system config for this namespace. Subsequent systemd
  command lines would be prefixed with:
  
-/usr/bin/nsenter --mount=/root/mount_namespaces/NETNS_NAME
+/usr/bin/nsenter --mount=/root/mount_namespaces/NS_NAME
  
  Note, this means that they can't run as unpriveledged users, but once
  systemd 233 comes out, it will have a bind mount option from within unit
@@ -68,13 +72,27 @@ https://iankelling.org/git/?p=errhandle, set ERRHANDLE_PATH, or put it
  in a directory adjacent to the absolute, resolved directory this file is
  in.
  
-Background: "ip netns new ..." also does a mount namespace, then bind
-mounts each file/dir in /etc/netns/NETNS_NAME to /etc/NETNS_NAME. Note,
+Background:
+
+If we aren't creating a named network namespace, to join the namespace
+with a shell, I use:
+nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash
+
+Note: if I knew how to easily ask systemd what pid a unit has, I would
+do that.
+
+If we do create the netns, to join it with a shell, we can do
+/usr/bin/nsenter --mount=/root/mount_namespaces/NAME --net=/var/run/netns/NAME bash
+
+"ip netns new ..." also does a mount namespace, then bind
+mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
  for openvpn having it's own resolv.conf by using it's user script which
  calls resolvconf, this doesn't help much. What we actually want to do is
  copy /run/resolvconf somehwere then bind mount it on top of
  /run/resolvconf.
  
+Note: for debugging, adding set -x is a pretty good option.
+
  Please email me if you have a patches, bugs, feedback, or republish this
  somewhere else: Ian Kelling <ian@iankelling.org>.
  EOF
@@ -82,13 +100,14 @@ EOF
  }
  
  
-## begin arg parsing ##
+#### begin arg parsing ####
  create=false
-temp=$(getopt -l help,create hc "$@") || usage 1
+temp=$(getopt -l help,create hcn: "$@") || usage 1
  eval set -- "$temp"
  while true; do
      case $1 in
          -c|--create) create=true; shift ;;
+        -n) network=$2; shift 2 ;;
          -h|--help) usage ;;
          --) shift; break ;;
          *) echo "$0: Internal error!" ; exit 1 ;;
@@ -99,11 +118,10 @@ if (( $# != 2 )); then
  fi
  
  action=$1
-nn=$2 # network namespace / namespace name
-## end arg parsing ##
-
-## begin sanity checking ##
+nn=$2 # namespace name
+#### end arg parsing ####
  
+#### begin sanity checking ####
  install_error=false
  if ! type -p ip &>/dev/null; then
      echo "please install the iproute2 package"
@@ -116,8 +134,7 @@ fi
  if $install_error; then
      exit 1
  fi
-
-##   end sanity checking ##
+####   end sanity checking ####
  
  
  v0=veth0-$nn
@@ -128,35 +145,62 @@ if ! $create && [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)"
      create=true
  fi
  
+# make the default network namespace be named
  target=/run/netns/default
  if [[ ! -e $target && ! -L $target ]]; then
      mkdir -p /run/netns
-    # make the default network namespace be named
      ln -s /proc/1/ns/net $target
  fi
  
  
  ipd() { ip -n default "$@"; }
  if $create; then
+    # run ip in the network namespace
      ipnn() { ip -n $nn "$@"; }
  else
      # we are already in the network namespace and it's unnamed.
+    # run ip in the network namespace
      ipnn() { ip "$@"; }
  fi
+# default network namespace exec
  dexec() { ip netns exec default "$@"; }
+# mount namespace exec
+mexec() { /usr/bin/nsenter --mount=/root/mount_namespaces/$nn "$@"; }
  
  
-# head -n1 is defensive. Not sure if there is some weird feature
+# background: head -n1 is defensive. Not sure if there is some weird feature
  # for 2 routes to be 0/0.
-gateway_if=$(ipd route list exact 0/0 | head -n1| sed -r 's/.*\s(\S+)\s*$/\1/')
+gateway_if=$(ipd route list exact 0/0 | head -n1| sed -r 's/.*dev\s+(\S+).*/\1/')
+
+if [[ ! $gateway_if ]]; then
+    cat >&2 <<EOF
+$0: error: failed to find gateway interface. No output from:
+ipd route list exact 0/0 | head -n1| sed -r 's/.*dev\s+(\S+).*/\1/'
+output from "ipd route list exact 0/0":
+$(ipd route list exact 0/0)
+EOF
+    exit 1
+fi
+
  nat() { dexec iptables -t nat $1 POSTROUTING -o $gateway_if -j MASQUERADE \
                -m comment --comment "systemd network namespace nat"; }
  
+# d = default
+diptables-add() {
+    if ! dexec iptables -C "$@" &>/dev/null; then
+        dexec iptables -I "$@"
+    fi
+
+}
+
  find_network() {
+    if [[ $network ]]; then
+        return
+    fi
      found=false
      existing=false
      ips="$(ipd addr show | awk '$1 == "inet" {print $2}')"
-    for ((i=0; i <= 254; i++)); do
+    for ((i=1; i <= 254; i++)); do
          network=$ip_base.$i
          if printf "%s\n" "$ips" | grep "^${network//./\\.}" >/dev/null; then
              existing=true
@@ -168,34 +212,56 @@ find_network() {
  }
  
  start() {
-
      find_network
      if ! $found; then
          echo "$0: error: no open network found"
          exit 1
      fi
  
+    #### begin mount namespace setup ####
      mkdir -p /root/mount_namespaces
      if ! mountpoint /root/mount_namespaces >/dev/null; then
          mount --bind /root/mount_namespaces /root/mount_namespaces
-        mount --make-private /root/mount_namespaces
      fi
+    # note: This is outside the mount condition because I've mysteriously
+    # had this become shared instead of private, perhaps it
+    # got remounted somehow and lost the setting.
+    mount --make-private /root/mount_namespaces
      if [[ ! -e /root/mount_namespaces/$nn ]]; then
          touch /root/mount_namespaces/$nn
      fi
      if ! mountpoint /root/mount_namespaces/$nn >/dev/null; then
-        unshare --mount=/root/mount_namespaces/$nn
+        # Here, we specify that we only want mount changes under
+        # this mountpoint to be propagated into the bind, but changes
+        # from within the bind do not propagate to outside the bind.
+        #
+        # slave is documented in:
+        # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
+        # documentation on propagation is a bit weird because it
+        # confusingly talks about binds, namespaces, and mirrors (which
+        # seems to be just another name for bind), shared subtrees
+        # (which seems to be a term for binds and namespaces), and does not
+        # properly specify whether the documentation applies to binds,
+        # namespaces, or both. Notably, propagation for binds is marked
+        # on the original mount point, and propagation for a mount
+        # namespace is marked on mounts within the namespace.
+        unshare --propagation slave --mount=/root/mount_namespaces/$nn /bin/true
      fi
  
+    ####   end mount namespace setup ####
+
  
      if $create; then
          ip netns add $nn
          ip -n $nn link set dev lo up
      fi
  
+    echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null
  
+    # docker helpfully changes the default FORWARD to drop...
+    diptables-add FORWARD -i $v0 -j ACCEPT
+    diptables-add FORWARD -o $v0 -j ACCEPT
  
-    echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null
  
      _errcatch_cleanup=stop
      ipnn link add $v0 type veth peer name $v1
@@ -207,6 +273,33 @@ start() {
      ipnn link set $v1 up
      ipnn route add default via $network.1
  
+    ###### begin setup resolvconf
+    resolv_copy=/root/resolvconf-$nn
+
+    # this condition should never happen, just coding defensively
+    if mexec mountpoint /run/resolvconf &>/dev/null; then
+        mexec umount /run/resolvconf
+    fi
+    cp -aT /run/resolvconf $resolv_copy
+    if ! mexec mount -o bind $resolv_copy /run/resolvconf; then
+        echo "error: resolv-conf bindmount failed"
+        exit 1
+    fi
+    # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
+    # in the network namespace, so adjust the address.
+    if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
+        mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
+        mexec resolvconf -u
+    fi
+    # background: if we did this in openvpn's resolv-conf script, we could guard it in
+    # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
+    # and we could get $nn by
+    # config_basename=${config%%.*}
+    # config_basename=${config_basename##*/}
+    # but dnsmasq forces us to do it earlier.
+    ######  end setup resolvconf
+
+
  }
  
  stop() {
@@ -218,9 +311,16 @@ stop() {
      if ! $existing; then
          if nat -C &>/dev/null; then nat -D; fi
      fi
-    if $create; then
+    dexec iptables -D FORWARD -i $v0 -j ACCEPT ||:
+    if $create && [[ -e /var/run/netns/client ]]; then
          ip netns del $nn
      fi
+
+    # not sure this is necessary since we are tearing down the mount namespace
+    if mexec mountpoint /run/resolvconf &>/dev/null; then
+        mexec umount /run/resolvconf
+    fi
+
      if mountpoint /root/mount_namespaces/$nn >/dev/null; then
          umount /root/mount_namespaces/$nn
      fi