add resolvconf mount namespace
[newns] / newns
1 #!/bin/bash
2 # Copyright (C) 2017 Ian Kelling
3
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7
8 # http://www.apache.org/licenses/LICENSE-2.0
9
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16
# Everything below needs root. When started unprivileged, replace this
# process with the same script under sudo (-E keeps the caller's
# environment) and pass the original arguments through unchanged.
if [[ $EUID != 0 ]]; then
  exec sudo -E "$BASH_SOURCE" "$@"
fi
18
# Load the optional "errhandle" helpers and enable error handling.
#
# ERRHANDLE_PATH may be preset by the caller; otherwise it defaults to the
# "errhandle" directory next to the (symlink-resolved) directory containing
# this script. If both helper files exist they are sourced and errcatch is
# called; if either is missing we fall back to plain errexit/pipefail plus an
# ERR trap that reports the failing command.
#
# All path expansions are quoted so a location containing whitespace or glob
# characters works.
if [[ ! $ERRHANDLE_PATH ]]; then
  ERRHANDLE_PATH=$(readlink -f "${BASH_SOURCE[0]}")
  ERRHANDLE_PATH=$(readlink -f "${ERRHANDLE_PATH%/*}/../errhandle")
fi
err_sourced=true
for p in "$ERRHANDLE_PATH"/{errcatch-function,bash-trace-function}; do
  if [[ -e $p ]]; then
    # shellcheck source=/dev/null
    source "$p"
  else
    err_sourced=false
  fi
done
if $err_sourced; then
  errcatch
else
  set -eE -o pipefail
  trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
fi
37
# Print the help text and exit.
# Arguments: $1 - exit status (default 0).
# Outputs:   help text on stdout when the status is 0 (explicit --help),
#            on stderr otherwise, so usage errors do not pollute stdout.
# Never returns; always exits with the given status.
usage() {
  local status=${1:-0}
  local fd=1
  if [[ $status != 0 ]]; then
    fd=2
  fi
  cat >&"$fd" <<EOF
usage: ${0##*/} [OPTS] start|stop NS_NAME
Nat a network namespace. systemd friendly

Also creates a mount namespace with a cloned /run/resolvconf.

-c, --create Create a named network namespace. When running from
the same network namespace as pid 1, this is set automatically.
A systemd created private network is in a network namespace
different than pid 1.
-n NETWORK x.x.x /24 private network to use. If not specified, uses
the first one starting at 10.173.1
-h, --help Show this help and exit.

From within a systemd network namespace, nat it to the outside. This
would be called from ExecStartPre, and or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.

Also create a named mount namespace under /root/mount_namespaces, so we
can alter some system config for this namespace. Subsequent systemd
command lines would be prefixed with:

/usr/bin/nsenter --mount=/root/mount_namespaces/NS_NAME

Note, this means that they can't run as unpriveledged users, but once
systemd 233 comes out, it will have a bind mount option from within unit
files, so the mount namespace won't be needed for most use cases, and I
will update the script to that the mount namespace not created unless a
flag is passed in. Patch welcome to add that flag before then.

A recommmended dependency of this script is my other repo named "errhandle",
which prints stack trace on error, and calls a cleanup function:
https://iankelling.org/git/?p=errhandle, set ERRHANDLE_PATH, or put it
in a directory adjacent to the absolute, resolved directory this file is
in.

Background:

This script does not make the namespace be named like ip does, because
the naming is not necessary, although it could have been done with some
more work. For debugging and joining the namespace with a bash shell, I
use nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash. Note: if I
knew how to easily ask systemd what pid a unit has, i would do that.

"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having it's own resolv.conf by using it's user script which
calls resolvconf, this doesn't help much. What we actually want to do is
copy /run/resolvconf somehwere then bind mount it on top of
/run/resolvconf.

Note: for debugging, adding set -x is a pretty good option.

Please email me if you have a patches, bugs, feedback, or republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
  exit "$status"
}
97
98
#### begin arg parsing ####
# Options are normalised by util-linux getopt, then consumed in the loop.
# Results: create (true/false), network (only set when -n is given),
# action (start|stop) and nn (namespace name).
create=false
# Explicit -o/-l/-- keeps getopt's own options apart from the script's
# arguments; -n makes getopt's diagnostics carry this script's name.
temp=$(getopt -n "${0##*/}" -o hcn: -l help,create -- "$@") || usage 1
eval set -- "$temp"
while true; do
  case $1 in
    -c|--create) create=true; shift ;;
    -n) network=$2; shift 2 ;;
    -h|--help) usage ;;
    --) shift; break ;;
    *) echo "$0: Internal error!" >&2; exit 1 ;;
  esac
done
if (( $# != 2 )); then
  usage 1
fi

action=$1
nn=$2 # namespace name

# An empty name would produce nonsensical interface and file names.
if [[ ! $nn ]]; then
  usage 1
fi
# Reject a bad action here, before anything on the system is touched.
case $action in
  start|stop) ;;
  *)
    echo "$0: error: unsupported action" >&2
    exit 1
    ;;
esac
#### end arg parsing ####
119
#### begin sanity checking ####
# Each entry is "command:package". Report every missing command before
# exiting so the user sees the full list in one run.
install_error=false
for tool_pkg in ip:iproute2 iptables:iptables; do
  if type -p "${tool_pkg%%:*}" &>/dev/null; then
    continue
  fi
  echo "please install the ${tool_pkg#*:} package"
  install_error=true
done
unset tool_pkg
if $install_error; then
  exit 1
fi
#### end sanity checking ####
134
135
# Base of the automatically chosen /24 networks, and the names of the two
# veth interfaces, both derived from the namespace name.
ip_base=10.173
v0=veth0-$nn
v1=veth1-$nn

# If --create was not requested but we share pid 1's network namespace,
# turn it on.
if ! $create; then
  if [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
    create=true
  fi
fi

# make the default network namespace be named
target=/run/netns/default
if ! [[ -e $target || -L $target ]]; then
  mkdir -p /run/netns
  ln -s /proc/1/ns/net $target
fi
150
151
# Run ip in the network namespace named "default".
ipd() { ip -n default "$@"; }

# Run ip in the target network namespace. When $create is true the
# namespace is a named one ($nn); otherwise we are already inside it and it
# is unnamed, so plain ip is used.
ipnn() {
  if $create; then
    ip -n "$nn" "$@"
  else
    ip "$@"
  fi
}

# default network namespace exec
dexec() { ip netns exec default "$@"; }

# mount namespace exec
mexec() { /usr/bin/nsenter --mount="/root/mount_namespaces/$nn" "$@"; }
165
166
# Read route lines on stdin and print the word following "dev" from the
# first line that has one. Prints nothing when no line names a device.
# All input is consumed, so the producer never sees a closed pipe.
_first_route_dev() {
  awk '!seen { for (i = 1; i < NF; i++) if ($i == "dev") { print $(i + 1); seen = 1; break } }'
}

# background: taking only the first match is defensive. Not sure if there is
# some weird feature for 2 routes to be 0/0.
gateway_if=$(ipd route list exact 0/0 | _first_route_dev)

# Check, add or delete the MASQUERADE rule on the default-route interface.
# Arguments: $1 - iptables rule action: -C, -A or -D.
# Returns:   iptables' status, or 1 with a message on stderr when no
#            default-route interface is known.
nat() {
  if [[ ! $gateway_if ]]; then
    echo "$0: error: no default route found, cannot determine nat interface" >&2
    return 1
  fi
  dexec iptables -t nat "$1" POSTROUTING -o "$gateway_if" -j MASQUERADE \
    -m comment --comment "systemd network namespace nat"
}
172
# Choose the /24 prefix for this namespace and report on address usage in
# the default network namespace.
# Globals read:    ip_base, network (kept as is when already set, e.g. by -n)
# Globals written: network  - first free $ip_base.N (N = 1..254) if unset
#                  found    - true when network is usable, false when every
#                             candidate is taken
#                  existing - true when any $ip_base.* address is configured
# Only $ip_base.* addresses are considered for "existing"; namespaces given
# some other prefix with -n are not detected.
find_network() {
  local ips i candidate
  found=false
  existing=false
  ips=$(ipd addr show | awk '$1 == "inet" {print $2}')

  # The trailing \. matters: without it 10.173.1 would also match 10.173.10.x.
  if grep -q "^${ip_base//./\\.}\." <<<"$ips"; then
    existing=true
  fi

  if [[ $network ]]; then
    found=true
    return 0
  fi

  for ((i = 1; i <= 254; i++)); do
    candidate=$ip_base.$i
    if ! grep -q "^${candidate//./\\.}\." <<<"$ips"; then
      network=$candidate
      found=true
      return 0
    fi
  done
  # No free candidate: keep the value the exhausted search ends on.
  network=$ip_base.254
}
190
# Set up the mount namespace, the veth pair, forwarding/NAT rules and a
# private copy of /run/resolvconf for namespace $nn.
# Globals read: nn, create, v0, v1, network/found (via find_network).
# Exits 1 with a message on stderr when no network is free or the resolvconf
# bind mount fails.
start() {
  local resolv_copy

  find_network
  # Only an explicit "false" means the search ran and found nothing.
  if [[ ${found-} == false ]]; then
    echo "$0: error: no open network found" >&2
    exit 1
  fi

  #### begin mount namespace setup ####
  mkdir -p /root/mount_namespaces
  if ! mountpoint /root/mount_namespaces >/dev/null; then
    mount --bind /root/mount_namespaces /root/mount_namespaces
  fi
  # note: This is outside the mount condition because I've mysteriously
  # had this become shared instead of private, perhaps it
  # got remounted somehow and lost the setting.
  mount --make-private /root/mount_namespaces
  if [[ ! -e /root/mount_namespaces/$nn ]]; then
    touch "/root/mount_namespaces/$nn"
  fi
  if ! mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    # documentation on propagation is a bit weird because it
    # confusingly talks about binds, namespaces, and mirrors (which
    # seems to be just another name for bind), shared subtrees
    # (which seems to a term for binds and namespaces), and does not
    # properly specify whether the documentation applies to binds,
    # namespaces, or both. Notably, propagation for binds is marked
    # on the original mount point, and propagation for a mount
    # namespace is marked on mounts within the namespace. Here, we
    # specify that we want mount changes propagated to us, but not
    # back.
    unshare --propagation slave --mount="/root/mount_namespaces/$nn" /bin/true
  fi
  #### end mount namespace setup ####

  if $create; then
    ip netns add "$nn"
    ip -n "$nn" link set dev lo up
  fi

  echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null

  # docker helpfully changes the default FORWARD to drop...
  if ! dexec iptables -C FORWARD -i "$v0" -j ACCEPT &>/dev/null; then
    dexec iptables -A FORWARD -i "$v0" -j ACCEPT
  fi

  # Name stop as the cleanup hook for the errhandle helpers from here on.
  _errcatch_cleanup=stop
  ipnn link add "$v0" type veth peer name "$v1"
  ipnn link set "$v0" netns default
  ipd addr add "$network.1/24" dev "$v0"
  ipd link set "$v0" up
  nat -C &>/dev/null || nat -A
  ipnn addr add "$network.2/24" dev "$v1"
  ipnn link set "$v1" up
  ipnn route add default via "$network.1"

  ###### begin setup resolvconf
  resolv_copy=/root/resolvconf-$nn

  # this condition should never happen, just coding defensively
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi
  cp -aT /run/resolvconf "$resolv_copy"
  if ! mexec mount -o bind "$resolv_copy" /run/resolvconf; then
    echo "error: resolv-conf bindmount failed" >&2
    exit 1
  fi
  # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
  # in the network namespace, so adjust the address.
  if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
    mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
    mexec resolvconf -u
  fi
  # background: if we did this in openvpn's resolv-conf script, we could guard it in
  # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
  # and we could get $nn by
  # config_basename=${config%%.*}
  # config_basename=${config_basename##*/}
  # but dnsmasq forces us to do it earlier.
  ###### end setup resolvconf
}
277
# Tear down what start created for namespace $nn: the veth pair, the
# firewall rules, the named network namespace (when we created one) and the
# mount namespace.
# Globals read: nn, create, v0, existing (via find_network).
stop() {
  if ipd link list "$v0" &>/dev/null; then
    # this also deletes $v1 and the route we added.
    ipd link del "$v0"
  fi
  find_network
  # Drop the masquerade rule only when find_network explicitly reported
  # existing=false; if it left the flag unset, keep the rule.
  if [[ ${existing-} == false ]]; then
    if nat -C &>/dev/null; then nat -D; fi
  fi
  # Best effort: the rule may already be gone.
  dexec iptables -D FORWARD -i "$v0" -j ACCEPT ||:
  if $create; then
    ip netns del "$nn"
  fi

  # not sure this is necessary since we are tearing down the mount namespace
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi

  if mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    umount "/root/mount_namespaces/$nn"
  fi
}
301
# Run the function named by the requested action; anything other than
# start or stop is an error.
if [[ $action == start || $action == stop ]]; then
  "$action"
else
  echo "$0: error: unsupported action"
  exit 1
fi