fix regression
[newns] / newns
1 #!/bin/bash
2 # Copyright (C) 2017 Ian Kelling
3
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7
8 # http://www.apache.org/licenses/LICENSE-2.0
9
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16
# Everything below needs root. When started as another user, replace this
# process with the same script run through sudo, keeping the caller's
# environment (-E) and arguments.
if [[ $EUID != 0 ]]; then
  exec sudo -E "${BASH_SOURCE[0]}" "$@"
fi

# https://savannah.nongnu.org/projects/bash-bear-trap/
# errexit is on only while sourcing, so a missing or broken library stops
# the script right here; it is switched off again afterwards.
# NOTE(review): presumably bash-bear installs its own error trap, which is
# why errexit is not kept on -- confirm against the project page.
set -e
source /usr/local/lib/bash-bear
set +e
21
# Run a command for "show", printing the command line and a warning when it
# looks unhealthy.
#
# Arguments: the command and its arguments.
# Outputs:   "newns: <command line>" on stdout, followed by a warning line
#            when the command exits non-zero or when it succeeds without
#            printing anything. The command's own output (stdout and
#            stderr) is captured and not shown.
m() {
  local out rc=0
  printf "newns: %s\n" "$*"
  # Capture the status right at the failure. Reading $? inside an
  # "if ! cmd" branch yields the status of the negation, which is always 0.
  out=$("$@" 2>&1) || rc=$?
  if (( rc != 0 )); then
    echo "newns: WARNING: last command exit code: $rc"
  elif [[ -z $out ]]; then
    echo "newns: WARNING: no output from last command"
  fi
}
31
# Print the help text and exit.
#
# Arguments: $1 - exit status, defaults to 0.
# Outputs:   the help text, on stdout when the status is 0 (help was asked
#            for) and on stderr otherwise (usage shown because of an error).
# Never returns: always exits with the given status.
usage() {
  local status=${1:-0}
  local fd=1
  if [[ $status != 0 ]]; then
    fd=2
  fi
  cat >&"$fd" <<EOF
usage: ${0##*/} [OPTS] start|stop|show NS_NAME
Nat a network namespace. systemd friendly

Also creates a mount namespace with a cloned /run/resolvconf.

Arguments:

start|stop: these do what they say.

show: Show the state we expected to be there or not there based on
start/stop. This is useful for debugging.

NS_NAME: We use this to name the interfaces we create, the mount
namespace, and if we are creating a named network space, that too.

-c, --create Create or destroy a named network namespace. When running from
the same network namespace as pid 1, this is set automatically.
A systemd created private network is in an unnamed network namespace
different than pid 1. I haven't found a need for a named network
namespace in that case.
-n NETWORK x.x.x /24 private network to use. If not specified, uses
the first unused one starting at 10.173.1
-h, --help Show this help and exit.

From a normal shell:

If we do create the netns, to join it with a shell, we can do (as root)
/usr/bin/nsenter --mount=/run/mount-namespaces/NAME --net=/var/run/netns/NAME bash

If you dont care about the mount namespace, you can leave that option off.


For systemd:

From within a systemd network namespace, we nat it to the outside. This
would be called from ExecStartPre, and or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.

If resolvconf is installed, we create a named mount namespace under
/run/mount-namespaces, so we can alter some system config for this
namespace. systemd command lines would be prefixed with:

/usr/bin/nsenter --mount=/run/mount-namespaces/NS_NAME

Note, this means that they can't run as unpriveledged users, but once
systemd 233 comes out, it will have a bind mount option from within unit
files, so the mount namespace won't be needed for most use cases, and I
will update the script to that the mount namespace not created unless a
flag is passed in. Patch welcome to add that flag before then.


This script has a dependency
https://savannah.nongnu.org/projects/bash-bear-trap/ . Search the script for "bash-bear" to see where to install or modify the installed location.


Background on this project (you can skip if you like):

If we aren't creating a named network namespace, to join the namespace
with a shell, I use:
nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash

Note: if I knew how to easily ask systemd what pid a unit has, i would
do that.

"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having it's own resolv.conf by using it's user script which
calls resolvconf, this doesn't help much. What we actually want to do is
copy /run/resolvconf somehwere then bind mount it on top of
/run/resolvconf.


Note: for debugging, adding set -x is a pretty good option.

TODO: make "start" be idempotent.

Please email me if you have a patches, bugs, feedback, or republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
  exit "$status"
}
115
116
#### begin arg parsing ####
create=false
# Start from a known-empty value: the script re-executes itself with
# "sudo -E", so a stray "network" variable in the caller's environment
# would otherwise be picked up as if -n had been given.
network=
# Only options that have a case arm below are accepted. The earlier option
# string also listed "d", so "-d" passed getopt and then hit the
# "Internal error!" arm.
temp=$(getopt -n "${0##*/}" -o hcn: -l help,create -- "$@") || usage 1
# getopt emits a shell-quoted argument list; eval is how it is meant to be
# re-read into the positional parameters.
eval set -- "$temp"
while true; do
  case $1 in
    -c | --create)
      create=true
      shift
      ;;
    -n)
      network=$2
      shift 2
      ;;
    -h | --help) usage ;;
    --)
      shift
      break
      ;;
    *)
      echo "$0: Internal error!" >&2
      exit 1
      ;;
  esac
done
# Exactly two operands are required: the action and the namespace name.
if (( $# != 2 )); then
  usage 1
fi

action=$1
nn=$2 # namespace name
#### end arg parsing ####
137
#### begin sanity checking ####
# Report every missing tool before giving up, so that one run tells the
# user everything that has to be installed. Entries are command:package.
install_error=false
for tool_pkg in ip:iproute2 iptables:iptables; do
  if ! type -p "${tool_pkg%%:*}" &>/dev/null; then
    echo "please install the ${tool_pkg#*:} package" >&2
    install_error=true
  fi
done
if [[ $install_error == true ]]; then
  exit 1
fi
#### end sanity checking ####
152
# Names of the two ends of the veth pair, and the first two octets used
# when a network has to be picked automatically.
ip_base=10.173
v0=veth0-$nn
v1=veth1-$nn


### begin make the default network namespace be named "default" ###
target=/run/netns/default
mkdir -p "${target%/*}"
# Leave the entry alone when something is already there, including a
# dangling symlink (which -e alone would not see).
if ! [[ -e $target || -L $target ]]; then
  # -f to avoid a race condition with running twice
  ln -sf /proc/1/ns/net "$target"
fi
### end make the default network namespace be named "default" ###
166
# When we share pid 1's network namespace there is no namespace of our own
# yet, so one has to be created (same effect as passing --create).
if [[ $create != true && $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
  create=true
fi
# otherwise we are already in the network namespace and it's unnamed.

# Extra arguments for "ip" that select the target namespace. Kept as an
# array so the namespace name is passed as a single word; empty when we
# are already inside the (unnamed) namespace.
ipnnargs=()
if [[ $create == true ]]; then
  ipnnargs=(-n "$nn")
fi


# run ip in the default network namespace
ipd() { ip -n default "$@"; }

# run ip in the network namespace
ipnn() { ip "${ipnnargs[@]}" "$@"; }

# default network namespace exec
dexec() { ip netns exec default "$@"; }
# mount namespace exec
mexec() { /usr/bin/nsenter --mount="/run/mount-namespaces/$nn" "$@"; }
185
186
# Apply an iptables action to the masquerade rule for the namespace's /24,
# in the default network namespace.
#
# Globals:   network (read) - first three octets of the private network
# Arguments: $1 - iptables rule command: -A (add), -C (check) or -D (delete)
# Returns:   the status of iptables; 2 when no action is given.
nat() {
  local action=${1:-}
  if [[ -z $action ]]; then
    echo "$0: nat: missing iptables action (-A, -C or -D)" >&2
    return 2
  fi
  # Note: duplicated in show()
  # Note, in a previous commit i specified the output interface with -o,
  # but that broke things when my gateway interface changed, and I can't
  # see any advantage to it, so I removed it.
  dexec iptables -t nat "$action" POSTROUTING -s "$network.0/24" -j MASQUERADE \
    -m comment --comment "systemd network namespace nat"
}
195
# d = default
# Insert an iptables rule in the default network namespace unless the
# identical rule already exists there.
#
# Arguments: the rule specification, starting with the chain name.
# Returns:   0 when the rule was already present, otherwise the status of
#            the insert.
diptables-add() {
  dexec iptables -C "$@" &>/dev/null || dexec iptables -I "$@"
}
203
# Choose the /24 to use for the namespace, unless one was given with -n.
#
# Globals:   network (read/written) - first three octets, e.g. 10.173.1
#            ip_base (read)         - first two octets to search under
# Outputs:   an error message on stderr when every candidate is taken
# Returns:   0 once network is set; exits 1 when no network is free.
find-network() {
  local ips candidate i
  if [[ -n $network ]]; then
    return 0
  fi
  # Every IPv4 address configured in the default namespace, one per line,
  # in address/prefix form.
  ips=$(ipd addr show | awk '$1 == "inet" {print $2}')
  for ((i = 1; i <= 254; i++)); do
    candidate=$ip_base.$i
    # A candidate is taken only when an address starts with all three of
    # its octets followed by a dot. Without the trailing dot, 10.173.10.x
    # would wrongly mark 10.173.1 as used.
    if [[ $'\n'"$ips" != *$'\n'"$candidate".* ]]; then
      network=$candidate
      return 0
    fi
  done
  echo "$0: error: no open network found" >&2
  exit 1
}
222
# Bring the namespace up: create the named mount namespace, optionally the
# named network namespace, a veth pair between it and the default
# namespace, forwarding and masquerade rules, a default route, and (when
# /run/resolvconf exists) a private copy of the resolvconf state.
#
# Globals:   nn, v0, v1, create (read); network (read, set by find-network)
# Outputs:   warnings and errors on stderr
# Returns:   0 on success; exits 1 when the default route never shows up or
#            the resolvconf bind mount fails.
start() {
  local fails=0 max_fails=2
  local default_route expected_route resolv_copy
  local -a route_cmd

  find-network

  #### begin mount namespace setup ####
  mkdir -p /run/mount-namespaces
  if ! mountpoint /run/mount-namespaces >/dev/null; then
    mount --bind /run/mount-namespaces /run/mount-namespaces
  fi
  # note: This is outside the mount condition because I've mysteriously
  # had this become shared instead of private, perhaps it
  # got remounted somehow and lost the setting.
  mount --make-private /run/mount-namespaces
  if [[ ! -e /run/mount-namespaces/$nn ]]; then
    touch "/run/mount-namespaces/$nn"
  fi
  if ! mountpoint "/run/mount-namespaces/$nn" >/dev/null; then
    # Here, we specify that we only want mount changes under
    # this mountpoint to be propagated into the bind, but changes
    # from within the bind do not propagate to outside the bind.
    #
    # slave is documented in.
    # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
    # documentation on propagation is a bit weird because it
    # confusingly talks about binds, namespaces, and mirrors (which
    # seems to be just another name for bind), shared subtrees
    # (which seems to be a term for binds and namespaces), and does not
    # properly specify whether the documentation applies to binds,
    # namespaces, or both. Notably, propagation for binds is marked
    # on the original mount point, and propagation for a mount
    # namespace is marked on mounts within the namespace.
    unshare --propagation slave --mount="/run/mount-namespaces/$nn" /bin/true
  fi
  #### end mount namespace setup ####

  if $create; then
    ip netns add "$nn"
    ip -n "$nn" link set dev lo up
  fi

  # Enable IPv4 forwarding in the default namespace.
  echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward status=none

  # docker helpfully changes the default FORWARD to drop...
  diptables-add FORWARD -i "$v0" -j ACCEPT
  diptables-add FORWARD -o "$v0" -j ACCEPT

  # NOTE(review): presumably a hook that bash-bear calls when a command
  # fails, so a half-finished start is torn down -- confirm against the
  # bash-bear documentation.
  err-cleanup() { stop; }

  # The pair is created inside the namespace; $v0 is then moved out to
  # the default namespace and becomes the gateway ($network.1), while $v1
  # stays inside as $network.2.
  ipnn link add "$v0" type veth peer name "$v1"
  ipnn link set "$v0" netns default
  ipd addr add "$network.1/24" dev "$v0"
  ipd link set "$v0" up
  nat -C &>/dev/null || nat -A
  ipnn addr add "$network.2/24" dev "$v1"
  ipnn link set "$v1" up

  route_cmd=(ipnn route add default via "$network.1")
  expected_route="default via $network.1 dev $v1"
  "${route_cmd[@]}"
  # I've had adding the default route mysteriously fail on boot, so
  # here we check that it succeeded, do a sleep and a retry.
  while true; do
    # Trim whitespace on both ends ("g": the two alternatives match at
    # different places, so a single substitution would only trim one).
    default_route=$(ipnn route show default | sed -r 's,^[[:space:]]+|[[:space:]]+$,,g')
    if [[ $default_route == "$expected_route" ]]; then
      break
    fi
    fails=$((fails + 1))
    if (( fails >= max_fails )); then
      echo "$0: ERROR: default route added but not found, retried $max_fails. expected route: '$expected_route', found: '$default_route'" >&2
      # Note: for debugging, if you have a systemd unit which tears down
      # the newns upon failure, you may want to uncomment the break so
      # that we proceed and can inspect the system.
      # break
      exit 1
    fi
    sleep 1
    "${route_cmd[@]}"
  done
  if (( fails >= 1 )); then
    # Report how many retries were actually needed, not the limit.
    echo "$0: WARNING: route added but not found until retried $fails times: ${route_cmd[*]}" >&2
  fi


  ###### begin setup resolvconf
  if [[ -e /run/resolvconf ]]; then # resolvconf probably installed
    resolv_copy=/root/resolvconf-$nn

    # this condition should never happen, just coding defensively
    if mexec mountpoint /run/resolvconf &>/dev/null; then
      mexec umount /run/resolvconf
    fi
    cp -aT /run/resolvconf "$resolv_copy"
    if ! mexec mount -o bind "$resolv_copy" /run/resolvconf; then
      echo "error: resolv-conf bindmount failed" >&2
      exit 1
    fi
    # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
    # in the network namespace, so adjust the address.
    if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
      mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
      mexec resolvconf -u
    fi
    # and in debian based distros at least, it runs with --local-service, and needs a restart
    # to know about the new local network
    if [[ $(systemctl --no-pager show -p ActiveState dnsmasq) == ActiveState=active ]]; then
      systemctl restart dnsmasq
    fi

    # background: if we did this in openvpn's resolv-conf script, we could guard it in
    # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
    # and we could get $nn by
    # config_basename=${config%%.*}
    # config_basename=${config_basename##*/}
    # but dnsmasq forces us to do it earlier.

  fi # end if [[ -e /run/resolvconf ]]
  ###### end setup resolvconf
}
343
# Tear down what start() set up. Each step is guarded so that stop can run
# against a partially set up namespace.
#
# Globals:   nn, v0, create (read); network (read, and written when it has
#            to be derived from the address on $v0)
# Returns:   the status of the final unmount step (0 when nothing was
#            mounted).
stop() {
  if [[ -z $network ]]; then
    # No -n given: recover the first three octets from the address that
    # start() put on $v0.
    network=$(ipd -f inet a show dev "$v0" 2>/dev/null \
      | awk '/inet / {print $2}' | sed -r 's,\.[0-9]+/.*,,' || :)
  fi
  if ipd link list "$v0" &>/dev/null; then
    # this also deletes $v1 and the route we added.
    ipd link del "$v0"
  fi
  if [[ -n $network ]] && nat -C &>/dev/null; then
    nat -D
  fi
  # start() adds both an input and an output FORWARD rule for $v0, so
  # both have to go; a missing rule is not an error.
  dexec iptables -D FORWARD -i "$v0" -j ACCEPT &>/dev/null || :
  dexec iptables -D FORWARD -o "$v0" -j ACCEPT &>/dev/null || :
  if $create && [[ -e /var/run/netns/$nn ]]; then
    ip netns del "$nn"
  fi

  # not sure this is necessary since we are tearing down the mount namespace
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi

  if mountpoint "/run/mount-namespaces/$nn" >/dev/null; then
    umount "/run/mount-namespaces/$nn"
  fi
}
367
# Check each piece of state that start() creates and report, through m(),
# the pieces that are missing.
#
# Globals:   nn, v0, network (read)
# Outputs:   one "newns: ..." line per check on stdout, plus warnings
show() {
  local net=$network
  if [[ -z $net ]]; then
    # No -n given: recover the first three octets from the address on
    # $v0, the same way stop() does. Without this the NAT check would
    # probe the meaningless source ".0/24".
    net=$(ipd -f inet a show dev "$v0" 2>/dev/null \
      | awk '/inet / {print $2}' | sed -r 's,\.[0-9]+/.*,,' || :)
  fi

  m ipd link list "$v0"
  if [[ -n $net ]]; then
    # Same rule as in nat().
    m dexec iptables -t nat -C POSTROUTING -s "$net.0/24" -j MASQUERADE \
      -m comment --comment "systemd network namespace nat"
  else
    echo "newns: WARNING: network unknown (no address on $v0 and no -n given), skipping nat rule check"
  fi
  # start() adds a FORWARD rule for each direction.
  m dexec iptables -C FORWARD -i "$v0" -j ACCEPT
  m dexec iptables -C FORWARD -o "$v0" -j ACCEPT
  m mexec mountpoint /run/resolvconf
  m mountpoint "/run/mount-namespaces/$nn"
}
376
# Run the function named after the requested action; anything else is an
# error, reported on stderr.
case $action in
  start | stop | show)
    "$action"
    ;;
  *)
    echo "$0: error: unsupported action" >&2
    exit 1
    ;;
esac