e8c3ca79a24d4a8970d6cc49cc429f92c241183d
[newns] / newns
1 #!/bin/bash
2 # I, Ian Kelling, follow the GNU license recommendations at
3 # https://www.gnu.org/licenses/license-recommendations.en.html. They
4 # recommend that small programs, < 300 lines, be licensed under the
5 # Apache License 2.0. This file contains or is part of one or more small
6 # programs. If a small program grows beyond 300 lines, I plan to switch
7 # its license to GPL.
8
9 # Copyright 2024 Ian Kelling
10
11 # Licensed under the Apache License, Version 2.0 (the "License");
12 # you may not use this file except in compliance with the License.
13 # You may obtain a copy of the License at
14
15 # http://www.apache.org/licenses/LICENSE-2.0
16
17 # Unless required by applicable law or agreed to in writing, software
18 # distributed under the License is distributed on an "AS IS" BASIS,
19 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 # See the License for the specific language governing permissions and
21 # limitations under the License.
22
23
# Re-run this same script through sudo (environment preserved with -E)
# whenever we were not started as root.
if [[ $EUID != 0 ]]; then
  exec sudo -E "$BASH_SOURCE" "$@"
fi

# External dependency: https://savannah.nongnu.org/projects/bash-bear-trap/
# errexit is enabled only while the library is being sourced, so a missing
# or broken install aborts here; it is switched off again right after.
# NOTE(review): presumably bash-bear installs its own error trap — confirm
# against its documentation.
set -e
. /usr/local/lib/bash-bear
set +e
28
# Run a command on behalf of "show" and report on it.
# Arguments: the command to run, followed by its arguments.
# Outputs:   "newns: COMMAND ARGS..." first, then a warning line if the
#            command failed (with its real exit status) or if it succeeded
#            without printing anything. The command's stdout/stderr is
#            captured only to test whether it was empty; it is not echoed.
# Returns:   0 in every case.
m() {
  local out status
  printf "newns: %s\n" "$*"
  # The status has to be read in the else branch of a plain "if cmd": after
  # "if ! cmd", $? holds the negated result, which is always 0 in the
  # then-branch. Running the command as an if-condition also keeps a
  # failure from being handled as an unexpected error.
  if out=$("$@" 2>&1); then
    status=0
  else
    status=$?
  fi
  if (( status != 0 )); then
    echo "newns: WARNING: last command exit code: $status"
  elif [[ ! $out ]]; then
    echo "newns: WARNING: no output from last command"
  fi
}
38
# Print the help text to stdout and exit.
# Arguments: $1 - exit status to use (optional, defaults to 0).
# Note: the here-document delimiter is unquoted, so ${0##*/} is expanded
# and any literal "$" in the text has to be backslash-escaped.
usage() {
  cat <<EOF
usage: ${0##*/} [OPTS] start|stop|show NS_NAME
Nat a network namespace. systemd friendly

Also creates a mount namespace with a cloned /run/resolvconf.

Arguments:

start|stop: these do what they say.

show: Show the state we expected to be there or not there based on
start/stop. This is useful for debugging.

NS_NAME: We use this to name the interfaces we create, the mount
namespace, and if we are creating a named network space, that too.

-c, --create  Create or destroy a named network namespace. When running from
              the same network namespace as pid 1, this is set automatically.
              A systemd created private network is in an unnamed network namespace
              different than pid 1. I haven't found a need for a named network
              namespace in that case.
-n NETWORK    x.x.x /24 private network to use. If not specified, uses
              the first unused one starting at 10.173.1
-h, --help    Show this help and exit.

From a normal shell:

If we do create the netns, to join it with a shell, we can do (as root)
/usr/bin/nsenter --mount=/run/mount-namespaces/NAME --net=/var/run/netns/NAME bash

If you don't care about the mount namespace, you can leave that option off.


For systemd:

From within a systemd network namespace, we nat it to the outside. This
would be called from ExecStartPre, and/or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.

If resolvconf is installed, we create a named mount namespace under
/run/mount-namespaces, so we can alter some system config for this
namespace. systemd command lines would be prefixed with:

/usr/bin/nsenter --mount=/run/mount-namespaces/NS_NAME

Note, this means that they can't run as unprivileged users, but once
systemd 233 comes out, it will have a bind mount option from within unit
files, so the mount namespace won't be needed for most use cases, and I
will update the script so that the mount namespace is not created unless a
flag is passed in. Patch welcome to add that flag before then.


This script has a dependency
https://savannah.nongnu.org/projects/bash-bear-trap/ . Search the script for "bash-bear" to see where to install or modify the installed location.


Background on this project (you can skip if you like):

If we aren't creating a named network namespace, to join the namespace
with a shell, I use:
nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash

Note: if I knew how to easily ask systemd what pid a unit has, I would
do that.

"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having its own resolv.conf by using its user script which
calls resolvconf, this doesn't help much. What we actually want to do is
copy /run/resolvconf somewhere then bind mount it on top of
/run/resolvconf.


Note: for debugging, adding set -x is a pretty good option.

TODO: make "start" be idempotent.

Please email me if you have patches, bugs, feedback, or republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
  exit "${1:-0}"
}
122
123
#### begin arg parsing ####
create=false
# Start from a known value: the script re-executes itself with "sudo -E",
# so without this a "network" variable from the caller's environment would
# be picked up as if -n had been given.
network=
# Short options are given with -o and the script's arguments come after
# "--", so nothing in "$@" can be taken for the option string. Only options
# the case below handles are listed; anything else is rejected by getopt,
# which reports it under this script's name (-n) before we print usage.
temp=$(getopt -n "${0##*/}" -o hcn: -l help,create -- "$@") || usage 1
# getopt prints a shell-quoted, normalized argument list ending in "--".
eval set -- "$temp"
while true; do
  case $1 in
    -c|--create) create=true; shift ;;
    -n) network=$2; shift 2 ;;
    -h|--help) usage ;;
    --) shift; break ;;
    *) echo "$0: Internal error!" ; exit 1 ;;
  esac
done
# Exactly two positional arguments are required: ACTION and NS_NAME.
if (( $# != 2 )); then
  usage 1
fi

action=$1
nn=$2 # namespace name
#### end arg parsing ####
144
#### begin sanity checking ####
# Look for every required tool before giving up, so that a single run
# lists all of the packages that are missing.
install_error=false
type -p ip &>/dev/null || {
  echo "please install the iproute2 package"
  install_error=true
}
type -p iptables &>/dev/null || {
  echo "please install the iptables package"
  install_error=true
}
if [[ $install_error == true ]]; then
  exit 1
fi
#### end sanity checking ####
159
# First two octets of the automatically chosen /24 networks.
ip_base=10.173
# Names of the veth pair, both derived from the namespace name. start()
# moves v0 into the "default" namespace and leaves v1 in the new one.
v0="veth0-${nn}"
v1="veth1-${nn}"
163
164
### begin make the default network namespace be named "default" ###
# The helpers below address pid 1's network namespace by the name
# "default", so make /run/netns/default point at it.
mkdir -p /run/netns
target=/run/netns/default
# Leave an existing entry (including a dangling symlink) untouched.
# -f to avoid a race condition with running twice
[[ -e $target || -L $target ]] || ln -sf /proc/1/ns/net $target
### end make the default network namespace be named "default" ###
173
# When -c was not given but we are running in the same network namespace
# as pid 1, turn create on automatically.
if ! $create; then
  if [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
    create=true
  fi
fi
# With create on, ipnn has to address the named namespace explicitly;
# otherwise we are already in the network namespace and it's unnamed.
if $create; then
  ipnnargs="-n $nn"
fi
181
182
# Run ip in the network namespace named "default".
# Arguments: passed through to ip unchanged.
# Returns:   ip's exit status.
ipd() {
  ip -n default "$@"
}
184
# Run ip in the target network namespace.
# Globals:   ipnnargs (read) - either unset/empty or "-n NAME".
# Arguments: passed through to ip unchanged.
ipnn() {
  # ipnnargs is left unquoted on purpose: it must split into separate
  # words, or vanish entirely when it is empty.
  ip $ipnnargs "$@"
}
187
# Execute a command inside the network namespace named "default".
# Arguments: the command and its arguments.
# Returns:   the exit status of "ip netns exec".
dexec() {
  ip netns exec default "$@"
}
# Execute a command inside this namespace's mount namespace.
# Globals:   nn (read) - namespace name; selects /run/mount-namespaces/NAME.
# Arguments: the command and its arguments.
# Returns:   nsenter's exit status.
mexec() {
  # Quoted so the name always reaches nsenter as part of one argument,
  # without word splitting or globbing.
  /usr/bin/nsenter "--mount=/run/mount-namespaces/$nn" "$@"
}
192
193
# Apply one iptables action to the masquerade rule for our /24.
# Globals:   network (read) - first three octets of the network.
# Arguments: $1 - iptables rule action; callers use -A, -C and -D.
# Returns:   the exit status of the iptables call made through dexec.
nat() {
  # Note: duplicated in show()
  # Note, in a previous commit i specified the output interface with -o,
  # but that broke things when my gateway interface changed, and I can't
  # see any advantage to it, so I removed it.
  local -a rule=(
    -s $network.0/24 -j MASQUERADE
    -m comment --comment "systemd network namespace nat"
  )
  dexec iptables -t nat $1 POSTROUTING "${rule[@]}"
}
202
# d = default
# Insert an iptables rule in the default network namespace unless an
# identical rule is already present.
# Arguments: the rule specification (chain first), as given to iptables -C/-I.
# Returns:   0 if the rule already existed, otherwise the status of the insert.
diptables-add() {
  dexec iptables -C "$@" &>/dev/null || dexec iptables -I "$@"
}
210
# Choose the /24 network to use, unless one was already set.
# Globals:   network (read/written) - left alone if non-empty, otherwise set
#              to the first $ip_base.N (N = 1..254) that no IPv4 address in
#              the default network namespace starts with.
#            ip_base (read)
# Outputs:   an error message if all 254 candidates are in use.
# Returns:   0 on success; exits the script with status 1 if nothing is free.
find-network() {
  local i ips pattern
  if [[ $network ]]; then
    return
  fi
  # Every IPv4 address/prefix configured in the default namespace, one per line.
  ips=$(ipd addr show | awk '$1 == "inet" {print $2}')
  for ((i = 1; i <= 254; i++)); do
    network=$ip_base.$i
    # Dots are escaped, and the third octet is closed off with a literal
    # dot: without it 10.173.1 would also match 10.173.10.x or 10.173.100.x
    # and be wrongly skipped as "in use".
    pattern="^${network//./\\.}\\."
    if ! grep -q -- "$pattern" <<<"$ips"; then
      return 0
    fi
  done
  echo "$0: error: no open network found"
  exit 1
}
229
# ip add idempotent (if it doesn't exist already)
# Arguments: $1 - ip wrapper to run (callers pass ipd or ipnn)
#            $2 - address with prefix length, e.g. 10.173.1.1/24
#            $3 - device name
# Outputs:   the matching "inet ..." line when the address is already there.
# Returns:   0 if the address was already present, otherwise the status of
#            the "addr add" call.
ip-add() {
  local runner=$1 addr=$2 ifname=$3
  # The exact line, with surrounding spaces trimmed, that "addr show"
  # has to contain for the address to count as present.
  local want="inet $addr scope global $ifname"
  $runner addr show dev $ifname | sed 's/^ *//;s/ *$//' | grep -xF "$want" \
    || $runner addr add $addr dev $ifname
}
241
# Set up the namespace: a named mount namespace, optionally a named network
# namespace, a veth pair natted to the outside, a default route, and (when
# /run/resolvconf exists) a private copy of the resolvconf state.
# Globals:   nn, v0, v1, create (read); network (set by find-network if
#            empty); cmd, fails, max_fails, default_route, resolv_copy
#            (written).
# Returns:   exits the script with status 1 if the default route cannot be
#            verified or the resolvconf bind mount fails.
start() {
  find-network

  #### begin mount namespace setup ####
  mkdir -p /run/mount-namespaces
  if ! mountpoint /run/mount-namespaces >/dev/null; then
    mount --bind /run/mount-namespaces /run/mount-namespaces
  fi
  # note: This is outside the mount condition because I've mysteriously
  # had this become shared instead of private, perhaps it
  # got remounted somehow and lost the setting.
  mount --make-private /run/mount-namespaces
  if [[ ! -e /run/mount-namespaces/$nn ]]; then
    touch "/run/mount-namespaces/$nn"
  fi
  if ! mountpoint "/run/mount-namespaces/$nn" >/dev/null; then
    # Here, we specify that we only want mount changes under
    # this mountpoint to be propagated into the bind, but changes
    # from within the bind do not propagate to outside the bind.
    #
    # slave is documented in.
    # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
    # documentation on propagation is a bit weird because it
    # confusingly talks about binds, namespaces, and mirrors (which
    # seems to be just another name for bind), shared subtrees
    # (which seems to be a term for binds and namespaces), and does not
    # properly specify whether the documentation applies to binds,
    # namespaces, or both. Notably, propagation for binds is marked
    # on the original mount point, and propagation for a mount
    # namespace is marked on mounts within the namespace.
    unshare --propagation slave "--mount=/run/mount-namespaces/$nn" /bin/true
  fi
  #### end mount namespace setup ####

  if $create; then
    # Test for the namespace file instead of grepping "ip netns" output:
    # the listing can show a namespace as "NAME (id: N)", which an exact
    # whole-line match on NAME does not recognise.
    if [[ ! -e /run/netns/$nn ]]; then
      ip netns add "$nn"
    fi
    ip -n "$nn" link set dev lo up
  fi

  echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward status=none

  # docker helpfully changes the default FORWARD to drop...
  diptables-add FORWARD -i "$v0" -j ACCEPT
  diptables-add FORWARD -o "$v0" -j ACCEPT

  # From here on, tear everything down again if something fails.
  # NOTE(review): err-cleanup is presumably the hook bash-bear runs on
  # error — confirm against the bash-bear documentation.
  err-cleanup() { stop; }
  # Create the veth pair in the target namespace, then move v0 out to the
  # default namespace; v1 stays behind as the namespace's interface.
  ipnn link add "$v0" type veth peer name "$v1"
  ipnn link set "$v0" netns default
  # Outside end: the gateway address, .1
  ip-add ipd "$network.1/24" "$v0"
  ipd link set "$v0" up
  nat -C &>/dev/null || nat -A
  # Inside end: v1 gets .2. v0 no longer exists in this namespace and .1
  # belongs to the gateway, so this must not reuse the v0/.1 values.
  ip-add ipnn "$network.2/24" "$v1"
  ipnn link set "$v1" up
  cmd="ipnn route add default via $network.1"
  $cmd
  fails=0
  max_fails=2
  # I've had adding the default route mysteriously fail on boot, so
  # here we check that it succeeded, do a sleep and a retry.
  while true; do
    # Trim whitespace at both ends before comparing.
    default_route=$(ipnn route show default \
      | sed -r -e 's,^[[:space:]]+,,' -e 's,[[:space:]]+$,,')
    if [[ $default_route != "default via $network.1 dev $v1" ]]; then
      fails=$((fails + 1))
    else
      break
    fi
    if (( fails >= max_fails )); then
      echo "$0: ERROR: default route added but not found, retried $max_fails. expected route: 'default via $network.1 dev $v1', found: '$default_route'"
      # Note: for debugging, if you have a systemd unit which tears down
      # the newns upon failure, you may want to replace the exit with a
      # break so that we proceed and can inspect the system.
      exit 1
    else
      sleep 1
      $cmd
    fi
  done
  if (( fails >= 1 )); then
    # Report how many retries it actually took, not the configured limit.
    echo "$0: WARNING: route added but not found until retried $fails times: $cmd"
  fi

  ###### begin setup resolvconf
  if [[ -e /run/resolvconf ]]; then # resolvconf probably installed
    resolv_copy=/root/resolvconf-$nn

    # this condition should never happen, just coding defensively
    if mexec mountpoint /run/resolvconf &>/dev/null; then
      mexec umount /run/resolvconf
    fi
    cp -aT /run/resolvconf "$resolv_copy"
    if ! mexec mount -o bind "$resolv_copy" /run/resolvconf; then
      echo "error: resolv-conf bindmount failed"
      exit 1
    fi
    # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
    # in the network namespace, so adjust the address.
    if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
      mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
      mexec resolvconf -u
    fi
    # and in debian based distros at least, it runs with --local-service, and needs a restart
    # to know about the new local network
    if [[ $(systemctl --no-pager show -p ActiveState dnsmasq ) == ActiveState=active ]]; then
      systemctl restart dnsmasq
    fi

    # background: if we did this in openvpn's resolv-conf script, we could guard it in
    # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
    # and we could get $nn by
    # config_basename=${config%%.*}
    # config_basename=${config_basename##*/}
    # but dnsmasq forces us to do it earlier.

  fi # end if [[ -e /run/resolvconf ]]
  ###### end setup resolvconf
}
364
# Undo what start set up: the veth pair, the nat rule, both FORWARD rules,
# the named network namespace (only when create is on) and the mounts.
# Every step is guarded, so it is safe to call on a partial setup.
# Globals:   nn, v0, create (read); network (read, and filled in from the
#            address on v0 when it is empty).
stop() {
  # The network has to be looked up before v0 is deleted: it is taken from
  # v0's IPv4 address with the last octet and prefix length stripped.
  if [[ ! $network ]]; then
    network=$(ipd -f inet a show dev "$v0" 2>/dev/null | awk '/inet / {print $2}' | sed -r 's,\.[0-9]+/.*,,' ||:)
  fi
  if ipd link list "$v0" &>/dev/null; then
    # this also deletes $v1 and the route we added.
    ipd link del "$v0"
  fi
  if [[ $network ]] && nat -C &>/dev/null; then nat -D; fi
  # start adds an ACCEPT rule for each direction, so remove both; a rule
  # that is already gone is not an error.
  dexec iptables -D FORWARD -i "$v0" -j ACCEPT &>/dev/null ||:
  dexec iptables -D FORWARD -o "$v0" -j ACCEPT &>/dev/null ||:
  if $create && [[ -e /var/run/netns/$nn ]]; then
    ip netns del "$nn"
  fi

  # not sure this is necessary since we are tearing down the mount namespace
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi

  if mountpoint "/run/mount-namespaces/$nn" >/dev/null; then
    umount "/run/mount-namespaces/$nn"
  fi
}
388
# Check each piece of state that start creates and stop removes, reporting
# through m().
# Globals:   nn, v0 (read); network (read only, never modified here).
# Outputs:   whatever m prints for each check, plus a warning when the
#            network cannot be determined.
show() {
  local net=$network
  # Without -n the network is unknown; recover it from the address on v0
  # (last octet and prefix length stripped), otherwise the nat check would
  # be run against the meaningless source ".0/24".
  if [[ ! $net ]]; then
    net=$(ipd -f inet a show dev "$v0" 2>/dev/null | awk '/inet / {print $2; exit}' | sed -r 's,\.[0-9]+/.*,,' ||:)
  fi
  m ipd link list "$v0"
  if [[ $net ]]; then
    # Note: this is the same rule nat() manages.
    m dexec iptables -t nat -C POSTROUTING -s "$net.0/24" -j MASQUERADE \
      -m comment --comment "systemd network namespace nat" ||:
  else
    echo "newns: WARNING: network unknown (no -n given and no address on $v0), skipping nat rule check"
  fi
  # start adds a FORWARD rule for each direction; check both.
  m dexec iptables -C FORWARD -i "$v0" -j ACCEPT
  m dexec iptables -C FORWARD -o "$v0" -j ACCEPT
  m mexec mountpoint /run/resolvconf
  m mountpoint "/run/mount-namespaces/$nn"
}
397
# Each supported action is implemented by the function of the same name;
# anything else is rejected.
if [[ $action == start || $action == stop || $action == show ]]; then
  $action
else
  echo "$0: error: unsupported action"
  exit 1
fi