fix: multiple nns could cause removal of nat iptables rule
[newns] / newns
1 #!/bin/bash
2 # Copyright (C) 2017 Ian Kelling
3
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7
8 # http://www.apache.org/licenses/LICENSE-2.0
9
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16
# Everything below needs root. Re-exec through sudo when we are not root;
# -E keeps the caller's environment (for example ERRHANDLE_PATH).
[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"

# Resolve symlinks so the errhandle dependency is looked up next to the
# real location of this script.
tmp="$(readlink -f "${BASH_SOURCE[0]}")"; script_dir="${tmp%/*}"
if [[ ! $ERRHANDLE_PATH ]]; then
  ERRHANDLE_PATH="$script_dir"/../errhandle/err
fi
if [[ -s $ERRHANDLE_PATH ]]; then
  # Quoted so that a path containing spaces is sourced as one file.
  source "$ERRHANDLE_PATH"
else
  if ! cd "$script_dir"; then
    echo "$0: failed to cd to $script_dir" >&2
    exit 1
  fi
  # NOTE(review): this fetches code over the network and sources it while
  # running as root. Installing errhandle beforehand avoids that.
  if ! wget -O err 'https://iankelling.org/git/?p=errhandle;a=blob_plain;f=err;hb=HEAD'; then
    echo "$0: failed to get errhandle dependency" >&2
    exit 1
  fi
  # ./ is required: a bare "source err" searches $PATH before the current
  # directory and could pick up an unrelated file named err.
  source ./err
fi
33
#######################################
# Print the help text and exit.
# Arguments: $1 - exit status (default 0)
# Outputs:   help text on stdout when the status is 0, otherwise on stderr
#            so that it does not pollute captured output on misuse
#######################################
usage() {
  local status=${1:-0}
  local fd=1
  (( status == 0 )) || fd=2
  cat >&"$fd" <<EOF
usage: ${0##*/} [OPTS] start|stop NS_NAME
Nat a network namespace. systemd friendly

Also creates a mount namespace with a cloned /run/resolvconf.

-c, --create  Create or destroy a named network namespace. When running from
              the same network namespace as pid 1, this is set automatically.
              A systemd created private network is in a network namespace
              different than pid 1.
-n NETWORK    x.x.x /24 private network to use. If not specified, uses
              the first unused one starting at 10.173.1
-h, --help    Show this help and exit.

From a normal shell:

If we do create the netns, to join it with a shell, we can do (as root)
/usr/bin/nsenter --mount=/root/mount_namespaces/NAME --net=/var/run/netns/NAME bash

If you don't care about the mount namespace, you can leave that option off.


For systemd:

From within a systemd network namespace, we nat it to the outside. This
would be called from ExecStartPre, and/or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.

If resolvconf is installed, we create a named mount namespace under
/root/mount_namespaces, so we can alter some system config for this
namespace. systemd command lines would be prefixed with:

/usr/bin/nsenter --mount=/root/mount_namespaces/NS_NAME

Note, this means that they can't run as unprivileged users, but once
systemd 233 comes out, it will have a bind mount option from within unit
files, so the mount namespace won't be needed for most use cases, and I
will update the script so that the mount namespace is not created unless a
flag is passed in. Patch welcome to add that flag before then.

This script has a dependency which you can download manually or it
will be automatically downloaded into the same directory.
It handles errors by printing a stack trace and cleaning up the namespaces.
To download manually,
git clone https://iankelling.org/git/errhandle
into an adjacent directory, or
export ERRHANDLE_PATH to point to the 'err' file in that repo.


Background on this project (you can skip if you like):

If we aren't creating a named network namespace, to join the namespace
with a shell, I use:
nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash

Note: if I knew how to easily ask systemd what pid a unit has, I would
do that.

"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having its own resolv.conf by using its user script which
calls resolvconf, this doesn't help much. What we actually want to do is
copy /run/resolvconf somewhere then bind mount it on top of
/run/resolvconf.


Note: for debugging, adding set -x is a pretty good option.

Please email me if you have patches, bugs, feedback, or republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
  exit "$status"
}
108
109
#### begin arg parsing ####
create=false
# Start empty: the script may have been re-executed through "sudo -E", so a
# stray "network" variable in the caller's environment must not be mistaken
# for a -n argument.
network=
# Explicit -o and -- keep getopt from ever interpreting one of our own
# arguments as its option string; -n makes its error messages carry our name.
temp=$(getopt -n "${0##*/}" -o hcn: -l help,create -- "$@") || usage 1
eval set -- "$temp"
while true; do
  case $1 in
    -c|--create) create=true; shift ;;
    -n) network=$2; shift 2 ;;
    -h|--help) usage ;;
    --) shift; break ;;
    *) echo "$0: Internal error!" >&2; exit 1 ;;
  esac
done
# Exactly two positional arguments are required: the action and the name.
if (( $# != 2 )); then
  usage 1
fi

action=$1
nn=$2 # namespace name
#### end arg parsing ####
130
#### begin sanity checking ####
# Report every missing tool before giving up, so the user can install them
# all in one go. Diagnostics go to stderr like the script's other errors.
install_error=false
if ! type -p ip &>/dev/null; then
  echo "please install the iproute2 package" >&2
  install_error=true
fi
if ! type -p iptables &>/dev/null; then
  echo "please install the iptables package" >&2
  install_error=true
fi
if $install_error; then
  exit 1
fi
#### end sanity checking ####
145
# Names of the two ends of the veth pair, and the prefix under which
# find_network looks for a free /24.
v0="veth0-$nn"
v1="veth1-$nn"
ip_base=10.173

# Without -c, still switch to create mode when we share the network
# namespace of pid 1.
if ! $create; then
  if [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
    create=true
  fi
fi
153
# Give the network namespace of pid 1 the name "default" so that
# "ip -n default" and "ip netns exec default" can address it.

mkdir -p /run/netns
target=/run/netns/default
# Skip when anything is already there, including a dangling symlink.
# -f to avoid a race condition with running twice
[[ -e $target || -L $target ]] || ln -sf /proc/1/ns/net "$target"
162
# run ip in the network namespace named "default"
ipd() {
  ip -n default "$@"
}
164
165
# Arguments that point ip at the namespace being managed. Stays empty when
# we did not create it: otherwise we are already in the network namespace
# and it's unnamed. Kept as an array so the name is passed as one word.
ipnnargs=()
if $create; then
  ipnnargs=(-n "$nn")
fi
# run ip in the network namespace
# (the ${arr[@]+...} form keeps an empty array safe under "set -u" on old bash)
ipnn() { ip ${ipnnargs[@]+"${ipnnargs[@]}"} "$@"; }
172
# run a command inside the network namespace named "default"
dexec() {
  ip netns exec default "$@"
}
# mount namespace exec: run a command inside the mount namespace that is
# bind-mounted at /root/mount_namespaces/$nn. The path is quoted so the
# namespace name always stays part of a single --mount argument.
mexec() { /usr/bin/nsenter --mount="/root/mount_namespaces/$nn" "$@"; }
177
178
# Find the interface that carries the default route of the default namespace.
# background: head -n1 is defensive. Not sure if there is some weird feature
# for 2 routes to be 0/0.
# sed -n with the p flag prints only when a "dev NAME" part is present, so a
# route line of an unexpected shape yields nothing instead of being passed
# through and word-split into bogus interface names.
gateway_ifs=($(ipd route list exact 0/0 | head -n1 | sed -rn 's/.*dev\s+(\S+).*/\1/p'))

if [[ ! $gateway_ifs ]]; then
  cat >&2 <<EOF
$0: error: failed to find gateway interface. No output from:
ipd route list exact 0/0 | head -n1 | sed -rn 's/.*dev\s+(\S+).*/\1/p'
output from "ipd route list exact 0/0":
$(ipd route list exact 0/0)
EOF
  exit 1
fi
192
#######################################
# Run one iptables rule command on the masquerade rule of $network, once
# per gateway interface, inside the default network namespace.
# Globals:   gateway_ifs, network (read)
# Arguments: $1 - iptables rule command; callers pass -C, -A or -D
# Returns:   status of the last iptables invocation
#######################################
nat() {
  local gw_if
  for gw_if in "${gateway_ifs[@]}"; do
    dexec iptables -t nat "$1" POSTROUTING -s "$network.0/24" -o "$gw_if" -j MASQUERADE \
      -m comment --comment "systemd network namespace nat"
  done
}
199
# d = default
# Insert an iptables rule in the default network namespace unless an
# identical rule is already present.
# Arguments: the rule specification, starting with the chain name.
diptables-add() {
  dexec iptables -C "$@" &>/dev/null && return 0
  dexec iptables -I "$@"
}
207
#######################################
# Decide which /24 (written as its first three octets) to work with.
# Globals:
#   network  (read/written) kept as is when already set, otherwise set to
#            the first $ip_base.N (N = 1..254) with no address in the
#            default namespace; left empty when none is free
#   found    (written) true when $network holds a usable network
#   existing (written) true when the search skipped a network in use
#   ip_base  (read)
#######################################
find_network() {
  local ips candidate i
  # Always define both flags, so callers never expand them while unset.
  found=false
  existing=false
  if [[ $network ]]; then
    # The network was chosen already (by -n or an earlier call).
    found=true
    return
  fi
  ips="$(ipd addr show | awk '$1 == "inet" {print $2}')"
  for ((i=1; i <= 254; i++)); do
    candidate=$ip_base.$i
    # The trailing \. matters: without it 10.173.1 would also match
    # addresses in 10.173.10.x through 10.173.19.x and 10.173.1xx.x.
    if grep -q "^${candidate//./\\.}\\." <<<"$ips"; then
      existing=true
    else
      network=$candidate
      found=true
      break
    fi
  done
}
225
#######################################
# Bring the namespace up: mount namespace, (optionally) named network
# namespace, veth pair, forwarding and NAT rules, and a private copy of
# /run/resolvconf when resolvconf appears to be installed.
# Globals:   nn, v0, v1, create, network (read); found is read after
#            find_network ran
# Outputs:   error messages on stderr
# Returns:   exits 1 when no network is free or the resolvconf bind mount
#            fails
#######################################
start() {
  local resolv_copy

  find_network
  if ! $found; then
    echo "$0: error: no open network found" >&2
    exit 1
  fi

  #### begin mount namespace setup ####
  mkdir -p /root/mount_namespaces
  if ! mountpoint /root/mount_namespaces >/dev/null; then
    mount --bind /root/mount_namespaces /root/mount_namespaces
  fi
  # note: This is outside the mount condition because I've mysteriously
  # had this become shared instead of private, perhaps it
  # got remounted somehow and lost the setting.
  mount --make-private /root/mount_namespaces
  if [[ ! -e /root/mount_namespaces/$nn ]]; then
    touch "/root/mount_namespaces/$nn"
  fi
  if ! mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    # Here, we specify that we only want mount changes under
    # this mountpoint to be propagated into the bind, but changes
    # from within the bind do not propagate to outside the bind.
    #
    # slave is documented in.
    # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
    # documentation on propagation is a bit weird because it
    # confusingly talks about binds, namespaces, and mirrors (which
    # seems to be just another name for bind), shared subtrees
    # (which seems to a term for binds and namespaces), and does not
    # properly specify whether the documentation applies to binds,
    # namespaces, or both. Notably, propagation for binds is marked
    # on the original mount point, and propagation for a mount
    # namespace is marked on mounts within the namespace.
    unshare --propagation slave --mount="/root/mount_namespaces/$nn" /bin/true
  fi

  #### end mount namespace setup ####


  if $create; then
    ip netns add "$nn"
    ip -n "$nn" link set dev lo up
  fi

  # enable ipv4 forwarding, written from within the default namespace
  echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null

  # docker helpfully changes the default FORWARD to drop...
  diptables-add FORWARD -i "$v0" -j ACCEPT
  diptables-add FORWARD -o "$v0" -j ACCEPT


  # NOTE(review): presumably the errhandle dependency calls err-cleanup when
  # a command fails -- confirm against its err file. stop checks for each
  # piece before removing it, so it can run on a half-built setup.
  err-cleanup() { stop; }
  # Create the pair from inside the namespace, then hand v0 to the default
  # namespace; v0 gets .1 (the gateway) and v1 gets .2.
  ipnn link add "$v0" type veth peer name "$v1"
  ipnn link set "$v0" netns default
  ipd addr add "$network.1/24" dev "$v0"
  ipd link set "$v0" up
  nat -C &>/dev/null || nat -A
  ipnn addr add "$network.2/24" dev "$v1"
  ipnn link set "$v1" up
  ipnn route add default via "$network.1"

  ###### begin setup resolvconf
  if [[ -e /run/resolvconf ]]; then # resolvconf probably installed
    resolv_copy=/root/resolvconf-$nn

    # this condition should never happen, just coding defensively
    if mexec mountpoint /run/resolvconf &>/dev/null; then
      mexec umount /run/resolvconf
    fi
    cp -aT /run/resolvconf "$resolv_copy"
    if ! mexec mount -o bind "$resolv_copy" /run/resolvconf; then
      echo "error: resolv-conf bindmount failed" >&2
      exit 1
    fi
    # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
    # in the network namespace, so adjust the address.
    if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
      mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
      mexec resolvconf -u
    fi
    # and in debian based distros at least, it runs with --local-service, and needs a restart
    # to know about the new local network
    if [[ $(systemctl --no-pager show -p ActiveState dnsmasq ) == ActiveState=active ]]; then
      systemctl restart dnsmasq
    fi

    # background: if we did this in openvpn's resolv-conf script, we could guard it in
    # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
    # and we could get $nn by
    # config_basename=${config%%.*}
    # config_basename=${config_basename##*/}
    # but dnsmasq forces us to do it earlier.

  fi # end if [[ -e /run/resolvconf ]]
  ###### end setup resolvconf


}
325
#######################################
# Tear down what start set up. Every piece is checked or tolerated when
# missing, so this can run on a partial setup.
# Globals:   v0, nn, create (read); network (read/written)
#######################################
stop() {
  # The network whose NAT rule is ours: known up front when it was given
  # with -n or already chosen by start in this same process.
  local owned_net=$network v0_addr
  if ipd link list "$v0" &>/dev/null; then
    # Read the network off the veth while it still exists. Guessing it
    # afterwards (find_network picks the first unused network) leaves the
    # rule behind whenever a lower-numbered network is still in use.
    v0_addr=$(ipd -o -4 addr show dev "$v0" | awk '{print $4; exit}')
    if [[ $v0_addr ]]; then
      owned_net=${v0_addr%.*} # 10.173.2.1/24 -> 10.173.2
    fi
    # this also deletes $v1 and the route we added.
    ipd link del "$v0"
  fi
  if [[ $owned_net ]]; then
    network=$owned_net
    if nat -C &>/dev/null; then nat -D; fi
  else
    # Neither a veth nor a given network: keep the conservative guess and
    # only touch NAT when no other network of ours was seen in use.
    find_network
    if ! $existing; then
      if nat -C &>/dev/null; then nat -D; fi
    fi
  fi
  # start adds a FORWARD rule for each direction; remove both.
  dexec iptables -D FORWARD -i "$v0" -j ACCEPT &>/dev/null ||:
  dexec iptables -D FORWARD -o "$v0" -j ACCEPT &>/dev/null ||:
  if $create && [[ -e /var/run/netns/$nn ]]; then
    ip netns del "$nn"
  fi

  # not sure this is necessary since we are tearing down the mount namespace
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi

  if mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    umount "/root/mount_namespaces/$nn"
  fi
}
349
# Run the function named after the requested action.
case $action in
  start|stop)
    "$action"
    ;;
  *)
    echo "$0: error: unsupported action" >&2
    exit 1
    ;;
esac