fix for dnsmasq
[newns] / newns
1 #!/bin/bash
2 # Copyright (C) 2017 Ian Kelling
3
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7
8 # http://www.apache.org/licenses/LICENSE-2.0
9
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16
# Everything below manipulates namespaces, veth devices and iptables rules,
# so re-execute under sudo when not already root. -E keeps the caller's
# environment (for example ERRHANDLE_PATH) across the re-exec.
[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"

# Locate the optional "errhandle" helpers. Unless ERRHANDLE_PATH is already
# set, look for a directory named errhandle next to the (symlink-resolved)
# directory that contains this script. All path expansions are quoted so a
# checkout path containing whitespace or glob characters still works.
if [[ ! $ERRHANDLE_PATH ]]; then
  ERRHANDLE_PATH=$(readlink -f "${BASH_SOURCE[0]}")
  ERRHANDLE_PATH=$(readlink -f "${ERRHANDLE_PATH%/*}/../errhandle")
fi
err_sourced=true
for p in "$ERRHANDLE_PATH"/{errcatch-function,bash-trace-function}; do
  if [[ -e $p ]]; then
    source "$p"
  else
    err_sourced=false
  fi
done
if $err_sourced; then
  # Both helper files were found: let errhandle install its error handling.
  errcatch
else
  # Fallback: abort on the first failing command and report where it failed.
  set -eE -o pipefail
  trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
fi
37
#######################################
# Print the help text and exit.
# Arguments: $1 - exit status (default 0)
# Outputs:   help text on stdout when the status is 0, otherwise on stderr
#            so that error output does not mix with regular output
# Returns:   never returns; exits with the given status
#######################################
usage() {
  local status=${1:-0}
  # We exit right after printing, so redirecting the whole shell is safe.
  [[ $status == 0 ]] || exec >&2
  cat <<EOF
usage: ${0##*/} [OPTS] start|stop NS_NAME
Nat a network namespace. systemd friendly

Also creates a mount namespace with a cloned /run/resolvconf.

-c, --create  Create a named network namespace. When running from
              the same network namespace as pid 1, this is set automatically.
              A systemd created private network is in a network namespace
              different than pid 1.
-n NETWORK    x.x.x /24 private network to use. If not specified, uses
              the first unused one starting at 10.173.1
-h, --help    Show this help and exit.

From within a systemd network namespace, nat it to the outside. This
would be called from ExecStartPre, and/or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.

Also create a named mount namespace under /root/mount_namespaces, so we
can alter some system config for this namespace. Subsequent systemd
command lines would be prefixed with:

/usr/bin/nsenter --mount=/root/mount_namespaces/NS_NAME

Note, this means that they can't run as unprivileged users, but once
systemd 233 comes out, it will have a bind mount option from within unit
files, so the mount namespace won't be needed for most use cases, and I
will update the script so that the mount namespace is not created unless a
flag is passed in. Patch welcome to add that flag before then.

A recommended dependency of this script is my other repo named "errhandle",
which prints stack trace on error, and calls a cleanup function:
https://iankelling.org/git/?p=errhandle, set ERRHANDLE_PATH, or put it
in a directory adjacent to the absolute, resolved directory this file is
in.

Background:

If we aren't creating a named network namespace, to join the namespace
with a shell, I use:
nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash

Note: if I knew how to easily ask systemd what pid a unit has, I would
do that.

If we do create the netns, to join it with a shell, we can do
/usr/bin/nsenter --mount=/root/mount_namespaces/NAME --net=/var/run/netns/NAME bash

"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having its own resolv.conf by using its user script which
calls resolvconf, this doesn't help much. What we actually want to do is
copy /run/resolvconf somewhere then bind mount it on top of
/run/resolvconf.

Note: for debugging, adding set -x is a pretty good option.

Please email me if you have patches, bugs, feedback, or republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
  exit "$status"
}
101
102
#### begin arg parsing ####
create=false
# Start from an empty value: the script re-executes itself with "sudo -E",
# so a stray "network" variable in the caller's environment would otherwise
# silently replace the automatic network selection done by find_network.
network=
# Explicit -o/-l/-- form of util-linux getopt; -n makes getopt's own error
# messages carry this script's name.
temp=$(getopt -n "${0##*/}" -o hcn: -l help,create -- "$@") || usage 1
# getopt emits a shell-quoted argument list; eval is the documented way to
# load it back into the positional parameters.
eval set -- "$temp"
while true; do
  case $1 in
    -c|--create) create=true; shift ;;
    -n) network=$2; shift 2 ;;
    -h|--help) usage ;;
    --) shift; break ;;
    *) echo "$0: Internal error!" >&2; exit 1 ;;
  esac
done
# Exactly two positional arguments are required: the action and the name.
if (( $# != 2 )); then
  usage 1
fi

action=$1
nn=$2 # namespace name
#### end arg parsing ####
123
124 #### begin sanity checking ####
# Verify that the external tools we depend on are installed. Each entry is
# "command:package"; every missing tool is reported before giving up so the
# user sees the complete list in one run.
install_error=false
for tool_pkg in ip:iproute2 iptables:iptables; do
  if ! type -p "${tool_pkg%%:*}" &>/dev/null; then
    echo "please install the ${tool_pkg##*:} package"
    install_error=true
  fi
done
if $install_error; then
  exit 1
fi
#### end sanity checking ####
137 #### end sanity checking ####
138
139
# Names of the veth pair: start moves $v0 into the default namespace and
# configures $v1 inside the target namespace.
v0="veth0-$nn"
v1="veth1-$nn"
# First two octets of the /24 networks that find_network probes.
ip_base=10.173

# If we share pid 1's network namespace there is no private namespace to
# nat yet, so behave as if --create had been given.
if [[ $create != true && $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
  create=true
fi

# Give the default network namespace a name ("default") so that
# "ip -n default" and "ip netns exec default" can address it.
target=/run/netns/default
if ! [[ -e $target || -L $target ]]; then
  mkdir -p /run/netns
  ln -s /proc/1/ns/net $target
fi
154
155
# Run ip inside the default (pid 1) network namespace.
ipd() { ip -n default "$@"; }
# Run ip inside the namespace being natted. The variant is chosen once, when
# this file is read: with --create the namespace is named $nn; otherwise we
# are already running inside the (unnamed) namespace.
if $create; then
  ipnn() { ip -n "$nn" "$@"; }
else
  ipnn() { ip "$@"; }
fi
# Execute an arbitrary command in the default network namespace.
dexec() { ip netns exec default "$@"; }
# Execute an arbitrary command in the persistent mount namespace of $nn.
mexec() { /usr/bin/nsenter --mount="/root/mount_namespaces/$nn" "$@"; }
169
170
# Find the interface carrying the default route of the default namespace;
# nat() masquerades traffic leaving through it.
# background: head -n1 is defensive. Not sure if there is some weird feature
# for 2 routes to be 0/0.
gateway_ifs=($(
  ipd route list exact 0/0 | head -n1 | sed -r 's/.*dev\s+(\S+).*/\1/'
))

# Without a default-route interface there is nothing to nat through; show
# the raw route listing to help diagnose why.
if (( ${#gateway_ifs[@]} == 0 )); then
  cat >&2 <<EOF
$0: error: failed to find gateway interface. No output from:
ipd route list exact 0/0 | head -n1| sed -r 's/.*dev\s+(\S+).*/\1/'
output from "ipd route list exact 0/0":
$(ipd route list exact 0/0)
EOF
  exit 1
fi
184
#######################################
# Apply an iptables operation to the MASQUERADE rule of every gateway
# interface, in the default network namespace.
# Globals:   gateway_ifs (read)
# Arguments: $1 - iptables rule operation: -A (add), -C (check), -D (delete)
# Returns:   status of the iptables call for the last gateway interface
#######################################
nat() {
  local op=$1 iface
  for iface in "${gateway_ifs[@]}"; do
    dexec iptables -t nat "$op" POSTROUTING -o "$iface" -j MASQUERADE \
      -m comment --comment "systemd network namespace nat"
  done
}
191
192 # d = default
#######################################
# Insert an iptables rule in the default namespace unless an identical rule
# is already present (d = default).
# Arguments: the rule specification, e.g. FORWARD -i IFACE -j ACCEPT
# Returns:   0 if the rule already existed, else the status of the insert
#######################################
diptables-add() {
  # -C only checks for the rule; its complaint when absent is not interesting.
  if dexec iptables -C "$@" &>/dev/null; then
    return 0
  fi
  dexec iptables -I "$@"
}
199
#######################################
# Choose the /24 network (first three octets) for the veth pair.
# Globals:
#   network  (read/written) kept as is when already set (the -n option),
#            otherwise set to the first $ip_base.N with N in 1..254 that has
#            no address in the default namespace
#   ip_base  (read)
#   found    (written) true when $network is usable, false when every
#            candidate was already in use
#   existing (written) true when at least one probed candidate was in use
#######################################
find_network() {
  if [[ $network ]]; then
    # A network chosen by the user is taken on trust. Setting "existing"
    # keeps stop() from removing a MASQUERADE rule that other namespaces
    # may still rely on. Both values match what callers previously got
    # from the variables being unset, just explicitly.
    found=true
    existing=true
    return
  fi
  local ips i
  found=false
  existing=false
  ips=$(ipd addr show | awk '$1 == "inet" {print $2}')
  for ((i = 1; i <= 254; i++)); do
    network=$ip_base.$i
    # Match "<network>." at the start of a line. The trailing dot matters:
    # without it 10.173.1 would also match 10.173.10.x .. 10.173.199.x.
    if [[ $'\n'$ips == *$'\n'"$network".* ]]; then
      existing=true
    else
      found=true
      break
    fi
  done
}
217
# Create the persistent mount namespace /root/mount_namespaces/$nn (if it
# does not exist yet) that mexec later enters.
_start_mount_namespace() {
  local ns_dir=/root/mount_namespaces
  mkdir -p "$ns_dir"
  if ! mountpoint "$ns_dir" >/dev/null; then
    mount --bind "$ns_dir" "$ns_dir"
  fi
  # note: This is outside the mount condition because I've mysteriously
  # had this become shared instead of private, perhaps it
  # got remounted somehow and lost the setting.
  mount --make-private "$ns_dir"
  if [[ ! -e $ns_dir/$nn ]]; then
    touch "$ns_dir/$nn"
  fi
  if ! mountpoint "$ns_dir/$nn" >/dev/null; then
    # Here, we specify that we only want mount changes under
    # this mountpoint to be propagated into the bind, but changes
    # from within the bind do not propagate to outside the bind.
    #
    # slave is documented in.
    # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
    # documentation on propagation is a bit weird because it
    # confusingly talks about binds, namespaces, and mirrors (which
    # seems to be just another name for bind), shared subtrees
    # (which seems to a term for binds and namespaces), and does not
    # properly specify whether the documentation applies to binds,
    # namespaces, or both. Notably, propagation for binds is marked
    # on the original mount point, and propagation for a mount
    # namespace is marked on mounts within the namespace.
    unshare --propagation slave --mount="$ns_dir/$nn" /bin/true
  fi
}

# Give the mount namespace its own copy of /run/resolvconf and point a
# dnsmasq loopback nameserver entry at the host end of the veth pair.
# Globals: nn, network (read); resolv_copy (written)
_start_resolvconf() {
  resolv_copy=/root/resolvconf-$nn

  # this condition should never happen, just coding defensively
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi
  cp -aT /run/resolvconf "$resolv_copy"
  if ! mexec mount -o bind "$resolv_copy" /run/resolvconf; then
    echo "error: resolv-conf bindmount failed" >&2
    exit 1
  fi
  # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
  # in the network namespace, so adjust the address.
  if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
    mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
    mexec resolvconf -u
  fi
  # and in debian based distros at least, it runs with --local-service, and needs a restart
  # to know about the new local network
  if [[ $(systemctl --no-pager show -p ActiveState dnsmasq) == ActiveState=active ]]; then
    systemctl restart dnsmasq
  fi

  # background: if we did this in openvpn's resolv-conf script, we could guard it in
  # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
  # and we could get $nn by
  # config_basename=${config%%.*}
  # config_basename=${config_basename##*/}
  # but dnsmasq forces us to do it earlier.
}

#######################################
# Bring the namespace up: mount namespace, optional named network
# namespace, veth pair, forwarding and nat rules, resolvconf copy.
# Globals: nn, v0, v1, create, network, found (read);
#          _errcatch_cleanup, resolv_copy (written)
# Outputs: error messages on stderr
# Returns: exits 1 when no network is free or the resolvconf bind mount fails
#######################################
start() {
  find_network
  if ! $found; then
    echo "$0: error: no open network found" >&2
    exit 1
  fi

  _start_mount_namespace

  if $create; then
    ip netns add "$nn"
    ip -n "$nn" link set dev lo up
  fi

  echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null

  # docker helpfully changes the default FORWARD to drop...
  diptables-add FORWARD -i "$v0" -j ACCEPT
  diptables-add FORWARD -o "$v0" -j ACCEPT

  # From here on there is state worth undoing, so name stop as the cleanup
  # function (presumably picked up by errhandle's errcatch on failure).
  _errcatch_cleanup=stop
  # The pair is created inside the target namespace; $v0 then moves to the
  # default namespace and becomes the gateway ($network.1) for $v1.
  ipnn link add "$v0" type veth peer name "$v1"
  ipnn link set "$v0" netns default
  ipd addr add "$network.1/24" dev "$v0"
  ipd link set "$v0" up
  nat -C &>/dev/null || nat -A
  ipnn addr add "$network.2/24" dev "$v1"
  ipnn link set "$v1" up
  ipnn route add default via "$network.1"

  _start_resolvconf
}
314
#######################################
# Undo what start set up: veth pair, nat and FORWARD rules, the named
# network namespace (when we created it) and the mount namespace.
# Globals: nn, v0, create (read); existing (read, set by find_network)
# Returns: 0 unless a teardown command fails under the script's error
#          handling; a missing FORWARD rule is tolerated
#######################################
stop() {
  if ipd link list "$v0" &>/dev/null; then
    # this also deletes $v1 and the route we added.
    ipd link del "$v0"
  fi
  find_network
  # Only drop the shared MASQUERADE rule when find_network saw no other
  # network in our range still in use.
  if ! $existing; then
    if nat -C &>/dev/null; then nat -D; fi
  fi
  # start adds both an -i and an -o FORWARD rule, so remove both.
  dexec iptables -D FORWARD -i "$v0" -j ACCEPT || :
  dexec iptables -D FORWARD -o "$v0" -j ACCEPT || :
  # Check for the namespace we were asked about, not a fixed name.
  if $create && [[ -e /var/run/netns/$nn ]]; then
    ip netns del "$nn"
  fi

  # not sure this is necessary since we are tearing down the mount namespace
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi

  if mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    umount "/root/mount_namespaces/$nn"
  fi
}
338
# Run the requested action; the function names match the action names.
if [[ $action == start || $action == stop ]]; then
  "$action"
else
  echo "$0: error: unsupported action"
  exit 1
fi