05e1068b5c5ebe20f377fff62e1aa13bf1f91ea5
[newns] / newns
1 #!/bin/bash
2 # Copyright (C) 2017 Ian Kelling
3
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7
8 # http://www.apache.org/licenses/LICENSE-2.0
9
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16
# Everything below needs root: re-run this script under sudo, keeping the
# caller's environment (-E), when we are not root already.
[[ $EUID == 0 ]] || exec sudo -E "$BASH_SOURCE" "$@"

# Directory that holds the real (symlink-resolved) script file.
tmp="$(readlink -f "${BASH_SOURCE}")"; script_dir="${tmp%/*}"

# Load the errhandle dependency. An exported ERRHANDLE_PATH wins; otherwise
# look for a checkout of errhandle next to this script's directory.
if [[ ! $ERRHANDLE_PATH ]]; then
  ERRHANDLE_PATH="$script_dir"/../errhandle/err
fi
if [[ -s $ERRHANDLE_PATH ]]; then
  # Quoted so a path containing whitespace is sourced as one file.
  source "$ERRHANDLE_PATH"
else
  # Fall back to downloading the file into the script's own directory.
  # NOTE(review): ERRHANDLE_PATH never points at this downloaded copy, so
  # the download is repeated on every run unless ../errhandle/err exists.
  if ! cd "$script_dir"; then
    echo "$0: failed to cd to $script_dir" >&2
    exit 1
  fi
  if ! wget -O err 'https://iankelling.org/git/?p=errhandle;a=blob_plain;f=err;hb=HEAD'; then
    echo "$0: failed to get errhandle dependency" >&2
    exit 1
  fi
  # An explicit path is required here: "source err" (no slash) makes bash
  # search PATH first and could pick up an unrelated file named "err".
  source "$script_dir/err"
fi
33
#######################################
# Print the help text and exit.
# Arguments: $1 - exit status, defaults to 0
# Outputs:   help text on stdout when the status is 0 (help was asked
#            for), on stderr otherwise (help is part of an error report)
# Returns:   never returns; exits the shell with the given status
#######################################
usage() {
  local status=${1:-0}
  local out_fd=1
  if [[ $status != 0 ]]; then
    out_fd=2
  fi
  cat >&"$out_fd" <<EOF
usage: ${0##*/} [OPTS] start|stop NS_NAME
Nat a network namespace. systemd friendly

Also creates a mount namespace with a cloned /run/resolvconf.

-c, --create Create a named network namespace. When running from
the same network namespace as pid 1, this is set automatically.
A systemd created private network is in a network namespace
different than pid 1.
-n NETWORK x.x.x /24 private network to use. If not specified, uses
the first unused one starting at 10.173.1
-h, --help Show this help and exit.

From a normal shell:

If we do create the netns, to join it with a shell, we can do (as root)
/usr/bin/nsenter --mount=/root/mount_namespaces/NAME --net=/var/run/netns/NAME bash

If you dont care about the mount namespace, you can leave that option off.


For systemd:

From within a systemd network namespace, we nat it to the outside. This
would be called from ExecStartPre, and or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.

We also create a named mount namespace under /root/mount_namespaces, so we
can alter some system config for this namespace. systemd
command lines would be prefixed with:

/usr/bin/nsenter --mount=/root/mount_namespaces/NS_NAME

Note, this means that they can't run as unpriveledged users, but once
systemd 233 comes out, it will have a bind mount option from within unit
files, so the mount namespace won't be needed for most use cases, and I
will update the script to that the mount namespace not created unless a
flag is passed in. Patch welcome to add that flag before then.

This script has a dependency which you can download manually or it
will be automatically downloaded into the same directory.
It handles errors by printing stack trace and and cleaning up the namespaces.
To download manually,
git clone https://iankelling.org/git/errhandle
into an adjacent directory, or
export ERRHANDLE_PATH to point to the 'err' file in that repo.


Background on this project (you can skip if you like):

If we aren't creating a named network namespace, to join the namespace
with a shell, I use:
nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash

Note: if I knew how to easily ask systemd what pid a unit has, i would
do that.

"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having it's own resolv.conf by using it's user script which
calls resolvconf, this doesn't help much. What we actually want to do is
copy /run/resolvconf somehwere then bind mount it on top of
/run/resolvconf.


Note: for debugging, adding set -x is a pretty good option.

Please email me if you have a patches, bugs, feedback, or republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
  exit "$status"
}
108
109
#### begin arg parsing ####
create=false
# Start empty: the script re-executes itself with "sudo -E", so without
# this a "network" variable in the caller's environment would silently
# act as if -n had been given.
network=
# Explicit -o/-- form; equivalent to passing the short option string as the
# first non-option argument, but not dependent on argument order.
temp=$(getopt -o hcn: -l help,create -- "$@") || usage 1
# getopt emits a shell-quoted, normalized argument list; load it back into
# the positional parameters.
eval set -- "$temp"
while true; do
  case $1 in
    -c|--create) create=true; shift ;;
    -n) network=$2; shift 2 ;;
    -h|--help) usage ;;
    --) shift; break ;;
    *) echo "$0: Internal error!" >&2; exit 1 ;;
  esac
done
# Exactly two operands are required: the action and the namespace name.
if (( $# != 2 )); then
  usage 1
fi

action=$1
nn=$2 # namespace name
#### end arg parsing ####
130
#### begin sanity checking ####
# Report every missing tool before giving up, so the user can install them
# all in one go. Diagnostics go to stderr.
install_error=false
if ! type -p ip &>/dev/null; then
  echo "please install the iproute2 package" >&2
  install_error=true
fi
if ! type -p iptables &>/dev/null; then
  echo "please install the iptables package" >&2
  install_error=true
fi
if $install_error; then
  exit 1
fi
#### end sanity checking ####
145
146
# Prefix from which automatically chosen /24 networks are taken.
ip_base=10.173
# Names of the two ends of the veth pair used for this namespace.
v0=veth0-$nn
v1=veth1-$nn

# When we share pid 1's network namespace there is no separate namespace
# yet, so act as if --create had been given.
if ! $create; then
  if [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
    create=true
  fi
fi

# Give pid 1's network namespace the name "default" so that it can be
# addressed with "ip -n default" and "ip netns exec default".
target=/run/netns/default
if ! [[ -e $target || -L $target ]]; then
  mkdir -p /run/netns
  ln -s /proc/1/ns/net $target
fi
161
162
# Run ip inside the network namespace named "default".
ipd() { ip -n default "$@"; }
if $create; then
  # Run ip inside the named network namespace $nn.
  ipnn() { ip -n "$nn" "$@"; }
else
  # We are already inside the (unnamed) network namespace, so plain ip
  # operates on it.
  ipnn() { ip "$@"; }
fi
# Run an arbitrary command inside the "default" network namespace.
dexec() { ip netns exec default "$@"; }
# Run an arbitrary command inside the mount namespace kept for $nn.
mexec() { /usr/bin/nsenter --mount="/root/mount_namespaces/$nn" "$@"; }
176
177
# Find the interface that carries the default route in the default network
# namespace; traffic from the namespace is masqueraded out of it.
#
# background: head -n1 is defensive. Not sure if there is some weird feature
# for 2 routes to be 0/0.
#
# sed runs with -n and the p flag so that a route line without a "dev" field
# produces no output, instead of being passed through unchanged and then
# mistaken for interface names.
gateway_if=$(ipd route list exact 0/0 | head -n1 \
  | sed -rn 's/.*dev\s+(\S+).*/\1/p')
gateway_ifs=()
if [[ $gateway_if ]]; then
  gateway_ifs=("$gateway_if")
fi

if (( ${#gateway_ifs[@]} == 0 )); then
  cat >&2 <<EOF
$0: error: failed to find gateway interface. No output from:
ipd route list exact 0/0 | head -n1| sed -rn 's/.*dev\s+(\S+).*/\1/p'
output from "ipd route list exact 0/0":
$(ipd route list exact 0/0)
EOF
  exit 1
fi
191
#######################################
# Apply one iptables operation to the MASQUERADE rule of every gateway
# interface, inside the default network namespace.
# Globals:   gateway_ifs (read)
# Arguments: $1 - iptables rule command, e.g. -A (append), -C (check),
#                 -D (delete)
# Returns:   status of the last iptables invocation
#######################################
nat() {
  local gw_if
  for gw_if in "${gateway_ifs[@]}"; do
    dexec iptables -t nat "$1" POSTROUTING -o "$gw_if" -j MASQUERADE \
      -m comment --comment "systemd network namespace nat"
  done
}
198
# d = default
#
# Insert an iptables rule in the default network namespace, unless an
# identical rule is already there.
# Arguments: chain followed by the rule specification, passed unchanged to
#            "iptables -C" and, if that check fails, to "iptables -I".
# Returns:   0 if the rule already existed, otherwise the insert's status.
diptables-add() {
  dexec iptables -C "$@" &>/dev/null || dexec iptables -I "$@"
}
206
#######################################
# Choose the /24 network (its first three octets) for the namespace and
# report whether other networks from the automatic range are in use.
# Globals read:
#   ip_base  - two-octet prefix of the automatic range
#   network  - if already non-empty (given with -n, or chosen by an
#              earlier call) it is kept unchanged
# Globals written:
#   network  - first $ip_base.N (N = 1..254) with no address in the default
#              network namespace
#   found    - true if such a network was found, else false
#   existing - true if any address inside $ip_base.* is configured in the
#              default network namespace, else false. stop() uses this to
#              decide whether the shared NAT rule may be removed.
#######################################
find_network() {
  local ips i
  if [[ $network ]]; then
    # Nothing to search for. Keep the results of an earlier call if there
    # are any; otherwise use the values the callers effectively assumed
    # when these were left unset: the network is usable, and the NAT rule
    # is not ours to remove.
    [[ $found == true || $found == false ]] || found=true
    [[ $existing == true || $existing == false ]] || existing=true
    return
  fi
  found=false
  existing=false
  ips=$(ipd addr show | awk '$1 == "inet" {print $2}')
  # Any address in the automatic range means another namespace network is
  # still up, not just one on the first candidate.
  if grep -q "^${ip_base//./\\.}\." <<<"$ips"; then
    existing=true
  fi
  for ((i = 1; i <= 254; i++)); do
    network=$ip_base.$i
    # The trailing "\." keeps e.g. 10.173.1 from matching 10.173.10.x.
    if ! grep -q "^${network//./\\.}\." <<<"$ips"; then
      found=true
      break
    fi
  done
}
224
#######################################
# Bring the namespace up: persistent mount namespace, (optionally) the
# named network namespace, a veth pair NATed through the default network
# namespace, and a private copy of /run/resolvconf.
# Globals read:    nn, v0, v1, create, found, network
# Globals written: whatever find_network sets; defines err-cleanup
# Outputs:         error messages on stderr
# Returns:         exits 1 if no network is free or the resolvconf bind
#                  mount fails
#######################################
start() {
  local resolv_copy

  find_network
  # String comparison instead of running $found as a command; only an
  # explicit "false" means the search failed.
  if [[ $found == false ]]; then
    echo "$0: error: no open network found" >&2
    exit 1
  fi

  #### begin mount namespace setup ####
  mkdir -p /root/mount_namespaces
  if ! mountpoint /root/mount_namespaces >/dev/null; then
    mount --bind /root/mount_namespaces /root/mount_namespaces
  fi
  # note: This is outside the mount condition because I've mysteriously
  # had this become shared instead of private, perhaps it
  # got remounted somehow and lost the setting.
  mount --make-private /root/mount_namespaces
  if [[ ! -e /root/mount_namespaces/$nn ]]; then
    touch "/root/mount_namespaces/$nn"
  fi
  if ! mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    # Here, we specify that we only want mount changes under
    # this mountpoint to be propagated into the bind, but changes
    # from within the bind do not propagate to outside the bind.
    #
    # slave is documented in
    # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
    # Documentation on propagation is a bit weird because it
    # confusingly talks about binds, namespaces, and mirrors (which
    # seems to be just another name for bind), shared subtrees
    # (which seems to be a term for binds and namespaces), and does not
    # properly specify whether the documentation applies to binds,
    # namespaces, or both. Notably, propagation for binds is marked
    # on the original mount point, and propagation for a mount
    # namespace is marked on mounts within the namespace.
    unshare --propagation slave --mount="/root/mount_namespaces/$nn" /bin/true
  fi
  #### end mount namespace setup ####

  if $create; then
    ip netns add "$nn"
    ip -n "$nn" link set dev lo up
  fi

  # Enable IPv4 forwarding in the default network namespace.
  echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward 2>/dev/null

  # docker helpfully changes the default FORWARD to drop...
  diptables-add FORWARD -i "$v0" -j ACCEPT
  diptables-add FORWARD -o "$v0" -j ACCEPT

  # From here on a failure should tear down what was set up so far.
  # NOTE(review): err-cleanup is presumably the hook invoked by the sourced
  # errhandle script on error -- confirm against that script.
  err-cleanup() { stop; }
  # The pair is created inside the namespace; $v0 is then moved to the
  # default namespace and becomes the namespace's gateway.
  ipnn link add "$v0" type veth peer name "$v1"
  ipnn link set "$v0" netns default
  ipd addr add "$network.1/24" dev "$v0"
  ipd link set "$v0" up
  nat -C &>/dev/null || nat -A
  ipnn addr add "$network.2/24" dev "$v1"
  ipnn link set "$v1" up
  ipnn route add default via "$network.1"

  ###### begin setup resolvconf
  # Skipped when /run/resolvconf is missing (resolvconf probably not
  # installed).
  if [[ -e /run/resolvconf ]]; then
    resolv_copy=/root/resolvconf-$nn

    # this condition should never happen, just coding defensively
    if mexec mountpoint /run/resolvconf &>/dev/null; then
      mexec umount /run/resolvconf
    fi
    cp -aT /run/resolvconf "$resolv_copy"
    if ! mexec mount -o bind "$resolv_copy" /run/resolvconf; then
      echo "error: resolv-conf bindmount failed" >&2
      exit 1
    fi
    # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
    # in the network namespace, so adjust the address.
    if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
      mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
      mexec resolvconf -u
    fi
    # and in debian based distros at least, it runs with --local-service, and needs a restart
    # to know about the new local network
    if [[ $(systemctl --no-pager show -p ActiveState dnsmasq ) == ActiveState=active ]]; then
      systemctl restart dnsmasq
    fi

    # background: if we did this in openvpn's resolv-conf script, we could guard it in
    # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
    # and we could get $nn by
    # config_basename=${config%%.*}
    # config_basename=${config_basename##*/}
    # but dnsmasq forces us to do it earlier.

  fi # end if [[ -e /run/resolvconf ]]
  ###### end setup resolvconf
}
324
#######################################
# Tear down what start() set up: the veth pair, the NAT rule (only when no
# other namespace network remains), both FORWARD rules, the named network
# namespace (if we created it), and the persistent mount namespace.
# Safe to call on a partially set up namespace; missing pieces are skipped.
# Globals read: nn, v0, create, existing (as set by find_network)
# Returns:      status of the final umount, or 0 if there was nothing to
#               unmount
#######################################
stop() {
  if ipd link list "$v0" &>/dev/null; then
    # this also deletes $v1 and the route we added.
    ipd link del "$v0"
  fi
  find_network
  # Only an explicit "false" allows removing the shared NAT rule; compared
  # as a string rather than executed as a command.
  if [[ $existing == false ]]; then
    if nat -C &>/dev/null; then nat -D; fi
  fi
  # start() adds an -i and an -o FORWARD rule; remove both. Failure just
  # means the rule was not there.
  dexec iptables -D FORWARD -i "$v0" -j ACCEPT &>/dev/null ||:
  dexec iptables -D FORWARD -o "$v0" -j ACCEPT &>/dev/null ||:
  if $create && [[ -e /var/run/netns/$nn ]]; then
    ip netns del "$nn"
  fi

  # not sure this is necessary since we are tearing down the mount namespace
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi

  if mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    umount "/root/mount_namespaces/$nn"
  fi
}
348
# Run the function named after the requested action; anything other than
# start or stop is rejected with a message on stderr.
case "$action" in
  start|stop)
    "$action"
    ;;
  *)
    echo "$0: error: unsupported action" >&2
    exit 1
    ;;
esac