make dependency require manual install
[newns] / newns
1 #!/bin/bash
2 # Copyright (C) 2017 Ian Kelling
3
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7
8 # http://www.apache.org/licenses/LICENSE-2.0
9
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16
# Everything below needs root: re-run this same script through sudo,
# keeping the caller's environment (-E) and the original arguments.
if [[ $EUID != 0 ]]; then
  exec sudo -E "${BASH_SOURCE[0]}" "$@"
fi

# https://savannah.nongnu.org/projects/bash-bear-trap/
# errexit is enabled only while the library is loaded, so a missing or
# broken install aborts the script right here; it is switched off again
# immediately afterwards.
set -e
source /usr/local/lib/bash-bear
set +e
21
22
#######################################
# Print the help text and exit.
# Arguments: $1 - exit status (optional, default 0)
# Outputs:   help text on stdout when the status is 0 (help was asked
#            for), on stderr otherwise (help shown because of an error)
# Returns:   never returns; exits with the given status
#######################################
usage() {
  local status=${1:-0}
  local fd=1
  if [[ $status != 0 ]]; then
    fd=2
  fi
  cat >&"$fd" <<EOF
usage: ${0##*/} [OPTS] start|stop NS_NAME
Nat a network namespace. systemd friendly

Also creates a mount namespace with a cloned /run/resolvconf.

-c, --create Create or destroy a named network namespace. When running from
the same network namespace as pid 1, this is set automatically.
A systemd created private network is in a network namespace
different than pid 1.
-n NETWORK x.x.x /24 private network to use. If not specified, uses
the first unused one starting at 10.173.1
-h, --help Show this help and exit.

From a normal shell:

If we do create the netns, to join it with a shell, we can do (as root)
/usr/bin/nsenter --mount=/root/mount_namespaces/NAME --net=/var/run/netns/NAME bash

If you dont care about the mount namespace, you can leave that option off.


For systemd:

From within a systemd network namespace, we nat it to the outside. This
would be called from ExecStartPre, and or subsequent units called with
JoinsNamespaceOf= and PrivateNetwork=true.

If resolvconf is installed, we create a named mount namespace under
/root/mount_namespaces, so we can alter some system config for this
namespace. systemd command lines would be prefixed with:

/usr/bin/nsenter --mount=/root/mount_namespaces/NS_NAME

Note, this means that they can't run as unpriveledged users, but once
systemd 233 comes out, it will have a bind mount option from within unit
files, so the mount namespace won't be needed for most use cases, and I
will update the script to that the mount namespace not created unless a
flag is passed in. Patch welcome to add that flag before then.


This script has a dependency
https://savannah.nongnu.org/projects/bash-bear-trap/ . Search the script for "source" to see where to install or modify the installed location.


Background on this project (you can skip if you like):

If we aren't creating a named network namespace, to join the namespace
with a shell, I use:
nsenter -n -m -t \$(pgrep PROCESS_IN_NAMESPACE) bash

Note: if I knew how to easily ask systemd what pid a unit has, i would
do that.

"ip netns new ..." also does a mount namespace, then bind
mounts each file/dir in /etc/netns/NS_NAME to /etc/NS_NAME. Note,
for openvpn having it's own resolv.conf by using it's user script which
calls resolvconf, this doesn't help much. What we actually want to do is
copy /run/resolvconf somehwere then bind mount it on top of
/run/resolvconf.


Note: for debugging, adding set -x is a pretty good option.

Please email me if you have a patches, bugs, feedback, or republish this
somewhere else: Ian Kelling <ian@iankelling.org>.
EOF
  exit "$status"
}
93
94
#### begin arg parsing ####
# Sets: create (true/false), network (only when -n is given),
# action (first positional argument), nn (second positional argument).
create=false
# Explicit -o/-l/-- form of enhanced getopt: the option spec can never
# be confused with a user argument.
temp=$(getopt -o hcn: -l help,create -- "$@") || usage 1
# getopt emits a shell-quoted argument list; eval is the documented way
# to load it back into the positional parameters.
eval set -- "$temp"
while true; do
  case $1 in
    -c | --create)
      create=true
      shift
      ;;
    -n)
      network=$2
      shift 2
      ;;
    -h | --help)
      usage
      ;;
    --)
      shift
      break
      ;;
    *)
      # getopt only emits options from the spec above, so reaching this
      # arm means the spec and this case statement are out of sync.
      echo "$0: Internal error!" >&2
      exit 1
      ;;
  esac
done
# Exactly two positional arguments are required: the action and the name.
if (( $# != 2 )); then
  usage 1
fi

action=$1
nn=$2 # namespace name
#### end arg parsing ####
115
#### begin sanity checking ####
# Report every missing external tool (on stderr) before giving up, so
# the user can install them all in one go. Entries are command:package.
install_error=false
for dep in ip:iproute2 iptables:iptables; do
  if ! type -p "${dep%%:*}" &>/dev/null; then
    echo "please install the ${dep#*:} package" >&2
    install_error=true
  fi
done
if $install_error; then
  exit 1
fi
#### end sanity checking ####
130
# First two octets of the /24 networks that find_network scans.
ip_base=10.173
# Names of the two ends of the veth pair, derived from the namespace name.
v0="veth0-$nn"
v1="veth1-$nn"

# When we run in the same network namespace as pid 1 there is no
# pre-existing private namespace to nat, so behave as if --create was given.
if ! $create; then
  if [[ $(readlink /proc/self/ns/net) == "$(readlink /proc/1/ns/net)" ]]; then
    create=true
  fi
fi
138
# Give pid 1's network namespace the name "default", so that
# "ip -n default" and "ip netns exec default" can address it.

mkdir -p /run/netns
target=/run/netns/default
if ! [[ -e $target || -L $target ]]; then
  # -f so that two concurrent runs racing to create the link do not fail
  ln -sf /proc/1/ns/net "$target"
fi
147
# Run ip with the given arguments against the network namespace named
# "default". Returns ip's exit status.
ipd() {
  local ns=default
  ip -n "$ns" "$@"
}
149
150
# Extra ip arguments that select the target namespace. They stay empty
# when we did not create a named namespace: otherwise we are already in
# the network namespace and it's unnamed.
# Kept as an array so the namespace name is passed as exactly one
# argument, with no word splitting or globbing.
ipnnargs=()
if $create; then
  ipnnargs=(-n "$nn")
fi
# run ip in the network namespace
ipnn() { ip "${ipnnargs[@]}" "$@"; }
157
# default network namespace exec: run the given command line inside the
# network namespace named "default". Returns the exit status of ip.
dexec() {
  local -a prefix=(netns exec default)
  ip "${prefix[@]}" "$@"
}
# mount namespace exec: run the given command line inside the mount
# namespace pinned at /root/mount_namespaces/$nn.
# The path is quoted so the namespace name is never split or globbed.
mexec() {
  /usr/bin/nsenter --mount="/root/mount_namespaces/$nn" "$@"
}
162
163
#######################################
# Check, add or delete the MASQUERADE rule for our /24, in the default
# network namespace.
# Globals:   network (read) - first three octets of the private network
# Arguments: $1 - iptables rule command; callers pass -C, -A or -D
# Returns:   exit status of iptables
#######################################
nat() {
  # note, in a previous commit i specified the output interface with -o,
  # but that broke things when my gateway interface changed, and I can't
  # see any advantage to it, so I removed it.
  dexec iptables -t nat "$1" POSTROUTING -s "$network.0/24" -j MASQUERADE \
    -m comment --comment "systemd network namespace nat"
}
171
# d = default
# Insert an iptables rule in the default network namespace unless an
# identical rule is already present.
# Arguments: the rule specification (chain followed by match/target args)
# Returns:   0 when the rule already exists, else the status of the insert
diptables-add() {
  # -C only checks for the rule; its output is irrelevant here.
  dexec iptables -C "$@" &>/dev/null && return
  dexec iptables -I "$@"
}
179
#######################################
# Choose the /24 to use when the caller did not pass one with -n.
# Scans $ip_base.1 .. $ip_base.254 and stops at the first prefix that no
# address in the default network namespace uses.
# Globals:
#   network  (read/written) - if already set it is kept and nothing
#                             else is touched; else set to the candidate
#   ip_base  (read)         - first two octets of the candidates
#   found    (written)      - true when a free /24 was found
#   existing (written)      - true when at least one candidate before
#                             the chosen one was in use
# Note: when network is preset, found and existing are left as they were.
#######################################
find_network() {
  if [[ $network ]]; then
    return
  fi
  local ips i
  found=false
  existing=false
  ips=$(ipd addr show | awk '$1 == "inet" {print $2}')
  for ((i = 1; i <= 254; i++)); do
    network=$ip_base.$i
    # The trailing "\." anchors the match on the whole third octet;
    # without it 10.173.1 would also match 10.173.10.x or 10.173.100.x.
    if grep -q "^${network//./\\.}\\." <<<"$ips"; then
      existing=true
    else
      found=true
      break
    fi
  done
}
197
#######################################
# Bring the namespace up: pin a mount namespace, optionally create the
# named network namespace, connect it to the default namespace with a
# veth pair, nat it, and give it its own copy of /run/resolvconf.
# Globals (read):    nn, create, v0, v1, found
# Globals (written): network (through find_network)
# Outputs:  error messages on stderr
# Returns:  exits 1 when no free network is found or the resolvconf
#           bind mount fails
#######################################
start() {
  find_network
  if ! $found; then
    echo "$0: error: no open network found" >&2
    exit 1
  fi

  #### begin mount namespace setup ####
  mkdir -p /root/mount_namespaces
  if ! mountpoint /root/mount_namespaces >/dev/null; then
    mount --bind /root/mount_namespaces /root/mount_namespaces
  fi
  # note: This is outside the mount condition because I've mysteriously
  # had this become shared instead of private, perhaps it
  # got remounted somehow and lost the setting.
  mount --make-private /root/mount_namespaces
  if [[ ! -e /root/mount_namespaces/$nn ]]; then
    touch "/root/mount_namespaces/$nn"
  fi
  if ! mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    # Here, we specify that we only want mount changes under
    # this mountpoint to be propagated into the bind, but changes
    # from within the bind do not propagate to outside the bind.
    #
    # slave is documented in
    # /usr/share/doc/linux-doc-4.9/Documentation/filesystems/sharedsubtree.txt.gz
    # documentation on propagation is a bit weird because it
    # confusingly talks about binds, namespaces, and mirrors (which
    # seems to be just another name for bind), shared subtrees
    # (which seems to be a term for binds and namespaces), and does not
    # properly specify whether the documentation applies to binds,
    # namespaces, or both. Notably, propagation for binds is marked
    # on the original mount point, and propagation for a mount
    # namespace is marked on mounts within the namespace.
    unshare --propagation slave --mount="/root/mount_namespaces/$nn" /bin/true
  fi
  #### end mount namespace setup ####

  if $create; then
    ip netns add "$nn"
    ip -n "$nn" link set dev lo up
  fi

  # dd performs the write inside the default network namespace.
  echo 1 | dexec dd of=/proc/sys/net/ipv4/ip_forward status=none

  # docker helpfully changes the default FORWARD to drop...
  diptables-add FORWARD -i "$v0" -j ACCEPT
  diptables-add FORWARD -o "$v0" -j ACCEPT

  # From here on a failure should tear down what was set up so far.
  # NOTE(review): err-cleanup is presumably the hook bash-bear invokes
  # from its error trap - confirm against the bash-bear documentation.
  err-cleanup() { stop; }
  ipnn link add "$v0" type veth peer name "$v1"
  ipnn link set "$v0" netns default
  ipd addr add "$network.1/24" dev "$v0"
  ipd link set "$v0" up
  nat -C &>/dev/null || nat -A
  ipnn addr add "$network.2/24" dev "$v1"
  ipnn link set "$v1" up
  ipnn route add default via "$network.1"

  ###### begin setup resolvconf
  if [[ -e /run/resolvconf ]]; then # resolvconf probably installed
    local resolv_copy=/root/resolvconf-$nn

    # this condition should never happen, just coding defensively
    if mexec mountpoint /run/resolvconf &>/dev/null; then
      mexec umount /run/resolvconf
    fi
    cp -aT /run/resolvconf "$resolv_copy"
    if ! mexec mount -o bind "$resolv_copy" /run/resolvconf; then
      echo "error: resolv-conf bindmount failed" >&2
      exit 1
    fi
    # if running dnsmasq, we have 127.0.0.1 for dns, but it can't listen on the loopback
    # in the network namespace, so adjust the address.
    if mexec [ -s /run/resolvconf/interface/lo.dnsmasq ]; then
      mexec sed --follow-symlinks -i "s/nameserver 127\..*/nameserver $network.1/" /run/resolvconf/interface/lo.dnsmasq
      mexec resolvconf -u
    fi
    # and in debian based distros at least, it runs with --local-service, and needs a restart
    # to know about the new local network
    if [[ $(systemctl --no-pager show -p ActiveState dnsmasq) == ActiveState=active ]]; then
      systemctl restart dnsmasq
    fi

    # background: if we did this in openvpn's resolv-conf script, we could guard it in
    # if capsh --print|grep '\bcap_sys_admin\b' &>/dev/null
    # and we could get $nn by
    # config_basename=${config%%.*}
    # config_basename=${config_basename##*/}
    # but dnsmasq forces us to do it earlier.

  fi # end if [[ -e /run/resolvconf ]]
  ###### end setup resolvconf
}
297
#######################################
# Tear down what start set up: the veth pair, the nat rule, both FORWARD
# accept rules, the named network namespace (if we created it) and the
# pinned mount namespace.
# Globals (read): v0, nn, create, existing
# Returns: status of the last teardown step; missing pieces are skipped
#######################################
stop() {
  if ipd link list "$v0" &>/dev/null; then
    # this also deletes $v1 and the route we added.
    ipd link del "$v0"
  fi
  find_network
  # NOTE(review): existing comes from find_network's scan and is not
  # refreshed when network was already set - confirm this is intended.
  if ! $existing; then
    if nat -C &>/dev/null; then nat -D; fi
  fi
  # start inserts both an -i and an -o rule, so both must be removed;
  # deleting a rule that is absent is not an error here.
  dexec iptables -D FORWARD -i "$v0" -j ACCEPT &>/dev/null || :
  dexec iptables -D FORWARD -o "$v0" -j ACCEPT &>/dev/null || :
  if $create && [[ -e /var/run/netns/$nn ]]; then
    ip netns del "$nn"
  fi

  # not sure this is necessary since we are tearing down the mount namespace
  if mexec mountpoint /run/resolvconf &>/dev/null; then
    mexec umount /run/resolvconf
  fi

  if mountpoint "/root/mount_namespaces/$nn" >/dev/null; then
    umount "/root/mount_namespaces/$nn"
  fi
}
321
# Dispatch: the action name doubles as the name of the function to run.
case $action in
  start | stop)
    "$action"
    ;;
  *)
    echo "$0: error: unsupported action" >&2
    exit 1
    ;;
esac