fix vpn host naming
[distro-setup] / system-status
1 #!/bin/bash
2 # Copyright (C) 2019 Ian Kelling
3 # SPDX-License-Identifier: AGPL-3.0-or-later
4
# usage: runs once every 15 seconds. If any args are passed, it
# instead runs just once with verbose output. On battery power, runs
# once per minute.
8
# Refuse to run under any shell other than bash.
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

# This script expects to be run as uid 1000.
if [[ $EUID != 1000 ]]; then
  # Diagnostics go to stderr, consistent with the bash check above.
  echo "$0: error, expected to be user 1000" >&2
  exit 1
fi
15
# Pull in the bash-bear error handling helper.
. /a/bin/bash-bear-trap/bash-bear
# File that write-status fills in.
status_file=/dev/shm/iank-status

# Globbing behavior used throughout the script: unmatched globs expand
# to nothing, globs match dotfiles, and extended patterns are enabled.
shopt -s nullglob dotglob extglob

# Append every per-version ruby gem bin directory to PATH.
for p in ~/.gem/ruby/*/bin; do
  PATH+=":$p"
done
26
27
# Any non-empty first argument turns on verbose output.
if [[ -z $1 ]]; then
  verbose=false
else
  verbose=true
fi
# Print the arguments as one line, but only when $verbose is true.
v() {
  $verbose || return 0
  printf "%s\n" "$*"
}
# Print all arguments as a single line.
p() {
  local line="$*"
  printf "%s\n" "$line"
}
# log-once COUNT NAME [MESSAGE]
#
# Runs log-once with the given arguments (stdin is inherited) and, if
# it produces any output, mails that output to root@localhost with
# NAME in the subject. Does nothing when ifne is not installed.
lo() {
  type -p ifne &>/dev/null || return 0
  /usr/local/bin/log-once "$@" \
    | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
}
44
# Same arguments as log-once: COUNT NAME [MESSAGE].
#
# Runs log-once with the given arguments (stdin is inherited) and, if
# it produces any output, mails that output to daylert@iankelling.org
# with NAME in the subject. Does nothing when ifne is not installed.
loday() {
  type -p ifne &>/dev/null || return 0
  /usr/local/bin/log-once "$@" \
    | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
}
# rm glob: remove the given paths. Does nothing when called with no
# arguments, e.g. when a glob expanded to nothing.
rmg() {
  if (( $# )); then
    # -- so that a path beginning with "-" is never parsed as an option.
    rm -f -- "$@"
  fi
}
56
57 # todo, consider migrating some of these alerts into prometheus
58 write-status() {
59 chars=("${first_chars[@]}")
60
61 services=( epanicclean )
62 case $HOSTNAME in
63 bk|je|li) : ;;
64 *)
65 services+=(
66 systemstatus
67 btrfsmaintstop
68 dynamicipupdate
69 )
70 bads=()
71 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
72 for s in ${services[@]}; do
73 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
74 bads+=($s)
75 fi
76 done
77 chars+=(MYSERS)
78 fi
79 p ${bads[*]} | lo -240 mysers
80 ;;
81 esac
82
83 case $HOSTNAME in
84 kd)
85 services=(
86 prometheus-node-exporter
87 prometheus-alertmanager
88 prometheus
89 )
90 bads=()
91 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
92 for s in ${services[@]}; do
93 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
94 bads+=($s)
95 fi
96 done
97 chars+=(PROM)
98 fi
99 p ${bads[*]} | lo -240 prom
100 ;;
101 esac
102
103
104 # this section copied from servicepid()
105 unit=exim4
106 pid=$(systemctl show --property MainPID --value $unit ||:)
107 case $pid in
108 [1-9]*) : ;;
109 *)
110 dir=/sys/fs/cgroup/system.slice
111 if [[ ! -d $dir ]]; then
112 dir=/sys/fs/cgroup/systemd/system.slice
113 fi;
114 pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:)
115 ;;
116 esac
117 if [[ ! $pid ]]; then
118 chars+=(EXIM)
119 fi
120
121
122 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
123 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
124 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
125 fi
126
127
128 ## check if last snapshot was recent
129 old_snap_limit=$(( 3 * 60 * 60 ))
130 vol=o
131 btrbk_root=/mnt/o/btrbk
132 # this section generally copied from btrbk scripts, but
133 # this part modified to speed things up by about half a second.
134 # I'm not sure if its quite as reliable, but it looks pretty safe.
135 # Profiled it using time and also adding to the top of the file:
136 # set -x
137 # PS4='+ $(date "+%2N") '
138 # allow failure in case there are no snapshots yet.
139 shopt -s nullglob
140 files=($btrbk_root/$vol.20*)
141 shopt -u nullglob
142 if (( ${#files[@]} )); then
143 # shellcheck disable=SC2012 # using ls version sort. not sure this is needed.
144 snaps=("$(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )")
145 now=$EPOCHSECONDS
146 maxtime=0
147 for s in ${snaps[@]}; do
148 file=${s##*/}
149 t=$(date -d "$(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#"$vol."})" +%s)
150 if (( t > maxtime )); then
151 maxtime=$t
152 fi
153 done
154 snapshotmsg=
155 last_snap_age=$(( now - maxtime ))
156 last_snap_hours=$(( last_snap_age / 60 / 60 ))
157 if (( last_snap_age > old_snap_limit )); then
158 chars+=(OLD-SNAP-${last_snap_hours}h)
159 snapshotmsg="/$vol snapshot older than 4 hours"
160 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
161 p "$snapshotmsg" | lo -1 old-snapshot
162 fi
163 # not bothering to get info on all volumes if we find an old one.
164 fi
165 fi
166
167
168 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
169
170 bouncemsg=
171 glob=(/m/md/bounces/new/*)
172 if [[ -e ${glob[0]} ]]; then
173 chars+=(BOUNCE)
174 bouncemsg="message in /m/md/bounces/new"
175 fi
176 p $bouncemsg | loday -1 bounce
177 # emails without the S (seen) flag. this only checks the last flag,
178 # but its good enough for me.
179 glob=(/m/md/alerts/{new,cur}/!(*,S))
180 if [[ -e ${glob[0]} ]]; then
181 chars+=(A)
182 fi
183
184 glob=(/m/md/daylert/{new,cur}/!(*,S))
185 if [[ -e ${glob[0]} ]]; then
186 chars+=(DAY)
187 fi
188
189 bbkmsg=
190 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
191 chars+=(BTRBK.TIMER)
192 bbkmsg="not enabled"
193 fi
194 p "$bbkmsg" | lo -480 btrbk.timer
195
196
197
198 # commented out, only using timetrap retrospectively.
199 # # clock us out in timetrap if are idle too long
200 # if [[ -e /p/.timetrap.db ]]; then
201 # export DISPLAY=:0
202 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
203 # if [[ $xidle == [0-9]* ]]; then
204 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
205 # idle=300000
206 # if [[ $sheet == w ]]; then
207 # idle=900000
208 # fi
209 # if [[ $sheet && $xidle -gt $idle ]]; then
210 # timetrap out
211 # fi
212 # fi
213 # fi
214 # fi
215 else # end if $MAIL_HOST
216 rmg /home/iank/cron-errors/bounce* \
217 /home/iank/cron-errors/btrbk.timer* \
218 /home/iank/cron-errors/old-snapshot*
219 fi
220
221 if ip l show tunfsf &>/dev/null; then
222 # this is for tracking dns over tls issue, which
223 # fixvpndns() in brc2 fixes.
224 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
225 read -r _ _ _ istls <<<"$stat"
226 case $istls in
227 no) : ;;
228 *)
229 printf "%s\n" "$istls" | ts >> /tmp/istls.log
230 chars+=("T:$istls")
231 ;;
232 esac
233 fi
234
235 # We do this once every 5 minutes, since this is not a grave problem.
236 # For formatted elisp, see /b/ds/unsaved-buffers.el
237 elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
238 if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then
239 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
240 # i dun care if this fails
241 emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)"
242 if [[ $emacsfiles ]]; then
243 chars+=("$emacsfiles")
244 fi
245 fi
246 last_emacs_check=$EPOCHSECONDS
247 fi
248
249
250 glob=(/nocow/btrfs-stale/*)
251 if [[ -e ${glob[0]} ]]; then
252 chars+=(STALE)
253 fi
254 var_mail_msg=
255 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
256 var_mail_msg="message in /var/mail"
257 fi
258 p $var_mail_msg | loday -1 var_mail
259
260 # Note, early in install process, we dont have permission yet for exiqgrep.
261 #
262 # todo: don't do this every 15 seconds, more like once every 2 minutes to
263 # save cpu cycles.
264 #
265 # 2400 = 40 mins. This should allow for system restarts, and
266 # 30 minute message delay plus 10 minute queu runs.
267 qlen=$(/usr/sbin/exiqgrep -o 2400 -c -b | awk '{print $1}') ||:
268 qmsg=
269 if ((qlen)); then
270 # Do sending of long delayed messages, and dont count them in our queue warnings.
271 for mid in $(exiqgrep -o 2400 -zi); do
272 if exim -Mvh $mid | awk 'tolower($2) == "fdate:"' | grep -q .; then
273 qlen=$(( qlen - 1 ))
274 # shellcheck disable=SC2016 # exim var, not a bash bar
275 if (( $(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s) < EPOCHSECONDS )); then
276 if ip a show veth0-mail &>/dev/null; then
277 pid=$(pgrep -f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/my.conf"|head -n1);
278 nsenter -t $pid -n -m /usr/sbin/exim4 -C /etc/exim4/my.conf -M $mid
279 else
280 /usr/sbin/exim4 -M $mid
281 fi
282 fi
283 fi
284 done
285
286 if ((qlen)); then
287 qmsg="queue length $qlen"
288 chars+=("q $qlen")
289 fi
290 fi
291 case $HOSTNAME in
292 # No point in emailing about the mailq on a host where we don't
293 # check email.
294 $MAIL_HOST)
295 p $qmsg | loday -120 qlen
296 ;;
297 *)
298 rmg /home/iank/cron-errors/qlen*
299 ;;
300 esac
301
302 begin=false
303
304 # todo: make this robust to the case of /a not being mounted
305 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
306 begin=true
307 fi
308
309 end=false
310 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
311 end=true
312 fi
313
314 # these conditions are so we dont have an overly verbose prompt
315 if $begin && $end; then
316 chars+=(D)
317 elif $begin; then
318 chars+=(DB)
319 elif $end; then
320 chars+=(DE)
321 else
322 source /a/bin/ds/script-files
323 f=~/.local/conflink
324 # shellcheck disable=SC2043
325 for _ in 1; do
326 if [[ -e $f ]]; then
327 now=$EPOCHSECONDS
328 fsec=$(stat -c%Y $f)
329 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
330 # dont have any false positives.
331 fmin=$(( (fsec - now + 1 ) / 60 ))
332 fminplus=$(( fmin + 60*24 ))
333 # Filesystem files get copied, so find any newer than the last run.
334 # The rest are hueristics:
335 # Given the last time we added a file in git, is that newer than the last conflink run.
336 # Given new files not added to git, were they modified more recently than the last conflink? but,
337 # push their modification time back by a day so we can develop them before needing to add them to git.
338
339 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
340 # This part is copied from conflink
341 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
342 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
343 done
344
345 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
346 if (( fmin < 0 )) && [[ $(find ${all_my_scripts[@]} ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
347 v conflink newer filesystem files
348 chars+=(CONFLINK)
349 break
350 fi
351
352 for d in /a/bin/distro-setup /p/c; do
353 [[ -d $d ]] || continue
354 cd $d
355 if [[ ! -e .git ]]; then
356 # some hosts i dont push all of /p/c
357 continue
358 fi
359 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
360 v conflink: newer files checked in to git
361 chars+=(CONFLINK)
362 break
363 fi
364
365 untracked=()
366 while read -r l; do
367 untracked+=("$l")
368 done < <(git ls-files -o --exclude-standard)
369 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
370 v conflink: untracked in $d
371 chars+=(CONFLINK)
372 break
373 fi
374 done
375 cd /
376
377 fi
378 if [[ ! -e $f || $(<$f) != 0 ]]; then
379 v conflink: last run not found or failed
380 chars+=(CONFLINK)
381 break
382 fi
383 done
384 fi
385
386 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
387 if [[ -s /var/log/exim4/paniclog ]]; then
388 chars+=("PANIC!")
389 # leave it up to epanic-clean to send email notification
390 fi
391
392 mprom=/var/lib/prometheus/node-exporter/mailtest-check.prom
393 if grep -qE 'mailtest_check_(unexpected|missing).*[^ ][^0]$' $mprom; then
394 chars+=("MTEST_SPAM")
395 fi
396 mtest_found=false
397 for t in $(grep -E ^mailtest_check_last_usec $mprom | awk '{print $NF}'); do
398 if (( t + 60 * 20 < EPOCHSECONDS )); then
399 mtest_found=true
400 fi
401 done
402 if $mtest_found; then
403 chars+=("MTEST_AGE")
404 fi
405
406 if [[ ! -e $status_file || -w $status_file ]]; then
407 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
408 cat /a/bin/bash_unpublished/source-state >$status_file
409 fi
410
411 if [[ ${chars[*]} ]]; then
412 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
413 fi
414 fi
415 }
416
# This prevents me having to mute notifications when I'm going to bed.
#
# Mutes the default sink (via pactl) when the screen is locked and the
# time is before 06:00 or after 21:00. Unmutes it when the screen is
# not locked, the time is between 06:00 and 12:00, and /tmp/ianknap
# does not exist. Prints "muted" / "unmuted" when it changes the state.
mute() {
  local locked lock_info midnight mdiff
  export DISPLAY=:0
  locked=false
  if lock_info=$(xscreensaver-command -time); then
    if [[ $lock_info != *non-blanked* ]]; then
      locked=true
    fi
  else
    # If xscreensaver-command fails, treat the screen as locked.
    locked=true
  fi
  midnight=$(date -d 00:00 +%s)
  # Seconds elapsed since the most recent midnight.
  mdiff=$(( EPOCHSECONDS - midnight ))
  if $locked && (( mdiff < 6 *60*60 || mdiff > 21 *60*60 )); then
    case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
      no)
        # for log purposes
        echo muted
        pactl set-sink-mute @DEFAULT_SINK@ true
        ;;
    esac
  fi
  # Both bounds must hold. With || the test was true at every time of
  # day, so the window had no effect.
  if ! $locked && (( mdiff > 6 *60*60 && mdiff < 12 *60*60 )) && [[ ! -e /tmp/ianknap ]]; then
    case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
      yes)
        # for log purposes
        echo unmuted
        pactl set-sink-mute @DEFAULT_SINK@ false
        ;;
    esac
  fi
}
450
# use this if we want to do something just once per minute
first_chars=()

# Always produce the status once up front.
write-status

# With any non-empty argument: show the resulting status file and stop
# instead of entering the loop.
case $1 in
  '') ;;
  *)
    cat $status_file
    exit 0
    ;;
esac
459
# Number of completed main-loop iterations.
loop_count=0

# Loop forever: sleep, refresh the status file, then run mute.
#
# When /sys/class/power_supply/AC/online reads 0, bitcoinoff is run
# and the sleep is 60 seconds. Otherwise the sleep is 15 seconds and,
# on every 10th iteration, bitcoinon is started in the background if
# the battery capacity is 90-100 (or cannot be read).
main-loop() {
  while :; do
    power=true
    wait=15
    if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
      power=false
    fi

    if ! $power; then
      bitcoinoff
      wait=60
    elif (( loop_count % 10 == 0 )); then
      # Assume a full battery when the capacity file is unreadable.
      bat=100
      if [[ -r /sys/class/power_supply/BAT0/capacity ]]; then
        bat=$(cat /sys/class/power_supply/BAT0/capacity)
      fi
      case $bat in
        100|9?)
          bitcoinon &
          ;;
      esac
    fi

    sleep $wait
    write-status
    mute
    loop_count=$(( loop_count + 1 ))
  done
}
494
495 # ensure our long operations are one line so we are not prone errors
496 # from this file being modified.
497 main-loop