mainly new feature to intentionally delay sending email
[distro-setup] / system-status
#!/bin/bash
# Copyright (C) 2019 Ian Kelling
# SPDX-License-Identifier: AGPL-3.0-or-later

# usage: runs once every 15 seconds. If any args are passed, it instead
# runs just once and has verbose output. On battery power, it runs once
# per minute.

if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

if [[ $EUID != 1000 ]]; then
  # Diagnostics go to stderr, like the bash check above.
  echo "$0: error, expected to be user 1000" >&2
  exit 1
fi

source /a/bin/errhandle/err
status_file=/dev/shm/iank-status

# Globs that match nothing expand to nothing, globs include dotfiles, and
# the !(...) patterns used later in this file are available.
shopt -s nullglob
shopt -s dotglob
shopt -s extglob

for p in ~/.gem/ruby/*/bin; do
  PATH="$PATH:$p"
done


# Any argument at all switches on verbose output (see v below).
verbose=false
if [[ $1 ]]; then
  verbose=true
fi
# Print all arguments on one line, but only when $verbose is true.
v() {
  if $verbose; then
    printf "%s\n" "$*"
  fi
}
# Print all arguments joined by spaces, followed by a newline. With no
# arguments this prints an empty line.
p() { printf "%s\n" "$*"; }
# log-once COUNT NAME [MESSAGE]
#
# Passes its arguments (and stdin) to /usr/local/bin/log-once and, if that
# prints anything, mails the output to root@localhost with NAME ($2) in
# the subject. Does nothing when ifne is not installed.
lo() {
  if type -p ifne &>/dev/null; then
    /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
  fi
}

# Same as lo, but any output from log-once is mailed to
# daylert@iankelling.org instead of root@localhost.
# Does nothing when ifne is not installed.
loday() {
  if type -p ifne &>/dev/null; then
    /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
  fi
}
# rm glob
#
# Removes the given files. Meant to be called with a glob that may have
# expanded to nothing, in which case this is a no-op.
rmg() {
  if (( $# )); then
    # -- so a name starting with a dash is never parsed as an option.
    rm -f -- "$@"
  fi
}

57 # todo, consider migrating some of these alerts into prometheus
58 write-status() {
59 chars=("${first_chars[@]}")
60
61 services=( epanicclean )
62 case $HOSTNAME in
63 bk|je|li) : ;;
64 *)
65 services+=(
66 systemstatus
67 btrfsmaintstop
68 dynamicipupdate
69 )
70 bads=()
71 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
72 for s in ${services[@]}; do
73 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
74 bads+=($s)
75 fi
76 done
77 chars+=(MYSERS)
78 fi
79 p ${bads[*]} | lo -240 mysers
80 ;;
81 esac
82
83 case $HOSTNAME in
84 kd)
85 services=(
86 prometheus-node-exporter
87 prometheus-alertmanager
88 prometheus
89 )
90 bads=()
91 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
92 for s in ${services[@]}; do
93 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
94 bads+=($s)
95 fi
96 done
97 chars+=(PROM)
98 fi
99 p ${bads[*]} | lo -240 prom
100 ;;
101 esac
102
103
104 # this section copied from servicepid()
105 unit=exim4
106 pid=$(systemctl show --property MainPID --value $unit ||:)
107 case $pid in
108 [1-9]*) : ;;
109 *)
110 dir=/sys/fs/cgroup/system.slice
111 if [[ ! -d $dir ]]; then
112 dir=/sys/fs/cgroup/systemd/system.slice
113 fi;
114 pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:)
115 ;;
116 esac
117 if [[ ! $pid ]]; then
118 chars+=(EXIM)
119 fi
120
121
122 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
123 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
124 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
125 fi
126
127
128 ## check if last snapshot was recent
129 old_snap_limit=$(( 3 * 60 * 60 ))
130 for vol in a o q; do
131 case $vol in
132 o) btrbk_root=/mnt/o/btrbk ;;
133 *) btrbk_root=/mnt/root/btrbk ;;
134 esac
135 # this section generally copied from btrbk scripts, but
136 # this part modified to speed things up by about half a second.
137 # I'm not sure if its quite as reliable, but it looks pretty safe.
138 # Profiled it using time and also adding to the top of the file:
139 # set -x
140 # PS4='+ $(date "+%2N") '
141 # allow failure in case there are no snapshots yet.
142 # shellcheck disable=SC2012
143 shopt -s nullglob
144 files=($btrbk_root/$vol.20*)
145 shopt -u nullglob
146 if (( ! ${#files[@]} )); then
147 continue
148 fi
149 snaps=($(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : ))
150 now=$EPOCHSECONDS
151 maxtime=0
152 for s in ${snaps[@]}; do
153 file=${s##*/}
154 t=$(date -d $(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#$vol.}) +%s)
155 if (( t > maxtime )); then
156 maxtime=$t
157 fi
158 done
159 snapshotmsg=
160 last_snap_age=$(( now - maxtime ))
161 last_snap_hours=$(( last_snap_age / 60 / 60 ))
162 if (( last_snap_age > old_snap_limit )); then
163 chars+=(OLD-SNAP-${last_snap_hours}h)
164 snapshotmsg="/$vol snapshot older than 4 hours"
165 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
166 p "$snapshotmsg" | lo -1 old-snapshot
167 fi
168 # not bothering to get info on all volumes if we find an old one.
169 break
170 fi
171 done
172
173
174 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
175
176 bouncemsg=
177 glob=(/m/md/bounces/new/*)
178 if [[ -e ${glob[0]} ]]; then
179 chars+=(BOUNCE)
180 bouncemsg="message in /m/md/bounces/new"
181 fi
182 p $bouncemsg | loday -1 bounce
183 # emails without the S (seen) flag. this only checks the last flag,
184 # but its good enough for me.
185 glob=(/m/md/alerts/{new,cur}/!(*,S))
186 if [[ -e ${glob[0]} ]]; then
187 chars+=(A)
188 fi
189
190 glob=(/m/md/daylert/{new,cur}/!(*,S))
191 if [[ -e ${glob[0]} ]]; then
192 chars+=(DAY)
193 fi
194
195 bbkmsg=
196 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
197 chars+=(BTRBK.TIMER)
198 bbkmsg="not enabled"
199 fi
200 p "$bbkmsg" | lo -480 btrbk.timer
201
202
203
204 # commented out, only using timetrap retrospectively.
205 # # clock us out in timetrap if are idle too long
206 # if [[ -e /p/.timetrap.db ]]; then
207 # export DISPLAY=:0
208 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
209 # if [[ $xidle == [0-9]* ]]; then
210 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
211 # idle=300000
212 # if [[ $sheet == w ]]; then
213 # idle=900000
214 # fi
215 # if [[ $sheet && $xidle -gt $idle ]]; then
216 # timetrap out
217 # fi
218 # fi
219 # fi
220 # fi
221 else # end if $MAIL_HOST
222 rmg /home/iank/cron-errors/bounce* \
223 /home/iank/cron-errors/btrbk.timer* \
224 /home/iank/cron-errors/old-snapshot*
225 fi
226
227 if ip l show tunfsf &>/dev/null; then
228 # this is for tracking dns over tls issue, which
229 # fixvpndns() in brc2 fixes.
230 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
231 read -r _ _ _ istls <<<"$stat"
232 case $istls in
233 no) : ;;
234 *)
235 printf "%s\n" "$istls" | ts >> /tmp/istls.log
236 chars+=("T:$istls")
237 ;;
238 esac
239 fi
240
241 # We do this once every 5 minutes, since this is not a grave problem.
242 # For formatted elisp, see /b/ds/unsaved-buffers.el
243 elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
244 if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then
245 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
246 # i dun care if this fails
247 emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)"
248 if [[ $emacsfiles ]]; then
249 chars+=("$emacsfiles")
250 fi
251 fi
252 last_emacs_check=$EPOCHSECONDS
253 fi
254
255
256 glob=(/nocow/btrfs-stale/*)
257 if [[ -e ${glob[0]} ]]; then
258 chars+=(STALE)
259 fi
260 var_mail_msg=
261 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
262 var_mail_msg="message in /var/mail"
263 fi
264 p $var_mail_msg | loday -1 var_mail
265
266 # Note, early in install process, we dont have permission yet for exiqgrep.
267 #
268 # todo: don't do this every 15 seconds, more like once every 2 minutes to
269 # save cpu cycles.
270 #
271 # 2400 = 40 mins. This should allow for system restarts, and
272 # 30 minute message delay plus 10 minute queu runs.
273 qlen=$(/usr/sbin/exiqgrep -o 2400 -c -b | awk '{print $1}') ||:
274 qmsg=
275 if ((qlen)); then
276 # Do sending of long delayed messages, and dont count them in our queue warnings.
277 for mid in $(exiqgrep -o 2400 -zi); do
278 if exim -Mvh $mid | awk 'tolower($2) == "fdate:"' | grep -q .; then
279 qlen=$(( qlen - 1 ))
280 if (( $(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s) < EPOCHSECONDS )); then
281 if ip a show veth0-mail &>/dev/null; then
282 pid=$(pgrep -f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/my.conf"|head -n1);
283 nsenter -t $pid -n -m /usr/sbin/exim4 -C /etc/exim4/my.conf -M $mid
284 else
285 /usr/sbin/exim4 -M $mid
286 fi
287 fi
288 fi
289 done
290
291 if ((qlen)); then
292 qmsg="queue length $qlen"
293 chars+=("q $qlen")
294 fi
295 fi
296 case $HOSTNAME in
297 # No point in emailing about the mailq on a host where we don't
298 # check email.
299 $MAIL_HOST)
300 p $qmsg | loday -120 qlen
301 ;;
302 *)
303 rmg /home/iank/cron-errors/qlen*
304 ;;
305 esac
306
307 begin=false
308
309 # todo: make this robust to the case of /a not being mounted
310 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
311 begin=true
312 fi
313
314 end=false
315 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
316 end=true
317 fi
318
319 # these conditions are so we dont have an overly verbose prompt
320 if $begin && $end; then
321 chars+=(D)
322 elif $begin; then
323 chars+=(DB)
324 elif $end; then
325 chars+=(DE)
326 else
327 source /a/bin/ds/script-files
328 f=~/.local/conflink
329 # shellcheck disable=SC2043
330 for _ in 1; do
331 if [[ -e $f ]]; then
332 now=$EPOCHSECONDS
333 fsec=$(stat -c%Y $f)
334 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
335 # dont have any false positives.
336 fmin=$(( (fsec - now + 1 ) / 60 ))
337 fminplus=$(( fmin + 60*24 ))
338 # Filesystem files get copied, so find any newer than the last run.
339 # The rest are hueristics:
340 # Given the last time we added a file in git, is that newer than the last conflink run.
341 # Given new files not added to git, were they modified more recently than the last conflink? but,
342 # push their modification time back by a day so we can develop them before needing to add them to git.
343
344 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
345 # This part is copied from conflink
346 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
347 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
348 done
349
350 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
351 if (( fmin < 0 )) && [[ $(find ${all_my_scripts[@]} ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
352 v conflink newer filesystem files
353 chars+=(CONFLINK)
354 break
355 fi
356
357 for d in /a/bin/distro-setup /p/c; do
358 [[ -d $d ]] || continue
359 cd $d
360 if [[ ! -e .git ]]; then
361 # some hosts i dont push all of /p/c
362 continue
363 fi
364 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
365 v conflink: newer files checked in to git
366 chars+=(CONFLINK)
367 break
368 fi
369
370 untracked=()
371 while read -r l; do
372 untracked+=("$l")
373 done < <(git ls-files -o --exclude-standard)
374 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
375 v conflink: untracked in $d
376 chars+=(CONFLINK)
377 break
378 fi
379 done
380 cd /
381
382 fi
383 if [[ ! -e $f || $(<$f) != 0 ]]; then
384 v conflink: last run not found or failed
385 chars+=(CONFLINK)
386 break
387 fi
388 done
389 fi
390
391 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
392 if [[ -s /var/log/exim4/paniclog ]]; then
393 chars+=("PANIC!")
394 # leave it up to epanic-clean to send email notification
395 fi
396
397 if [[ ! -e $status_file || -w $status_file ]]; then
398 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
399 cat /a/bin/bash_unpublished/source-state >$status_file
400 fi
401
402 if [[ ${chars[*]} ]]; then
403 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
404 fi
405 fi
406 }
407
# This prevents me having to mute notifications when I'm going to bed.
#
# Mutes the default sink when the screen is locked late at night (before
# 06:00 or after 21:00) and unmutes it when the screen is not locked and
# /tmp/ianknap does not exist. The screen counts as locked when
# xscreensaver-command fails or does not report "non-blanked".
mute() {
  local locked lock_info midnight mdiff
  local -r hour=3600
  export DISPLAY=:0
  locked=false
  if lock_info=$(xscreensaver-command -time); then
    if [[ $lock_info != *non-blanked* ]]; then
      locked=true
    fi
  else
    locked=true
  fi
  midnight=$(date -d 00:00 +%s)
  # Seconds since local midnight, so the limits below are scaled by
  # $hour rather than compared against bare hour numbers.
  mdiff=$(( EPOCHSECONDS - midnight ))
  if $locked && (( mdiff < 6 * hour || mdiff > 21 * hour )); then
    case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
      no)
        # for log purposes
        echo unmuted
        pactl set-sink-mute @DEFAULT_SINK@ true
        ;;
    esac
  fi
  # NOTE(review): "> 6h || < 12h" is true at every time of day, so this
  # unmutes whenever the screen is unlocked. If only a 06:00-12:00
  # window was intended, the || should be && -- confirm.
  if ! $locked && (( mdiff > 6 * hour || mdiff < 12 * hour )) && [[ ! -e /tmp/ianknap ]]; then
    case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
      yes)
        # for log purposes
        echo muted
        pactl set-sink-mute @DEFAULT_SINK@ false
        ;;
    esac
  fi
}

# use this if we want to do something just once per minute
first_chars=()

# Always do one pass. When any argument was given, show the resulting
# status file and stop instead of entering the loop.
write-status
if [[ $1 ]]; then
  cat "$status_file"
  exit 0
fi

# Runs write-status and mute forever: every 15 seconds normally, every
# 60 seconds when /sys/class/power_supply/AC/online reads 0. While off
# AC power it also runs bitcoinoff if bitcoind is active.
main-loop() {
  local power interval
  while true; do
    power=true
    if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
      power=false
    fi
    interval=15
    if ! $power; then
      if systemctl -q is-active bitcoind; then
        bitcoinoff
      fi
      interval=60
    fi

    sleep "$interval"
    write-status
    mute
  done
}

# Ensure our long-running operation is a single one-line command, so we
# are not prone to errors from this file being modified while it runs.
main-loop