finally fully use gnu license recommendations
[distro-setup] / system-status
1 #!/bin/bash
2
3 # Basic system status on Ian's computers
4 # Copyright (C) 2024 Ian Kelling
5
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18
19 # SPDX-License-Identifier: GPL-3.0-or-later
20
21 # usage: runs once every 15 seconds unless any args are passed, in
22 # which case it just runs once and has verbose output. On battery
23 # power, run once per minute.
24
# Refuse to run under anything but bash. This uses the POSIX `[` test on
# purpose so the check itself still works when a non-bash shell runs us.
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

# Only uid 1000 is expected to run this script.
if [[ $EUID != 1000 ]]; then
  # Diagnostics go to stderr, same as the bash check above.
  echo "$0: error, expected to be user 1000" >&2
  exit 1
fi

source /a/bin/bash-bear-trap/bash-bear
# File that write-status (re)generates on every pass.
status_file=/dev/shm/iank-status

shopt -s nullglob
shopt -s dotglob
shopt -s extglob

# Append every per-version ruby gem bin directory to PATH. With nullglob
# set above, the loop body does not run when there are none.
for p in ~/.gem/ruby/*/bin; do
  PATH="$PATH:$p"
done


# Any non-empty first argument turns on verbose output (see v()).
verbose=false
if [[ $1 ]]; then
  verbose=true
fi
# v [WORD...]
# Print the arguments joined by spaces, followed by a newline, but only
# when the global $verbose is "true". Returns 0 either way.
v() {
  if $verbose; then
    printf '%s\n' "$*"
  fi
}
# p [WORD...]
# Print the arguments joined by spaces, followed by a newline. With no
# arguments this prints a single empty line.
p() {
  printf '%s\n' "$*"
}
# log-once COUNT NAME [MESSAGE]
#
# Run /usr/local/bin/log-once with our arguments (callers in this file
# pipe the message in on stdin) and, if it prints anything, mail that
# output to root@localhost with NAME ($2) in the subject.
#
# Does nothing, and returns 0, when ifne is not found in PATH.
lo() {
  if type -p ifne &>/dev/null; then
    /usr/local/bin/log-once "$@" \
      | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
  fi
}
60
# loday COUNT NAME [MESSAGE]
#
# Same as lo(), but any output from /usr/local/bin/log-once is mailed to
# daylert@iankelling.org instead of root@localhost.
#
# Does nothing, and returns 0, when ifne is not found in PATH.
loday() {
  if type -p ifne &>/dev/null; then
    /usr/local/bin/log-once "$@" \
      | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
  fi
}
# rm glob
#
# rmg [FILE...]
# Remove the given files with rm -f. Intended to be called with glob
# patterns: when the patterns expand to nothing (nullglob) there are no
# arguments and rm is not run at all.
rmg() {
  if (( $# )); then
    # "--" so that a name beginning with "-" is never taken as an option.
    rm -f -- "$@"
  fi
}
72
73 # todo, consider migrating some of these alerts into prometheus
74 write-status() {
75 chars=("${first_chars[@]}")
76
77 services=( epanicclean )
78 case $HOSTNAME in
79 bk|je|li) : ;;
80 *)
81 services+=(
82 systemstatus
83 btrfsmaintstop
84 dynamicipupdate
85 )
86 bads=()
87 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
88 for s in ${services[@]}; do
89 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
90 bads+=($s)
91 fi
92 done
93 chars+=(MYSERS)
94 fi
95 p ${bads[*]} | lo -240 mysers
96 ;;
97 esac
98
99 case $HOSTNAME in
100 kd)
101 services=(
102 prometheus-node-exporter
103 prometheus-alertmanager
104 prometheus
105 )
106 bads=()
107 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
108 for s in ${services[@]}; do
109 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
110 bads+=($s)
111 fi
112 done
113 chars+=(PROM)
114 fi
115 p ${bads[*]} | lo -240 prom
116 ;;
117 esac
118
119
120 # this section copied from servicepid()
121 unit=exim4
122 pid=$(systemctl show --property MainPID --value $unit ||:)
123 case $pid in
124 [1-9]*) : ;;
125 *)
126 dir=/sys/fs/cgroup/system.slice
127 if [[ ! -d $dir ]]; then
128 dir=/sys/fs/cgroup/systemd/system.slice
129 fi;
130 pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:)
131 ;;
132 esac
133 if [[ ! $pid ]]; then
134 chars+=(EXIM)
135 fi
136
137
138 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
139 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
140 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
141 fi
142
143
144 ## check if last snapshot was recent
145 old_snap_limit=$(( 3 * 60 * 60 ))
146 vol=o
147 btrbk_root=/mnt/o/btrbk
148 # this section generally copied from btrbk scripts, but
149 # this part modified to speed things up by about half a second.
150 # I'm not sure if its quite as reliable, but it looks pretty safe.
151 # Profiled it using time and also adding to the top of the file:
152 # set -x
153 # PS4='+ $(date "+%2N") '
154 # allow failure in case there are no snapshots yet.
155 shopt -s nullglob
156 files=($btrbk_root/$vol.20*)
157 shopt -u nullglob
158 if (( ${#files[@]} )); then
159 # shellcheck disable=SC2012 # using ls version sort. not sure this is needed.
160 snaps=("$(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )")
161 now=$EPOCHSECONDS
162 maxtime=0
163 for s in ${snaps[@]}; do
164 file=${s##*/}
165 t=$(date -d "$(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#"$vol."})" +%s)
166 if (( t > maxtime )); then
167 maxtime=$t
168 fi
169 done
170 snapshotmsg=
171 last_snap_age=$(( now - maxtime ))
172 last_snap_hours=$(( last_snap_age / 60 / 60 ))
173 if (( last_snap_age > old_snap_limit )); then
174 chars+=(OLD-SNAP-${last_snap_hours}h)
175 snapshotmsg="/$vol snapshot older than 4 hours"
176 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
177 p "$snapshotmsg" | lo -1 old-snapshot
178 fi
179 # not bothering to get info on all volumes if we find an old one.
180 fi
181 fi
182
183
184 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
185
186 bouncemsg=
187 glob=(/m/md/bounces/new/*)
188 if [[ -e ${glob[0]} ]]; then
189 chars+=(BOUNCE)
190 bouncemsg="message in /m/md/bounces/new"
191 fi
192 p $bouncemsg | loday -1 bounce
193 # emails without the S (seen) flag. this only checks the last flag,
194 # but its good enough for me.
195 glob=(/m/md/alerts/{new,cur}/!(*,S))
196 if [[ -e ${glob[0]} ]]; then
197 chars+=(A)
198 fi
199
200 glob=(/m/md/daylert/{new,cur}/!(*,S))
201 if [[ -e ${glob[0]} ]]; then
202 chars+=(DAY)
203 fi
204
205 bbkmsg=
206 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
207 chars+=(BTRBK.TIMER)
208 bbkmsg="not enabled"
209 fi
210 p "$bbkmsg" | lo -480 btrbk.timer
211
212
213
214 # commented out, only using timetrap retrospectively.
215 # # clock us out in timetrap if are idle too long
216 # if [[ -e /p/.timetrap.db ]]; then
217 # export DISPLAY=:0
218 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
219 # if [[ $xidle == [0-9]* ]]; then
220 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
221 # idle=300000
222 # if [[ $sheet == w ]]; then
223 # idle=900000
224 # fi
225 # if [[ $sheet && $xidle -gt $idle ]]; then
226 # timetrap out
227 # fi
228 # fi
229 # fi
230 # fi
231 else # end if $MAIL_HOST
232 rmg /home/iank/cron-errors/bounce* \
233 /home/iank/cron-errors/btrbk.timer* \
234 /home/iank/cron-errors/old-snapshot*
235 fi
236
237 if ip l show tunfsf &>/dev/null; then
238 # this is for tracking dns over tls issue, which
239 # fixvpndns() in brc2 fixes.
240 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
241 read -r _ _ _ istls <<<"$stat"
242 case $istls in
243 no) : ;;
244 *)
245 printf "%s\n" "$istls" | ts >> /tmp/istls.log
246 chars+=("T:$istls")
247 ;;
248 esac
249 fi
250
251 # We do this once every 5 minutes, since this is not a grave problem.
252 # For formatted elisp, see /b/ds/unsaved-buffers.el
253 elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
254 if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then
255 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
256 # i dun care if this fails
257 emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)"
258 if [[ $emacsfiles ]]; then
259 chars+=("$emacsfiles")
260 fi
261 fi
262 last_emacs_check=$EPOCHSECONDS
263 fi
264
265
266 glob=(/nocow/btrfs-stale/*)
267 if [[ -e ${glob[0]} ]]; then
268 chars+=(STALE)
269 fi
270 var_mail_msg=
271 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
272 var_mail_msg="message in /var/mail"
273 fi
274 p $var_mail_msg | loday -1 var_mail
275
276 # Note, early in install process, we dont have permission yet for exiqgrep.
277 #
278 # todo: don't do this every 15 seconds, more like once every 2 minutes to
279 # save cpu cycles.
280 #
281 # 2400 = 40 mins. This should allow for system restarts, and
282 # 30 minute message delay plus 10 minute queu runs.
283 qlen=$(/usr/sbin/exiqgrep -o 2400 -c -b | awk '{print $1}') ||:
284 qmsg=
285 if ((qlen)); then
286 # Do sending of long delayed messages, and dont count them in our queue warnings.
287 for mid in $(exiqgrep -o 2400 -zi); do
288 if exim -Mvh $mid | awk 'tolower($2) == "fdate:"' | grep -q .; then
289 qlen=$(( qlen - 1 ))
290 # shellcheck disable=SC2016 # exim var, not a bash bar
291 if (( $(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s) < EPOCHSECONDS )); then
292 if ip a show veth0-mail &>/dev/null; then
293 pid=$(pgrep -f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/my.conf"|head -n1);
294 nsenter -t $pid -n -m /usr/sbin/exim4 -C /etc/exim4/my.conf -M $mid
295 else
296 /usr/sbin/exim4 -M $mid
297 fi
298 fi
299 fi
300 done
301
302 if ((qlen)); then
303 qmsg="queue length $qlen"
304 chars+=("q $qlen")
305 fi
306 fi
307 case $HOSTNAME in
308 # No point in emailing about the mailq on a host where we don't
309 # check email.
310 $MAIL_HOST)
311 p $qmsg | loday -120 qlen
312 ;;
313 *)
314 rmg /home/iank/cron-errors/qlen*
315 ;;
316 esac
317
318 begin=false
319
320 # todo: make this robust to the case of /a not being mounted
321 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
322 begin=true
323 fi
324
325 end=false
326 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
327 end=true
328 fi
329
330 # these conditions are so we dont have an overly verbose prompt
331 if $begin && $end; then
332 chars+=(D)
333 elif $begin; then
334 chars+=(DB)
335 elif $end; then
336 chars+=(DE)
337 else
338 source /a/bin/ds/script-files
339 f=~/.local/conflink
340 # shellcheck disable=SC2043
341 for _ in 1; do
342 if [[ -e $f ]]; then
343 now=$EPOCHSECONDS
344 fsec=$(stat -c%Y $f)
345 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
346 # dont have any false positives.
347 fmin=$(( (fsec - now + 1 ) / 60 ))
348 fminplus=$(( fmin + 60*24 ))
349 # Filesystem files get copied, so find any newer than the last run.
350 # The rest are hueristics:
351 # Given the last time we added a file in git, is that newer than the last conflink run.
352 # Given new files not added to git, were they modified more recently than the last conflink? but,
353 # push their modification time back by a day so we can develop them before needing to add them to git.
354
355 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
356 # This part is copied from conflink
357 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
358 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
359 done
360
361 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
362 if (( fmin < 0 )) && [[ $(find ${all_my_scripts[@]} ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
363 v conflink newer filesystem files
364 chars+=(CONFLINK)
365 break
366 fi
367
368 for d in /a/bin/distro-setup /p/c; do
369 [[ -d $d ]] || continue
370 cd $d
371 if [[ ! -e .git ]]; then
372 # some hosts i dont push all of /p/c
373 continue
374 fi
375 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
376 v conflink: newer files checked in to git
377 chars+=(CONFLINK)
378 break
379 fi
380
381 untracked=()
382 while read -r l; do
383 untracked+=("$l")
384 done < <(git ls-files -o --exclude-standard)
385 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
386 v conflink: untracked in $d
387 chars+=(CONFLINK)
388 break
389 fi
390 done
391 cd /
392
393 fi
394 if [[ ! -e $f || $(<$f) != 0 ]]; then
395 v conflink: last run not found or failed
396 chars+=(CONFLINK)
397 break
398 fi
399 done
400 fi
401
402 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
403 if [[ -s /var/log/exim4/paniclog ]]; then
404 chars+=("PANIC!")
405 # leave it up to epanic-clean to send email notification
406 fi
407
408 mprom=/var/lib/prometheus/node-exporter/mailtest-check.prom
409 if [[ -s $mprom ]]; then
410 if grep -qE 'mailtest_check_(unexpected|missing).*[^ ][^0]$' $mprom; then
411 chars+=("MTEST_SPAM")
412 fi
413 mtest_found=false
414 # shellcheck disable=SC2013 # these are words
415 for t in $(grep -E ^mailtest_check_last_usec $mprom | awk '{print $NF}'); do
416 if (( t + 60 * 20 < EPOCHSECONDS )); then
417 mtest_found=true
418 fi
419 done
420 if $mtest_found; then
421 chars+=("MTEST_AGE")
422 fi
423 fi
424
425 if [[ ! -e $status_file || -w $status_file ]]; then
426 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
427 cat /a/bin/bash_unpublished/source-state >$status_file
428 fi
429
430 if [[ ${chars[*]} ]]; then
431 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
432 fi
433 fi
434 }
435
# This prevents me having to mute notifications when I'm going to bed.
#
# mute
# Mute or unmute the default audio sink based on whether the screen is
# locked and on the time of day:
#  - screen locked (or xscreensaver-command fails) and the time is
#    before 06:00 or after 21:00: mute, if not muted already.
#  - screen not locked and /tmp/ianknap does not exist: unmute, if
#    currently muted.
# Prints "muted" / "unmuted" when it changes the sink state.
# Side effect: exports DISPLAY=:0.
mute() {
  local locked lock_info midnight mdiff
  export DISPLAY=:0
  locked=false
  if lock_info=$(xscreensaver-command -time); then
    # Anything other than "non-blanked" counts as locked.
    if [[ $lock_info != *non-blanked* ]]; then
      locked=true
    fi
  else
    # If we cannot ask xscreensaver, treat the screen as locked.
    locked=true
  fi
  midnight=$(date -d 00:00 +%s)
  # Seconds elapsed since today's midnight.
  mdiff=$(( EPOCHSECONDS - midnight ))
  if $locked && (( mdiff < 6*60*60 || mdiff > 21*60*60 )); then
    case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
      no)
        # for log purposes
        echo muted
        pactl set-sink-mute @DEFAULT_SINK@ true
        ;;
    esac
  fi
  # NOTE(review): "mdiff > 6h || mdiff < 12h" holds for every value of
  # mdiff, so this time test never restricts anything. If a 06:00-12:00
  # window was intended it should be "&&". Behavior kept as is; confirm
  # the intent before changing.
  if ! $locked && (( mdiff > 6*60*60 || mdiff < 12*60*60 )) && [[ ! -e /tmp/ianknap ]]; then
    case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
      yes)
        # for log purposes
        echo unmuted
        pactl set-sink-mute @DEFAULT_SINK@ false
        ;;
    esac
  fi
}
469
# use this if we want to do something just once per minute
first_chars=()

# Always do one pass up front. When any argument was given, show the
# resulting status file and stop instead of entering the loop.
write-status
if [[ $1 ]]; then
  cat "$status_file"
  exit 0
fi
478
# Number of completed main-loop iterations.
loop_count=0

# main-loop
#
# Loop forever: sleep, then run write-status and mute.
#  - On AC power (or when there is no AC power_supply entry) the sleep
#    is 15 seconds, and on every 10th iteration bitcoinon is started in
#    the background if the battery reads 90-100% (or there is no
#    readable BAT0 capacity file).
#  - On battery, bitcoinoff is run every iteration and the sleep is 60
#    seconds.
# bitcoinon and bitcoinoff are not defined in this file.
main-loop() {
  local power wait bat
  while true; do
    power=true
    if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
      power=false
    fi
    wait=15

    if $power; then
      if (( loop_count % 10 == 0 )); then
        if [[ -r /sys/class/power_supply/BAT0/capacity ]]; then
          bat=$(</sys/class/power_supply/BAT0/capacity)
        else
          bat=100
        fi
        case $bat in
          100|9?)
            bitcoinon &
            ;;
        esac
      fi
    else
      bitcoinoff
      wait=60
    fi

    sleep $wait
    write-status
    mute
    loop_count=$(( loop_count + 1 ))
  done
}
513
# ensure our long operations are one line so we are not prone to errors
# from this file being modified.
main-loop