745e33515fa5c0cc47996b7bc0a7b2d402ebab67
[distro-setup] / system-status
1 #!/bin/bash
2
3 # Basic system status on Ian's computers
4 # Copyright (C) 2024 Ian Kelling
5
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18
19 # SPDX-License-Identifier: GPL-3.0-or-later
20
21 # usage: with no args, runs once every 15 seconds (once per minute on
22 # battery power). If any args are passed, runs just once with verbose
23 # output, prints the status file and exits.
24
# Startup checks and global settings.

# POSIX test on purpose: this line must work even if we are not running bash.
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

if [[ $EUID != 1000 ]]; then
  # Diagnostics go to stderr, same as the bash check above.
  echo "$0: error, expected to be user 1000" >&2
  exit 1
fi

source /a/bin/bash-bear-trap/bash-bear
status_file=/dev/shm/iank-status

shopt -s nullglob
shopt -s dotglob
shopt -s extglob

# nullglob (set above) makes this a no-op when no gem bin directory exists.
for p in ~/.gem/ruby/*/bin; do
  PATH="$PATH:$p"
done


# Any argument at all switches to verbose mode.
verbose=false
if [[ $1 ]]; then
  verbose=true
fi
# v [WORD...]
# Print the arguments joined by spaces on one line, but only when the
# global $verbose is "true". Always returns 0.
v() {
  if $verbose; then
    printf '%s\n' "$*"
  fi
}
# p [WORD...]
# Print the arguments joined by spaces, followed by a newline. With no
# arguments this prints an empty line.
p() { printf '%s\n' "$*"; }
# _lo_mail RECIPIENT COUNT NAME [MESSAGE]
# Shared implementation of lo and loday: pipe our stdin through
# /usr/local/bin/log-once (called with COUNT NAME [MESSAGE]) and, only if
# that produces output, mail it to RECIPIENT with NAME in the subject.
# Does nothing (and does not run log-once) when ifne is not installed.
_lo_mail() {
  local rcpt=$1
  shift
  if type -p ifne &>/dev/null; then
    /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status ${2-}" "$rcpt"
  fi
}

# log-once COUNT NAME [MESSAGE]
# Alert root@localhost.
lo() {
  _lo_mail root@localhost "$@"
}

# Same as lo, but alert daylert@iankelling.org.
loday() {
  _lo_mail daylert@iankelling.org "$@"
}
# rm glob
# rmg [FILE...]
# Remove the given files. Meant to be called with globs under nullglob:
# when nothing matched there are no arguments and rm is not run at all.
rmg() {
  if (( $# )); then
    # -- so that a matched name beginning with "-" is not taken as an option.
    rm -f -- "$@"
  fi
}
72
73 # todo, consider migrating some of these alerts into prometheus
74 write-status() {
75 chars=("${first_chars[@]}")
76
77 services=( epanicclean )
78 case $HOSTNAME in
79 bk|je|li) : ;;
80 *)
81 services+=(
82 systemstatus
83 btrfsmaintstop
84 dynamicipupdate
85 )
86 bads=()
87 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
88 for s in ${services[@]}; do
89 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
90 bads+=($s)
91 fi
92 done
93 chars+=(MYSERS)
94 fi
95 p ${bads[*]} | lo -240 mysers
96 ;;
97 esac
98
99 case $HOSTNAME in
100 kd)
101 services=(
102 prometheus-node-exporter
103 prometheus-alertmanager
104 prometheus
105 )
106 bads=()
107 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
108 for s in ${services[@]}; do
109 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
110 bads+=($s)
111 fi
112 done
113 chars+=(PROM)
114 fi
115 p ${bads[*]} | lo -240 prom
116 ;;
117 esac
118
119
120 # this section copied from servicepid()
121 unit=exim4
122 pid=$(systemctl show --property MainPID --value $unit ||:)
123 case $pid in
124 [1-9]*) : ;;
125 *)
126 dir=/sys/fs/cgroup/system.slice
127 if [[ ! -d $dir ]]; then
128 dir=/sys/fs/cgroup/systemd/system.slice
129 fi;
130 pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:)
131 ;;
132 esac
133 if [[ ! $pid ]]; then
134 chars+=(EXIM)
135 fi
136
137
138 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
139 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
140 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
141 fi
142
143
144 ## check if last snapshot was recent
145 old_snap_limit=$(( 3 * 60 * 60 ))
146 vol=o
147 btrbk_root=/mnt/o/btrbk
148 # this section generally copied from btrbk scripts, but
149 # this part modified to speed things up by about half a second.
150 # I'm not sure if its quite as reliable, but it looks pretty safe.
151 # Profiled it using time and also adding to the top of the file:
152 # set -x
153 # PS4='+ $(date "+%2N") '
154 # allow failure in case there are no snapshots yet.
155 shopt -s nullglob
156 files=($btrbk_root/$vol.20*)
157 shopt -u nullglob
158 if (( ${#files[@]} )); then
159 # shellcheck disable=SC2012 # using ls version sort. not sure this is needed.
160 snaps=("$(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )")
161 now=$EPOCHSECONDS
162 maxtime=0
163 for s in ${snaps[@]}; do
164 file=${s##*/}
165 t=$(date -d "$(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#"$vol."})" +%s)
166 if (( t > maxtime )); then
167 maxtime=$t
168 fi
169 done
170 snapshotmsg=
171 last_snap_age=$(( now - maxtime ))
172 last_snap_hours=$(( last_snap_age / 60 / 60 ))
173 if (( last_snap_age > old_snap_limit )); then
174 chars+=(OLD-SNAP-${last_snap_hours}h)
175 snapshotmsg="/$vol snapshot older than 4 hours"
176 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
177 p "$snapshotmsg" | lo -1 old-snapshot
178 fi
179 # not bothering to get info on all volumes if we find an old one.
180 fi
181 fi
182
183
184 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
185
186 bouncemsg=
187 glob=(/m/md/bounces/new/*)
188 if [[ -e ${glob[0]} ]]; then
189 chars+=(BOUNCE)
190 bouncemsg="message in /m/md/bounces/new"
191 fi
192 p $bouncemsg | loday -1 bounce
193 # emails without the S (seen) flag. this only checks the last flag,
194 # but its good enough for me.
195 glob=(/m/md/alerts/{new,cur}/!(*,S))
196 if [[ -e ${glob[0]} ]]; then
197 chars+=(A)
198 fi
199
200 glob=(/m/md/daylert/{new,cur}/!(*,S))
201 if [[ -e ${glob[0]} ]]; then
202 chars+=(DAY)
203 fi
204
205 bbkmsg=
206 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
207 chars+=(BTRBK.TIMER)
208 bbkmsg="not enabled"
209 fi
210 p "$bbkmsg" | lo -480 btrbk.timer
211
212
213
214 # commented out, only using timetrap retrospectively.
215 # # clock us out in timetrap if are idle too long
216 # if [[ -e /p/.timetrap.db ]]; then
217 # export DISPLAY=:0
218 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
219 # if [[ $xidle == [0-9]* ]]; then
220 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
221 # idle=300000
222 # if [[ $sheet == w ]]; then
223 # idle=900000
224 # fi
225 # if [[ $sheet && $xidle -gt $idle ]]; then
226 # timetrap out
227 # fi
228 # fi
229 # fi
230 # fi
231 else # end if $MAIL_HOST
232 rmg /home/iank/cron-errors/bounce* \
233 /home/iank/cron-errors/btrbk.timer* \
234 /home/iank/cron-errors/old-snapshot*
235 fi
236
237 if ip l show tunfsf &>/dev/null; then
238 # this is for tracking dns over tls issue, which
239 # fixvpndns() in brc2 fixes.
240 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
241 read -r _ _ _ istls <<<"$stat"
242 case $istls in
243 no) : ;;
244 *)
245 printf "%s\n" "$istls" | ts >> /tmp/istls.log
246 chars+=("T:$istls")
247 ;;
248 esac
249 fi
250
251 # We do this once every 5 minutes, since this is not a grave problem.
252 # For formatted elisp, see /b/ds/unsaved-buffers.el
253 elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
254 if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then
255 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
256 # i dun care if this fails
257 emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)"
258 if [[ $emacsfiles ]]; then
259 chars+=("$emacsfiles")
260 fi
261 fi
262 last_emacs_check=$EPOCHSECONDS
263 fi
264
265
266 glob=(/nocow/btrfs-stale/*)
267 if [[ -e ${glob[0]} ]]; then
268 chars+=(STALE)
269 fi
270 var_mail_msg=
271 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
272 var_mail_msg="message in /var/mail"
273 fi
274 p $var_mail_msg | loday -1 var_mail
275
276 # Note, early in install process, we dont have permission yet for exiqgrep.
277 #
278 # todo: don't do this every 15 seconds, more like once every 2 minutes to
279 # save cpu cycles.
280 #
281 # 2400 = 40 mins. This should allow for system restarts, and
282 # 30 minute message delay plus 10 minute queu runs.
283 qlen=$(/usr/sbin/exiqgrep -o 2400 -c -b | awk '{print $1}') ||:
284 qmsg=
285 if ((qlen)); then
286 # Do sending of long delayed messages, and dont count them in our queue warnings.
287 for mid in $(exiqgrep -o 2400 -zi); do
288 if exim -Mvh $mid | awk 'tolower($2) == "fdate:"' | grep -q .; then
289 qlen=$(( qlen - 1 ))
290 # shellcheck disable=SC2016 # exim var, not a bash bar
291 if (( $(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s) < EPOCHSECONDS )); then
292 if ip a show veth0-mail &>/dev/null; then
293 pid=$(pgrep -f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/my.conf"|head -n1);
294 nsenter -t $pid -n -m /usr/sbin/exim4 -C /etc/exim4/my.conf -M $mid
295 else
296 /usr/sbin/exim4 -M $mid
297 fi
298 fi
299 fi
300 done
301
302 if ((qlen)); then
303 qmsg="queue length $qlen"
304 chars+=("q $qlen")
305 fi
306 fi
307 case $HOSTNAME in
308 # No point in emailing about the mailq on a host where we don't
309 # check email.
310 $MAIL_HOST)
311 p $qmsg | loday -120 qlen
312
313
314 f=/var/spool/exim4/gw/no-delay-eximids
315 if (( loop_count % 10 == 0 )) && \
316 [[ -s $f ]] && [[ $(cat $f) == all ]]; then
317 # I've left this on longer than I intended, so just auto-delete
318 # it after some time.
319 find $f -mmin +180 -delete
320 if [[ -s $f ]]; then
321 chars+=("NO_DELAY")
322 fi
323 fi
324
325
326 ;;
327 *)
328 rmg /home/iank/cron-errors/qlen*
329 ;;
330 esac
331
332 begin=false
333
334 # todo: make this robust to the case of /a not being mounted
335 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
336 begin=true
337 fi
338
339 end=false
340 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
341 end=true
342 fi
343
344 # these conditions are so we dont have an overly verbose prompt
345 if $begin && $end; then
346 chars+=(D)
347 elif $begin; then
348 chars+=(DB)
349 elif $end; then
350 chars+=(DE)
351 else
352 source /a/bin/ds/script-files
353 f=~/.local/conflink
354 # shellcheck disable=SC2043
355 for _ in 1; do
356 if [[ -e $f ]]; then
357 now=$EPOCHSECONDS
358 fsec=$(stat -c%Y $f)
359 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
360 # dont have any false positives.
361 fmin=$(( (fsec - now + 1 ) / 60 ))
362 fminplus=$(( fmin + 60*24 ))
363 # Filesystem files get copied, so find any newer than the last run.
364 # The rest are hueristics:
365 # Given the last time we added a file in git, is that newer than the last conflink run.
366 # Given new files not added to git, were they modified more recently than the last conflink? but,
367 # push their modification time back by a day so we can develop them before needing to add them to git.
368
369 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
370 # This part is copied from conflink
371 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
372 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
373 done
374
375 script_files=("${my_service_scripts[@]}" "${my_bin_files[@]}" $my_lib_files)
376
377 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
378 if (( fmin < 0 )) && [[ $(find "${script_files[@]}" ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
379 v conflink newer filesystem files
380 chars+=(CONFLINK)
381 break
382 fi
383
384 for d in /a/bin/distro-setup /p/c; do
385 [[ -d $d ]] || continue
386 cd $d
387 if [[ ! -e .git ]]; then
388 # some hosts i dont push all of /p/c
389 continue
390 fi
391 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
392 v conflink: newer files checked in to git
393 chars+=(CONFLINK)
394 break
395 fi
396
397 untracked=()
398 while read -r l; do
399 untracked+=("$l")
400 done < <(git ls-files -o --exclude-standard)
401 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
402 v conflink: untracked in $d
403 chars+=(CONFLINK)
404 break
405 fi
406 done
407 cd /
408
409 fi
410 if [[ ! -e $f || $(<$f) != 0 ]]; then
411 v conflink: last run not found or failed
412 chars+=(CONFLINK)
413 break
414 fi
415 done
416 fi
417
418 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
419 if [[ -s /var/log/exim4/paniclog ]]; then
420 chars+=("PANIC!")
421 # leave it up to epanic-clean to send email notification
422 fi
423
424 mprom=/var/lib/prometheus/node-exporter/mailtest-check.prom
425 if [[ -s $mprom ]]; then
426 if grep -qE 'mailtest_check_(unexpected|missing).*[^ ][^0]$' $mprom; then
427 chars+=("MTEST_SPAM")
428 fi
429 mtest_found=false
430 # shellcheck disable=SC2013 # these are words
431 for t in $(grep -E ^mailtest_check_last_usec $mprom | awk '{print $NF}'); do
432 if (( t + 60 * 20 < EPOCHSECONDS )); then
433 mtest_found=true
434 fi
435 done
436 if $mtest_found; then
437 chars+=("MTEST_AGE")
438 fi
439 fi
440
441 if [[ ! -e $status_file || -w $status_file ]]; then
442 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
443 cat /a/bin/bash_unpublished/source-state >$status_file
444 fi
445
446 if [[ ${chars[*]} ]]; then
447 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
448 fi
449 fi
450
451 if [[ -e $HOME/.iank-stream-on ]] && ! pgrep -fc '^ffmpeg.*icecast://source.*/fsf-sysops' >/dev/null; then
452 rm -f $HOME/.iank-stream-on
453 fi
454
455 } # end write-status
456
# This prevents me having to mute notifications when I'm going to bed.
#
# mute
# Uses xscreensaver-command -time on display :0 to decide whether the
# screen is locked (anything other than "non-blanked"), then:
#  - locked, and before 06:00 or after 21:00: mute the default sink if
#    it is not already muted.
#  - not locked, between 06:00 and 12:00, and /tmp/ianknap does not
#    exist: unmute the default sink if it is muted.
# Prints "muted" / "unmuted" when it changes something. Does nothing if
# xscreensaver-command fails. Side effect: exports DISPLAY=:0.
mute() {
  local locked lock_info midnight mdiff
  export DISPLAY=:0
  locked=false
  if lock_info=$(xscreensaver-command -time 2>/dev/null); then
    if [[ $lock_info != *non-blanked* ]]; then
      locked=true
    fi
    midnight=$(date -d 00:00 +%s)
    # seconds since local midnight
    mdiff=$(( EPOCHSECONDS - midnight ))
    if $locked && (( mdiff < 6 *60*60 || mdiff > 21 *60*60 )); then
      case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
        no)
          # for log purposes
          echo muted
          pactl set-sink-mute @DEFAULT_SINK@ true
          ;;
      esac
    fi
    # Both bounds must hold. With "||" this test was true at any time of
    # day, so the sink was unmuted on every run while unlocked.
    # NOTE(review): the 06:00-12:00 window is inferred from the two
    # constants -- confirm this is the intended behavior.
    if ! $locked && (( mdiff > 6 *60*60 && mdiff < 12 *60*60 )) && [[ ! -e /tmp/ianknap ]]; then
      case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
        yes)
          # for log purposes
          echo unmuted
          pactl set-sink-mute @DEFAULT_SINK@ false
          ;;
      esac
    fi
  fi
}
488
# use this if we want to do something just once per minute
# (write-status starts its list of flags from this array on every run)
first_chars=()

# Always do one run right away. With any argument, show the resulting
# status file and stop instead of entering the loop.
write-status
if [[ $1 ]]; then
  cat "$status_file"
  exit 0
fi
497
# Number of completed main-loop iterations; read by write-status.
loop_count=0

# main-loop
# Forever: sleep, then run write-status and mute, then bump loop_count.
# The sleep is 15 seconds, or 60 when /sys/class/power_supply/AC/online
# exists and contains 0 (running on battery). Never returns.
main-loop() {
  local power interval
  while true; do
    power=true
    if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
      power=false
    fi

    if $power; then
      interval=15
    else
      interval=60
    fi

    sleep "$interval"
    write-status
    mute
    loop_count=$(( loop_count + 1 ))
  done
}
518
# ensure our long operations are one line so we are not prone to errors
# from this file being modified.
main-loop