host info updates
[distro-setup] / system-status
1 #!/bin/bash
2
# Basic system status on Ian's computers
4 # Copyright (C) 2024 Ian Kelling
5
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18
19 # SPDX-License-Identifier: GPL-3.0-or-later
20
# usage: runs once every 15 seconds unless any args are passed, in which
# case it just runs once and has verbose output. On battery power, runs
# once per minute.
24
# Written with plain [ ] so the check itself still works if a non-bash
# shell is interpreting this file.
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

# Only uid 1000 may run this. The diagnostic goes to stderr, like the
# bash check above.
if [[ $EUID != 1000 ]]; then
  echo "$0: error, expected to be user 1000" >&2
  exit 1
fi

# NOTE(review): presumably installs error handling for the rest of the
# script -- confirm against the bash-bear project.
source /a/bin/bash-bear-trap/bash-bear
# File that write-status writes its result to.
status_file=/dev/shm/iank-status

# nullglob: unmatched globs expand to nothing (the gem loop below and the
# rmg calls depend on it). extglob: needed for the !(...) patterns used in
# write-status.
shopt -s nullglob
shopt -s dotglob
shopt -s extglob

# Append the bin directory of every per-user ruby gem version to PATH.
for p in ~/.gem/ruby/*/bin; do
  PATH="$PATH:$p"
done
42
43
# Any non-empty first argument turns on verbose output (consumed by v).
if [[ -n $1 ]]; then
  verbose=true
else
  verbose=false
fi
# v MESSAGE...
# Print the arguments joined into one line, but only when $verbose is true.
# Always returns success when quiet.
v() {
  $verbose || return 0
  printf "%s\n" "$*"
}
# p MESSAGE...
# Print the arguments joined into a single newline-terminated line.
p() {
  local joined="$*"
  printf "%s\n" "$joined"
}
# log-once COUNT NAME [MESSAGE]
#
# Run /usr/local/bin/log-once with the given arguments (it inherits our
# stdin) and mail anything it prints to root@localhost, with NAME ($2) in
# the subject. Does nothing, successfully, when ifne is not installed.
lo() {
  # ifne (moreutils) only runs mail when there is output to send.
  type -p ifne &>/dev/null || return 0
  /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
}
60
# loday COUNT NAME [MESSAGE]
#
# Same as lo, except the mail goes to daylert@iankelling.org. Does
# nothing, successfully, when ifne is not installed.
loday() {
  # ifne (moreutils) only runs mail when there is output to send.
  type -p ifne &>/dev/null || return 0
  /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
}
# rm glob
#
# rmg [FILE...]
# Force-remove the given files. With no arguments (for example a glob that
# matched nothing under nullglob) it is a successful no-op instead of an
# rm usage error.
rmg() {
  (( $# )) || return 0
  rm -f "$@"
}
72
73 # todo, consider migrating some of these alerts into prometheus
74 write-status() {
75 chars=("${first_chars[@]}")
76
77 services=( epanicclean )
78 case $HOSTNAME in
79 bk|je|li) : ;;
80 *)
81 services+=(
82 systemstatus
83 btrfsmaintstop
84 dynamicipupdate
85 )
86 bads=()
87 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
88 for s in ${services[@]}; do
89 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
90 bads+=($s)
91 fi
92 done
93 chars+=(MYSERS)
94 fi
95 p ${bads[*]} | lo -240 mysers
96 ;;
97 esac
98
99 case $HOSTNAME in
100 kd)
101 services=(
102 prometheus-node-exporter
103 prometheus-alertmanager
104 prometheus
105 )
106 bads=()
107 if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
108 for s in ${services[@]}; do
109 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
110 bads+=($s)
111 fi
112 done
113 chars+=(PROM)
114 fi
115 p ${bads[*]} | lo -240 prom
116 ;;
117 esac
118
119
120 # this section copied from servicepid()
121 unit=exim4
122 pid=$(systemctl show --property MainPID --value $unit ||:)
123 case $pid in
124 [1-9]*) : ;;
125 *)
126 dir=/sys/fs/cgroup/system.slice
127 if [[ ! -d $dir ]]; then
128 dir=/sys/fs/cgroup/systemd/system.slice
129 fi;
130 pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:)
131 ;;
132 esac
133 if [[ ! $pid ]]; then
134 chars+=(EXIM)
135 fi
136
137
138 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
139 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
140 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
141 fi
142
143
144 ## check if last snapshot was recent
145 old_snap_limit=$(( 3 * 60 * 60 ))
146 vol=o
147 btrbk_root=/mnt/o/btrbk
148 # this section generally copied from btrbk scripts, but
149 # this part modified to speed things up by about half a second.
150 # I'm not sure if its quite as reliable, but it looks pretty safe.
151 # Profiled it using time and also adding to the top of the file:
152 # set -x
153 # PS4='+ $(date "+%2N") '
154 # allow failure in case there are no snapshots yet.
155 shopt -s nullglob
156 files=($btrbk_root/$vol.20*)
157 shopt -u nullglob
158 if (( ${#files[@]} )); then
159 # shellcheck disable=SC2012 # using ls version sort. not sure this is needed.
160 snaps=("$(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )")
161 now=$EPOCHSECONDS
162 maxtime=0
163 for s in ${snaps[@]}; do
164 file=${s##*/}
165 t=$(date -d "$(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#"$vol."})" +%s)
166 if (( t > maxtime )); then
167 maxtime=$t
168 fi
169 done
170 snapshotmsg=
171 last_snap_age=$(( now - maxtime ))
172 last_snap_hours=$(( last_snap_age / 60 / 60 ))
173 if (( last_snap_age > old_snap_limit )); then
174 chars+=(OLD-SNAP-${last_snap_hours}h)
175 snapshotmsg="/$vol snapshot older than 4 hours"
176 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
177 p "$snapshotmsg" | lo -1 old-snapshot
178 fi
179 # not bothering to get info on all volumes if we find an old one.
180 fi
181 fi
182
183
184 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
185
186 bouncemsg=
187 glob=(/m/md/bounces/new/*)
188 if [[ -e ${glob[0]} ]]; then
189 chars+=(BOUNCE)
190 bouncemsg="message in /m/md/bounces/new"
191 fi
192 p $bouncemsg | loday -1 bounce
193 # emails without the S (seen) flag. this only checks the last flag,
194 # but its good enough for me.
195 glob=(/m/md/alerts/{new,cur}/!(*,S))
196 if [[ -e ${glob[0]} ]]; then
197 chars+=(A)
198 fi
199
200 glob=(/m/md/daylert/{new,cur}/!(*,S))
201 if [[ -e ${glob[0]} ]]; then
202 chars+=(DAY)
203 fi
204
205 bbkmsg=
206 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
207 chars+=(BTRBK.TIMER)
208 bbkmsg="not enabled"
209 fi
210 p "$bbkmsg" | lo -480 btrbk.timer
211
212
213
214 # commented out, only using timetrap retrospectively.
215 # # clock us out in timetrap if are idle too long
216 # if [[ -e /p/.timetrap.db ]]; then
217 # export DISPLAY=:0
218 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
219 # if [[ $xidle == [0-9]* ]]; then
220 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
221 # idle=300000
222 # if [[ $sheet == w ]]; then
223 # idle=900000
224 # fi
225 # if [[ $sheet && $xidle -gt $idle ]]; then
226 # timetrap out
227 # fi
228 # fi
229 # fi
230 # fi
231 else # end if $MAIL_HOST
232 rmg /home/iank/cron-errors/bounce* \
233 /home/iank/cron-errors/btrbk.timer* \
234 /home/iank/cron-errors/old-snapshot*
235 fi
236
237 if ip l show tunfsf &>/dev/null; then
238 # this is for tracking dns over tls issue, which
239 # fixvpndns() in brc2 fixes.
240 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
241 read -r _ _ _ istls <<<"$stat"
242 case $istls in
243 no) : ;;
244 *)
245 printf "%s\n" "$istls" | ts >> /tmp/istls.log
246 chars+=("T:$istls")
247 ;;
248 esac
249 fi
250
251 # We do this once every 5 minutes, since this is not a grave problem.
252 # For formatted elisp, see /b/ds/unsaved-buffers.el
253 elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
254 if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then
255 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
256 # i dun care if this fails
257 emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)"
258 if [[ $emacsfiles ]]; then
259 chars+=("$emacsfiles")
260 fi
261 fi
262 last_emacs_check=$EPOCHSECONDS
263 fi
264
265
266 glob=(/nocow/btrfs-stale/*)
267 if [[ -e ${glob[0]} ]]; then
268 chars+=(STALE)
269 fi
270 var_mail_msg=
271 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
272 var_mail_msg="message in /var/mail"
273 fi
274 p $var_mail_msg | loday -1 var_mail
275
276 # Note, early in install process, we dont have permission yet for exiqgrep.
277 #
278 # todo: don't do this every 15 seconds, more like once every 2 minutes to
279 # save cpu cycles.
280 #
281 # 2400 = 40 mins. This should allow for system restarts, and
282 # 30 minute message delay plus 10 minute queu runs.
283 qlen=$(/usr/sbin/exiqgrep -o 2400 -c -b | awk '{print $1}') ||:
284 qmsg=
285 if ((qlen)); then
286 # Do sending of long delayed messages, and dont count them in our queue warnings.
287 for mid in $(exiqgrep -o 2400 -zi); do
288 if exim -Mvh $mid | awk 'tolower($2) == "fdate:"' | grep -q .; then
289 qlen=$(( qlen - 1 ))
290 # shellcheck disable=SC2016 # exim var, not a bash bar
291 if (( $(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s) < EPOCHSECONDS )); then
292 if ip a show veth0-mail &>/dev/null; then
293 pid=$(pgrep -f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/my.conf"|head -n1);
294 nsenter -t $pid -n -m /usr/sbin/exim4 -C /etc/exim4/my.conf -M $mid
295 else
296 /usr/sbin/exim4 -M $mid
297 fi
298 fi
299 fi
300 done
301
302 if ((qlen)); then
303 qmsg="queue length $qlen"
304 chars+=("q $qlen")
305 fi
306 fi
307 case $HOSTNAME in
308 # No point in emailing about the mailq on a host where we don't
309 # check email.
310 $MAIL_HOST)
311 p $qmsg | loday -120 qlen
312 ;;
313 *)
314 rmg /home/iank/cron-errors/qlen*
315 ;;
316 esac
317
318 begin=false
319
320 # todo: make this robust to the case of /a not being mounted
321 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
322 begin=true
323 fi
324
325 end=false
326 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
327 end=true
328 fi
329
330 # these conditions are so we dont have an overly verbose prompt
331 if $begin && $end; then
332 chars+=(D)
333 elif $begin; then
334 chars+=(DB)
335 elif $end; then
336 chars+=(DE)
337 else
338 source /a/bin/ds/script-files
339 f=~/.local/conflink
340 # shellcheck disable=SC2043
341 for _ in 1; do
342 if [[ -e $f ]]; then
343 now=$EPOCHSECONDS
344 fsec=$(stat -c%Y $f)
345 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
346 # dont have any false positives.
347 fmin=$(( (fsec - now + 1 ) / 60 ))
348 fminplus=$(( fmin + 60*24 ))
349 # Filesystem files get copied, so find any newer than the last run.
350 # The rest are hueristics:
351 # Given the last time we added a file in git, is that newer than the last conflink run.
352 # Given new files not added to git, were they modified more recently than the last conflink? but,
353 # push their modification time back by a day so we can develop them before needing to add them to git.
354
355 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
356 # This part is copied from conflink
357 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
358 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
359 done
360
361 script_files=("${my_service_scripts[@]}" "${my_bin_files[@]}" $my_lib_files)
362
363 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
364 if (( fmin < 0 )) && [[ $(find "${script_files[@]}" ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
365 v conflink newer filesystem files
366 chars+=(CONFLINK)
367 break
368 fi
369
370 for d in /a/bin/distro-setup /p/c; do
371 [[ -d $d ]] || continue
372 cd $d
373 if [[ ! -e .git ]]; then
374 # some hosts i dont push all of /p/c
375 continue
376 fi
377 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
378 v conflink: newer files checked in to git
379 chars+=(CONFLINK)
380 break
381 fi
382
383 untracked=()
384 while read -r l; do
385 untracked+=("$l")
386 done < <(git ls-files -o --exclude-standard)
387 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
388 v conflink: untracked in $d
389 chars+=(CONFLINK)
390 break
391 fi
392 done
393 cd /
394
395 fi
396 if [[ ! -e $f || $(<$f) != 0 ]]; then
397 v conflink: last run not found or failed
398 chars+=(CONFLINK)
399 break
400 fi
401 done
402 fi
403
404 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
405 if [[ -s /var/log/exim4/paniclog ]]; then
406 chars+=("PANIC!")
407 # leave it up to epanic-clean to send email notification
408 fi
409
410 mprom=/var/lib/prometheus/node-exporter/mailtest-check.prom
411 if [[ -s $mprom ]]; then
412 if grep -qE 'mailtest_check_(unexpected|missing).*[^ ][^0]$' $mprom; then
413 chars+=("MTEST_SPAM")
414 fi
415 mtest_found=false
416 # shellcheck disable=SC2013 # these are words
417 for t in $(grep -E ^mailtest_check_last_usec $mprom | awk '{print $NF}'); do
418 if (( t + 60 * 20 < EPOCHSECONDS )); then
419 mtest_found=true
420 fi
421 done
422 if $mtest_found; then
423 chars+=("MTEST_AGE")
424 fi
425 fi
426
427 if [[ ! -e $status_file || -w $status_file ]]; then
428 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
429 cat /a/bin/bash_unpublished/source-state >$status_file
430 fi
431
432 if [[ ${chars[*]} ]]; then
433 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
434 fi
435 fi
436 }
437
# This prevents me having to mute notifications when I'm going to bed.
#
# mute
#
# Mute the default PulseAudio sink when the screen is locked late at night
# or early in the morning (before 06:00 or after 21:00), and unmute it
# again when the screen is unlocked and /tmp/ianknap does not exist.
# Prints "muted" / "unmuted" when it changes the state. Exports DISPLAY=:0
# as a side effect so xscreensaver-command can reach the display.
mute() {
  # Keep the working variables out of the global namespace.
  local locked lock_info midnight mdiff
  export DISPLAY=:0
  locked=false
  if lock_info=$(xscreensaver-command -time); then
    if [[ $lock_info != *non-blanked* ]]; then
      locked=true
    fi
  else
    # Could not query xscreensaver: treat the screen as locked.
    locked=true
  fi
  # Seconds elapsed since local midnight.
  midnight=$(date -d 00:00 +%s)
  mdiff=$(( EPOCHSECONDS - midnight ))
  if $locked && (( mdiff < 6 *60*60 || mdiff > 21 *60*60 )); then
    case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
      no)
        # for log purposes
        echo muted
        pactl set-sink-mute @DEFAULT_SINK@ true
        ;;
    esac
  fi
  # NOTE(review): "mdiff > 6h || mdiff < 12h" is true for every time of
  # day, so this currently unmutes whenever the screen is unlocked. If a
  # time window was intended the operator should probably be && -- left
  # unchanged until the intent is confirmed.
  if ! $locked && (( mdiff > 6 *60*60 || mdiff < 12 *60*60 )) && [[ ! -e /tmp/ianknap ]]; then
    case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
      yes)
        # for log purposes
        echo unmuted
        pactl set-sink-mute @DEFAULT_SINK@ false
        ;;
    esac
  fi
}
471
# use this if we want to do something just once per minute
# (write-status starts its token list from this array)
first_chars=()

# Always produce the status once before anything else.
write-status

# Any argument means one-shot mode: show what was written and stop
# instead of entering the loop.
if [[ -n $1 ]]; then
  cat $status_file
  exit 0
fi

# Iteration counter, advanced by main-loop.
loop_count=0
# main-loop
#
# Never returns. Each pass picks the sleep interval (15 seconds on mains
# power, 60 on battery), sleeps, refreshes the status via write-status,
# runs mute, and bumps loop_count. On mains power, every 10th pass starts
# bitcoinon in the background when the battery reads 90% or more (or has
# no readable capacity file); on battery, bitcoinoff runs on every pass.
main-loop() {
  while :; do
    # Assume mains power unless the AC supply explicitly reports 0.
    power=true
    if [[ -e /sys/class/power_supply/AC/online ]] \
         && [[ $(</sys/class/power_supply/AC/online) == 0 ]]; then
      power=false
    fi
    wait=15

    if ! $power; then
      bitcoinoff
      wait=60
    elif (( loop_count % 10 == 0 )); then
      if [[ -r /sys/class/power_supply/BAT0/capacity ]]; then
        bat=$(cat /sys/class/power_supply/BAT0/capacity)
      else
        # No battery capacity to read: treat it as full.
        bat=100
      fi
      if [[ $bat == 100 || $bat == 9? ]]; then
        bitcoinon &
      fi
    fi

    sleep $wait
    write-status
    mute
    loop_count=$(( loop_count + 1 ))
  done
}
515
516 # ensure our long operations are one line so we are not prone errors
517 # from this file being modified.
518 main-loop