# Basic system status on Ian's computers
4 # Copyright (C) 2024 Ian Kelling
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 # SPDX-License-Identifier: GPL-3.0-or-later
21 # usage: runs once every 15 seconds unless any args are passed, or we
22 # then just runs once and have verbose output. On battery power, run
25 if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi
27 if [[ $EUID != 1000 ]]; then
# Diagnostics go to stderr, matching the bash-version check earlier in the file.
echo "$0: error, expected to be user 1000" >&2
32 source /a
/bin
/bash-bear-trap
/bash-bear
# File that the status output is written to near the end of the script.
status_file=/dev/shm/iank-status
39 for p
in ~
/.gem
/ruby
/*/bin
; do
# p [ARG...]
# Print all arguments as one line: the arguments are joined with the first
# character of IFS (a space by default) and followed by a newline.
p() { printf '%s\n' "$*"; }
54 # log-once COUNT NAME [MESSAGE]
56 if type -p ifne
&>/dev
/null
; then
57 /usr
/local
/bin
/log-once
"$@" | ifne
mail -s "$HOSTNAME: system-status $2" root@localhost
62 if type -p ifne
&>/dev
/null
; then
63 /usr
/local
/bin
/log-once
"$@" | ifne
mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
73 # todo, consider migrating some of these alerts into prometheus
75 chars
=("${first_chars[@]}")
77 services
=( epanicclean
)
87 if systemctl show
-p SubState
--value ${services[@]} |
grep -E -v '^(running|)$' &>/dev
/null
; then
88 for s
in ${services[@]}; do
89 if [[ $
(systemctl show
-p SubState
--value $s 2>&1) != running
]]; then
95 p
${bads[*]} | lo
-240 mysers
102 prometheus-node-exporter
103 prometheus-alertmanager
107 if systemctl show
-p SubState
--value ${services[@]} |
grep -E -v '^(running|)$' &>/dev
/null
; then
108 for s
in ${services[@]}; do
109 if [[ $
(systemctl show
-p SubState
--value $s 2>&1) != running
]]; then
115 p
${bads[*]} | lo
-240 prom
120 # this section copied from servicepid()
122 pid
=$
(systemctl show
--property MainPID
--value $unit ||
:)
126 dir
=/sys
/fs
/cgroup
/system.slice
127 if [[ ! -d $dir ]]; then
128 dir
=/sys
/fs
/cgroup
/systemd
/system.slice
130 pid
=$
(head -n1 $dir/${unit%.service}.service
/cgroup.procs ||
:)
133 if [[ ! $pid ]]; then
138 if [[ -e /a
/bin
/bash_unpublished
/source-state
]]; then
139 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
140 source /a
/bin
/bash_unpublished
/source-state ||
[[ $?
== 1 ]]
144 ## check if last snapshot was recent
# Snapshot age, in seconds, above which it is reported as old (3 hours).
old_snap_limit=$(( 3 * 60 * 60 ))
147 btrbk_root
=/mnt
/o
/btrbk
148 # this section generally copied from btrbk scripts, but
149 # this part modified to speed things up by about half a second.
150 # I'm not sure if its quite as reliable, but it looks pretty safe.
151 # Profiled it using time and also adding to the top of the file:
153 # PS4='+ $(date "+%2N") '
154 # allow failure in case there are no snapshots yet.
156 files
=($btrbk_root/$vol.20*)
158 if (( ${#files[@]} )); then
159 # shellcheck disable=SC2012 # using ls version sort. not sure this is needed.
160 snaps
=("$(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )")
163 for s
in ${snaps[@]}; do
165 t
=$
(date -d "$(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#"$vol."})" +%s
)
166 if (( t
> maxtime
)); then
# Seconds between now and maxtime, and the same span in whole hours
# (integer division, so partial hours are dropped).
last_snap_age=$(( now - maxtime ))
last_snap_hours=$(( last_snap_age / 60 / 60 ))
173 if (( last_snap_age
> old_snap_limit
)); then
174 chars
+=(OLD-SNAP-
${last_snap_hours}h
)
# Take the hour count from old_snap_limit so the text always matches the
# threshold actually being tested (the limit is 3 hours; the text said 4).
snapshotmsg="/$vol snapshot older than $(( old_snap_limit / 60 / 60 )) hours"
176 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
177 p
"$snapshotmsg" | lo
-1 old-snapshot
179 # not bothering to get info on all volumes if we find an old one.
184 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
187 glob
=(/m
/md
/bounces
/new
/*)
188 if [[ -e ${glob[0]} ]]; then
190 bouncemsg
="message in /m/md/bounces/new"
192 p
$bouncemsg | loday
-1 bounce
193 # emails without the S (seen) flag. this only checks the last flag,
194 # but its good enough for me.
195 glob
=(/m
/md
/alerts
/{new
,cur
}/!(*,S
))
196 if [[ -e ${glob[0]} ]]; then
200 glob
=(/m
/md
/daylert
/{new
,cur
}/!(*,S
))
201 if [[ -e ${glob[0]} ]]; then
206 if [[ $
(systemctl is-active btrbk.timer
) != active
]]; then
210 p
"$bbkmsg" | lo
-480 btrbk.timer
214 # commented out, only using timetrap retrospectively.
215 # # clock us out in timetrap if are idle too long
216 # if [[ -e /p/.timetrap.db ]]; then
218 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
219 # if [[ $xidle == [0-9]* ]]; then
220 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
222 # if [[ $sheet == w ]]; then
225 # if [[ $sheet && $xidle -gt $idle ]]; then
231 else # end if $MAIL_HOST
232 rmg
/home
/iank
/cron-errors
/bounce
* \
233 /home
/iank
/cron-errors
/btrbk.timer
* \
234 /home
/iank
/cron-errors
/old-snapshot
*
237 if ip l show tunfsf
&>/dev
/null
; then
238 # this is for tracking dns over tls issue, which
239 # fixvpndns() in brc2 fixes.
240 stat
=$
(resolvectl dnsovertls tunfsf
2>/dev
/null ||
: )
241 read -r _ _ _ istls
<<<"$stat"
245 printf "%s\n" "$istls" | ts
>> /tmp
/istls.log
251 # We do this once every 5 minutes, since this is not a grave problem.
252 # For formatted elisp, see /b/ds/unsaved-buffers.el
253 elisp
='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
254 if [[ ! $last_emacs_check ||
$emacsfiles ]] ||
(( last_emacs_check
< EPOCHSECONDS
- 300 )); then
255 if pgrep
-G iank
-u iank
-f 'emacs --daemon' &>/dev
/null
; then
256 # i dun care if this fails
257 emacsfiles
="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil
"$/d;s/^"(/E
: /;s
/)"$//' ||:)"
258 if [[ $emacsfiles ]]; then
259 chars
+=("$emacsfiles")
262 last_emacs_check
=$EPOCHSECONDS
266 glob
=(/nocow
/btrfs-stale
/*)
267 if [[ -e ${glob[0]} ]]; then
271 if [[ $
(find /var
/mail -type f \
! -empty -print -quit) ]]; then
272 var_mail_msg
="message in /var/mail"
274 p
$var_mail_msg | loday
-1 var_mail
276 # Note, early in install process, we dont have permission yet for exiqgrep.
278 # todo: don't do this every 15 seconds, more like once every 2 minutes to
281 # 2400 = 40 mins. This should allow for system restarts, and
# 30 minute message delay plus 10 minute queue runs.
283 qlen
=$
(/usr
/sbin
/exiqgrep
-o 2400 -c -b |
awk '{print $1}') ||
:
286 # Do sending of long delayed messages, and dont count them in our queue warnings.
287 for mid
in $
(exiqgrep
-o 2400 -zi); do
288 if exim
-Mvh $mid |
awk 'tolower($2) == "fdate:"' |
grep -q .
; then
# shellcheck disable=SC2016 # exim var, not a bash var
291 if (( $
(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s
) < EPOCHSECONDS
)); then
292 if ip a show veth0-mail
&>/dev
/null
; then
293 pid
=$
(pgrep
-f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/my.conf"|
head -n1);
294 nsenter
-t $pid -n -m /usr
/sbin
/exim4
-C /etc
/exim
4/my.conf
-M $mid
296 /usr
/sbin
/exim4
-M $mid
303 qmsg
="queue length $qlen"
308 # No point in emailing about the mailq on a host where we don't
311 p
$qmsg | loday
-120 qlen
314 f
=/var
/spool
/exim
4/gw
/no-delay-eximids
315 if (( loop_count
% 10 == 0 )) && \
316 [[ -s $f ]] && [[ $
(cat $f) == all
]]; then
317 # I've left this on longer than I intended, so just auto-delete
318 # it after some time.
319 find $f -mmin +180 -delete
328 rmg
/home
/iank
/cron-errors
/qlen
*
334 # todo: make this robust to the case of /a not being mounted
335 if ! make -C /b
/ds
-q ~
/.local
/distro-begin
2>/dev
/null ||
[[ $
(<~
/.local
/distro-begin
) != 0 ]]; then
340 if ! make -C /b
/ds
-q ~
/.local
/distro-end
2>/dev
/null ||
[[ $
(<~
/.local
/distro-end
) != 0 ]]; then
344 # these conditions are so we dont have an overly verbose prompt
345 if $begin && $end; then
352 source /a
/bin
/ds
/script-files
354 # shellcheck disable=SC2043
# Offset in minutes from now to fsec, negative when fsec is in the past,
# used below as a find -mmin argument. The division by 60 truncates toward
# zero, which makes the check 0-59 seconds less strict; the +1 helps make
# sure we don't have any false positives.
fmin=$(( (fsec - now + 1 ) / 60 ))
# The same offset pushed forward by one day (60*24 minutes).
fminplus=$(( fmin + 60*24 ))
363 # Filesystem files get copied, so find any newer than the last run.
# The rest are heuristics:
365 # Given the last time we added a file in git, is that newer than the last conflink run.
366 # Given new files not added to git, were they modified more recently than the last conflink? but,
367 # push their modification time back by a day so we can develop them before needing to add them to git.
# Brace expansion yields four directories: the shared "filesystem" tree and
# this host's machine_specific "filesystem" tree, under both /a/bin/ds and
# /p/c. HOSTNAME is quoted so it can never be word-split or globbed.
all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/"$HOSTNAME"/filesystem})
370 # This part is copied from conflink
371 for x
in /p
/c
/machine_specific
/*.hosts
/a
/bin
/ds
/machine_specific
/*.hosts
; do
372 if grep -qxF $HOSTNAME $x; then all_dirs
+=( ${x%.hosts} ); fi
375 script_files
=("${my_service_scripts[@]}" "${my_bin_files[@]}" $my_lib_files)
377 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
378 if (( fmin
< 0 )) && [[ $
(find "${script_files[@]}" ${all_dirs[@]} -mmin $fmin -type f
-print -quit 2>/dev
/null
) ]]; then
379 v conflink newer filesystem files
384 for d
in /a
/bin
/distro-setup
/p
/c
; do
385 [[ -d $d ]] ||
continue
387 if [[ ! -e .git
]]; then
388 # some hosts i dont push all of /p/c
391 if (( $
(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s
) > fsec
)); then
392 v conflink
: newer files checked
in to git
400 done < <(git ls-files
-o --exclude-standard)
401 if [[ ${untracked[0]} && $
(find "${untracked[@]}" -mmin $fminplus -type f
-print -quit) ]]; then
402 v conflink
: untracked
in $d
410 if [[ ! -e $f || $
(<$f) != 0 ]]; then
411 v conflink
: last run not found or failed
418 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
419 if [[ -s /var
/log
/exim
4/paniclog
]]; then
421 # leave it up to epanic-clean to send email notification
424 mprom
=/var
/lib
/prometheus
/node-exporter
/mailtest-check.prom
425 if [[ -s $mprom ]]; then
426 if grep -qE 'mailtest_check_(unexpected|missing).*[^ ][^0]$' $mprom; then
427 chars
+=("MTEST_SPAM")
430 # shellcheck disable=SC2013 # these are words
431 for t
in $
(grep -E ^mailtest_check_last_usec
$mprom |
awk '{print $NF}'); do
432 if (( t
+ 60 * 20 < EPOCHSECONDS
)); then
436 if $mtest_found; then
441 if [[ ! -e $status_file ||
-w $status_file ]]; then
442 if [[ -e /a
/bin
/bash_unpublished
/source-state
]]; then
443 cat /a
/bin
/bash_unpublished
/source-state
>$status_file
446 if [[ ${chars[*]} ]]; then
447 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
451 if [[ -e $HOME/.iank-stream-on
]] && ! pgrep
-fc '^ffmpeg.*icecast://source.*/fsf-sysops' >/dev
/null
; then
452 rm -f $HOME/.iank-stream-on
457 # This prevents me having to mute notifications when I'm going to bed.
462 if lock_info
=$
(xscreensaver-command
-time 2>/dev
/null
); then
463 if [[ $lock_info != *non-blanked
* ]]; then
# Epoch time of today's 00:00 (local time), and the number of seconds that
# have elapsed since then.
midnight=$(date -d 00:00 +%s)
mdiff=$(( EPOCHSECONDS - midnight ))
468 if $locked && (( mdiff
< 6 *60*60 || mdiff
> 21 *60*60 )); then
469 case $
(pactl get-sink-mute @DEFAULT_SINK@ |
awk '{print $2}') in
473 pactl set-sink-mute @DEFAULT_SINK@ true
477 if ! $locked && (( mdiff
> 6 *60*60 || mdiff
< 12 *60*60 )) && [[ ! -e /tmp
/ianknap
]]; then
478 case $
(pactl get-sink-mute @DEFAULT_SINK@ |
awk '{print $2}') in
482 pactl set-sink-mute @DEFAULT_SINK@ false
489 # use this if we want to do something just once per minute
502 if [[ -e /sys
/class
/power_supply
/AC
/online
&& $
(</sys
/class
/power_supply
/AC
/online
) == 0 ]]; then
# Advance the pass counter; checks elsewhere in the script test it with a
# modulus so that they run only on every Nth pass.
loop_count=$(( loop_count + 1 ))
# ensure our long operations are one line so we are not prone to errors
520 # from this file being modified.