#!/bin/bash
-# Copyright (C) 2019 Ian Kelling
-# SPDX-License-Identifier: AGPL-3.0-or-later
-# usage: runs 4 times every 15 seconds unless any args are passed, or we
-# are on battery power, then just runs once.
+# Basic system status on Ian's computers
+# Copyright (C) 2024 Ian Kelling
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# usage: runs once every 15 seconds, unless any args are passed, in
+# which case it runs just once and has verbose output. On battery
+# power, runs once per minute.
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi
-source /a/bin/errhandle/err
+if [[ $EUID != 1000 ]]; then
+ echo "$0: error, expected to be user 1000"
+ exit 1
+fi
+
+source /a/bin/bash-bear-trap/bash-bear
status_file=/dev/shm/iank-status
shopt -s nullglob
printf "%s\n" "$*"
fi
}
+# p ARGS...: print all arguments joined into one line (via "$*"), followed
+# by a newline. With no arguments this prints an empty line.
+p() { printf "%s\n" "$*"; }
# log-once COUNT NAME [MESSAGE]
lo() {
- /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
+ if type -p ifne &>/dev/null; then
+ /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
+ fi
}
+# loday COUNT NAME: same pipeline as lo, but the mail goes to
+# daylert@iankelling.org instead of root@localhost. Arguments are passed
+# straight to /usr/local/bin/log-once; callers pipe the message in on stdin.
+# ifne only invokes mail when log-once produced output. Does nothing at all
+# when ifne is not installed.
+loday() {
+ if type -p ifne &>/dev/null; then
+ /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
+ fi
+}
+# rmg FILE...: "rm glob". Runs rm -f on the given files, and does nothing
+# when called with no arguments. Callers pass unquoted globs, which can
+# expand to zero words (e.g. when nullglob is in effect and nothing matches).
+rmg() {
+ if (( $# )); then
+ rm -f "$@"
+ fi
+}
+
+# todo, consider migrating some of these alerts into prometheus
write-status() {
chars=("${first_chars[@]}")
- # clock us out in timetrap if are idle too long
- if [[ -e /p/.timetrap.db ]]; then
- export DISPLAY=:0
- if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
- if [[ $xidle == [0-9]* ]]; then
- sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
- idle=300000
- if [[ $sheet == w ]]; then
- idle=900000
- fi
- if [[ $sheet && $xidle -gt $idle ]]; then
- timetrap out
- fi
+ services=( epanicclean )
+ case $HOSTNAME in
+ bk|je|li) : ;;
+ *)
+ services+=(
+ systemstatus
+ btrfsmaintstop
+ dynamicipupdate
+ )
+ bads=()
+ if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
+ for s in ${services[@]}; do
+ if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
+ bads+=($s)
+ fi
+ done
+ chars+=(MYSERS)
+ fi
+ p ${bads[*]} | lo -240 mysers
+ ;;
+ esac
+
+ case $HOSTNAME in
+ kd)
+ services=(
+ prometheus-node-exporter
+ prometheus-alertmanager
+ prometheus
+ )
+ bads=()
+ if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then
+ for s in ${services[@]}; do
+ if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
+ bads+=($s)
+ fi
+ done
+ chars+=(PROM)
+ fi
+ p ${bads[*]} | lo -240 prom
+ ;;
+ esac
+
+
+ # this section copied from servicepid()
+ unit=exim4
+ pid=$(systemctl show --property MainPID --value $unit ||:)
+ case $pid in
+ [1-9]*) : ;;
+ *)
+ dir=/sys/fs/cgroup/system.slice
+ if [[ ! -d $dir ]]; then
+ dir=/sys/fs/cgroup/systemd/system.slice
+ fi;
+ pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:)
+ ;;
+ esac
+ if [[ ! $pid ]]; then
+ chars+=(EXIM)
+ fi
+
+
+ if [[ -e /a/bin/bash_unpublished/source-state ]]; then
+ # /a gets remounted due to btrbk, ignore error code for file doesnt exist
+ source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
+ fi
+
+
+ ## check if last snapshot was recent
+ old_snap_limit=$(( 3 * 60 * 60 ))
+ vol=o
+ btrbk_root=/mnt/o/btrbk
+ # this section generally copied from btrbk scripts, but
+ # this part modified to speed things up by about half a second.
+ # I'm not sure if its quite as reliable, but it looks pretty safe.
+ # Profiled it using time and also adding to the top of the file:
+ # set -x
+ # PS4='+ $(date "+%2N") '
+ # allow failure in case there are no snapshots yet.
+ shopt -s nullglob
+ files=($btrbk_root/$vol.20*)
+ shopt -u nullglob
+ if (( ${#files[@]} )); then
+ # shellcheck disable=SC2012 # using ls version sort. not sure this is needed.
+ snaps=("$(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )")
+ now=$EPOCHSECONDS
+ maxtime=0
+ for s in ${snaps[@]}; do
+ file=${s##*/}
+ t=$(date -d "$(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#"$vol."})" +%s)
+ if (( t > maxtime )); then
+ maxtime=$t
fi
+ done
+ snapshotmsg=
+ last_snap_age=$(( now - maxtime ))
+ last_snap_hours=$(( last_snap_age / 60 / 60 ))
+ if (( last_snap_age > old_snap_limit )); then
+ chars+=(OLD-SNAP-${last_snap_hours}h)
+ snapshotmsg="/$vol snapshot older than 4 hours"
+ if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
+ p "$snapshotmsg" | lo -1 old-snapshot
+ fi
+ # not bothering to get info on all volumes if we find an old one.
fi
fi
- if pgrep -f 'emacs --daemon' &>/dev/null; then
- emacsfiles="$(emacsclient --eval "$(cat /a/bin/ds/unsaved-buffers.el)"| sed '/^"nil"$/d;s/^"(/E: /;s/)"$//')"
- if [[ $emacsfiles ]]; then
- chars+=("$emacsfiles")
+
+ if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
+
+ bouncemsg=
+ glob=(/m/md/bounces/new/*)
+ if [[ -e ${glob[0]} ]]; then
+ chars+=(BOUNCE)
+ bouncemsg="message in /m/md/bounces/new"
+ fi
+ p $bouncemsg | loday -1 bounce
+ # emails without the S (seen) flag. this only checks the last flag,
+ # but its good enough for me.
+ glob=(/m/md/alerts/{new,cur}/!(*,S))
+ if [[ -e ${glob[0]} ]]; then
+ chars+=(A)
+ fi
+
+ glob=(/m/md/daylert/{new,cur}/!(*,S))
+ if [[ -e ${glob[0]} ]]; then
+ chars+=(DAY)
fi
+
+ bbkmsg=
+ if [[ $(systemctl is-active btrbk.timer) != active ]]; then
+ chars+=(BTRBK.TIMER)
+ bbkmsg="not enabled"
+ fi
+ p "$bbkmsg" | lo -480 btrbk.timer
+
+
+
+ # commented out, only using timetrap retrospectively.
+ # # clock us out in timetrap if are idle too long
+ # if [[ -e /p/.timetrap.db ]]; then
+ # export DISPLAY=:0
+ # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
+ # if [[ $xidle == [0-9]* ]]; then
+ # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
+ # idle=300000
+ # if [[ $sheet == w ]]; then
+ # idle=900000
+ # fi
+ # if [[ $sheet && $xidle -gt $idle ]]; then
+ # timetrap out
+ # fi
+ # fi
+ # fi
+ # fi
+ else # end if $MAIL_HOST
+ rmg /home/iank/cron-errors/bounce* \
+ /home/iank/cron-errors/btrbk.timer* \
+ /home/iank/cron-errors/old-snapshot*
fi
+ if ip l show tunfsf &>/dev/null; then
+ # this is for tracking dns over tls issue, which
+ # fixvpndns() in brc2 fixes.
+ stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
+ read -r _ _ _ istls <<<"$stat"
+ case $istls in
+ no) : ;;
+ *)
+ printf "%s\n" "$istls" | ts >> /tmp/istls.log
+ chars+=("T:$istls")
+ ;;
+ esac
+ fi
+
+ # We do this once every 5 minutes, since this is not a grave problem.
+ # For formatted elisp, see /b/ds/unsaved-buffers.el
+ elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
+ if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then
+ if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
+ # i dun care if this fails
+ emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)"
+ if [[ $emacsfiles ]]; then
+ chars+=("$emacsfiles")
+ fi
+ fi
+ last_emacs_check=$EPOCHSECONDS
+ fi
+
+
glob=(/nocow/btrfs-stale/*)
if [[ -e ${glob[0]} ]]; then
- chars+=("STALE")
+ chars+=(STALE)
fi
+ var_mail_msg=
if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
var_mail_msg="message in /var/mail"
fi
- lo -1 var_mail $var_mail_msg
- glob=(/m/md/bounces/new/*)
- if [[ -e ${glob[0]} ]]; then
- chars+=("BOUNCE")
- bouncemsg="message in /m/md/bounces/new"
- fi
- lo -1 bounce $bouncemsg
- # emails without the S (seen) flag. this only checks the last flag,
- # but its good enough for me.
- glob=(/m/md/alerts/{new,cur}/!(*,S))
- if [[ -e ${glob[0]} ]]; then
- chars+=("A")
- fi
- tmp=(/var/local/cron-errors/mailtest-check*)
- if (( ${#tmp[@]} )); then
- chars+=("MAILPING")
- fi
- tmp=(/var/local/cron-errors/mailtest-slow*)
- if (( ${#tmp[@]} )); then
- chars+=("SPAMD")
- fi
+ p $var_mail_msg | loday -1 var_mail
- # early in install process, we dont have permission yet for exiqgrep
- qlen=$(/usr/sbin/exiqgrep -o 600 -c -b | awk '{print $1}') ||:
+ # Note, early in install process, we dont have permission yet for exiqgrep.
+ #
+ # todo: don't do this every 15 seconds, more like once every 2 minutes to
+ # save cpu cycles.
+ #
+ # 2400 = 40 mins. This should allow for system restarts, and
+ # 30 minute message delay plus 10 minute queue runs.
+ qlen=$(/usr/sbin/exiqgrep -o 2400 -c -b | awk '{print $1}') ||:
+ qmsg=
if ((qlen)); then
- qmsg="queue length $qlen"
- chars+=("q $qlen")
+ # Do sending of long delayed messages, and dont count them in our queue warnings.
+ for mid in $(exiqgrep -o 2400 -zi); do
+ if exim -Mvh $mid | awk 'tolower($2) == "fdate:"' | grep -q .; then
+ qlen=$(( qlen - 1 ))
+ # shellcheck disable=SC2016 # exim var, not a bash var
+ if (( $(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s) < EPOCHSECONDS )); then
+ if ip a show veth0-mail &>/dev/null; then
+ pid=$(pgrep -f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/nn-mainlog.conf"|head -n1);
+ nsenter -t $pid -n -m /usr/sbin/exim4 -C /etc/exim4/nn-mainlog.conf -M $mid
+ else
+ /usr/sbin/exim4 -M $mid
+ fi
+ fi
+ fi
+ done
+
+ if ((qlen)); then
+ qmsg="queue length $qlen"
+ chars+=("q $qlen")
+ fi
fi
case $HOSTNAME in
# No point in emailing about the mailq on a host where we don't
# check email.
- $MAIL_HOST|bk)
- lo -120 qlen $qmsg
+ $MAIL_HOST)
+ p $qmsg | loday -120 qlen
+
+
+ f=/var/spool/exim4/gw/no-delay-eximids
+ if (( loop_count % 10 == 0 )) && \
+ [[ -s $f ]] && [[ $(cat $f) == all ]]; then
+ # I've left this on longer than I intended, so just auto-delete
+ # it after some time.
+ find $f -mmin +180 -delete
+ if [[ -s $f ]]; then
+ chars+=("NO_DELAY")
+ fi
+ fi
+
+
+ ;;
+ *)
+ rmg /home/iank/cron-errors/qlen*
;;
esac
begin=false
- if ! make -C /b/ds -q ~/.local/distro-begin || [[ $(<~/.local/distro-begin) != 0 ]]; then
+
+ # todo: make this robust to the case of /a not being mounted
+ if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
begin=true
fi
end=false
- if ! make -C /b/ds -q ~/.local/distro-end || [[ $(<~/.local/distro-end) != 0 ]]; then
+ if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
end=true
fi
# these conditions are so we dont have an overly verbose prompt
if $begin && $end; then
- chars+=("D")
+ chars+=(D)
elif $begin; then
- chars+=("DB")
+ chars+=(DB)
elif $end; then
- chars+=("DE")
+ chars+=(DE)
else
+ source /a/bin/ds/script-files
f=~/.local/conflink
# shellcheck disable=SC2043
for _ in 1; do
if [[ -e $f ]]; then
- now=$(date +%s)
+ now=$EPOCHSECONDS
fsec=$(stat -c%Y $f)
# the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
# dont have any false positives.
if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
done
+ script_files=("${my_service_scripts[@]}" "${my_bin_files[@]}" $my_lib_files)
+
# Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
- if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
+ if (( fmin < 0 )) && [[ $(find "${script_files[@]}" ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
v conflink newer filesystem files
- chars+=("CONFLINK")
+ chars+=(CONFLINK)
break
fi
for d in /a/bin/distro-setup /p/c; do
+ [[ -d $d ]] || continue
cd $d
if [[ ! -e .git ]]; then
# some hosts i dont push all of /p/c
fi
if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
v conflink: newer files checked in to git
- chars+=("CONFLINK")
+ chars+=(CONFLINK)
break
fi
done < <(git ls-files -o --exclude-standard)
if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
v conflink: untracked in $d
- chars+=("CONFLINK")
+ chars+=(CONFLINK)
break
fi
done
fi
if [[ ! -e $f || $(<$f) != 0 ]]; then
v conflink: last run not found or failed
- chars+=("CONFLINK")
+ chars+=(CONFLINK)
break
fi
done
fi
+ # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
if [[ -s /var/log/exim4/paniclog ]]; then
chars+=("PANIC!")
# leave it up to epanic-clean to send email notification
fi
- source /a/bin/bash_unpublished/source-state
- if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
- bbkmsg=
- if [[ $(systemctl is-active btrbk.timer) != active ]]; then
- chars+=("BTRBK.TIMER")
- bbkmsg="btrbk.timer not enabled"
+ mprom=/var/lib/prometheus/node-exporter/mailtest-check.prom
+ if [[ -s $mprom ]]; then
+ if grep -qE 'mailtest_check_(unexpected|missing).*[^ ][^0]$' $mprom; then
+ chars+=("MTEST_SPAM")
fi
- lo -960 btrbk.timer $bbkmsg
-
- ## check if last snapshot was within an hour
- vol=o
- # this section generally copied from btrbk scripts, but
- # this part modified to speed things up by about half a second.
- # I'm not sure if its quite as reliable, but it looks pretty safe.
- # Profiled it using time and also adding to the top of the file:
- # set -x
- # PS4='+ $(date "+%2N") '
- # allow failure in case there are no snapshots yet.
- # shellcheck disable=SC2012
- shopt -u nullglob
- files=(/mnt/root/btrbk/$vol.20*)
- shopt -s nullglob
- snaps=()
- if (( ${#files[@]} )); then
- snaps=($(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : ))
- fi
- now=$(date +%s)
- maxtime=0
- for s in ${snaps[@]}; do
- file=${s##*/}
- t=$(date -d $(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#$vol.}) +%s)
- if (( t > maxtime )); then
- maxtime=$t
+ mtest_found=false
+ # shellcheck disable=SC2013 # these are words
+ for t in $(grep -E ^mailtest_check_last_usec $mprom | awk '{print $NF}'); do
+ if (( t + 60 * 20 < EPOCHSECONDS )); then
+ mtest_found=true
fi
done
- if (( maxtime < now - 4*60*60 )); then
- chars+=("OLD-SNAP")
- snapshotmsg="/o snapshot older than 4 hours"
+ if $mtest_found; then
+ chars+=("MTEST_AGE")
fi
- lo -1 old-snapshot $snapshotmsg
fi
- cat /a/bin/bash_unpublished/source-state >$status_file
+ if [[ ! -e $status_file || -w $status_file ]]; then
+ if [[ -e /a/bin/bash_unpublished/source-state ]]; then
+ cat /a/bin/bash_unpublished/source-state >$status_file
+ fi
- if [[ ${chars[*]} ]]; then
- echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
+ if [[ ${chars[*]} ]]; then
+ echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
+ fi
fi
+ if [[ -e $HOME/.iank-stream-on ]] && ! pgrep -fc '^ffmpeg.*icecast://source.*/fsf-sysops' >/dev/null; then
+ rm -f $HOME/.iank-stream-on
+ fi
+
+} # end write-status
+
+# mute: automatically mute/unmute the default audio sink around bedtime.
+# This prevents me having to mute notifications when I'm going to bed.
+#
+# Asks xscreensaver on display :0 whether the screen is locked (any state
+# other than "non-blanked" counts as locked). If xscreensaver-command
+# fails, nothing is done.
+#   - locked, and before 06:00 or after 21:00: mute if currently unmuted.
+#   - not locked, between 06:00 and 12:00, and /tmp/ianknap does not
+#     exist: unmute if currently muted.
+# Prints "muted" / "unmuted" whenever it changes the state, for log purposes.
+# Takes no arguments. Side effect: exports DISPLAY=:0.
+mute() {
+  local locked lock_info midnight mdiff
+  export DISPLAY=:0
+  locked=false
+  if lock_info=$(xscreensaver-command -time 2>/dev/null); then
+    if [[ $lock_info != *non-blanked* ]]; then
+      locked=true
+    fi
+    # mdiff: seconds elapsed since local midnight.
+    midnight=$(date -d 00:00 +%s)
+    mdiff=$(( EPOCHSECONDS - midnight ))
+    if $locked && (( mdiff < 6 *60*60 || mdiff > 21 *60*60 )); then
+      case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
+        no)
+          # for log purposes
+          echo muted
+          pactl set-sink-mute @DEFAULT_SINK@ true
+          ;;
+      esac
+    fi
+    # The two bounds must be joined with &&: with || the test is true at
+    # every time of day, so the time window had no effect.
+    # NOTE(review): assumes the intent is a morning-only (06:00-12:00)
+    # auto-unmute window -- confirm.
+    if ! $locked && (( mdiff > 6 *60*60 && mdiff < 12 *60*60 )) && [[ ! -e /tmp/ianknap ]]; then
+      case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in
+        yes)
+          # for log purposes
+          echo unmuted
+          pactl set-sink-mute @DEFAULT_SINK@ false
+          ;;
+      esac
+    fi
+  fi
 }
+
# use this if we want to do something just once per minute
first_chars=()
-power=true
-if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
- power=false
-fi
-
write-status
if [[ $1 ]]; then
cat $status_file
exit 0
fi
-if ! $power; then
- exit 0
-fi
+# loop_count: number of completed main-loop iterations. write-status reads
+# it to run one of its checks only on every 10th pass.
+loop_count=0
+# main-loop: poll forever. Each iteration sleeps, then runs write-status
+# and mute. The sleep is 15 seconds normally, or 60 seconds when
+# /sys/class/power_supply/AC/online exists and reads 0 (on battery).
+# Never returns.
+main-loop() {
+ while true; do
+ power=true
+ if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
+ power=false
+ fi
-# about 15 minutes
-for ((i=1; i<=60; i++)); do
- sleep 15
- write-status
-done
+ if $power; then
+ wait=15
+ else
+ wait=60
+ fi
+
+ sleep $wait
+ write-status
+ mute
+ loop_count=$(( loop_count + 1 ))
+ done
+}
+
+# ensure our long operations are one line so we are not prone to errors
+# from this file being modified.
+main-loop