#!/bin/bash # Basic system status on on Ian's computers # Copyright (C) 2024 Ian Kelling # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see . # SPDX-License-Identifier: GPL-3.0-or-later # usage: runs once every 15 seconds unless any args are passed, or we # then just runs once and have verbose output. On battery power, run # once per minute. if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi if [[ $EUID != 1000 ]]; then echo "$0: error, expected to be user 1000" exit 1 fi source /a/bin/bash-bear-trap/bash-bear status_file=/dev/shm/iank-status shopt -s nullglob shopt -s dotglob shopt -s extglob for p in ~/.gem/ruby/*/bin; do PATH="$PATH:$p" done verbose=false if [[ $1 ]]; then verbose=true fi v() { if $verbose; then printf "%s\n" "$*" fi } p() { printf "%s\n" "$*"; } # log-once COUNT NAME [MESSAGE] lo() { if type -p ifne &>/dev/null; then /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost fi } loday() { if type -p ifne &>/dev/null; then /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org fi } # rm glob rmg() { if (( $# )); then rm -f "$@" fi } # todo, consider migrating some of these alerts into prometheus write-status() { chars=("${first_chars[@]}") services=( epanicclean ) case $HOSTNAME in bk|je|li) : ;; *) services+=( systemstatus btrfsmaintstop dynamicipupdate ) bads=() if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then for s in ${services[@]}; do if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then bads+=($s) fi done chars+=(MYSERS) fi p ${bads[*]} | lo -240 mysers ;; esac case $HOSTNAME in kd) services=( prometheus-node-exporter prometheus-alertmanager prometheus ) bads=() if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then for s in ${services[@]}; do if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then bads+=($s) fi done chars+=(PROM) fi p ${bads[*]} | lo -240 prom ;; esac # this section copied from servicepid() unit=exim4 pid=$(systemctl show --property MainPID --value $unit ||:) case $pid in [1-9]*) : ;; *) dir=/sys/fs/cgroup/system.slice if [[ ! -d $dir ]]; then dir=/sys/fs/cgroup/systemd/system.slice fi; pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:) ;; esac if [[ ! $pid ]]; then chars+=(EXIM) fi if [[ -e /a/bin/bash_unpublished/source-state ]]; then # /a gets remounted due to btrbk, ignore error code for file doesnt exist source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]] fi ## check if last snapshot was recent old_snap_limit=$(( 3 * 60 * 60 )) vol=o btrbk_root=/mnt/o/btrbk # this section generally copied from btrbk scripts, but # this part modified to speed things up by about half a second. # I'm not sure if its quite as reliable, but it looks pretty safe. # Profiled it using time and also adding to the top of the file: # set -x # PS4='+ $(date "+%2N") ' # allow failure in case there are no snapshots yet. shopt -s nullglob files=($btrbk_root/$vol.20*) shopt -u nullglob if (( ${#files[@]} )); then # shellcheck disable=SC2012 # using ls version sort. not sure this is needed. snaps=("$(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )") now=$EPOCHSECONDS maxtime=0 for s in ${snaps[@]}; do file=${s##*/} t=$(date -d "$(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#"$vol."})" +%s) if (( t > maxtime )); then maxtime=$t fi done snapshotmsg= last_snap_age=$(( now - maxtime )) last_snap_hours=$(( last_snap_age / 60 / 60 )) if (( last_snap_age > old_snap_limit )); then chars+=(OLD-SNAP-${last_snap_hours}h) snapshotmsg="/$vol snapshot older than 4 hours" if [[ $MAIL_HOST == "$HOSTNAME" ]]; then p "$snapshotmsg" | lo -1 old-snapshot fi # not bothering to get info on all volumes if we find an old one. fi fi if [[ $MAIL_HOST == "$HOSTNAME" ]]; then bouncemsg= glob=(/m/md/bounces/new/*) if [[ -e ${glob[0]} ]]; then chars+=(BOUNCE) bouncemsg="message in /m/md/bounces/new" fi p $bouncemsg | loday -1 bounce # emails without the S (seen) flag. this only checks the last flag, # but its good enough for me. glob=(/m/md/alerts/{new,cur}/!(*,S)) if [[ -e ${glob[0]} ]]; then chars+=(A) fi glob=(/m/md/daylert/{new,cur}/!(*,S)) if [[ -e ${glob[0]} ]]; then chars+=(DAY) fi bbkmsg= if [[ $(systemctl is-active btrbk.timer) != active ]]; then chars+=(BTRBK.TIMER) bbkmsg="not enabled" fi p "$bbkmsg" | lo -480 btrbk.timer # commented out, only using timetrap retrospectively. # # clock us out in timetrap if are idle too long # if [[ -e /p/.timetrap.db ]]; then # export DISPLAY=:0 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then # if [[ $xidle == [0-9]* ]]; then # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;") # idle=300000 # if [[ $sheet == w ]]; then # idle=900000 # fi # if [[ $sheet && $xidle -gt $idle ]]; then # timetrap out # fi # fi # fi # fi else # end if $MAIL_HOST rmg /home/iank/cron-errors/bounce* \ /home/iank/cron-errors/btrbk.timer* \ /home/iank/cron-errors/old-snapshot* fi if ip l show tunfsf &>/dev/null; then # this is for tracking dns over tls issue, which # fixvpndns() in brc2 fixes. stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: ) read -r _ _ _ istls <<<"$stat" case $istls in no) : ;; *) printf "%s\n" "$istls" | ts >> /tmp/istls.log chars+=("T:$istls") ;; esac fi # We do this once every 5 minutes, since this is not a grave problem. # For formatted elisp, see /b/ds/unsaved-buffers.el elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))' if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then # i dun care if this fails emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)" if [[ $emacsfiles ]]; then chars+=("$emacsfiles") fi fi last_emacs_check=$EPOCHSECONDS fi glob=(/nocow/btrfs-stale/*) if [[ -e ${glob[0]} ]]; then chars+=(STALE) fi var_mail_msg= if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then var_mail_msg="message in /var/mail" fi p $var_mail_msg | loday -1 var_mail # Note, early in install process, we dont have permission yet for exiqgrep. # # todo: don't do this every 15 seconds, more like once every 2 minutes to # save cpu cycles. # # 2400 = 40 mins. This should allow for system restarts, and # 30 minute message delay plus 10 minute queu runs. qlen=$(/usr/sbin/exiqgrep -o 2400 -c -b | awk '{print $1}') ||: qmsg= if ((qlen)); then # Do sending of long delayed messages, and dont count them in our queue warnings. for mid in $(exiqgrep -o 2400 -zi); do if exim -Mvh $mid | awk 'tolower($2) == "fdate:"' | grep -q .; then qlen=$(( qlen - 1 )) # shellcheck disable=SC2016 # exim var, not a bash bar if (( $(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s) < EPOCHSECONDS )); then if ip a show veth0-mail &>/dev/null; then pid=$(pgrep -f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/my.conf"|head -n1); nsenter -t $pid -n -m /usr/sbin/exim4 -C /etc/exim4/my.conf -M $mid else /usr/sbin/exim4 -M $mid fi fi fi done if ((qlen)); then qmsg="queue length $qlen" chars+=("q $qlen") fi fi case $HOSTNAME in # No point in emailing about the mailq on a host where we don't # check email. $MAIL_HOST) p $qmsg | loday -120 qlen ;; *) rmg /home/iank/cron-errors/qlen* ;; esac begin=false # todo: make this robust to the case of /a not being mounted if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then begin=true fi end=false if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then end=true fi # these conditions are so we dont have an overly verbose prompt if $begin && $end; then chars+=(D) elif $begin; then chars+=(DB) elif $end; then chars+=(DE) else source /a/bin/ds/script-files f=~/.local/conflink # shellcheck disable=SC2043 for _ in 1; do if [[ -e $f ]]; then now=$EPOCHSECONDS fsec=$(stat -c%Y $f) # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we # dont have any false positives. fmin=$(( (fsec - now + 1 ) / 60 )) fminplus=$(( fmin + 60*24 )) # Filesystem files get copied, so find any newer than the last run. # The rest are hueristics: # Given the last time we added a file in git, is that newer than the last conflink run. # Given new files not added to git, were they modified more recently than the last conflink? but, # push their modification time back by a day so we can develop them before needing to add them to git. all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem}) # This part is copied from conflink for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi done # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago if (( fmin < 0 )) && [[ $(find ${all_my_scripts[@]} ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then v conflink newer filesystem files chars+=(CONFLINK) break fi for d in /a/bin/distro-setup /p/c; do [[ -d $d ]] || continue cd $d if [[ ! -e .git ]]; then # some hosts i dont push all of /p/c continue fi if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then v conflink: newer files checked in to git chars+=(CONFLINK) break fi untracked=() while read -r l; do untracked+=("$l") done < <(git ls-files -o --exclude-standard) if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then v conflink: untracked in $d chars+=(CONFLINK) break fi done cd / fi if [[ ! -e $f || $(<$f) != 0 ]]; then v conflink: last run not found or failed chars+=(CONFLINK) break fi done fi # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then if [[ -s /var/log/exim4/paniclog ]]; then chars+=("PANIC!") # leave it up to epanic-clean to send email notification fi mprom=/var/lib/prometheus/node-exporter/mailtest-check.prom if [[ -s $mprom ]]; then if grep -qE 'mailtest_check_(unexpected|missing).*[^ ][^0]$' $mprom; then chars+=("MTEST_SPAM") fi mtest_found=false # shellcheck disable=SC2013 # these are words for t in $(grep -E ^mailtest_check_last_usec $mprom | awk '{print $NF}'); do if (( t + 60 * 20 < EPOCHSECONDS )); then mtest_found=true fi done if $mtest_found; then chars+=("MTEST_AGE") fi fi if [[ ! -e $status_file || -w $status_file ]]; then if [[ -e /a/bin/bash_unpublished/source-state ]]; then cat /a/bin/bash_unpublished/source-state >$status_file fi if [[ ${chars[*]} ]]; then echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file fi fi } # This prevents me having to mute notifications when I'm going to bed. mute() { local locked export DISPLAY=:0 locked=false if lock_info=$(xscreensaver-command -time); then if [[ $lock_info != *non-blanked* ]]; then locked=true fi else locked=true fi midnight=$(date -d 00:00 +%s) mdiff=$(( EPOCHSECONDS - midnight )) if $locked && (( mdiff < 6 *60*60 || mdiff > 21 *60*60 )); then case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in no) # for log purposes echo muted pactl set-sink-mute @DEFAULT_SINK@ true ;; esac fi if ! $locked && (( mdiff > 6 *60*60 || mdiff < 12 *60*60 )) && [[ ! -e /tmp/ianknap ]]; then case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in yes) # for log purposes echo unmuted pactl set-sink-mute @DEFAULT_SINK@ false ;; esac fi } # use this if we want to do something just once per minute first_chars=() write-status if [[ $1 ]]; then cat $status_file exit 0 fi loop_count=0 main-loop() { while true; do power=true if [[ -e /sys/class/power_supply/AC/online && $(