X-Git-Url: https://iankelling.org/git/?p=distro-setup;a=blobdiff_plain;f=system-status;h=430f8061c1c6611eafea56f369211c46d8804f94;hp=2dd39ca7d99a90c88ccc9bddf7e51a49f37535cf;hb=HEAD;hpb=7d9ec600a5ed9f88b85e02a27ee017b85721a6ac diff --git a/system-status b/system-status old mode 100644 new mode 100755 index 2dd39ca..d6269d9 --- a/system-status +++ b/system-status @@ -1,13 +1,35 @@ #!/bin/bash -# Copyright (C) 2019 Ian Kelling -# SPDX-License-Identifier: AGPL-3.0-or-later -# usage: runs 4 times every 15 seconds unless any args are passed, or we -# are on battery power, then just runs once. +# Basic system status on on Ian's computers +# Copyright (C) 2024 Ian Kelling + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# SPDX-License-Identifier: GPL-3.0-or-later + +# usage: runs once every 15 seconds unless any args are passed, or we +# then just runs once and have verbose output. On battery power, run +# once per minute. if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi -source /a/bin/errhandle/err +if [[ $EUID != 1000 ]]; then + echo "$0: error, expected to be user 1000" + exit 1 +fi + +source /a/bin/bash-bear-trap/bash-bear status_file=/dev/shm/iank-status shopt -s nullglob @@ -28,104 +50,297 @@ v() { printf "%s\n" "$*" fi } +p() { printf "%s\n" "$*"; } # log-once COUNT NAME [MESSAGE] lo() { - /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost + if type -p ifne &>/dev/null; then + /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost + fi } +loday() { + if type -p ifne &>/dev/null; then + /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org + fi +} +# rm glob +rmg() { + if (( $# )); then + rm -f "$@" + fi +} + +# todo, consider migrating some of these alerts into prometheus write-status() { chars=("${first_chars[@]}") - # clock us out in timetrap if are idle too long - if [[ -e /p/.timetrap.db ]]; then - export DISPLAY=:0 - if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then - if [[ $xidle == [0-9]* ]]; then - sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;") - idle=300000 - if [[ $sheet == w ]]; then - idle=900000 - fi - if [[ $sheet && $xidle -gt $idle ]]; then - timetrap out - fi + services=( epanicclean ) + case $HOSTNAME in + bk|je|li) : ;; + *) + services+=( + systemstatus + btrfsmaintstop + dynamicipupdate + ) + bads=() + if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then + for s in ${services[@]}; do + if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then + bads+=($s) + fi + done + chars+=(MYSERS) + fi + p ${bads[*]} | lo -240 mysers + ;; + esac + + case $HOSTNAME in + kd) + services=( + prometheus-node-exporter + prometheus-alertmanager + prometheus + ) + bads=() + if systemctl show -p SubState --value ${services[@]} | grep -E -v '^(running|)$' &>/dev/null; then + for s in ${services[@]}; do + if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then + bads+=($s) + fi + done + chars+=(PROM) fi + p ${bads[*]} | lo -240 prom + ;; + esac + + + # this section copied from servicepid() + unit=exim4 + pid=$(systemctl show --property MainPID --value $unit ||:) + case $pid in + [1-9]*) : ;; + *) + dir=/sys/fs/cgroup/system.slice + if [[ ! -d $dir ]]; then + dir=/sys/fs/cgroup/systemd/system.slice + fi; + pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:) + ;; + esac + if [[ ! $pid ]]; then + chars+=(EXIM) + fi + + + if [[ -e /a/bin/bash_unpublished/source-state ]]; then + # /a gets remounted due to btrbk, ignore error code for file doesnt exist + source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]] + fi + + + ## check if last snapshot was recent + old_snap_limit=$(( 3 * 60 * 60 )) + vol=o + btrbk_root=/mnt/o/btrbk + # this section generally copied from btrbk scripts, but + # this part modified to speed things up by about half a second. + # I'm not sure if its quite as reliable, but it looks pretty safe. + # Profiled it using time and also adding to the top of the file: + # set -x + # PS4='+ $(date "+%2N") ' + # allow failure in case there are no snapshots yet. + shopt -s nullglob + files=($btrbk_root/$vol.20*) + shopt -u nullglob + if (( ${#files[@]} )); then + # shellcheck disable=SC2012 # using ls version sort. not sure this is needed. + snaps=("$(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )") + now=$EPOCHSECONDS + maxtime=0 + for s in ${snaps[@]}; do + file=${s##*/} + t=$(date -d "$(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#"$vol."})" +%s) + if (( t > maxtime )); then + maxtime=$t + fi + done + snapshotmsg= + last_snap_age=$(( now - maxtime )) + last_snap_hours=$(( last_snap_age / 60 / 60 )) + if (( last_snap_age > old_snap_limit )); then + chars+=(OLD-SNAP-${last_snap_hours}h) + snapshotmsg="/$vol snapshot older than 4 hours" + if [[ $MAIL_HOST == "$HOSTNAME" ]]; then + p "$snapshotmsg" | lo -1 old-snapshot + fi + # not bothering to get info on all volumes if we find an old one. + fi + fi + + + if [[ $MAIL_HOST == "$HOSTNAME" ]]; then + + bouncemsg= + glob=(/m/md/bounces/new/*) + if [[ -e ${glob[0]} ]]; then + chars+=(BOUNCE) + bouncemsg="message in /m/md/bounces/new" + fi + p $bouncemsg | loday -1 bounce + # emails without the S (seen) flag. this only checks the last flag, + # but its good enough for me. + glob=(/m/md/alerts/{new,cur}/!(*,S)) + if [[ -e ${glob[0]} ]]; then + chars+=(A) + fi + + glob=(/m/md/daylert/{new,cur}/!(*,S)) + if [[ -e ${glob[0]} ]]; then + chars+=(DAY) + fi + + bbkmsg= + if [[ $(systemctl is-active btrbk.timer) != active ]]; then + chars+=(BTRBK.TIMER) + bbkmsg="not enabled" fi + p "$bbkmsg" | lo -480 btrbk.timer + + + + # commented out, only using timetrap retrospectively. + # # clock us out in timetrap if are idle too long + # if [[ -e /p/.timetrap.db ]]; then + # export DISPLAY=:0 + # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then + # if [[ $xidle == [0-9]* ]]; then + # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;") + # idle=300000 + # if [[ $sheet == w ]]; then + # idle=900000 + # fi + # if [[ $sheet && $xidle -gt $idle ]]; then + # timetrap out + # fi + # fi + # fi + # fi + else # end if $MAIL_HOST + rmg /home/iank/cron-errors/bounce* \ + /home/iank/cron-errors/btrbk.timer* \ + /home/iank/cron-errors/old-snapshot* + fi + + if ip l show tunfsf &>/dev/null; then + # this is for tracking dns over tls issue, which + # fixvpndns() in brc2 fixes. + stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: ) + read -r _ _ _ istls <<<"$stat" + case $istls in + no) : ;; + *) + printf "%s\n" "$istls" | ts >> /tmp/istls.log + chars+=("T:$istls") + ;; + esac fi - if pgrep -f 'emacs --daemon' &>/dev/null; then - emacsfiles="$(emacsclient --eval "$(cat /a/bin/ds/unsaved-buffers.el)"| sed '/^"nil"$/d;s/^"(/E: /;s/)"$//')" - if [[ $emacsfiles ]]; then - chars+=("$emacsfiles") + # We do this once every 5 minutes, since this is not a grave problem. + # For formatted elisp, see /b/ds/unsaved-buffers.el + elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))' + if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then + if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then + # i dun care if this fails + emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)" + if [[ $emacsfiles ]]; then + chars+=("$emacsfiles") + fi fi + last_emacs_check=$EPOCHSECONDS fi + glob=(/nocow/btrfs-stale/*) if [[ -e ${glob[0]} ]]; then - chars+=("STALE") + chars+=(STALE) fi + var_mail_msg= if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then var_mail_msg="message in /var/mail" fi - lo -1 var_mail $var_mail_msg - glob=(/m/md/bounces/new/*) - if [[ -e ${glob[0]} ]]; then - chars+=("BOUNCE") - bouncemsg="message in /m/md/bounces/new" - fi - lo -1 bounce $bouncemsg - # emails without the S (seen) flag. this only checks the last flag, - # but its good enough for me. - glob=(/m/md/alerts/{new,cur}/!(*,S)) - if [[ -e ${glob[0]} ]]; then - chars+=("A") - fi - tmp=(/var/local/cron-errors/mailtest-check*) - if (( ${#tmp[@]} )); then - chars+=("MAILPING") - fi - tmp=(/var/local/cron-errors/mailtest-slow*) - if (( ${#tmp[@]} )); then - chars+=("SPAMD") - fi + p $var_mail_msg | loday -1 var_mail - # early in install process, we dont have permission yet for exiqgrep - qlen=$(/usr/sbin/exiqgrep -o 600 -c -b | awk '{print $1}') ||: + # Note, early in install process, we dont have permission yet for exiqgrep. + # + # todo: don't do this every 15 seconds, more like once every 2 minutes to + # save cpu cycles. + # + # 2400 = 40 mins. This should allow for system restarts, and + # 30 minute message delay plus 10 minute queu runs. + qlen=$(/usr/sbin/exiqgrep -o 2400 -c -b | awk '{print $1}') ||: + qmsg= if ((qlen)); then - qmsg="queue length $qlen" - chars+=("q $qlen") + # Do sending of long delayed messages, and dont count them in our queue warnings. + for mid in $(exiqgrep -o 2400 -zi); do + if exim -Mvh $mid | awk 'tolower($2) == "fdate:"' | grep -q .; then + qlen=$(( qlen - 1 )) + # shellcheck disable=SC2016 # exim var, not a bash bar + if (( $(date -d "$(exim -Mset $mid -be <<<'$h_date:' | sed -n 's/^> *//;/./p')" +%s) < EPOCHSECONDS )); then + if ip a show veth0-mail &>/dev/null; then + pid=$(pgrep -f "/usr/sbin/exim4 -bd -q30m -C /etc/exim4/my.conf"|head -n1); + nsenter -t $pid -n -m /usr/sbin/exim4 -C /etc/exim4/my.conf -M $mid + else + /usr/sbin/exim4 -M $mid + fi + fi + fi + done + + if ((qlen)); then + qmsg="queue length $qlen" + chars+=("q $qlen") + fi fi case $HOSTNAME in # No point in emailing about the mailq on a host where we don't # check email. - $MAIL_HOST|bk) - lo -120 qlen $qmsg + $MAIL_HOST) + p $qmsg | loday -120 qlen + ;; + *) + rmg /home/iank/cron-errors/qlen* ;; esac begin=false - if ! make -C /b/ds -q ~/.local/distro-begin || [[ $(<~/.local/distro-begin) != 0 ]]; then + + # todo: make this robust to the case of /a not being mounted + if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then begin=true fi end=false - if ! make -C /b/ds -q ~/.local/distro-end || [[ $(<~/.local/distro-end) != 0 ]]; then + if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then end=true fi # these conditions are so we dont have an overly verbose prompt if $begin && $end; then - chars+=("D") + chars+=(D) elif $begin; then - chars+=("DB") + chars+=(DB) elif $end; then - chars+=("DE") + chars+=(DE) else + source /a/bin/ds/script-files f=~/.local/conflink # shellcheck disable=SC2043 for _ in 1; do if [[ -e $f ]]; then - now=$(date +%s) + now=$EPOCHSECONDS fsec=$(stat -c%Y $f) # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we # dont have any false positives. @@ -143,14 +358,17 @@ write-status() { if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi done + script_files=("${my_service_scripts[@]}" "${my_bin_files[@]}" $my_lib_files) + # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago - if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then + if (( fmin < 0 )) && [[ $(find "${script_files[@]}" ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then v conflink newer filesystem files - chars+=("CONFLINK") + chars+=(CONFLINK) break fi for d in /a/bin/distro-setup /p/c; do + [[ -d $d ]] || continue cd $d if [[ ! -e .git ]]; then # some hosts i dont push all of /p/c @@ -158,7 +376,7 @@ write-status() { fi if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then v conflink: newer files checked in to git - chars+=("CONFLINK") + chars+=(CONFLINK) break fi @@ -168,7 +386,7 @@ write-status() { done < <(git ls-files -o --exclude-standard) if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then v conflink: untracked in $d - chars+=("CONFLINK") + chars+=(CONFLINK) break fi done @@ -177,86 +395,124 @@ write-status() { fi if [[ ! -e $f || $(<$f) != 0 ]]; then v conflink: last run not found or failed - chars+=("CONFLINK") + chars+=(CONFLINK) break fi done fi + # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then if [[ -s /var/log/exim4/paniclog ]]; then chars+=("PANIC!") # leave it up to epanic-clean to send email notification fi - source /a/bin/bash_unpublished/source-state - if [[ $MAIL_HOST == "$HOSTNAME" ]]; then - bbkmsg= - if [[ $(systemctl is-active btrbk.timer) != active ]]; then - chars+=("BTRBK.TIMER") - bbkmsg="btrbk.timer not enabled" + mprom=/var/lib/prometheus/node-exporter/mailtest-check.prom + if [[ -s $mprom ]]; then + if grep -qE 'mailtest_check_(unexpected|missing).*[^ ][^0]$' $mprom; then + chars+=("MTEST_SPAM") fi - lo -960 btrbk.timer $bbkmsg - - ## check if last snapshot was within an hour - vol=o - # this section generally copied from btrbk scripts, but - # this part modified to speed things up by about half a second. - # I'm not sure if its quite as reliable, but it looks pretty safe. - # Profiled it using time and also adding to the top of the file: - # set -x - # PS4='+ $(date "+%2N") ' - # allow failure in case there are no snapshots yet. - # shellcheck disable=SC2012 - shopt -u nullglob - files=(/mnt/root/btrbk/$vol.20*) - shopt -s nullglob - snaps=() - if (( ${#files[@]} )); then - snaps=($(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : )) - fi - now=$(date +%s) - maxtime=0 - for s in ${snaps[@]}; do - file=${s##*/} - t=$(date -d $(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#$vol.}) +%s) - if (( t > maxtime )); then - maxtime=$t + mtest_found=false + # shellcheck disable=SC2013 # these are words + for t in $(grep -E ^mailtest_check_last_usec $mprom | awk '{print $NF}'); do + if (( t + 60 * 20 < EPOCHSECONDS )); then + mtest_found=true fi done - if (( maxtime < now - 4*60*60 )); then - chars+=("OLD-SNAP") - snapshotmsg="/o snapshot older than 4 hours" + if $mtest_found; then + chars+=("MTEST_AGE") fi - lo -1 old-snapshot $snapshotmsg fi - cat /a/bin/bash_unpublished/source-state >$status_file + if [[ ! -e $status_file || -w $status_file ]]; then + if [[ -e /a/bin/bash_unpublished/source-state ]]; then + cat /a/bin/bash_unpublished/source-state >$status_file + fi - if [[ ${chars[*]} ]]; then - echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file + if [[ ${chars[*]} ]]; then + echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file + fi fi +} +# This prevents me having to mute notifications when I'm going to bed. +mute() { + local locked + export DISPLAY=:0 + locked=false + if lock_info=$(xscreensaver-command -time); then + if [[ $lock_info != *non-blanked* ]]; then + locked=true + fi + else + locked=true + fi + midnight=$(date -d 00:00 +%s) + mdiff=$(( EPOCHSECONDS - midnight )) + if $locked && (( mdiff < 6 *60*60 || mdiff > 21 *60*60 )); then + case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in + no) + # for log purposes + echo muted + pactl set-sink-mute @DEFAULT_SINK@ true + ;; + esac + fi + if ! $locked && (( mdiff > 6 *60*60 || mdiff < 12 *60*60 )) && [[ ! -e /tmp/ianknap ]]; then + case $(pactl get-sink-mute @DEFAULT_SINK@ | awk '{print $2}') in + yes) + # for log purposes + echo unmuted + pactl set-sink-mute @DEFAULT_SINK@ false + ;; + esac + fi } + # use this if we want to do something just once per minute first_chars=() -power=true -if [[ -e /sys/class/power_supply/AC/online && $(