make btrbk failures higher priority
[distro-setup] / system-status
1 #!/bin/bash
2 # Copyright (C) 2019 Ian Kelling
3 # SPDX-License-Identifier: AGPL-3.0-or-later
4
# usage: runs once every 15 seconds unless any args are passed, in which
# case it just runs once. On battery power, run once per minute.
7
# Refuse to run under anything but bash: the rest of the file relies on
# arrays, [[ ]], (( )) and shopt.
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

# Only uid 1000 is supported. Like the check above, the diagnostic goes
# to stderr so it is not mistaken for normal output.
if [[ $EUID != 1000 ]]; then
  echo "$0: error, expected to be user 1000" >&2
  exit 1
fi

source /a/bin/errhandle/err
# File that write-status regenerates on every run.
status_file=/dev/shm/iank-status

# nullglob: unmatched globs expand to nothing.
# dotglob: globs also match dotfiles.
# extglob: enables the !(...) patterns used in write-status; it must be on
# before that function is parsed.
shopt -s nullglob
shopt -s dotglob
shopt -s extglob

# Make executables installed by per-user ruby gems available.
for p in ~/.gem/ruby/*/bin; do
  PATH="$PATH:$p"
done


# Any non-empty first argument turns on verbose output (see v).
verbose=false
if [[ $1 ]]; then
  verbose=true
fi
# v [WORD...]
# Verbose print: when $verbose runs successfully (it is set to true or
# false above), print the arguments as one line joined by the first
# character of IFS. Otherwise print nothing and return 0.
v() {
  $verbose || return 0
  printf '%s\n' "$*"
}
# p [WORD...]
# Print the arguments as a single line, joined by the first character of
# IFS. With no arguments this prints an empty line.
p() {
  printf '%s\n' "$*"
}
# lo COUNT NAME [MESSAGE]
# Hand all arguments (and stdin) to /usr/local/bin/log-once and mail
# whatever it prints to root@localhost, with NAME in the subject. ifne
# only starts mail when there is output to send. If ifne is not
# installed, nothing is run and 0 is returned.
lo() {
  type -p ifne &>/dev/null || return 0
  local subject="$HOSTNAME: system-status $2"
  /usr/local/bin/log-once "$@" | ifne mail -s "$subject" root@localhost
}
43
# loday COUNT NAME [MESSAGE]
# Same pipeline as lo, but any output from /usr/local/bin/log-once is
# mailed to daylert@iankelling.org. If ifne is not installed, nothing is
# run and 0 is returned.
loday() {
  type -p ifne &>/dev/null || return 0
  local subject="$HOSTNAME: system-status $2"
  /usr/local/bin/log-once "$@" | ifne mail -s "$subject" daylert@iankelling.org
}
# rm glob
# rmg [FILE...]
# Force-remove the given files. Meant to be called with glob results:
# with nullglob on, an unmatched glob yields zero arguments, in which
# case rm is not run at all. The -- keeps a name that starts with a dash
# from being parsed as an rm option.
rmg() {
  if (( $# )); then
    rm -f -- "$@"
  fi
}
55
56 # todo, consider migrating some of these alerts into prometheus
57 write-status() {
58 chars=("${first_chars[@]}")
59
60 services=( epanicclean )
61 case $HOSTNAME in
62 bk|je|li) : ;;
63 *)
64 services+=(
65 systemstatus
66 btrfsmaintstop
67 dynamicipupdate
68 )
69 bads=()
70 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
71 for s in ${services[@]}; do
72 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
73 bads+=($s)
74 fi
75 done
76 chars+=(MYSERS)
77 fi
78 p ${bads[*]} | lo -240 mysers
79 ;;
80 esac
81
82 case $HOSTNAME in
83 kd)
84 services=(
85 prometheus-node-exporter
86 prometheus-alertmanager
87 prometheus
88 )
89 bads=()
90 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
91 for s in ${services[@]}; do
92 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
93 bads+=($s)
94 fi
95 done
96 chars+=(PROM)
97 fi
98 p ${bads[*]} | lo -240 prom
99 ;;
100 esac
101
102
103 # this section copied from servicepid()
104 unit=exim4
105 pid=$(systemctl show --property MainPID --value $unit ||:)
106 case $pid in
107 [1-9]*) : ;;
108 *)
109 dir=/sys/fs/cgroup/system.slice
110 if [[ ! -d $dir ]]; then
111 dir=/sys/fs/cgroup/systemd/system.slice
112 fi;
113 pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:)
114 ;;
115 esac
116 if [[ ! $pid ]]; then
117 chars+=(EXIM)
118 fi
119
120
121 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
122 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
123 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
124 fi
125 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
126
127 bouncemsg=
128 glob=(/m/md/bounces/new/*)
129 if [[ -e ${glob[0]} ]]; then
130 chars+=(BOUNCE)
131 bouncemsg="message in /m/md/bounces/new"
132 fi
133 p $bouncemsg | loday -1 bounce
134 # emails without the S (seen) flag. this only checks the last flag,
135 # but its good enough for me.
136 glob=(/m/md/alerts/{new,cur}/!(*,S))
137 if [[ -e ${glob[0]} ]]; then
138 chars+=(A)
139 fi
140
141 glob=(/m/md/daylert/{new,cur}/!(*,S))
142 if [[ -e ${glob[0]} ]]; then
143 chars+=(DAY)
144 fi
145
146 bbkmsg=
147 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
148 chars+=(BTRBK.TIMER)
149 bbkmsg="not enabled"
150 fi
151 p "$bbkmsg" | lo -480 btrbk.timer
152
153 ## check if last snapshot was within an hour
154 vol=o
155 # this section generally copied from btrbk scripts, but
156 # this part modified to speed things up by about half a second.
157 # I'm not sure if its quite as reliable, but it looks pretty safe.
158 # Profiled it using time and also adding to the top of the file:
159 # set -x
160 # PS4='+ $(date "+%2N") '
161 # allow failure in case there are no snapshots yet.
162 # shellcheck disable=SC2012
163 shopt -u nullglob
164 files=(/mnt/o/btrbk/$vol.20*)
165 shopt -s nullglob
166 snaps=()
167 if (( ${#files[@]} )); then
168 snaps=($(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : ))
169 fi
170 now=$EPOCHSECONDS
171 maxtime=0
172 for s in ${snaps[@]}; do
173 file=${s##*/}
174 t=$(date -d $(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#$vol.}) +%s)
175 if (( t > maxtime )); then
176 maxtime=$t
177 fi
178 done
179 snapshotmsg=
180 if (( maxtime < now - 4*60*60 )); then
181 chars+=(OLD-SNAP)
182 snapshotmsg="/o snapshot older than 4 hours"
183 fi
184 p "$snapshotmsg" | lo -1 old-snapshot
185
186
187 # commented out, only using timetrap retrospectively.
188 # # clock us out in timetrap if are idle too long
189 # if [[ -e /p/.timetrap.db ]]; then
190 # export DISPLAY=:0
191 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
192 # if [[ $xidle == [0-9]* ]]; then
193 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
194 # idle=300000
195 # if [[ $sheet == w ]]; then
196 # idle=900000
197 # fi
198 # if [[ $sheet && $xidle -gt $idle ]]; then
199 # timetrap out
200 # fi
201 # fi
202 # fi
203 # fi
204 else # end if $MAIL_HOST
205 rmg /home/iank/cron-errors/bounce* \
206 /home/iank/cron-errors/btrbk.timer* \
207 /home/iank/cron-errors/old-snapshot*
208 fi
209
210 if ip l show tunfsf &>/dev/null; then
211 # this is for tracking dns over tls issue, which
212 # fixvpndns() in brc2 fixes.
213 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
214 read _ _ _ istls <<<"$stat"
215 case $istls in
216 no) : ;;
217 *)
218 printf "%s\n" "$istls" | ts >> /tmp/istls.log
219 chars+=("T:$istls")
220 ;;
221 esac
222 fi
223
224 # We do this once every 5 minutes, since this is not a grave problem.
225 # For formatted elisp, see /b/ds/unsaved-buffers.el
226 elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
227 if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then
228 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
229 # i dun care if this fails
230 emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)"
231 if [[ $emacsfiles ]]; then
232 chars+=("$emacsfiles")
233 fi
234 fi
235 last_emacs_check=$EPOCHSECONDS
236 fi
237
238
239 glob=(/nocow/btrfs-stale/*)
240 if [[ -e ${glob[0]} ]]; then
241 chars+=(STALE)
242 fi
243 var_mail_msg=
244 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
245 var_mail_msg="message in /var/mail"
246 fi
247 p $var_mail_msg | loday -1 var_mail
248
249 # early in install process, we dont have permission yet for exiqgrep.
250 # 1100 helps allow for system restarts
251 qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||:
252 qmsg=
253 if ((qlen)); then
254 qmsg="queue length $qlen"
255 chars+=("q $qlen")
256 fi
257 case $HOSTNAME in
258 # No point in emailing about the mailq on a host where we don't
259 # check email.
260 $MAIL_HOST)
261 p $qmsg | loday -120 qlen
262 ;;
263 *)
264 rmg /home/iank/cron-errors/qlen*
265 ;;
266 esac
267
268 begin=false
269
270 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
271 begin=true
272 fi
273
274 end=false
275 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
276 end=true
277 fi
278
279 # these conditions are so we dont have an overly verbose prompt
280 if $begin && $end; then
281 chars+=(D)
282 elif $begin; then
283 chars+=(DB)
284 elif $end; then
285 chars+=(DE)
286 else
287 f=~/.local/conflink
288 # shellcheck disable=SC2043
289 for _ in 1; do
290 if [[ -e $f ]]; then
291 now=$EPOCHSECONDS
292 fsec=$(stat -c%Y $f)
293 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
294 # dont have any false positives.
295 fmin=$(( (fsec - now + 1 ) / 60 ))
296 fminplus=$(( fmin + 60*24 ))
297 # Filesystem files get copied, so find any newer than the last run.
298 # The rest are hueristics:
299 # Given the last time we added a file in git, is that newer than the last conflink run.
300 # Given new files not added to git, were they modified more recently than the last conflink? but,
301 # push their modification time back by a day so we can develop them before needing to add them to git.
302
303 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
304 # This part is copied from conflink
305 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
306 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
307 done
308
309 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
310 if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
311 v conflink newer filesystem files
312 chars+=(CONFLINK)
313 break
314 fi
315
316 for d in /a/bin/distro-setup /p/c; do
317 [[ -d $d ]] || continue
318 cd $d
319 if [[ ! -e .git ]]; then
320 # some hosts i dont push all of /p/c
321 continue
322 fi
323 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
324 v conflink: newer files checked in to git
325 chars+=(CONFLINK)
326 break
327 fi
328
329 untracked=()
330 while read -r l; do
331 untracked+=("$l")
332 done < <(git ls-files -o --exclude-standard)
333 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
334 v conflink: untracked in $d
335 chars+=(CONFLINK)
336 break
337 fi
338 done
339 cd /
340
341 fi
342 if [[ ! -e $f || $(<$f) != 0 ]]; then
343 v conflink: last run not found or failed
344 chars+=(CONFLINK)
345 break
346 fi
347 done
348 fi
349
350 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
351 if [[ -s /var/log/exim4/paniclog ]]; then
352 chars+=("PANIC!")
353 # leave it up to epanic-clean to send email notification
354 fi
355
356 if [[ ! -e $status_file || -w $status_file ]]; then
357 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
358 cat /a/bin/bash_unpublished/source-state >$status_file
359 fi
360
361 if [[ ${chars[*]} ]]; then
362 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
363 fi
364 fi
365 }
# Tokens that every write-status run starts with.
# use this if we want to do something just once per minute
first_chars=()


# Always produce the status once. When any argument was given, show the
# resulting file and stop instead of entering the loop.
write-status
if [[ -n $1 ]]; then
  cat "$status_file"
  exit 0
fi
375
# main-loop
# Never returns: sleep, then call write-status, forever. The pause is 15
# seconds, or 60 when /sys/class/power_supply/AC/online exists and reads
# 0 (on battery). On battery, bitcoinoff is also run if the bitcoind unit
# is active.
main-loop() {
  while true; do
    power=true
    wait=15
    if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
      power=false
      if systemctl -q is-active bitcoind; then
        bitcoinoff
      fi
      wait=60
    fi

    sleep "$wait"
    write-status
  done
}
394
# ensure our long operations are one line so we are not prone to errors
# from this file being modified.
397 main-loop