add terminal warning for uninstalled changes
[distro-setup] / system-status
#!/bin/bash
# Copyright (C) 2019 Ian Kelling
# SPDX-License-Identifier: AGPL-3.0-or-later

# Usage: with no arguments, loop forever and refresh the status file
# every 15 seconds (once per minute on battery power). If any argument
# is passed, run the checks once with verbose output, print the status
# file and exit.

if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

if [[ $EUID != 1000 ]]; then
  # Diagnostics belong on stderr, same as the bash check above.
  echo "$0: error, expected to be user 1000" >&2
  exit 1
fi

source /a/bin/errhandle/err
# File that every write-status run regenerates.
status_file=/dev/shm/iank-status

# nullglob: unmatched globs expand to nothing.
# dotglob: globs also match dotfiles.
# extglob: needed for the !(...) patterns used in write-status.
shopt -s nullglob
shopt -s dotglob
shopt -s extglob

# Append every per-version ruby gem bin directory to PATH.
for p in ~/.gem/ruby/*/bin; do
  PATH="$PATH:$p"
done


# Any non-empty first argument turns on verbose output (see v()).
verbose=false
if [[ $1 ]]; then
  verbose=true
fi
# v [WORD...]
# Verbose print: writes the arguments as one line to stdout, but only
# when $verbose is "true". Does nothing (and succeeds) otherwise.
v() {
  $verbose || return 0
  printf "%s\n" "$*"
}
# p [WORD...]
# Print all arguments as a single line on stdout, joined by the first
# character of IFS (a space by default). With no arguments, prints an
# empty line.
p() {
  local joined="$*"
  printf "%s\n" "$joined"
}
# lo COUNT NAME [MESSAGE]
# Hand the arguments (and our stdin) to /usr/local/bin/log-once and pipe
# whatever it prints into a mail to root@localhost, with NAME in the
# subject. The mail command is run through ifne; if ifne is not
# installed this function silently does nothing.
lo() {
  type -p ifne &>/dev/null || return 0
  /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
}
44
# loday COUNT NAME [MESSAGE]
# Same as lo(), except that the mail goes to daylert@iankelling.org
# instead of root@localhost. Silently does nothing when ifne is not
# installed.
loday() {
  type -p ifne &>/dev/null || return 0
  /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
}
# rmg [FILE...]
# "rm glob": force-remove the given paths. Meant to be called with a
# glob that may expand to nothing (nullglob is on in this script), in
# which case rm is not run at all instead of failing on a missing
# operand.
rmg() {
  if (( $# )); then
    # "--" ends option parsing, so a path that starts with "-" is
    # removed rather than being interpreted as an rm option.
    rm -f -- "$@"
  fi
}
56
57 # todo, consider migrating some of these alerts into prometheus
58 write-status() {
59 chars=("${first_chars[@]}")
60
61 services=( epanicclean )
62 case $HOSTNAME in
63 bk|je|li) : ;;
64 *)
65 services+=(
66 systemstatus
67 btrfsmaintstop
68 dynamicipupdate
69 )
70 bads=()
71 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
72 for s in ${services[@]}; do
73 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
74 bads+=($s)
75 fi
76 done
77 chars+=(MYSERS)
78 fi
79 p ${bads[*]} | lo -240 mysers
80 ;;
81 esac
82
83 case $HOSTNAME in
84 kd)
85 services=(
86 prometheus-node-exporter
87 prometheus-alertmanager
88 prometheus
89 )
90 bads=()
91 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
92 for s in ${services[@]}; do
93 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
94 bads+=($s)
95 fi
96 done
97 chars+=(PROM)
98 fi
99 p ${bads[*]} | lo -240 prom
100 ;;
101 esac
102
103
104 # this section copied from servicepid()
105 unit=exim4
106 pid=$(systemctl show --property MainPID --value $unit ||:)
107 case $pid in
108 [1-9]*) : ;;
109 *)
110 dir=/sys/fs/cgroup/system.slice
111 if [[ ! -d $dir ]]; then
112 dir=/sys/fs/cgroup/systemd/system.slice
113 fi;
114 pid=$(head -n1 $dir/${unit%.service}.service/cgroup.procs ||:)
115 ;;
116 esac
117 if [[ ! $pid ]]; then
118 chars+=(EXIM)
119 fi
120
121
122 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
123 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
124 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
125 fi
126 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
127
128 bouncemsg=
129 glob=(/m/md/bounces/new/*)
130 if [[ -e ${glob[0]} ]]; then
131 chars+=(BOUNCE)
132 bouncemsg="message in /m/md/bounces/new"
133 fi
134 p $bouncemsg | loday -1 bounce
135 # emails without the S (seen) flag. this only checks the last flag,
136 # but its good enough for me.
137 glob=(/m/md/alerts/{new,cur}/!(*,S))
138 if [[ -e ${glob[0]} ]]; then
139 chars+=(A)
140 fi
141
142 glob=(/m/md/daylert/{new,cur}/!(*,S))
143 if [[ -e ${glob[0]} ]]; then
144 chars+=(DAY)
145 fi
146
147 bbkmsg=
148 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
149 chars+=(BTRBK.TIMER)
150 bbkmsg="not enabled"
151 fi
152 p "$bbkmsg" | lo -480 btrbk.timer
153
154 ## check if last snapshot was within an hour
155 vol=o
156 # this section generally copied from btrbk scripts, but
157 # this part modified to speed things up by about half a second.
158 # I'm not sure if its quite as reliable, but it looks pretty safe.
159 # Profiled it using time and also adding to the top of the file:
160 # set -x
161 # PS4='+ $(date "+%2N") '
162 # allow failure in case there are no snapshots yet.
163 # shellcheck disable=SC2012
164 shopt -u nullglob
165 files=(/mnt/o/btrbk/$vol.20*)
166 shopt -s nullglob
167 snaps=()
168 if (( ${#files[@]} )); then
169 snaps=($(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : ))
170 fi
171 now=$EPOCHSECONDS
172 maxtime=0
173 for s in ${snaps[@]}; do
174 file=${s##*/}
175 t=$(date -d $(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#$vol.}) +%s)
176 if (( t > maxtime )); then
177 maxtime=$t
178 fi
179 done
180 snapshotmsg=
181 if (( maxtime < now - 4*60*60 )); then
182 chars+=(OLD-SNAP)
183 snapshotmsg="/o snapshot older than 4 hours"
184 fi
185 p "$snapshotmsg" | lo -1 old-snapshot
186
187
188 # commented out, only using timetrap retrospectively.
189 # # clock us out in timetrap if are idle too long
190 # if [[ -e /p/.timetrap.db ]]; then
191 # export DISPLAY=:0
192 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
193 # if [[ $xidle == [0-9]* ]]; then
194 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
195 # idle=300000
196 # if [[ $sheet == w ]]; then
197 # idle=900000
198 # fi
199 # if [[ $sheet && $xidle -gt $idle ]]; then
200 # timetrap out
201 # fi
202 # fi
203 # fi
204 # fi
205 else # end if $MAIL_HOST
206 rmg /home/iank/cron-errors/bounce* \
207 /home/iank/cron-errors/btrbk.timer* \
208 /home/iank/cron-errors/old-snapshot*
209 fi
210
211 if ip l show tunfsf &>/dev/null; then
212 # this is for tracking dns over tls issue, which
213 # fixvpndns() in brc2 fixes.
214 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
215 read _ _ _ istls <<<"$stat"
216 case $istls in
217 no) : ;;
218 *)
219 printf "%s\n" "$istls" | ts >> /tmp/istls.log
220 chars+=("T:$istls")
221 ;;
222 esac
223 fi
224
225 # We do this once every 5 minutes, since this is not a grave problem.
226 # For formatted elisp, see /b/ds/unsaved-buffers.el
227 elisp='(format "%s" (-reduce-from (lambda (acc buf) (let ((bpath (buffer-file-name buf))) (if (and bpath (buffer-modified-p buf)) (cons bpath acc ) acc))) nil (buffer-list)))'
228 if [[ ! $last_emacs_check || $emacsfiles ]] || (( last_emacs_check < EPOCHSECONDS - 300 )); then
229 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
230 # i dun care if this fails
231 emacsfiles="$(timeout 1 emacsclient -a /usr/bin/true --eval "$elisp" 2>/dev/null | sed '/^"nil"$/d;s/^"(/E: /;s/)"$//' ||:)"
232 if [[ $emacsfiles ]]; then
233 chars+=("$emacsfiles")
234 fi
235 fi
236 last_emacs_check=$EPOCHSECONDS
237 fi
238
239
240 glob=(/nocow/btrfs-stale/*)
241 if [[ -e ${glob[0]} ]]; then
242 chars+=(STALE)
243 fi
244 var_mail_msg=
245 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
246 var_mail_msg="message in /var/mail"
247 fi
248 p $var_mail_msg | loday -1 var_mail
249
250 # early in install process, we dont have permission yet for exiqgrep.
251 # 1100 helps allow for system restarts
252 qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||:
253 qmsg=
254 if ((qlen)); then
255 qmsg="queue length $qlen"
256 chars+=("q $qlen")
257 fi
258 case $HOSTNAME in
259 # No point in emailing about the mailq on a host where we don't
260 # check email.
261 $MAIL_HOST)
262 p $qmsg | loday -120 qlen
263 ;;
264 *)
265 rmg /home/iank/cron-errors/qlen*
266 ;;
267 esac
268
269 begin=false
270
271 # todo: make this robust to the case of /a not being mounted
272 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
273 begin=true
274 fi
275
276 end=false
277 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
278 end=true
279 fi
280
281 # these conditions are so we dont have an overly verbose prompt
282 if $begin && $end; then
283 chars+=(D)
284 elif $begin; then
285 chars+=(DB)
286 elif $end; then
287 chars+=(DE)
288 else
289 source /a/bin/ds/script-files
290 f=~/.local/conflink
291 # shellcheck disable=SC2043
292 for _ in 1; do
293 if [[ -e $f ]]; then
294 now=$EPOCHSECONDS
295 fsec=$(stat -c%Y $f)
296 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
297 # dont have any false positives.
298 fmin=$(( (fsec - now + 1 ) / 60 ))
299 fminplus=$(( fmin + 60*24 ))
300 # Filesystem files get copied, so find any newer than the last run.
301 # The rest are hueristics:
302 # Given the last time we added a file in git, is that newer than the last conflink run.
303 # Given new files not added to git, were they modified more recently than the last conflink? but,
304 # push their modification time back by a day so we can develop them before needing to add them to git.
305
306 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
307 # This part is copied from conflink
308 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
309 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
310 done
311
312 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
313 if (( fmin < 0 )) && [[ $(find ${all_my_scripts[@]} ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
314 v conflink newer filesystem files
315 chars+=(CONFLINK)
316 break
317 fi
318
319 for d in /a/bin/distro-setup /p/c; do
320 [[ -d $d ]] || continue
321 cd $d
322 if [[ ! -e .git ]]; then
323 # some hosts i dont push all of /p/c
324 continue
325 fi
326 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
327 v conflink: newer files checked in to git
328 chars+=(CONFLINK)
329 break
330 fi
331
332 untracked=()
333 while read -r l; do
334 untracked+=("$l")
335 done < <(git ls-files -o --exclude-standard)
336 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
337 v conflink: untracked in $d
338 chars+=(CONFLINK)
339 break
340 fi
341 done
342 cd /
343
344 fi
345 if [[ ! -e $f || $(<$f) != 0 ]]; then
346 v conflink: last run not found or failed
347 chars+=(CONFLINK)
348 break
349 fi
350 done
351 fi
352
353 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
354 if [[ -s /var/log/exim4/paniclog ]]; then
355 chars+=("PANIC!")
356 # leave it up to epanic-clean to send email notification
357 fi
358
359 if [[ ! -e $status_file || -w $status_file ]]; then
360 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
361 cat /a/bin/bash_unpublished/source-state >$status_file
362 fi
363
364 if [[ ${chars[*]} ]]; then
365 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
366 fi
367 fi
368 }
# use this if we want to do something just once per minute
# Whatever is put in first_chars is copied to the front of the flag list
# at the start of every write-status run.
first_chars=()


# Always do one pass right away.
write-status
# One-shot mode (any argument given): show the generated file and stop
# instead of entering the loop.
if [[ -n $1 ]]; then
  cat "$status_file"
  exit 0
fi
378
# main-loop
# Never returns: sleeps, then reruns write-status, forever. The sleep is
# 15 seconds on AC power and 60 seconds when
# /sys/class/power_supply/AC/online exists and reads 0 (on battery).
# On battery, bitcoinoff is also run whenever the bitcoind unit is
# active.
main-loop() {
  while :; do
    power=true
    wait=15
    if [[ -e /sys/class/power_supply/AC/online ]] \
         && [[ $(</sys/class/power_supply/AC/online) == 0 ]]; then
      power=false
      wait=60
      if systemctl -q is-active bitcoind; then
        bitcoinoff
      fi
    fi

    sleep $wait
    write-status
  done
}
397
# Keep our long-running operation on a single line (one call to a
# function defined above) so that we are not prone to errors from this
# file being modified while it runs.
main-loop