lots of fixes, automation for bitfolk
[distro-setup] / system-status
1 #!/bin/bash
2 # Copyright (C) 2019 Ian Kelling
3 # SPDX-License-Identifier: AGPL-3.0-or-later
4
# usage: runs once every 15 seconds (once per minute on battery power).
# If any args are passed, it runs just once, prints the status file and exits.
7
# Refuse to continue under any shell other than bash. This test is written
# with plain [ ] so that it still works when another shell runs the file.
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

# This script expects to run as uid 1000. Report the problem on stderr,
# like the guard above does.
if [[ $EUID != 1000 ]]; then
  echo "$0: error, expected to be user 1000" >&2
  exit 1
fi
14
# Error handling comes from a file outside this script.
source /a/bin/errhandle/err

# File that write-status fills in.
status_file=/dev/shm/iank-status

# Glob settings used below: unmatched globs expand to nothing, dotfiles
# are matched, and extended patterns such as !(...) are available.
shopt -s nullglob dotglob extglob

# Append every per-user ruby gem bin directory to PATH.
for p in ~/.gem/ruby/*/bin; do
  PATH+=":$p"
done


# Any first argument turns on the verbose messages printed by v().
if [[ -n $1 ]]; then
  verbose=true
else
  verbose=false
fi
# v [WORD...]
# Print the arguments as one line, but only when $verbose is true.
v() {
  $verbose || return 0
  printf "%s\n" "$*"
}
# p [WORD...]
# Print all arguments as a single line (an empty line when there are none).
p() {
  printf "%s\n" "$*"
}
# lo COUNT NAME [MESSAGE]
# Pass the arguments (and stdin) to log-once, and feed whatever it prints
# through "ifne mail" addressed to root@localhost, with NAME in the
# subject. Does nothing when ifne is not installed.
lo() {
  type -p ifne &>/dev/null || return 0
  local subject="$HOSTNAME: system-status $2"
  /usr/local/bin/log-once "$@" | ifne mail -s "$subject" root@localhost
}
43
# loday COUNT NAME [MESSAGE]
# Same as lo, except that the mail goes to daylert@iankelling.org.
# Does nothing when ifne is not installed.
loday() {
  type -p ifne &>/dev/null || return 0
  local subject="$HOSTNAME: system-status $2"
  /usr/local/bin/log-once "$@" | ifne mail -s "$subject" daylert@iankelling.org
}
49
50 # todo, consider migrating some of these alerts into prometheus
51 write-status() {
52 chars=("${first_chars[@]}")
53
54 services=( epanicclean )
55 case $HOSTNAME in
56 bk|je|li) : ;;
57 *)
58 services+=(
59 systemstatus
60 btrfsmaintstop
61 dynamicipupdate
62 )
63 bads=()
64 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
65 for s in ${services[@]}; do
66 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
67 bads+=($s)
68 fi
69 done
70 chars+=(MYSERS)
71 fi
72 p ${bads[*]} | lo -240 mysers
73 ;;
74 esac
75
76 case $HOSTNAME in
77 kd)
78 services=(
79 prometheus-node-exporter
80 prometheus-alertmanager
81 prometheus
82 )
83 bads=()
84 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
85 for s in ${services[@]}; do
86 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
87 bads+=($s)
88 fi
89 done
90 chars+=(PROM)
91 fi
92 p ${bads[*]} | lo -240 prom
93 ;;
94 esac
95
96
97 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
98 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
99 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
100 fi
101 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
102
103 bouncemsg=
104 glob=(/m/md/bounces/new/*)
105 if [[ -e ${glob[0]} ]]; then
106 chars+=(BOUNCE)
107 bouncemsg="message in /m/md/bounces/new"
108 fi
109 p $bouncemsg | loday -1 bounce
110 # emails without the S (seen) flag. this only checks the last flag,
111 # but its good enough for me.
112 glob=(/m/md/alerts/{new,cur}/!(*,S))
113 if [[ -e ${glob[0]} ]]; then
114 chars+=(A)
115 fi
116
117 glob=(/m/md/daylert/{new,cur}/!(*,S))
118 if [[ -e ${glob[0]} ]]; then
119 chars+=(DAY)
120 fi
121
122 bbkmsg=
123 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
124 chars+=(BTRBK.TIMER)
125 bbkmsg="not enabled"
126 fi
127 p "$bbkmsg" | lo -480 btrbk.timer
128
129 ## check if last snapshot was within an hour
130 vol=o
131 # this section generally copied from btrbk scripts, but
132 # this part modified to speed things up by about half a second.
133 # I'm not sure if its quite as reliable, but it looks pretty safe.
134 # Profiled it using time and also adding to the top of the file:
135 # set -x
136 # PS4='+ $(date "+%2N") '
137 # allow failure in case there are no snapshots yet.
138 # shellcheck disable=SC2012
139 shopt -u nullglob
140 files=(/mnt/root/btrbk/$vol.20*)
141 shopt -s nullglob
142 snaps=()
143 if (( ${#files[@]} )); then
144 snaps=($(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : ))
145 fi
146 now=$(date +%s)
147 maxtime=0
148 for s in ${snaps[@]}; do
149 file=${s##*/}
150 t=$(date -d $(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#$vol.}) +%s)
151 if (( t > maxtime )); then
152 maxtime=$t
153 fi
154 done
155 snapshotmsg=
156 if (( maxtime < now - 4*60*60 )); then
157 chars+=(OLD-SNAP)
158 snapshotmsg="/o snapshot older than 4 hours"
159 fi
160 p "$snapshotmsg" | lo -1 old-snapshot
161
162
163 # commented out, only using timetrap retrospectively.
164 # # clock us out in timetrap if are idle too long
165 # if [[ -e /p/.timetrap.db ]]; then
166 # export DISPLAY=:0
167 # if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
168 # if [[ $xidle == [0-9]* ]]; then
169 # sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
170 # idle=300000
171 # if [[ $sheet == w ]]; then
172 # idle=900000
173 # fi
174 # if [[ $sheet && $xidle -gt $idle ]]; then
175 # timetrap out
176 # fi
177 # fi
178 # fi
179 # fi
180
181 fi
182
183 if ip l show tunfsf &>/dev/null; then
184 # this is for tracking dns over tls issue, which
185 # fixvpndns() in brc2 fixes.
186 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
187 read _ _ _ istls <<<"$stat"
188 case $istls in
189 no) : ;;
190 *)
191 printf "%s\n" "$istls" | ts >> /tmp/istls.log
192 chars+=("T:$istls")
193 ;;
194 esac
195 fi
196
197
198 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
199 emacsfiles="$(emacsclient --eval "$(cat /usr/local/bin/unsaved-buffers.el)"| sed '/^"nil"$/d;s/^"(/E: /;s/)"$//')"
200 if [[ $emacsfiles ]]; then
201 chars+=("$emacsfiles")
202 fi
203 fi
204
205 glob=(/nocow/btrfs-stale/*)
206 if [[ -e ${glob[0]} ]]; then
207 chars+=(STALE)
208 fi
209 var_mail_msg=
210 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
211 var_mail_msg="message in /var/mail"
212 fi
213 p $var_mail_msg | loday -1 var_mail
214
215
216 tmp=(/var/local/cron-errors/mailtest-check*)
217 if (( ${#tmp[@]} )); then
218 chars+=(MAILPING)
219 fi
220 tmp=(/var/local/cron-errors/mailtest-slow*)
221 if (( ${#tmp[@]} )); then
222 chars+=(SPAMD)
223 fi
224
225 # early in install process, we dont have permission yet for exiqgrep.
226 # 1100 helps allow for system restarts
227 qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||:
228 qmsg=
229 if ((qlen)); then
230 qmsg="queue length $qlen"
231 chars+=("q $qlen")
232 fi
233 case $HOSTNAME in
234 # No point in emailing about the mailq on a host where we don't
235 # check email.
236 $MAIL_HOST)
237 p $qmsg | loday -120 qlen
238 ;;
239 esac
240
241 begin=false
242
243 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
244 begin=true
245 fi
246
247 end=false
248 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
249 end=true
250 fi
251
252 # these conditions are so we dont have an overly verbose prompt
253 if $begin && $end; then
254 chars+=(D)
255 elif $begin; then
256 chars+=(DB)
257 elif $end; then
258 chars+=(DE)
259 else
260 f=~/.local/conflink
261 # shellcheck disable=SC2043
262 for _ in 1; do
263 if [[ -e $f ]]; then
264 now=$(date +%s)
265 fsec=$(stat -c%Y $f)
266 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
267 # dont have any false positives.
268 fmin=$(( (fsec - now + 1 ) / 60 ))
269 fminplus=$(( fmin + 60*24 ))
270 # Filesystem files get copied, so find any newer than the last run.
271 # The rest are hueristics:
272 # Given the last time we added a file in git, is that newer than the last conflink run.
273 # Given new files not added to git, were they modified more recently than the last conflink? but,
274 # push their modification time back by a day so we can develop them before needing to add them to git.
275
276 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
277 # This part is copied from conflink
278 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
279 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
280 done
281
282 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
283 if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
284 v conflink newer filesystem files
285 chars+=(CONFLINK)
286 break
287 fi
288
289 for d in /a/bin/distro-setup /p/c; do
290 [[ -d $d ]] || continue
291 cd $d
292 if [[ ! -e .git ]]; then
293 # some hosts i dont push all of /p/c
294 continue
295 fi
296 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
297 v conflink: newer files checked in to git
298 chars+=(CONFLINK)
299 break
300 fi
301
302 untracked=()
303 while read -r l; do
304 untracked+=("$l")
305 done < <(git ls-files -o --exclude-standard)
306 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
307 v conflink: untracked in $d
308 chars+=(CONFLINK)
309 break
310 fi
311 done
312 cd /
313
314 fi
315 if [[ ! -e $f || $(<$f) != 0 ]]; then
316 v conflink: last run not found or failed
317 chars+=(CONFLINK)
318 break
319 fi
320 done
321 fi
322
323 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
324 if [[ -s /var/log/exim4/paniclog ]]; then
325 chars+=("PANIC!")
326 # leave it up to epanic-clean to send email notification
327 fi
328
329 if [[ ! -e $status_file || -w $status_file ]]; then
330 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
331 cat /a/bin/bash_unpublished/source-state >$status_file
332 fi
333
334 if [[ ${chars[*]} ]]; then
335 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
336 fi
337 fi
338 }
# write-status seeds its list of status tags from this array;
# use this if we want to do something just once per minute.
first_chars=()


# Always do one pass right away.
write-status

# With any argument this is a one-shot run: show the result and stop.
if [[ -n $1 ]]; then
  cat "$status_file"
  exit 0
fi
348
# main-loop
# Never returns: sleeps, then calls write-status, over and over. The pause
# is 15 seconds, or 60 seconds while /sys/class/power_supply/AC/online
# exists and contains 0.
main-loop() {
  local ac_file=/sys/class/power_supply/AC/online
  local delay
  while :; do
    delay=15
    if [[ -e $ac_file && $(<"$ac_file") == 0 ]]; then
      delay=60
    fi

    sleep "$delay"
    write-status
  done
}
364
# Keep the long-running operation on a single line, so that we are not
# prone to errors if this file is modified while it runs.
main-loop