fixes, prometheus, lots of stuff
[distro-setup] / system-status
1 #!/bin/bash
2 # Copyright (C) 2019 Ian Kelling
3 # SPDX-License-Identifier: AGPL-3.0-or-later
4
# usage: with no arguments, loops forever and refreshes the status every
# 15 seconds (once per minute on battery power). If any argument is
# passed, refreshes the status once, prints the status file, and exits.
7
# Bail out early under any shell other than bash. This line is kept
# POSIX-compatible so that it also parses in the shell it is rejecting.
[ -n "$BASH_VERSION" ] || { echo "error: shell is not bash" >&2; exit 1; }

# Error handling setup lives outside this file.
source /a/bin/errhandle/err
# The file that write-status regenerates on every pass.
status_file=/dev/shm/iank-status

# nullglob: unmatched globs expand to nothing. dotglob: globs include
# dotfiles. extglob: enables the !(...) patterns used in write-status,
# so it has to be on before that function is parsed.
shopt -s nullglob dotglob extglob

# Append every per-version ruby gem bin directory to PATH.
for p in ~/.gem/ruby/*/bin; do
  PATH+=":$p"
done

# Any non-empty first argument turns on the messages printed by v().
if [[ -n $1 ]]; then
  verbose=true
else
  verbose=false
fi
# v [WORD...]
# When the global $verbose is "true", print the arguments joined into a
# single line on stdout. Otherwise print nothing. Returns 0 either way.
v() {
  $verbose || return 0
  printf "%s\n" "$*"
}
# lo COUNT NAME [MESSAGE]
# Pass all arguments to /usr/local/bin/log-once and, only if that prints
# anything, mail the output to root@localhost with NAME in the subject.
# Does nothing (and returns 0) when ifne is not on PATH.
lo() {
  type -p ifne &>/dev/null || return 0
  /usr/local/bin/log-once "$@" \
    | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
}
37
# loday COUNT NAME [MESSAGE]
# Same as lo, except that any output from log-once is mailed to
# daylert@iankelling.org. Does nothing (and returns 0) when ifne is not
# on PATH.
loday() {
  type -p ifne &>/dev/null || return 0
  /usr/local/bin/log-once "$@" \
    | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
}
43
44 # todo, consider migrating some of these alerts into prometheus
45 write-status() {
46 chars=("${first_chars[@]}")
47
48
49 services=( epanicclean )
50 case $HOSTNAME in
51 bk|je|li) : ;;
52 *)
53 services+=(
54 systemstatus
55 btrfsmaintstop
56 dynamicipupdate
57 )
58 ;;
59 esac
60
61 bads=()
62 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
63 for s in ${services[@]}; do
64 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
65 bads+=($s)
66 fi
67 done
68 chars+=(MYSERS)
69
70 fi
71 lo -240 mysers ${bads[*]}
72
73 services=(
74 prometheus-node-exporter
75 prometheus-alertmanager
76 prometheus
77 )
78 case $HOSTNAME in
79 kd)
80 bads=()
81 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
82 for s in ${services[@]}; do
83 if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
84 bads+=($s)
85 fi
86 done
87 chars+=(PROM)
88 fi
89 lo -240 prom ${bads[*]}
90 ;;
91 esac
92
93 # clock us out in timetrap if are idle too long
94 if [[ -e /p/.timetrap.db ]]; then
95 export DISPLAY=:0
96 if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
97 if [[ $xidle == [0-9]* ]]; then
98 sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
99 idle=300000
100 if [[ $sheet == w ]]; then
101 idle=900000
102 fi
103 if [[ $sheet && $xidle -gt $idle ]]; then
104 timetrap out
105 fi
106 fi
107 fi
108 fi
109
110
111 if ip l show tunfsf &>/dev/null; then
112 # this is for tracking dns over tls issue, which
113 # fixvpndns() in brc2 fixes.
114 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
115 read _ _ _ istls <<<"$stat"
116 case $istls in
117 no) : ;;
118 *)
119 printf "%s\n" "$istls" | ts >> /tmp/istls.log
120 chars+=("T:$istls")
121 ;;
122 esac
123 fi
124
125
126 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
127 emacsfiles="$(emacsclient --eval "$(cat /usr/local/bin/unsaved-buffers.el)"| sed '/^"nil"$/d;s/^"(/E: /;s/)"$//')"
128 if [[ $emacsfiles ]]; then
129 chars+=("$emacsfiles")
130 fi
131 fi
132
133 glob=(/nocow/btrfs-stale/*)
134 if [[ -e ${glob[0]} ]]; then
135 chars+=(STALE)
136 fi
137 var_mail_msg=
138 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
139 var_mail_msg="message in /var/mail"
140 fi
141 loday -1 var_mail $var_mail_msg
142
143 bouncemsg=
144 glob=(/m/md/bounces/new/*)
145 if [[ -e ${glob[0]} ]]; then
146 chars+=(BOUNCE)
147 bouncemsg="message in /m/md/bounces/new"
148 fi
149 loday -1 bounce $bouncemsg
150 # emails without the S (seen) flag. this only checks the last flag,
151 # but its good enough for me.
152 glob=(/m/md/alerts/{new,cur}/!(*,S))
153 if [[ -e ${glob[0]} ]]; then
154 chars+=(A)
155 fi
156
157 glob=(/m/md/daylert/{new,cur}/!(*,S))
158 if [[ -e ${glob[0]} ]]; then
159 chars+=(DAY)
160 fi
161
162
163 tmp=(/var/local/cron-errors/mailtest-check*)
164 if (( ${#tmp[@]} )); then
165 chars+=(MAILPING)
166 fi
167 tmp=(/var/local/cron-errors/mailtest-slow*)
168 if (( ${#tmp[@]} )); then
169 chars+=(SPAMD)
170 fi
171
172 # early in install process, we dont have permission yet for exiqgrep.
173 # 1100 helps allow for system restarts
174 qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||:
175 qmsg=
176 if ((qlen)); then
177 qmsg="queue length $qlen"
178 chars+=("q $qlen")
179 fi
180 case $HOSTNAME in
181 # No point in emailing about the mailq on a host where we don't
182 # check email.
183 $MAIL_HOST|bk)
184 loday -120 qlen $qmsg
185 ;;
186 esac
187
188 begin=false
189
190 if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
191 begin=true
192 fi
193
194 end=false
195 if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
196 end=true
197 fi
198
199 # these conditions are so we dont have an overly verbose prompt
200 if $begin && $end; then
201 chars+=(D)
202 elif $begin; then
203 chars+=(DB)
204 elif $end; then
205 chars+=(DE)
206 else
207 f=~/.local/conflink
208 # shellcheck disable=SC2043
209 for _ in 1; do
210 if [[ -e $f ]]; then
211 now=$(date +%s)
212 fsec=$(stat -c%Y $f)
213 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
214 # dont have any false positives.
215 fmin=$(( (fsec - now + 1 ) / 60 ))
216 fminplus=$(( fmin + 60*24 ))
217 # Filesystem files get copied, so find any newer than the last run.
218 # The rest are hueristics:
219 # Given the last time we added a file in git, is that newer than the last conflink run.
220 # Given new files not added to git, were they modified more recently than the last conflink? but,
221 # push their modification time back by a day so we can develop them before needing to add them to git.
222
223 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
224 # This part is copied from conflink
225 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
226 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
227 done
228
229 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
230 if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
231 v conflink newer filesystem files
232 chars+=(CONFLINK)
233 break
234 fi
235
236 for d in /a/bin/distro-setup /p/c; do
237 [[ -d $d ]] || continue
238 cd $d
239 if [[ ! -e .git ]]; then
240 # some hosts i dont push all of /p/c
241 continue
242 fi
243 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
244 v conflink: newer files checked in to git
245 chars+=(CONFLINK)
246 break
247 fi
248
249 untracked=()
250 while read -r l; do
251 untracked+=("$l")
252 done < <(git ls-files -o --exclude-standard)
253 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
254 v conflink: untracked in $d
255 chars+=(CONFLINK)
256 break
257 fi
258 done
259 cd /
260
261 fi
262 if [[ ! -e $f || $(<$f) != 0 ]]; then
263 v conflink: last run not found or failed
264 chars+=(CONFLINK)
265 break
266 fi
267 done
268 fi
269
270 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
271 if [[ -s /var/log/exim4/paniclog ]]; then
272 chars+=("PANIC!")
273 # leave it up to epanic-clean to send email notification
274 fi
275
276 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
277 # /a gets remounted due to btrbk, ignore error code for file doesnt exist
278 source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
279 fi
280 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
281 bbkmsg=
282 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
283 chars+=(BTRBK.TIMER)
284 bbkmsg="not enabled"
285 fi
286 lo -480 btrbk.timer $bbkmsg
287
288 ## check if last snapshot was within an hour
289 vol=o
290 # this section generally copied from btrbk scripts, but
291 # this part modified to speed things up by about half a second.
292 # I'm not sure if its quite as reliable, but it looks pretty safe.
293 # Profiled it using time and also adding to the top of the file:
294 # set -x
295 # PS4='+ $(date "+%2N") '
296 # allow failure in case there are no snapshots yet.
297 # shellcheck disable=SC2012
298 shopt -u nullglob
299 files=(/mnt/root/btrbk/$vol.20*)
300 shopt -s nullglob
301 snaps=()
302 if (( ${#files[@]} )); then
303 snaps=($(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : ))
304 fi
305 now=$(date +%s)
306 maxtime=0
307 for s in ${snaps[@]}; do
308 file=${s##*/}
309 t=$(date -d $(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#$vol.}) +%s)
310 if (( t > maxtime )); then
311 maxtime=$t
312 fi
313 done
314 snapshotmsg=
315 if (( maxtime < now - 4*60*60 )); then
316 chars+=(OLD-SNAP)
317 snapshotmsg="/o snapshot older than 4 hours"
318 fi
319 lo -1 old-snapshot $snapshotmsg
320 fi
321
322 if [[ ! -e $status_file || -w $status_file ]]; then
323 if [[ -e /a/bin/bash_unpublished/source-state ]]; then
324 cat /a/bin/bash_unpublished/source-state >$status_file
325 fi
326
327 if [[ ${chars[*]} ]]; then
328 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
329 fi
330 fi
331
332 }
# use this if we want to do something just once per minute
first_chars=()

# The first refresh always happens, whichever mode we are in.
write-status

# One-shot mode: with a non-empty first argument, show the freshly
# written status file and stop instead of entering the loop.
if [[ -n $1 ]]; then
  cat "$status_file"
  exit 0
fi
342
# main-loop
# Never returns: sleep, refresh the status, repeat. The pause is 15
# seconds, or 60 seconds when /sys/class/power_supply/AC/online exists
# and reads 0.
main-loop() {
  local ac_online=/sys/class/power_supply/AC/online
  local pause
  while true; do
    pause=15
    if [[ -e $ac_online && $(<"$ac_online") == 0 ]]; then
      pause=60
    fi

    sleep "$pause"
    write-status
  done
}
358
# ensure our long operations are one line so we are not prone to errors
# from this file being modified.
361 main-loop