various fixes
[distro-setup] / system-status
#!/bin/bash
# Copyright (C) 2019 Ian Kelling
# SPDX-License-Identifier: AGPL-3.0-or-later

# usage: with no arguments, writes the status file once and then
# rewrites it every 15 seconds (every 60 seconds on battery power).
# If a non-empty first argument is passed, runs in verbose mode: writes
# the status file once, prints it, and exits.

# Arrays, [[ ]], extglob etc. are used throughout, so refuse to run
# under any other shell. This test is deliberately POSIX sh syntax.
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

source /a/bin/errhandle/err
# File that write-status regenerates on every run.
status_file=/dev/shm/iank-status

# nullglob: unmatched globs expand to nothing, which the array-length
#   and ${glob[0]} existence checks below rely on.
# dotglob: globs also match dotfiles.
# extglob: needed for the !(*,S) patterns used on the maildirs.
shopt -s nullglob
shopt -s dotglob
shopt -s extglob

# Make executables installed by per-user ruby gems reachable.
for p in ~/.gem/ruby/*/bin; do
  PATH="$PATH:$p"
done


# Any non-empty first argument turns on verbose messages (see v).
verbose=false
if [[ $1 ]]; then
  verbose=true
fi
# Print all arguments, joined by spaces, on one line -- but only when
# the global $verbose is "true". Prints nothing otherwise.
# Globals:   verbose (read; must be the word true or false, because it
#            is executed as a command)
# Arguments: words of the message
# Outputs:   the message on stdout when verbose
v() {
  if $verbose; then
    printf "%s\n" "$*"
  fi
}
# lo COUNT NAME [MESSAGE]
#
# Pass all arguments through to /usr/local/bin/log-once and, if it
# prints anything, mail that output to root@localhost. ifne only runs
# mail when its stdin is non-empty, so no empty mails are sent.
# Globals:   HOSTNAME (read, used in the mail subject)
# Arguments: same as log-once; $2 (NAME) is also put in the subject.
#            NOTE(review): the meaning of COUNT is defined by log-once,
#            which is not part of this file.
lo() {
  /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
}
35
# loday COUNT NAME [MESSAGE]
#
# Same as lo, but any output from /usr/local/bin/log-once is mailed to
# daylerts@iankelling.org instead of root@localhost.
# Globals:   HOSTNAME (read, used in the mail subject)
# Arguments: same as log-once; $2 (NAME) is also put in the subject.
loday() {
  /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylerts@iankelling.org
}
39
40 # todo, consider migrating some of these alerts into prometheus
41 write-status() {
42 chars=("${first_chars[@]}")
43
44 services=(
45 epanicclean
46 systemstatus
47 btrfsmaintstop
48 dynamicipupdate
49 )
50 bads=()
51 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then
52 for s in ${services[@]}; do
53 if [[ $(systemctl show -p SubState --value $s) != running ]]; then
54 bads+=($s)
55 fi
56 done
57 chars+=(MYSERS)
58
59 fi
60 lo -240 mysers ${bads[*]}
61
62 services=(
63 prometheus-node-exporter
64 prometheus-alertmanager
65 prometheus
66 )
67 case $HOSTNAME in
68 kd)
69 bads=()
70 if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then
71 for s in ${services[@]}; do
72 if [[ $(systemctl show -p SubState --value $s) != running ]]; then
73 bads+=($s)
74 fi
75 done
76 chars+=(PROM)
77 fi
78 lo -240 prom ${bads[*]}
79 ;;
80 esac
81
82 # clock us out in timetrap if are idle too long
83 if [[ -e /p/.timetrap.db ]]; then
84 export DISPLAY=:0
85 if type -p xprintidle &>/dev/null && xidle=$(xprintidle 2>/dev/null); then
86 if [[ $xidle == [0-9]* ]]; then
87 sheet=$(sqlite3 /p/.timetrap.db "select sheet from entries where end is NULL;")
88 idle=300000
89 if [[ $sheet == w ]]; then
90 idle=900000
91 fi
92 if [[ $sheet && $xidle -gt $idle ]]; then
93 timetrap out
94 fi
95 fi
96 fi
97 fi
98
99
100 if ip l show tunfsf &>/dev/null; then
101 # this is for tracking dns over tls issue, which
102 # fixvpndns() in brc2 fixes.
103 stat=$(resolvectl dnsovertls tunfsf 2>/dev/null ||: )
104 read _ _ _ istls <<<"$stat"
105 case $istls in
106 no) : ;;
107 *)
108 printf "%s\n" "$istls" | ts >> /tmp/istls.log
109 chars+=("T:$istls")
110 ;;
111 esac
112 fi
113
114
115 if pgrep -G iank -u iank -f 'emacs --daemon' &>/dev/null; then
116 emacsfiles="$(emacsclient --eval "$(cat /usr/local/bin/unsaved-buffers.el)"| sed '/^"nil"$/d;s/^"(/E: /;s/)"$//')"
117 if [[ $emacsfiles ]]; then
118 chars+=("$emacsfiles")
119 fi
120 fi
121
122 glob=(/nocow/btrfs-stale/*)
123 if [[ -e ${glob[0]} ]]; then
124 chars+=(STALE)
125 fi
126 var_mail_msg=
127 if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
128 var_mail_msg="message in /var/mail"
129 fi
130 loday -1 var_mail $var_mail_msg
131
132 bouncemsg=
133 glob=(/m/md/bounces/new/*)
134 if [[ -e ${glob[0]} ]]; then
135 chars+=(BOUNCE)
136 bouncemsg="message in /m/md/bounces/new"
137 fi
138 loday -1 bounce $bouncemsg
139 # emails without the S (seen) flag. this only checks the last flag,
140 # but its good enough for me.
141 glob=(/m/md/alerts/{new,cur}/!(*,S))
142 if [[ -e ${glob[0]} ]]; then
143 chars+=(A)
144 fi
145
146 glob=(/m/md/daylerts/{new,cur}/!(*,S))
147 if [[ -e ${glob[0]} ]]; then
148 chars+=(DAY)
149 fi
150
151
152 tmp=(/var/local/cron-errors/mailtest-check*)
153 if (( ${#tmp[@]} )); then
154 chars+=(MAILPING)
155 fi
156 tmp=(/var/local/cron-errors/mailtest-slow*)
157 if (( ${#tmp[@]} )); then
158 chars+=(SPAMD)
159 fi
160
161 # early in install process, we dont have permission yet for exiqgrep.
162 # 1100 helps allow for system restarts
163 qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||:
164 qmsg=
165 if ((qlen)); then
166 qmsg="queue length $qlen"
167 chars+=("q $qlen")
168 fi
169 case $HOSTNAME in
170 # No point in emailing about the mailq on a host where we don't
171 # check email.
172 $MAIL_HOST|bk)
173 loday -120 qlen $qmsg
174 ;;
175 esac
176
177 begin=false
178 if ! make -C /b/ds -q ~/.local/distro-begin || [[ $(<~/.local/distro-begin) != 0 ]]; then
179 begin=true
180 fi
181
182 end=false
183 if ! make -C /b/ds -q ~/.local/distro-end || [[ $(<~/.local/distro-end) != 0 ]]; then
184 end=true
185 fi
186
187 # these conditions are so we dont have an overly verbose prompt
188 if $begin && $end; then
189 chars+=(D)
190 elif $begin; then
191 chars+=(DB)
192 elif $end; then
193 chars+=(DE)
194 else
195 f=~/.local/conflink
196 # shellcheck disable=SC2043
197 for _ in 1; do
198 if [[ -e $f ]]; then
199 now=$(date +%s)
200 fsec=$(stat -c%Y $f)
201 # the / 60 makes it 0-59 seconds less strict, +1 to help make sure we
202 # dont have any false positives.
203 fmin=$(( (fsec - now + 1 ) / 60 ))
204 fminplus=$(( fmin + 60*24 ))
205 # Filesystem files get copied, so find any newer than the last run.
206 # The rest are hueristics:
207 # Given the last time we added a file in git, is that newer than the last conflink run.
208 # Given new files not added to git, were they modified more recently than the last conflink? but,
209 # push their modification time back by a day so we can develop them before needing to add them to git.
210
211 all_dirs=({/a/bin/ds,/p/c}{/filesystem,/machine_specific/$HOSTNAME/filesystem})
212 # This part is copied from conflink
213 for x in /p/c/machine_specific/*.hosts /a/bin/ds/machine_specific/*.hosts; do
214 if grep -qxF $HOSTNAME $x; then all_dirs+=( ${x%.hosts} ); fi
215 done
216
217 # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
218 if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
219 v conflink newer filesystem files
220 chars+=(CONFLINK)
221 break
222 fi
223
224 for d in /a/bin/distro-setup /p/c; do
225 [[ -d $d ]] || continue
226 cd $d
227 if [[ ! -e .git ]]; then
228 # some hosts i dont push all of /p/c
229 continue
230 fi
231 if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
232 v conflink: newer files checked in to git
233 chars+=(CONFLINK)
234 break
235 fi
236
237 untracked=()
238 while read -r l; do
239 untracked+=("$l")
240 done < <(git ls-files -o --exclude-standard)
241 if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
242 v conflink: untracked in $d
243 chars+=(CONFLINK)
244 break
245 fi
246 done
247 cd /
248
249 fi
250 if [[ ! -e $f || $(<$f) != 0 ]]; then
251 v conflink: last run not found or failed
252 chars+=(CONFLINK)
253 break
254 fi
255 done
256 fi
257
258 # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
259 if [[ -s /var/log/exim4/paniclog ]]; then
260 chars+=("PANIC!")
261 # leave it up to epanic-clean to send email notification
262 fi
263
264 source /a/bin/bash_unpublished/source-state
265 if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
266 bbkmsg=
267 if [[ $(systemctl is-active btrbk.timer) != active ]]; then
268 chars+=(BTRBK.TIMER)
269 bbkmsg="not enabled"
270 fi
271 lo -480 btrbk.timer $bbkmsg
272
273 ## check if last snapshot was within an hour
274 vol=o
275 # this section generally copied from btrbk scripts, but
276 # this part modified to speed things up by about half a second.
277 # I'm not sure if its quite as reliable, but it looks pretty safe.
278 # Profiled it using time and also adding to the top of the file:
279 # set -x
280 # PS4='+ $(date "+%2N") '
281 # allow failure in case there are no snapshots yet.
282 # shellcheck disable=SC2012
283 shopt -u nullglob
284 files=(/mnt/root/btrbk/$vol.20*)
285 shopt -s nullglob
286 snaps=()
287 if (( ${#files[@]} )); then
288 snaps=($(ls -1avdr "${files[@]}" 2>/dev/null |head -n1 || : ))
289 fi
290 now=$(date +%s)
291 maxtime=0
292 for s in ${snaps[@]}; do
293 file=${s##*/}
294 t=$(date -d $(sed -r 's/(.{4})(..)(.{5})(..)(.*)/\1-\2-\3:\4:\5/' <<<${file#$vol.}) +%s)
295 if (( t > maxtime )); then
296 maxtime=$t
297 fi
298 done
299 snapshotmsg=
300 if (( maxtime < now - 4*60*60 )); then
301 chars+=(OLD-SNAP)
302 snapshotmsg="/o snapshot older than 4 hours"
303 fi
304 lo -1 old-snapshot $snapshotmsg
305 fi
306
307 cat /a/bin/bash_unpublished/source-state >$status_file
308
309 if [[ ${chars[*]} ]]; then
310 echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
311 fi
312
313 }
# Markers that write-status puts at the front of its list on every run.
# use this if we want to do something just once per minute
first_chars=()


# Always do one run right away. When a non-empty first argument was
# given, show the resulting status file and stop instead of looping.
write-status
if [[ $1 ]]; then
  cat "$status_file"
  exit 0
fi
323
# Rewrite the status file forever: sleep, then call write-status.
# The pause is 15 seconds, or 60 seconds while
# /sys/class/power_supply/AC/online exists and contains 0 (running on
# battery). The power state is re-read before every sleep.
# Arguments: none
# Returns:   never returns normally
main-loop() {
  local -r ac_online=/sys/class/power_supply/AC/online
  local interval
  while true; do
    interval=15
    if [[ -e $ac_online && $(<"$ac_online") == 0 ]]; then
      interval=60
    fi

    sleep "$interval"
    write-status
  done
}
339
# ensure our long operations are one line so we are not prone to errors
# from this file being modified while it is running.
main-loop