various fixes
[distro-setup] / epanic-clean
1 #!/bin/bash
2 # Copyright (C) 2019 Ian Kelling
3 # SPDX-License-Identifier: AGPL-3.0-or-later
4
5 # The panic log regularly gets some stuff in it we don't want to fix.
6 # Detect it and wipe it out.
7
# This script relies on bash-only features; bail out early under any other
# shell. Plain `[` is used on purpose so the check itself works in POSIX sh.
if [ -z "$BASH_VERSION" ]; then
  echo "error: shell is not bash" >&2
  exit 1
fi

# inherit_errexit does not exist in bash < 4.4, so a failure here is ignored.
shopt -s inherit_errexit 2>/dev/null || true
# Strict mode: exit on error, let functions/subshells inherit the ERR trap,
# and fail a pipeline when any of its stages fails.
set -o errexit -o errtrace -o pipefail
# Report the failing command, its location and its exit status on stderr.
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR

# When not started as root, replace this process with a sudo invocation of
# the same script, keeping the environment (-E) and the original arguments.
if [[ $EUID != 0 ]]; then
  exec sudo -E "${BASH_SOURCE[0]}" "$@"
fi

# Any non-empty first argument switches on debug output (see d()).
if [[ -n $1 ]]; then
  debug=true
else
  debug=false
fi
20
21
# Debug print: write all arguments, joined by spaces, as one line on stdout,
# but only when the global $debug is exactly "true".
#
# Globals:   debug (read) - "true" enables output; anything else, including
#            unset or empty, disables it.
# Arguments: the words to print.
# Returns:   0 in every case, so it is safe to call under `set -e`.
d() {
  # Compare as a string instead of executing $debug as a command: an unset or
  # empty variable would otherwise run the empty command, which succeeds and
  # turns debugging on by accident.
  if [[ ${debug:-false} == true ]]; then
    printf '%s\n' "$*"
  fi
}
27
28
# Exim panic log that gets cleaned, and (with an "-archive" suffix) the file
# that removed lines are appended to.
pl=/var/log/exim4/paniclog
# File the alert metric is written to.
pr_file=/var/lib/prometheus/node-exporter/exim_paniclog.prom

# Publish "exim_paniclog <value>" to $pr_file.
# The line is written to a temporary sibling file and renamed into place so a
# reader never observes a truncated or half-written metric file.
# Arguments: $1 - metric value (0 or 1)
_epanic_write_metric() {
  local tmp=$pr_file.$$
  printf 'exim_paniclog %s\n' "$1" >"$tmp"
  mv -f -- "$tmp" "$pr_file"
}

# Create the archive with group adm and mode 664 if it is not there yet.
# This has to run before the first append: a `>>` redirection would create
# the file with default ownership and mode first.
_epanic_init_archive() {
  if [[ ! -w $pl-archive ]]; then
    touch -- "$pl-archive"
    chgrp adm -- "$pl-archive"
    chmod 664 -- "$pl-archive"
  fi
}

# Append every line of $pl matching a regex to the archive, then delete those
# lines from $pl.
# Arguments: $1 - "bre" or "ere": regex dialect used for both grep and sed
#            $2 - the regex (must not contain "/")
_epanic_archive_matching() {
  local dialect=$1 regex=$2
  local -a grep_opts=() sed_opts=(-i)
  if [[ $dialect == ere ]]; then
    grep_opts=(-E)
    sed_opts=(-r -i)
  fi
  # grep exits 1 when nothing matches; that is not an error here.
  grep "${grep_opts[@]}" -- "$regex" "$pl" >>"$pl-archive" || :
  sed "${sed_opts[@]}" -- "/$regex/d" "$pl"
}

# Clean known-harmless entries out of the exim panic log and publish whether
# what is left deserves an alert.
#
# Globals:   pl, pr_file (read); debug (read, through d)
# Outputs:   rewrites $pl, appends to $pl-archive, writes $pr_file
# Returns:   0 on success; under `set -e` any failing step aborts the script
main() {
  local pr_metric=0
  local regex entry service found wipe logtime tmptime
  local d1 d2 sec_min sec_max jmin jmax description jrregex
  local day clock log_s count newlines now
  local -a checks

  # Nothing in the panic log: report all clear and stop.
  if [[ ! -s $pl ]]; then
    _epanic_write_metric "$pr_metric"
    return 0
  fi

  _epanic_init_archive

  # example line:
  # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
  # Only dropped when the exim4 backup.conf systemd drop-in is present.
  if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
    _epanic_archive_matching bre "socket bind() to port 25 for address"
  fi

  # this is a strange message due to running as nonroot
  # regex='exim user lost privilege for using -C option'
  # sed -i "/$regex/d" $pl

  # Seems to randomly be caused by
  # "Starting exim4-base housekeeping", exim4-base.service.
  _epanic_archive_matching bre \
    "^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"

  # Scanner failures are harmless when the scanner was just (re)starting.
  # Each entry is "<systemd unit> <regex>"; the regex may contain spaces.
  checks=(
    "clamav-daemon malware acl condition"
    "spamassassin spam acl condition"
  )
  for entry in "${checks[@]}"; do
    service=${entry%% *}
    regex=${entry#* }
    found=false
    wipe=true
    # Reset per service and per run. A stale value from a previous service or
    # a previous main() call would make the throttle below skip the journal
    # check and wipe lines that were never verified.
    logtime=
    jrregex=
    d "$service $regex"
    while read -r d1 d2; do
      d "$d1 $d2"
      found=true
      tmptime=$(date -d "$d1 $d2" +%s)
      # Checking the journal takes a second or two, so don't consider every
      # matching line, just those more than 60 seconds apart. The journal is
      # searched up to 60 seconds after the message, so that should be ok.
      if [[ -n $logtime ]] && (( tmptime <= logtime + 60 )); then
        continue
      fi
      logtime=$tmptime
      sec_min=$((logtime - 60))
      sec_max=$((logtime + 60))
      jmin=$(date -d "@$sec_min" "+%F %H:%M:%S")
      jmax=$(date -d "@$sec_max" "+%F %H:%M:%S")
      # The unit description does not depend on the log line, so look it up
      # only once per service, and only when there is a line to check.
      if [[ -z $jrregex ]]; then
        description=$(systemctl cat "$service" \
          | sed -rn 's/^ *Description=(.*)/\1/p')
        jrregex="^Starting $description"
        if [[ $service == spamassassin ]]; then
          jrregex+="\|^spamd: restarting"
        fi
      fi
      d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
      # The sed clears out the initial time and process+pid.
      # grep output is discarded instead of using -q on purpose: with
      # pipefail, an early-exiting grep could SIGPIPE the earlier stages and
      # make a successful match look like a failure.
      if ! journalctl -u "$service" -S "$jmin" -U "$jmax" \
        | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' \
        | grep -- "$jrregex" >/dev/null 2>&1; then
        wipe=false
        break
      fi
    done < <(awk -v re="$regex" '$0 ~ re { print $1, $2 }' "$pl")
    if [[ $found == true && $wipe == true ]]; then
      d "wiping $regex"
      _epanic_archive_matching ere "$regex"
    fi
  done

  ## begin broken pipe & write lock & general alert ##
  regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
  # EPOCHSECONDS only exists in bash >= 5.0; an unset variable would count as
  # 0 in the arithmetic below, so fall back to date(1) on older versions.
  now=${EPOCHSECONDS:-$(date +%s)}
  newlines=false
  count=0
  while read -r day clock _; do
    log_s=$(date -d "$day $clock" +%s)
    count=$((count + 1))
    if (( log_s > now - 300 )); then
      newlines=true
    fi
  done < <(grep -- "$regex" "$pl" || :)
  # I see broken pipe in groups of 3 for the same message around once a day
  # randomly. I'm guessing they are related to running 2 instances of
  # exim which share the same spool. So, if we have some, but not in
  # the last 5 minutes, and no more than 20, it should be fine to clear
  # them. Write lock happens less but can fit under the same rule.
  if (( count > 20 )); then
    pr_metric=1
  elif (( count )) && [[ $newlines == false ]]; then
    _epanic_archive_matching bre "$regex"
  fi

  # I think we could alert on anything else older than 61 seconds,
  # but let's just add some slack and make it 5 minutes.
  while read -r day clock _; do
    log_s=$(date -d "$day $clock" +%s)
    if (( log_s < now - 300 )); then
      pr_metric=1
    fi
  done < <(grep -v -- "$regex" "$pl" || :)
  ## end broken pipe ##

  _epanic_write_metric "$pr_metric"
}
143
# Run main forever, pausing 30 seconds after each pass. Never returns on its
# own; it only stops when the script itself exits.
loop-main() {
  while :; do
    main
    sleep 30
  done
}
150
# With a non-empty INVOCATION_ID in the environment, keep polling through
# loop-main; otherwise do a single pass with main.
# NOTE(review): INVOCATION_ID is presumably the variable systemd sets for
# service processes - confirm against the unit that runs this script.
if [[ -z $INVOCATION_ID ]]; then
  main
else
  loop-main
fi