f6328c2b41a26bd27624098d013f872b307c68d6
[distro-setup] / epanic-clean
1 #!/bin/bash
2 # I, Ian Kelling, follow the GNU license recommendations at
3 # https://www.gnu.org/licenses/license-recommendations.en.html. They
4 # recommend that small programs, < 300 lines, be licensed under the
5 # Apache License 2.0. This file contains or is part of one or more small
6 # programs. If a small program grows beyond 300 lines, I plan to switch
7 # its license to GPL.
8
9 # Copyright 2024 Ian Kelling
10
11 # Licensed under the Apache License, Version 2.0 (the "License");
12 # you may not use this file except in compliance with the License.
13 # You may obtain a copy of the License at
14
15 # http://www.apache.org/licenses/LICENSE-2.0
16
17 # Unless required by applicable law or agreed to in writing, software
18 # distributed under the License is distributed on an "AS IS" BASIS,
19 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 # See the License for the specific language governing permissions and
21 # limitations under the License.
22
23
# The panic log regularly collects messages that we don't intend to fix.
# Detect those messages and wipe them out.
26
# Refuse to run under anything but bash. This test is deliberately
# written with plain [ ] so that a non-bash shell can still parse it.
if [ -z "$BASH_VERSION" ]; then
  echo "error: shell is not bash" >&2
  exit 1
fi

# inherit_errexit only exists in bash >= 4.4; failing to enable it on an
# older bash is tolerated.
shopt -s inherit_errexit 2>/dev/null || true
# Exit on unhandled errors, let functions and subshells inherit the ERR
# trap, and make a pipeline fail when any of its stages fails.
set -o errexit -o errtrace -o pipefail
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR

# Everything below needs root: re-execute this script through sudo,
# keeping the environment and the original arguments.
if [[ $EUID != 0 ]]; then
  exec sudo -E "${BASH_SOURCE[0]}" "$@"
fi
34
# Any non-empty first argument switches on debug output (printed by d).
if [[ -n $1 ]]; then
  debug=true
else
  debug=false
fi

# Progress output (printed by v) is always on.
verbose=true
41
# Debug output: print all arguments joined by spaces on one line, but
# only when the command held in $debug succeeds (it is set to "true" or
# "false" by this script). Returns 0 when nothing is printed.
d() {
  $debug || return 0
  printf "%s\n" "$*"
}
# Verbose output: print all arguments joined by spaces on one line, but
# only when the command held in $verbose succeeds (it is set to "true"
# by this script). Returns 0 when nothing is printed.
v() {
  if ! $verbose; then
    return 0
  fi
  printf "%s\n" "$*"
}
52
# Name of the spam filter service: "spamassassin" when systemctl can
# show a unit of that name, otherwise "spamd".
if systemctl cat spamassassin &>/dev/null; then
  spamd_ser=spamassassin
else
  spamd_ser=spamd
fi

# The exim panic log that main inspects and cleans.
pl=/var/log/exim4/paniclog
# One pass over the exim panic log ($pl): archive and delete the lines
# that are known to be harmless, then publish the exim_paniclog metric
# (0 = nothing to act on, 1 = alert).
#
# Globals read:
#   pl                  - path of the panic log
#   spamd_ser           - name of the spam filter service
#   EXIM_PANICLOG_PROM  - optional override for the metric file path
#   d, v                - debug / verbose output helpers
# Files written:
#   $pl (edited in place), $pl-archive (removed lines are appended),
#   and the metric file.
# Returns: 0 on success.
#
# The "if grep ... |& tee ...; then" tests rely on the file-level
# "set -o pipefail": the pipeline is false when grep matches nothing.
main() {
  local prom_file=${EXIM_PANICLOG_PROM:-/var/lib/prometheus/node-exporter/exim_paniclog.prom}
  local archive=$pl-archive
  local pr_metric=0
  local regex rule service found wipe logtime tmptime
  local log_day log_time log_s now
  local sec_min sec_max jmin jmax description jrregex how
  local newlines count
  local -a restart_rules

  # Missing or empty panic log: nothing to clean, nothing to alert on.
  if [[ ! -s $pl ]]; then
    echo "exim_paniclog $pr_metric" >"$prom_file"
    return 0
  fi

  # example line:
  # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
  if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
    regex="socket bind() to port 25 for address"
    if grep "$regex" "$pl" |& tee -a "$archive"; then
      v "above is from grep $regex"
      sed -i "/$regex/d" "$pl"
    fi
  fi

  # this is a strange message due to running as nonroot
  # regex='exim user lost privilege for using -C option'
  # sed -i "/$regex/d" $pl

  # seems to randomly be caused by
  # Starting exim4-base housekeeping, exim4-base.service
  regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
  if grep "$regex" "$pl" |& tee -a "$archive"; then
    v "above is from grep $regex"
    sed -i "/$regex/d" "$pl"
  fi

  ### begin removing panic lines due to service restarts ###
  # Each rule is "<service> <extended regex of the panic line>".
  restart_rules=(
    "clamav-daemon malware acl condition"
    "$spamd_ser spam acl condition"
  )
  for rule in "${restart_rules[@]}"; do
    service=${rule%% *}
    regex=${rule#* }
    found=false
    wipe=true
    # The throttle below must start fresh for every service and every
    # pass. A timestamp left over from the previous service (or from the
    # previous loop-main pass) would make it skip all of this service's
    # lines, so they would never be checked or wiped.
    logtime=
    d "$service $regex"
    while read -r log_day log_time; do
      d "$log_day $log_time"
      tmptime=$(date -d "$log_day $log_time" +%s)
      # Checking the journal takes a second or two, so
      # dont consider every matching line, just those > 20 seconds apart. We are
      # testing the journal for 60 seconds after the message, so should be ok.
      # It probably makes sense to even check for >59 seconds apart, using 20
      # seconds to be conservative.
      if [[ $logtime ]] && (( tmptime <= logtime + 20 )); then
        continue
      fi
      logtime=$tmptime
      found=true
      sec_min=$((logtime - 60))
      sec_max=$((logtime + 60))
      jmin="$(date -d "@$sec_min" "+%F %H:%M:%S")"
      jmax="$(date -d "@$sec_max" "+%F %H:%M:%S")"
      description=$(systemctl cat "$service" | sed -rn 's/^ *Description=(.*)/\1/p')
      jrregex="^Starting $description"
      if [[ $service == "$spamd_ser" ]]; then
        jrregex+="\|^spamd: restarting"
      fi
      d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
      how="journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
      # the sed clears out the initial time and process+pid.
      # grep output is discarded rather than using grep -q: an early
      # exit of grep could fail the pipeline through SIGPIPE + pipefail.
      if journalctl -u "$service" -S "$jmin" -U "$jmax" \
        | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' \
        | grep "$jrregex" &>/dev/null; then
        v "messages worth wiping in: $how:"
      else
        v "PANIC: message not found via: $how"
        # One unexplained line is enough to keep all of this service's
        # lines in the panic log.
        wipe=false
        break
      fi
    done < <(awk -v re="$regex" '$0 ~ re {print $1,$2}' "$pl")
    if $found && $wipe; then
      d "wiping $regex"
      if grep -E "$regex" "$pl" |& tee -a "$archive"; then
        v "above is from grep -E $regex"
        sed -ri "/$regex/d" "$pl"
      fi
    fi
  done
  ### end removing panic lines due to service restarts ###

  # EPOCHSECONDS only exists in bash >= 5.0; unset it would evaluate to
  # 0 in the arithmetic below and silently disable both age checks.
  now=${EPOCHSECONDS:-$(date +%s)}

  ## begin broken pipe & write lock & general alert ##
  regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
  newlines=false
  count=0
  while read -r log_day log_time _; do
    log_s=$(date -d "$log_day $log_time" +%s)
    count=$((count+1))
    if (( log_s > now - 300 )); then
      newlines=true
    fi
  done < <(grep "$regex" "$pl" ||:)
  # I see broken pipe in groups of 3 for the same message around once a day
  # randomly. I'm guessing they are related to running 2 instances of
  # exim which share the same spool. So, if we have some, but not in
  # the last 5 minutes, and less than 20, it should be fine to clear
  # them. write lock happens less but can fit under the same rule.
  if (( count > 20 )); then
    pr_metric=1
  elif (( count )) && ! $newlines; then
    grep "$regex" "$pl" |& tee -a "$archive"
    v "above is from grep $regex"
    sed -i "/$regex/d" "$pl"
  fi

  # I think we could alert on anything else older than 61 seconds,
  # but lets just add some slack, make it 2 minutes.
  while read -r log_day log_time _; do
    # some lines dont have dates, just skip them
    # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
    #   can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
    if [[ $log_day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
      continue
    fi
    log_s=$(date -d "$log_day $log_time" +%s)
    if (( now - 120 > log_s )); then
      pr_metric=1
    fi
    # pr_metric for $regex is handled above
  done < <(grep -v "$regex" "$pl" ||:)
  ## end broken pipe ##

  echo "exim_paniclog $pr_metric" >"$prom_file"
}
190
# Daemon mode: run main over and over, pausing 30 seconds after each
# pass. Never returns on its own.
loop-main() {
  for (( ; ; )); do
    main
    sleep 30
  done
}
197
198
# Make sure the archive that main appends removed lines to is writable;
# if not, create it, hand it to group adm and make it group-writable.
[[ -w $pl-archive ]] || {
  touch "$pl-archive"
  chgrp adm "$pl-archive"
  chmod 664 "$pl-archive"
}
204
# With parent pid 1 keep running as a daemon; otherwise do a single pass.
case $PPID in
  1) loop-main ;;
  *) main ;;
esac