host info updates
[distro-setup] / epanic-clean
1 #!/bin/bash
2 # I, Ian Kelling, follow the GNU license recommendations at
3 # https://www.gnu.org/licenses/license-recommendations.en.html. They
4 # recommend that small programs, < 300 lines, be licensed under the
5 # Apache License 2.0. This file contains or is part of one or more small
6 # programs. If a small program grows beyond 300 lines, I plan to switch
7 # its license to GPL.
8
9 # Copyright 2024 Ian Kelling
10
11 # Licensed under the Apache License, Version 2.0 (the "License");
12 # you may not use this file except in compliance with the License.
13 # You may obtain a copy of the License at
14
15 # http://www.apache.org/licenses/LICENSE-2.0
16
17 # Unless required by applicable law or agreed to in writing, software
18 # distributed under the License is distributed on an "AS IS" BASIS,
19 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 # See the License for the specific language governing permissions and
21 # limitations under the License.
22
23
24 # The panic log regularly gets some stuff in it we don't want to fix.
25 # Detect it and wipe it out.
26
# This script relies on bash-only features; bail out early under any
# other shell.
if [ -z "$BASH_VERSION" ]; then
  echo "error: shell is not bash" >&2
  exit 1
fi

# Strict mode: exit on errors (also inside functions and command
# substitutions), fail pipelines on any failing stage, and report the
# failing command on stderr.
shopt -s inherit_errexit 2>/dev/null ||: # ignore fail in bash < 4.4
set -eE -o pipefail
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR

# Re-execute ourselves through sudo when not already running as root.
if [[ $EUID != 0 ]]; then
  exec sudo -E "${BASH_SOURCE[0]}" "$@"
fi

# Any non-empty first argument switches on debug output (see d()).
if [[ -n $1 ]]; then
  debug=true
else
  debug=false
fi

# Informational output (see v()) is always on.
verbose=true
41
# Debug print: writes all arguments, joined by spaces, as one line on
# stdout, but only when the global $debug evaluates as a true command.
d() {
  $debug || return 0
  printf "%s\n" "$*"
}
# Verbose print: writes all arguments, joined by spaces, as one line on
# stdout, but only when the global $verbose evaluates as a true command.
v() {
  if ! $verbose; then
    return 0
  fi
  printf "%s\n" "$*"
}
52
53
# Path of the exim panic log. Wiped lines are appended to "$pl-archive".
pl=/var/log/exim4/paniclog

# Publish the exim_paniclog metric as a prometheus text file.
#   $1 - metric value (0 or 1)
# The destination defaults to the node-exporter directory and can be
# redirected with the EXIM_PANICLOG_PROM environment variable. The value
# is written to a temporary name and renamed into place so that a reader
# never observes a partially written file.
epanic-write-metric() {
  local prom_file=${EXIM_PANICLOG_PROM:-/var/lib/prometheus/node-exporter/exim_paniclog.prom}
  echo "exim_paniclog $1" >"$prom_file.$$"
  mv -f -- "$prom_file.$$" "$prom_file"
}

# One cleaning pass over the panic log.
#
# Globals read: pl, EPOCHSECONDS. Calls d() and v() for output.
# Steps:
#   1. An empty or missing panic log publishes metric 0 and returns.
#   2. Known-harmless lines are copied to $pl-archive and deleted from $pl:
#      port 25 bind failures (only when the backup.conf drop-in exists)
#      and broken-pipe lines logged at 00:00:0x.
#   3. "malware/spam acl condition" lines are deleted when the journal of
#      the matching service shows it starting within 60 seconds of them.
#   4. Write-lock / broken-pipe lines: more than 20 raises the metric;
#      otherwise they are deleted once none is newer than 5 minutes.
#   5. Any other dated line older than 2 minutes raises the metric.
#   6. The metric (0 or 1) is published.
main() {
  local pr_metric=0
  local regex service found wipe logtime description jrregex
  local d1 d2 tmptime sec_min sec_max jmin jmax
  local newlines count day time log_s

  if [[ ! -s $pl ]]; then
    epanic-write-metric "$pr_metric"
    return 0
  fi

  # example line:
  # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
  if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
    regex="socket bind() to port 25 for address"
    if grep "$regex" "$pl" |& tee -a "$pl-archive"; then
      v "above is from grep $regex"
      sed -i "/$regex/d" "$pl"
    fi
  fi

  # this is a strange message due to running as nonroot
  # regex='exim user lost privilege for using -C option'
  # sed -i "/$regex/d" $pl

  # seems to randomly be caused by
  # Starting exim4-base housekeeping, exim4-base.service
  regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
  if grep "$regex" "$pl" |& tee -a "$pl-archive"; then
    v "above is from grep $regex"
    sed -i "/$regex/d" "$pl"
  fi

  ### begin removing panic lines due to service restarts ###
  while read -r service regex; do
    found=false
    wipe=true
    # The de-duplication timestamp must start empty for every service.
    # If it leaked from the previous service (or a previous call), lines
    # logged within 20 seconds of that stale timestamp would all be
    # skipped and never wiped.
    logtime=
    jrregex=
    d "$service $regex"
    while read -r d1 d2; do
      d "$d1 $d2"
      tmptime=$(date -d "$d1 $d2" +%s)
      # Checking the journal takes a second or two, so
      # don't consider every matching line, just those > 20 seconds apart. We are
      # testing the journal for 60 seconds after the message, so should be ok.
      # It probably makes sense to even check for >59 seconds apart, using 20
      # seconds to be conservative.
      if [[ $logtime ]] && (( tmptime <= logtime + 20 )); then
        continue
      fi
      logtime=$tmptime
      found=true
      sec_min=$((logtime - 60))
      sec_max=$((logtime + 60))
      jmin="$(date -d "@$sec_min" "+%F %H:%M:%S")"
      jmax="$(date -d "@$sec_max" "+%F %H:%M:%S")"
      # The unit description is the same for every line of a service, so
      # look it up only once, and only when a line actually needs it.
      if [[ ! $jrregex ]]; then
        description=$(systemctl cat "$service" | sed -rn 's/^ *Description=(.*)/\1/p')
        jrregex="^Starting $description"
        if [[ $service == spamassassin ]]; then
          jrregex+="\|^spamd: restarting"
        fi
      fi
      d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
      # the sed clears out the initial time and process+pid
      if journalctl -u "$service" -S "$jmin" -U "$jmax" \
        | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep "$jrregex" &>/dev/null; then
        v "messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':"
      else
        v "PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
        wipe=false
        break
      fi
    done < <(awk "/$regex/ "'{print $1,$2}' "$pl")
    if $found && $wipe; then
      d "wiping $regex"
      if grep -E "$regex" "$pl" |& tee -a "$pl-archive"; then
        v "above is from grep -E $regex"
        sed -ri "/$regex/d" "$pl"
      fi
    fi
  done <<'EOF'
clamav-daemon malware acl condition
spamassassin spam acl condition
EOF
  ### end removing panic lines due to service restarts ###


  ## begin broken pipe & write lock & general alert ##
  regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
  newlines=false
  count=0
  while read -r day time _; do
    log_s=$(date -d "$day $time" +%s)
    count=$((count+1))
    if (( log_s > EPOCHSECONDS - 300 )); then
      newlines=true
    fi
  done < <(grep "$regex" "$pl" ||:)
  if (( count )); then
    # I see broken pipe in groups of 3 for the same message around once a day
    # randomly. I'm guessing they are related to running 2 instances of
    # exim which share the same spool. So, if we have some, but not in
    # the last 5 minutes, and less than 20, it should be fine to clear
    # them. write lock happens less but can fit under the same rule.
    if (( count > 20 )); then
      pr_metric=1
    elif ! $newlines; then
      grep "$regex" "$pl" |& tee -a "$pl-archive"
      v "above is from grep $regex"
      sed -i "/$regex/d" "$pl"
    fi
  fi

  # I think we could alert on anything else older than 61 seconds,
  # but lets just add some slack, make it 2 minutes.
  while read -r day time _; do
    # some lines don't have dates, just skip them
    # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
    # can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
    if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
      continue
    fi
    log_s=$(date -d "$day $time" +%s)
    if (( EPOCHSECONDS - 120 > log_s )); then
      pr_metric=1
    fi
    # pr_metric for $regex is handled above
  done < <(grep -v "$regex" "$pl" ||:)
  ## end broken pipe ##

  epanic-write-metric "$pr_metric"
}
185
# Run main forever, pausing 30 seconds after each pass.
loop-main() {
  while :; do
    main
    sleep 30
  done
}
192
193
# Create the archive file, group adm and mode 664, unless a writable
# one is already there.
[[ -w ${pl}-archive ]] || {
  touch "${pl}-archive"
  chgrp adm "${pl}-archive"
  chmod 664 "${pl}-archive"
}

# With INVOCATION_ID set, keep looping; otherwise do a single pass.
if [[ -z $INVOCATION_ID ]]; then
  main
else
  loop-main
fi