mostly fixes
[distro-setup] / epanic-clean
1 #!/bin/bash
2 # Copyright (C) 2019 Ian Kelling
3 # SPDX-License-Identifier: AGPL-3.0-or-later
4
5 # The panic log regularly gets some stuff in it we dont want to fix.
6 # Detect it and wipe it out.
7
8 if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi
9
10 shopt -s inherit_errexit 2>/dev/null ||: # ignore fail in bash < 4.4
11 set -eE -o pipefail
12 trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
13
14 [[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"
15
16 debug=false
17 if [[ $1 ]]; then
18 debug=true
19 fi
20
21 verbose=true
22
23 d() {
24 if $debug; then
25 printf "%s\n" "$*"
26 fi
27 }
28 v() {
29 if $verbose; then
30 printf "%s\n" "$*"
31 fi
32 }
33
34
35 pl=/var/log/exim4/paniclog
36 main() {
37 pr_metric=0
38 if [[ ! -s $pl ]]; then
39 echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
40 return 0
41 fi
42
43 # example line:
44 # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
45 if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
46 regex="socket bind() to port 25 for address"
47 if grep "$regex" $pl |& tee -a $pl-archive; then
48 v "above is from grep $regex"
49 sed -i "/$regex/d" $pl
50 fi
51 fi
52
53 # this is a strange message due to running as nonroot
54 # regex='exim user lost privilege for using -C option'
55 # sed -i "/$regex/d" $pl
56
57 # seems to randomly be caused by
58 # Starting exim4-base housekeeping, exim4-base.service
59 regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
60 if grep "$regex" $pl |& tee -a $pl-archive; then
61 v "above is from grep $regex"
62 sed -i "/$regex/d" $pl
63 fi
64
65 ### begin removing panic lines due to service restarts ###
66 while read -r service regex; do
67 found=false
68 wipe=true
69 d "$service $regex"
70 while read -r d1 d2; do
71 d "$d1 $d2"
72 tmptime=$(date -d "$d1 $d2" +%s)
73 # Checking the journal takes a second or two, so
74 # dont consider every matching line, just those > 20 seconds apart. We are
75 # testing the journal for 60 seconds after the message, so should be ok.
76 # It probably makes sense to even check for >59 seconds apart, using 20
77 # seconds to be conservative.
78 if [[ ! $logtime ]]; then
79 logtime=$tmptime
80 elif (( tmptime > logtime + 20 )); then
81 logtime=$tmptime
82 else
83 continue
84 fi
85 found=true
86 sec_min=$((logtime - 60))
87 sec_max=$((logtime + 60))
88 jmin="$(date -d @$sec_min "+%F %H:%M:%S")"
89 jmax="$(date -d @$sec_max "+%F %H:%M:%S")"
90 description=$(systemctl cat $service | sed -rn 's/^ *Description=(.*)/\1/p')
91 jrregex="^Starting $description"
92 if [[ $service == spamassassin ]]; then
93 jrregex+="\|^spamd: restarting"
94 fi
95 d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
96 # the sed clears out the initial time and process+pid
97 if journalctl -u $service -S "$jmin" -U "$jmax" \
98 | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep "$jrregex" &>/dev/null; then
99 v "messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':"
100 else
101 v "PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
102 wipe=false
103 break
104 fi
105 done < <(awk "/$regex/ "'{print $1,$2}' $pl)
106 if $found && $wipe; then
107 d "wiping $regex"
108 if grep -E "$regex" $pl |& tee -a $pl-archive; then
109 v "above is from grep -E $regex"
110 sed -ri "/$regex/d" $pl
111 fi
112 fi
113 done <<'EOF'
114 clamav-daemon malware acl condition
115 spamassassin spam acl condition
116 EOF
117 ### end removing panic lines due to service restarts ###
118
119
120 ## begin broken pipe & write lock & general alert ##
121 regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
122 newlines=false
123 count=0
124 while read -r day time _; do
125 log_s=$(date -d "$day $time" +%s)
126 count=$((count+1))
127 if (( log_s > EPOCHSECONDS - 300 )); then
128 newlines=true
129 fi
130 done < <(grep "$regex" $pl ||:)
131 if (( count )); then
132 # I see broken pipe in groups of 3 for the same message around once a day
133 # randomly. I'm guessing they are related to running 2 instances of
134 # exim which share the same spool. So, if we have some, but not in
135 # the last 5 minutes, and less than 20, it should be fine to clear
136 # them. write lock happens less but can fit under the same rule.
137 if (( count > 20 )); then
138 pr_metric=1
139 elif ! $newlines; then
140 grep "$regex" $pl |& tee -a $pl-archive
141 v "above is from grep $regex"
142 sed -i "/$regex/d" $pl
143 fi
144 fi
145
146 # I think we could alert on anything else older than 61 seconds,
147 # but lets just add some slack, make it 2 minutes.
148 while read -r day time _; do
149 # some lines dont have dates, just skip them
150 # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
151 # can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
152 if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
153 continue
154 fi
155 log_s=$(date -d "$day $time" +%s)
156 if (( EPOCHSECONDS - 120 > log_s )); then
157 pr_metric=1
158 fi
159 # pr_metrix for $regex is handled above
160 done < <(grep -v "$regex" $pl ||:)
161 ## end broken pipe ##
162
163 echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
164
165 }
166
167 loop-main() {
168 while true; do
169 main
170 sleep 30
171 done
172 }
173
174
175 if [[ ! -w $pl-archive ]]; then
176 touch $pl-archive
177 chgrp adm $pl-archive
178 chmod 664 $pl-archive
179 fi
180
181 if [[ $INVOCATION_ID ]]; then
182 loop-main
183 else
184 main
185 fi