2 # I, Ian Kelling, follow the GNU license recommendations at
3 # https://www.gnu.org/licenses/license-recommendations.en.html. They
4 # recommend that small programs, < 300 lines, be licensed under the
5 # Apache License 2.0. This file contains or is part of one or more small
6 # programs. If a small program grows beyond 300 lines, I plan to switch
9 # Copyright 2024 Ian Kelling
11 # Licensed under the Apache License, Version 2.0 (the "License");
12 # you may not use this file except in compliance with the License.
13 # You may obtain a copy of the License at
15 # http://www.apache.org/licenses/LICENSE-2.0
17 # Unless required by applicable law or agreed to in writing, software
18 # distributed under the License is distributed on an "AS IS" BASIS,
19 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 # See the License for the specific language governing permissions and
21 # limitations under the License.
24 # The panic log regularly gets some stuff in it we don't want to fix.
25 # Detect it and wipe it out.
# Refuse to run under anything other than bash. Only POSIX test syntax is
# used here so the check itself still parses when started by a plain sh.
if [ -z "$BASH_VERSION" ]; then
  echo "error: shell is not bash" >&2
  exit 1
fi

# Let command substitutions inherit errexit. The option is unknown to
# bash < 4.4, so a failure to enable it is deliberately ignored.
shopt -s inherit_errexit 2>/dev/null || true

# When the ERR trap fires, report script name, line number, the failing
# command and its exit status on stderr.
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR

# Not root yet: replace this process with the same script run through
# sudo, keeping the environment (-E) and the original arguments.
if [[ $EUID != 0 ]]; then
  exec sudo -E "${BASH_SOURCE[0]}" "$@"
fi
# Work out the systemd unit name used for spamd: when `systemctl cat` can
# show a unit called spamassassin, record that name in spamd_ser. All
# output of the probe is discarded; only its exit status matters.
54 if systemctl
cat spamassassin
&>/dev
/null
; then
55 spamd_ser
=spamassassin
# pl: path of the exim4 panic log that the rest of the script inspects
# and cleans.
59 pl
=/var
/log
/exim
4/paniclog
# Panic log missing or empty (-s is false): write the exim_paniclog metric
# line to the .prom file under /var/lib/prometheus/node-exporter.
# NOTE(review): pr_metric is not assigned in the visible code; confirm the
# value it holds at this point.
62 if [[ ! -s $pl ]]; then
63 echo "exim_paniclog $pr_metric" >/var
/lib
/prometheus
/node-exporter
/exim_paniclog.prom
# Only when the exim4 backup.conf file exists under
# /etc/systemd/system/exim4.service.d: copy panic-log lines about a failed
# bind to port 25 into $pl-archive (tee -a appends; |& also forwards grep's
# stderr), then delete those lines from the panic log. The sed runs only
# if the grep/tee pipeline succeeded. Example of such a line:
68 # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
69 if [[ -e /etc
/systemd
/system
/exim4.service.d
/backup.conf
]]; then
70 regex
="socket bind() to port 25 for address"
71 if grep "$regex" $pl |
& tee -a $pl-archive; then
# NOTE(review): v is a helper defined outside this section, presumably a
# verbose-message printer; confirm.
72 v
"above is from grep $regex"
73 sed -i "/$regex/d" $pl
# Disabled cleanup, kept for reference:
77 # this is a strange message due to running as nonroot
78 # regex='exim user lost privilege for using -C option'
79 # sed -i "/$regex/d" $pl
# Archive, then delete, "Broken pipe" panic lines whose second field (the
# time) starts with 00:00:0, i.e. 00:00:00 through 00:00:09. The sed
# runs only if the grep/tee pipeline succeeded.
81 # seems to randomly be caused by
82 # Starting exim4-base housekeeping, exim4-base.service
83 regex
="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
84 if grep "$regex" $pl |
& tee -a $pl-archive; then
85 v
"above is from grep $regex"
86 sed -i "/$regex/d" $pl
89 ### begin removing panic lines due to service restarts ###
# For each "service regex" pair read by the outer loop:
#   1. awk prints the first two fields (date and time) of every panic-log
#      line matching regex; the inner loop turns them into epoch seconds.
#   2. The journal of that service is searched, within 60 seconds either
#      side of logtime, for a line starting with "Starting <Description>"
#      (or "spamd: restarting" when the service is $spamd_ser).
#   3. When both found and wipe are true, the matching panic-log lines are
#      appended to $pl-archive and deleted from the panic log.
# NOTE(review): the two "service regex" rows near the end of this section
# (clamav-daemon / $spamd_ser) look like the outer loop's input, presumably
# a here-document; confirm. Also confirm where logtime, found and wipe are
# assigned, and what the helpers v and d print.
90 while read -r service regex
; do
94 while read -r d1 d2
; do
96 tmptime
=$
(date -d "$d1 $d2" +%s
)
97 # Checking the journal takes a second or two, so
98 # don't consider every matching line, just those > 20 seconds apart. We are
99 # testing the journal for 60 seconds after the message, so should be ok.
100 # It probably makes sense to even check for >59 seconds apart, using 20
101 # seconds to be conservative.
102 if [[ ! $logtime ]]; then
104 elif (( tmptime
> logtime
+ 20 )); then
# Journal search window: from 60 seconds before to 60 seconds after
# logtime, formatted as "date time" strings for journalctl -S / -U.
110 sec_min
=$
((logtime
- 60))
111 sec_max
=$
((logtime
+ 60))
112 jmin
="$(date -d @$sec_min "+%F
%H
:%M
:%S
")"
113 jmax
="$(date -d @$sec_max "+%F
%H
:%M
:%S
")"
# description: the text after Description= in the unit file(s) printed by
# systemctl cat for this service.
114 description
=$
(systemctl
cat $service |
sed -rn 's/^ *Description=(.*)/\1/p')
# jrregex is a basic (non -E) grep pattern, so \| is the alternation.
115 jrregex
="^Starting $description"
116 if [[ $service == "$spamd_ser" ]]; then
117 jrregex
+="\|^spamd: restarting"
119 d
"jrregex=$jrregex jmin=$jmin jmax=$jmax"
120 # the sed clears out the initial time and process+pid
# (it strips the first five whitespace-separated fields of each journal line)
121 if journalctl
-u $service -S "$jmin" -U "$jmax" \
122 |
sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' |
grep "$jrregex" &>/dev
/null
; then
123 v
"messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':"
125 v
"PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
129 done < <(awk "/$regex/ "'{print $1,$2}' $pl)
130 if $found && $wipe; then
132 if grep -E "$regex" $pl |
& tee -a $pl-archive; then
133 v
"above is from grep -E $regex"
134 sed -ri "/$regex/d" $pl
138 clamav-daemon malware acl condition
139 $spamd_ser spam acl condition
141 ### end removing panic lines due to service restarts ###
144 ## begin broken pipe & write lock & general alert ##
# regex is a basic grep pattern (\| is alternation) matching "Failed to get
# write lock" lines and lines ending in the "Broken pipe" transport error.
145 regex
="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
# First pass: read date and time from every line matching regex, convert
# them to epoch seconds and test whether the line is less than 300 seconds
# old. `|| :` keeps the process substitution from failing when grep finds
# nothing.
# NOTE(review): count and newlines, tested further down, are presumably
# updated in this loop; confirm.
148 while read -r day
time _
; do
149 log_s
=$
(date -d "$day $time" +%s
)
151 if (( log_s
> EPOCHSECONDS
- 300 )); then
154 done < <(grep "$regex" $pl ||
:)
156 # I see broken pipe in groups of 3 for the same message around once a day
157 # randomly. I'm guessing they are related to running 2 instances of
158 # exim which share the same spool. So, if we have some, but not in
159 # the last 5 minutes, and less than 20, it should be fine to clear
160 # them. write lock happens less but can fit under the same rule.
161 if (( count
> 20 )); then
# Not more than 20 matches and newlines is false: append the matching
# lines to $pl-archive, then delete them from the panic log.
163 elif ! $newlines; then
164 grep "$regex" $pl |
& tee -a $pl-archive
165 v
"above is from grep $regex"
166 sed -i "/$regex/d" $pl
170 # I think we could alert on anything else older than 61 seconds,
171 # but let's just add some slack, make it 2 minutes.
# Second pass: every panic-log line that does NOT match regex.
172 while read -r day
time _
; do
173 # some lines don't have dates, just skip them
174 # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
175 # can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
# The glob below accepts only a first field shaped like YYYY-MM-DD with a
# leading digit of 2-9.
176 if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
179 log_s
=$
(date -d "$day $time" +%s
)
# True when the line is more than 120 seconds old.
180 if (( EPOCHSECONDS
- 120 > log_s
)); then
183 # pr_metric for $regex is handled above
184 done < <(grep -v "$regex" $pl ||
:)
185 ## end broken pipe ##
# Publish the final value of pr_metric as the exim_paniclog metric,
# overwriting the .prom file under /var/lib/prometheus/node-exporter.
# NOTE(review): pr_metric is assigned outside the visible code; confirm
# its possible values.
187 echo "exim_paniclog $pr_metric" >/var
/lib
/prometheus
/node-exporter
/exim_paniclog.prom
# When the archive file fails the -w (writable) test, give it group adm
# and mode 664 (read/write for owner and group, read-only for others).
199 if [[ ! -w $pl-archive ]]; then
201 chgrp adm
$pl-archive
202 chmod 664 $pl-archive
205 if [[ $PPID == 1 ]]; then