2 # I, Ian Kelling, follow the GNU license recommendations at
3 # https://www.gnu.org/licenses/license-recommendations.en.html. They
4 # recommend that small programs, < 300 lines, be licensed under the
5 # Apache License 2.0. This file contains or is part of one or more small
6 # programs. If a small program grows beyond 300 lines, I plan to switch
9 # Copyright 2024 Ian Kelling
11 # Licensed under the Apache License, Version 2.0 (the "License");
12 # you may not use this file except in compliance with the License.
13 # You may obtain a copy of the License at
15 # http://www.apache.org/licenses/LICENSE-2.0
17 # Unless required by applicable law or agreed to in writing, software
18 # distributed under the License is distributed on an "AS IS" BASIS,
19 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 # See the License for the specific language governing permissions and
21 # limitations under the License.
24 # The panic log regularly gets some stuff in it we don't want to fix.
25 # Detect it and wipe it out.
# Refuse to run under any shell other than bash: BASH_VERSION is only set
# by bash, and the rest of the script uses bash-only syntax.
if [ -z "$BASH_VERSION" ]; then
  echo "error: shell is not bash" >&2
  exit 1
fi

# inherit_errexit only exists in bash >= 4.4; on older versions the shopt
# call fails, which is deliberately ignored.
shopt -s inherit_errexit 2>/dev/null || :

# Whenever the ERR trap fires, report script name, line number, the failing
# command and its exit status on stderr.
# NOTE(review): no `set -e`/`set -E` is visible in this part of the file;
# confirm it is enabled elsewhere if the script is meant to abort on errors.
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
# When not running as root, replace this process with the same script run
# through sudo. -E asks sudo to preserve the caller's environment, and the
# original arguments are passed through unchanged.
[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"
# Path of the exim panic log that the rest of the script inspects and edits.
pl=/var/log/exim4/paniclog
# When the panic log does not exist or is empty (the -s test fails), write
# "exim_paniclog $pr_metric" to
# /var/lib/prometheus/node-exporter/exim_paniclog.prom.
# NOTE(review): pr_metric is not assigned in the visible lines, and the
# statements that end this `if` are not visible here - confirm both.
57 if [[ ! -s $pl ]]; then
58 echo "exim_paniclog $pr_metric" >/var
/lib
/prometheus
/node-exporter
/exim_paniclog.prom
# Example of the panic log line removed here:
# 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
#
# Only applies when the exim4.service drop-in backup.conf exists. Matching
# lines are shown and appended to the archive file (stdout and stderr both
# go through tee) and, if that pipeline succeeds, deleted from the panic log.
# v is a helper defined outside this section.
if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
  regex="socket bind() to port 25 for address"
  if grep "$regex" "$pl" |& tee -a "$pl-archive"; then
    v "above is from grep $regex"
    sed -i "/$regex/d" "$pl"
  fi
fi
72 # this is a strange message due to running as nonroot
73 # regex='exim user lost privilege for using -C option'
74 # sed -i "/$regex/d" $pl
# seems to randomly be caused by
# Starting exim4-base housekeeping, exim4-base.service
#
# Matches broken-pipe lines whose time field starts with 00:00:0. They are
# shown and appended to the archive file and, if that pipeline succeeds,
# deleted from the panic log. v is a helper defined outside this section.
regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
if grep "$regex" "$pl" |& tee -a "$pl-archive"; then
  v "above is from grep $regex"
  sed -i "/$regex/d" "$pl"
fi
# Overview of this section, as far as the visible lines show:
# - The outer loop reads a service name and a regex per input row. The two
#   rows near the end (clamav-daemon / spamassassin) are presumably that
#   input, fed via a here-document - TODO confirm, the redirection is not
#   visible.
# - The inner loop reads the first two fields (date and time) of every panic
#   log line matching the regex, as printed by awk, and converts them to
#   epoch seconds in tmptime.
# - A journal window from logtime-60s to logtime+60s is formatted into
#   jmin/jmax, the unit's Description= is read from `systemctl cat`, and
#   jrregex is built as "^Starting <description>" (plus "^spamd: restarting"
#   for spamassassin).
# - journalctl output for the unit in that window has its first five
#   whitespace-separated fields stripped and is searched for jrregex; v then
#   logs either "messages worth wiping" or "PANIC: message not found".
# - When both $found and $wipe run successfully as commands, matching panic
#   log lines are appended to $pl-archive and removed with sed -ri.
# NOTE(review): the assignments of logtime, found and wipe, several branch
# bodies, and the closing fi/done lines are not visible here; v and d are
# helpers defined elsewhere.
84 ### begin removing panic lines due to service restarts ###
85 while read -r service regex
; do
89 while read -r d1 d2
; do
91 tmptime
=$
(date -d "$d1 $d2" +%s
)
92 # Checking the journal takes a second or two, so
93 # dont consider every matching line, just those > 20 seconds apart. We are
94 # testing the journal for 60 seconds after the message, so should be ok.
95 # It probably makes sense to even check for >59 seconds apart, using 20
96 # seconds to be conservative.
97 if [[ ! $logtime ]]; then
99 elif (( tmptime
> logtime
+ 20 )); then
105 sec_min
=$
((logtime
- 60))
106 sec_max
=$
((logtime
+ 60))
107 jmin
="$(date -d @$sec_min "+%F
%H
:%M
:%S
")"
108 jmax
="$(date -d @$sec_max "+%F
%H
:%M
:%S
")"
109 description
=$
(systemctl
cat $service |
sed -rn 's/^ *Description=(.*)/\1/p')
110 jrregex
="^Starting $description"
111 if [[ $service == spamassassin
]]; then
112 jrregex
+="\|^spamd: restarting"
114 d
"jrregex=$jrregex jmin=$jmin jmax=$jmax"
115 # the sed clears out the initial time and process+pid
116 if journalctl
-u $service -S "$jmin" -U "$jmax" \
117 |
sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' |
grep "$jrregex" &>/dev
/null
; then
118 v
"messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':"
120 v
"PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
124 done < <(awk "/$regex/ "'{print $1,$2}' $pl)
# Archive and delete the matching lines only when both flag commands succeed.
125 if $found && $wipe; then
127 if grep -E "$regex" $pl |
& tee -a $pl-archive; then
128 v
"above is from grep -E $regex"
129 sed -ri "/$regex/d" $pl
133 clamav-daemon malware acl condition
134 spamassassin spam acl condition
136 ### end removing panic lines due to service restarts ###
# Overview of this section, as far as the visible lines show:
# - regex matches either "Failed to get write lock" or a line ending in
#   "Failed writing transport results to pipe: Broken pipe".
# - The loop reads the date and time fields of each matching panic log line
#   (`|| :` keeps a no-match grep from failing), converts them to epoch
#   seconds and tests whether the line is newer than EPOCHSECONDS - 300,
#   i.e. within the last five minutes.
# - Afterwards, if count is above 20 one branch runs; otherwise, when the
#   $newlines command fails, the matching lines are appended to $pl-archive
#   and deleted from the panic log.
# NOTE(review): the initialisation and updates of count and newlines, the
# branch bodies and the closing fi lines are not visible here.
139 ## begin broken pipe & write lock & general alert ##
140 regex
="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
143 while read -r day
time _
; do
144 log_s
=$
(date -d "$day $time" +%s
)
146 if (( log_s
> EPOCHSECONDS
- 300 )); then
149 done < <(grep "$regex" $pl ||
:)
151 # I see broken pipe in groups of 3 for the same message around once a day
152 # randomly. I'm guessing they are related to running 2 instances of
153 # exim which share the same spool. So, if we have some, but not in
154 # the last 5 minutes, and less than 20, it should be fine to clear
155 # them. write lock happens less but can fit under the same rule.
156 if (( count
> 20 )); then
158 elif ! $newlines; then
159 grep "$regex" $pl |
& tee -a $pl-archive
160 v
"above is from grep $regex"
161 sed -i "/$regex/d" $pl
# Overview of this loop, as far as the visible lines show:
# - It reads every panic log line that does NOT match $regex (grep -v;
#   `|| :` keeps an empty result from failing) and takes the first two
#   fields as day and time.
# - Lines whose first field does not look like a YYYY-MM-DD date (first
#   digit 2-9) take the first branch.
# - For dated lines the timestamp is converted to epoch seconds and tested
#   for being more than 120 seconds older than EPOCHSECONDS.
# NOTE(review): the bodies of both `if` branches and their closing fi lines
# are not visible here, so what happens in each case cannot be confirmed.
165 # I think we could alert on anything else older than 61 seconds,
166 # but lets just add some slack, make it 2 minutes.
167 while read -r day
time _
; do
168 # some lines dont have dates, just skip them
169 # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
170 # can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
171 if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
174 log_s
=$
(date -d "$day $time" +%s
)
175 if (( EPOCHSECONDS
- 120 > log_s
)); then
178 # pr_metrix for $regex is handled above
179 done < <(grep -v "$regex" $pl ||
:)
180 ## end broken pipe ##
# Publish the current value of pr_metric as the exim_paniclog metric,
# overwriting the .prom file in the node-exporter directory.
echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
# When $pl-archive is not writable by the current user (which also covers
# the file not existing), set its group to adm and its mode to 664.
# NOTE(review): any statement between the `if` and chgrp, and the closing
# fi, are not visible here - confirm.
194 if [[ ! -w $pl-archive ]]; then
196 chgrp adm
$pl-archive
197 chmod 664 $pl-archive
200 if [[ $INVOCATION_ID ]]; then