2 # Copyright (C) 2019 Ian Kelling
3 # SPDX-License-Identifier: AGPL-3.0-or-later
# The panic log regularly gets some stuff in it that we don't want to fix.
# Detect it and wipe it out.
# Refuse to run under any shell other than bash: the rest of the script uses
# bash-only features ([[ ]], shopt, BASH_SOURCE, the ERR trap).
if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi

# Let command substitutions inherit errexit. The option does not exist before
# bash 4.4, so the error is silenced and the failure deliberately ignored.
shopt -s inherit_errexit 2>/dev/null || : # ignore fail in bash < 4.4

# On any failing command, report script name, line number, the command text
# and its exit status on stderr.
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
# Unless already running as root, replace this process with the same script
# run through sudo, keeping the environment (-E) and the original arguments.
[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"
# Path of the exim panic log that the rest of the script reads and prunes.
pl=/var/log/exim4/paniclog
32 if [[ ! -s $pl ]]; then
33 echo "exim_paniclog $pr_metric" >/var
/lib
/prometheus
/node-exporter
/exim_paniclog.prom
# Example of the message handled here:
# 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
#
# Only when the exim4 systemd drop-in backup.conf exists: copy the matching
# lines to the archive file, then delete them from the panic log.
if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
  # grep and sed both use basic regex here, so "()" matches literal parens.
  regex="socket bind() to port 25 for address"
  # grep exits non-zero when nothing matches; that is not an error here.
  grep -- "$regex" "$pl" >>"${pl}-archive" || :
  sed -i "/$regex/d" "$pl"
fi
45 # this is a strange message due to running as nonroot
46 # regex='exim user lost privilege for using -C option'
47 # sed -i "/$regex/d" $pl
# seems to randomly be caused by
# Starting exim4-base housekeeping, exim4-base.service
#
# Archive, then delete, "Broken pipe" transport-result lines whose time field
# starts with 00:00:0 (the first ten seconds after midnight).
regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
# grep exits non-zero when nothing matches; that is not an error here.
grep -- "$regex" "$pl" >>"${pl}-archive" || :
sed -i "/$regex/d" "$pl"
55 while read -r service regex
; do
59 while read -r d1 d2
; do
62 tmptime
=$
(date -d "$d1 $d2" +%s
)
# Checking the journal takes a second or two, so
# don't consider every matching line, just those > 60 seconds apart. We are
# testing the journal for 60 seconds after the message, so that should be ok.
66 if [[ ! $logtime ]]; then
68 elif (( tmptime
> logtime
+ 60 )); then
73 sec_min
=$
((logtime
- 60))
74 sec_max
=$
((logtime
+ 60))
75 jmin
="$(date -d @$sec_min "+%F
%H
:%M
:%S
")"
76 jmax
="$(date -d @$sec_max "+%F
%H
:%M
:%S
")"
77 description
=$
(systemctl
cat $service |
sed -rn 's/^ *Description=(.*)/\1/p')
78 jrregex
="^Starting $description"
79 if [[ $service == spamassassin
]]; then
80 jrregex
+="\|^spamd: restarting"
82 d
"jrregex=$jrregex jmin=$jmin jmax=$jmax"
83 # the sed clears out the initial time and process+pid
84 if ! journalctl
-u $service -S "$jmin" -U "$jmax" \
85 |
sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' |
grep "$jrregex" &>/dev
/null
; then
89 done < <(awk "/$regex/ "'{print $1,$2}' $pl)
90 if $found && $wipe; then
92 if [[ ! -w $pl-archive ]]; then
97 grep -E "$regex" $pl >> $pl-archive ||
:
98 sed -ri "/$regex/d" $pl
101 clamav-daemon malware acl condition
102 spamassassin spam acl condition
105 ## begin broken pipe & write lock & general alert ##
106 regex
="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
109 while read -r day
time _
; do
110 log_s
=$
(date -d "$day $time" +%s
)
112 if (( log_s
> EPOCHSECONDS
- 300 )); then
115 done < <(grep "$regex" $pl ||
:)
117 # I see broken pipe in groups of 3 for the same message around once a day
118 # randomly. I'm guessing they are related to running 2 instances of
119 # exim which share the same spool. So, if we have some, but not in
120 # the last 5 minutes, and less than 20, it should be fine to clear
121 # them. write lock happens less but can fit under the same rule.
122 if (( count
> 20 )); then
124 elif ! $newlines; then
125 grep "$regex" $pl >>$pl-archive
126 sed -i "/$regex/d" $pl
# I think we could alert on anything else older than 61 seconds,
# but let's just add some slack and make it 5 minutes.
132 while read -r day
time _
; do
133 log_s
=$
(date -d "$day $time" +%s
)
134 if (( log_s
< EPOCHSECONDS
- 300 )); then
137 done < <(grep -v "$regex" $pl ||
:)
138 ## end broken pipe ##
# Write the exim_paniclog metric (value taken from $pr_metric, set elsewhere in
# the script) to a .prom file under the prometheus node-exporter directory.
printf '%s\n' "exim_paniclog $pr_metric" \
  >/var/lib/prometheus/node-exporter/exim_paniclog.prom
151 if [[ $INVOCATION_ID ]]; then