pl=/var/log/exim4/paniclog
main() {
+ pr_metric=0
if [[ ! -s $pl ]]; then
+ echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
return 0
fi
grep "$regex" $pl >> $pl-archive ||:
sed -i "/$regex/d" $pl
- ## begin broken pipe & write lock ##
- regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
- now_s=$(date +%s)
- newlines=false
- count=0
- while read -r day time _; do
- log_s=$(date -d "$day $time" +%s)
- count=$((count+1))
- if (( now_s - 300 > log_s )); then
- newlines=true
- fi
- done < <(grep "$regex" $pl ||:)
- if (( count )); then
- # I see broken pipe in groups of 3 for the same message around once a day
- # randomly. I'm guessing they are related to running 2 instances of
- # exim which share the same spool. So, if we have some, but not in
- # the last 5 minutes, and less than 20, it should be fine to clear
- # them. write lock happens less but can fit under the same rule.
- if (( count > 20 )); then
- cat $pl
- elif ! $newlines; then
- grep "$regex" $pl >>$pl-archive
- sed -i "/$regex/d" $pl
- fi
- fi
- ## end broken pipe ##
-
while read -r service regex; do
found=false
wipe=true
d "$d1 $d2"
found=true
tmptime=$(date -d "$d1 $d2" +%s)
- # dont consider every matching line, just those in > 60 second intervals
+ # Checking the journal takes a second or two, so
+ # don't consider every matching line, just those > 60 seconds apart. We are
+ # testing the journal for 60 seconds after the message, so that should be ok.
if [[ ! $logtime ]]; then
logtime=$tmptime
elif (( tmptime > logtime + 60 )); then
clamav-daemon malware acl condition
spamassassin spam acl condition
EOF
+
+ ## begin broken pipe & write lock & general alert ##
+ regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
+ newlines=false
+ count=0
+ while read -r day time _; do
+ log_s=$(date -d "$day $time" +%s)
+ count=$((count+1))
+ if (( log_s > EPOCHSECONDS - 300 )); then
+ newlines=true
+ fi
+ done < <(grep "$regex" $pl ||:)
+ if (( count )); then
+ # I see broken pipe in groups of 3 for the same message around once a day
+ # randomly. I'm guessing they are related to running 2 instances of
+ # exim which share the same spool. So, if we have some, but none in
+ # the last 5 minutes, and no more than 20, it should be fine to clear
+ # them. Write lock happens less often but can fit under the same rule.
+ if (( count > 20 )); then
+ pr_metric=1
+ elif ! $newlines; then
+ grep "$regex" $pl >>$pl-archive
+ sed -i "/$regex/d" $pl
+ fi
+ fi
+
+ # I think we could alert on anything else older than 61 seconds,
+ # but let's just add some slack and make it 5 minutes.
+ while read -r day time _; do
+ log_s=$(date -d "$day $time" +%s)
+ if (( log_s < EPOCHSECONDS - 300 )); then
+ pr_metric=1
+ fi
+ done < <(grep -v "$regex" $pl ||:)
+ ## end broken pipe & write lock & general alert ##
+
+ echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
+
}
-if [[ $INVOCATION_ID ]]; then
- # this is to prevent systemd from filling up the journal
- for (( runcount=0; runcount < 100; runcount++ )); do
+loop-main() {
+ while true; do
main
sleep 30
done
+}
+
+if [[ $INVOCATION_ID ]]; then
+ loop-main
else
main
fi