#!/bin/bash
-# Copyright (C) 2019 Ian Kelling
-# SPDX-License-Identifier: AGPL-3.0-or-later
+# I, Ian Kelling, follow the GNU license recommendations at
+# https://www.gnu.org/licenses/license-recommendations.en.html. They
+# recommend that small programs, < 300 lines, be licensed under the
+# Apache License 2.0. This file contains or is part of one or more small
+# programs. If a small program grows beyond 300 lines, I plan to switch
+# its license to GPL.
+
+# Copyright 2024 Ian Kelling
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# The panic log regularly gets some stuff in it we dont want to fix.
# Detect it and wipe it out.
debug=true
fi
+verbose=true
d() {
if $debug; then
printf "%s\n" "$*"
fi
}
+v() {
+ if $verbose; then
+ printf "%s\n" "$*"
+ fi
+}
+
+spamd_ser=spamd
+if systemctl cat spamassassin &>/dev/null; then
+ spamd_ser=spamassassin
+fi
pl=/var/log/exim4/paniclog
main() {
+ pr_metric=0
if [[ ! -s $pl ]]; then
+ echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
return 0
fi
# 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
regex="socket bind() to port 25 for address"
- grep "$regex" $pl >> $pl-archive ||:
- sed -i "/$regex/d" $pl
+ if grep "$regex" $pl |& tee -a $pl-archive; then
+ v "above is from grep $regex"
+ sed -i "/$regex/d" $pl
+ fi
fi
+ # this is a strange message due to running as nonroot
+ # regex='exim user lost privilege for using -C option'
+ # sed -i "/$regex/d" $pl
+
# seems to randomly be caused by
# Starting exim4-base housekeeping, exim4-base.service
regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
- grep "$regex" $pl >> $pl-archive ||:
- sed -i "/$regex/d" $pl
-
- ## begin broken pipe & write lock ##
- regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
- now_s=$(date +%s)
- newlines=false
- count=0
- while read -r day time _; do
- log_s=$(date -d "$day $time" +%s)
- count=$((count+1))
- if (( now_s - 300 > log_s )); then
- newlines=true
- fi
- done < <(grep "$regex" $pl ||:)
- if (( count )); then
- # I see broken pipe in groups of 3 for the same message around once a day
- # randomly. I'm guessing they are related to running 2 instances of
- # exim which share the same spool. So, if we have some, but not in
- # the last 5 minutes, and less than 20, it should be fine to clear
- # them. write lock happens less but can fit under the same rule.
- if (( count > 20 )); then
- cat $pl
- elif ! $newlines; then
- grep "$regex" $pl >>$pl-archive
- sed -i "/$regex/d" $pl
- fi
+ if grep "$regex" $pl |& tee -a $pl-archive; then
+ v "above is from grep $regex"
+ sed -i "/$regex/d" $pl
fi
- ## end broken pipe ##
+ ### begin removing panic lines due to service restarts ###
while read -r service regex; do
found=false
wipe=true
d "$service $regex"
while read -r d1 d2; do
d "$d1 $d2"
- found=true
tmptime=$(date -d "$d1 $d2" +%s)
- # dont consider every matching line, just those in > 60 second intervals
+ # Checking the journal takes a second or two, so
+ # don't consider every matching line, just those > 20 seconds apart. We
+ # search the journal from 60 seconds before to 60 seconds after the
+ # message, so this should be ok. It would probably make sense to only
+ # check lines >59 seconds apart; using 20 seconds to be conservative.
if [[ ! $logtime ]]; then
logtime=$tmptime
- elif (( tmptime > logtime + 60 )); then
+ elif (( tmptime > logtime + 20 )); then
logtime=$tmptime
else
continue
fi
+ found=true
sec_min=$((logtime - 60))
sec_max=$((logtime + 60))
jmin="$(date -d @$sec_min "+%F %H:%M:%S")"
jmax="$(date -d @$sec_max "+%F %H:%M:%S")"
description=$(systemctl cat $service | sed -rn 's/^ *Description=(.*)/\1/p')
jrregex="^Starting $description"
- if [[ $service == spamassassin ]]; then
+ if [[ $service == "$spamd_ser" ]]; then
jrregex+="\|^spamd: restarting"
fi
d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
# the sed clears out the initial time and process+pid
- if ! journalctl -u $service -S "$jmin" -U "$jmax" \
+ if journalctl -u $service -S "$jmin" -U "$jmax" \
| sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep "$jrregex" &>/dev/null; then
+ v "messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':"
+ else
+ v "PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
wipe=false
break
fi
done < <(awk "/$regex/ "'{print $1,$2}' $pl)
if $found && $wipe; then
d "wiping $regex"
- if [[ ! -w $pl-archive ]]; then
- touch $pl-archive
- chgrp adm $pl-archive
- chmod 664 $pl-archive
+ if grep -E "$regex" $pl |& tee -a $pl-archive; then
+ v "above is from grep -E $regex"
+ sed -ri "/$regex/d" $pl
fi
- grep -E "$regex" $pl >> $pl-archive ||:
- sed -ri "/$regex/d" $pl
fi
- done <<'EOF'
+ done <<EOF
clamav-daemon malware acl condition
-spamassassin spam acl condition
+$spamd_ser spam acl condition
EOF
+ ### end removing panic lines due to service restarts ###
+
+
+ ## begin broken pipe & write lock & general alert ##
+ regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
+ newlines=false
+ count=0
+ while read -r day time _; do
+ log_s=$(date -d "$day $time" +%s)
+ count=$((count+1))
+ if (( log_s > EPOCHSECONDS - 300 )); then
+ newlines=true
+ fi
+ done < <(grep "$regex" $pl ||:)
+ if (( count )); then
+ # I see broken pipe in groups of 3 for the same message around once a day
+ # randomly. I'm guessing they are related to running 2 instances of
+ # exim which share the same spool. So, if we have some, but none in
+ # the last 5 minutes, and no more than 20, it should be fine to clear
+ # them. write lock happens less but can fit under the same rule.
+ if (( count > 20 )); then
+ pr_metric=1
+ elif ! $newlines; then
+ grep "$regex" $pl |& tee -a $pl-archive
+ v "above is from grep $regex"
+ sed -i "/$regex/d" $pl
+ fi
+ fi
+
+ # I think we could alert on anything else older than 61 seconds,
+ # but let's just add some slack and make it 2 minutes.
+ while read -r day time _; do
+ # some lines don't start with a date (like the 2nd line below), just skip them
+ # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
+ # can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
+ if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
+ continue
+ fi
+ log_s=$(date -d "$day $time" +%s)
+ if (( EPOCHSECONDS - 120 > log_s )); then
+ pr_metric=1
+ fi
+ # pr_metric for $regex is handled above
+ done < <(grep -v "$regex" $pl ||:)
+ ## end broken pipe & write lock & general alert ##
+
+ echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
+
}
-if [[ $INVOCATION_ID ]]; then
- # this is to prevent systemd from filling up the journal
- for (( runcount=0; runcount < 100; runcount++ )); do
+loop-main() {
+ while true; do
main
sleep 30
done
+}
+
+
+if [[ ! -w $pl-archive ]]; then
+ touch $pl-archive
+ chgrp adm $pl-archive
+ chmod 664 $pl-archive
+fi
+
+if [[ $PPID == 1 ]]; then
+ loop-main
else
main
fi