#!/bin/bash
# I, Ian Kelling, follow the GNU license recommendations at
# https://www.gnu.org/licenses/license-recommendations.en.html. They
# recommend that small programs, < 300 lines, be licensed under the
# Apache License 2.0. This file contains or is part of one or more small
# programs. If a small program grows beyond 300 lines, I plan to switch
# its license to GPL.

# Copyright 2024 Ian Kelling

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The panic log regularly gets some stuff in it we don't want to fix.
# Detect it and wipe it out.
#
# Usage: run with no arguments for normal operation; any non-empty first
# argument turns on debug output. Re-executes itself through sudo when
# not running as root. When INVOCATION_ID is set in the environment
# (systemd sets it for the units it starts), the check repeats every 30
# seconds; otherwise it runs once.
#
# Output: writes "exim_paniclog 0" or "exim_paniclog 1" to
# /var/lib/prometheus/node-exporter/exim_paniclog.prom, and appends every
# wiped line to /var/log/exim4/paniclog-archive.

if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi
shopt -s inherit_errexit 2>/dev/null ||: # ignore fail in bash < 4.4
set -eE -o pipefail
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR

[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"

debug=false
if [[ $1 ]]; then
  debug=true
fi

verbose=true

# Print all arguments as one line, only when debug output is enabled.
d() {
  if $debug; then
    printf "%s\n" "$*"
  fi
}

# Print all arguments as one line, only when verbose output is enabled.
v() {
  if $verbose; then
    printf "%s\n" "$*"
  fi
}

pl=/var/log/exim4/paniclog

# One pass over the panic log: archive and delete the lines known to be
# harmless, then publish the exim_paniclog metric (1 means something in
# the log still needs attention, 0 means it does not).
# Globals: pl, debug, verbose (read)
# Arguments: none
# Returns: 0 on success; any failing command aborts the script (set -e).
main() {
  local service regex found wipe d1 d2 tmptime logtime
  local sec_min sec_max jmin jmax description jrregex
  local newlines count day clock log_s now
  local pr_metric=0

  # Nothing to do when the panic log is missing or empty.
  if [[ ! -s $pl ]]; then
    echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
    return 0
  fi

  # example line:
  # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
  if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
    regex="socket bind() to port 25 for address"
    if grep "$regex" "$pl" |& tee -a "${pl}-archive"; then
      v "above is from grep $regex"
      sed -i "/$regex/d" "$pl"
    fi
  fi

  # this is a strange message due to running as nonroot
  # regex='exim user lost privilege for using -C option'
  # sed -i "/$regex/d" $pl

  # seems to randomly be caused by
  # Starting exim4-base housekeeping, exim4-base.service
  regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
  if grep "$regex" "$pl" |& tee -a "${pl}-archive"; then
    v "above is from grep $regex"
    sed -i "/$regex/d" "$pl"
  fi

  ### begin removing panic lines due to service restarts ###
  # Each heredoc line below is "<systemd unit> <regex of the panic line>".
  # Matching panic lines are wiped only if every sampled one has a
  # service start message in the journal within 60 seconds of it.
  while read -r service regex; do
    found=false
    wipe=true
    # Start the 20 second throttle afresh for every service and every
    # pass. Carrying the previous timestamp over would skip lines logged
    # within 20 seconds of one that was already handled for another
    # service or in an earlier pass, leaving them in the log forever.
    logtime=
    d "$service $regex"
    while read -r d1 d2; do
      d "$d1 $d2"
      tmptime=$(date -d "$d1 $d2" +%s)
      # Checking the journal takes a second or two, so
      # dont consider every matching line, just those > 20 seconds apart. We are
      # testing the journal for 60 seconds after the message, so should be ok.
      # It probably makes sense to even check for >59 seconds apart, using 20
      # seconds to be conservative.
      if [[ ! $logtime ]]; then
        logtime=$tmptime
      elif (( tmptime > logtime + 20 )); then
        logtime=$tmptime
      else
        continue
      fi
      found=true
      sec_min=$((logtime - 60))
      sec_max=$((logtime + 60))
      jmin="$(date -d "@$sec_min" "+%F %H:%M:%S")"
      jmax="$(date -d "@$sec_max" "+%F %H:%M:%S")"
      description=$(systemctl cat "$service" | sed -rn 's/^ *Description=(.*)/\1/p')
      jrregex="^Starting $description"
      if [[ $service == spamassassin ]]; then
        jrregex+="\|^spamd: restarting"
      fi
      d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
      # the sed clears out the initial time and process+pid
      # Note: grep output is discarded with &>/dev/null rather than -q, so
      # grep reads all of its input instead of exiting on the first match.
      if journalctl -u "$service" -S "$jmin" -U "$jmax" \
          | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep "$jrregex" &>/dev/null; then
        v "messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':"
      else
        v "PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
        wipe=false
        break
      fi
    done < <(awk "/$regex/ "'{print $1,$2}' "$pl")
    if $found && $wipe; then
      d "wiping $regex"
      if grep -E "$regex" "$pl" |& tee -a "${pl}-archive"; then
        v "above is from grep -E $regex"
        sed -ri "/$regex/d" "$pl"
      fi
    fi
  done <<'EOF'
clamav-daemon malware acl condition
spamassassin spam acl condition
EOF
  ### end removing panic lines due to service restarts ###

  ## begin broken pipe & write lock & general alert ##

  # EPOCHSECONDS only exists in bash >= 5.0. On older bash it would
  # silently evaluate to 0 in the arithmetic below, so fall back to date.
  now=${EPOCHSECONDS:-$(date +%s)}

  regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
  newlines=false
  count=0
  while read -r day clock _; do
    log_s=$(date -d "$day $clock" +%s)
    count=$((count+1))
    if (( log_s > now - 300 )); then
      newlines=true
    fi
  done < <(grep "$regex" "$pl" ||:)
  if (( count )); then
    # I see broken pipe in groups of 3 for the same message around once a day
    # randomly. I'm guessing they are related to running 2 instances of
    # exim which share the same spool. So, if we have some, but not in
    # the last 5 minutes, and less than 20, it should be fine to clear
    # them. write lock happens less but can fit under the same rule.
    if (( count > 20 )); then
      pr_metric=1
    elif ! $newlines; then
      grep "$regex" "$pl" |& tee -a "${pl}-archive"
      v "above is from grep $regex"
      sed -i "/$regex/d" "$pl"
    fi
  fi

  # I think we could alert on anything else older than 61 seconds,
  # but lets just add some slack, make it 2 minutes.
  while read -r day clock _; do
    # some lines dont have dates, just skip them
    # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
    #  can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
    if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
      continue
    fi
    log_s=$(date -d "$day $clock" +%s)
    if (( now - 120 > log_s )); then
      pr_metric=1
    fi
    # pr_metric for $regex is handled above
  done < <(grep -v "$regex" "$pl" ||:)
  ## end broken pipe ##

  echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
}

# Run main forever, pausing 30 seconds between passes.
loop-main() {
  while true; do
    main
    sleep 30
  done
}

# Create the archive file, group adm and mode 664, if it is not already
# writable.
if [[ ! -w ${pl}-archive ]]; then
  touch "${pl}-archive"
  chgrp adm "${pl}-archive"
  chmod 664 "${pl}-archive"
fi

# Loop when INVOCATION_ID is set, otherwise do a single pass.
if [[ $INVOCATION_ID ]]; then
  loop-main
else
  main
fi