X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=epanic-clean;h=01755304a08de69fd8e5a5d0d245bc175e40565f;hb=5abdcfce00c6ff61bf3856f7b5101915dc096107;hp=bb9115ed8f1b6debae834f15f31a7b4df2411ce7;hpb=fa5deaee2e0182ddfc7b39eea7ee2acedb259ddf;p=distro-setup

Summary of this patch to epanic-clean (bash); text before "diff --git" is
ignored by patch tools:
 * adds verbose=true and a v() helper that prints its arguments when
   $verbose is true, next to the existing d() debug helper.
 * replaces "grep ... >> $pl-archive ||:" followed by an unconditional sed
   delete with "if grep ... |& tee -a $pl-archive; then ... sed ...; fi".
   The hunk at old line 122 uses tee without wrapping it in an "if".
 * lowers the spacing between journal checks from 60 to 20 seconds and
   moves found=true to after the "continue" check.
 * inverts the journalctl test so both outcomes are reported through v().
 * skips paniclog lines whose first field does not look like a date, and
   changes the alert age from 300 to 120 seconds.
 * moves creation of $pl-archive out of main() to top level, just before
   the "if [[ $INVOCATION_ID ]]" dispatch.

diff --git a/epanic-clean b/epanic-clean
index bb9115e..0175530 100755
--- a/epanic-clean
+++ b/epanic-clean
@@ -18,12 +18,18 @@ if [[ $1 ]]; then
   debug=true
 fi
 
+verbose=true
 
 d() {
   if $debug; then
     printf "%s\n" "$*"
   fi
 }
+v() {
+  if $verbose; then
+    printf "%s\n" "$*"
+  fi
+}
 
 pl=/var/log/exim4/paniclog
 
@@ -38,8 +44,10 @@ main() {
   # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
   if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
     regex="socket bind() to port 25 for address"
-    grep "$regex" $pl >> $pl-archive ||:
-    sed -i "/$regex/d" $pl
+    if grep "$regex" $pl |& tee -a $pl-archive; then
+      v "above is from grep $regex"
+      sed -i "/$regex/d" $pl
+    fi
   fi
 
   # this is a strange message due to running as nonroot
@@ -49,27 +57,32 @@ main() {
   # seems to randomly be caused by
   # Starting exim4-base housekeeping, exim4-base.service
   regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
-  grep "$regex" $pl >> $pl-archive ||:
-  sed -i "/$regex/d" $pl
+  if grep "$regex" $pl |& tee -a $pl-archive; then
+    v "above is from grep $regex"
+    sed -i "/$regex/d" $pl
+  fi
 
+  ### begin removing panic lines due to service restarts ###
   while read -r service regex; do
     found=false
     wipe=true
     d "$service $regex"
     while read -r d1 d2; do
       d "$d1 $d2"
-      found=true
       tmptime=$(date -d "$d1 $d2" +%s)
       # Checking the journal takes a second or two, so
-      # dont consider every matching line, just those > 60 seconds apart. We are
+      # dont consider every matching line, just those > 20 seconds apart. We are
       # testing the journal for 60 seconds after the message, so should be ok.
+      # It probably makes sense to even check for >59 seconds apart, using 20
+      # seconds to be conservative.
       if [[ ! $logtime ]]; then
         logtime=$tmptime
-      elif (( tmptime > logtime + 60 )); then
+      elif (( tmptime > logtime + 20 )); then
         logtime=$tmptime
       else
         continue
       fi
+      found=true
       sec_min=$((logtime - 60))
       sec_max=$((logtime + 60))
       jmin="$(date -d @$sec_min "+%F %H:%M:%S")"
@@ -81,26 +94,28 @@ main() {
       fi
       d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
       # the sed clears out the initial time and process+pid
-      if ! journalctl -u $service -S "$jmin" -U "$jmax" \
+      if journalctl -u $service -S "$jmin" -U "$jmax" \
         | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep "$jrregex" &>/dev/null; then
+        v "messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':"
+      else
+        v "PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
         wipe=false
         break
       fi
     done < <(awk "/$regex/ "'{print $1,$2}' $pl)
     if $found && $wipe; then
       d "wiping $regex"
-      if [[ ! -w $pl-archive ]]; then
-        touch $pl-archive
-        chgrp adm $pl-archive
-        chmod 664 $pl-archive
+      if grep -E "$regex" $pl |& tee -a $pl-archive; then
+        v "above is from grep -E $regex"
+        sed -ri "/$regex/d" $pl
       fi
-      grep -E "$regex" $pl >> $pl-archive ||:
-      sed -ri "/$regex/d" $pl
     fi
   done <<'EOF'
 clamav-daemon malware acl condition
 spamassassin spam acl condition
 EOF
+  ### end removing panic lines due to service restarts ###
+
   ## begin broken pipe & write lock & general alert ##
 
   regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
@@ -122,18 +137,26 @@ EOF
     if (( count > 20 )); then
       pr_metric=1
     elif ! $newlines; then
-      grep "$regex" $pl >>$pl-archive
+      grep "$regex" $pl |& tee -a $pl-archive
+      v "above is from grep $regex"
       sed -i "/$regex/d" $pl
     fi
   fi
 
   # I think we could alert on anything else older than 61 seconds,
-  # but lets just add some slack, make it 5 minutes.
+  # but lets just add some slack, make it 2 minutes.
   while read -r day time _; do
+    # some lines dont have dates, just skip them
+    # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
+    # can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
+    if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
+      continue
+    fi
     log_s=$(date -d "$day $time" +%s)
-    if (( log_s < EPOCHSECONDS - 300 )); then
+    if (( EPOCHSECONDS - 120 > log_s )); then
       pr_metric=1
     fi
+    # pr_metrix for $regex is handled above
   done < <(grep -v "$regex" $pl ||:)
   ## end broken pipe ##
 
@@ -148,6 +171,13 @@ loop-main() {
   done
 }
 
+
+if [[ ! -w $pl-archive ]]; then
+  touch $pl-archive
+  chgrp adm $pl-archive
+  chmod 664 $pl-archive
+fi
+
 if [[ $INVOCATION_ID ]]; then
   loop-main
 else