X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=epanic-clean;h=bb9115ed8f1b6debae834f15f31a7b4df2411ce7;hb=205510f1ebae68147df7c73bc71c692fd03ff045;hp=a7220c2cf03267c2c2aacdd6822c172537d2d297;hpb=aa9cb10514e29340a6d6a194ee189fa4364f1f2e;p=distro-setup diff --git a/epanic-clean b/epanic-clean index a7220c2..bb9115e 100755 --- a/epanic-clean +++ b/epanic-clean @@ -28,43 +28,30 @@ d() { pl=/var/log/exim4/paniclog main() { + pr_metric=0 if [[ ! -s $pl ]]; then + echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom return 0 fi + # example line: + # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned + if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then + regex="socket bind() to port 25 for address" + grep "$regex" $pl >> $pl-archive ||: + sed -i "/$regex/d" $pl + fi + + # this is a strange message due to running as nonroot + # regex='exim user lost privilege for using -C option' + # sed -i "/$regex/d" $pl + # seems to randomly be caused by # Starting exim4-base housekeeping, exim4-base.service regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$" grep "$regex" $pl >> $pl-archive ||: sed -i "/$regex/d" $pl - ## begin broken pipe ## - regex="Failed writing transport results to pipe: Broken pipe$" - now_s=$(date +%s) - newlines=false - count=0 - while read -r day time _; do - log_s=$(date -d "$day $time" +%s) - count=$((count+1)) - if (( now_s - 300 > log_s )); then - newlines=true - fi - done < <(grep "$regex" $pl ||:) - if (( count )); then - # i see these in groups of 3 for the same message around once a day - # randomly. I'm guessing they are related to running 2 instances of - # exim which share the same spool. So, if we have some, but not in - # the last 5 minutes, and less than 20, it should be fine to clear - # them. - if (( count > 20 )); then - cat $pl - elif ! $newlines; then - grep "$regex" $pl >>$pl-archive - sed -i "/$regex/d" $pl - fi - fi - ## end broken pipe ## - while read -r service regex; do found=false wipe=true @@ -73,7 +60,9 @@ main() { d "$d1 $d2" found=true tmptime=$(date -d "$d1 $d2" +%s) - # dont consider every matching line, just those in > 60 second intervals + # Checking the journal takes a second or two, so + # dont consider every matching line, just those > 60 seconds apart. We are + # testing the journal for 60 seconds after the message, so should be ok. if [[ ! $logtime ]]; then logtime=$tmptime elif (( tmptime > logtime + 60 )); then @@ -112,14 +101,55 @@ main() { clamav-daemon malware acl condition spamassassin spam acl condition EOF + + ## begin broken pipe & write lock & general alert ## + regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$" + newlines=false + count=0 + while read -r day time _; do + log_s=$(date -d "$day $time" +%s) + count=$((count+1)) + if (( log_s > EPOCHSECONDS - 300 )); then + newlines=true + fi + done < <(grep "$regex" $pl ||:) + if (( count )); then + # I see broken pipe in groups of 3 for the same message around once a day + # randomly. I'm guessing they are related to running 2 instances of + # exim which share the same spool. So, if we have some, but not in + # the last 5 minutes, and less than 20, it should be fine to clear + # them. write lock happens less but can fit under the same rule. + if (( count > 20 )); then + pr_metric=1 + elif ! $newlines; then + grep "$regex" $pl >>$pl-archive + sed -i "/$regex/d" $pl + fi + fi + + # I think we could alert on anything else older than 61 seconds, + # but lets just add some slack, make it 5 minutes. + while read -r day time _; do + log_s=$(date -d "$day $time" +%s) + if (( log_s < EPOCHSECONDS - 300 )); then + pr_metric=1 + fi + done < <(grep -v "$regex" $pl ||:) + ## end broken pipe ## + + echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom + } -if [[ $INVOCATION_ID ]]; then - # this is to prevent systemd from filling up the journal - for (( runcount=0; runcount < 100; runcount++ )); do +loop-main() { + while true; do main sleep 30 done +} + +if [[ $INVOCATION_ID ]]; then + loop-main else main fi