X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=epanic-clean;h=bb9115ed8f1b6debae834f15f31a7b4df2411ce7;hb=205510f1ebae68147df7c73bc71c692fd03ff045;hp=9886cb1228980d1bb1b997efaf82d5166187115f;hpb=d4366929e6e200155b010dc05ce74255ee6a45ed;p=distro-setup diff --git a/epanic-clean b/epanic-clean index 9886cb1..bb9115e 100755 --- a/epanic-clean +++ b/epanic-clean @@ -28,7 +28,9 @@ d() { pl=/var/log/exim4/paniclog main() { + pr_metric=0 if [[ ! -s $pl ]]; then + echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom return 0 fi @@ -50,33 +52,6 @@ main() { grep "$regex" $pl >> $pl-archive ||: sed -i "/$regex/d" $pl - ## begin broken pipe & write lock ## - regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$" - now_s=$(date +%s) - newlines=false - count=0 - while read -r day time _; do - log_s=$(date -d "$day $time" +%s) - count=$((count+1)) - if (( now_s - 300 > log_s )); then - newlines=true - fi - done < <(grep "$regex" $pl ||:) - if (( count )); then - # I see broken pipe in groups of 3 for the same message around once a day - # randomly. I'm guessing they are related to running 2 instances of - # exim which share the same spool. So, if we have some, but not in - # the last 5 minutes, and less than 20, it should be fine to clear - # them. write lock happens less but can fit under the same rule. - if (( count > 20 )); then - cat $pl - elif ! $newlines; then - grep "$regex" $pl >>$pl-archive - sed -i "/$regex/d" $pl - fi - fi - ## end broken pipe ## - while read -r service regex; do found=false wipe=true @@ -85,7 +60,9 @@ main() { d "$d1 $d2" found=true tmptime=$(date -d "$d1 $d2" +%s) - # dont consider every matching line, just those in > 60 second intervals + # Checking the journal takes a second or two, so + # dont consider every matching line, just those > 60 seconds apart. We are + # testing the journal for 60 seconds after the message, so should be ok. if [[ ! $logtime ]]; then logtime=$tmptime elif (( tmptime > logtime + 60 )); then @@ -124,14 +101,55 @@ main() { clamav-daemon malware acl condition spamassassin spam acl condition EOF + + ## begin broken pipe & write lock & general alert ## + regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$" + newlines=false + count=0 + while read -r day time _; do + log_s=$(date -d "$day $time" +%s) + count=$((count+1)) + if (( log_s > EPOCHSECONDS - 300 )); then + newlines=true + fi + done < <(grep "$regex" $pl ||:) + if (( count )); then + # I see broken pipe in groups of 3 for the same message around once a day + # randomly. I'm guessing they are related to running 2 instances of + # exim which share the same spool. So, if we have some, but not in + # the last 5 minutes, and less than 20, it should be fine to clear + # them. write lock happens less but can fit under the same rule. + if (( count > 20 )); then + pr_metric=1 + elif ! $newlines; then + grep "$regex" $pl >>$pl-archive + sed -i "/$regex/d" $pl + fi + fi + + # I think we could alert on anything else older than 61 seconds, + # but lets just add some slack, make it 5 minutes. + while read -r day time _; do + log_s=$(date -d "$day $time" +%s) + if (( log_s < EPOCHSECONDS - 300 )); then + pr_metric=1 + fi + done < <(grep -v "$regex" $pl ||:) + ## end broken pipe ## + + echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom + } -if [[ $INVOCATION_ID ]]; then - # this is to prevent systemd from filling up the journal - for (( runcount=0; runcount < 100; runcount++ )); do +loop-main() { + while true; do main sleep 30 done +} + +if [[ $INVOCATION_ID ]]; then + loop-main else main fi