X-Git-Url: https://iankelling.org/git/?p=distro-setup;a=blobdiff_plain;f=epanic-clean;h=83533ea4fef9f5a5c72cb9546c00014a3de5f572;hp=4f365fda5aaf907dc66e69323166b0412c46fc72;hb=HEAD;hpb=b18dade73dedfe69aa741f8417947d83c4208f2d diff --git a/epanic-clean b/epanic-clean index 4f365fd..c171746 100755 --- a/epanic-clean +++ b/epanic-clean @@ -1,6 +1,25 @@ #!/bin/bash -# Copyright (C) 2019 Ian Kelling -# SPDX-License-Identifier: AGPL-3.0-or-later +# I, Ian Kelling, follow the GNU license recommendations at +# https://www.gnu.org/licenses/license-recommendations.en.html. They +# recommend that small programs, < 300 lines, be licensed under the +# Apache License 2.0. This file contains or is part of one or more small +# programs. If a small program grows beyond 300 lines, I plan to switch +# its license to GPL. + +# Copyright 2024 Ian Kelling + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # The panic log regularly gets some stuff in it we dont want to fix. # Detect it and wipe it out. @@ -18,17 +37,25 @@ if [[ $1 ]]; then debug=true fi +verbose=true d() { if $debug; then printf "%s\n" "$*" fi } +v() { + if $verbose; then + printf "%s\n" "$*" + fi +} pl=/var/log/exim4/paniclog main() { + pr_metric=0 if [[ ! -s $pl ]]; then + echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom return 0 fi @@ -36,8 +63,10 @@ main() { # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then regex="socket bind() to port 25 for address" - grep "$regex" $pl >> $pl-archive ||: - sed -i "/$regex/d" $pl + if grep "$regex" $pl |& tee -a $pl-archive; then + v "above is from grep $regex" + sed -i "/$regex/d" $pl + fi fi # this is a strange message due to running as nonroot @@ -47,52 +76,32 @@ main() { # seems to randomly be caused by # Starting exim4-base housekeeping, exim4-base.service regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$" - grep "$regex" $pl >> $pl-archive ||: - sed -i "/$regex/d" $pl - - ## begin broken pipe & write lock ## - regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$" - now_s=$(date +%s) - newlines=false - count=0 - while read -r day time _; do - log_s=$(date -d "$day $time" +%s) - count=$((count+1)) - if (( now_s - 300 > log_s )); then - newlines=true - fi - done < <(grep "$regex" $pl ||:) - if (( count )); then - # I see broken pipe in groups of 3 for the same message around once a day - # randomly. I'm guessing they are related to running 2 instances of - # exim which share the same spool. So, if we have some, but not in - # the last 5 minutes, and less than 20, it should be fine to clear - # them. write lock happens less but can fit under the same rule. - if (( count > 20 )); then - cat $pl - elif ! $newlines; then - grep "$regex" $pl >>$pl-archive - sed -i "/$regex/d" $pl - fi + if grep "$regex" $pl |& tee -a $pl-archive; then + v "above is from grep $regex" + sed -i "/$regex/d" $pl fi - ## end broken pipe ## + ### begin removing panic lines due to service restarts ### while read -r service regex; do found=false wipe=true d "$service $regex" while read -r d1 d2; do d "$d1 $d2" - found=true tmptime=$(date -d "$d1 $d2" +%s) - # dont consider every matching line, just those in > 60 second intervals + # Checking the journal takes a second or two, so + # dont consider every matching line, just those > 20 seconds apart. We are + # testing the journal for 60 seconds after the message, so should be ok. + # It probably makes sense to even check for >59 seconds apart, using 20 + # seconds to be conservative. if [[ ! $logtime ]]; then logtime=$tmptime - elif (( tmptime > logtime + 60 )); then + elif (( tmptime > logtime + 20 )); then logtime=$tmptime else continue fi + found=true sec_min=$((logtime - 60)) sec_max=$((logtime + 60)) jmin="$(date -d @$sec_min "+%F %H:%M:%S")" @@ -104,26 +113,74 @@ main() { fi d "jrregex=$jrregex jmin=$jmin jmax=$jmax" # the sed clears out the initial time and process+pid - if ! journalctl -u $service -S "$jmin" -U "$jmax" \ + if journalctl -u $service -S "$jmin" -U "$jmax" \ | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep "$jrregex" &>/dev/null; then + v "messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':" + else + v "PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'" wipe=false break fi done < <(awk "/$regex/ "'{print $1,$2}' $pl) if $found && $wipe; then d "wiping $regex" - if [[ ! -w $pl-archive ]]; then - touch $pl-archive - chgrp adm $pl-archive - chmod 664 $pl-archive + if grep -E "$regex" $pl |& tee -a $pl-archive; then + v "above is from grep -E $regex" + sed -ri "/$regex/d" $pl fi - grep -E "$regex" $pl >> $pl-archive ||: - sed -ri "/$regex/d" $pl fi done <<'EOF' clamav-daemon malware acl condition spamassassin spam acl condition EOF + ### end removing panic lines due to service restarts ### + + + ## begin broken pipe & write lock & general alert ## + regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$" + newlines=false + count=0 + while read -r day time _; do + log_s=$(date -d "$day $time" +%s) + count=$((count+1)) + if (( log_s > EPOCHSECONDS - 300 )); then + newlines=true + fi + done < <(grep "$regex" $pl ||:) + if (( count )); then + # I see broken pipe in groups of 3 for the same message around once a day + # randomly. I'm guessing they are related to running 2 instances of + # exim which share the same spool. So, if we have some, but not in + # the last 5 minutes, and less than 20, it should be fine to clear + # them. write lock happens less but can fit under the same rule. + if (( count > 20 )); then + pr_metric=1 + elif ! $newlines; then + grep "$regex" $pl |& tee -a $pl-archive + v "above is from grep $regex" + sed -i "/$regex/d" $pl + fi + fi + + # I think we could alert on anything else older than 61 seconds, + # but lets just add some slack, make it 2 minutes. + while read -r day time _; do + # some lines dont have dates, just skip them + # 2022-09-16 15:21:06.250 [438097] Exim configuration error: + # can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS" + if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then + continue + fi + log_s=$(date -d "$day $time" +%s) + if (( EPOCHSECONDS - 120 > log_s )); then + pr_metric=1 + fi + # pr_metrix for $regex is handled above + done < <(grep -v "$regex" $pl ||:) + ## end broken pipe ## + + echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom + } loop-main() { @@ -133,6 +190,13 @@ loop-main() { done } + +if [[ ! -w $pl-archive ]]; then + touch $pl-archive + chgrp adm $pl-archive + chmod 664 $pl-archive +fi + if [[ $INVOCATION_ID ]]; then loop-main else