bunch of new stuff, a few fixes

[distro-setup] / epanic-clean
diff --git a/epanic-clean b/epanic-clean

index a7220c2cf03267c2c2aacdd6822c172537d2d297..01755304a08de69fd8e5a5d0d245bc175e40565f 100755 (executable)
--- a/epanic-clean
+++ b/epanic-clean
@@ -18,69 +18,71 @@ if [[ $1 ]]; then
    debug=true
  fi
  
    debug=true
  fi
  
+verbose=true
  
  d() {
    if $debug; then
      printf "%s\n" "$*"
    fi
  }
  
  d() {
    if $debug; then
      printf "%s\n" "$*"
    fi
  }
+v() {
+  if $verbose; then
+    printf "%s\n" "$*"
+  fi
+}
  
  
  pl=/var/log/exim4/paniclog
  main() {
  
  
  pl=/var/log/exim4/paniclog
  main() {
+  pr_metric=0
    if [[ ! -s $pl ]]; then
    if [[ ! -s $pl ]]; then
+    echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
      return 0
    fi
  
      return 0
    fi
  
+  # example line:
+  # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
+  if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
+    regex="socket bind() to port 25 for address"
+    if grep "$regex" $pl |& tee -a $pl-archive; then
+      v "above is from grep $regex"
+      sed -i "/$regex/d" $pl
+    fi
+  fi
+
+  # this is a strange message due to running as nonroot
+  # regex='exim user lost privilege for using -C option'
+  # sed -i "/$regex/d" $pl
+
    # seems to randomly be caused by
    # Starting exim4-base housekeeping, exim4-base.service
    regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
    # seems to randomly be caused by
    # Starting exim4-base housekeeping, exim4-base.service
    regex="^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$"
-  grep "$regex" $pl >> $pl-archive ||:
-  sed -i "/$regex/d" $pl
-
-  ## begin broken pipe ##
-  regex="Failed writing transport results to pipe: Broken pipe$"
-  now_s=$(date +%s)
-  newlines=false
-  count=0
-  while read -r day time _; do
-    log_s=$(date -d "$day $time" +%s)
-    count=$((count+1))
-    if (( now_s - 300 > log_s )); then
-      newlines=true
-    fi
-  done < <(grep "$regex" $pl ||:)
-  if (( count )); then
-    # i see these in groups of 3 for the same message around once a day
-    # randomly.  I'm guessing they are related to running 2 instances of
-    # exim which share the same spool.  So, if we have some, but not in
-    # the last 5 minutes, and less than 20, it should be fine to clear
-    # them.
-    if (( count > 20 )); then
-      cat $pl
-      elif ! $newlines; then
-      grep "$regex" $pl >>$pl-archive
-      sed -i "/$regex/d" $pl
-      fi
+  if grep "$regex" $pl |& tee -a  $pl-archive; then
+    v "above is from grep $regex"
+    sed -i "/$regex/d" $pl
    fi
    fi
-  ## end broken pipe ##
  
  
+  ### begin removing panic lines due to service restarts ###
    while read -r service regex; do
      found=false
      wipe=true
      d "$service $regex"
      while read -r d1 d2; do
        d "$d1 $d2"
    while read -r service regex; do
      found=false
      wipe=true
      d "$service $regex"
      while read -r d1 d2; do
        d "$d1 $d2"
-      found=true
        tmptime=$(date -d "$d1 $d2" +%s)
        tmptime=$(date -d "$d1 $d2" +%s)
-      # dont consider every matching line, just those in > 60 second intervals
+      # Checking the journal takes a second or two, so
+      # don't consider every matching line, just those > 20 seconds apart. We are
+      # testing the journal for 60 seconds after the message, so should be ok.
+      # It probably makes sense to even check for >59 seconds apart, using 20
+      # seconds to be conservative.
        if [[ ! $logtime ]]; then
          logtime=$tmptime
        if [[ ! $logtime ]]; then
          logtime=$tmptime
-      elif (( tmptime > logtime + 60 )); then
+      elif (( tmptime > logtime + 20 )); then
          logtime=$tmptime
        else
          continue
        fi
          logtime=$tmptime
        else
          continue
        fi
+      found=true
        sec_min=$((logtime - 60))
        sec_max=$((logtime + 60))
        jmin="$(date -d @$sec_min "+%F %H:%M:%S")"
        sec_min=$((logtime - 60))
        sec_max=$((logtime + 60))
        jmin="$(date -d @$sec_min "+%F %H:%M:%S")"
@@ -92,34 +94,92 @@ main() {
        fi
        d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
        # the sed clears out the initial time and process+pid
        fi
        d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
        # the sed clears out the initial time and process+pid
-      if ! journalctl -u $service -S "$jmin" -U "$jmax" \
+      if journalctl -u $service -S "$jmin" -U "$jmax" \
            | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep "$jrregex" &>/dev/null; then
            | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep "$jrregex" &>/dev/null; then
+        v "messages worth wiping in: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex':"
+      else
+        v "PANIC: message not found via: journalctl -u $service -S '$jmin' -U '$jmax' | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' | grep '$jrregex'"
          wipe=false
          break
        fi
      done < <(awk "/$regex/ "'{print $1,$2}' $pl)
      if $found && $wipe; then
        d "wiping $regex"
          wipe=false
          break
        fi
      done < <(awk "/$regex/ "'{print $1,$2}' $pl)
      if $found && $wipe; then
        d "wiping $regex"
-      if [[ ! -w $pl-archive ]]; then
-        touch $pl-archive
-        chgrp adm $pl-archive
-        chmod 664 $pl-archive
+      if grep -E "$regex" $pl |& tee -a $pl-archive; then
+        v "above is from grep -E $regex"
+        sed -ri "/$regex/d" $pl
        fi
        fi
-      grep -E "$regex" $pl >> $pl-archive ||:
-      sed -ri "/$regex/d" $pl
      fi
    done <<'EOF'
  clamav-daemon malware acl condition
  spamassassin spam acl condition
  EOF
      fi
    done <<'EOF'
  clamav-daemon malware acl condition
  spamassassin spam acl condition
  EOF
+  ### end removing panic lines due to service restarts ###
+
+
+  ## begin broken pipe & write lock & general alert ##
+  regex="Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$"
+  newlines=false
+  count=0
+  while read -r day time _; do
+    log_s=$(date -d "$day $time" +%s)
+    count=$((count+1))
+    if (( log_s > EPOCHSECONDS - 300 )); then
+      newlines=true
+    fi
+  done < <(grep "$regex" $pl ||:)
+  if (( count )); then
+    # I see broken pipe in groups of 3 for the same message around once a day
+    # randomly.  I'm guessing they are related to running 2 instances of
+    # exim which share the same spool.  So, if we have some, but not in
+    # the last 5 minutes, and no more than 20, it should be fine to clear
+    # them. write lock happens less but can fit under the same rule.
+    if (( count > 20 )); then
+      pr_metric=1
+    elif ! $newlines; then
+      grep "$regex" $pl |& tee -a $pl-archive
+      v "above is from grep $regex"
+      sed -i "/$regex/d" $pl
+    fi
+  fi
+
+  # I think we could alert on anything else older than 61 seconds,
+  # but let's just add some slack and make it 2 minutes.
+  while read -r day time _; do
+    # some lines don't start with a date (like the second line below), just skip them
+    # 2022-09-16 15:21:06.250 [438097] Exim configuration error:
+    #  can't redefine an undefined macro "REMOTE_SMTP_SMARTHOST_TLS_VERIFY_HOSTS"
+    if [[ $day != [2-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] ]]; then
+      continue
+    fi
+    log_s=$(date -d "$day $time" +%s)
+    if (( EPOCHSECONDS - 120 > log_s  )); then
+      pr_metric=1
+    fi
+    # pr_metric for $regex is handled above
+  done < <(grep -v "$regex" $pl ||:)
+  ## end broken pipe & write lock & general alert ##
+
+  echo "exim_paniclog $pr_metric" >/var/lib/prometheus/node-exporter/exim_paniclog.prom
+
  }
  
  }
  
-if [[ $INVOCATION_ID ]]; then
-  # this is to prevent systemd from filling up the journal
-  for (( runcount=0; runcount < 100; runcount++ )); do
+loop-main() {
+  while true; do
      main
      sleep 30
    done
      main
      sleep 30
    done
+}
+
+
+if [[ ! -w $pl-archive ]]; then
+  touch $pl-archive
+  chgrp adm $pl-archive
+  chmod 664 $pl-archive
+fi
+
+if [[ $INVOCATION_ID ]]; then
+  loop-main
  else
    main
  fi
  else
    main
  fi