various improvements
[distro-setup] / epanic-clean
1 #!/bin/bash
2 # Copyright (C) 2019 Ian Kelling
3 # SPDX-License-Identifier: AGPL-3.0-or-later
4
5 # The panic log regularly gets some stuff in it we don't want to fix.
6 # Detect it and wipe it out.
7
# Refuse to run under anything but bash. This guard is written in plain
# POSIX test syntax so that it still works when another shell runs the file.
[ -n "$BASH_VERSION" ] || { echo "error: shell is not bash" >&2; exit 1; }

# Strict mode: exit on error (also inside command substitutions where bash
# supports it), let functions inherit the ERR trap, and fail a pipeline if
# any stage of it fails.
shopt -s inherit_errexit 2>/dev/null || true # ignore fail in bash < 4.4
set -o errexit -o errtrace -o pipefail
# Report the failing command, its location and its exit status on stderr.
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR

# When not running as root, replace this process with a sudo invocation of
# the same script, keeping the environment and the original arguments.
if [[ $EUID != 0 ]]; then
  exec sudo -E "${BASH_SOURCE[0]}" "$@"
fi
15
# Debug output is enabled by passing any non-empty first argument.
# $debug holds the name of a command (true/false) so it can be run directly.
if [[ -n $1 ]]; then
  debug=true
else
  debug=false
fi
20
21
#######################################
# Debug print: write the arguments, joined into one line, to stdout.
# Globals:   debug (read) - run as a command; output happens only when it
#            succeeds
# Arguments: any words to print
# Returns:   0 when debugging is off, otherwise the status of printf
#######################################
d() {
  $debug || return 0
  printf '%s\n' "$*"
}
27
28
# Path of the exim panic log that this script cleans.
pl=/var/log/exim4/paniclog

#######################################
# Make sure the archive file next to the panic log exists and is group
# writable by adm before anything is appended to it.
# Globals:   pl (read)
#######################################
_epanic_prep_archive() {
  if [[ ! -w $pl-archive ]]; then
    touch "$pl-archive"
    chgrp adm "$pl-archive"
    chmod 664 "$pl-archive"
  fi
}

#######################################
# Append every panic log line matching a regex to the archive file, then
# delete those lines from the panic log.
# Globals:   pl (read)
# Arguments: $1 - regex; must not contain "/" since it is used as a sed
#                 address
#            $2 - optional regex-flavor flag handed to both grep and sed
#                 (e.g. -E); default is basic regular expressions
#######################################
_epanic_archive_and_delete() {
  local re=$1
  local -a flavor=()
  if [[ -n ${2-} ]]; then
    flavor=("$2")
  fi
  _epanic_prep_archive
  # grep exits non-zero when nothing matches; that is not an error here.
  grep "${flavor[@]}" -e "$re" -- "$pl" >>"$pl-archive" || :
  sed "${flavor[@]}" -i -e "/$re/d" -- "$pl"
}

#######################################
# Inspect the panic log once and move known-harmless entries to the archive.
# Globals:   pl (read); debug (read, through d)
# Arguments: none
# Outputs:   prints the whole panic log when more than 20 broken pipe /
#            write lock lines are present; debug lines through d
# Returns:   0 immediately when the panic log is missing or empty
#######################################
main() {
  local regex now_s log_s day time count recent
  local service svc_regex found wipe d1 d2 tmptime logtime
  local sec_min sec_max jmin jmax description jrregex

  if [[ ! -s $pl ]]; then
    return 0
  fi

  # example line:
  # 2022-02-09 22:08:14.683 [59759] socket bind() to port 25 for address 10.8.0.28 failed: Cannot assign requested address: daemon abandoned
  if [[ -e /etc/systemd/system/exim4.service.d/backup.conf ]]; then
    _epanic_archive_and_delete 'socket bind() to port 25 for address'
  fi

  # this is a strange message due to running as nonroot
  # regex='exim user lost privilege for using -C option'
  # sed -i "/$regex/d" $pl

  # seems to randomly be caused by
  # Starting exim4-base housekeeping, exim4-base.service
  _epanic_archive_and_delete \
    '^[^ ]* 00:00:0.* Failed writing transport results to pipe: Broken pipe$'

  ## begin broken pipe & write lock ##
  regex='Failed to get write lock\|Failed writing transport results to pipe: Broken pipe$'
  now_s=$(date +%s)
  recent=false
  count=0
  while read -r day time _; do
    log_s=$(date -d "$day $time" +%s)
    count=$((count + 1))
    # An entry logged within the last 5 minutes counts as recent.
    if (( log_s > now_s - 300 )); then
      recent=true
    fi
  done < <(grep -e "$regex" -- "$pl" || :)
  if (( count )); then
    # I see broken pipe in groups of 3 for the same message around once a day
    # randomly. I'm guessing they are related to running 2 instances of
    # exim which share the same spool. So, if we have some, but not in
    # the last 5 minutes, and less than 20, it should be fine to clear
    # them. write lock happens less but can fit under the same rule.
    if (( count > 20 )); then
      # Too many to dismiss: show the log instead of clearing anything.
      cat -- "$pl"
    elif ! $recent; then
      _epanic_archive_and_delete "$regex"
    fi
  fi
  ## end broken pipe ##

  # For each "service regex" pair below: matching panic log lines are only
  # wiped when the journal shows that service (re)starting within a minute
  # of every sampled line.
  while read -r service svc_regex; do
    found=false
    wipe=true
    # Start the sampling window afresh for every service; a stale value
    # would let lines be skipped, and then wiped, without a journal check.
    logtime=
    jrregex=
    d "$service $svc_regex"
    while read -r d1 d2; do
      d "$d1 $d2"
      found=true
      tmptime=$(date -d "$d1 $d2" +%s)
      # dont consider every matching line, just those in > 60 second intervals
      if [[ -n $logtime ]] && (( tmptime <= logtime + 60 )); then
        continue
      fi
      logtime=$tmptime
      sec_min=$((logtime - 60))
      sec_max=$((logtime + 60))
      jmin=$(date -d "@$sec_min" "+%F %H:%M:%S")
      jmax=$(date -d "@$sec_max" "+%F %H:%M:%S")
      # The journal pattern depends only on the service, so build it once,
      # the first time it is needed.
      if [[ -z $jrregex ]]; then
        description=$(systemctl cat "$service" | sed -rn 's/^ *Description=(.*)/\1/p')
        jrregex="^Starting $description"
        if [[ $service == spamassassin ]]; then
          jrregex+="\|^spamd: restarting"
        fi
      fi
      d "jrregex=$jrregex jmin=$jmin jmax=$jmax"
      # the sed clears out the initial time and process+pid
      if ! journalctl -u "$service" -S "$jmin" -U "$jmax" \
        | sed -r 's/^([^[:space:]]*[[:space:]]+){5}//' \
        | grep -e "$jrregex" &>/dev/null; then
        wipe=false
        break
      fi
    done < <(awk -v re="$svc_regex" '$0 ~ re {print $1,$2}' "$pl")
    if $found && $wipe; then
      d "wiping $svc_regex"
      _epanic_archive_and_delete "$svc_regex" -E
    fi
  done <<'EOF'
clamav-daemon malware acl condition
spamassassin spam acl condition
EOF
}
128
# Entry point. A non-empty INVOCATION_ID selects the repeating mode
# (NOTE(review): presumably set by systemd for service runs - confirm).
if [[ -z $INVOCATION_ID ]]; then
  # Run by hand: a single pass.
  main
else
  # this is to prevent systemd from filling up the journal
  # One invocation makes 100 passes, pausing 30 seconds after each.
  runcount=0
  while (( runcount < 100 )); do
    main
    sleep 30
    runcount=$((runcount + 1))
  done
fi