From b18dade73dedfe69aa741f8417947d83c4208f2d Mon Sep 17 00:00:00 2001 From: Ian Kelling Date: Sun, 6 Mar 2022 08:09:05 -0500 Subject: [PATCH] constantly firing timers cause systemd to think startup never finishes --- brc2 | 42 ++- btrfsmaint | 9 +- conflink | 11 + distro-end | 19 +- dynamic-ip-update | 14 +- epanic-clean | 9 +- filesystem/etc/prometheus/alertmanager.yml | 21 -- filesystem/etc/prometheus/rules/iank.yml | 331 +++++++++--------- filesystem/etc/schroot/desktop/copyfiles | 9 + .../etc/systemd/system/btrfsmaintstop.timer | 11 - .../etc/systemd/system/dynamicipupdate.timer | 11 - .../etc/systemd/system/epanicclean.service | 10 +- .../etc/systemd/system/epanicclean.timer | 11 - .../etc/systemd/system/systemstatus.service | 11 +- .../etc/systemd/system/systemstatus.timer | 11 - install-my-scripts | 34 +- mail-cert-cron | 2 +- mail-setup | 22 +- mailtest-check | 4 +- primary-setup | 12 + schrootupdate | 3 +- system-status | 55 ++- 22 files changed, 370 insertions(+), 292 deletions(-) delete mode 100644 filesystem/etc/prometheus/alertmanager.yml create mode 100644 filesystem/etc/schroot/desktop/copyfiles delete mode 100644 filesystem/etc/systemd/system/btrfsmaintstop.timer delete mode 100644 filesystem/etc/systemd/system/dynamicipupdate.timer delete mode 100644 filesystem/etc/systemd/system/epanicclean.timer delete mode 100644 filesystem/etc/systemd/system/systemstatus.timer diff --git a/brc2 b/brc2 index 5cd83b2..ddfe816 100644 --- a/brc2 +++ b/brc2 @@ -169,8 +169,10 @@ EOF fi sudo chroot $d apt-get update sudo DEBIAN_FRONTEND=noninteractive chroot $d apt-get -y dist-upgrade --purge --auto-remove - sudo DEBIAN_FRONTEND=noninteractive schroot -c $n -- apt-get install --allow-unauthenticated -y ${apps[@]} sudo cp -P {,$d}/etc/localtime + if (( ${#apps[@]} )); then + sudo DEBIAN_FRONTEND=noninteractive schroot -c $n -- apt-get install --allow-unauthenticated -y ${apps[@]} + fi } @@ -1473,7 +1475,6 @@ testmail() { # always run this first, edit the test files, 
then run the following testsieve() { sieve-filter ~/sieve/maintest.sieve ${1:-INBOX} delete 2> >(head; tail) >/tmp/testsieve.log && sed -rn '/^Performed actions:/,/^[^ ]/{/^ /p}' /tmp/testsieve.log | sort | uniq -c - _dosieve } runsieve() { c ~/sieve; cp personal{test,}.sieve; cp lists{test,}.sieve; cp personalend{test,}.sieve @@ -1487,14 +1488,14 @@ runsieve() { alertme() { if [[ -t 0 ]]; then exim -t < 10 - for: 1h + time() - mailtest_check_last_usec > 60 * 12 labels: - severity: warning - - alert: NodeNetworkTransmitErrs + severity: day annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - summary: Network interface is reporting many transmit errors. - expr: |- - increase(node_network_transmit_errs_total[2m]) > 10 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used' - summary: Number of conntrack are getting close to the limit - expr: |- - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure - NTP is configured correctly on this host. - summary: Clock skew detected. - expr: |- - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured - on this host. - summary: Clock not synchronising. 
- expr: |- - min_over_time(node_timex_sync_status[5m]) == 0 - for: 10m - labels: - severity: warning - - alert: ianktest - expr: node_systemd_version >= 300 - labels: - severity: critical - annotations: - description: '{{ $labels.instance }} ianktest.' - summary: Instance {{ $labels.instance }} - ianktest - - alert: ianktest - expr: node_systemd_version >= 300 - labels: - severity: critical - annotations: - description: '{{ $labels.instance }} ianktest.' - summary: Instance {{ $labels.instance }} - ianktest + description: '{{ $labels.instance }} mailtest-check' + summary: '{{ $labels.instance }} mailtest-check' - - alert: ianktest + - alert: mailtest-check expr: |- - time() - mailtest_check_last_usec > 60 * 8 + # 42 mins: enough for a 30 min queue run plus 12 + time() - mailtest_check_last_usec > 60 * 42 labels: - severity: critical + severity: prod annotations: - description: '{{ $labels.instance }} mailtest' - summary: Instance {{ $labels.instance }} - ianktest + description: '{{ $labels.instance }} mailtest-check' + summary: '{{ $labels.instance }} mailtest-check' + + + + # - alert: NodeFilesystemAlmostOutOfSpace + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available space left. + # summary: Filesystem has less than 5% space left. + # expr: |- + # ( + # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: warning + # - alert: NodeFilesystemAlmostOutOfSpace + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available space left. + # summary: Filesystem has less than 3% space left. 
+ # expr: |- + # ( + # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: critical + # - alert: NodeFilesystemFilesFillingUp + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available inodes left and is filling up. + # summary: Filesystem is predicted to run out of inodes within the next 24 hours. + # expr: |- + # ( + # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 + # and + # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: warning + # - alert: NodeFilesystemFilesFillingUp + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + # summary: Filesystem is predicted to run out of inodes within the next 4 hours. + # expr: |- + # ( + # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 + # and + # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: critical + # - alert: NodeFilesystemAlmostOutOfFiles + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available inodes left. + # summary: Filesystem has less than 5% inodes left. 
+ # expr: |- + # ( + # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: warning + # - alert: NodeFilesystemAlmostOutOfFiles + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available inodes left. + # summary: Filesystem has less than 3% inodes left. + # expr: |- + # ( + # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: critical + # - alert: NodeNetworkReceiveErrs + # annotations: + # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered + # {{ printf "%.0f" $value }} receive errors in the last two minutes.' + # summary: Network interface is reporting many receive errors. + # expr: |- + # increase(node_network_receive_errs_total[2m]) > 10 + # for: 1h + # labels: + # severity: warning + # - alert: NodeNetworkTransmitErrs + # annotations: + # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered + # {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + # summary: Network interface is reporting many transmit errors. 
+ # expr: |- + # increase(node_network_transmit_errs_total[2m]) > 10 + # for: 1h + # labels: + # severity: warning + # - alert: NodeHighNumberConntrackEntriesUsed + # annotations: + # description: '{{ $value | humanizePercentage }} of conntrack entries are used' + # summary: Number of conntrack are getting close to the limit + # expr: |- + # (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + # labels: + # severity: warning + # - alert: NodeClockSkewDetected + # annotations: + # message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure + # NTP is configured correctly on this host. + # summary: Clock skew detected. + # expr: |- + # ( + # node_timex_offset_seconds > 0.05 + # and + # deriv(node_timex_offset_seconds[5m]) >= 0 + # ) + # or + # ( + # node_timex_offset_seconds < -0.05 + # and + # deriv(node_timex_offset_seconds[5m]) <= 0 + # ) + # for: 10m + # labels: + # severity: warning + # - alert: NodeClockNotSynchronising + # annotations: + # message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured + # on this host. + # summary: Clock not synchronising. + # expr: |- + # min_over_time(node_timex_sync_status[5m]) == 0 + # for: 10m + # labels: + # severity: warning + # - alert: ianktest + # expr: node_systemd_version >= 300 + # labels: + # severity: critical + # annotations: + # description: '{{ $labels.instance }} ianktest.' + # summary: Instance {{ $labels.instance }} - ianktest diff --git a/filesystem/etc/schroot/desktop/copyfiles b/filesystem/etc/schroot/desktop/copyfiles new file mode 100644 index 0000000..3ac2289 --- /dev/null +++ b/filesystem/etc/schroot/desktop/copyfiles @@ -0,0 +1,9 @@ +# Files to copy into the chroot from the host system. 
+# +# + +# ian commented out for this error +#E: 20copyfiles: cp: '/etc/resolv.conf' and '/var/run/schroot/mount/impish-59f88d2b-b06e-4413-9f1e-33ad4d1af6e4/etc/resolv.conf' are the same file +#E: impish-59f88d2b-b06e-4413-9f1e-33ad4d1af6e4: Chroot setup failed: stage=setup-start + +#/etc/resolv.conf diff --git a/filesystem/etc/systemd/system/btrfsmaintstop.timer b/filesystem/etc/systemd/system/btrfsmaintstop.timer deleted file mode 100644 index eebfc94..0000000 --- a/filesystem/etc/systemd/system/btrfsmaintstop.timer +++ /dev/null @@ -1,11 +0,0 @@ -[Unit] -Description=btrfsmaintstop - -[Timer] -# for initial run. required. -OnActiveSec=10 -# for subsequent runs. 
-OnUnitInactiveSec=30 - -[Install] -WantedBy=timers.target diff --git a/filesystem/etc/systemd/system/epanicclean.service b/filesystem/etc/systemd/system/epanicclean.service index d7f2231..bc79520 100644 --- a/filesystem/etc/systemd/system/epanicclean.service +++ b/filesystem/etc/systemd/system/epanicclean.service @@ -1,7 +1,13 @@ [Unit] Description=epanic-clean -After=multi-user.target +After=local-fs.target +StartLimitIntervalSec=0 [Service] -Type=oneshot +Type=simple ExecStart=/usr/local/bin/sysd-mail-once -3 epanic-clean /usr/local/bin/epanic-clean +Restart=always +RestartSec=600 + +[Install] +WantedBy=graphical.target diff --git a/filesystem/etc/systemd/system/epanicclean.timer b/filesystem/etc/systemd/system/epanicclean.timer deleted file mode 100644 index c8d7d39..0000000 --- a/filesystem/etc/systemd/system/epanicclean.timer +++ /dev/null @@ -1,11 +0,0 @@ -[Unit] -Description=epanic-clean - -[Timer] -# for initial run. required. -OnActiveSec=10 -# for subsequent runs. -OnUnitInactiveSec=30 - -[Install] -WantedBy=timers.target diff --git a/filesystem/etc/systemd/system/systemstatus.service b/filesystem/etc/systemd/system/systemstatus.service index 725df34..e21a4a3 100644 --- a/filesystem/etc/systemd/system/systemstatus.service +++ b/filesystem/etc/systemd/system/systemstatus.service @@ -1,12 +1,19 @@ [Unit] Description=systemstatus -After=multi-user.target +StartLimitIntervalSec=0 +After=local-fs.target [Service] -Type=oneshot +Type=simple Environment=XDG_RUNTIME_DIR=/run/user/1000 ExecStart=/usr/local/bin/sysd-mail-once -3 systemstatus /usr/local/bin/system-status IOSchedulingClass=idle CPUSchedulingPolicy=idle User=iank Group=iank +Restart=always +RestartSec=600 + + +[Install] +WantedBy=graphical.target diff --git a/filesystem/etc/systemd/system/systemstatus.timer b/filesystem/etc/systemd/system/systemstatus.timer deleted file mode 100644 index 80e2f74..0000000 --- a/filesystem/etc/systemd/system/systemstatus.timer +++ /dev/null @@ -1,11 +0,0 @@ -[Unit] 
-Description=systemstatus - -[Timer] -# for initial run. required. -OnActiveSec=10 -# for subsequent runs. -OnUnitInactiveSec=20 - -[Install] -WantedBy=timers.target diff --git a/install-my-scripts b/install-my-scripts index 67a576a..5833373 100755 --- a/install-my-scripts +++ b/install-my-scripts @@ -36,11 +36,9 @@ x="$(readlink -f -- "${BASH_SOURCE[0]}")"; cd ${x%/*} # directory of this file # changed, so that should fix it. /a/bin/log-quiet/setup rsync -t --chmod=755 --chown=root:root switch-mail-host btrbk-run mount-latest-subvol \ - check-subvol-stale system-status myi3status mailtest-check \ - epanic-clean mailbindwatchdog \ + check-subvol-stale myi3status mailtest-check \ + mailbindwatchdog \ /a/bin/log-quiet/sysd-mail-once hssh \ - btrfsmaint \ - dynamic-ip-update \ check-mailq \ unsaved-buffers.el \ mail-backup-clean \ @@ -48,3 +46,31 @@ rsync -t --chmod=755 --chown=root:root switch-mail-host btrbk-run mount-latest-s ip6tables-exim \ /usr/local/bin rsync -t --chmod=755 --chown=root:root /a/bin/errhandle/err /usr/local/lib + +cmd=( rsync -aiSAX --chown=root:root --chmod=g-s + epanic-clean + system-status + btrfsmaint + dynamic-ip-update /usr/local/bin + ) + +sre() { + service=$1 + if [[ $(systemctl is-active $1.service ||:) != inactive ]]; then + systemctl restart $service + fi + +} + +while read -r line; do + file="${line:12}" + echo $file + case $file in + btrfsmaint) + sre btrfsmaintstop + ;; + *) + sre ${file//-/} + ;; + esac +done < <("${cmd[@]}") diff --git a/mail-cert-cron b/mail-cert-cron index 5b63b9a..cee7568 100755 --- a/mail-cert-cron +++ b/mail-cert-cron @@ -20,7 +20,7 @@ case $HOSTNAME in $MAIL_HOST|bk) local_mx=mail.iankelling.org # ||: is to allow for temporary connection issues. - rsync "${opt[@]}" -ogtL --chown=root:Debian-exim --chmod=640 \ + rsync ${opt[@]} -ogtL --chown=root:Debian-exim --chmod=640 \ root@li.iankelling.org:/etc/letsencrypt/live/mail.iankelling.org/{fullchain.pem,privkey.pem} /etc/exim4 ||: if ! 
openssl x509 -checkend $(( 60 * 60 * 24 * 3 )) -noout -in /etc/exim4/fullchain.pem; then echo "$0: error!: cert rsync failed and it will expire in less than 3 days" diff --git a/mail-setup b/mail-setup index aca7d8d..71b086b 100755 --- a/mail-setup +++ b/mail-setup @@ -169,9 +169,10 @@ fi # background: dovecot does not yet have ocsp stapling support # reference: https://community.letsencrypt.org/t/simple-guide-using-lets-encrypt-ssl-certs-with-dovecot/2921 # -# for phone, k9mail, same thing but username alerts, pass in ivy-pass. +# for phone, k9mail, fdroid, same thing but username alerts, pass in ivy-pass. # also, bk.b8.nz for secondary alerts, username is iank. same alerts pass. -# fetching mail settings: folder poll frequency 10 minutes +# fetching mail settings: folder poll frequency 10 minutes. +# account settings, fetching mail, push folders: All. Then disable the persistent notification. ####### @@ -2996,7 +2997,7 @@ if $reload; then m systemctl daemon-reload fi -m systemctl --now enable epanicclean.timer +m systemctl --now enable epanicclean case $HOSTNAME in je) @@ -3037,7 +3038,7 @@ case $HOSTNAME in fi if ! 
systemctl is-active clamav-daemon >/dev/null; then m systemctl --now enable clamav-daemon - out=$(rsync -aiSAX --chown=root:root --chmod=g-s /a/bin/ds/filesystem/etc/systemd/system/epanicclean.{timer,service} /etc/systemd/system) + out=$(rsync -aiSAX --chown=root:root --chmod=g-s /a/bin/ds/filesystem/etc/systemd/system/epanicclean.service /etc/systemd/system) if [[ $out ]]; then reload=true fi @@ -3104,16 +3105,13 @@ case $HOSTNAME in cat >/etc/cron.d/mailtest <&2; } if [[ $1 ]]; then new_host=$1 if [[ $new_host == localhost ]]; then + mailhost_p=1 new_host=$HOSTNAME + else + mailhost_p=0 + fi + + # https://www.robustperception.io/how-to-have-labels-for-machine-roles + dir=/var/lib/prometheus/node-exporter + if [[ -e $dir ]]; then + cat > $dir/mailhost.prom <&2' ERR [[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@" -for n in buster; do +for n in bullseye; do if [[ -e /etc/schroot/chroot.d/$n.conf ]]; then cd / + schroot -c $n -- apt-get -y update schroot -c $n -- apt-get -y dist-upgrade --purge --auto-remove fi done diff --git a/system-status b/system-status index 051de62..28c0035 100644 --- a/system-status +++ b/system-status @@ -2,8 +2,8 @@ # Copyright (C) 2019 Ian Kelling # SPDX-License-Identifier: AGPL-3.0-or-later -# usage: runs 4 times every 15 seconds unless any args are passed, or we -# are on battery power, then just runs once. +# usage: runs once every 15 seconds unless any args are passed, or we +# then just runs once. On battery power, run once per minute. if [ -z "$BASH_VERSION" ]; then echo "error: shell is not bash" >&2; exit 1; fi @@ -33,6 +33,11 @@ lo() { /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost } +loday() { + /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylerts@iankelling.org +} + + write-status() { chars=("${first_chars[@]}") @@ -83,19 +88,26 @@ write-status() { if [[ $(find /var/mail -type f \! 
-empty -print -quit) ]]; then var_mail_msg="message in /var/mail" fi - lo -1 var_mail $var_mail_msg + loday -1 var_mail $var_mail_msg glob=(/m/md/bounces/new/*) if [[ -e ${glob[0]} ]]; then chars+=("BOUNCE") bouncemsg="message in /m/md/bounces/new" fi - lo -1 bounce $bouncemsg + loday -1 bounce $bouncemsg # emails without the S (seen) flag. this only checks the last flag, # but its good enough for me. glob=(/m/md/alerts/{new,cur}/!(*,S)) if [[ -e ${glob[0]} ]]; then chars+=("A") fi + + glob=(/m/md/daylerts/{new,cur}/!(*,S)) + if [[ -e ${glob[0]} ]]; then + chars+=("L") + fi + + tmp=(/var/local/cron-errors/mailtest-check*) if (( ${#tmp[@]} )); then chars+=("MAILPING") @@ -105,8 +117,9 @@ write-status() { chars+=("SPAMD") fi - # early in install process, we dont have permission yet for exiqgrep - qlen=$(/usr/sbin/exiqgrep -o 600 -c -b | awk '{print $1}') ||: + # early in install process, we dont have permission yet for exiqgrep. + # 1100 helps allow for system restarts + qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||: if ((qlen)); then qmsg="queue length $qlen" chars+=("q $qlen") @@ -115,7 +128,7 @@ write-status() { # No point in emailing about the mailq on a host where we don't # check email. $MAIL_HOST|bk) - lo -120 qlen $qmsg + loday -120 qlen $qmsg ;; esac @@ -213,7 +226,7 @@ write-status() { chars+=("BTRBK.TIMER") bbkmsg="btrbk.timer not enabled" fi - lo -960 btrbk.timer $bbkmsg + lo -48 btrbk.timer $bbkmsg ## check if last snapshot was within an hour vol=o @@ -258,10 +271,6 @@ write-status() { # use this if we want to do something just once per minute first_chars=() -power=true -if [[ -e /sys/class/power_supply/AC/online && $(