From: Ian Kelling
Date: Tue, 8 Mar 2022 03:04:19 +0000 (-0500)
Subject: various fixes
X-Git-Url: https://iankelling.org/git/?p=distro-setup;a=commitdiff_plain;h=d7551546ac323c5d4b49370c885646bcf96e959f

various fixes
---

diff --git a/brc2 b/brc2
index ddfe816..6b97783 100644
--- a/brc2
+++ b/brc2
@@ -381,6 +381,22 @@ bigclock() {
 nnn() { /a/opt/nnn -H "$@"; }
 
+locat() { # log-once cat
+  local files
+  ngset
+  files=(/var/local/cron-errors/* /home/iank/cron-errors/* /sysd-mail-once-state/*)
+  case ${#files[@]} in
+    0) : ;;
+    1)
+      echo ${files[0]}
+      head ${files[0]}
+      ;;
+    *)
+      head ${files[@]}
+      ;;
+  esac
+  ngreset
+}
 
 # duplicated somewhat below.
 jrun() { # journal run. run args, log to journal, tail and grep the journal.
diff --git a/check-stale-alerts b/check-stale-alerts
index a6d8c82..1ecb58b 100755
--- a/check-stale-alerts
+++ b/check-stale-alerts
@@ -1,25 +1,19 @@
 #!/bin/bash
 
-time_arg="-ctime +4"
-case $1 in
-  now)
-    time_arg=
-    ;;
-esac
 if [[ ! -e /dev/shm/iank-status ]]; then
   exit 0
 fi
 eval $(< /dev/shm/iank-status)
-if [[ $HOSTNAME != "$MAIL_HOST" ]]; then
-  exit 0
-fi
-out=$(find /var/local/cron-errors /home/iank/cron-errors /sysd-mail-once-state -type f -ctime +4)
+out=$(find /var/local/cron-errors /home/iank/cron-errors /sysd-mail-once-state -type f)
 if [[ $out ]]; then
   echo HOSTNAME: $HOSTNAME
   printf "%s\n" "$out"
 fi
+if [[ $HOSTNAME != "$MAIL_HOST" ]]; then
+  exit 0
+fi
 for h in {li,bk,je}.b8.nz; do
   out=$(ssh $h find /m/md/bounces/new /var/local/cron-errors /home/iank/cron-errors /sysd-mail-once-state -type f)
   if [[ $out ]]; then
diff --git a/conflink b/conflink
index d7323fd..d3a7b2a 100755
--- a/conflink
+++ b/conflink
@@ -90,6 +90,11 @@ common-file-setup() {
       while read -r line; do
         file="${line:12}"
         case $file in
+          etc/prometheus/rules/iank.yml)
+            case $HOSTNAME in
+              kd) m s systemctl reload prometheus ;;
+            esac
+            ;;
           etc/systemd/system/*)
             reload_systemd=true
             ;;
diff --git a/distro-end b/distro-end
index ff974f9..407987a 100755
--- a/distro-end
+++ b/distro-end
@@ -1881,7 +1881,6 @@ case $HOSTNAME in
     # either use iptables or, in
     # /etc/default/prometheus-node-exporter
     # listen on the wireguard interface
-    ;;
   li|je|bk)
     # ex for exporter
     web-conf -p 9101 -f 9100 - apache2 ${HOSTNAME}ex.b8.nz <<'EOF'
@@ -1896,7 +1895,7 @@ Require valid-user
 EOF
     ;;
   *)
-    wgip=$(sudo sed -rn 's,^ *Address *= *([^/]+).*,\1,p' /etc/wireguard/wghole.conf)
+    wgip=$(command sudo sed -rn 's,^ *Address *= *([^/]+).*,\1,p' /etc/wireguard/wghole.conf)
     web-conf -i -a $wgip -p 9101 -f 9100 - apache2 ${HOSTNAME}wg.b8.nz <<'EOF'
 AuthType Basic
diff --git a/filesystem/etc/default/prometheus b/filesystem/etc/default/prometheus
index 63d1ee3..e733d59 100644
--- a/filesystem/etc/default/prometheus
+++ b/filesystem/etc/default/prometheus
@@ -2,7 +2,7 @@
 # Set the command-line arguments to pass to the server.
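# Note: on Debian-style packaging, this file is typically read by the
# prometheus systemd unit as an EnvironmentFile, with $ARGS appended to the
# daemon command line, roughly (the exact ExecStart is an assumption about
# the packaged unit, not something this commit changes):
#
#   EnvironmentFile=/etc/default/prometheus
#   ExecStart=/usr/bin/prometheus $ARGS
#
# so a flag added here, like --web.external-url below, takes effect on the
# next "systemctl restart prometheus".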
-ARGS="--web.listen-address=127.0.0.1:9090"
+ARGS="--web.listen-address=127.0.0.1:9090 --web.external-url=https://i.b8.nz:9091"
diff --git a/filesystem/etc/prometheus/file_sd/node.yml b/filesystem/etc/prometheus/file_sd/node.yml
index 8372ddc..e58d520 100644
--- a/filesystem/etc/prometheus/file_sd/node.yml
+++ b/filesystem/etc/prometheus/file_sd/node.yml
@@ -1,10 +1,10 @@
 - targets:
   - kdwg:9101
-  # - sywg:9101
+  - sywg:9101
   # - bk:9101
   # - je:9101
   # - li:9101
   # - frodo:9101
   # - kwwg:9101
   # - x3wg:9101
-  # - x2wg:9101
+  - x2wg:9101
diff --git a/filesystem/etc/prometheus/prometheus.yml b/filesystem/etc/prometheus/prometheus.yml
index 9932335..97ac447 100644
--- a/filesystem/etc/prometheus/prometheus.yml
+++ b/filesystem/etc/prometheus/prometheus.yml
@@ -29,6 +29,9 @@ scrape_configs:
     static_configs:
       - targets: ['localhost:9090']
 
+  - job_name: 'alertmanager'
+    static_configs:
+      - targets: ['localhost:9093']
 
   - job_name: node
     basic_auth:
diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml
index 72b0701..4439c41 100644
--- a/filesystem/etc/prometheus/rules/iank.yml
+++ b/filesystem/etc/prometheus/rules/iank.yml
@@ -1,6 +1,12 @@
+# other rules to consider:
+# filesystem, network, ntp rules:
+# https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
+# on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
+#
+
 groups:
-- name: standard alerts
+- name: standard
   rules:
   - alert: mailtest-check
     expr: |-
@@ -9,170 +15,258 @@ groups:
       severity: day
     annotations:
       description: '{{ $labels.instance }} mailtest-check'
-      summary: {{ $labels.instance }} mailtest-check
+      summary: '{{ $labels.instance }} mailtest-check'
 
+  # 42 mins: enough for a 30 min queue run plus 12
   - alert: mailtest-check
     expr: |-
-      # 42 mins: enough for a 30 min queue run plus 12
       time() - mailtest_check_last_usec > 60 * 42
     labels:
       severity: prod
     annotations:
       description: '{{ $labels.instance }} mailtest-check'
-      summary: {{ $labels.instance }} mailtest-check
+      summary: '{{ $labels.instance }} mailtest-check'
 
+  - alert: 1pmtest
+    expr: hour() == 18 and minute() < 5
+    for: 0m
+    labels:
+      severity: daytest
+    annotations:
+      summary: Prometheus daily test alert (instance {{ $labels.instance }})
+      description: "Prometheus daily test alert if no other alerts. It
+        is an end to end test.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  # - alert: NodeFilesystemAlmostOutOfSpace
-  #   annotations:
-  #     description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
-  #       only {{ printf "%.2f" $value }}% available space left.
-  #     summary: Filesystem has less than 5% space left.
-  #   expr: |-
-  #     (
-  #       node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
-  #     and
-  #       node_filesystem_readonly{job="node",fstype!=""} == 0
-  #     )
-  #   for: 1h
-  #   labels:
-  #     severity: warning
-  # - alert: NodeFilesystemAlmostOutOfSpace
-  #   annotations:
-  #     description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
-  #       only {{ printf "%.2f" $value }}% available space left.
-  #     summary: Filesystem has less than 3% space left.
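# Note: after editing this rules file, the syntax can be sanity-checked with
# promtool before relying on the conflink-triggered reload, e.g.:
#
#   promtool check rules /etc/prometheus/rules/iank.yml
#   promtool check config /etc/prometheus/prometheus.yml
#
# (promtool ships with the prometheus package; the paths above are the ones
# used in this repo.)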
- # expr: |- - # ( - # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeFilesystemFilesFillingUp - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left and is filling up. - # summary: Filesystem is predicted to run out of inodes within the next 24 hours. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 - # and - # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: warning - # - alert: NodeFilesystemFilesFillingUp - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - # summary: Filesystem is predicted to run out of inodes within the next 4 hours. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 - # and - # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeFilesystemAlmostOutOfFiles - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left. - # summary: Filesystem has less than 5% inodes left. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: warning - # - alert: NodeFilesystemAlmostOutOfFiles - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left. - # summary: Filesystem has less than 3% inodes left. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeNetworkReceiveErrs - # annotations: - # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - # {{ printf "%.0f" $value }} receive errors in the last two minutes.' - # summary: Network interface is reporting many receive errors. - # expr: |- - # increase(node_network_receive_errs_total[2m]) > 10 - # for: 1h - # labels: - # severity: warning - # - alert: NodeNetworkTransmitErrs - # annotations: - # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - # {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - # summary: Network interface is reporting many transmit errors. 
- # expr: |- - # increase(node_network_transmit_errs_total[2m]) > 10 - # for: 1h - # labels: - # severity: warning - # - alert: NodeHighNumberConntrackEntriesUsed - # annotations: - # description: '{{ $value | humanizePercentage }} of conntrack entries are used' - # summary: Number of conntrack are getting close to the limit - # expr: |- - # (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 +# https://awesome-prometheus-alerts.grep.to/rules + + +# todo, we should probably group the prometheus alerts that indicate a +# host-local problem. +# eg, set a label alert-group: local-prom, then make a receiver that +# groups by it when the alert-group is local-prom. + +- name: awesome prometheus alerts + rules: + + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 30m + labels: + severity: day + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissing + expr: up == 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # todo: this should supress the above alert + # - alert: PrometheusAllTargetsMissing + # expr: count by (job) (up) == 0 + # for: 30m # labels: - # severity: warning - # - alert: NodeClockSkewDetected + # severity: day + # alert-group: local-prom # annotations: - # message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure - # NTP is configured correctly on this host. - # summary: Clock skew detected. - # expr: |- - # ( - # node_timex_offset_seconds > 0.05 - # and - # deriv(node_timex_offset_seconds[5m]) >= 0 - # ) - # or - # ( - # node_timex_offset_seconds < -0.05 - # and - # deriv(node_timex_offset_seconds[5m]) <= 0 - # ) - # for: 10m + # summary: Prometheus all targets missing (instance {{ $labels.instance }}) + # description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # I have an out of band alert to make sure prometheus is up. this + # looks like it would generate false positives. todo: think + # through what a valid crash loop detection would look like. + # - alert: PrometheusTooManyRestarts + # expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 10 + # for: 0m # labels: # severity: warning - # - alert: NodeClockNotSynchronising # annotations: - # message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured - # on this host. - # summary: Clock not synchronising. - # expr: |- - # min_over_time(node_timex_sync_status[5m]) == 0 - # for: 10m - # labels: - # severity: warning - # - alert: ianktest - # expr: node_systemd_version >= 300 + # summary: Prometheus too many restarts (instance {{ $labels.instance }}) + # description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerJobMissing + expr: absent(up{job="alertmanager"}) + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warn + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0 + for: 0m + labels: + severity: warn + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # file_sd doesnt count as service discovery, so 0 is expected. + # - alert: PrometheusTargetEmpty + # expr: prometheus_sd_discovered_targets == 0 + # for: 30m # labels: - # severity: critical + # severity: day # annotations: - # description: '{{ $labels.instance }} ianktest.' 
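# Note on the "alert-group: local-prom" todo above: a minimal alertmanager
# route for it could look roughly like the sketch below. The receiver name
# and the location of the alertmanager config are assumptions for
# illustration, not part of this commit:
#
#   route:
#     routes:
#       - match:
#           alert-group: local-prom
#         group_by: ['alert-group', 'instance']
#         receiver: local-prom-email
#
# plus the matching label on each alert that indicates a host-local problem,
# e.g.:
#
#   labels:
#     severity: day
#     alert-group: local-prom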
- # summary: Instance {{ $labels.instance }} - ianktest + # summary: Prometheus target empty (instance {{ $labels.instance }}) + # description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapeDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbHeadTruncationsFailed + expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbReloadFailures + expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus 
encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/filesystem/etc/systemd/system/btrfsmaintstop.service b/filesystem/etc/systemd/system/btrfsmaintstop.service
index 31b4a65..5e8160d 100644
--- a/filesystem/etc/systemd/system/btrfsmaintstop.service
+++ b/filesystem/etc/systemd/system/btrfsmaintstop.service
@@ -1,7 +1,13 @@
 [Unit]
 Description=btrfsmaintstop
-After=multi-user.target
+After=local-fs.target
+StartLimitIntervalSec=0
 
 [Service]
-Type=oneshot
+Type=simple
 ExecStart=/usr/local/bin/sysd-mail-once -10 btrfsmaintstop /usr/local/bin/btrfsmaint check
+Restart=always
+RestartSec=600
+
+[Install]
+WantedBy=graphical.target
diff --git a/filesystem/etc/systemd/system/dynamicipupdate.service b/filesystem/etc/systemd/system/dynamicipupdate.service
index 48c3d44..54b04f9 100644
--- a/filesystem/etc/systemd/system/dynamicipupdate.service
+++ b/filesystem/etc/systemd/system/dynamicipupdate.service
@@ -1,7 +1,13 @@
 [Unit]
 Description=dynamicipupdate
-After=multi-user.target
+After=local-fs.target
+StartLimitIntervalSec=0
 
 [Service]
-Type=oneshot
+Type=simple
 ExecStart=/usr/local/bin/sysd-mail-once -40 dynamicipupdate /usr/local/bin/dynamic-ip-update
+Restart=always
+RestartSec=600
+
+[Install]
+WantedBy=graphical.target
diff --git a/install-my-scripts b/install-my-scripts
index 5833373..cb54350 100755
--- a/install-my-scripts
+++ b/install-my-scripts
@@ -38,7 +38,7 @@ x="$(readlink -f -- "${BASH_SOURCE[0]}")"; cd ${x%/*} # directory of this file
 rsync -t --chmod=755 --chown=root:root switch-mail-host btrbk-run mount-latest-subvol \
   check-subvol-stale myi3status mailtest-check \
   mailbindwatchdog \
-  /a/bin/log-quiet/sysd-mail-once hssh \
+  /a/bin/log-quiet/sysd-mail-once \
   check-mailq \
   unsaved-buffers.el \
   mail-backup-clean \
@@ -57,20 +57,21 @@ cmd=( rsync -aiSAX --chown=root:root --chmod=g-s
 sre() {
   service=$1
   if [[ $(systemctl is-active $1.service ||:) != inactive ]]; then
-    systemctl restart $service
+    # just fire and forget. sometimes a script restart can fail, but then
+    # the auto restart mechanism makes it succeed.
+    systemctl restart $service ||: &
   fi
 }
 
 while read -r line; do
   file="${line:12}"
-  echo $file
   case $file in
     btrfsmaint)
-      sre btrfsmaintstop
+      sre btrfsmaintstop &
       ;;
     *)
-      sre ${file//-/}
+      sre ${file//-/} &
      ;;
   esac
 done < <("${cmd[@]}")
diff --git a/mailtest-check b/mailtest-check
index befdbbf..7454086 100755
--- a/mailtest-check
+++ b/mailtest-check
@@ -78,7 +78,14 @@ esac
 
 getspamdpid() {
   if [[ ! $spamdpid || ! -d /proc/$spamdpid ]]; then
-    spamdpid=$(systemctl show --property MainPID --value spamassassin | sed 's/^1$//' ||:)
+    # try twice in case we are restarting, it happens.
+    for i in 1 2; do
+      spamdpid=$(systemctl show --property MainPID --value spamassassin | sed 's/^[10]$//' ||:)
+      if [[ $spamdpid ]]; then
+        break
+      fi
+      sleep 30
+    done
   fi
 }
 getspamdpid
@@ -135,7 +142,8 @@ for folder in ${folders[@]}; do
       # servers.
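      # To make the parsing below concrete: the sed expression first strips
      # the leading "(-0.1 / 5.0 requ) " score prefix, then replaces each
      # "=<score>" (plus the trailing comma or space) with a single space, so
      # the example line shown below reduces to a plain list of test names
      # for the for loop, roughly:
      #
      #   DKIM_SIGNED DKIM_VALID DKIM_VALID_AU SPF_HELO_PASS SPF_PASS TVD_SPACE_RATIO autolearn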
      # example line that sed is parsing:
      # (-0.1 / 5.0 requ) DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,SPF_HELO_PASS=-0.001,SPF_PASS=-0.001,TVD_SPACE_RATIO=0.001 autolearn=_AUTOLEARN
-      for r in $($spamcpre sudo -u Debian-exim spamassassin -t --cf='score PYZOR_CHECK 0' <"$latest" | tail -n2 | head -n1 | sed -r 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /g'); do
+      raw_results="$($spamcpre sudo -u Debian-exim spamassassin -t --cf='score PYZOR_CHECK 0' <"$latest" | tail -n2 | head -n1 | sed -r 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /g')"
+      for r in $raw_results; do
         case $r in
           # got this in an update 2022-01. dun care
           T_SCC_BODY_TEXT_LINE|SCC_BODY_SINGLE_WORD) : ;;
diff --git a/rootsshsync b/rootsshsync
index 6b28fcb..fa36a56 100755
--- a/rootsshsync
+++ b/rootsshsync
@@ -59,8 +59,8 @@ if [[ -e $user_ssh_dir/config ]]; then
 fi
 chown -R root:root /root/.ssh
 
-# notably: installs hssh
-/a/exe/install-my-scripts
+rsync -t --chmod=755 --chown=root:root /b/ds/hssh /usr/local/bin
+
 if [[ -e /a/opt/btrbk/ssh_filter_btrbk.sh ]]; then
   install /a/opt/btrbk/ssh_filter_btrbk.sh /usr/local/bin
 fi
diff --git a/system-status b/system-status
old mode 100644
new mode 100755
index 28c0035..07c730d
--- a/system-status
+++ b/system-status
@@ -37,10 +37,48 @@ loday() {
   /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylerts@iankelling.org
 }
-
+# todo, consider migrating some of these alerts into prometheus
 write-status() {
   chars=("${first_chars[@]}")
 
+  services=(
+    epanicclean
+    systemstatus
+    btrfsmaintstop
+    dynamicipupdate
+  )
+  bads=()
+  if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then
+    for s in ${services[@]}; do
+      if [[ $(systemctl show -p SubState --value $s) != running ]]; then
+        bads+=($s)
+      fi
+    done
+    chars+=(MYSERS)
+
+  fi
+  lo -240 mysers ${bads[*]}
+
+  services=(
+    prometheus-node-exporter
+    prometheus-alertmanager
+    prometheus
+  )
+  case $HOSTNAME in
+    kd)
+      bads=()
+      if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then
+        for s in ${services[@]}; do
+          if [[ $(systemctl show -p SubState --value $s) != running ]]; then
+            bads+=($s)
+          fi
+        done
+        chars+=(PROM)
+      fi
+      lo -240 prom ${bads[*]}
+      ;;
+  esac
+
   # clock us out in timetrap if we are idle too long
   if [[ -e /p/.timetrap.db ]]; then
     export DISPLAY=:0
@@ -83,15 +121,18 @@ write-status() {
   glob=(/nocow/btrfs-stale/*)
   if [[ -e ${glob[0]} ]]; then
-    chars+=("STALE")
+    chars+=(STALE)
   fi
+  var_mail_msg=
   if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
     var_mail_msg="message in /var/mail"
   fi
   loday -1 var_mail $var_mail_msg
+
+  bouncemsg=
   glob=(/m/md/bounces/new/*)
   if [[ -e ${glob[0]} ]]; then
-    chars+=("BOUNCE")
+    chars+=(BOUNCE)
     bouncemsg="message in /m/md/bounces/new"
   fi
   loday -1 bounce $bouncemsg
@@ -99,27 +140,28 @@ write-status() {
   # but its good enough for me.
   glob=(/m/md/alerts/{new,cur}/!(*,S))
   if [[ -e ${glob[0]} ]]; then
-    chars+=("A")
+    chars+=(A)
   fi
   glob=(/m/md/daylerts/{new,cur}/!(*,S))
   if [[ -e ${glob[0]} ]]; then
-    chars+=("L")
+    chars+=(DAY)
   fi
 
   tmp=(/var/local/cron-errors/mailtest-check*)
   if (( ${#tmp[@]} )); then
-    chars+=("MAILPING")
+    chars+=(MAILPING)
   fi
   tmp=(/var/local/cron-errors/mailtest-slow*)
   if (( ${#tmp[@]} )); then
-    chars+=("SPAMD")
+    chars+=(SPAMD)
   fi
   # early in install process, we dont have permission yet for exiqgrep.
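  # For reference: in the exiqgrep call below, -o 1100 matches only queued
  # messages older than 1100 seconds, -c prints a count line (awk takes its
  # first field), and -b selects the brief output format, so qlen ends up as
  # the number of messages stuck in the exim queue for more than ~18 minutes.
  # (Flag meanings are from the exiqgrep man page; verify locally if in doubt.)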
  # 1100 helps allow for system restarts
  qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||:
+  qmsg=
  if ((qlen)); then
    qmsg="queue length $qlen"
    chars+=("q $qlen")
  fi
@@ -144,11 +186,11 @@ write-status() {
   # these conditions are so we dont have an overly verbose prompt
   if $begin && $end; then
-    chars+=("D")
+    chars+=(D)
   elif $begin; then
-    chars+=("DB")
+    chars+=(DB)
   elif $end; then
-    chars+=("DE")
+    chars+=(DE)
   else
     f=~/.local/conflink
     # shellcheck disable=SC2043
@@ -175,7 +217,7 @@ write-status() {
       # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago
       if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
         v conflink newer filesystem files
-        chars+=("CONFLINK")
+        chars+=(CONFLINK)
         break
       fi
@@ -188,7 +230,7 @@ write-status() {
       fi
       if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
         v conflink: newer files checked in to git
-        chars+=("CONFLINK")
+        chars+=(CONFLINK)
         break
       fi
@@ -198,7 +240,7 @@ write-status() {
       done < <(git ls-files -o --exclude-standard)
       if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
         v conflink: untracked in $d
-        chars+=("CONFLINK")
+        chars+=(CONFLINK)
         break
       fi
     done
@@ -207,13 +249,13 @@ write-status() {
       fi
       if [[ ! -e $f || $(<$f) != 0 ]]; then
        v conflink: last run not found or failed
-        chars+=("CONFLINK")
+        chars+=(CONFLINK)
        break
      fi
    done
  fi
-# if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
+  # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
   if [[ -s /var/log/exim4/paniclog ]]; then
     chars+=("PANIC!")
     # leave it up to epanic-clean to send email notification
@@ -223,10 +265,10 @@ write-status() {
   if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
     bbkmsg=
     if [[ $(systemctl is-active btrbk.timer) != active ]]; then
-      chars+=("BTRBK.TIMER")
-      bbkmsg="btrbk.timer not enabled"
+      chars+=(BTRBK.TIMER)
+      bbkmsg="not enabled"
     fi
-    lo -48 btrbk.timer $bbkmsg
+    lo -480 btrbk.timer $bbkmsg
 
     ## check if last snapshot was within an hour
     vol=o
@@ -254,8 +296,9 @@ write-status() {
         maxtime=$t
       fi
     done
+    snapshotmsg=
     if (( maxtime < now - 4*60*60 )); then
-      chars+=("OLD-SNAP")
+      chars+=(OLD-SNAP)
       snapshotmsg="/o snapshot older than 4 hours"
     fi
     lo -1 old-snapshot $snapshotmsg
@@ -279,19 +322,19 @@ if [[ $1 ]]; then
 fi
 
 main-loop() {
-while true; do
-  power=true
-  if [[ -e /sys/class/power_supply/AC/online && $(