+# other rules to consider:
+# filesystem, network, ntp rules:
+# https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
+# on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
+#
+
groups:
-- name: standard alerts
+- name: standard
rules:
- alert: mailtest-check
expr: |-
severity: day
annotations:
description: '{{ $labels.instance }} mailtest-check'
- summary: {{ $labels.instance }} mailtest-check
+ summary: '{{ $labels.instance }} mailtest-check'
+ # 42 mins: enough for a 30 min queue run plus 12
- alert: mailtest-check
expr: |-
- # 42 mins: enough for a 30 min queue run plus 12
time() - mailtest_check_last_usec > 60 * 42
labels:
severity: prod
annotations:
description: '{{ $labels.instance }} mailtest-check'
- summary: {{ $labels.instance }} mailtest-check
+ summary: '{{ $labels.instance }} mailtest-check'
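+  # hour() and minute() in PromQL are evaluated in UTC; 18:00 UTC is 1pm US
+  # Eastern Standard Time, which presumably explains the alert name.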
+ - alert: 1pmtest
+ expr: hour() == 18 and minute() < 5
+ for: 0m
+ labels:
+ severity: daytest
+ annotations:
+ summary: Prometheus daily test alert (instance {{ $labels.instance }})
+ description: "Prometheus daily test alert if no other alerts. It
+ is an end to end test.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- # - alert: NodeFilesystemAlmostOutOfSpace
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available space left.
- # summary: Filesystem has less than 5% space left.
- # expr: |-
- # (
- # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeFilesystemAlmostOutOfSpace
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available space left.
- # summary: Filesystem has less than 3% space left.
- # expr: |-
- # (
- # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: critical
- # - alert: NodeFilesystemFilesFillingUp
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available inodes left and is filling up.
- # summary: Filesystem is predicted to run out of inodes within the next 24 hours.
- # expr: |-
- # (
- # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
- # and
- # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeFilesystemFilesFillingUp
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
- # summary: Filesystem is predicted to run out of inodes within the next 4 hours.
- # expr: |-
- # (
- # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
- # and
- # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: critical
- # - alert: NodeFilesystemAlmostOutOfFiles
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available inodes left.
- # summary: Filesystem has less than 5% inodes left.
- # expr: |-
- # (
- # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeFilesystemAlmostOutOfFiles
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available inodes left.
- # summary: Filesystem has less than 3% inodes left.
- # expr: |-
- # (
- # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: critical
- # - alert: NodeNetworkReceiveErrs
- # annotations:
- # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
- # {{ printf "%.0f" $value }} receive errors in the last two minutes.'
- # summary: Network interface is reporting many receive errors.
- # expr: |-
- # increase(node_network_receive_errs_total[2m]) > 10
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeNetworkTransmitErrs
- # annotations:
- # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
- # {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
- # summary: Network interface is reporting many transmit errors.
- # expr: |-
- # increase(node_network_transmit_errs_total[2m]) > 10
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeHighNumberConntrackEntriesUsed
- # annotations:
- # description: '{{ $value | humanizePercentage }} of conntrack entries are used'
- # summary: Number of conntrack are getting close to the limit
- # expr: |-
- # (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+# https://awesome-prometheus-alerts.grep.to/rules
+
+
+# todo: we should probably group the prometheus alerts that indicate a
+# host-local problem. e.g., set a label alert-group: local-prom, then make a
+# receiver that groups by that label when it equals local-prom (see the
+# sketch below).
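+#
+# A minimal sketch of that routing for alertmanager.yml (not applied here;
+# the receiver name is made up). Note that prometheus label names cannot
+# contain "-", so the label would need to be spelled e.g. alert_group:
+#   route:
+#     routes:
+#       - match:
+#           alert_group: local-prom
+#         group_by: [alert_group, alertname]
+#         receiver: local-prom-email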
+
+- name: awesome prometheus alerts
+ rules:
+
+ - alert: PrometheusJobMissing
+ expr: absent(up{job="prometheus"})
+ for: 30m
+ labels:
+ severity: day
+ annotations:
+ summary: Prometheus job missing (instance {{ $labels.instance }})
+ description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTargetMissing
+ expr: up == 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus target missing (instance {{ $labels.instance }})
+ description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  # todo: this should suppress the above alert (one possible inhibit rule
+  # is sketched after this commented-out block)
+ # - alert: PrometheusAllTargetsMissing
+ # expr: count by (job) (up) == 0
+ # for: 30m
# labels:
- # severity: warning
- # - alert: NodeClockSkewDetected
+ # severity: day
+ # alert-group: local-prom
# annotations:
- # message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
- # NTP is configured correctly on this host.
- # summary: Clock skew detected.
- # expr: |-
- # (
- # node_timex_offset_seconds > 0.05
- # and
- # deriv(node_timex_offset_seconds[5m]) >= 0
- # )
- # or
- # (
- # node_timex_offset_seconds < -0.05
- # and
- # deriv(node_timex_offset_seconds[5m]) <= 0
- # )
- # for: 10m
+ # summary: Prometheus all targets missing (instance {{ $labels.instance }})
+  #     description: "A Prometheus job no longer has any living targets.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
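+  # A sketch of one way to do that suppression, for alertmanager.yml (assumes
+  # both alerts keep their job label; not part of the current config):
+  #   inhibit_rules:
+  #     - source_match:
+  #         alertname: PrometheusAllTargetsMissing
+  #       target_match:
+  #         alertname: PrometheusTargetMissing
+  #       equal: [job]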
+
+ - alert: PrometheusConfigurationReloadFailure
+ expr: prometheus_config_last_reload_successful != 1
+ for: 30m
+ labels:
+ severity: day
+ annotations:
+ summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+ description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  # I have an out-of-band alert to make sure prometheus is up. This rule
+  # looks like it would generate false positives. todo: think through
+  # what valid crash-loop detection would look like (one possible shape
+  # is sketched after this commented-out rule).
+ # - alert: PrometheusTooManyRestarts
+ # expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 10
+ # for: 0m
# labels:
# severity: warning
- # - alert: NodeClockNotSynchronising
# annotations:
- # message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
- # on this host.
- # summary: Clock not synchronising.
- # expr: |-
- # min_over_time(node_timex_sync_status[5m]) == 0
- # for: 10m
- # labels:
- # severity: warning
- # - alert: ianktest
- # expr: node_systemd_version >= 300
+ # summary: Prometheus too many restarts (instance {{ $labels.instance }})
+  #     description: "Prometheus has restarted more than 10 times in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
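+  # A possible shape for real crash-loop detection (a sketch, not enabled):
+  # require the restarts to keep happening for a while, e.g.
+  #   expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+  #   for: 30m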
+
+ - alert: PrometheusAlertmanagerJobMissing
+ expr: absent(up{job="alertmanager"})
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
+ description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusAlertmanagerConfigurationReloadFailure
+ expr: alertmanager_config_last_reload_successful != 1
+ for: 30m
+ labels:
+ severity: day
+ annotations:
+ summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+ description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusNotConnectedToAlertmanager
+ expr: prometheus_notifications_alertmanagers_discovered < 1
+ for: 30m
+ labels:
+ severity: day
+ annotations:
+ summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+ description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusRuleEvaluationFailures
+ expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTemplateTextExpansionFailures
+ expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusRuleEvaluationSlow
+ expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
+ for: 5m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+ description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusNotificationsBacklog
+ expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
+ for: 0m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+ description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusAlertmanagerNotificationFailing
+ expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+ description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  # file_sd doesn't count as service discovery, so 0 is expected.
+ # - alert: PrometheusTargetEmpty
+ # expr: prometheus_sd_discovered_targets == 0
+ # for: 30m
# labels:
- # severity: critical
+ # severity: day
# annotations:
- # description: '{{ $labels.instance }} ianktest.'
- # summary: Instance {{ $labels.instance }} - ianktest
+ # summary: Prometheus target empty (instance {{ $labels.instance }})
+ # description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTargetScrapingSlow
+ expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+ description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusLargeScrape
+ expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus large scrape (instance {{ $labels.instance }})
+ description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTargetScrapeDuplicate
+ expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+ description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTsdbCheckpointCreationFailures
+ expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTsdbCheckpointDeletionFailures
+ expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTsdbCompactionsFailed
+ expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTsdbHeadTruncationsFailed
+ expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTsdbReloadFailures
+ expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTsdbWalCorruptions
+ expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PrometheusTsdbWalTruncationsFailed
+ expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+ description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
/usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylerts@iankelling.org
}
-
+# todo: consider migrating some of these alerts into prometheus (one
+# possible mechanism is sketched below)
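+# One mechanism for that: the node_exporter textfile collector, i.e. have
+# this script write a .prom file and alert on the metric in prometheus. A
+# sketch (the directory and metric name are assumptions, not current config),
+# using e.g. the count of non-running services computed below:
+#   f=/var/lib/prometheus/node-exporter/systemstatus.prom
+#   printf 'systemstatus_nonrunning_services %s\n' "${#bads[@]}" >$f.tmp
+#   mv $f.tmp $f  # atomic rename so node_exporter never reads a partial file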
write-status() {
chars=("${first_chars[@]}")
+ services=(
+ epanicclean
+ systemstatus
+ btrfsmaintstop
+ dynamicipupdate
+ )
+ bads=()
+ if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then
+ for s in ${services[@]}; do
+ if [[ $(systemctl show -p SubState --value $s) != running ]]; then
+ bads+=($s)
+ fi
+ done
+ chars+=(MYSERS)
+
+ fi
+ lo -240 mysers ${bads[*]}
+
+ services=(
+ prometheus-node-exporter
+ prometheus-alertmanager
+ prometheus
+ )
+ case $HOSTNAME in
+ kd)
+ bads=()
+ if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then
+ for s in ${services[@]}; do
+ if [[ $(systemctl show -p SubState --value $s) != running ]]; then
+ bads+=($s)
+ fi
+ done
+ chars+=(PROM)
+ fi
+ lo -240 prom ${bads[*]}
+ ;;
+ esac
+
  # clock us out in timetrap if we are idle too long
if [[ -e /p/.timetrap.db ]]; then
export DISPLAY=:0
glob=(/nocow/btrfs-stale/*)
if [[ -e ${glob[0]} ]]; then
- chars+=("STALE")
+ chars+=(STALE)
fi
+ var_mail_msg=
if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then
var_mail_msg="message in /var/mail"
fi
loday -1 var_mail $var_mail_msg
+
+ bouncemsg=
glob=(/m/md/bounces/new/*)
if [[ -e ${glob[0]} ]]; then
- chars+=("BOUNCE")
+ chars+=(BOUNCE)
bouncemsg="message in /m/md/bounces/new"
fi
loday -1 bounce $bouncemsg
  # but it's good enough for me.
glob=(/m/md/alerts/{new,cur}/!(*,S))
if [[ -e ${glob[0]} ]]; then
- chars+=("A")
+ chars+=(A)
fi
glob=(/m/md/daylerts/{new,cur}/!(*,S))
if [[ -e ${glob[0]} ]]; then
- chars+=("L")
+ chars+=(DAY)
fi
tmp=(/var/local/cron-errors/mailtest-check*)
if (( ${#tmp[@]} )); then
- chars+=("MAILPING")
+ chars+=(MAILPING)
fi
tmp=(/var/local/cron-errors/mailtest-slow*)
if (( ${#tmp[@]} )); then
- chars+=("SPAMD")
+ chars+=(SPAMD)
fi
  # early in the install process, we don't have permission yet for exiqgrep.
# 1100 helps allow for system restarts
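+  # exiqgrep -c prints a count of matching messages (awk keeps just the
+  # number), -o N restricts the match to messages older than N seconds, and
+  # -b is brief output.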
qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||:
+ qmsg=
if ((qlen)); then
qmsg="queue length $qlen"
chars+=("q $qlen")
  # these conditions are so we don't have an overly verbose prompt
if $begin && $end; then
- chars+=("D")
+ chars+=(D)
elif $begin; then
- chars+=("DB")
+ chars+=(DB)
elif $end; then
- chars+=("DE")
+ chars+=(DE)
else
f=~/.local/conflink
# shellcheck disable=SC2043
  # Just because I forget a lot: -mmin -NUM means files modified <= NUM minutes ago
if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then
v conflink newer filesystem files
- chars+=("CONFLINK")
+ chars+=(CONFLINK)
break
fi
fi
if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then
v conflink: newer files checked in to git
- chars+=("CONFLINK")
+ chars+=(CONFLINK)
break
fi
done < <(git ls-files -o --exclude-standard)
if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then
v conflink: untracked in $d
- chars+=("CONFLINK")
+ chars+=(CONFLINK)
break
fi
done
fi
if [[ ! -e $f || $(<$f) != 0 ]]; then
v conflink: last run not found or failed
- chars+=("CONFLINK")
+ chars+=(CONFLINK)
break
fi
done
fi
-# if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
+ # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then
if [[ -s /var/log/exim4/paniclog ]]; then
chars+=("PANIC!")
# leave it up to epanic-clean to send email notification
if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
bbkmsg=
if [[ $(systemctl is-active btrbk.timer) != active ]]; then
- chars+=("BTRBK.TIMER")
- bbkmsg="btrbk.timer not enabled"
+ chars+=(BTRBK.TIMER)
+ bbkmsg="not enabled"
fi
- lo -48 btrbk.timer $bbkmsg
+ lo -480 btrbk.timer $bbkmsg
## check if last snapshot was within an hour
vol=o
maxtime=$t
fi
done
+ snapshotmsg=
if (( maxtime < now - 4*60*60 )); then
- chars+=("OLD-SNAP")
+ chars+=(OLD-SNAP)
snapshotmsg="/o snapshot older than 4 hours"
fi
lo -1 old-snapshot $snapshotmsg
fi
main-loop() {
-while true; do
- power=true
- if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
- power=false
- fi
- wait=15
- if ! $power; then
- wait=60
- fi
+ while true; do
+ power=true
+ if [[ -e /sys/class/power_supply/AC/online && $(</sys/class/power_supply/AC/online) == 0 ]]; then
+ power=false
+ fi
+ wait=15
+ if ! $power; then
+ wait=60
+ fi
- sleep $wait
- write-status
-done
+ sleep $wait
+ write-status
+ done
}
# ensure our long operations are on one line so we are not prone to errors