fixes and qd for better source subvol error repo

[distro-setup] / filesystem / etc / prometheus / rules / iank.yml
diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml

index 651eb00de164a8a41b84fe53d6429a5821f8f3d9..47012ccbcb74b20f517f825d1cc327fd70fe2c0d 100644 (file)
--- a/filesystem/etc/prometheus/rules/iank.yml
+++ b/filesystem/etc/prometheus/rules/iank.yml
@@ -182,7 +182,7 @@ groups:
      labels:
        severity: day
      annotations:
-      summary: 'jr -u mailtest-check -e'
+      summary: 'jr -u mailtest-check -e -n 10000'
  
    - alert: mailtest_check_missing_dnswl
      expr: |-
@@ -191,7 +191,7 @@ groups:
      labels:
        severity: day
      annotations:
-      summary: 'jr -u mailtest-check -e'
+      summary: 'jr -u mailtest-check -e -n 10000'
  
    # We expect to be getting metrics, if we come up and notice we have
    # any missing in the past, and it wasn't from a reboot, and we haven't
@@ -199,17 +199,20 @@ groups:
    # 19 for 19 minutes, but I make it 18 just to give a bit of slack.
    - alert: historical_missing_metric
      expr: |-
-      count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17
+      count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kd"} <= 60 * 17
      labels:
        severity: warn
  
-  - alert: 1pmtest
-    expr: hour() == 17 and minute() < 5
+  # 10 am Friday (14:00 UTC), but start 1 minute early so it is closer to
+  # actually firing at 10 am.
+  - alert: dead_man_test
+    expr: |-
+      ( hour() == 13 and minute() >= 59 or hour() == 14 and minute() < 3 )  and day_of_week() == 5
      for: 0m
      labels:
        severity: daytest
      annotations:
-      summary: Prometheus daily test alert
+      summary: Prometheus weekly test alert
  
  
  #### Inhibit notes ####
@@ -271,13 +274,16 @@ groups:
        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
  
    - alert: lowpri_target_down
-    expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
+    expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100|kwwg:9101"} == 0
      for: 30m
      labels:
        severity: warn
      annotations:
        summary: Target down for 30m
  
+  # Note: PrometheusAllTargetsMissing is intentionally omitted because it
+  # is redundant with the alert above.
+
    - alert: target_down
      expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
      for: 5m
@@ -294,10 +300,6 @@ groups:
      annotations:
        summary: MAIL_HOST likely down for 5m
  
-
-# note, the next upstream metric is intentionally omitted:
-# https://github.com/samber/awesome-prometheus-alerts/issues/283
-
    - alert: PrometheusConfigurationReloadFailure
      expr: prometheus_config_last_reload_successful != 1
      for: 30m