X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=47012ccbcb74b20f517f825d1cc327fd70fe2c0d;hb=3f6c5b1f2c72614ee8cb4d0e4325e7beddda9c04;hp=14d4275069de0f57b81c32a469a95814d844037b;hpb=01ccff895787ca94ad37d11cb93f0440a29edd7c;p=distro-setup

diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml
index 14d4275..47012cc 100644
--- a/filesystem/etc/prometheus/rules/iank.yml
+++ b/filesystem/etc/prometheus/rules/iank.yml
@@ -182,8 +182,16 @@ groups:
     labels:
       severity: day
     annotations:
-      summary: 'jr -u mailtest-check -e'
+      summary: 'jr -u mailtest-check -e -n 10000'
 
+  - alert: mailtest_check_missing_dnswl
+    expr: |-
+      mailtest_check_missing_dnswl >= 1
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      summary: 'jr -u mailtest-check -e -n 10000'
 
   # We expect to be getting metrics, if we come up and notice we have
   # any missing in the past, and it wasn't from a reboot, and we haven't
@@ -191,17 +199,20 @@ groups:
   # 19 for 19 minutes, but I make it 18 just to give a bit of slack.
   - alert: historical_missing_metric
     expr: |-
-      count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17
+      count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kd"} <= 60 * 17
     labels:
       severity: warn
 
-  - alert: 1pmtest
-    expr: hour() == 17 and minute() < 5
+  # 10 am Friday (14:00 UTC in the expr below). Start 1 minute early so
+  # the alert actually fires closer to 10 am.
+  - alert: dead_man_test
+    expr: |-
+      ( hour() == 13 and minute() >= 59 or hour() == 14 and minute() < 3 )  and day_of_week() == 5
     for: 0m
     labels:
       severity: daytest
     annotations:
-      summary: Prometheus daily test alert
+      summary: Prometheus weekly test alert
 
 
 #### Inhibit notes ####
@@ -263,13 +274,16 @@ groups:
       description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
 
   - alert: lowpri_target_down
-    expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
+    expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100|kwwg:9101"} == 0
     for: 30m
     labels:
       severity: warn
     annotations:
       summary: Target down for 30m
 
+  # Note: PrometheusAllTargetsMissing is intentionally omitted because it
+  # is redundant with the rule above.
+
   - alert: target_down
     expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
     for: 5m
@@ -286,10 +300,6 @@ groups:
     annotations:
       summary: MAIL_HOST likely down for 5m
 
-
-# note, the next upstream metric is intentionally omitted:
-# https://github.com/samber/awesome-prometheus-alerts/issues/283
-
   - alert: PrometheusConfigurationReloadFailure
     expr: prometheus_config_last_reload_successful != 1
     for: 30m