X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=f64322b2c98ef4f48755daec9fe81f185b8c9488;hb=b28eebdf9143aa17733f233b30b96f462008f3b6;hp=651eb00de164a8a41b84fe53d6429a5821f8f3d9;hpb=ea108a03dfa2d7f73447c0b14210d766e5ee5d9b;p=distro-setup diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index 651eb00..f64322b 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -182,7 +182,7 @@ groups: labels: severity: day annotations: - summary: 'jr -u mailtest-check -e' + summary: 'jr -u mailtest-check -e -n 10000' - alert: mailtest_check_missing_dnswl expr: |- @@ -191,7 +191,7 @@ groups: labels: severity: day annotations: - summary: 'jr -u mailtest-check -e' + summary: 'jr -u mailtest-check -e -n 10000' # We expect to be getting metrics, if we come up and notice we have # any missing in the past, and it wasn't from a reboot, and we haven't @@ -203,13 +203,16 @@ groups: labels: severity: warn - - alert: 1pmtest - expr: hour() == 17 and minute() < 5 + # 10 am friday. but, do it 1 minute early so it is closer to actually + # firing at 10 am. + - alert: dead_man_test + expr: |- + ( hour() == 13 and minute() >= 59 or hour() == 14 and minute() < 3 ) and day_of_week() == 5 for: 0m labels: severity: daytest annotations: - summary: Prometheus daily test alert + summary: Prometheus weekly test alert #### Inhibit notes #### @@ -278,6 +281,9 @@ groups: annotations: summary: Target down for 30m + # note PrometheusAllTargetsMissing is intentionally omitted because it + # is redundant to the above. + - alert: target_down expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0 for: 5m @@ -294,10 +300,6 @@ groups: annotations: summary: MAIL_HOST likely down for 5m - -# note, the next upstream metric is intentionally omitted: -# https://github.com/samber/awesome-prometheus-alerts/issues/283 - - alert: PrometheusConfigurationReloadFailure expr: prometheus_config_last_reload_successful != 1 for: 30m