X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=f64322b2c98ef4f48755daec9fe81f185b8c9488;hb=5c8530653c87af3757a2c649772e0405bcd143a0;hp=14d4275069de0f57b81c32a469a95814d844037b;hpb=01ccff895787ca94ad37d11cb93f0440a29edd7c;p=distro-setup diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index 14d4275..f64322b 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -182,8 +182,16 @@ groups: labels: severity: day annotations: - summary: 'jr -u mailtest-check -e' + summary: 'jr -u mailtest-check -e -n 10000' + - alert: mailtest_check_missing_dnswl + expr: |- + mailtest_check_missing_dnswl >= 1 + for: 30m + labels: + severity: day + annotations: + summary: 'jr -u mailtest-check -e -n 10000' # We expect to be getting metrics, if we come up and notice we have # any missing in the past, and it wasn't from a reboot, and we haven't @@ -195,13 +203,16 @@ groups: labels: severity: warn - - alert: 1pmtest - expr: hour() == 17 and minute() < 5 + # 10 am friday. but, do it 1 minute early so it is closer to actually + # firing at 10 am. + - alert: dead_man_test + expr: |- + ( hour() == 13 and minute() >= 59 or hour() == 14 and minute() < 3 ) and day_of_week() == 5 for: 0m labels: severity: daytest annotations: - summary: Prometheus daily test alert + summary: Prometheus weekly test alert #### Inhibit notes #### @@ -270,6 +281,9 @@ groups: annotations: summary: Target down for 30m + # note PrometheusAllTargetsMissing is intentionally omitted because it + # is redundant to the above. + - alert: target_down expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0 for: 5m @@ -286,10 +300,6 @@ groups: annotations: summary: MAIL_HOST likely down for 5m - -# note, the next upstream metric is intentionally omitted: -# https://github.com/samber/awesome-prometheus-alerts/issues/283 - - alert: PrometheusConfigurationReloadFailure expr: prometheus_config_last_reload_successful != 1 for: 30m