X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=47012ccbcb74b20f517f825d1cc327fd70fe2c0d;hb=3f6c5b1f2c72614ee8cb4d0e4325e7beddda9c04;hp=14d4275069de0f57b81c32a469a95814d844037b;hpb=01ccff895787ca94ad37d11cb93f0440a29edd7c;p=distro-setup diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index 14d4275..47012cc 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -182,8 +182,16 @@ groups: labels: severity: day annotations: - summary: 'jr -u mailtest-check -e' + summary: 'jr -u mailtest-check -e -n 10000' + - alert: mailtest_check_missing_dnswl + expr: |- + mailtest_check_missing_dnswl >= 1 + for: 30m + labels: + severity: day + annotations: + summary: 'jr -u mailtest-check -e -n 10000' # We expect to be getting metrics, if we come up and notice we have # any missing in the past, and it wasn't from a reboot, and we haven't @@ -191,17 +199,20 @@ groups: # 19 for 19 minutes, but I make it 18 just to give a bit of slack. - alert: historical_missing_metric expr: |- - count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17 + count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kd"} <= 60 * 17 labels: severity: warn - - alert: 1pmtest - expr: hour() == 17 and minute() < 5 + # 10 am friday. but, do it 1 minute early so it is closer to actually + # firing at 10 am. + - alert: dead_man_test + expr: |- + ( hour() == 13 and minute() >= 59 or hour() == 14 and minute() < 3 ) and day_of_week() == 5 for: 0m labels: severity: daytest annotations: - summary: Prometheus daily test alert + summary: Prometheus weekly test alert #### Inhibit notes #### @@ -263,13 +274,16 @@ groups: description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" - alert: lowpri_target_down - expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0 + expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100|kwwg:9101"} == 0 for: 30m labels: severity: warn annotations: summary: Target down for 30m + # note PrometheusAllTargetsMissing is intentionally omitted because it + # is redundant to the above. + - alert: target_down expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0 for: 5m @@ -286,10 +300,6 @@ groups: annotations: summary: MAIL_HOST likely down for 5m - -# note, the next upstream metric is intentionally omitted: -# https://github.com/samber/awesome-prometheus-alerts/issues/283 - - alert: PrometheusConfigurationReloadFailure expr: prometheus_config_last_reload_successful != 1 for: 30m