+ count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17
+ labels:
+ severity: warn
+
+ - alert: 1pmtest
+ expr: hour() == 17 and minute() < 5
+ for: 0m
+ labels:
+ severity: daytest
+ annotations:
+ summary: Prometheus daily test alert
+
+
+#### Inhibit notes ####
+## Example expressions to detect whether the target_down alert
+# fired in the last 24 hours. Initially, I thought this could
+# be an alert which inhibits up_resets, but eventually I figured
+# that doesn't make much sense: an alert that is not an indication
+# of something wrong and only exists to inhibit another alert is
+# better integrated directly into the alert it would inhibit,
+# possibly via a recording rule. That avoids an alert we have to
+# ignore or filter out.
+#
+# An alternate expression to calculate whether the alert would have fired:
+# min_over_time(sum_over_time(up[30m])[1d:]) == 0
+# where 30m matches the for: time in target_down
+#
+# Note: for graphing, wrap the following expression in sum_over_time():
+# ALERTS{alertname="target_down",alertstate="firing"}[1d]
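+#
+# A rough, untested sketch of the recording-rule idea above (the rule
+# name recent_target_down is made up here):
+#
+#  - record: recent_target_down
+#    expr: min_over_time(sum_over_time(up[30m])[1d:]) == bool 0
+#
+# An alert that should stay quiet after a recent outage could then
+# append something like
+#   unless on(instance) recent_target_down == 1
+# to its expr.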
+#### end Inhibit notes ####
+
+
+# For targets where we alert only on long downtimes, we still want
+# to know if a target is going down many times for short periods
+# over a long span of time. But ignore reboots.
+#
+## Another way would be to detect an overall downtime:
+# avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
+
+# However, this seems to produce too many false positives for now, so
+# it is commented out.
+
+ # - alert: up_resets
+ # expr: |-
+ # resets(up[1d]) - changes(node_boot_time_seconds[1d]) > 12
+ # labels:
+ # severity: warn
+ # annotations:
+ # summary: "Target has gone down {{ $value }} times in 1 day, > 12"
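+
+ # An equally untested sketch of the "overall downtime" alternative
+ # mentioned above, adapted here to the up metric; the 95% threshold
+ # is only a guess:
+ # - alert: low_availability_1day
+ #   expr: |-
+ #     avg_over_time(up[1d]) < .95
+ #   labels:
+ #     severity: warn
+ #   annotations:
+ #     summary: "Target below 95% availability over 1 day"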
+
+
+
+# https://awesome-prometheus-alerts.grep.to/rules
+
+# todo: we should probably group the prometheus alerts that indicate a
+# host-local problem.
+# e.g., set a label alert-group: local-prom, then make a receiver that
+# groups by it when alert-group is local-prom (see the sketch below).
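+#
+# A rough sketch of what that could look like (untested; the receiver
+# name is made up):
+#
+# In the rule:
+#     labels:
+#       alert-group: local-prom
+#
+# In alertmanager.yml:
+#   route:
+#     routes:
+#       - match:
+#           alert-group: local-prom
+#         group_by: ['alert-group']
+#         receiver: local-prom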
+
+- name: awesome prometheus alerts
+ rules:
+
+ - alert: PrometheusJobMissing
+ expr: absent(up{job="prometheus"})
+ for: 30m
+ labels:
+ severity: day
+ annotations:
+ summary: Prometheus job missing (instance {{ $labels.instance }})
+ description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
+
+ - alert: lowpri_target_down
+ expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Target down for 30m
+
+ - alert: target_down
+ expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
+ for: 5m
+ labels:
+ severity: day
+ annotations:
+ summary: High priority target down for 5m
+
+ - alert: target_down
+ expr: absent(present_over_time(mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}[5m]))
+ for: 5m
+ labels:
+ severity: day
+ annotations:
+ summary: MAIL_HOST likely down for 5m
+
+
+# note, the next upstream metric is intentionally omitted:
+# https://github.com/samber/awesome-prometheus-alerts/issues/283
+
+ - alert: PrometheusConfigurationReloadFailure
+ expr: prometheus_config_last_reload_successful != 1
+ for: 30m
+ labels:
+ severity: day
+ annotations:
+ description: "Prometheus configuration reload error\n VALUE = {{ $value }}"
+
+ - alert: PrometheusTooManyRestarts
+ expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
+ for: 0m