+ node_network_up{instance="kdwg:9101",device="eth0"} != 1
+ labels:
+ severity: day
+
+
+# alerting on missing metrics:
+# https://www.robustperception.io/absent-alerting-for-scraped-metrics
+# that doesn't work if we want to alert across multiple hosts, e.g.
+# up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
+# however, google led me to a solution here:
+# https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
+# there is also the absent() function, but I didn't see a way to make that work
+  - alert: mysers_units_missing  # an up instance exports fewer than all 3 unit metrics
+    expr: |-
+      count(up{job="node"} == 1) by (instance) * 3 unless
+      count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance) == 3
+    for: 20m
+    labels:
+      severity: warn
+
+  - alert: epanicclean_not_active  # unit metric is present but the service is not in "active" state
+    expr: |-
+      node_systemd_unit_state{name="epanicclean.service",state="active"} != 1
+    for: 20m
+    labels:
+      severity: warn
+
+  - alert: epanicclean_missing  # instance is up but exports no epanicclean unit series at all
+    expr: |-
+      count(up{job=~"node|tlsnode"} == 1) by (instance) unless
+      count(node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"}) by (instance)
+    for: 20m
+    labels:
+      severity: warn
+
+  - alert: mysers_not_active  # a unit series exists but the service is not in "active" state
+    expr: |-
+      node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
+    for: 20m
+    labels:
+      severity: warn
+
+  # TODO: at some point, look into making mailtest-check either be resilient to the internet going down,
+  # or inhibit or group this alert with it going down.
+  - alert: sysd_result_fail  # any systemd unit result-failure counter increased in the last 30m
+    # not sure 30m is really needed, it prevents the alert from flapping
+    # I guess.
+    expr: |-
+      rate(node_systemd_unit_result_fail_count[30m]) > 0
+    labels:
+      severity: day
+
+  - alert: exim_paniclog  # nonzero when exim's paniclog has content; presumably a custom/textfile metric — confirm exporter
+    expr: |-
+      exim_paniclog > 0
+    labels:
+      severity: day
+
+  - alert: check_crypttab  # nonzero indicates a crypttab check failure; presumably a custom/textfile metric — confirm exporter
+    expr: |-
+      check_crypttab > 0
+    labels:
+      severity: prod
+
+# 17 minutes: We try to send every 5 minutes. If we reboot, causing 1
+# send to fail, that's 10 minutes between 2 sends. We test this every 5
+# minutes, so that's 15 minutes of time we can expect for 1 failed email,
+# and 1 failed email is expected due to reboots or other tiny issues we
+# don't care about.
+#
+# cmc_wan_down etc. inhibit other alerts, but mailtest_check needs
+# additional time to recover after an outage. We can only inhibit while
+# an alert is actually firing; it doesn't affect the "for:"
+# condition. So, we make the alerts that need to be delayed be
+# conditioned on a query for that alert having not been firing in the
+# last X minutes. However, there is a special case when prometheus
+# itself was down, and so there was no alert. So, I test for the absence
+# of a metric that gets generated for prometheus itself. If for some
+# reason that has a problem, I could make it more conservative by
+# checking that we booted recently instead, e.g.:
+# time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17
+  - alert: mailtest_check_vps  # NOTE(review): metric name says usec but is compared to time() in seconds — presumably the exporter emits seconds; confirm
+    expr: |-
+      time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 17 unless on() mailtest_lag_inhibit
+    labels:
+      severity: day
+    annotations:
+      summary: '17 minutes down'
+
+ - alert: mailtest_check_mailhost
+ expr: |-
+ time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 17 unless on() mailtest_lag_inhibit