+
+# ## uncomment for testing an alert firing
+# - alert: test-alert4
+# expr: vector(1)
+# for: 0m
+# labels:
+# severity: day
+# annotations:
+# description: "always-firing alert VALUE = {{ $value }}"
+
+
+
+###### BEGIN MISC NOTES ######
+
+#
+# other interesting exporters
+# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
+#
+
+# interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
+
+# interesting promql query that could be useful later.
+# changes(ALERTS_FOR_STATE[24h])
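+# (the value of ALERTS_FOR_STATE is the time an alert went pending, so
+# changes() of it over 24h roughly counts how often an alert re-triggered,
+# ie how much it flapped)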
+#
+#
+#
+# alert flap strategy.
+# https://roidelapluie.be/blog/2019/02/21/prometheus-last/
+#
+# Another general idea is to make an alert that fires for 24 hours and
+# inhibits another alert for the same condition, for cases where we want
+# at most 1 alert per 24 hours (rough sketch below).
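+#
+# rough sketch of that idea (hypothetical names, nothing here is wired up;
+# it mirrors the present_over_time trick used for mailtest_lag_inhibit below,
+# and the timing/grouping details are what the posts above get into):
+#
+# - alert: some_alert_recently
+#   expr: present_over_time(ALERTS{alertname="some_alert",alertstate="firing"}[24h])
+#
+# plus an alertmanager inhibit rule with source some_alert_recently and
+# target some_alert, so repeat notifications for some_alert get muted
+# for roughly 24 hours after it first fires.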
+
+###### END MISC NOTES ######
+
+
+# various queries only look at increases, so invert the up metric so we
+# can better query on down.
+ - record: down
+ expr: up == bool 0
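+  # e.g. (illustration only, nothing below depends on it):
+  #   max_over_time(down{job="node"}[24h]) == 1
+  # lists instances that were down at some point in the last day.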
+
+ # convenience metric to use in multiple alert expressions
+ - record: mailtest_lag_inhibit
+ expr: present_over_time(ALERTS{alertname=~"kd_eth0_down|target_down|cmc_wan_down"}[17m]) or on() count_over_time(up{job="prometheus"}[19m]) <= 18
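+  # intended use (sketch): append something like
+  #   ... unless on() mailtest_lag_inhibit
+  # to the mailtest alert expressions, so they stay quiet while one of
+  # those outage alerts fired recently or prometheus itself was recently
+  # down (see the longer comment further down for the reasoning).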
+
+
+  # the node_network_info series with operstate="up" goes away when the
+  # interface is down, see:
+  # https://www.robustperception.io/absent-alerting-for-scraped-metrics
+  #
+  # What this says is: return the up == 1 metric unless there is also
+  # the right-hand metric (with the same instance+job).
+  #
+  # aka:
+  # ! exists(operstate=up) && up
+ - alert: cmc_wan_down
+ expr: |-
+ up{instance="10.2.0.1:9100"} == 1 unless on(instance,job) node_network_info{instance="10.2.0.1:9100",device="wan",operstate="up"}
+ labels:
+ severity: day
+
+ - alert: kd_eth0_down
+ expr: |-
+ node_network_up{instance="kdwg:9101",device="eth0"} != 1
+ labels:
+ severity: day
+
+
+# alerting on missing metrics:
+# https://www.robustperception.io/absent-alerting-for-scraped-metrics
+# that doesn't work if we want to alert across multiple hosts, eg
+# up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
+# however, google led me to a solution here
+# https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
+# there is also the absent() function, but i didn't see a way to make that
+# work, since it only returns a single series and so can't say which
+# instance is missing the metric
+ - alert: mysers_units_missing
+ expr: |-
+ count(up{job="node"} == 1) by (instance) * 3 unless
+ count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance)
+ for: 20m
+ labels:
+ severity: warn
+
+ - alert: epanicclean_not_active
+ expr: |-
+ node_systemd_unit_state{name="epanicclean.service",state="active"} != 1
+ for: 20m
+ labels:
+ severity: warn
+
+ - alert: epanicclean_missing
+ expr: |-
+ count(up{job=~"node|tlsnode"} == 1) by (instance) unless
+ count(node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"}) by (instance)
+ for: 20m
+ labels:
+ severity: warn
+
+ - alert: mysers_not_active
+ expr: |-
+ node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
+ for: 20m
+ labels:
+ severity: warn
+
+  # todo: at some point, look into making mailtest-check either be resilient
+  # to the internet going down, or else inhibit/group this alert with the
+  # internet-down alert.
+ - alert: sysd_result_fail
+ # not sure 30m is really needed, it prevents the alert from flapping
+ # i guess.
+ expr: |-
+ rate(node_systemd_unit_result_fail_count[30m]) > 0
+ labels:
+ severity: day
+
+ - alert: exim_paniclog
+ expr: |-
+ exim_paniclog > 0
+ labels:
+ severity: day
+
+ - alert: check_crypttab
+ expr: |-
+ check_crypttab > 0
+ labels:
+ severity: prod
+
+# 17 minutes: We try to send every 5 minutes. If we reboot causing 1
+# send to fail, that's 10 minutes between 2 sends. We test this every 5
+# minutes, so that's 15 minutes of time we can expect for 1 failed email,
+# and 1 failed email is expected due to reboots or other tiny issues we
+# don't care about. The 17m window adds a couple minutes of slack on top.
+#
+# cmc_wan_down etc. inhibit other alerts, but mailtest_check needs
+# additional time to recover after an outage. We can only inhibit while
+# an alert is actually firing; it doesn't affect the "for:"
+# condition. So, we make the alerts that need to be delayed conditional
+# on a query for the inhibiting alert not having fired in the
+# last X minutes. However, there is a special case when prometheus
+# itself was down, and so there was no alert. So, I also check for
+# missing samples of the metric that prometheus generates for itself. If
+# for some reason that has a problem, I could make it more conservative
+# by checking that we booted recently instead, eg:
+# time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17
+ - alert: mailtest_check_vps