+
+## uncomment to test a firing alert
+# - alert: test-alert4
+#   expr: vector(1)
+#   # expr: nonexistent_metric
+#   for: 0m
+#   labels:
+#     severity: day
+#   annotations:
+#     description: "always-firing alert VALUE = {{ $value }}"
+
+
+
+###### BEGIN MISC NOTES ######
+
+#
+# other interesting exporters
+# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
+#
+
+# interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
+
+# interesting promql query that could be useful later.
+# changes(ALERTS_FOR_STATE[24h])
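+# e.g. (a sketch; disk_full is a hypothetical alert name) a meta-alert
+# that fires when an alert has gone pending/firing more than 5 times in
+# a day:
+# - alert: disk_full_flapping
+#   expr: changes(ALERTS_FOR_STATE{alertname="disk_full"}[24h]) > 5
+#   labels:
+#     severity: warn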
+#
+#
+#
+# alert flap strategy:
+# https://roidelapluie.be/blog/2019/02/21/prometheus-last/
+#
+# Another idea: make an alert that fires for 24 hours and inhibits
+# another alert for the same thing, for cases where we want at most
+# 1 alert per 24 hours (sketch below).
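+#
+# a minimal sketch of that inhibit idea, for alertmanager.yml (not this
+# file); foo_daily (fires for 24h) and foo are hypothetical alert names:
+# inhibit_rules:
+#   - source_matchers:
+#       - alertname="foo_daily"
+#     target_matchers:
+#       - alertname="foo"
+#     equal: [instance]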
+
+###### END MISC NOTES ######
+
+# various queries only look at increases, so invert the up metric to
+# make it easier to query on down (example below).
+ - record: down
+   expr: up == bool 0
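+# e.g. (a sketch; adjust the selector) sum_over_time(down{job="node"}[24h]) > 0
+# asks whether a target was down at any point in the last day.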
+
+
+# alerting on missing metrics:
+# https://www.robustperception.io/absent-alerting-for-scraped-metrics
+# that doesn't work if we want to alert across multiple hosts, eg
+# up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
+# however, google led me to a solution here:
+# https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
+# there is also the absent() function, but I didn't see a way to make that work
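+# how the next expression works: the left side yields one series per
+# live instance, and `unless` drops any instance that also reports the
+# unit metrics, leaving instances where those metrics are missing
+# entirely (present-but-inactive is covered by mysers_not_active below).
+# note unless matches purely on labels, so the * 3 only changes the
+# sample value shown in the alert.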
+ - alert: mysers_units_missing
+   expr: |-
+     count(up{job="node"} == 1) by (instance) * 3 unless
+     count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance)
+   for: 20m
+   labels:
+     severity: warn
+
+ - alert: epanicclean_not_active
+   expr: |-
+     node_systemd_unit_state{name="epanicclean.service",state="active"} != 1
+   for: 20m
+   labels:
+     severity: warn
+
+ - alert: epanicclean_missing
+   expr: |-
+     count(up{job=~"node|tlsnode"} == 1) by (instance) unless
+     count(node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"}) by (instance)
+   for: 20m
+   labels:
+     severity: warn
+
+ - alert: mysers_not_active
+   expr: |-
+     node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
+   for: 20m
+   labels:
+     severity: warn
+
+ - alert: sysd_result_fail
+   # not sure the 30m window is really needed; I guess it prevents the
+   # alert from flapping.
+   expr: |-
+     rate(node_systemd_unit_result_fail_count[30m]) > 0
+   labels:
+     severity: day
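+# (a sketch) a roughly equivalent form:
+# increase(node_systemd_unit_result_fail_count[30m]) > 0
+# either way, one failure keeps the expression nonzero for up to 30m,
+# which is what damps the flapping.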
+
+ - alert: exim_paniclog
+   expr: |-
+     exim_paniclog > 0
+   labels:
+     severity: warn
+
+ - alert: check_crypttab
+   expr: |-
+     check_crypttab > 0
+   labels:
+     severity: prod
+
+ - alert: mailtest_check_vps
+   expr: |-
+     time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 12
+   labels:
+     severity: day
+   annotations:
+     summary: '12 minutes down'
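+# 60 * 12 = 720 seconds = the 12 minutes in the summary; this assumes
+# mailtest_check_last_usec is a unix time in seconds (it is compared
+# against time()) despite the _usec name.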
+
+ - alert: mailtest_check_mailhost