X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=bae264e19262f1d148ce710836f299cce1009ced;hb=ae10fa08bb841b99b0df8e827735bef08c05f3ca;hp=971e392fd63b55387089c47529060ebd91543d03;hpb=4b3f043829a57215e1251122a8ab0019b717ca8d;p=distro-setup diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index 971e392..bae264e 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -9,10 +9,9 @@ groups: - name: standard rules: -## uncomment for testing an alert firing +# ## uncomment for testing an alert firing # - alert: test-alert4 # expr: vector(1) -# # expr: nonexistent_metric # for: 0m # labels: # severity: day @@ -99,7 +98,7 @@ groups: expr: |- exim_paniclog > 0 labels: - severity: warn + severity: day - alert: check_crypttab expr: |- @@ -107,30 +106,40 @@ groups: labels: severity: prod +# 17 minutes: if we reboot causing 1 send to fail, thats 10 minutes. we +# test this every 5 minutes, so thats 15 minutes at most. - alert: mailtest_check_vps expr: |- - time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 12 + time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 17 + labels: + severity: day + annotations: + summary: '17 minutes down' + + - alert: mailtest_check_unexpected_spamd_vps + expr: |- + mailtest_check_unexpected_spamd_results >= 1 labels: severity: day annotations: - summary: '12 minutes down' + summary: 'jr -u mailtest-check -e' - alert: mailtest_check_mailhost expr: |- - time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 12 + time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 17 labels: severity: day annotations: - summary: '12 minutes down' + summary: '17 minutes down' - # 42 mins: enough for a 30 min queue run plus 12 + # 20 minutes. just allow for more due to prod alert. - alert: mailtest_check_gnu_mailhost expr: |- - time() - max by (folder,from) (mailtest_check_last_usec{folder="/m/md/l/testignore", from="iank@gnu.org"}) >= 60 * 42 + time() - max by (folder,from) (mailtest_check_last_usec{folder="/m/md/l/testignore", from="iank@gnu.org"}) >= 60 * 20 labels: severity: prod annotations: - summary: '42 minutes down' + summary: '20 minutes down' - alert: 1pmtest