iankelling.org Git - distro-setup/blob - filesystem/etc/prometheus/rules/iank.yml

   1 # other rules to consider:
   2 # filesystem, network, ntp rules:
   3 # https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
   4 # on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
   5 #
   6
   7
   8 groups:
   9 - name: standard
  10   rules:
  11
  12 # ## uncomment for testing an alert firing
  13 #   - alert: test-alert4
  14 #     expr: vector(1)
  15 #     for: 0m
  16 #     labels:
  17 #       severity: day
  18 #     annotations:
  19 #       description: "always-firing alert VALUE = {{ $value }}"
  20
  21
  22
  23 ###### BEGIN MISC NOTES ######
  24
  25 #
  26 # other interesting exporters
  27 # https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
  28 #
  29
  30 # interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
  31
  32 # interesting promql query that could be useful later.
  33 # changes(ALERTS_FOR_STATE[24h])
  34 #
  35 #
  36 #
  37 # alert flap strategy.
  38 # https://roidelapluie.be/blog/2019/02/21/prometheus-last/
  39 #
  40 # Another idea generally is to make an alert that fires for 24 hours and
  41 # inhibits another alert for the same thing, which we want at most
  42 # 1 alert per 24 hours.
  43
  44 ###### END MISC NOTES ######
  45
  46 # various queries only look at increases, so invert the up metric so we
  47 # can better query on down.
  48   - record: down
  49     expr: up == bool 0
  50
  51
  52 # alerting on missing metrics:
  53 # https://www.robustperception.io/absent-alerting-for-scraped-metrics
  54 # that doesn't work if we want to alert across multiple hosts, eg
  55 # up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
  56 # however, google led me to a solution here
  57 # https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
  58 # there is also the absent() function, but I didn't see a way to make that work
  59   - alert: mysers_units_missing
  60     expr: |-  # `unless` only matches label sets, so require all 3 units explicitly on the right-hand side
  61       count(up{job="node"} == 1) by (instance) unless
  62       count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance) == 3
  63     for: 20m
  64     labels:
  65       severity: warn
  66
  67   - alert: epanicclean_not_active
  68     expr: |-
  69       node_systemd_unit_state{name="epanicclean.service",state="active"} != 1
  70     for: 20m
  71     labels:
  72       severity: warn
  73
  74   - alert: epanicclean_missing
  75     expr: |-
  76       count(up{job=~"node|tlsnode"} == 1) by (instance) unless
  77       count(node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"}) by (instance)
  78     for: 20m
  79     labels:
  80       severity: warn
  81
  82   - alert: mysers_not_active
  83     expr: |-
  84       node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
  85     for: 20m
  86     labels:
  87       severity: warn
  88
  89   - alert: sysd_result_fail
  90     # not sure 30m is really needed, it prevents the alert from flapping
  91     # i guess.
  92     expr: |-
  93       rate(node_systemd_unit_result_fail_count[30m]) > 0
  94     labels:
  95       severity: day
  96
  97   - alert: exim_paniclog
  98     expr: |-
  99       exim_paniclog > 0
 100     labels:
 101       severity: warn
 102
 103   - alert: check_crypttab
 104     expr: |-
 105       check_crypttab > 0
 106     labels:
 107       severity: prod
 108
 109 # 17 minutes: if we reboot causing 1 send to fail, that's 10 minutes. We
 110 # test this every 5 minutes, so that's 15 minutes at most.
 111   - alert: mailtest_check_vps
 112     expr: |-  # NOTE(review): metric is named *_usec but is compared with time() (seconds); confirm its unit
 113       time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 17
 114     labels:
 115       severity: day
 116     annotations:
 117       summary: '17 minutes down'
 118
 119   - alert: mailtest_check_unexpected_spamd_vps
 120     expr: |-
 121       mailtest_check_unexpected_spamd_results >= 1
 122     labels:
 123       severity: day
 124     annotations:
 125       summary: 'jr -u mailtest-check -e'
 126
 127   - alert: mailtest_check_mailhost
 128     expr: |-
 129       time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 17
 130     labels:
 131       severity: day
 132     annotations:
 133       summary: '17 minutes down'
 134
 135   # 20 minutes. just allow for more due to prod alert.
 136   - alert: mailtest_check_gnu_mailhost
 137     expr: |-
 138       time() - max by (folder,from) (mailtest_check_last_usec{folder="/m/md/l/testignore", from="iank@gnu.org"}) >= 60 * 20
 139     labels:
 140       severity: prod
 141     annotations:
 142       summary: '20 minutes down'
 143
 144
 145   - alert: 1pmtest
 146     expr: hour() == 17 and minute() < 5  # hour()/minute() are evaluated in UTC: true during 17:00-17:04 UTC
 147     for: 0m
 148     labels:
 149       severity: daytest
 150     annotations:
 151       summary: Prometheus daily test alert
 152
 153
 154 #### Inhibit notes ####
 155 ## Example of expressions to detect if the target_down alert
 156 # fired in the last 24 hours. Initially, I thought this could
 157 # be an alert which inhibits up_resets, but eventually I figured
 158 # that doesn't make much sense, and the idea of using an alert
 159 # that is not an indication of something wrong, only inhibits another
 160 # alert, I think works better to integrate directly into the
 161 # alert it would inhibit, this may mean a recording rule. That avoids
 162 # an alert we have to ignore or filter out.
 163 #
 164 # Alternate expression, to calculate if the alert would have fired is:
 165 #  min_over_time(sum_over_time(up[30m])[1d:]) == 0
 166 #  where 30m matches the for: time in target_down
 167 #
 168 # Note: for graphing, surround the following expression in sum_over_time():
 169 # ALERTS{alertname="target_down",alertstate="firing"}[1d]
 170 #### end Inhibit notes ####
 171
 172
 173 # For targets where we alert only on long downtimes, we
 174 # still want to know if it is going down many times for short times over
 175 # a long period of time. But ignore reboots.
 176 #
 177 ## Another way would be to detect an overall downtime:
 178 # avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
 179   - alert: up_resets
 180     expr: |-  # resets(up) counts drops of up over the last day; boot-time changes are subtracted to discount reboots
 181       resets(up[1d]) - changes(node_boot_time_seconds[1d]) > 12
 182     labels:
 183       severity: warn
 184     annotations:
 185       summary: "Target has gone down {{ $value }} times in 1 day, > 12"
 186
 187
 188
 189 # https://awesome-prometheus-alerts.grep.to/rules
 190
 191 # todo, we should probably group the prometheus alerts that indicate a
 192 # host-local problem.
 193 # eg, set a label  alert-group: local-prom, then make a receiver that
 194 # groups by it when the alert-group is local-prom.
 195
 196 - name: awesome prometheus alerts
 197   rules:
 198
 199   - alert: PrometheusJobMissing
 200     expr: absent(up{job="prometheus"})  # absent() output carries only the job label taken from the equality matcher
 201     for: 30m
 202     labels:
 203       severity: day
 204     annotations:
 205       summary: Prometheus job missing (job {{ $labels.job }})
 206       description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
 207
 208 # TODO: some hosts, notably li and MAIL_HOST, we want to alert sooner than 30m,
 209 # and severity to day. mail host is tricky since it roams, but I think the
 210 # right way to do it is to check for absence of this metric:
 211 # mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
 212   - alert: target_down
 213     expr: up == 0
 214     for: 30m
 215     labels:
 216       severity: warn
 217     annotations:
 218       summary: Target down for 30m
 219
 220
 221     # todo: this should group with the above alert
 222   - alert: PrometheusAllTargetsMissing
 223     expr: sum by (job) (up) == 0  # sum, not count: count() of existing series is never 0, so it could not fire
 224     for: 10m
 225     labels:
 226       severity: day
 227 #      alert-group: local-prom
 228     annotations:
 229       description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}"
 230
 231   - alert: PrometheusConfigurationReloadFailure
 232     expr: prometheus_config_last_reload_successful != 1
 233     for: 30m
 234     labels:
 235       severity: day
 236     annotations:
 237       description: "Prometheus configuration reload error\n  VALUE = {{ $value }}"
 238
 239   - alert: PrometheusTooManyRestarts
 240     expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
 241     for: 0m
 242     labels:
 243       severity: warn  # same severity vocabulary as the other rules in this file (warn/day/prod/daytest)
 244     annotations:
 245       description: "Prometheus has restarted more than ten times in the last 30 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
 246
 247   - alert: PrometheusAlertmanagerJobMissing
 248     expr: absent(up{job="alertmanager"})
 249     for: 30m
 250     labels:
 251       severity: warn
 252     annotations:
 253       description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}"
 254
 255   - alert: PrometheusAlertmanagerConfigurationReloadFailure
 256     expr: alertmanager_config_last_reload_successful != 1
 257     for: 30m
 258     labels:
 259       severity: day
 260     annotations:
 261       description: "AlertManager configuration reload error\n  VALUE = {{ $value }}"
 262
 263   - alert: PrometheusNotConnectedToAlertmanager
 264     expr: prometheus_notifications_alertmanagers_discovered < 1
 265     for: 30m
 266     labels:
 267       severity: day
 268     annotations:
 269       description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}"
 270
 271   - alert: PrometheusRuleEvaluationFailures
 272     expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
 273     for: 30m
 274     labels:
 275       severity: warn
 276     annotations:
 277       description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}"
 278
 279   - alert: PrometheusTemplateTextExpansionFailures
 280     expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
 281     for: 30m
 282     labels:
 283       severity: warn
 284     annotations:
 285       description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}"
 286
 287   - alert: PrometheusRuleEvaluationSlow
 288     expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
 289     for: 5m
 290     labels:
 291       severity: warn
 292     annotations:
 293       description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}"
 294
 295   - alert: PrometheusNotificationsBacklog
 296     expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0  # queue length never reached 0 in the window
 297     for: 0m
 298     labels:
 299       severity: warn
 300     annotations:
 301       description: "The Prometheus notification queue has not been empty for 30 minutes\n  VALUE = {{ $value }}"
 302
 303   - alert: PrometheusAlertmanagerNotificationFailing
 304     expr: rate(alertmanager_notifications_failed_total[1m]) > 0
 305     for: 30m
 306     labels:
 307       severity: warn
 308     annotations:
 309       description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}"
 310
 311   # file_sd doesn't count as service discovery, so 0 is expected.
 312   # - alert: PrometheusTargetEmpty
 313   #   expr: prometheus_sd_discovered_targets == 0
 314   #   for: 30m
 315   #   labels:
 316   #     severity: day
 317   #   annotations:
 318   #     description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
 319
 320   - alert: PrometheusTargetScrapingSlow
 321     expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
 322     for: 30m
 323     labels:
 324       severity: warn
 325     annotations:
 326       description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}"
 327
 328   - alert: PrometheusLargeScrape
 329     expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
 330     for: 30m
 331     labels:
 332       severity: warn
 333     annotations:
 334       description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}"
 335
 336   - alert: PrometheusTargetScrapeDuplicate
 337     expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
 338     for: 30m
 339     labels:
 340       severity: warn
 341     annotations:
 342       description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}"
 343
 344   - alert: PrometheusTsdbCheckpointCreationFailures
 345     expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
 346     for: 30m
 347     labels:
 348       severity: warn
 349     annotations:
 350       description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}"
 351
 352   - alert: PrometheusTsdbCheckpointDeletionFailures
 353     expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
 354     for: 30m
 355     labels:
 356       severity: warn
 357     annotations:
 358       description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}"
 359
 360   - alert: PrometheusTsdbCompactionsFailed
 361     expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
 362     for: 30m
 363     labels:
 364       severity: warn
 365     annotations:
 366       description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}"
 367
 368   - alert: PrometheusTsdbHeadTruncationsFailed
 369     expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
 370     for: 30m
 371     labels:
 372       severity: warn
 373     annotations:
 374       description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}"
 375
 376   - alert: PrometheusTsdbReloadFailures
 377     expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
 378     for: 30m
 379     labels:
 380       severity: warn
 381     annotations:
 382       description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}"
 383
 384   - alert: PrometheusTsdbWalCorruptions
 385     expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
 386     for: 30m
 387     labels:
 388       severity: warn
 389     annotations:
 390       description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}"
 391
 392   - alert: PrometheusTsdbWalTruncationsFailed
 393     expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
 394     for: 30m
 395     labels:
 396       severity: warn
 397     annotations:
 398       description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"