iankelling.org Git - distro-setup/blob - filesystem/etc/prometheus/rules/iank.yml

   1 # other rules to consider:
   2 # filesystem, network, ntp rules:
   3 # https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
   4 # on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
   5 #
   6
   7
   8 groups:
   9 - name: standard
  10   rules:
  11
  12 ## uncomment for testing an alert firing
  13 #   - alert: test-alert4
  14 #     expr: vector(1)
  15 # #    expr: nonexistent_metric
  16 #     for: 0m
  17 #     labels:
  18 #       severity: day
  19 #     annotations:
  20 #       description: "always-firing alert VALUE = {{ $value }}"
  21
  22
  23
  24 ###### BEGIN MISC NOTES ######
  25
  26 #
  27 # other interesting exporters
  28 # https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
  29 #
  30
  31 # interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
  32
  33 # interesting promql query that could be useful later.
  34 # changes(ALERTS_FOR_STATE[24h])
  35 #
  36 #
  37 #
  38 # alert flap strategy.
  39 # https://roidelapluie.be/blog/2019/02/21/prometheus-last/
  40 #
  41 # Another idea generally is to make an alert that fires for 24 hours and
  42 # inhibits another alert for the same thing, which we want at most
  43 # 1 alert per 24 hours.
  44
  45 ###### END MISC NOTES ######
  46
  47
  48
  49
  50 # alerting on missing metrics:
  51 # https://www.robustperception.io/absent-alerting-for-scraped-metrics
  52 # that doesn't work if we want to alert across multiple hosts, eg
  53 # up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
  54 # however, Google led me to a solution here
  55 # https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
  56 # there is also the absent() function, but I didn't see a way to make that work
  57   - alert: mysers_units_missing  # an up "node" host exports fewer than all 3 unit series
  58     expr: |-  # `unless` matches on labels only, never on values, hence the `== 3` filter
  59       count(up{job="node"} == 1) by (instance) * 3 unless
  60       count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance) == 3
  61     for: 20m  # must hold for 20 minutes before firing
  62     labels:
  63       severity: warn
  64
  65   - alert: epanicclean_not_active  # the unit's state="active" series has a value other than 1
  66     expr: >-
  67       node_systemd_unit_state{name="epanicclean.service",state="active"} != 1
  68     for: 20m  # must hold for 20 minutes before firing
  69     labels:
  70       severity: warn
  71
  72   - alert: epanicclean_missing  # instance is up but exports no epanicclean.service series
  73     expr: |-  # `unless` drops every up instance that has a matching unit series
  74       count by (instance) (up{job=~"node|tlsnode"} == 1)
  75       unless count by (instance) (node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"})
  76     for: 20m  # must hold for 20 minutes before firing
  77     labels:
  78       severity: warn
  79
  80   - alert: mysers_not_active  # one of the three listed units has a state="active" value other than 1
  81     expr: >-
  82       node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
  83     for: 20m  # must hold for 20 minutes before firing
  84     labels:
  85       severity: warn
  86
  87   - alert: sysd_result_fail  # the unit result-fail counter increased within the window
  88     # The 30m window may be wider than strictly needed; presumably it
  89     # keeps the alert from flapping.
  90     expr: >-
  91       rate(node_systemd_unit_result_fail_count[30m]) > 0
  92     labels:
  93       severity: day
  94
  95   - alert: exim_paniclog  # the exim_paniclog metric is above zero
  96     expr: >-
  97       exim_paniclog > 0
  98     labels:
  99       severity: warn
 100
 101   - alert: check_crypttab  # the check_crypttab metric is above zero
 102     expr: >-
 103       check_crypttab > 0
 104     labels:
 105       severity: prod
 106
 107   - alert: mailtest_check_vps  # first tier: same check as the 42-minute rule, lower severity
 108     expr: >-
 109       time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 12
 110     labels:
 111       severity: day
 112     annotations:
 113       summary: "12 minutes down"
 114
 115   # 42 mins: enough for a 30 min queue run plus 12
 116   - alert: mailtest_check_vps  # second tier: same expression with a 42-minute threshold
 117     expr: >-
 118       time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 42
 119     labels:
 120       severity: prod
 121     annotations:
 122       summary: "42 minutes down"
 123
 124   - alert: mailtest_check_mailhost  # uses the newest value per (folder, from) across "node" targets
 125     expr: >-
 126       time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 12
 127     labels:
 128       severity: day
 129     annotations:
 130       summary: "12 minutes down"
 131
 132   # 42 mins: enough for a 30 min queue run plus 12
 133   - alert: mailtest_check_mailhost  # second tier: same expression with a 42-minute threshold
 134     expr: >-
 135       time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 42
 136     labels:
 137       severity: prod
 138     annotations:
 139       summary: "42 minutes down"
 140
 141
 142   - alert: 1pmtest  # daily test alert; hour() and minute() are evaluated in UTC
 143     expr: 'hour() == 17 and minute() < 5'
 144     for: 0m  # fire on the first matching evaluation
 145     labels:
 146       severity: daytest
 147     annotations:
 148       summary: 'Prometheus daily test alert'
 149
 150
 151 #### Inhibit notes ####
 152 ## Example of expressions to detect if the target_down alert
 153 # fired in the last 24 hours. Initially, I thought this could
 154 # be an alert which inhibits up_resets, but eventually I figured
 155 # that doesn't make much sense, and the idea of using an alert
 156 # that is not an indication of something wrong, only inhibits another
 157 # alert, I think works better to integrate directly into the
 158 # alert it would inhibit, this may mean a recording rule. That avoids
 159 # an alert we have to ignore or filter out.
 160 #
 161 # Alternate expression, to calculate if the alert would have fired is:
 162 #  min_over_time(sum_over_time(up[30m])[1d:]) == 0
 163 #  where 30m matches the for: time in target_down
 164 #
 165 # Note: for graphing, wrap the following expression in sum_over_time():
 166 # ALERTS{alertname="target_down",alertstate="firing"}[1d]
 167 #### end Inhibit notes ####
 168
 169
 170 # For targets where we alert only on long downtimes, we
 171 # still want to know if it is going down many times for short times over
 172 # a long period of time. But ignore reboots.
 173 #
 174 ## Another way would be to detect an overall downtime:
 175 # avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
 176   - alert: up_resets  # drops of `up` over 2 days, minus boot-time changes (reboots)
 177     expr: >-
 178       resets(up[2d]) - changes(node_boot_time_seconds[2d]) > 12
 179     labels:
 180       severity: warn
 181     annotations:
 182       summary: 'Target has gone down {{ $value }} times in 2 days, > 12'
 183
 184
 185
 186 # https://awesome-prometheus-alerts.grep.to/rules
 187
 188 # todo, we should probably group the prometheus alerts that indicate a
 189 # host-local problem.
 190 # eg, set a label  alert-group: local-prom, then make a receiver that
 191 # groups by it when the alert-group is local-prom.
 192
 193 - name: awesome prometheus alerts
 194   rules:
 195
 196   - alert: PrometheusJobMissing  # no `up` series exists at all for job "prometheus"
 197     expr: absent(up{job="prometheus"})
 198     for: 30m  # must hold for 30 minutes before firing
 199     labels:
 200       severity: day
 201     annotations:  # absent() only carries labels from equality matchers, so use job, not instance
 202       summary: Prometheus job missing (job {{ $labels.job }})
 203       description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
 204
 205 # TODO: for some hosts, notably li and MAIL_HOST, we want to alert sooner than 30m
 206 # and with severity day. The mail host is tricky since it roams, but I think the
 207 # right way to do it is to check for absence of this metric:
 208 # mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
 209   - alert: target_down  # any scrape target whose `up` value is 0
 210     expr: 'up == 0'
 211     for: 30m  # must stay down for 30 minutes before firing
 212     labels:
 213       severity: warn
 214     annotations:
 215       summary: 'Target down for 30m'
 216
 217
 218     # todo: this should group with the above alert
 219   - alert: PrometheusAllTargetsMissing  # every target of a job reports up == 0
 220     expr: sum by (job) (up) == 0  # count() counts series whatever their value, so it is never 0
 221     for: 10m  # must hold for 10 minutes before firing
 222     labels:
 223       severity: day
 224 #      alert-group: local-prom
 225     annotations:
 226       description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}"
 227
 228   - alert: PrometheusConfigurationReloadFailure  # the reload-successful gauge is not 1
 229     expr: 'prometheus_config_last_reload_successful != 1'
 230     for: 30m  # must hold for 30 minutes before firing
 231     labels:
 232       severity: day
 233     annotations:
 234       description: "Prometheus configuration reload error\n  VALUE = {{ $value }}"
 235
 236   - alert: PrometheusTooManyRestarts  # process start time changed more than 10 times in 30m
 237     expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
 238     for: 0m  # fire on the first matching evaluation
 239     labels:
 240       severity: warn  # was "warning"; every other rule in this file uses warn/day/prod/daytest
 241     annotations:
 242       description: "Prometheus has restarted more than ten times in the last 30 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
 243
 244   - alert: PrometheusAlertmanagerJobMissing  # no `up` series exists for job "alertmanager"
 245     expr: 'absent(up{job="alertmanager"})'
 246     for: 30m  # must hold for 30 minutes before firing
 247     labels:
 248       severity: warn
 249     annotations:
 250       description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}"
 251
 252   - alert: PrometheusAlertmanagerConfigurationReloadFailure  # the reload-successful gauge is not 1
 253     expr: 'alertmanager_config_last_reload_successful != 1'
 254     for: 30m  # must hold for 30 minutes before firing
 255     labels:
 256       severity: day
 257     annotations:
 258       description: "AlertManager configuration reload error\n  VALUE = {{ $value }}"
 259
 260   - alert: PrometheusNotConnectedToAlertmanager  # fewer than one Alertmanager discovered
 261     expr: 'prometheus_notifications_alertmanagers_discovered < 1'
 262     for: 30m  # must hold for 30 minutes before firing
 263     labels:
 264       severity: day
 265     annotations:
 266       description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}"
 267
 268   - alert: PrometheusRuleEvaluationFailures  # the failure counter grew within the last 3m
 269     expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
 270     for: 30m  # must hold for 30 minutes before firing
 271     labels:
 272       severity: warn
 273     annotations:
 274       description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}"
 275
 276   - alert: PrometheusTemplateTextExpansionFailures  # the failure counter grew within the last 3m
 277     expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
 278     for: 30m  # must hold for 30 minutes before firing
 279     labels:
 280       severity: warn
 281     annotations:
 282       description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}"
 283
 284   - alert: PrometheusRuleEvaluationSlow  # a rule group's last run took longer than its interval
 285     expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
 286     for: 5m  # must hold for 5 minutes before firing
 287     labels:
 288       severity: warn
 289     annotations:
 290       description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}"
 291
 292   - alert: PrometheusNotificationsBacklog  # queue length never reached 0 during the window
 293     expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
 294     for: 0m  # the 30m window is already in the expression, so fire immediately
 295     labels:
 296       severity: warn
 297     annotations:  # description now states the same 30-minute window the expression uses
 298       description: "The Prometheus notification queue has not been empty for 30 minutes\n  VALUE = {{ $value }}"
 299
 300   - alert: PrometheusAlertmanagerNotificationFailing  # failed-notification counter is rising
 301     expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
 302     for: 30m  # must hold for 30 minutes before firing
 303     labels:
 304       severity: warn
 305     annotations:
 306       description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}"
 307
 308   # file_sd doesn't count as service discovery, so 0 is expected.
 309   # - alert: PrometheusTargetEmpty
 310   #   expr: prometheus_sd_discovered_targets == 0
 311   #   for: 30m
 312   #   labels:
 313   #     severity: day
 314   #   annotations:
 315   #     description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
 316
 317   - alert: PrometheusTargetScrapingSlow  # 0.9-quantile interval between scrapes exceeds 90
 318     expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} > 90'
 319     for: 30m  # must hold for 30 minutes before firing
 320     labels:
 321       severity: warn
 322     annotations:
 323       description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}"
 324
 325   - alert: PrometheusLargeScrape  # more than 10 sample-limit overruns counted within 10m
 326     expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
 327     for: 30m  # must hold for 30 minutes before firing
 328     labels:
 329       severity: warn
 330     annotations:
 331       description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}"
 332
 333   - alert: PrometheusTargetScrapeDuplicate  # duplicate-timestamp counter grew within the last 5m
 334     expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
 335     for: 30m  # must hold for 30 minutes before firing
 336     labels:
 337       severity: warn
 338     annotations:
 339       description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}"
 340
 341   - alert: PrometheusTsdbCheckpointCreationFailures  # failure counter grew within the last 1m
 342     expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
 343     for: 30m  # must hold for 30 minutes before firing
 344     labels:
 345       severity: warn
 346     annotations:
 347       description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}"
 348
 349   - alert: PrometheusTsdbCheckpointDeletionFailures  # failure counter grew within the last 1m
 350     expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
 351     for: 30m  # must hold for 30 minutes before firing
 352     labels:
 353       severity: warn
 354     annotations:
 355       description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}"
 356
 357   - alert: PrometheusTsdbCompactionsFailed  # failure counter grew within the last 1m
 358     expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
 359     for: 30m  # must hold for 30 minutes before firing
 360     labels:
 361       severity: warn
 362     annotations:
 363       description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}"
 364
 365   - alert: PrometheusTsdbHeadTruncationsFailed  # failure counter grew within the last 1m
 366     expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
 367     for: 30m  # must hold for 30 minutes before firing
 368     labels:
 369       severity: warn
 370     annotations:
 371       description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}"
 372
 373   - alert: PrometheusTsdbReloadFailures  # failure counter grew within the last 1m
 374     expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
 375     for: 30m  # must hold for 30 minutes before firing
 376     labels:
 377       severity: warn
 378     annotations:
 379       description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}"
 380
 381   - alert: PrometheusTsdbWalCorruptions  # corruption counter grew within the last 1m
 382     expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
 383     for: 30m  # must hold for 30 minutes before firing
 384     labels:
 385       severity: warn
 386     annotations:
 387       description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}"
 388
 389   - alert: PrometheusTsdbWalTruncationsFailed  # failure counter grew within the last 1m
 390     expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
 391     for: 30m  # must hold for 30 minutes before firing
 392     labels:
 393       severity: warn
 394     annotations:
 395       description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"