iankelling.org Git - distro-setup/blob - rules/iank.yml

   1 # other rules to consider:
   2 # filesystem, network, ntp rules:
   3 # https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
   4 # on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
   5 #
   6
   7
   8 groups:
   9 - name: standard
  10   rules:
  11
  12 ## uncomment for testing an alert firing
  13 #   - alert: test-alert4
  14 #     expr: vector(1)
  15 # #    expr: nonexistent_metric
  16 #     for: 0m
  17 #     labels:
  18 #       severity: day
  19 #     annotations:
  20 #       description: "always-firing alert VALUE = {{ $value }}"
  21
  22
  23
  24 ###### BEGIN MISC NOTES ######
  25
  26 #
  27 # other interesting exporters
  28 # https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
  29 #
  30
  31 # interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
  32
  33 # interesting promql query that could be useful later.
  34 # changes(ALERTS_FOR_STATE[24h])
  35 #
  36 #
  37 #
  38 # alert flap strategy.
  39 # https://roidelapluie.be/blog/2019/02/21/prometheus-last/
  40 #
  41 # Another idea generally is to make an alert that fires for 24 hours and
  42 # inhibits another alert for the same thing, which we want at most
  43 # 1 alert per 24 hours.
  44
  45 ###### END MISC NOTES ######
  46
  47
  48
  49
  50 # alerting on missing metrics:
  51 # https://www.robustperception.io/absent-alerting-for-scraped-metrics
  52 # that doesn't work if we want to alert across multiple hosts, eg
  53 # up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
  54 # however, google led me to a solution here
  55 # https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
  56 # there is also the absent() function, but i didn't see a way to make that work
  57   - alert: mysers_units_missing
         # Fires for an instance whose node target is up but which does not
         # report all 3 of the listed units. `unless` matches on labels only
         # and ignores sample values, so the expected unit count must be an
         # explicit `== 3` filter on the right-hand side; the `* 3` on the left
         # only sets the alert's value.
  58     expr: |-
  59       count(up{job="node"} == 1) by (instance) * 3 unless
  60       count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance) == 3
  61     for: 20m
  62     labels:
  63       severity: warn
  64
  65   - alert: epanicclean_not_active
         # epanicclean.service is reported, but its state="active" sample has
         # not been 1 for 20 minutes.
  66     expr: node_systemd_unit_state{name="epanicclean.service",state="active"} != 1
  68     for: 20m
  69     labels:
  70       severity: warn
  71
  72   - alert: epanicclean_missing
         # Instances whose node/tlsnode target is up but which report no
         # epanicclean.service state="active" series at all.
  73     expr: |-
  74       count by (instance) (up{job=~"node|tlsnode"} == 1) unless
  75       count by (instance) (node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"})
  76     for: 20m
  77     labels:
  78       severity: warn
  79
  80   - alert: mysers_not_active
         # One of the listed units is reported, but its state="active" sample
         # has not been 1 for 20 minutes.
  81     expr: node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
  83     for: 20m
  84     labels:
  85       severity: warn
  86
  87   - alert: sysd_result_fail
  88     # The 30m rate window may not be strictly needed; presumably it
  89     # keeps the alert from flapping.
  90     expr: rate(node_systemd_unit_result_fail_count[30m]) > 0
  92     labels:
  93       severity: day
  94
  95   - alert: exim_paniclog
         # Any exim_paniclog sample above 0 fires at once (no `for` delay).
  96     expr: exim_paniclog > 0
  98     labels:
  99       severity: warn
 100
 101   - alert: mailtest_check_vps
         # time() minus the tlsnode mailtest_check_last_usec sample is at least
         # 12 minutes.
         # NOTE(review): the metric name says usec but the comparison treats it
         # as seconds — confirm the unit.
 102     expr: time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 12
 104     labels:
 105       severity: day
 106     annotations:
 107       summary: '12 minutes down'
 108
 109   # 42 mins: enough for a 30 min queue run plus 12
 110   - alert: mailtest_check_vps
         # Same check as the 12 minute tlsnode rule of the same name, escalated
         # to severity prod once the gap reaches 42 minutes.
 111     expr: time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 42
 113     labels:
 114       severity: prod
 115     annotations:
 116       summary: '42 minutes down'
 117
 118   - alert: mailtest_check_mailhost
         # Per (folder, from) pair, takes the newest node-job sample and fires
         # when it is at least 12 minutes behind time().
 119     expr: time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 12
 121     labels:
 122       severity: day
 123     annotations:
 124       summary: '12 minutes down'
 125
 126   # 42 mins: enough for a 30 min queue run plus 12
 127   - alert: mailtest_check_mailhost
         # Same check as the 12 minute node-job rule of the same name, escalated
         # to severity prod once the gap reaches 42 minutes.
 128     expr: time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 42
 130     labels:
 131       severity: prod
 132     annotations:
 133       summary: '42 minutes down'
 134
 135
 136   - alert: 1pmtest
         # Daily test alert: true during the first 5 minutes of hour 17.
         # NOTE(review): PromQL hour() works in UTC, so 17 is 1pm only at
         # UTC-4 — confirm that is the intended local time year-round.
 137     expr: hour() == 17 and minute() < 5
 138     for: 0m
 139     labels:
 140       severity: daytest
 141     annotations:
 142       summary: 'Prometheus daily test alert'
 143
 144
 145 #### Inhibit notes ####
 146 ## Example of expressions to detect if the target_down alert
 147 # fired in the last 24 hours. Initially, I thought this could
 148 # be an alert which inhibits up_resets, but eventually I figured
 149 # that doesn't make much sense, and the idea of using an alert
 150 # that is not an indication of something wrong, only inhibits another
 151 # alert, I think works better to integrate directly into the
 152 # alert it would inhibit, this may mean a recording rule. That avoids
 153 # an alert we have to ignore or filter out.
 154 #
 155 # Alternate expression, to calculate if the alert would have fired is:
 156 #  min_over_time(sum_over_time(up[30m])[1d:]) == 0
 157 #  where 30m matches the for: time in target_down
 158 #
 159 # Note: for graphing, surround the expression in sum_over_time()
 160 # ALERTS{alertname="target_down",alertstate="firing"}[1d]
 161 #### end Inhibit notes ####
 162
 163
 164 # For targets where we alert only on long downtimes, we
 165 # still want to know if it is going down many times for short times over
 166 # a long period of time. But ignore reboots.
 167 #
 168 ## Another way would be to detect an overall downtime:
 169 # avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
 170   - alert: up_resets
         # Number of resets of `up` over 2 days, minus the number of boot-time
         # changes in the same window so that reboots are not counted.
 171     expr: resets(up[2d]) - changes(node_boot_time_seconds[2d]) > 12
 173     labels:
 174       severity: warn
 175     annotations:
 176       summary: "Target has gone down {{ $value }} times in 2 days, > 12"
 177
 178
 179
 180 # https://awesome-prometheus-alerts.grep.to/rules
 181
 182 # todo, we should probably group the prometheus alerts that indicate a
 183 # host-local problem.
 184 # eg, set a label  alert-group: local-prom, then make a receiver that
 185 # groups by it when the alert-group is local-prom.
 186
 187 - name: awesome prometheus alerts
 188   rules:
 189
 190   - alert: PrometheusJobMissing
         # Fires when no `up` series exists for job="prometheus" for 30m.
         # absent() only carries over labels from equality matchers, so the
         # result has a `job` label and never an `instance` label; the summary
         # therefore reports the job.
 191     expr: absent(up{job="prometheus"})
 192     for: 30m
 193     labels:
 194       severity: day
 195     annotations:
 196       summary: Prometheus job missing (job {{ $labels.job }})
 197       description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
 198
 199 # TODO: some hosts, notably li and MAIL_HOST, we want to alert sooner than 30m,
 200 # and severity to day. mail host is tricky since it roams, but I think the
 201 # right way to do it is to check for absence of this metric:
 202 # mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
 203   - alert: target_down
         # Any target whose `up` sample has stayed at 0 for 30 minutes.
 204     expr: up == 0
 205     for: 30m
 206     labels:
 207       severity: warn
 208     annotations:
 209       summary: 'Target down for 30m'
 210
 211
 212     # todo: this should group with the above alert
 213   - alert: PrometheusAllTargetsMissing
         # Fires when every target of a job is down for 10m.
         # count() never yields 0 (with no input series it returns nothing), so
         # `count by (job) (up) == 0` could never be true. Summing the 0/1 `up`
         # values gives 0 exactly when all of a job's targets are down.
 214     expr: sum by (job) (up) == 0
 215     for: 10m
 216     labels:
 217       severity: day
 218 #      alert-group: local-prom
 219     annotations:
 220       description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}"
 221
 222   - alert: PrometheusConfigurationReloadFailure
         # The last-reload-successful gauge has not been 1 for 30 minutes.
 223     expr: prometheus_config_last_reload_successful != 1
 224     for: 30m
 225     labels:
 226       severity: day
 227     annotations:
 228       description: |-
             Prometheus configuration reload error
               VALUE = {{ $value }}
 229
 230   - alert: PrometheusTooManyRestarts
         # More than 10 changes of the process start time within 30m for the
         # prometheus, pushgateway or alertmanager jobs.
         # Severity is `warn`, matching the value used by the other rules in
         # this file; `warning` appeared nowhere else.
 231     expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
 232     for: 0m
 233     labels:
 234       severity: warn
 235     annotations:
 236       description: "Prometheus has restarted more than ten times in the last 30 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
 237
 238   - alert: PrometheusAlertmanagerJobMissing
         # No `up` series exists for job="alertmanager" for 30 minutes.
 239     expr: absent(up{job="alertmanager"})
 240     for: 30m
 241     labels:
 242       severity: warn
 243     annotations:
 244       description: |-
             A Prometheus AlertManager job has disappeared
               VALUE = {{ $value }}
 245
 246   - alert: PrometheusAlertmanagerConfigurationReloadFailure
         # Alertmanager's last-reload-successful gauge has not been 1 for 30m.
 247     expr: alertmanager_config_last_reload_successful != 1
 248     for: 30m
 249     labels:
 250       severity: day
 251     annotations:
 252       description: |-
             AlertManager configuration reload error
               VALUE = {{ $value }}
 253
 254   - alert: PrometheusNotConnectedToAlertmanager
         # Fewer than 1 discovered alertmanager for 30 minutes.
 255     expr: prometheus_notifications_alertmanagers_discovered < 1
 256     for: 30m
 257     labels:
 258       severity: day
 259     annotations:
 260       description: |-
             Prometheus cannot connect the alertmanager
               VALUE = {{ $value }}
 261
 262   - alert: PrometheusRuleEvaluationFailures
         # The failure counter increased within the last 3m, and that has been
         # true for 30 minutes.
 263     expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
 264     for: 30m
 265     labels:
 266       severity: warn
 267     annotations:
 268       description: |-
             Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
               VALUE = {{ $value }}
 269
 270   - alert: PrometheusTemplateTextExpansionFailures
         # The template expansion failure counter increased within the last 3m,
         # and that has been true for 30 minutes.
 271     expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
 272     for: 30m
 273     labels:
 274       severity: warn
 275     annotations:
 276       description: |-
             Prometheus encountered {{ $value }} template text expansion failures
               VALUE = {{ $value }}
 277
 278   - alert: PrometheusRuleEvaluationSlow
         # A rule group's last evaluation duration exceeds its interval, for 5m.
 279     expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
 280     for: 5m
 281     labels:
 282       severity: warn
 283     annotations:
 284       description: |-
             Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.
               VALUE = {{ $value }}
 285
 286   - alert: PrometheusNotificationsBacklog
         # The minimum notification queue length over the last 30m is above 0,
         # i.e. the queue was never empty in that window. The description
         # states the same 30 minute window that the expression uses.
 287     expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
 288     for: 0m
 289     labels:
 290       severity: warn
 291     annotations:
 292       description: "The Prometheus notification queue has not been empty for 30 minutes\n  VALUE = {{ $value }}"
 293
 294   - alert: PrometheusAlertmanagerNotificationFailing
         # Non-zero 1m rate of failed alertmanager notifications, for 30m.
 295     expr: rate(alertmanager_notifications_failed_total[1m]) > 0
 296     for: 30m
 297     labels:
 298       severity: warn
 299     annotations:
 300       description: |-
             Alertmanager is failing sending notifications
               VALUE = {{ $value }}
 301
 302   # file_sd doesn't count as service discovery, so 0 is expected.
 303   # - alert: PrometheusTargetEmpty
 304   #   expr: prometheus_sd_discovered_targets == 0
 305   #   for: 30m
 306   #   labels:
 307   #     severity: day
 308   #   annotations:
 309   #     description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
 310
 311   - alert: PrometheusTargetScrapingSlow
         # The 0.9 quantile of the scrape interval length is above 90, for 30m.
 312     expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
 313     for: 30m
 314     labels:
 315       severity: warn
 316     annotations:
 317       description: |-
             Prometheus is scraping exporters slowly
               VALUE = {{ $value }}
 318
 319   - alert: PrometheusLargeScrape
         # More than 10 sample-limit-exceeded scrapes within 10m, for 30m.
 320     expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
 321     for: 30m
 322     labels:
 323       severity: warn
 324     annotations:
 325       description: |-
             Prometheus has many scrapes that exceed the sample limit
               VALUE = {{ $value }}
 326
 327   - alert: PrometheusTargetScrapeDuplicate
         # The duplicate-timestamp sample counter increased within the last 5m,
         # and that has been true for 30 minutes.
 328     expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
 329     for: 30m
 330     labels:
 331       severity: warn
 332     annotations:
 333       description: |-
             Prometheus has many samples rejected due to duplicate timestamps but different values
               VALUE = {{ $value }}
 334
 335   - alert: PrometheusTsdbCheckpointCreationFailures
         # The failed checkpoint creation counter increased within the last 1m,
         # and that has been true for 30 minutes.
 336     expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
 337     for: 30m
 338     labels:
 339       severity: warn
 340     annotations:
 341       description: |-
             Prometheus encountered {{ $value }} checkpoint creation failures
               VALUE = {{ $value }}
 342
 343   - alert: PrometheusTsdbCheckpointDeletionFailures
         # The failed checkpoint deletion counter increased within the last 1m,
         # and that has been true for 30 minutes.
 344     expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
 345     for: 30m
 346     labels:
 347       severity: warn
 348     annotations:
 349       description: |-
             Prometheus encountered {{ $value }} checkpoint deletion failures
               VALUE = {{ $value }}
 350
 351   - alert: PrometheusTsdbCompactionsFailed
         # The failed compaction counter increased within the last 1m, and that
         # has been true for 30 minutes.
 352     expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
 353     for: 30m
 354     labels:
 355       severity: warn
 356     annotations:
 357       description: |-
             Prometheus encountered {{ $value }} TSDB compactions failures
               VALUE = {{ $value }}
 358
 359   - alert: PrometheusTsdbHeadTruncationsFailed
         # The failed head truncation counter increased within the last 1m, and
         # that has been true for 30 minutes.
 360     expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
 361     for: 30m
 362     labels:
 363       severity: warn
 364     annotations:
 365       description: |-
             Prometheus encountered {{ $value }} TSDB head truncation failures
               VALUE = {{ $value }}
 366
 367   - alert: PrometheusTsdbReloadFailures
         # The TSDB reload failure counter increased within the last 1m, and
         # that has been true for 30 minutes.
 368     expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
 369     for: 30m
 370     labels:
 371       severity: warn
 372     annotations:
 373       description: |-
             Prometheus encountered {{ $value }} TSDB reload failures
               VALUE = {{ $value }}
 374
 375   - alert: PrometheusTsdbWalCorruptions
         # The WAL corruption counter increased within the last 1m, and that
         # has been true for 30 minutes.
 376     expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
 377     for: 30m
 378     labels:
 379       severity: warn
 380     annotations:
 381       description: |-
             Prometheus encountered {{ $value }} TSDB WAL corruptions
               VALUE = {{ $value }}
 382
 383   - alert: PrometheusTsdbWalTruncationsFailed
         # The failed WAL truncation counter increased within the last 1m, and
         # that has been true for 30 minutes.
 384     expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
 385     for: 30m
 386     labels:
 387       severity: warn
 388     annotations:
 389       description: |-
             Prometheus encountered {{ $value }} TSDB WAL truncation failures
               VALUE = {{ $value }}