iankelling.org Git - distro-setup/blob - iank.yml

   1 # other rules to consider:
   2 # filesystem, network, ntp rules:
   3 # https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
   4 # on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
   5 #
   6
   7
   8 groups:
   9 - name: standard
  10   rules:
  11
  12 ## uncomment for testing an alert firing
  13 #   - alert: test-alert4
  14 #     expr: vector(1)
  15 # #    expr: nonexistent_metric
  16 #     for: 0m
  17 #     labels:
  18 #       severity: day
  19 #     annotations:
  20 #       description: "always-firing alert VALUE = {{ $value }}"
  21
  22
  23
  24 ###### BEGIN MISC NOTES ######
  25
  26 #
  27 # other interesting exporters
  28 # https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
  29 #
  30
  31 # interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
  32
  33 # interesting promql query that could be useful later.
  34 # changes(ALERTS_FOR_STATE[24h])
  35 #
  36 #
  37 #
  38 # alert flap strategy.
  39 # https://roidelapluie.be/blog/2019/02/21/prometheus-last/
  40 #
  41 # Another idea generally is to make an alert that fires for 24 hours and
  42 # inhibits another alert for the same thing, which we want at most
  43 # 1 alert per 24 hours.
  44
  45 ###### END MISC NOTES ######
  46
  47
  48
  49
  50 # Alerting on missing metrics:
  51 # https://www.robustperception.io/absent-alerting-for-scraped-metrics
  52 # That doesn't work if we want to alert across multiple hosts, eg
  53 # up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
  54 # However, Google led me to a solution here (I didn't see a way to make absent() work):
  55 # https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
  56 # "unless" only matches label sets and ignores values, so the right side is filtered with == 3; otherwise one present unit would hide the missing ones.
  57   - alert: mysers_units_missing
  58     expr: |-
  59       count(up{job="node"} == 1) by (instance) * 3 unless
  60       (count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance) == 3)
  61     for: 20m
  62     labels:
  63       severity: warn
  64
  65   - alert: epanicclean_not_active  # unit is exported but its active-state value is not 1
  66     expr: >-
  67       node_systemd_unit_state{name="epanicclean.service",state="active"} != 1
  68     for: 20m  # condition must hold this long before the alert fires
  69     labels:
  70       severity: warn
  71
  72   - alert: epanicclean_missing  # an up node/tlsnode instance exports no epanicclean.service active-state series
  73     expr: |-
  74       count(up{job=~"node|tlsnode"} == 1) by (instance) unless
  75       count(node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"}) by (instance)
  76     for: 20m  # condition must hold this long before the alert fires
  77     labels:
  78       severity: warn
  79
  80   - alert: mysers_not_active  # one of the three listed units has an active-state value other than 1
  81     expr: >-
  82       node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
  83     for: 20m  # condition must hold this long before the alert fires
  84     labels:
  85       severity: warn
  86
  87   - alert: sysd_result_fail  # node_systemd_unit_result_fail_count increased within the last 30m
  88     expr: >-
  89       rate(node_systemd_unit_result_fail_count[30m]) > 0
  90     labels:
  91       severity: day
  92
  93   - alert: mailtest_check_vps  # tlsnode mail test timestamp is at least 12 minutes old
  94     expr: >-  # NOTE(review): metric is named _usec but is compared to time() seconds - confirm units
  95       time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 12
  96     labels:
  97       severity: day
  98     annotations:
  99       summary: "12 minutes down"
 100
 101   # 42 mins: enough for a 30 min queue run plus 12
 102   - alert: mailtest_check_vps  # same check as the 12 minute rule, escalated at 42 minutes
 103     expr: >-
 104       time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 42
 105     labels:
 106       severity: prod
 107     annotations:
 108       summary: "42 minutes down"
 109
 110   - alert: mailtest_check_mailhost  # newest node-job mail test timestamp per (folder, from) is at least 12 minutes old
 111     expr: >-
 112       time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 12
 113     labels:
 114       severity: day
 115     annotations:
 116       summary: "12 minutes down"
 117
 118   # 42 mins: enough for a 30 min queue run plus 12
 119   - alert: mailtest_check_mailhost  # same check as the 12 minute rule, escalated at 42 minutes
 120     expr: >-
 121       time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 42
 122     labels:
 123       severity: prod
 124     annotations:
 125       summary: "42 minutes down"
 126
 127
 128   - alert: 1pmtest  # true only during the first five minutes of hour 17 (hour() and minute() are UTC)
 129     expr: 'hour() == 17 and minute() < 5'
 130     for: 0m  # fire on the first evaluation that matches
 131     labels:
 132       severity: daytest
 133     annotations:
 134       summary: 'Prometheus daily test alert'
 135
 136
 137 #### Inhibit notes ####
 138 ## Example of expressions to detect if the target_down alert
 139 # fired in the last 24 hours. Initially, I thought this could
 140 # be an alert which inhibits up_resets, but eventually I figured
 141 # that doesn't make much sense, and the idea of using an alert
 142 # that is not an indication of something wrong, only inhibits another
 143 # alert, I think works better to integrate directly into the
 144 # alert it would inhibit, this may mean a recording rule. That avoids
 145 # an alert we have to ignore or filter out.
 146 #
 147 # Alternate expression, to calculate if the alert would have fired is:
 148 #  min_over_time(sum_over_time(up[30m])[1d:]) == 0
 149 #  where 30m matches the for: time in target_down
 150 #
 151 # Note: for graphing, surround the expression in sum_over_time():
 152 # ALERTS{alertname="target_down",alertstate="firing"}[1d]
 153 #### end Inhibit notes ####
 154
 155
 156 # For targets where we alert only on long downtimes, we
 157 # still want to know if it is going down many times for short times over
 158 # a long period of time. But ignore reboots.
 159 #
 160 ## Another way would be to detect an overall downtime:
 161 # avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
 162   - alert: up_resets  # resets of up over 2d, minus boot-time changes over 2d, exceed 12
 163     expr: >-
 164       resets(up[2d]) - changes(node_boot_time_seconds[2d]) > 12
 165     labels:
 166       severity: warn
 167     annotations:
 168       summary: 'Target has gone down {{ $value }} times in 2 days, > 12'
 169
 170
 171
 172 # https://awesome-prometheus-alerts.grep.to/rules
 173
 174 # todo, we should probably group the prometheus alerts that indicate a
 175 # host-local problem.
 176 # eg, set a label  alert-group: local-prom, then make a receiver that
 177 # groups by it when the alert-group is local-prom.
 178
 179 - name: awesome prometheus alerts
 180   rules:
 181
 182   - alert: PrometheusJobMissing  # no up series exists for job="prometheus" for 30m
 183     expr: absent(up{job="prometheus"})  # absent() yields only the job label, so there is no instance label to print
 184     for: 30m
 185     labels:
 186       severity: day
 187     annotations:
 188       summary: Prometheus job missing (job {{ $labels.job }})
 189       description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
 190
 191 # TODO: some hosts, notably li and MAIL_HOST, we want to alert sooner than 30m,
 192 # and severity to day. mail host is tricky since it roams, but I think the
 193 # right way to do it is to check for absence of this metric:
 194 # mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
 195   - alert: target_down  # a target's up value has been 0 for 30m
 196     expr: 'up == 0'
 197     for: 30m
 198     labels:
 199       severity: warn
 200     annotations:
 201       summary: 'Target down for 30m'
 202
 203
 204     # todo: this should group with the above alert
 205   - alert: PrometheusAllTargetsMissing  # every target of a job reports up == 0 for 10m
 206     expr: sum by (job) (up) == 0  # count() counts series whatever their value, so it could never equal 0
 207     for: 10m
 208     labels:
 209       severity: day
 210 #      alert-group: local-prom
 211     annotations:
 212       description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}"
 213
 214   - alert: PrometheusConfigurationReloadFailure  # prometheus_config_last_reload_successful is not 1 for 30m
 215     expr: 'prometheus_config_last_reload_successful != 1'
 216     for: 30m
 217     labels:
 218       severity: day
 219     annotations:
 220       description: "Prometheus configuration reload error\n  VALUE = {{ $value }}"
 221
 222   - alert: PrometheusTooManyRestarts  # process start time changed more than 10 times within 30m
 223     expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
 224     for: 0m  # fire on the first evaluation that matches
 225     labels:
 226       severity: warn  # same spelling as the other warn-level rules in this file
 227     annotations:
 228       description: "Prometheus has restarted more than ten times in the last 30 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
 229
 230   - alert: PrometheusAlertmanagerJobMissing  # no up series exists for job="alertmanager" for 30m
 231     expr: 'absent(up{job="alertmanager"})'
 232     for: 30m
 233     labels:
 234       severity: warn
 235     annotations:
 236       description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}"
 237
 238   - alert: PrometheusAlertmanagerConfigurationReloadFailure  # alertmanager_config_last_reload_successful is not 1 for 30m
 239     expr: 'alertmanager_config_last_reload_successful != 1'
 240     for: 30m
 241     labels:
 242       severity: day
 243     annotations:
 244       description: "AlertManager configuration reload error\n  VALUE = {{ $value }}"
 245
 246   - alert: PrometheusNotConnectedToAlertmanager  # fewer than one Alertmanager discovered for 30m
 247     expr: 'prometheus_notifications_alertmanagers_discovered < 1'
 248     for: 30m
 249     labels:
 250       severity: day
 251     annotations:
 252       description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}"
 253
 254   - alert: PrometheusRuleEvaluationFailures  # failure counter grew within a 3m window, sustained for 30m
 255     expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
 256     for: 30m
 257     labels:
 258       severity: warn
 259     annotations:
 260       description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}"
 261
 262   - alert: PrometheusTemplateTextExpansionFailures  # failure counter grew within a 3m window, sustained for 30m
 263     expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
 264     for: 30m
 265     labels:
 266       severity: warn
 267     annotations:
 268       description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}"
 269
 270   - alert: PrometheusRuleEvaluationSlow  # a rule group's last evaluation outlasted its interval, for 5m
 271     expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
 272     for: 5m
 273     labels:
 274       severity: warn
 275     annotations:
 276       description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}"
 277
 278   - alert: PrometheusNotificationsBacklog  # queue length never dropped to 0 during the past 30m
 279     expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
 280     for: 0m  # the 30m window in expr already provides the delay
 281     labels:
 282       severity: warn
 283     annotations:
 284       description: "The Prometheus notification queue has not been empty for 30 minutes\n  VALUE = {{ $value }}"
 285
 286   - alert: PrometheusAlertmanagerNotificationFailing  # failed-notification rate over 1m is positive, for 30m
 287     expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
 288     for: 30m
 289     labels:
 290       severity: warn
 291     annotations:
 292       description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}"
 293
 294   # file_sd doesn't count as service discovery, so 0 is expected.
 295   # - alert: PrometheusTargetEmpty
 296   #   expr: prometheus_sd_discovered_targets == 0
 297   #   for: 30m
 298   #   labels:
 299   #     severity: day
 300   #   annotations:
 301   #     description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
 302
 303   - alert: PrometheusTargetScrapingSlow  # 0.9-quantile of the scrape interval length exceeds 90, for 30m
 304     expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} > 90'
 305     for: 30m
 306     labels:
 307       severity: warn
 308     annotations:
 309       description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}"
 310
 311   - alert: PrometheusLargeScrape  # sample-limit counter grew by more than 10 within a 10m window, for 30m
 312     expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
 313     for: 30m
 314     labels:
 315       severity: warn
 316     annotations:
 317       description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}"
 318
 319   - alert: PrometheusTargetScrapeDuplicate  # duplicate-timestamp counter grew within a 5m window, for 30m
 320     expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
 321     for: 30m
 322     labels:
 323       severity: warn
 324     annotations:
 325       description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}"
 326
 327   - alert: PrometheusTsdbCheckpointCreationFailures  # failure counter grew within a 1m window, sustained for 30m
 328     expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
 329     for: 30m
 330     labels:
 331       severity: warn
 332     annotations:
 333       description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}"
 334
 335   - alert: PrometheusTsdbCheckpointDeletionFailures  # failure counter grew within a 1m window, sustained for 30m
 336     expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
 337     for: 30m
 338     labels:
 339       severity: warn
 340     annotations:
 341       description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}"
 342
 343   - alert: PrometheusTsdbCompactionsFailed  # failure counter grew within a 1m window, sustained for 30m
 344     expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
 345     for: 30m
 346     labels:
 347       severity: warn
 348     annotations:
 349       description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}"
 350
 351   - alert: PrometheusTsdbHeadTruncationsFailed  # failure counter grew within a 1m window, sustained for 30m
 352     expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
 353     for: 30m
 354     labels:
 355       severity: warn
 356     annotations:
 357       description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}"
 358
 359   - alert: PrometheusTsdbReloadFailures  # failure counter grew within a 1m window, sustained for 30m
 360     expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
 361     for: 30m
 362     labels:
 363       severity: warn
 364     annotations:
 365       description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}"
 366
 367   - alert: PrometheusTsdbWalCorruptions  # corruption counter grew within a 1m window, sustained for 30m
 368     expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
 369     for: 30m
 370     labels:
 371       severity: warn
 372     annotations:
 373       description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}"
 374
 375   - alert: PrometheusTsdbWalTruncationsFailed  # failure counter grew within a 1m window, sustained for 30m
 376     expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
 377     for: 30m
 378     labels:
 379       severity: warn
 380     annotations:
 381       description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"