# [distro-setup] filesystem/etc/prometheus/rules/iank.yml
# other rules to consider:
# filesystem, network, ntp rules:
# https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
# on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
#
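# e.g., a commented-out sketch of the kind of filesystem rule those provide
# (the threshold and matchers here are my guesses, not taken from the
# ansible-prometheus defaults):
# - alert: filesystem_nearly_full
#   expr: node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.*"} / node_filesystem_size_bytes < 0.10
#   for: 30m
#   labels:
#     severity: warn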


groups:
- name: standard
  rules:

  ## uncomment for testing an alert firing
  # - alert: test-alert4
  #   expr: vector(1)
  #   # expr: nonexistent_metric
  #   for: 0m
  #   labels:
  #     severity: day
  #   annotations:
  #     description: "always-firing alert VALUE = {{ $value }}"


  ###### BEGIN MISC NOTES ######

  #
  # other interesting exporters
  # https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
  #

  # interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/

  # interesting promql query that could be useful later.
  # changes(ALERTS_FOR_STATE[24h])
  #
  #
  #
  # alert flap strategy.
  # https://roidelapluie.be/blog/2019/02/21/prometheus-last/
  #
  # Another general idea is to make an alert that fires for 24 hours and
  # inhibits another alert for the same thing, since we want at most
  # 1 alert per 24 hours. (See the target_down_inhibitor rule below.)

  ###### END MISC NOTES ######




  # alerting on missing metrics:
  # https://www.robustperception.io/absent-alerting-for-scraped-metrics
  # that doesn't work if we want to alert across multiple hosts, e.g.
  # up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
  # however, Google led me to a solution here:
  # https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
  # there is also the absent() function, but I didn't see a way to make that work
  - alert: mysers_units_missing
    expr: |-
      count(up{job="node"}) by (instance) * 3 unless count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance)
    for: 20m
    labels:
      severity: warn

  - alert: mysers_not_active
    expr: |-
      node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
    for: 20m
    labels:
      severity: warn

  - alert: sysd_result_fail
    expr: |-
      rate(node_systemd_unit_result_fail_count[30m]) > 0
    labels:
      severity: day

  - alert: mailtest_check
    expr: |-
      time() - mailtest_check_last_usec > 60 * 12
    labels:
      severity: day
    annotations:
      summary: '12 minutes down'

  # 42 mins: enough for a 30 min queue run plus 12
  - alert: mailtest_check
    expr: |-
      time() - mailtest_check_last_usec > 60 * 42
    labels:
      severity: prod
    annotations:
      summary: '42 minutes down'

  - alert: 1pmtest
    expr: hour() == 17 and minute() < 5
    for: 0m
    labels:
      severity: daytest
    annotations:
      summary: Prometheus daily test alert


  # An alternate expression, to calculate whether the alert would have fired, is:
  # min_over_time(sum_over_time(up[30m])[1d:]) == 0
  # where 30m matches the for: time in target_down.
  #
  # sum_over_time is not needed, it is just a convenience for graphing.
  - alert: target_down_inhibitor
    expr: |-
      sum_over_time(ALERTS{alertname="target_down"}[1d])
    labels:
      severity: ignore
    annotations:
      summary: alert that indicates the target_down alert fired in the last day
      description: "VALUE = {{ $value }}"
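  # For this inhibitor to do anything, alertmanager.yml needs a matching
  # inhibit_rules entry. A minimal sketch of what that could look like
  # (an assumption for reference, not copied from my actual alertmanager
  # config):
  #
  # inhibit_rules:
  # - source_matchers:
  #   - alertname = "target_down_inhibitor"
  #   target_matchers:
  #   - alertname = "target_down"
  #   equal: ['instance']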

  # For targets where we only alert on longer downtimes, we still want to
  # know if a target is going down many times for short periods over a
  # long stretch of time. But ignore reboots.
  #
  ## Another way would be to detect an overall downtime:
  # avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
  - alert: up_resets
    expr: |-
      resets(up[3d]) - changes(node_boot_time_seconds[3d]) > 15
    labels:
      severity: warn
    annotations:
      summary: "Target has gone down {{ $value }} times in 3 days, > 15"




# https://awesome-prometheus-alerts.grep.to/rules


# TODO: we should probably group the prometheus alerts that indicate a
# host-local problem.
# E.g., set a label alert_group: local-prom, then make a receiver that
# groups by it when alert_group is local-prom. A rough sketch follows.
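# A minimal sketch of the alertmanager.yml side of that idea (hypothetical,
# just to record the shape; the receiver name is made up):
#
# route:
#   routes:
#   - matchers:
#     - alert_group = "local-prom"
#     group_by: ['alert_group']
#     receiver: local-prom-receiver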

- name: awesome prometheus alerts
  rules:

  - alert: PrometheusJobMissing
    expr: absent(up{job="prometheus"})
    for: 30m
    labels:
      severity: day
    annotations:
      summary: Prometheus job missing (instance {{ $labels.instance }})
      description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"

  # TODO: for some hosts, notably li and MAIL_HOST, we want to alert sooner
  # than 30m, and with severity day. The mail host is tricky since it roams,
  # but I think the right way to do it is to check for absence of this metric:
  # mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
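  # A commented-out sketch of that idea (not enabled; the alert name, the
  # 10m, and the severity are my guesses, and the label values are copied
  # from the comment above):
  # - alert: mailhost_down
  #   expr: absent(mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"})
  #   for: 10m
  #   labels:
  #     severity: day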
  - alert: target_down
    expr: up == 0
    for: 30m
    labels:
      severity: warn
    annotations:
      summary: Target down for 30m


  # TODO: this should group with the above alert
  - alert: PrometheusAllTargetsMissing
    expr: count by (job) (up) == 0
    for: 10m
    labels:
      severity: day
      # alert_group: local-prom
    annotations:
      description: "A Prometheus job no longer has any living targets.\n VALUE = {{ $value }}"

  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 30m
    labels:
      severity: day
    annotations:
      description: "Prometheus configuration reload error\n VALUE = {{ $value }}"

  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
    for: 0m
    labels:
      severity: warn
    annotations:
      description: "Prometheus has restarted more than ten times in the last 30 minutes. It might be crashlooping.\n VALUE = {{ $value }}"

  - alert: PrometheusAlertmanagerJobMissing
    expr: absent(up{job="alertmanager"})
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}"

  - alert: PrometheusAlertmanagerConfigurationReloadFailure
    expr: alertmanager_config_last_reload_successful != 1
    for: 30m
    labels:
      severity: day
    annotations:
      description: "AlertManager configuration reload error\n VALUE = {{ $value }}"

  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 30m
    labels:
      severity: day
    annotations:
      description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}"

  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}"

  - alert: PrometheusTemplateTextExpansionFailures
    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}"

  - alert: PrometheusRuleEvaluationSlow
    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
    for: 5m
    labels:
      severity: warn
    annotations:
      description: "Prometheus rule evaluation took more time than the scheduled interval. This indicates slower storage backend access or a too-complex query.\n VALUE = {{ $value }}"

  - alert: PrometheusNotificationsBacklog
    expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
    for: 0m
    labels:
      severity: warn
    annotations:
      description: "The Prometheus notification queue has not been empty for 30 minutes\n VALUE = {{ $value }}"

  - alert: PrometheusAlertmanagerNotificationFailing
    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}"

  # file_sd doesn't count as service discovery, so 0 is expected.
  # - alert: PrometheusTargetEmpty
  #   expr: prometheus_sd_discovered_targets == 0
  #   for: 30m
  #   labels:
  #     severity: day
  #   annotations:
  #     description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}"

  - alert: PrometheusTargetScrapingSlow
    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}"

  - alert: PrometheusLargeScrape
    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}"

  - alert: PrometheusTargetScrapeDuplicate
    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}"

  - alert: PrometheusTsdbCheckpointCreationFailures
    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}"

  - alert: PrometheusTsdbCheckpointDeletionFailures
    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}"

  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}"

  - alert: PrometheusTsdbHeadTruncationsFailed
    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}"

  - alert: PrometheusTsdbReloadFailures
    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}"

  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}"

  - alert: PrometheusTsdbWalTruncationsFailed
    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
    for: 30m
    labels:
      severity: warn
    annotations:
      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}"