iankelling.org Git - distro-setup/blob - filesystem/etc/prometheus/rules/iank.yml

   1 # other rules to consider:
   2 # filesystem, network, ntp rules:
   3 # https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
   4 # on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
   5 #
   6
   7
   8 groups:
   9 - name: standard
  10   rules:
  11
  12 ## uncomment for testing an alert firing
  13 #   - alert: test-alert4
  14 #     expr: vector(1)
  15 # #    expr: nonexistent_metric
  16 #     for: 0m
  17 #     labels:
  18 #       severity: day
  19 #     annotations:
  20 #       description: "always-firing alert VALUE = {{ $value }}"
  21
  22
  23
  24 ###### BEGIN MISC NOTES ######
  25
  26 #
  27 # other interesting exporters
  28 # https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
  29 #
  30
  31 # interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
  32
  33 # interesting promql query that could be useful later.
  34 # changes(ALERTS_FOR_STATE[24h])
  35 #
  36 #
  37 #
  38 # alert flap strategy.
  39 # https://roidelapluie.be/blog/2019/02/21/prometheus-last/
  40 #
  41 # Another idea generally is to make an alert that fires for 24 hours and
  42 # inhibits another alert for the same thing, which we want at most
  43 # 1 alert per 24 hours.
  44
  45 ###### END MISC NOTES ######
  46
  47 # various queries only look at increases, so invert the up metric so we
  48 # can better query on down.
  49   - record: down
  50     expr: up == bool 0
  51
  52
  53 # alerting on missing metrics:
  54 # https://www.robustperception.io/absent-alerting-for-scraped-metrics
# that doesn't work if we want to alert across multiple hosts, eg
# up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
# however, google led me to a solution here
# https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
# there is also the absent() function, but I didn't see a way to make that work
  # Fires when a node target that is up has been missing one or more of the
  # three expected systemd unit series (systemstatus, btrfsmaintstop,
  # dynamicipupdate) for 20 minutes. $value is the number of missing units.
  #
  # `unless` matches on labels only and ignores sample values, so on its own
  # the first branch fires only when ALL three series are absent for an
  # instance. The second branch covers the case where 1 or 2 of them are
  # missing.
  # NOTE(review): assumes every job="node" host is expected to run all three
  # units - confirm before deploying.
  - alert: mysers_units_missing
    expr: |-
      (
        count(up{job="node"} == 1) by (instance) * 3 unless
        count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance)
      )
      or
      3 - count(node_systemd_unit_state{job="node",name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance) > 0
    for: 20m
    labels:
      severity: warn
    annotations:
      summary: "{{ $value }} of 3 expected systemd units missing"
  67
  # Fires when the active-state series for epanicclean.service has held a
  # value other than 1 for 20 minutes.
  - alert: epanicclean_not_active
    expr: 'node_systemd_unit_state{name="epanicclean.service",state="active"} != 1'
    for: 20m
    labels:
      severity: warn
  74
  # Fires when a node or tlsnode target is up but has exposed no
  # active-state series for epanicclean.service for 20 minutes.
  # `unless` drops every up instance that does have the series, leaving
  # only the instances where it is absent.
  - alert: epanicclean_missing
    for: 20m
    labels:
      severity: warn
    expr: |-
      count(up{job=~"node|tlsnode"} == 1) by (instance) unless
      count(node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"}) by (instance)
  82
  # Fires when the active-state series of systemstatus, btrfsmaintstop or
  # dynamicipupdate has held a value other than 1 for 20 minutes.
  - alert: mysers_not_active
    expr: 'node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1'
    for: 20m
    labels:
      severity: warn
  89
  # Fires while the rate of node_systemd_unit_result_fail_count over the
  # trailing 30 minutes is positive.
  - alert: sysd_result_fail
    # The 30m window may not really be needed; presumably it keeps the
    # alert from flapping.
    expr: 'rate(node_systemd_unit_result_fail_count[30m]) > 0'
    labels:
      severity: day
  97
  # Fires as soon as exim_paniclog is above zero (there is no `for:` delay).
  - alert: exim_paniclog
    expr: 'exim_paniclog > 0'
    labels:
      severity: warn
 103
 104   - alert: check_crypttab
 105     expr: |-
 106       check_crypttab > 0
 107     labels:
 108       severity: prod
 109
 110   - alert: mailtest_check_vps
 111     expr: |-
 112       time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 12
 113     labels:
 114       severity: day
 115     annotations:
 116       summary: '12 minutes down'
 117
 118   - alert: mailtest_check_unexpected_spamd_vps
 119     expr: |-
 120       mailtest_check_unexpected_spamd_results >= 1
 121     labels:
 122       severity: day
 123     annotations:
 124       summary: 'jr -u mailtest-check -e'
 125
 126   - alert: mailtest_check_mailhost
 127     expr: |-
 128       time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 12
 129     labels:
 130       severity: day
 131     annotations:
 132       summary: '12 minutes down'
 133
 134   # 20 minutes. just allow for more due to prod alert.
 135   - alert: mailtest_check_gnu_mailhost
 136     expr: |-
 137       time() - max by (folder,from) (mailtest_check_last_usec{folder="/m/md/l/testignore", from="iank@gnu.org"}) >= 60 * 20
 138     labels:
 139       severity: prod
 140     annotations:
 141       summary: '20 minutes down'
 142
 143
 144   - alert: 1pmtest
 145     expr: hour() == 17 and minute() < 5
 146     for: 0m
 147     labels:
 148       severity: daytest
 149     annotations:
 150       summary: Prometheus daily test alert
 151
 152
 153 #### Inhibit notes ####
## Example of expressions to detect if the target_down alert
# fired in the last 24 hours. Initially, I thought this could
# be an alert which inhibits up_resets, but eventually I figured
# that doesn't make much sense. The idea of using an alert
# that is not an indication of something wrong, and only inhibits another
# alert, I think works better when integrated directly into the
# alert it would inhibit; this may mean a recording rule. That avoids
# an alert we have to ignore or filter out.
 162 #
 163 # Alternate expression, to calculate if the alert would have fired is:
 164 #  min_over_time(sum_over_time(up[30m])[1d:]) == 0
 165 #  where 30m matches the for: time in target_down
 166 #
# Note: for graphing, surround the expression in sum_over_time()
 168 # ALERTS{alertname="target_down",alertstate="firing"}[1d]
 169 #### end Inhibit notes ####
 170
 171
 172 # For targets where we alert only on long downtimes, we
 173 # still want to know if it is going down many times for short times over
 174 # a long period of time. But ignore reboots.
 175 #
 176 ## Another way would be to detect an overall downtime:
 177 # avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
 178   - alert: up_resets
 179     expr: |-
 180       resets(up[1d]) - changes(node_boot_time_seconds[1d]) > 12
 181     labels:
 182       severity: warn
 183     annotations:
 184       summary: "Target has gone down {{ $value }} times in 1 day, > 12"
 185
 186
 187
 188 # https://awesome-prometheus-alerts.grep.to/rules
 189
 190 # todo, we should probably group the prometheus alerts that indicate a
 191 # host-local problem.
 192 # eg, set a label  alert-group: local-prom, then make a receiver that
 193 # groups by it when the alert-group is local-prom.
 194
 195 - name: awesome prometheus alerts
 196   rules:
 197
 198   - alert: PrometheusJobMissing
 199     expr: absent(up{job="prometheus"})
 200     for: 30m
 201     labels:
 202       severity: day
 203     annotations:
 204       summary: Prometheus job missing (instance {{ $labels.instance }})
 205       description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
 206
 207 # TODO: some hosts, notably li and MAIL_HOST, we want to alert sooner than 30m,
 208 # and severity to day. mail host is tricky since it roams, but I think the
 209 # right way to do it is to check for absence of this metric:
 210 # mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
 211   - alert: target_down
 212     expr: up == 0
 213     for: 30m
 214     labels:
 215       severity: warn
 216     annotations:
 217       summary: Target down for 30m
 218
 219
 220     # todo: this should group with the above alert
 221   - alert: PrometheusAllTargetsMissing
 222     expr: count by (job) (up) == 0
 223     for: 10m
 224     labels:
 225       severity: day
 226 #      alert-group: local-prom
 227     annotations:
 228       description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}"
 229
 230   - alert: PrometheusConfigurationReloadFailure
 231     expr: prometheus_config_last_reload_successful != 1
 232     for: 30m
 233     labels:
 234       severity: day
 235     annotations:
 236       description: "Prometheus configuration reload error\n  VALUE = {{ $value }}"
 237
 238   - alert: PrometheusTooManyRestarts
 239     expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
 240     for: 0m
 241     labels:
 242       severity: warning
 243     annotations:
 244       description: "Prometheus has restarted more than ten times in the last 30 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
 245
 246   - alert: PrometheusAlertmanagerJobMissing
 247     expr: absent(up{job="alertmanager"})
 248     for: 30m
 249     labels:
 250       severity: warn
 251     annotations:
 252       description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}"
 253
 254   - alert: PrometheusAlertmanagerConfigurationReloadFailure
 255     expr: alertmanager_config_last_reload_successful != 1
 256     for: 30m
 257     labels:
 258       severity: day
 259     annotations:
 260       description: "AlertManager configuration reload error\n  VALUE = {{ $value }}"
 261
 262   - alert: PrometheusNotConnectedToAlertmanager
 263     expr: prometheus_notifications_alertmanagers_discovered < 1
 264     for: 30m
 265     labels:
 266       severity: day
 267     annotations:
 268       description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}"
 269
 270   - alert: PrometheusRuleEvaluationFailures
 271     expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
 272     for: 30m
 273     labels:
 274       severity: warn
 275     annotations:
 276       description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}"
 277
 278   - alert: PrometheusTemplateTextExpansionFailures
 279     expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
 280     for: 30m
 281     labels:
 282       severity: warn
 283     annotations:
 284       description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}"
 285
 286   - alert: PrometheusRuleEvaluationSlow
 287     expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
 288     for: 5m
 289     labels:
 290       severity: warn
 291     annotations:
 292       description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}"
 293
 294   - alert: PrometheusNotificationsBacklog
 295     expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
 296     for: 0m
 297     labels:
 298       severity: warn
 299     annotations:
 300       description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}"
 301
 302   - alert: PrometheusAlertmanagerNotificationFailing
 303     expr: rate(alertmanager_notifications_failed_total[1m]) > 0
 304     for: 30m
 305     labels:
 306       severity: warn
 307     annotations:
 308       description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}"
 309
  # file_sd doesn't count as service discovery, so 0 is expected.
 311   # - alert: PrometheusTargetEmpty
 312   #   expr: prometheus_sd_discovered_targets == 0
 313   #   for: 30m
 314   #   labels:
 315   #     severity: day
 316   #   annotations:
 317   #     description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
 318
 319   - alert: PrometheusTargetScrapingSlow
 320     expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
 321     for: 30m
 322     labels:
 323       severity: warn
 324     annotations:
 325       description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}"
 326
 327   - alert: PrometheusLargeScrape
 328     expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
 329     for: 30m
 330     labels:
 331       severity: warn
 332     annotations:
 333       description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}"
 334
 335   - alert: PrometheusTargetScrapeDuplicate
 336     expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
 337     for: 30m
 338     labels:
 339       severity: warn
 340     annotations:
 341       description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}"
 342
 343   - alert: PrometheusTsdbCheckpointCreationFailures
 344     expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
 345     for: 30m
 346     labels:
 347       severity: warn
 348     annotations:
 349       description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}"
 350
 351   - alert: PrometheusTsdbCheckpointDeletionFailures
 352     expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
 353     for: 30m
 354     labels:
 355       severity: warn
 356     annotations:
 357       description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}"
 358
 359   - alert: PrometheusTsdbCompactionsFailed
 360     expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
 361     for: 30m
 362     labels:
 363       severity: warn
 364     annotations:
 365       description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}"
 366
 367   - alert: PrometheusTsdbHeadTruncationsFailed
 368     expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
 369     for: 30m
 370     labels:
 371       severity: warn
 372     annotations:
 373       description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}"
 374
 375   - alert: PrometheusTsdbReloadFailures
 376     expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
 377     for: 30m
 378     labels:
 379       severity: warn
 380     annotations:
 381       description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}"
 382
 383   - alert: PrometheusTsdbWalCorruptions
 384     expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
 385     for: 30m
 386     labels:
 387       severity: warn
 388     annotations:
 389       description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}"
 390
 391   - alert: PrometheusTsdbWalTruncationsFailed
 392     expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
 393     for: 30m
 394     labels:
 395       severity: warn
 396     annotations:
 397       description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"