X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=75b5cbcfc760925b79d51fd6c3262de7a6186a97;hb=802e885e3e7fa3857f8bc4f54c261d5ca76f2454;hp=72b0701be8cd0a808d1924eaab94e50dc3b41786;hpb=b18dade73dedfe69aa741f8417947d83c4208f2d;p=distro-setup

diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml
index 72b0701..75b5cbc 100644
--- a/filesystem/etc/prometheus/rules/iank.yml
+++ b/filesystem/etc/prometheus/rules/iank.yml
@@ -1,178 +1,381 @@
+# other rules to consider:
+# filesystem, network, ntp rules:
+# https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml
+# on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml
+#
+
 groups:
-- name: standard alerts
+- name: standard
   rules:
-  - alert: mailtest-check
+
+## uncomment for testing an alert firing
+# - alert: test-alert4
+#   expr: vector(1)
+#   # expr: nonexistent_metric
+#   for: 0m
+#   labels:
+#     severity: day
+#   annotations:
+#     description: "always-firing alert VALUE = {{ $value }}"
+
+
+
+###### BEGIN MISC NOTES ######
+
+#
+# other interesting exporters
+# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
+#
+
+# interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
+
+# interesting promql query that could be useful later.
+# changes(ALERTS_FOR_STATE[24h])
+#
+#
+#
+# alert flap strategy.
+# https://roidelapluie.be/blog/2019/02/21/prometheus-last/
+#
+# Another general idea is to make an alert that fires for 24 hours and
+# inhibits another alert for the same thing, for cases where we want at
+# most 1 alert per 24 hours.
+
+###### END MISC NOTES ######
+
+
+
+
+# alerting on missing metrics:
+# https://www.robustperception.io/absent-alerting-for-scraped-metrics
+# that doesn't work if we want to alert across multiple hosts, e.g.
+# up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
+# however, google led me to a solution here:
+# https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
+# there is also the absent() function, but I didn't see a way to make that work
+  - alert: mysers_units_missing
+    expr: |-
+      count(up{job="node"} == 1) by (instance) * 3 unless
+      count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance)
+    for: 20m
+    labels:
+      severity: warn
+
+  - alert: epanicclean_not_active
+    expr: |-
+      node_systemd_unit_state{name="epanicclean.service",state="active"} != 1
+    for: 20m
+    labels:
+      severity: warn
+
+  - alert: epanicclean_missing
     expr: |-
-      time() - mailtest_check_last_usec > 60 * 12
+      count(up{job=~"node|tlsnode"} == 1) by (instance) unless
+      count(node_systemd_unit_state{job=~"node|tlsnode",name="epanicclean.service",state="active"}) by (instance)
+    for: 20m
+    labels:
+      severity: warn
+
+  - alert: mysers_not_active
+    expr: |-
+      node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
+    for: 20m
+    labels:
+      severity: warn
+
+  - alert: sysd_result_fail
+    expr: |-
+      rate(node_systemd_unit_result_fail_count[30m]) > 0
+    labels:
+      severity: day
+
+  - alert: mailtest_check_vps
+    expr: |-
+      time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 12
     labels:
       severity: day
     annotations:
-      description: '{{ $labels.instance }} mailtest-check'
-      summary: {{ $labels.instance }} mailtest-check
 
-  - alert: mailtest-check
+      summary: '12 minutes down'
+
+  # 42 mins: enough for a 30 min queue run plus 12
+  - alert: mailtest_check_vps
     expr: |-
-      # 42 mins: enough for a 30 min queue run plus 12
-      time() - mailtest_check_last_usec > 60 * 42
+      time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 42
     labels:
       severity: prod
     annotations:
-      description: '{{ $labels.instance }} mailtest-check'
-      summary: {{ $labels.instance }} mailtest-check
+      summary: '42 minutes down'
+  - alert: mailtest_check_mailhost
+    expr: |-
+      time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 12
+    labels:
+      severity: day
+    annotations:
+      summary: '12 minutes down'
+  # 42 mins: enough for a 30 min queue run plus 12
+  - alert: mailtest_check_mailhost
+    expr: |-
+      time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 42
+    labels:
+      severity: prod
+    annotations:
+      summary: '42 minutes down'
 
 
-  # - alert: NodeFilesystemAlmostOutOfSpace
-  #   annotations:
-  #     description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
-  #       only {{ printf "%.2f" $value }}% available space left.
-  #     summary: Filesystem has less than 5% space left.
-  #   expr: |-
-  #     (
-  #       node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
-  #     and
-  #       node_filesystem_readonly{job="node",fstype!=""} == 0
-  #     )
-  #   for: 1h
-  #   labels:
-  #     severity: warning
-  # - alert: NodeFilesystemAlmostOutOfSpace
-  #   annotations:
-  #     description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
-  #       only {{ printf "%.2f" $value }}% available space left.
-  #     summary: Filesystem has less than 3% space left.
-  #   expr: |-
-  #     (
-  #       node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
-  #     and
-  #       node_filesystem_readonly{job="node",fstype!=""} == 0
-  #     )
-  #   for: 1h
-  #   labels:
-  #     severity: critical
-  # - alert: NodeFilesystemFilesFillingUp
-  #   annotations:
-  #     description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
-  #       only {{ printf "%.2f" $value }}% available inodes left and is filling up.
-  #     summary: Filesystem is predicted to run out of inodes within the next 24 hours.
-  #   expr: |-
-  #     (
-  #       node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
-  #     and
-  #       predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
-  #     and
-  #       node_filesystem_readonly{job="node",fstype!=""} == 0
-  #     )
-  #   for: 1h
-  #   labels:
-  #     severity: warning
-  # - alert: NodeFilesystemFilesFillingUp
-  #   annotations:
-  #     description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
-  #       only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
-  #     summary: Filesystem is predicted to run out of inodes within the next 4 hours.
-  #   expr: |-
-  #     (
-  #       node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
-  #     and
-  #       predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
-  #     and
-  #       node_filesystem_readonly{job="node",fstype!=""} == 0
-  #     )
-  #   for: 1h
-  #   labels:
-  #     severity: critical
-  # - alert: NodeFilesystemAlmostOutOfFiles
-  #   annotations:
-  #     description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
-  #       only {{ printf "%.2f" $value }}% available inodes left.
-  #     summary: Filesystem has less than 5% inodes left.
-  #   expr: |-
-  #     (
-  #       node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
-  #     and
-  #       node_filesystem_readonly{job="node",fstype!=""} == 0
-  #     )
-  #   for: 1h
-  #   labels:
-  #     severity: warning
-  # - alert: NodeFilesystemAlmostOutOfFiles
-  #   annotations:
-  #     description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
-  #       only {{ printf "%.2f" $value }}% available inodes left.
-  #     summary: Filesystem has less than 3% inodes left.
-  #   expr: |-
-  #     (
-  #       node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
-  #     and
-  #       node_filesystem_readonly{job="node",fstype!=""} == 0
-  #     )
-  #   for: 1h
-  #   labels:
-  #     severity: critical
-  # - alert: NodeNetworkReceiveErrs
-  #   annotations:
-  #     description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
-  #       {{ printf "%.0f" $value }} receive errors in the last two minutes.'
-  #     summary: Network interface is reporting many receive errors.
-  #   expr: |-
-  #     increase(node_network_receive_errs_total[2m]) > 10
-  #   for: 1h
-  #   labels:
-  #     severity: warning
-  # - alert: NodeNetworkTransmitErrs
-  #   annotations:
-  #     description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
-  #       {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
-  #     summary: Network interface is reporting many transmit errors.
-  #   expr: |-
-  #     increase(node_network_transmit_errs_total[2m]) > 10
-  #   for: 1h
-  #   labels:
-  #     severity: warning
-  # - alert: NodeHighNumberConntrackEntriesUsed
-  #   annotations:
-  #     description: '{{ $value | humanizePercentage }} of conntrack entries are used'
-  #     summary: Number of conntrack are getting close to the limit
-  #   expr: |-
-  #     (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
-  #   labels:
-  #     severity: warning
-  # - alert: NodeClockSkewDetected
-  #   annotations:
-  #     message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure
-  #       NTP is configured correctly on this host.
-  #     summary: Clock skew detected.
-  #   expr: |-
-  #     (
-  #       node_timex_offset_seconds > 0.05
-  #     and
-  #       deriv(node_timex_offset_seconds[5m]) >= 0
-  #     )
-  #     or
-  #     (
-  #       node_timex_offset_seconds < -0.05
-  #     and
-  #       deriv(node_timex_offset_seconds[5m]) <= 0
-  #     )
-  #   for: 10m
-  #   labels:
-  #     severity: warning
-  # - alert: NodeClockNotSynchronising
-  #   annotations:
-  #     message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
-  #       on this host.
-  #     summary: Clock not synchronising.
-  #   expr: |-
-  #     min_over_time(node_timex_sync_status[5m]) == 0
-  #   for: 10m
-  #   labels:
-  #     severity: warning
-  # - alert: ianktest
-  #   expr: node_systemd_version >= 300
+
+  - alert: 1pmtest
+    expr: hour() == 17 and minute() < 5
+    for: 0m
+    labels:
+      severity: daytest
+    annotations:
+      summary: Prometheus daily test alert
+
+
+#### Inhibit notes ####
+## Example of expressions to detect whether the target_down alert
+# fired in the last 24 hours. Initially, I thought this could
+# be an alert which inhibits up_resets, but eventually I figured
+# that doesn't make much sense: an alert that is not an indication of
+# something wrong and exists only to inhibit another alert works better
+# integrated directly into the alert it would inhibit, which may mean
+# a recording rule. That avoids an alert we have to ignore or filter out.
+#
+# An alternate expression to calculate whether the alert would have fired is:
+# min_over_time(sum_over_time(up[30m])[1d:]) == 0
+# where 30m matches the for: time in target_down
+#
+# Note: for graphing, wrap the following expression in sum_over_time():
+# ALERTS{alertname="target_down",alertstate="firing"}[1d]
+#### end Inhibit notes ####
+
+
+# For targets where we alert only on long downtimes, we still want to know
+# if a target is going down many times for short periods over a long span
+# of time, while ignoring reboots.
+#
+## Another way would be to detect an overall downtime:
+# avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
+  - alert: up_resets
+    expr: |-
+      resets(up[2d]) - changes(node_boot_time_seconds[2d]) > 12
+    labels:
+      severity: warn
+    annotations:
+      summary: "Target has gone down {{ $value }} times in 2 days, > 12"
+
+
+
+# https://awesome-prometheus-alerts.grep.to/rules
+
+# todo: we should probably group the prometheus alerts that indicate a
+# host-local problem.
+# e.g., set a label alert-group: local-prom, then make a receiver that
+# groups by it when the alert-group is local-prom (see the routing sketch
+# after this diff).
+
+- name: awesome prometheus alerts
+  rules:
+
+  - alert: PrometheusJobMissing
+    expr: absent(up{job="prometheus"})
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      summary: Prometheus job missing (instance {{ $labels.instance }})
+      description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
+
+# TODO: for some hosts, notably li and MAIL_HOST, we want to alert sooner than
+# 30m and set severity to day. The mail host is tricky since it roams, but I
+# think the right way to do it is to check for absence of this metric
+# (see the absent() sketch after this diff):
+# mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
+  - alert: target_down
+    expr: up == 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Target down for 30m
+
+
+  # todo: this should group with the above alert
+  - alert: PrometheusAllTargetsMissing
+    expr: count by (job) (up) == 0
+    for: 10m
+    labels:
+      severity: day
+#      alert-group: local-prom
+    annotations:
+      description: "A Prometheus job no longer has any living targets.\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusConfigurationReloadFailure
+    expr: prometheus_config_last_reload_successful != 1
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      description: "Prometheus configuration reload error\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      description: "Prometheus has restarted more than ten times in the last 30 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusAlertmanagerJobMissing
+    expr: absent(up{job="alertmanager"})
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusAlertmanagerConfigurationReloadFailure
+    expr: alertmanager_config_last_reload_successful != 1
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      description: "AlertManager configuration reload error\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusNotConnectedToAlertmanager
+    expr: prometheus_notifications_alertmanagers_discovered < 1
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      description: "Prometheus cannot connect to the alertmanager\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTemplateTextExpansionFailures
+    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusRuleEvaluationSlow
+    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
+    for: 5m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or a too complex query.\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusNotificationsBacklog
+    expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
+    for: 0m
+    labels:
+      severity: warn
+    annotations:
+      description: "The Prometheus notification queue has not been empty for 30 minutes\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusAlertmanagerNotificationFailing
+    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Alertmanager is failing to send notifications\n  VALUE = {{ $value }}"
+
+  # file_sd doesn't count as service discovery, so 0 is expected.
+  # - alert: PrometheusTargetEmpty
+  #   expr: prometheus_sd_discovered_targets == 0
+  #   for: 30m
 #   labels:
-  #     severity: critical
+  #     severity: day
 #   annotations:
-  #     description: '{{ $labels.instance }} ianktest.'
-  #     summary: Instance {{ $labels.instance }} - ianktest
+  #     description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusLargeScrape
+    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTargetScrapeDuplicate
+    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTsdbCheckpointCreationFailures
+    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTsdbCheckpointDeletionFailures
+    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} TSDB compaction failures\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTsdbHeadTruncationsFailed
+    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTsdbReloadFailures
+    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTsdbWalCorruptions
+    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}"
+
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"
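The TODO above the target_down rule suggests handling MAIL_HOST by alerting on the absence of the mailtest metric rather than on up == 0. A minimal sketch of what such a rule could look like, using the metric and labels quoted in that comment; the alert name, the 20m for: window, and the severity are assumptions, not part of the committed change:

  # sketch only, not in the commit: name, for: window, and severity are guesses
  - alert: mailtest_check_metric_missing
    expr: |-
      absent(mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"})
    for: 20m
    labels:
      severity: day
    annotations:
      summary: 'mailtest metric missing from whichever host is MAIL_HOST'

absent() returns 1 only when no series matches the selector, so this fires wherever the roaming mail host stops reporting, without naming a specific instance.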
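The earlier todo about grouping host-local Prometheus alerts would be implemented on the Alertmanager side. A rough routing sketch under stated assumptions: the label is spelled alert_group rather than alert-group (Prometheus label names cannot contain hyphens), each local-prom alert carries alert_group: local-prom in its labels, and a receiver named local-prom already exists in the Alertmanager config:

# sketch only: assumes an alert_group label on the rules and an existing
# receiver called local-prom in alertmanager.yml
route:
  routes:
    - matchers:
        - alert_group="local-prom"
      group_by: [alert_group, instance]
      receiver: local-prom

Grouping by instance as well keeps one notification per host when several of the host-local Prometheus alerts fire together, which is the behavior the todo describes.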