X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;fp=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=4439c41d41cb61c6a59a4bbb5b967fbf0aeebbb3;hb=d7551546ac323c5d4b49370c885646bcf96e959f;hp=72b0701be8cd0a808d1924eaab94e50dc3b41786;hpb=b18dade73dedfe69aa741f8417947d83c4208f2d;p=distro-setup diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index 72b0701..4439c41 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -1,6 +1,12 @@ +# other rules to consider: +# filesystem, network, ntp rules: +# https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml +# on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml +# + groups: -- name: standard alerts +- name: standard rules: - alert: mailtest-check expr: |- @@ -9,170 +15,258 @@ groups: severity: day annotations: description: '{{ $labels.instance }} mailtest-check' - summary: {{ $labels.instance }} mailtest-check + summary: '{{ $labels.instance }} mailtest-check' + # 42 mins: enough for a 30 min queue run plus 12 - alert: mailtest-check expr: |- - # 42 mins: enough for a 30 min queue run plus 12 time() - mailtest_check_last_usec > 60 * 42 labels: severity: prod annotations: description: '{{ $labels.instance }} mailtest-check' - summary: {{ $labels.instance }} mailtest-check + summary: '{{ $labels.instance }} mailtest-check' + - alert: 1pmtest + expr: hour() == 18 and minute() < 5 + for: 0m + labels: + severity: daytest + annotations: + summary: Prometheus daily test alert (instance {{ $labels.instance }}) + description: "Prometheus daily test alert if no other alerts. It + is an end to end test.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # - alert: NodeFilesystemAlmostOutOfSpace - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available space left. - # summary: Filesystem has less than 5% space left. - # expr: |- - # ( - # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: warning - # - alert: NodeFilesystemAlmostOutOfSpace - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available space left. - # summary: Filesystem has less than 3% space left. - # expr: |- - # ( - # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeFilesystemFilesFillingUp - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left and is filling up. - # summary: Filesystem is predicted to run out of inodes within the next 24 hours. 
- # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 - # and - # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: warning - # - alert: NodeFilesystemFilesFillingUp - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - # summary: Filesystem is predicted to run out of inodes within the next 4 hours. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 - # and - # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeFilesystemAlmostOutOfFiles - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left. - # summary: Filesystem has less than 5% inodes left. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: warning - # - alert: NodeFilesystemAlmostOutOfFiles - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left. - # summary: Filesystem has less than 3% inodes left. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeNetworkReceiveErrs - # annotations: - # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - # {{ printf "%.0f" $value }} receive errors in the last two minutes.' - # summary: Network interface is reporting many receive errors. - # expr: |- - # increase(node_network_receive_errs_total[2m]) > 10 - # for: 1h - # labels: - # severity: warning - # - alert: NodeNetworkTransmitErrs - # annotations: - # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - # {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - # summary: Network interface is reporting many transmit errors. - # expr: |- - # increase(node_network_transmit_errs_total[2m]) > 10 - # for: 1h - # labels: - # severity: warning - # - alert: NodeHighNumberConntrackEntriesUsed - # annotations: - # description: '{{ $value | humanizePercentage }} of conntrack entries are used' - # summary: Number of conntrack are getting close to the limit - # expr: |- - # (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 +# https://awesome-prometheus-alerts.grep.to/rules + + +# todo, we should probably group the prometheus alerts that indicate a +# host-local problem. +# eg, set a label alert-group: local-prom, then make a receiver that +# groups by it when the alert-group is local-prom. 
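+#
+# a rough sketch of what the alertmanager side of that idea could look
+# like (note: prometheus label names cannot contain "-", so the label
+# would probably end up as alert_group; the receiver name
+# local-prom-email is hypothetical, nothing is wired up yet):
+#
+#   route:
+#     routes:
+#       - match:
+#           alert_group: local-prom
+#         group_by: ['alert_group', 'instance']
+#         receiver: local-prom-email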
+ +- name: awesome prometheus alerts + rules: + + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 30m + labels: + severity: day + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissing + expr: up == 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # todo: this should supress the above alert + # - alert: PrometheusAllTargetsMissing + # expr: count by (job) (up) == 0 + # for: 30m # labels: - # severity: warning - # - alert: NodeClockSkewDetected + # severity: day + # alert-group: local-prom # annotations: - # message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure - # NTP is configured correctly on this host. - # summary: Clock skew detected. - # expr: |- - # ( - # node_timex_offset_seconds > 0.05 - # and - # deriv(node_timex_offset_seconds[5m]) >= 0 - # ) - # or - # ( - # node_timex_offset_seconds < -0.05 - # and - # deriv(node_timex_offset_seconds[5m]) <= 0 - # ) - # for: 10m + # summary: Prometheus all targets missing (instance {{ $labels.instance }}) + # description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # I have an out of band alert to make sure prometheus is up. this + # looks like it would generate false positives. todo: think + # through what a valid crash loop detection would look like. + # - alert: PrometheusTooManyRestarts + # expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 10 + # for: 0m # labels: # severity: warning - # - alert: NodeClockNotSynchronising # annotations: - # message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured - # on this host. - # summary: Clock not synchronising. - # expr: |- - # min_over_time(node_timex_sync_status[5m]) == 0 - # for: 10m - # labels: - # severity: warning - # - alert: ianktest - # expr: node_systemd_version >= 300 + # summary: Prometheus too many restarts (instance {{ $labels.instance }}) + # description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerJobMissing + expr: absent(up{job="alertmanager"}) + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warn + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0 + for: 0m + labels: + severity: warn + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # file_sd doesnt count as service discovery, so 0 is expected. + # - alert: PrometheusTargetEmpty + # expr: prometheus_sd_discovered_targets == 0 + # for: 30m # labels: - # severity: critical + # severity: day # annotations: - # description: '{{ $labels.instance }} ianktest.' 
-  #     summary: Instance {{ $labels.instance }} - ianktest
+  #     summary: Prometheus target empty (instance {{ $labels.instance }})
+  #     description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+      description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusLargeScrape
+    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus large scrape (instance {{ $labels.instance }})
+      description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTargetScrapeDuplicate
+    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbCheckpointCreationFailures
+    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbCheckpointDeletionFailures
+    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbHeadTruncationsFailed
+    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbReloadFailures
+    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbWalCorruptions
+    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
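+
+# sketch for the todo above about suppressing PrometheusTargetMissing
+# when the (commented-out) PrometheusAllTargetsMissing fires: an
+# alertmanager inhibit rule could mute the per-target alert while the
+# whole-job alert is firing. untested; alert names as used above:
+#
+#   inhibit_rules:
+#     - source_match:
+#         alertname: PrometheusAllTargetsMissing
+#       target_match:
+#         alertname: PrometheusTargetMissing
+#       equal: ['job']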