X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=4439c41d41cb61c6a59a4bbb5b967fbf0aeebbb3;hb=7e673f2615101982a17ffa0e23cb10b5b3803f07;hp=043b64d695a4226e8c9103cb7fcce25314527eef;hpb=e958999a4ab6fddd723270b596b4899c0811fa41;p=distro-setup diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index 043b64d..4439c41 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -1,157 +1,272 @@ +# other rules to consider: +# filesystem, network, ntp rules: +# https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml +# on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml +# + groups: -- name: ansible managed alert rules +- name: standard rules: - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left. - summary: Filesystem has less than 5% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left. - summary: Filesystem has less than 3% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left and is filling up. - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: |- - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: |- - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left. - summary: Filesystem has less than 5% inodes left. 
+ - alert: mailtest-check expr: |- - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left. - summary: Filesystem has less than 3% inodes left. - expr: |- - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} receive errors in the last two minutes.' - summary: Network interface is reporting many receive errors. - expr: |- - increase(node_network_receive_errs_total[2m]) > 10 - for: 1h + time() - mailtest_check_last_usec > 60 * 12 labels: - severity: warning - - alert: NodeNetworkTransmitErrs + severity: day annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - summary: Network interface is reporting many transmit errors. + description: '{{ $labels.instance }} mailtest-check' + summary: '{{ $labels.instance }} mailtest-check' + + # 42 mins: enough for a 30 min queue run plus 12 + - alert: mailtest-check expr: |- - increase(node_network_transmit_errs_total[2m]) > 10 - for: 1h + time() - mailtest_check_last_usec > 60 * 42 labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed + severity: prod annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used' - summary: Number of conntrack are getting close to the limit - expr: |- - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + description: '{{ $labels.instance }} mailtest-check' + summary: '{{ $labels.instance }} mailtest-check' + + - alert: 1pmtest + expr: hour() == 18 and minute() < 5 + for: 0m labels: - severity: warning - - alert: NodeClockSkewDetected + severity: daytest annotations: - message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure - NTP is configured correctly on this host. - summary: Clock skew detected. - expr: |- - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured - on this host. - summary: Clock not synchronising. - expr: |- - min_over_time(node_timex_sync_status[5m]) == 0 - for: 10m + summary: Prometheus daily test alert (instance {{ $labels.instance }}) + description: "Prometheus daily test alert if no other alerts. It + is an end to end test.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + +# https://awesome-prometheus-alerts.grep.to/rules + + +# todo, we should probably group the prometheus alerts that indicate a +# host-local problem. +# eg, set a label alert-group: local-prom, then make a receiver that +# groups by it when the alert-group is local-prom. 
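+# A rough sketch (not active config) of what that could look like on the
+# alertmanager.yml side, assuming we add an alert-group: local-prom label
+# to those rules; the receiver name here is hypothetical and would also
+# need a matching entry under receivers::
+# route:
+#   routes:
+#     - match:
+#         alert-group: local-prom
+#       group_by: [alert-group, instance]
+#       receiver: local-prom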
+
+- name: awesome prometheus alerts
+  rules:
+
+  - alert: PrometheusJobMissing
+    expr: absent(up{job="prometheus"})
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      summary: Prometheus job missing (instance {{ $labels.instance }})
+      description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTargetMissing
+    expr: up == 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus target missing (instance {{ $labels.instance }})
+      description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  # todo: this should suppress the above alert. see the inhibit_rules
+  # sketch at the end of this file.
+  # - alert: PrometheusAllTargetsMissing
+  #   expr: count by (job) (up) == 0
+  #   for: 30m
+  #   labels:
+  #     severity: day
+  #     alert-group: local-prom
+  #   annotations:
+  #     summary: Prometheus all targets missing (instance {{ $labels.instance }})
+  #     description: "A Prometheus job no longer has any living targets.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusConfigurationReloadFailure
+    expr: prometheus_config_last_reload_successful != 1
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+      description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  # I have an out of band alert to make sure prometheus is up. this
+  # looks like it would generate false positives. todo: think
+  # through what a valid crash loop detection would look like.
+  # - alert: PrometheusTooManyRestarts
+  #   expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 10
+  #   for: 0m
+  #   labels:
+  #     severity: warning
+  #   annotations:
+  #     summary: Prometheus too many restarts (instance {{ $labels.instance }})
+  #     description: "Prometheus has restarted more than 10 times in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusAlertmanagerJobMissing
+    expr: absent(up{job="alertmanager"})
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
+      description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusAlertmanagerConfigurationReloadFailure
+    expr: alertmanager_config_last_reload_successful != 1
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+      description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusNotConnectedToAlertmanager
+    expr: prometheus_notifications_alertmanagers_discovered < 1
+    for: 30m
+    labels:
+      severity: day
+    annotations:
+      summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+      description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTemplateTextExpansionFailures
+    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusRuleEvaluationSlow
+    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
+    for: 5m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates slower storage backend access or an overly complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusNotificationsBacklog
+    expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
+    for: 0m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+      description: "The Prometheus notification queue has not been empty for 30 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusAlertmanagerNotificationFailing
+    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+      description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  # file_sd doesn't count as service discovery, so 0 is expected.
+  # - alert: PrometheusTargetEmpty
+  #   expr: prometheus_sd_discovered_targets == 0
+  #   for: 30m
+  #   labels:
+  #     severity: day
+  #   annotations:
+  #     summary: Prometheus target empty (instance {{ $labels.instance }})
+  #     description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+      description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusLargeScrape
+    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus large scrape (instance {{ $labels.instance }})
+      description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTargetScrapeDuplicate
+    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
+    for: 30m
     labels:
-      severity: warning
-  - alert: ianktest
-    expr: node_systemd_version >= 300
+      severity: warn
+    annotations:
+      summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbCheckpointCreationFailures
+    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbCheckpointDeletionFailures
+    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbHeadTruncationsFailed
+    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbReloadFailures
+    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbWalCorruptions
+    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 30m
     labels:
-      severity: critical
+      severity: warn
     annotations:
-      description: '{{ $labels.instance }} ianktest.'
-      summary: Instance {{ $labels.instance }} - ianktest
+      summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
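+
+# Rough sketch of the suppression mentioned near the commented-out
+# PrometheusAllTargetsMissing alert above: if that alert were enabled, an
+# alertmanager.yml inhibit_rules entry along these lines could silence the
+# per-target PrometheusTargetMissing alerts while it fires (untested):
+# inhibit_rules:
+#   - source_match:
+#       alertname: PrometheusAllTargetsMissing
+#     target_match:
+#       alertname: PrometheusTargetMissing
+#     equal: ['job']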