X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=72b0701be8cd0a808d1924eaab94e50dc3b41786;hb=b18dade73dedfe69aa741f8417947d83c4208f2d;hp=b78368fef48df7af5332d0633fc5e8fa3fafeb7d;hpb=2b981100a8b45f117d6f5c165404937f3c917e8f;p=distro-setup diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index b78368f..72b0701 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -1,173 +1,178 @@ groups: -- name: ansible managed alert rules +- name: standard alerts rules: - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left. - summary: Filesystem has less than 5% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left. - summary: Filesystem has less than 3% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left and is filling up. - summary: Filesystem is predicted to run out of inodes within the next 24 hours. 
- expr: |- - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: |- - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left. - summary: Filesystem has less than 5% inodes left. - expr: |- - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left. - summary: Filesystem has less than 3% inodes left. 
- expr: |- - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} receive errors in the last two minutes.' - summary: Network interface is reporting many receive errors. + - alert: mailtest-check expr: |- - increase(node_network_receive_errs_total[2m]) > 10 - for: 1h + time() - mailtest_check_last_usec > 60 * 12 labels: - severity: warning - - alert: NodeNetworkTransmitErrs + severity: day annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - summary: Network interface is reporting many transmit errors. - expr: |- - increase(node_network_transmit_errs_total[2m]) > 10 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used' - summary: Number of conntrack are getting close to the limit - expr: |- - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure - NTP is configured correctly on this host. - summary: Clock skew detected. - expr: |- - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured - on this host. 
- summary: Clock not synchronising. - expr: |- - min_over_time(node_timex_sync_status[5m]) == 0 - for: 10m - labels: - severity: warning - - alert: ianktest - expr: node_systemd_version >= 300 - labels: - severity: critical - annotations: - description: '{{ $labels.instance }} ianktest.' - summary: Instance {{ $labels.instance }} - ianktest - - alert: ianktest - expr: node_systemd_version >= 300 - labels: - severity: critical - annotations: - description: '{{ $labels.instance }} ianktest.' - summary: Instance {{ $labels.instance }} - ianktest + description: '{{ $labels.instance }} mailtest-check' + summary: '{{ $labels.instance }} mailtest-check' - - alert: ianktest + - alert: mailtest-check expr: |- - time() - mailtest_check_last_usec > 60 * 8 + # 42 mins: enough for a 30 min queue run plus 12 + time() - mailtest_check_last_usec > 60 * 42 labels: - severity: critical + severity: prod annotations: - description: '{{ $labels.instance }} mailtest' - summary: Instance {{ $labels.instance }} - ianktest + description: '{{ $labels.instance }} mailtest-check' + summary: '{{ $labels.instance }} mailtest-check' + + + + # - alert: NodeFilesystemAlmostOutOfSpace + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available space left. + # summary: Filesystem has less than 5% space left. + # expr: |- + # ( + # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: warning + # - alert: NodeFilesystemAlmostOutOfSpace + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available space left. + # summary: Filesystem has less than 3% space left. 
+ # expr: |- + # ( + # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: critical + # - alert: NodeFilesystemFilesFillingUp + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available inodes left and is filling up. + # summary: Filesystem is predicted to run out of inodes within the next 24 hours. + # expr: |- + # ( + # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 + # and + # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: warning + # - alert: NodeFilesystemFilesFillingUp + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + # summary: Filesystem is predicted to run out of inodes within the next 4 hours. + # expr: |- + # ( + # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 + # and + # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: critical + # - alert: NodeFilesystemAlmostOutOfFiles + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available inodes left. + # summary: Filesystem has less than 5% inodes left. 
+ # expr: |- + # ( + # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: warning + # - alert: NodeFilesystemAlmostOutOfFiles + # annotations: + # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + # only {{ printf "%.2f" $value }}% available inodes left. + # summary: Filesystem has less than 3% inodes left. + # expr: |- + # ( + # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 + # and + # node_filesystem_readonly{job="node",fstype!=""} == 0 + # ) + # for: 1h + # labels: + # severity: critical + # - alert: NodeNetworkReceiveErrs + # annotations: + # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered + # {{ printf "%.0f" $value }} receive errors in the last two minutes.' + # summary: Network interface is reporting many receive errors. + # expr: |- + # increase(node_network_receive_errs_total[2m]) > 10 + # for: 1h + # labels: + # severity: warning + # - alert: NodeNetworkTransmitErrs + # annotations: + # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered + # {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + # summary: Network interface is reporting many transmit errors. 
+ # expr: |- + # increase(node_network_transmit_errs_total[2m]) > 10 + # for: 1h + # labels: + # severity: warning + # - alert: NodeHighNumberConntrackEntriesUsed + # annotations: + # description: '{{ $value | humanizePercentage }} of conntrack entries are used' + # summary: Number of conntrack are getting close to the limit + # expr: |- + # (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + # labels: + # severity: warning + # - alert: NodeClockSkewDetected + # annotations: + # message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure + # NTP is configured correctly on this host. + # summary: Clock skew detected. + # expr: |- + # ( + # node_timex_offset_seconds > 0.05 + # and + # deriv(node_timex_offset_seconds[5m]) >= 0 + # ) + # or + # ( + # node_timex_offset_seconds < -0.05 + # and + # deriv(node_timex_offset_seconds[5m]) <= 0 + # ) + # for: 10m + # labels: + # severity: warning + # - alert: NodeClockNotSynchronising + # annotations: + # message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured + # on this host. + # summary: Clock not synchronising. + # expr: |- + # min_over_time(node_timex_sync_status[5m]) == 0 + # for: 10m + # labels: + # severity: warning + # - alert: ianktest + # expr: node_systemd_version >= 300 + # labels: + # severity: critical + # annotations: + # description: '{{ $labels.instance }} ianktest.' + # summary: Instance {{ $labels.instance }} - ianktest