X-Git-Url: https://iankelling.org/git/?a=blobdiff_plain;f=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;fp=filesystem%2Fetc%2Fprometheus%2Frules%2Fiank.yml;h=043b64d695a4226e8c9103cb7fcce25314527eef;hb=e958999a4ab6fddd723270b596b4899c0811fa41;hp=0000000000000000000000000000000000000000;hpb=608a1255fc3700611bdabdc9c8635940ac3390af;p=distro-setup

diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml
new file mode 100644
index 0000000..043b64d
--- /dev/null
+++ b/filesystem/etc/prometheus/rules/iank.yml
@@ -0,0 +1,170 @@
+
+# Prometheus alerting rules built on node_* metrics (filesystem, network,
+# conntrack, clock), plus one local test alert (ianktest) at the end.
+groups:
+- name: ansible managed alert rules
+  rules:
+  # Available space under 5% (warning) / 3% (critical) on writable filesystems.
+  - alert: NodeFilesystemAlmostOutOfSpace
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available space left.
+      summary: Filesystem has less than 5% space left.
+    expr: |-
+      (
+        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeFilesystemAlmostOutOfSpace
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available space left.
+      summary: Filesystem has less than 3% space left.
+    expr: |-
+      (
+        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+  # Free inodes under 40% (warning) / 20% (critical) AND a linear fit over the
+  # last 6h predicts zero free inodes within 24h / 4h respectively.
+  - alert: NodeFilesystemFilesFillingUp
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left and is filling up.
+      summary: Filesystem is predicted to run out of inodes within the next 24 hours.
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
+      and
+        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeFilesystemFilesFillingUp
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
+      summary: Filesystem is predicted to run out of inodes within the next 4 hours.
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
+      and
+        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+  # Free inodes under 5% (warning) / 3% (critical) on writable filesystems.
+  - alert: NodeFilesystemAlmostOutOfFiles
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left.
+      summary: Filesystem has less than 5% inodes left.
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeFilesystemAlmostOutOfFiles
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left.
+      summary: Filesystem has less than 3% inodes left.
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+  # More than 10 receive / transmit errors per 2m window, sustained for 1h.
+  - alert: NodeNetworkReceiveErrs
+    annotations:
+      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+        {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+      summary: Network interface is reporting many receive errors.
+    expr: |-
+      increase(node_network_receive_errs_total[2m]) > 10
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeNetworkTransmitErrs
+    annotations:
+      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+        {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+      summary: Network interface is reporting many transmit errors.
+    expr: |-
+      increase(node_network_transmit_errs_total[2m]) > 10
+    for: 1h
+    labels:
+      severity: warning
+  # Conntrack table more than 75% full. There is no `for:` clause here, unlike
+  # the other rules in this group.
+  - alert: NodeHighNumberConntrackEntriesUsed
+    annotations:
+      description: '{{ $value | humanizePercentage }} of conntrack entries are used'
+      summary: Number of conntrack entries is getting close to the limit
+    expr: |-
+      (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+    labels:
+      severity: warning
+  # Offset beyond +/-0.05s whose 5m derivative is not moving it back toward
+  # zero, sustained for 10m. The annotation text must match that threshold.
+  - alert: NodeClockSkewDetected
+    annotations:
+      message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure
+        NTP is configured correctly on this host.
+      summary: Clock skew detected.
+    expr: |-
+      (
+        node_timex_offset_seconds > 0.05
+      and
+        deriv(node_timex_offset_seconds[5m]) >= 0
+      )
+      or
+      (
+        node_timex_offset_seconds < -0.05
+      and
+        deriv(node_timex_offset_seconds[5m]) <= 0
+      )
+    for: 10m
+    labels:
+      severity: warning
+  # Minimum of node_timex_sync_status over the last 5m is 0, sustained for 10m.
+  - alert: NodeClockNotSynchronising
+    annotations:
+      message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
+        on this host.
+      summary: Clock not synchronising.
+    expr: |-
+      min_over_time(node_timex_sync_status[5m]) == 0
+    for: 10m
+    labels:
+      severity: warning
+  # Local test alert: matches when node_systemd_version is 300 or higher.
+  - alert: ianktest
+    expr: node_systemd_version >= 300
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $labels.instance }} ianktest.'
+      summary: Instance {{ $labels.instance }} - ianktest