mostly start using prometheus
[distro-setup] / filesystem / etc / prometheus / rules / iank.yml
diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml
new file mode 100644 (file)
index 0000000..043b64d
--- /dev/null
@@ -0,0 +1,157 @@
+
+# Prometheus alerting rules; the expressions in this group read node_* metrics.
+# NOTE(review): the group name says "ansible managed" — confirm whether this
+# file is generated before editing it by hand.
+groups:
+- name: ansible managed alert rules
+  rules:
+  # Warning tier: a non-read-only filesystem has had under 5% of its bytes
+  # available for a full hour.
+  - alert: NodeFilesystemAlmostOutOfSpace
+    expr: |-
+      (
+        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Filesystem has less than 5% space left.'
+      description: >-
+        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available space left.
+  # Critical tier: a non-read-only filesystem has had under 3% of its bytes
+  # available for a full hour.
+  - alert: NodeFilesystemAlmostOutOfSpace
+    expr: |-
+      (
+        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Filesystem has less than 3% space left.'
+      description: >-
+        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available space left.
+  # Warning tier: under 40% of inodes are free on a non-read-only filesystem
+  # and a linear extrapolation of the last 6h of free inodes drops below zero
+  # within 24h; the condition must hold for 1h.
+  - alert: NodeFilesystemFilesFillingUp
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
+      and
+        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
+      description: >-
+        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left and is filling up.
+  # Critical tier: under 20% of inodes are free on a non-read-only filesystem
+  # and a linear extrapolation of the last 6h of free inodes drops below zero
+  # within 4h; the condition must hold for 1h.
+  - alert: NodeFilesystemFilesFillingUp
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
+      and
+        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
+      description: >-
+        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
+  # Warning tier: a non-read-only filesystem has had under 5% of its inodes
+  # free for a full hour.
+  - alert: NodeFilesystemAlmostOutOfFiles
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Filesystem has less than 5% inodes left.'
+      description: >-
+        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left.
+  # Critical tier: a non-read-only filesystem has had under 3% of its inodes
+  # free for a full hour.
+  - alert: NodeFilesystemAlmostOutOfFiles
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      summary: 'Filesystem has less than 3% inodes left.'
+      description: >-
+        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left.
+  # Fires when the receive-error counter grew by more than 10 over a 2m
+  # window, and that has stayed true for 1h.
+  - alert: NodeNetworkReceiveErrs
+    expr: increase(node_network_receive_errs_total[2m]) > 10
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Network interface is reporting many receive errors.'
+      description: >-
+        {{ $labels.instance }} interface {{ $labels.device }} has encountered
+        {{ printf "%.0f" $value }} receive errors in the last two minutes.
+  # Fires when the transmit-error counter grew by more than 10 over a 2m
+  # window, and that has stayed true for 1h.
+  - alert: NodeNetworkTransmitErrs
+    expr: increase(node_network_transmit_errs_total[2m]) > 10
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Network interface is reporting many transmit errors.'
+      description: >-
+        {{ $labels.instance }} interface {{ $labels.device }} has encountered
+        {{ printf "%.0f" $value }} transmit errors in the last two minutes.
+  # Fires when conntrack entries exceed 75% of the conntrack limit.
+  # There is no `for:` clause on this rule, so no hold period applies.
+  - alert: NodeHighNumberConntrackEntriesUsed
+    expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Number of conntrack are getting close to the limit'
+      description: >-
+        {{ $value | humanizePercentage }} of conntrack entries are used
+  # Fires when the clock offset is beyond 0.05s in either direction and is not
+  # shrinking: the 5m derivative of the offset is >= 0 for a positive offset,
+  # or <= 0 for a negative one. The condition must hold for 10m.
+  - alert: NodeClockSkewDetected
+    expr: |-
+      (
+        node_timex_offset_seconds > 0.05
+      and
+        deriv(node_timex_offset_seconds[5m]) >= 0
+      )
+      or
+      (
+        node_timex_offset_seconds < -0.05
+      and
+        deriv(node_timex_offset_seconds[5m]) <= 0
+      )
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: Clock skew detected.
+      # Keep the threshold quoted in this text in sync with the 0.05 bounds
+      # used in expr.
+      # NOTE(review): this rule uses a `message` annotation while the
+      # filesystem and network rules in this file use `description` — confirm
+      # which key the notification templates read.
+      message: >-
+        Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure
+        NTP is configured correctly on this host.
+  # Fires when the minimum of node_timex_sync_status over the last 5m is 0,
+  # and that has stayed true for 10m.
+  - alert: NodeClockNotSynchronising
+    expr: min_over_time(node_timex_sync_status[5m]) == 0
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: 'Clock not synchronising.'
+      message: >-
+        Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
+        on this host.
+  # Fires when node_systemd_version is 300 or higher; no `for:` clause.
+  # NOTE(review): the name suggests a test rule for exercising the alert
+  # pipeline — confirm whether it should ship with severity critical.
+  - alert: ianktest
+    annotations:
+      summary: >-
+        Instance {{ $labels.instance }} - ianktest
+      description: >-
+        {{ $labels.instance }} ianktest.
+    labels:
+      severity: critical
+    expr: 'node_systemd_version >= 300'