mostly start using prometheus
[distro-setup] / filesystem / etc / prometheus / rules / iank.yml
1
# Prometheus alerting rules: filesystem space and inode exhaustion, network
# interface errors, conntrack table usage, clock synchronisation, and one
# test rule.
#
# Several alert names appear twice on purpose: once with a looser threshold at
# `severity: warning` and once with a tighter threshold at `severity: critical`.
groups:
- name: ansible managed alert rules
  rules:

  # Writable filesystem with under 5% of its space available for 1h.
  # `fstype!=""` skips series whose fstype label is empty.
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: >-
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
        only {{ printf "%.2f" $value }}% available space left.
      summary: Filesystem has less than 5% space left.
    expr: |-
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning

  # Same check with the threshold tightened to 3%.
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: >-
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
        only {{ printf "%.2f" $value }}% available space left.
      summary: Filesystem has less than 3% space left.
    expr: |-
      (
        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical

  # Writable filesystem with under 40% of its inodes free whose free-inode
  # count, extrapolated linearly from the last 6h, drops below zero within
  # 24h (24*60*60 seconds).
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: >-
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
        only {{ printf "%.2f" $value }}% available inodes left and is filling up.
      summary: Filesystem is predicted to run out of inodes within the next 24 hours.
    expr: |-
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning

  # Same prediction with under 20% of inodes free and a 4h horizon.
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: >-
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
        only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
      summary: Filesystem is predicted to run out of inodes within the next 4 hours.
    expr: |-
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical

  # Writable filesystem with under 5% of its inodes free for 1h.
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: >-
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
        only {{ printf "%.2f" $value }}% available inodes left.
      summary: Filesystem has less than 5% inodes left.
    expr: |-
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning

  # Same check with the threshold tightened to 3%.
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: >-
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
        only {{ printf "%.2f" $value }}% available inodes left.
      summary: Filesystem has less than 3% inodes left.
    expr: |-
      (
        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical

  # Receive-error counter grew by more than 10 over a 2m window, sustained
  # for 1h.
  - alert: NodeNetworkReceiveErrs
    annotations:
      description: >-
        {{ $labels.instance }} interface {{ $labels.device }} has encountered
        {{ printf "%.0f" $value }} receive errors in the last two minutes.
      summary: Network interface is reporting many receive errors.
    expr: |-
      increase(node_network_receive_errs_total[2m]) > 10
    for: 1h
    labels:
      severity: warning

  # Transmit-side counterpart of the rule above.
  - alert: NodeNetworkTransmitErrs
    annotations:
      description: >-
        {{ $labels.instance }} interface {{ $labels.device }} has encountered
        {{ printf "%.0f" $value }} transmit errors in the last two minutes.
      summary: Network interface is reporting many transmit errors.
    expr: |-
      increase(node_network_transmit_errs_total[2m]) > 10
    for: 1h
    labels:
      severity: warning

  # More than 75% of the conntrack entry limit is in use. There is no `for:`
  # clause on this rule, so no hold-down period applies.
  - alert: NodeHighNumberConntrackEntriesUsed
    annotations:
      description: '{{ $value | humanizePercentage }} of conntrack entries are used'
      summary: Number of conntrack are getting close to the limit
    expr: |-
      (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
    labels:
      severity: warning

  # Clock offset is beyond +/-0.05s and its 5m derivative has the same sign as
  # the offset (or is zero), i.e. the offset is not shrinking. The message
  # states the 0.05s threshold that the expression actually tests.
  # NOTE(review): this rule and NodeClockNotSynchronising use the annotation
  # key `message`, while the other rules in this group use `description`; the
  # key is left as-is so existing notification templates keep working.
  - alert: NodeClockSkewDetected
    annotations:
      message: >-
        Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure
        NTP is configured correctly on this host.
      summary: Clock skew detected.
    expr: |-
      (
        node_timex_offset_seconds > 0.05
      and
        deriv(node_timex_offset_seconds[5m]) >= 0
      )
      or
      (
        node_timex_offset_seconds < -0.05
      and
        deriv(node_timex_offset_seconds[5m]) <= 0
      )
    for: 10m
    labels:
      severity: warning

  # node_timex_sync_status was 0 at some point in each 5m window, for 10m.
  - alert: NodeClockNotSynchronising
    annotations:
      message: >-
        Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
        on this host.
      summary: Clock not synchronising.
    expr: |-
      min_over_time(node_timex_sync_status[5m]) == 0
    for: 10m
    labels:
      severity: warning

  # NOTE(review): judging by its name this looks like a test rule; confirm
  # whether it should stay, given that it is labelled critical.
  - alert: ianktest
    annotations:
      description: '{{ $labels.instance }} ianktest.'
      summary: 'Instance {{ $labels.instance }} - ianktest'
    expr: node_systemd_version >= 300
    labels:
      severity: critical
156 description: '{{ $labels.instance }} ianktest.'
157 summary: Instance {{ $labels.instance }} - ianktest