- # - alert: NodeFilesystemAlmostOutOfSpace
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available space left.
- # summary: Filesystem has less than 5% space left.
- # expr: |-
- # (
- # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeFilesystemAlmostOutOfSpace
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available space left.
- # summary: Filesystem has less than 3% space left.
- # expr: |-
- # (
- # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: critical
- # - alert: NodeFilesystemFilesFillingUp
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available inodes left and is filling up.
- # summary: Filesystem is predicted to run out of inodes within the next 24 hours.
- # expr: |-
- # (
- # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
- # and
- # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeFilesystemFilesFillingUp
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
- # summary: Filesystem is predicted to run out of inodes within the next 4 hours.
- # expr: |-
- # (
- # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
- # and
- # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: critical
- # - alert: NodeFilesystemAlmostOutOfFiles
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available inodes left.
- # summary: Filesystem has less than 5% inodes left.
- # expr: |-
- # (
- # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeFilesystemAlmostOutOfFiles
- # annotations:
- # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
- # only {{ printf "%.2f" $value }}% available inodes left.
- # summary: Filesystem has less than 3% inodes left.
- # expr: |-
- # (
- # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
- # and
- # node_filesystem_readonly{job="node",fstype!=""} == 0
- # )
- # for: 1h
- # labels:
- # severity: critical
- # - alert: NodeNetworkReceiveErrs
- # annotations:
- # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
- # {{ printf "%.0f" $value }} receive errors in the last two minutes.'
- # summary: Network interface is reporting many receive errors.
- # expr: |-
- # increase(node_network_receive_errs_total[2m]) > 10
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeNetworkTransmitErrs
- # annotations:
- # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
- # {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
- # summary: Network interface is reporting many transmit errors.
- # expr: |-
- # increase(node_network_transmit_errs_total[2m]) > 10
- # for: 1h
- # labels:
- # severity: warning
- # - alert: NodeHighNumberConntrackEntriesUsed
- # annotations:
- # description: '{{ $value | humanizePercentage }} of conntrack entries are used'
- # summary: Number of conntrack are getting close to the limit
- # expr: |-
- # (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+# https://awesome-prometheus-alerts.grep.to/rules
+
+
+# TODO: we should probably group the Prometheus alerts that indicate a
+# host-local problem,
+# e.g. set a label alert-group: local-prom, then make a receiver that
+# groups by it when the alert-group is local-prom.
+
+- name: awesome prometheus alerts
+ rules:
+
+ # Fires when no `up` series with job="prometheus" has existed for 30 minutes.
+ # absent() returns a single series that carries only the labels taken from
+ # the equality matchers of its selector (here: job), so no `instance` label
+ # is available to the templates; the summary reports the job name instead.
+ - alert: PrometheusJobMissing
+ expr: absent(up{job="prometheus"})
+ for: 30m
+ labels:
+ severity: day
+ annotations:
+ summary: Prometheus job missing (job {{ $labels.job }})
+ description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ # Fires for any series whose `up` value has been 0 for 30 minutes.
+ # The selector has no label matchers, so every job/instance that exposes
+ # `up` is covered by this single rule.
+ # NOTE(review): the severity value `warn` is presumably matched by an
+ # Alertmanager route defined elsewhere — confirm.
+ - alert: PrometheusTargetMissing
+ expr: up == 0
+ for: 30m
+ labels:
+ severity: warn
+ annotations:
+ summary: Prometheus target missing (instance {{ $labels.instance }})
+ description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ # todo: this should suppress the above alert
+ # - alert: PrometheusAllTargetsMissing
+ # expr: count by (job) (up) == 0
+ # for: 30m