From e958999a4ab6fddd723270b596b4899c0811fa41 Mon Sep 17 00:00:00 2001 From: Ian Kelling Date: Sun, 20 Feb 2022 01:33:39 -0500 Subject: [PATCH] mostly start using prometheus --- .gitignore | 11 +- a/ansible.cfg | 46 ---- a/group_vars/all | 2 - a/setup.sh | 17 -- a/site.yml | 84 ------- brc2 | 21 +- conflink | 15 +- distro-end | 62 +++++ filesystem/etc/apt/preferences.d/prometheus | 3 + filesystem/etc/default/prometheus | 117 ++++++++++ .../etc/default/prometheus-alertmanager | 71 ++++++ .../etc/default/prometheus-node-exporter | 216 ++++++++++++++++++ filesystem/etc/prometheus/alertmanager.yml | 21 ++ filesystem/etc/prometheus/file_sd/node.yml | 10 + filesystem/etc/prometheus/file_sd/tlsnode.yml | 4 + filesystem/etc/prometheus/prometheus.yml | 47 ++++ filesystem/etc/prometheus/rules/iank.yml | 157 +++++++++++++ mailtest-check | 24 +- 18 files changed, 756 insertions(+), 172 deletions(-) delete mode 100644 a/ansible.cfg delete mode 100644 a/group_vars/all delete mode 100755 a/setup.sh delete mode 100644 a/site.yml create mode 100644 filesystem/etc/apt/preferences.d/prometheus create mode 100644 filesystem/etc/default/prometheus create mode 100644 filesystem/etc/default/prometheus-alertmanager create mode 100644 filesystem/etc/default/prometheus-node-exporter create mode 100644 filesystem/etc/prometheus/alertmanager.yml create mode 100644 filesystem/etc/prometheus/file_sd/node.yml create mode 100644 filesystem/etc/prometheus/file_sd/tlsnode.yml create mode 100644 filesystem/etc/prometheus/prometheus.yml create mode 100644 filesystem/etc/prometheus/rules/iank.yml diff --git a/.gitignore b/.gitignore index afa57c5..718065b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,4 @@ -/t.org -/old-unused +/filesystem/etc/prometheus-export-htpasswd +# other git repos /.emacs.d /Arduino -# 3rd party git repos -/a/roles/prom -/a/roles/grafana -/a/roles/node-exporter -/a/roles/alertmanager -/sl/.ianketiona/src/ -/sl/.iankflidas/src/ diff --git a/a/ansible.cfg b/a/ansible.cfg 
deleted file mode 100644 index d367ba2..0000000 --- a/a/ansible.cfg +++ /dev/null @@ -1,46 +0,0 @@ -[defaults] -# 2.7.4-1ppa~xenial would use 194M of memory resident in htop -# and run out of 4g of memory. just ran ansible 2.9.9-1ppa~bionic+9.0trisquel1 -# and with 23 fork limit, it topped out at 1 gig used of memory. -# we have about 60 hosts, so 100 should allow them all to run in -# parallell without a problem without having memory problems. -# oldused=0; while true; do used=$(free -w -t | tail -n1 | awk '{print $3}'); if ((used > oldused )); then oldused=$used; echo $(date) $used | tee used; fi; sleep 1; done -forks = 100 - -# Ansible doesnt have have trisquels python path in its os info. -# Silence a warning -# https://docs.ansible.com/ansible/2.9/reference_appendices/interpreter_discovery.html -interpreter_python = auto_silent -# strategy = free # DO NOT ENABLE. -# -# As of 2019-08-07, include_tasks is very broken with the free strategy. -# tasks will not be run for some hosts, or "when" rules ignored and run -# for the wrong host, or some hosts a task will be run twice. Even if we -# switch to import_tasks, I wouldn't trust using this until that bug is -# found and fixed. repro: tested with 2.7.4, (no bug reports or fixes -# found), Running just the common role, then searching for which hosts -# an install.yml included role got run using -# -# f() { awk '/xfsprogs/ { x = 1; next }; /^TASK/ { x = 0 }; x && /\[/ { print }' $1 | sort | uniq -c | pee cat wc; } -# f LOGFILE -host_key_checking = False -display_skipped_hosts = False -retry_files_enabled = False -# readable output -stdout_callback = yaml -# Our logs are already pretty big. You can temporarily uncomment to enable -# profiling info in the logs. -#callback_whitelist = timer, profile_tasks, profile_roles - -# Ansible suggests using the file module instead of chmod, but then it -# follows symlinks without an option to turn it off, which is completely -# braindead and screwed up my system. 
-command_warnings=False - -[ssh_connection] -pipelining = True -retries = 2 - -[colors] -# found in color.py. default blue is hard to read on a black background -verbose = bright blue diff --git a/a/group_vars/all b/a/group_vars/all deleted file mode 100644 index b74039d..0000000 --- a/a/group_vars/all +++ /dev/null @@ -1,2 +0,0 @@ -# iank: 1.1 because prometheus is configered elsewhere to use fqdn, which maps -prometheus_web_listen_address: "127.0.1.1:9090" diff --git a/a/setup.sh b/a/setup.sh deleted file mode 100755 index 4d253b9..0000000 --- a/a/setup.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Copyright (C) 2019 Ian Kelling -# SPDX-License-Identifier: AGPL-3.0-or-later - -source /a/bin/errhandle/err - -[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@" - -# shellcheck source=/a/bin/ds/.bashrc -export LC_USEBASHRC=t; if [[ -s ~/.bashrc ]]; then . ~/.bashrc; fi - -# dependency of node exporter, per README.md -pi python3-passlib - -# after running ansible, run -# conflink -# ser restart prometheus diff --git a/a/site.yml b/a/site.yml deleted file mode 100644 index b9a0276..0000000 --- a/a/site.yml +++ /dev/null @@ -1,84 +0,0 @@ ---- -- hosts: localhost - roles: - - role: prom - tags: a - prometheus_targets: - node: - - targets: - - "{{ ansible_fqdn }}:9100" - prometheus_scrape_configs: - - job_name: "prometheus" - metrics_path: "{{ prometheus_metrics_path }}" - static_configs: - - targets: - - "{{ ansible_fqdn }}:9090" - - job_name: "node" - basic_auth: - username: prom - password_file: /etc/prometheus-pass - #scheme: "https" - file_sd_configs: - - files: - - "{{ prometheus_config_dir }}/file_sd/node.yml" - # added because of warning in log - prometheus_alertmanager_config: - - static_configs: - - targets: - - "{{ ansible_fqdn }}:9093" - - - - role: node-exporter - tags: a - # node_exporter_tls_server_config: - # cert_file: /etc/node_exporter/fullchain.pem - # key_file: /etc/node_exporter/privkey.pem - node_exporter_web_listen_address: 
"127.0.1.1:9100" - node_exporter_basic_auth_users: - prom: "incarnadine.bloodied.maker" - - - role: alertmanager - alertmanager_smtp: - smarthost: 'mx.iankelling.org:587' - from: "alerts@iankelling.org" - require_tls: false - hello: 'defaultnn.b8.nz' - alertmanager_route: - receiver: defaultreceiver - repeat_interval: 7d - alertmanager_receivers: - - name: defaultreceiver - email_configs: - - to: alerts@iankelling.org - send_resolved: true - # the html was a bit ugly and just a huge waste of text, - # https://github.com/prometheus/alertmanager/issues/2232 - # lead me to find a convenient text option to use - html: - text: '{% raw -%}{{ template "opsgenie.default.description" . }}{% endraw -%}' - alertmanager_web_listen_address: '127.0.1.1:9093' - - - role: grafana - grafana_address: "127.0.1.1" - # iank: playbook will halt if no password is set. this is only - # available to localhost, so i dont really care, but might as well - # generate a pass isntead of putting in pw123 etc. - grafana_security: { admin_user: admin, admin_password: spheroid.recantation.shank } - grafana_datasources: - - name: prometheus - type: prometheus - access: proxy - url: 'http://{{ prometheus_web_listen_address }}' - basicAuth: false - isDefault: true - # This is based on looking at highly downloaded dashboards here - # https://grafana.com/dashboards?dataSource=prometheus&collector=nodeExporter - # Which is where you are lead to from - # https://prometheus.io/docs/visualization/grafana/ - grafana_dashboards: - - dashboard_id: 1860 - revision_id: 21 - datasource: prometheus - - dashboard_id: 405 - revision_id: 8 - datasource: prometheus diff --git a/brc2 b/brc2 index bd959d8..9fa1010 100644 --- a/brc2 +++ b/brc2 @@ -1085,31 +1085,31 @@ lom() { l=$(losetup -j $1 | sed -rn 's/^([^ ]+): .*/\1/p' | head -n1 ||:) if [[ ! $l ]]; then l=$(sudo losetup -f) - sudo losetup $l $1 + m sudo losetup $l $1 fi if ! sudo cryptsetup status /dev/mapper/$base &>/dev/null; then if ! 
sudo cryptsetup luksOpen $l $base; then - sudo losetup -d $l + m sudo losetup -d $l return 1 fi fi - sudo mkdir -p /mnt/$base - sudo mount /dev/mapper/$base /mnt/$base - sudo chown $USER:$USER /mnt/$base + m sudo mkdir -p /mnt/$base + m sudo mount /dev/mapper/$base /mnt/$base + m sudo chown $USER:$USER /mnt/$base else base=$1 if mountpoint /mnt/$base &>/dev/null; then - sudo umount /mnt/$base + m sudo umount /mnt/$base fi if sudo cryptsetup status /dev/mapper/$base &>/dev/null; then - if ! sudo cryptsetup luksClose /dev/mapper/$base; then + if ! m sudo cryptsetup luksClose /dev/mapper/$base; then echo lom: failed cryptsetup luksClose /dev/mapper/$base return 1 fi fi - l=$(losetup -j $1 | sed -rn 's/^([^ ]+): .*/\1/p' | head -n1 ||:) + l=$(losetup -l --noheadings | awk '$6 ~ /\/'$1'$/ {print $1}') if [[ $l ]]; then - sudo losetup -d $l + m sudo losetup -d $l else echo lom: warning: no loopback device found fi @@ -1266,6 +1266,9 @@ ngo() { otp() { oathtool --totp -b "$*" | xclip -selection clipboard } +j() { + "$@" |& pee "xclip -r -selection clipboard" +} pakaraoke() { diff --git a/conflink b/conflink index 90debe6..bfd4e44 100755 --- a/conflink +++ b/conflink @@ -166,15 +166,24 @@ case $user in if [[ -e /var/lib/znc ]] && getent group znc; then s chown -R znc:znc /var/lib/znc fi - f=/etc/prometheus-htpasswd + for f in /etc/prometheus-{,export-}htpasswd; do + if [[ -e $f ]]; then + s chmod 640 $f + if getent passwd www-data; then + s chown root:www-data $f + fi + fi + done + f=/etc/prometheus-pass if [[ -e $f ]]; then + # note: this is duplicative of the file's own permissions s chmod 640 $f /etc/prometheus-pass - s chown root:www-data $f if getent passwd prometheus; then - s chown root:prometheus /etc/prometheus-pass + s chown root:prometheus $f fi fi + ##### end special extra stuff ##### m sudo -H -u user2 "${BASH_SOURCE[0]}" diff --git a/distro-end b/distro-end index 7f2f371..ae628c5 100755 --- a/distro-end +++ b/distro-end @@ -1844,6 +1844,68 @@ case $HOSTNAME 
in ;; esac +### begin prometheus ### + +case $HOSTNAME in + kd) + # ive got these + a needed dependency pinned to bullseye, just to get + # versions more in line with the main docs. + pi prometheus-alertmanager prometheus prometheus-node-exporter + web-conf -p 9091 -f 9090 - apache2 i.b8.nz <<'EOF' + +AuthType Basic +AuthName "basic_auth" +# created with +# htpasswd -c prometheus-htpasswd USERNAME +AuthUserFile "/etc/prometheus-htpasswd" +Require valid-user + +EOF + ;; + *) + pi prometheus-node-exporter + ;; +esac + +case $HOSTNAME in + # frodo needs upgrade first; until then its arm is a deliberate no-op. + frodo) : + # todo, for limiting node exporter http, + # either use iptables or, in + # /etc/default/prometheus-node-exporter + # listen on the wireguard interface + ;; + li|je|bk) + # ex for exporter + web-conf -p 9101 -f 9100 - apache2 ${HOSTNAME}ex.b8.nz <<'EOF' + +AuthType Basic +AuthName "basic_auth" +# created with +# htpasswd -c prometheus-export-htpasswd USERNAME +AuthUserFile "/etc/prometheus-export-htpasswd" +Require valid-user + +EOF + ;; + *) + wgip=$(sudo sed -rn 's,^ *Address *= *([^/]+).*,\1,p' /etc/wireguard/wghole.conf) + web-conf -i -a $wgip -p 9101 -f 9100 - apache2 ${HOSTNAME}wg.b8.nz <<'EOF' + +AuthType Basic +AuthName "basic_auth" +# created with +# htpasswd -c prometheus-export-htpasswd USERNAME +AuthUserFile "/etc/prometheus-export-htpasswd" +Require valid-user + +EOF + ;; +esac + +### end prometheus ### + + end_msg <<'EOF' In mate settings settings, change scrolling to two-finger, because the default edge scroll doesn\'t work. Originally found this in debian. 
diff --git a/filesystem/etc/apt/preferences.d/prometheus b/filesystem/etc/apt/preferences.d/prometheus new file mode 100644 index 0000000..974f95b --- /dev/null +++ b/filesystem/etc/apt/preferences.d/prometheus @@ -0,0 +1,3 @@ +Package: prometheus-alertmanager prometheus prometheus-node-exporter libjs-jquery +Pin: release n=bullseye,o=Debian +Pin-Priority: 500 diff --git a/filesystem/etc/default/prometheus b/filesystem/etc/default/prometheus new file mode 100644 index 0000000..63d1ee3 --- /dev/null +++ b/filesystem/etc/default/prometheus @@ -0,0 +1,117 @@ +# iank: initial file from 2.24, added to empty ARGS. + +# Set the command-line arguments to pass to the server. + +ARGS="--web.listen-address=127.0.0.1:9090" + + + +# Prometheus supports the following options: +# --config.file="/etc/prometheus/prometheus.yml" +# Prometheus configuration file path. +# --web.listen-address="0.0.0.0:9090" +# Address to listen on for UI, API, and telemetry. +# --web.read-timeout=5m Maximum duration before timing out read of the +# request, and closing idle connections. +# --web.max-connections=512 Maximum number of simultaneous connections. +# --web.external-url= The URL under which Prometheus is externally +# reachable (for example, if Prometheus is served +# via a reverse proxy). Used for generating +# relative and absolute links back to Prometheus +# itself. If the URL has a path portion, it will +# be used to prefix all HTTP endpoints served by +# Prometheus. If omitted, relevant URL components +# will be derived automatically. +# --web.route-prefix= Prefix for the internal routes of web endpoints. +# Defaults to path of --web.external-url. +# --web.local-assets="/usr/share/prometheus/web/" +# Path to static asset/templates directory. +# --web.user-assets= Path to user asset directory, available at +# /user. +# --web.enable-lifecycle Enable shutdown and reload via HTTP request. +# --web.enable-admin-api Enable API endpoints for admin control actions. 
+# --web.console.templates="/etc/prometheus/consoles" +# Path to the console template directory, +# available at /consoles. +# --web.console.libraries="/etc/prometheus/console_libraries" +# Path to the console library directory. +# --web.page-title="Prometheus Time Series Collection and Processing Server" +# Document title of Prometheus instance. +# --web.cors.origin=".*" Regex for CORS origin. It is fully anchored. +# Example: 'https?://(domain1|domain2)\.com' +# --storage.tsdb.path="/var/lib/prometheus/metrics2/" +# Base path for metrics storage. +# --storage.tsdb.retention=15d +# [DEPRECATED] How long to retain samples in +# storage. This flag has been deprecated, use +# "storage.tsdb.retention.time" instead +# --storage.tsdb.retention.time=15d +# How long to retain samples in storage. When this +# flag is set it overrides +# "storage.tsdb.retention". +# If neither this flag nor "storage.tsdb.retention" +# nor "storage.tsdb.retention.size" is set, the +# retention time defaults to 15d. +# Units Supported: y, w, d, h, m, s, ms. +# --storage.tsdb.retention.size= +# [EXPERIMENTAL] Maximum number of bytes that can +# be stored for blocks. Units supported: KB, MB, +# GB, TB, PB. This flag is experimental and can be +# changed in future releases. +# --storage.tsdb.use-lockfile +# Create a lockfile in data directory. +# --storage.tsdb.allow-overlapping-blocks +# [EXPERIMENTAL] Allow overlapping blocks, which +# in turn enables vertical compaction and +# vertical query merge. +# --storage.tsdb.wal-compression +# Compress the tsdb WAL. +# --storage.remote.flush-deadline= +# How long to wait flushing sample on shutdown or +# config reload. +# --storage.remote.read-sample-limit=5e7 +# Maximum overall number of samples to return via +# the remote read interface, in a single query. 0 +# means no limit. This limit is ignored for +# streamed response types. +# --storage.remote.read-concurrent-limit=10 +# Maximum number of concurrent remote read calls. +# 0 means no limit. 
+# --storage.remote.read-max-bytes-in-frame=1048576 +# Maximum number of bytes in a single frame for +# streaming remote read response types before +# marshalling. Note that client might have limit on +# frame size as well. 1MB as recommended by +# protobuf by default. +# --rules.alert.for-outage-tolerance=1h +# Max time to tolerate prometheus outage for +# restoring "for" state of alert. +# --rules.alert.for-grace-period=10m +# Minimum duration between alert and restored "for" +# state. This is maintained only for alerts with +# configured "for" time greater than grace period. +# --rules.alert.resend-delay=1m +# Minimum amount of time to wait before resending +# an alert to Alertmanager. +# --alertmanager.notification-queue-capacity=10000 +# The capacity of the queue for pending +# Alertmanager notifications. +# --alertmanager.timeout=10s +# Timeout for sending alerts to Alertmanager. +# --query.lookback-delta=5m The maximum lookback duration for retrieving +# metrics during expression evaluations and +# federation. +# --query.timeout=2m Maximum time a query may take before being +# aborted. +# --query.max-concurrency=20 +# Maximum number of queries executed concurrently. +# --query.max-samples=50000000 +# Maximum number of samples a single query can load +# into memory. Note that queries will fail if they +# try to load more samples than this into memory, +# so this also limits the number of samples a query +# can return. +# --log.level=info Only log messages with the given severity or +# above. One of: [debug, info, warn, error] +# --log.format=logfmt Output format of log messages. One of: [logfmt, +# json] diff --git a/filesystem/etc/default/prometheus-alertmanager b/filesystem/etc/default/prometheus-alertmanager new file mode 100644 index 0000000..4ff43f2 --- /dev/null +++ b/filesystem/etc/default/prometheus-alertmanager @@ -0,0 +1,71 @@ +# Set the command-line arguments to pass to the server. 
+# default: +#ARGS="" + +# iank: +ARGS="--web.listen-address=127.0.0.1:9093" + +# this file is from version 0.21 + +# The alert manager supports the following options: + +# --config.file="/etc/prometheus/alertmanager.yml" +# Alertmanager configuration file name. +# --storage.path="/var/lib/prometheus/alertmanager/" +# Base path for data storage. +# --data.retention=120h +# How long to keep data for. +# --alerts.gc-interval=30m +# Interval between alert GC. +# --log.level=info +# Only log messages with the given severity or above. +# --web.external-url=WEB.EXTERNAL-URL +# The URL under which Alertmanager is externally reachable (for example, +# if Alertmanager is served via a reverse proxy). Used for generating +# relative and absolute links back to Alertmanager itself. If the URL has +# a path portion, it will be used to prefix all HTTP endpoints served by +# Alertmanager. If omitted, relevant URL components will be derived +# automatically. +# --web.route-prefix=WEB.ROUTE-PREFIX +# Prefix for the internal routes of web endpoints. Defaults to path of +# --web.external-url. +# --web.listen-address=":9093" +# Address to listen on for the web interface and API. +# --web.ui-path="/usr/share/prometheus/alertmanager/ui/" +# Path to static UI directory. +# --template.default="/usr/share/prometheus/alertmanager/default.tmpl" +# Path to default notification template. +# --cluster.listen-address="0.0.0.0:9094" +# Listen address for cluster. +# --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS +# Explicit address to advertise in cluster. +# --cluster.peer=CLUSTER.PEER ... +# Initial peers (may be repeated). +# --cluster.peer-timeout=15s +# Time to wait between peers to send notifications. +# --cluster.gossip-interval=200ms +# Interval between sending gossip messages. By lowering this value (more +# frequent) gossip messages are propagated across the cluster more +# quickly at the expense of increased bandwidth. 
+# --cluster.pushpull-interval=1m0s +# Interval for gossip state syncs. Setting this interval lower (more +# frequent) will increase convergence speeds across larger clusters at +# the expense of increased bandwidth usage. +# --cluster.tcp-timeout=10s Timeout for establishing a stream connection +# with a remote node for a full state sync, and for stream read and write +# operations. +# --cluster.probe-timeout=500ms +# Timeout to wait for an ack from a probed node before assuming it is +# unhealthy. This should be set to 99-percentile of RTT (round-trip time) +# on your network. +# --cluster.probe-interval=1s +# Interval between random node probes. Setting this lower (more frequent) +# will cause the cluster to detect failed nodes more quickly at the +# expense of increased bandwidth usage. +# --cluster.settle-timeout=1m0s +# Maximum time to wait for cluster connections to settle before +# evaluating notifications. +# --cluster.reconnect-interval=10s +# Interval between attempting to reconnect to lost peers. +# --cluster.reconnect-timeout=6h0m0s +# Length of time to attempt to reconnect to a lost peer. diff --git a/filesystem/etc/default/prometheus-node-exporter b/filesystem/etc/default/prometheus-node-exporter new file mode 100644 index 0000000..1d6d906 --- /dev/null +++ b/filesystem/etc/default/prometheus-node-exporter @@ -0,0 +1,216 @@ +# Set the command-line arguments to pass to the server. +# Due to shell escaping, to pass backslashes for regexes, you need to double +# them (\\d for \d). If running under systemd, you need to double them again +# (\\\\d to mean \d), and escape newlines too. +ARGS="--web.listen-address=127.0.0.1:9100" + +# prometheus-node-exporter supports the following options: +# +# --collector.arp +# Enable the arp collector (default: enabled). +# --collector.bcache +# Enable the bcache collector (default: enabled). +# --collector.bcache.priorityStats +# Expose expensive priority stats. 
+# --collector.bonding +# Enable the bonding collector (default: enabled). +# --collector.btrfs +# Enable the btrfs collector (default: enabled). +# --collector.buddyinfo +# Enable the buddyinfo collector (default: disabled). +# --collector.conntrack +# Enable the conntrack collector (default: enabled). +# --collector.cpu +# Enable the cpu collector (default: enabled). +# --collector.cpu.info +# Enables metric cpu_info. +# --collector.cpu.info.bugs-include=COLLECTOR.CPU.INFO.BUGS-INCLUDE +# Filter the `bugs` field in cpuInfo with a value that must be a regular +# expression. +# --collector.cpu.info.flags-include=COLLECTOR.CPU.INFO.FLAGS-INCLUDE +# Filter the `flags` field in cpuInfo with a value that must be a regular +# expression. +# --collector.cpufreq +# Enable the cpufreq collector (default: enabled). +# --collector.disable-defaults +# Set all collectors to disabled by default. +# --collector.diskstats +# Enable the diskstats collector (default: enabled). +# --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$" +# Regexp of devices to ignore for diskstats. +# --collector.drbd +# Enable the drbd collector (default: disabled). +# --collector.edac +# Enable the edac collector (default: enabled). +# --collector.entropy +# Enable the entropy collector (default: enabled). +# --collector.fibrechannel +# Enable the fibrechannel collector (default: enabled). +# --collector.filefd +# Enable the filefd collector (default: enabled). +# --collector.filesystem +# Enable the filesystem collector (default: enabled). +# --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" +# Regexp of filesystem types to ignore for filesystem collector. 
+# --collector.filesystem.ignored-mount-points="^/(dev|proc|run|sys|mnt|media|var/lib/docker/.+)($|/)" +# Regexp of mount points to ignore for filesystem collector. +# --collector.hwmon +# Enable the hwmon collector (default: enabled). +# --collector.infiniband +# Enable the infiniband collector (default: enabled). +# --collector.interrupts +# Enable the interrupts collector (default: disabled). +# --collector.ipvs +# Enable the ipvs collector (default: enabled). +# --collector.ipvs.backend-labels="local_address,local_port,remote_address,remote_port,proto,local_mark" +# Comma separated list for IPVS backend stats labels. +# --collector.ksmd +# Enable the ksmd collector (default: disabled). +# --collector.loadavg +# Enable the loadavg collector (default: enabled). +# --collector.logind +# Enable the logind collector (default: disabled). +# --collector.mdadm +# Enable the mdadm collector (default: enabled). +# --collector.meminfo +# Enable the meminfo collector (default: enabled). +# --collector.meminfo_numa +# Enable the meminfo_numa collector (default: disabled). +# --collector.mountstats +# Enable the mountstats collector (default: disabled). +# --collector.netclass +# Enable the netclass collector (default: enabled). +# --collector.netclass.ignored-devices="^$" +# Regexp of net devices to ignore for netclass collector. +# --collector.netdev +# Enable the netdev collector (default: enabled). +# --collector.netdev.device-exclude="^lo$" +# Regexp of net devices to exclude (mutually exclusive to device-include). +# --collector.netdev.device-include=COLLECTOR.NETDEV.DEVICE-INCLUDE +# Regexp of net devices to include (mutually exclusive to device-exclude). +# --collector.netstat +# Enable the netstat collector (default: enabled). 
+# --collector.netstat.fields="^(.*_(InErrors|InErrs)|Ip_Forwarding|Ip(6|Ext)_(InOctets|OutOctets)|Icmp6?_(InMsgs|OutMsgs)|TcpExt_(Listen.*|Syncookies.*|TCPSynRetrans)|Tcp_(ActiveOpens|InSegs|OutSegs|OutRsts|PassiveOpens|RetransSegs|CurrEstab)|Udp6?_(InDatagrams|OutDatagrams|NoPorts|RcvbufErrors|SndbufErrors))$" +# Regexp of fields to return for netstat collector. +# --collector.network_route +# Enable the network_route collector (default: disabled). +# --collector.nfs +# Enable the nfs collector (default: enabled). +# --collector.nfsd +# Enable the nfsd collector (default: enabled). +# --collector.ntp +# Enable the ntp collector (default: disabled). +# --collector.ntp.ip-ttl=1 +# IP TTL to use while sending NTP query. +# --collector.ntp.local-offset-tolerance=1ms +# Offset between local clock and local ntpd time to tolerate. +# --collector.ntp.max-distance=3.46608s +# Max accumulated distance to the root. +# --collector.ntp.protocol-version=4 +# NTP protocol version. +# --collector.ntp.server-is-local +# Certify that collector.ntp.server address is not a public ntp server. +# --collector.ntp.server="127.0.0.1" +# NTP server to use for ntp collector. +# --collector.perf +# Enable the perf collector (default: disabled). +# --collector.perf.cpus="" +# List of CPUs from which perf metrics should be collected. +# --collector.perf.tracepoint=COLLECTOR.PERF.TRACEPOINT... +# Perf tracepoint that should be collected. +# --collector.powersupply.ignored-supplies="^$" +# Regexp of power supplies to ignore for powersupplyclass collector. +# --collector.powersupplyclass +# Enable the powersupplyclass collector (default: enabled). +# --collector.pressure +# Enable the pressure collector (default: enabled). +# --collector.processes +# Enable the processes collector (default: disabled). +# --collector.qdisc +# Enable the qdisc collector (default: disabled). +# --collector.qdisc.fixtures="" +# Test fixtures to use for qdisc collector end-to-end testing. 
+# --collector.rapl +# Enable the rapl collector (default: enabled). +# --collector.runit +# Enable the runit collector (default: disabled). +# --collector.runit.servicedir="/etc/service" +# Path to runit service directory. +# --collector.schedstat +# Enable the schedstat collector (default: enabled). +# --collector.sockstat +# Enable the sockstat collector (default: enabled). +# --collector.softnet +# Enable the softnet collector (default: enabled). +# --collector.stat +# Enable the stat collector (default: enabled). +# --collector.supervisord +# Enable the supervisord collector (default: disabled). +# --collector.supervisord.url="http://localhost:9001/RPC2" +# XML RPC endpoint. +# --collector.systemd +# Enable the systemd collector (default: enabled). +# --collector.systemd.enable-restarts-metrics +# Enables service unit metric service_restart_total. +# --collector.systemd.enable-start-time-metrics +# Enables service unit metric unit_start_time_seconds. +# --collector.systemd.enable-task-metrics +# Enables service unit tasks metrics unit_tasks_current and unit_tasks_max. +# --collector.systemd.unit-exclude=".+\\.(automount|device|mount|scope|slice|target)" +# Regexp of systemd units to exclude. Units must both match include and not +# match exclude to be included. +# --collector.systemd.unit-include=".+" +# Regexp of systemd units to include. Units must both match include and not +# match exclude to be included. +# --collector.tcpstat +# Enable the tcpstat collector (default: disabled). +# --collector.textfile +# Enable the textfile collector (default: enabled). +# --collector.textfile.directory="/var/lib/prometheus/node-exporter" +# Directory to read text files with metrics from. +# --collector.thermal_zone +# Enable the thermal_zone collector (default: enabled). +# --collector.time +# Enable the time collector (default: enabled). +# --collector.timex +# Enable the timex collector (default: enabled). 
+# --collector.udp_queues +# Enable the udp_queues collector (default: enabled). +# --collector.uname +# Enable the uname collector (default: enabled). +# --collector.vmstat +# Enable the vmstat collector (default: enabled). +# --collector.vmstat.fields="^(oom_kill|pgpg|pswp|pg.*fault).*" +# Regexp of fields to return for vmstat collector. +# --collector.wifi +# Enable the wifi collector (default: disabled). +# --collector.wifi.fixtures="" +# Test fixtures to use for wifi collector metrics. +# --collector.xfs +# Enable the xfs collector (default: enabled). +# --collector.zfs +# Enable the zfs collector (default: enabled). +# --collector.zoneinfo +# Enable the zoneinfo collector (default: disabled). +# --log.format=logfmt +# Output format of log messages. One of: [logfmt, json]. +# --log.level=info +# Only log messages with the given severity or above. One of: [debug, info, +# warn, error]. +# --path.procfs="/proc" +# Procfs mountpoint. +# --path.rootfs="/" +# Rootfs mountpoint. +# --path.sysfs="/sys" +# Sysfs mountpoint. +# --web.config="" +# [EXPERIMENTAL] Path to config yaml file that can enable TLS or +# authentication. +# --web.disable-exporter-metrics +# Exclude metrics about the exporter itself (promhttp_*, process_*, go_*). +# --web.listen-address=":9100" +# Address on which to expose metrics and web interface. +# --web.max-requests=40 +# Maximum number of parallel scrape requests. Use 0 to disable. +# --web.telemetry-path="/metrics" +# Path under which to expose metrics. diff --git a/filesystem/etc/prometheus/alertmanager.yml b/filesystem/etc/prometheus/alertmanager.yml new file mode 100644 index 0000000..c171bfc --- /dev/null +++ b/filesystem/etc/prometheus/alertmanager.yml @@ -0,0 +1,21 @@ +# See https://prometheus.io/docs/alerting/configuration/ for documentation. 
+ +global: + resolve_timeout: 3m + smtp_smarthost: mx.iankelling.org:587 + smtp_from: alerts@iankelling.org + smtp_require_tls: False + smtp_hello: defaultnn.b8.nz +templates: +- '/etc/prometheus/alertmanager_templates/*.tmpl' +receivers: +- email_configs: + - html: null + send_resolved: true + text: '{{ template "opsgenie.default.description" . }}' + to: alerts@iankelling.org + name: defaultreceiver + +route: + receiver: defaultreceiver + repeat_interval: 5d diff --git a/filesystem/etc/prometheus/file_sd/node.yml b/filesystem/etc/prometheus/file_sd/node.yml new file mode 100644 index 0000000..8372ddc --- /dev/null +++ b/filesystem/etc/prometheus/file_sd/node.yml @@ -0,0 +1,10 @@ +- targets: + - kdwg:9101 + # - sywg:9101 + # - bk:9101 + # - je:9101 + # - li:9101 + # - frodo:9101 + # - kwwg:9101 + # - x3wg:9101 + # - x2wg:9101 diff --git a/filesystem/etc/prometheus/file_sd/tlsnode.yml b/filesystem/etc/prometheus/file_sd/tlsnode.yml new file mode 100644 index 0000000..47f8c7c --- /dev/null +++ b/filesystem/etc/prometheus/file_sd/tlsnode.yml @@ -0,0 +1,4 @@ +- targets: + # - bk:9101 + # - je:9101 + # - li:9101 diff --git a/filesystem/etc/prometheus/prometheus.yml b/filesystem/etc/prometheus/prometheus.yml new file mode 100644 index 0000000..9932335 --- /dev/null +++ b/filesystem/etc/prometheus/prometheus.yml @@ -0,0 +1,47 @@ +# Sample config for Prometheus. + +global: + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: kd.b8.nz + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: ['localhost:9093'] + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + - /etc/prometheus/rules/*.yml + # - "first_rules.yml" + # - "second_rules.yml" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. 
+scrape_configs: + # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: ['localhost:9090'] + + - job_name: node + basic_auth: + username: prom + password_file: /etc/prometheus-pass + file_sd_configs: + - files: + - /etc/prometheus/file_sd/node.yml + - job_name: tlsnode + scheme: https + basic_auth: + username: prom + password_file: /etc/prometheus-pass + file_sd_configs: + - files: + - /etc/prometheus/file_sd/tlsnode.yml diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml new file mode 100644 index 0000000..043b64d --- /dev/null +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -0,0 +1,157 @@ + +groups: +- name: ansible managed alert rules + rules: + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + only {{ printf "%.2f" $value }}% available space left. + summary: Filesystem has less than 5% space left. + expr: |- + ( + node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + only {{ printf "%.2f" $value }}% available space left. + summary: Filesystem has less than 3% space left.
+ expr: |- + ( + node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + only {{ printf "%.2f" $value }}% available inodes left and is filling up. + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: |- + ( + node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: |- + ( + node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 5% inodes left. 
+ expr: |- + ( + node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has + only {{ printf "%.2f" $value }}% available inodes left. + summary: Filesystem has less than 3% inodes left. + expr: |- + ( + node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered + {{ printf "%.0f" $value }} receive errors in the last two minutes.' + summary: Network interface is reporting many receive errors. + expr: |- + increase(node_network_receive_errs_total[2m]) > 10 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered + {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + summary: Network interface is reporting many transmit errors. + expr: |- + increase(node_network_transmit_errs_total[2m]) > 10 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are used' + summary: Number of conntrack are getting close to the limit + expr: |- + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure + NTP is configured correctly on this host. + summary: Clock skew detected.
+ expr: |- + ( + node_timex_offset_seconds > 0.05 + and + deriv(node_timex_offset_seconds[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds < -0.05 + and + deriv(node_timex_offset_seconds[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured + on this host. + summary: Clock not synchronising. + expr: |- + min_over_time(node_timex_sync_status[5m]) == 0 + for: 10m + labels: + severity: warning + - alert: ianktest + expr: node_systemd_version >= 300 + labels: + severity: critical + annotations: + description: '{{ $labels.instance }} ianktest.' + summary: Instance {{ $labels.instance }} - ianktest diff --git a/mailtest-check b/mailtest-check index 3fdefff..8136301 100755 --- a/mailtest-check +++ b/mailtest-check @@ -76,16 +76,23 @@ esac getspamdpid() { if [[ ! $spamdpid || ! -d /proc/$spamdpid ]]; then - spamdpid=$(systemctl status spamassassin| sed -n '/^ *Main PID:/s/[^0-9]//gp' ||:) + spamdpid=$(systemctl show --property MainPID --value spamassassin | sed 's/^1$//' ||:) fi } getspamdpid +pr() { + cat >>/var/lib/prometheus/node-exporter/mailtest-check.prom.$$ +} +pr < 0 )) +EOF e spamdpid: $spamdpid if [[ ! $spamdpid ]]; then echo $HOSTNAME mailtest spamd pid not found. systemctl status spamassassin: systemctl status spamassassin fi tmpfile=$(mktemp) +declare -i unexpected=0 for folder in ${folders[@]}; do for from in ${froms[@]}; do latest= @@ -102,7 +109,10 @@ for folder in ${folders[@]}; do fi done <$tmpfile - if [[ $latest ]]; then + if [[ ! $latest ]]; then + # 10 is an arbitrary bad value + unexpected+=10 + else to=$(awk '/^Envelope-to: / {print $2}' $latest) last_sec=$(awk '/^Subject: / {print $4}' $latest) @@ -196,5 +206,15 @@ for folder in ${folders[@]}; do if (( last_sec <= limit )); then echo $HOSTNAME mailtest $folder $from $(date -d @$last_sec +'%a %m-%d %H:%M') fi + # usec = unix seconds + pr <