From: Ian Kelling <ian@iankelling.org>
Date: Sun, 20 Feb 2022 06:33:39 +0000 (-0500)
Subject: mostly start using prometheus
X-Git-Url: https://iankelling.org/git/?a=commitdiff_plain;h=e958999a4ab6fddd723270b596b4899c0811fa41;p=distro-setup

mostly start using prometheus
---

diff --git a/.gitignore b/.gitignore
index afa57c5..718065b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,4 @@
-/t.org
-/old-unused
+/filesystem/etc/prometheus-export-htpasswd
+# other git repos
 /.emacs.d
 /Arduino
-# 3rd party git repos
-/a/roles/prom
-/a/roles/grafana
-/a/roles/node-exporter
-/a/roles/alertmanager
-/sl/.ianketiona/src/
-/sl/.iankflidas/src/
diff --git a/a/ansible.cfg b/a/ansible.cfg
deleted file mode 100644
index d367ba2..0000000
--- a/a/ansible.cfg
+++ /dev/null
@@ -1,46 +0,0 @@
-[defaults]
-# 2.7.4-1ppa~xenial would use 194M of memory resident in htop
-# and run out of 4g of memory. just ran ansible 2.9.9-1ppa~bionic+9.0trisquel1
-# and with 23 fork limit, it topped out at 1 gig used of memory.
-# we have about 60 hosts, so 100 should allow them all to run in
-# parallell without a problem without having memory problems.
-# oldused=0; while true; do used=$(free -w -t | tail -n1 | awk '{print $3}'); if ((used > oldused )); then oldused=$used; echo $(date) $used | tee used; fi; sleep 1; done
-forks = 100
-
-# Ansible doesnt have have trisquels python path in its os info.
-# Silence a warning
-# https://docs.ansible.com/ansible/2.9/reference_appendices/interpreter_discovery.html
-interpreter_python = auto_silent
-# strategy = free # DO NOT ENABLE.
-#
-# As of 2019-08-07, include_tasks is very broken with the free strategy.
-# tasks will not be run for some hosts, or "when" rules ignored and run
-# for the wrong host, or some hosts a task will be run twice. Even if we
-# switch to import_tasks, I wouldn't trust using this until that bug is
-# found and fixed. repro: tested with 2.7.4, (no bug reports or fixes
-# found), Running just the common role, then searching for which hosts
-# an install.yml included role got run using
-#
-# f() { awk '/xfsprogs/ { x = 1; next }; /^TASK/ { x = 0 }; x && /\[/ { print }' $1 | sort | uniq -c | pee cat wc; }
-# f LOGFILE
-host_key_checking = False
-display_skipped_hosts = False
-retry_files_enabled = False
-# readable output
-stdout_callback = yaml
-# Our logs are already pretty big. You can temporarily uncomment to enable
-# profiling info in the logs.
-#callback_whitelist = timer, profile_tasks, profile_roles
-
-# Ansible suggests using the file module instead of chmod, but then it
-# follows symlinks without an option to turn it off, which is completely
-# braindead and screwed up my system.
-command_warnings=False
-
-[ssh_connection]
-pipelining = True
-retries = 2
-
-[colors]
-# found in color.py. default blue is hard to read on a black background
-verbose = bright blue
diff --git a/a/group_vars/all b/a/group_vars/all
deleted file mode 100644
index b74039d..0000000
--- a/a/group_vars/all
+++ /dev/null
@@ -1,2 +0,0 @@
-# iank: 1.1 because prometheus is configered elsewhere to use fqdn, which maps
-prometheus_web_listen_address: "127.0.1.1:9090"
diff --git a/a/setup.sh b/a/setup.sh
deleted file mode 100755
index 4d253b9..0000000
--- a/a/setup.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2019 Ian Kelling
-# SPDX-License-Identifier: AGPL-3.0-or-later
-
-source /a/bin/errhandle/err
-
-[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"
-
-# shellcheck source=/a/bin/ds/.bashrc
-export LC_USEBASHRC=t; if [[ -s ~/.bashrc ]]; then . ~/.bashrc; fi
-
-# dependency of node exporter, per README.md
-pi python3-passlib
-
-# after running ansible, run
-# conflink
-# ser restart prometheus
diff --git a/a/site.yml b/a/site.yml
deleted file mode 100644
index b9a0276..0000000
--- a/a/site.yml
+++ /dev/null
@@ -1,84 +0,0 @@
----
-- hosts: localhost
-  roles:
-    - role: prom
-      tags: a
-      prometheus_targets:
-        node:
-          - targets:
-              - "{{ ansible_fqdn }}:9100"
-      prometheus_scrape_configs:
-        - job_name: "prometheus"
-          metrics_path: "{{ prometheus_metrics_path }}"
-          static_configs:
-            - targets:
-                - "{{ ansible_fqdn }}:9090"
-        - job_name: "node"
-          basic_auth:
-            username: prom
-            password_file: /etc/prometheus-pass
-          #scheme: "https"
-          file_sd_configs:
-            - files:
-                - "{{ prometheus_config_dir }}/file_sd/node.yml"
-      # added because of warning in log
-      prometheus_alertmanager_config:
-        - static_configs:
-            - targets:
-                - "{{ ansible_fqdn }}:9093"
-
-
-    - role: node-exporter
-      tags: a
-      # node_exporter_tls_server_config:
-      #   cert_file: /etc/node_exporter/fullchain.pem
-      #   key_file: /etc/node_exporter/privkey.pem
-      node_exporter_web_listen_address: "127.0.1.1:9100"
-      node_exporter_basic_auth_users:
-        prom: "incarnadine.bloodied.maker"
-
-    - role: alertmanager
-      alertmanager_smtp:
-        smarthost: 'mx.iankelling.org:587'
-        from: "alerts@iankelling.org"
-        require_tls: false
-        hello: 'defaultnn.b8.nz'
-      alertmanager_route:
-        receiver: defaultreceiver
-        repeat_interval: 7d
-      alertmanager_receivers:
-        - name: defaultreceiver
-          email_configs:
-            - to: alerts@iankelling.org
-              send_resolved: true
-              # the html was a bit ugly and just a huge waste of text,
-              # https://github.com/prometheus/alertmanager/issues/2232
-              # lead me to find a convenient text option to use
-              html:
-              text: '{% raw -%}{{ template "opsgenie.default.description" . }}{% endraw -%}'
-      alertmanager_web_listen_address: '127.0.1.1:9093'
-
-    - role: grafana
-      grafana_address: "127.0.1.1"
-      # iank: playbook will halt if no password is set. this is only
-      # available to localhost, so i dont really care, but might as well
-      # generate a pass isntead of putting in pw123 etc.
-      grafana_security: { admin_user: admin, admin_password: spheroid.recantation.shank }
-      grafana_datasources:
-        - name: prometheus
-          type: prometheus
-          access: proxy
-          url: 'http://{{ prometheus_web_listen_address }}'
-          basicAuth: false
-          isDefault: true
-      # This is based on looking at highly downloaded dashboards here
-      # https://grafana.com/dashboards?dataSource=prometheus&collector=nodeExporter
-      # Which is where you are lead to from
-      # https://prometheus.io/docs/visualization/grafana/
-      grafana_dashboards:
-        - dashboard_id: 1860
-          revision_id: 21
-          datasource: prometheus
-        - dashboard_id: 405
-          revision_id: 8
-          datasource: prometheus
diff --git a/brc2 b/brc2
index bd959d8..9fa1010 100644
--- a/brc2
+++ b/brc2
@@ -1085,31 +1085,31 @@ lom() {
     l=$(losetup -j $1 | sed -rn 's/^([^ ]+): .*/\1/p' | head -n1 ||:)
     if [[ ! $l ]]; then
       l=$(sudo losetup -f)
-      sudo losetup $l $1
+      m sudo losetup $l $1
     fi
     if ! sudo cryptsetup status /dev/mapper/$base &>/dev/null; then
       if ! sudo cryptsetup luksOpen $l $base; then
-        sudo losetup -d $l
+        m sudo losetup -d $l
         return 1
       fi
     fi
-    sudo mkdir -p /mnt/$base
-    sudo mount /dev/mapper/$base /mnt/$base
-    sudo chown $USER:$USER /mnt/$base
+    m sudo mkdir -p /mnt/$base
+    m sudo mount /dev/mapper/$base /mnt/$base
+    m sudo chown $USER:$USER /mnt/$base
   else
     base=$1
     if mountpoint /mnt/$base &>/dev/null; then
-      sudo umount /mnt/$base
+      m sudo umount /mnt/$base
     fi
     if sudo cryptsetup status /dev/mapper/$base &>/dev/null; then
-      if ! sudo cryptsetup luksClose /dev/mapper/$base; then
+      if ! m sudo cryptsetup luksClose /dev/mapper/$base; then
         echo lom: failed cryptsetup luksClose /dev/mapper/$base
         return 1
       fi
     fi
-    l=$(losetup -j $1 | sed -rn 's/^([^ ]+): .*/\1/p' | head -n1 ||:)
+    l=$(losetup -l --noheadings | awk '$6 ~ /\/'$1'$/ {print $1}')
     if [[ $l ]]; then
-      sudo losetup -d $l
+      m sudo losetup -d $l
     else
       echo lom: warning: no loopback device found
     fi
@@ -1266,6 +1266,9 @@ ngo() {
 otp() {
   oathtool --totp -b "$*" | xclip -selection clipboard
 }
+j() { # run "$@"; feed its stdout and stderr to xclip's clipboard selection
+  "$@" 2>&1 | pee "xclip -r -selection clipboard"
+}
 
 
 pakaraoke() {
diff --git a/conflink b/conflink
index 90debe6..bfd4e44 100755
--- a/conflink
+++ b/conflink
@@ -166,15 +166,24 @@ case $user in
     if [[ -e /var/lib/znc ]] && getent group znc; then
       s chown -R znc:znc /var/lib/znc
     fi
-    f=/etc/prometheus-htpasswd
+    for f in /etc/prometheus-{,export-}htpasswd; do
+      if [[ -e $f ]]; then
+        s chmod 640 $f
+        if getent passwd www-data; then
+          s chown root:www-data $f
+        fi
+      fi
+    done
+    f=/etc/prometheus-pass
     if [[ -e $f ]]; then
+      # note: this is duplicative of the file's own permissions
       s chmod 640 $f /etc/prometheus-pass
-      s chown root:www-data $f
       if getent passwd prometheus; then
-        s chown root:prometheus /etc/prometheus-pass
+        s chown root:prometheus $f
       fi
     fi
 
+
     ##### end special extra stuff #####
 
     m sudo -H -u user2 "${BASH_SOURCE[0]}"
diff --git a/distro-end b/distro-end
index 7f2f371..ae628c5 100755
--- a/distro-end
+++ b/distro-end
@@ -1844,6 +1844,68 @@ case $HOSTNAME in
     ;;
 esac
 
+### begin prometheus ###
+
+case $HOSTNAME in
+  kd)
+    # These packages (plus libjs-jquery) are pinned to bullseye in
+    # /etc/apt/preferences.d/prometheus, to get versions closer to the main docs.
+    pi prometheus-alertmanager prometheus prometheus-node-exporter
+    web-conf -p 9091 -f 9090 - apache2 i.b8.nz <<'EOF'
+<Location "/">
+AuthType Basic
+AuthName "basic_auth"
+# created with
+# htpasswd -c prometheus-htpasswd USERNAME
+AuthUserFile "/etc/prometheus-htpasswd"
+Require valid-user
+</Location>
+EOF
+    ;;
+  *)
+    pi prometheus-node-exporter
+    ;;
+esac
+
+case $HOSTNAME in
+  # frodo needs upgrade first, so it is deliberately skipped (no-op arm).
+  frodo) :
+    # todo, for limiting node exporter http,
+    # either use iptables or, in
+    # /etc/default/prometheus-node-exporter
+    # listen on the wireguard interface
+    ;;
+  li|je|bk)
+    # the "ex" hostname suffix stands for exporter
+    web-conf -p 9101 -f 9100 - apache2 ${HOSTNAME}ex.b8.nz <<'EOF'
+<Location "/">
+AuthType Basic
+AuthName "basic_auth"
+# created with
+# htpasswd -c prometheus-export-htpasswd USERNAME
+AuthUserFile "/etc/prometheus-export-htpasswd"
+Require valid-user
+</Location>
+EOF
+    ;;
+  *)
+    wgip=$(sudo sed -rn 's,^ *Address *= *([^/]+).*,\1,p' /etc/wireguard/wghole.conf)
+    web-conf -i -a $wgip -p 9101 -f 9100 - apache2 ${HOSTNAME}wg.b8.nz <<'EOF'
+<Location "/">
+AuthType Basic
+AuthName "basic_auth"
+# created with
+# htpasswd -c prometheus-export-htpasswd USERNAME
+AuthUserFile "/etc/prometheus-export-htpasswd"
+Require valid-user
+</Location>
+EOF
+    ;;
+esac
+
+### end prometheus ###
+
+
 end_msg <<'EOF'
 In mate settings settings, change scrolling to two-finger,
 because the default edge scroll doesn\'t work. Originally found this in debian.
diff --git a/filesystem/etc/apt/preferences.d/prometheus b/filesystem/etc/apt/preferences.d/prometheus
new file mode 100644
index 0000000..974f95b
--- /dev/null
+++ b/filesystem/etc/apt/preferences.d/prometheus
@@ -0,0 +1,3 @@
+Package: prometheus-alertmanager prometheus prometheus-node-exporter libjs-jquery
+Pin: release n=bullseye,o=Debian
+Pin-Priority: 500
diff --git a/filesystem/etc/default/prometheus b/filesystem/etc/default/prometheus
new file mode 100644
index 0000000..63d1ee3
--- /dev/null
+++ b/filesystem/etc/default/prometheus
@@ -0,0 +1,117 @@
+# iank: initial file from 2.24, added to empty ARGS.
+
+# Set the command-line arguments to pass to the server.
+
+ARGS="--web.listen-address=127.0.0.1:9090"
+
+
+
+# Prometheus supports the following options:
+#  --config.file="/etc/prometheus/prometheus.yml"
+#                             Prometheus configuration file path.
+#  --web.listen-address="0.0.0.0:9090"
+#                             Address to listen on for UI, API, and telemetry.
+#  --web.read-timeout=5m      Maximum duration before timing out read of the
+#                             request, and closing idle connections.
+#  --web.max-connections=512  Maximum number of simultaneous connections.
+#  --web.external-url=<URL>   The URL under which Prometheus is externally
+#                             reachable (for example, if Prometheus is served
+#                             via a reverse proxy). Used for generating
+#                             relative and absolute links back to Prometheus
+#                             itself. If the URL has a path portion, it will
+#                             be used to prefix all HTTP endpoints served by
+#                             Prometheus. If omitted, relevant URL components
+#                             will be derived automatically.
+#  --web.route-prefix=<path>  Prefix for the internal routes of web endpoints.
+#                             Defaults to path of --web.external-url.
+#  --web.local-assets="/usr/share/prometheus/web/"
+#                             Path to static asset/templates directory.
+#  --web.user-assets=<path>   Path to user asset directory, available at
+#                             /user.
+#  --web.enable-lifecycle     Enable shutdown and reload via HTTP request.
+#  --web.enable-admin-api     Enable API endpoints for admin control actions.
+#  --web.console.templates="/etc/prometheus/consoles"
+#                             Path to the console template directory,
+#                             available at /consoles.
+#  --web.console.libraries="/etc/prometheus/console_libraries"
+#                             Path to the console library directory.
+#  --web.page-title="Prometheus Time Series Collection and Processing Server"
+#                             Document title of Prometheus instance.
+#  --web.cors.origin=".*"     Regex for CORS origin. It is fully anchored.
+#                             Example: 'https?://(domain1|domain2)\.com'
+#  --storage.tsdb.path="/var/lib/prometheus/metrics2/"
+#                             Base path for metrics storage.
+#  --storage.tsdb.retention=15d
+#                             [DEPRECATED] How long to retain samples in
+#                             storage. This flag has been deprecated, use
+#                             "storage.tsdb.retention.time" instead
+#  --storage.tsdb.retention.time=15d
+#                             How long to retain samples in storage. When this
+#                             flag is set it overrides
+#                             "storage.tsdb.retention".
+#                             If neither this flag nor "storage.tsdb.retention"
+#                             nor "storage.tsdb.retention.size" is set, the
+#                             retention time defaults to 15d.
+#                             Units Supported: y, w, d, h, m, s, ms.
+#  --storage.tsdb.retention.size=
+#                             [EXPERIMENTAL] Maximum number of bytes that can
+#                             be stored for blocks. Units supported: KB, MB,
+#                             GB, TB, PB. This flag is experimental and can be
+#                             changed in future releases.
+#  --storage.tsdb.use-lockfile
+#                             Create a lockfile in data directory.
+#  --storage.tsdb.allow-overlapping-blocks
+#                             [EXPERIMENTAL] Allow overlapping blocks, which
+#                             in turn enables vertical compaction and
+#                             vertical query merge.
+#  --storage.tsdb.wal-compression
+#                             Compress the tsdb WAL.
+#  --storage.remote.flush-deadline=<duration>
+#                             How long to wait flushing sample on shutdown or
+#                             config reload.
+#  --storage.remote.read-sample-limit=5e7
+#                             Maximum overall number of samples to return via
+#                             the remote read interface, in a single query. 0
+#                             means no limit. This limit is ignored for
+#                             streamed response types.
+#  --storage.remote.read-concurrent-limit=10
+#                             Maximum number of concurrent remote read calls.
+#                             0 means no limit.
+#  --storage.remote.read-max-bytes-in-frame=1048576
+#                             Maximum number of bytes in a single frame for
+#                             streaming remote read response types before
+#                             marshalling. Note that client might have limit on
+#                             frame size as well. 1MB as recommended by
+#                             protobuf by default.
+#  --rules.alert.for-outage-tolerance=1h
+#                             Max time to tolerate prometheus outage for
+#                             restoring "for" state of alert.
+#  --rules.alert.for-grace-period=10m
+#                             Minimum duration between alert and restored "for"
+#                             state. This is maintained only for alerts with
+#                             configured "for" time greater than grace period.
+#  --rules.alert.resend-delay=1m
+#                             Minimum amount of time to wait before resending
+#                             an alert to Alertmanager.
+#  --alertmanager.notification-queue-capacity=10000
+#                             The capacity of the queue for pending
+#                             Alertmanager notifications.
+#  --alertmanager.timeout=10s
+#                             Timeout for sending alerts to Alertmanager.
+#  --query.lookback-delta=5m  The maximum lookback duration for retrieving
+#                             metrics during expression evaluations and
+#                             federation.
+#  --query.timeout=2m         Maximum time a query may take before being
+#                             aborted.
+#  --query.max-concurrency=20
+#                             Maximum number of queries executed concurrently.
+#  --query.max-samples=50000000
+#                             Maximum number of samples a single query can load
+#                             into memory. Note that queries will fail if they
+#                             try to load more samples than this into memory,
+#                             so this also limits the number of samples a query
+#                             can return.
+#  --log.level=info           Only log messages with the given severity or
+#                             above. One of: [debug, info, warn, error]
+#  --log.format=logfmt        Output format of log messages. One of: [logfmt,
+#                             json]
diff --git a/filesystem/etc/default/prometheus-alertmanager b/filesystem/etc/default/prometheus-alertmanager
new file mode 100644
index 0000000..4ff43f2
--- /dev/null
+++ b/filesystem/etc/default/prometheus-alertmanager
@@ -0,0 +1,71 @@
+# Set the command-line arguments to pass to the server.
+# default:
+#ARGS=""
+
+# iank:
+ARGS="--web.listen-address=127.0.0.1:9093"
+
+# this file is from version 0.21
+
+# The alert manager supports the following options:
+
+#  --config.file="/etc/prometheus/alertmanager.yml"
+#       Alertmanager configuration file name.
+#  --storage.path="/var/lib/prometheus/alertmanager/"
+#       Base path for data storage.
+#  --data.retention=120h
+#       How long to keep data for.
+#  --alerts.gc-interval=30m
+#       Interval between alert GC.
+#  --log.level=info
+#       Only log messages with the given severity or above.
+#  --web.external-url=WEB.EXTERNAL-URL
+#       The URL under which Alertmanager is externally reachable (for example,
+#       if Alertmanager is served via a reverse proxy). Used for generating
+#       relative and absolute links back to Alertmanager itself. If the URL has
+#       a path portion, it will be used to prefix all HTTP endpoints served by
+#       Alertmanager. If omitted, relevant URL components will be derived
+#       automatically.
+#  --web.route-prefix=WEB.ROUTE-PREFIX
+#       Prefix for the internal routes of web endpoints. Defaults to path of
+#       --web.external-url.
+#  --web.listen-address=":9093"
+#       Address to listen on for the web interface and API.
+#  --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
+#       Path to static UI directory.
+#  --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
+#       Path to default notification template.
+#  --cluster.listen-address="0.0.0.0:9094"
+#       Listen address for cluster.
+#  --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
+#       Explicit address to advertise in cluster.
+#  --cluster.peer=CLUSTER.PEER ...
+#       Initial peers (may be repeated).
+#  --cluster.peer-timeout=15s
+#       Time to wait between peers to send notifications.
+#  --cluster.gossip-interval=200ms
+#       Interval between sending gossip messages. By lowering this value (more
+#       frequent) gossip messages are propagated across the cluster more
+#       quickly at the expense of increased bandwidth.
+#  --cluster.pushpull-interval=1m0s
+#       Interval for gossip state syncs. Setting this interval lower (more
+#       frequent) will increase convergence speeds across larger clusters at
+#       the expense of increased bandwidth usage.
+#  --cluster.tcp-timeout=10s  Timeout for establishing a stream connection
+#       with a remote node for a full state sync, and for stream read and write
+#       operations.
+#  --cluster.probe-timeout=500ms
+#       Timeout to wait for an ack from a probed node before assuming it is
+#       unhealthy. This should be set to 99-percentile of RTT (round-trip time)
+#       on your network.
+#  --cluster.probe-interval=1s
+#       Interval between random node probes. Setting this lower (more frequent)
+#       will cause the cluster to detect failed nodes more quickly at the
+#       expense of increased bandwidth usage.
+#  --cluster.settle-timeout=1m0s
+#       Maximum time to wait for cluster connections to settle before
+#       evaluating notifications.
+#  --cluster.reconnect-interval=10s
+#       Interval between attempting to reconnect to lost peers.
+#  --cluster.reconnect-timeout=6h0m0s
+#       Length of time to attempt to reconnect to a lost peer.
diff --git a/filesystem/etc/default/prometheus-node-exporter b/filesystem/etc/default/prometheus-node-exporter
new file mode 100644
index 0000000..1d6d906
--- /dev/null
+++ b/filesystem/etc/default/prometheus-node-exporter
@@ -0,0 +1,216 @@
+# Set the command-line arguments to pass to the server.
+# Due to shell escaping, to pass backslashes for regexes, you need to double
+# them (\\d for \d). If running under systemd, you need to double them again
+# (\\\\d to mean \d), and escape newlines too.
+ARGS="--web.listen-address=127.0.0.1:9100"
+
+# prometheus-node-exporter supports the following options:
+#
+#  --collector.arp
+#    Enable the arp collector (default: enabled).
+#  --collector.bcache
+#    Enable the bcache collector (default: enabled).
+#  --collector.bcache.priorityStats
+#    Expose expensive priority stats.
+#  --collector.bonding
+#    Enable the bonding collector (default: enabled).
+#  --collector.btrfs
+#    Enable the btrfs collector (default: enabled).
+#  --collector.buddyinfo
+#    Enable the buddyinfo collector (default: disabled).
+#  --collector.conntrack
+#    Enable the conntrack collector (default: enabled).
+#  --collector.cpu
+#    Enable the cpu collector (default: enabled).
+#  --collector.cpu.info
+#    Enables metric cpu_info.
+#  --collector.cpu.info.bugs-include=COLLECTOR.CPU.INFO.BUGS-INCLUDE
+#    Filter the `bugs` field in cpuInfo with a value that must be a regular
+#    expression.
+#  --collector.cpu.info.flags-include=COLLECTOR.CPU.INFO.FLAGS-INCLUDE
+#    Filter the `flags` field in cpuInfo with a value that must be a regular
+#    expression.
+#  --collector.cpufreq
+#    Enable the cpufreq collector (default: enabled).
+#  --collector.disable-defaults
+#    Set all collectors to disabled by default.
+#  --collector.diskstats
+#    Enable the diskstats collector (default: enabled).
+#  --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$"
+#    Regexp of devices to ignore for diskstats.
+#  --collector.drbd
+#    Enable the drbd collector (default: disabled).
+#  --collector.edac
+#    Enable the edac collector (default: enabled).
+#  --collector.entropy
+#    Enable the entropy collector (default: enabled).
+#  --collector.fibrechannel
+#    Enable the fibrechannel collector (default: enabled).
+#  --collector.filefd
+#    Enable the filefd collector (default: enabled).
+#  --collector.filesystem
+#    Enable the filesystem collector (default: enabled).
+#  --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
+#    Regexp of filesystem types to ignore for filesystem collector.
+#  --collector.filesystem.ignored-mount-points="^/(dev|proc|run|sys|mnt|media|var/lib/docker/.+)($|/)"
+#    Regexp of mount points to ignore for filesystem collector.
+#  --collector.hwmon
+#    Enable the hwmon collector (default: enabled).
+#  --collector.infiniband
+#    Enable the infiniband collector (default: enabled).
+#  --collector.interrupts
+#    Enable the interrupts collector (default: disabled).
+#  --collector.ipvs
+#    Enable the ipvs collector (default: enabled).
+#  --collector.ipvs.backend-labels="local_address,local_port,remote_address,remote_port,proto,local_mark"
+#    Comma separated list for IPVS backend stats labels.
+#  --collector.ksmd
+#    Enable the ksmd collector (default: disabled).
+#  --collector.loadavg
+#    Enable the loadavg collector (default: enabled).
+#  --collector.logind
+#    Enable the logind collector (default: disabled).
+#  --collector.mdadm
+#    Enable the mdadm collector (default: enabled).
+#  --collector.meminfo
+#    Enable the meminfo collector (default: enabled).
+#  --collector.meminfo_numa
+#    Enable the meminfo_numa collector (default: disabled).
+#  --collector.mountstats
+#    Enable the mountstats collector (default: disabled).
+#  --collector.netclass
+#    Enable the netclass collector (default: enabled).
+#  --collector.netclass.ignored-devices="^$"
+#    Regexp of net devices to ignore for netclass collector.
+#  --collector.netdev
+#    Enable the netdev collector (default: enabled).
+#  --collector.netdev.device-exclude="^lo$"
+#    Regexp of net devices to exclude (mutually exclusive to device-include).
+#  --collector.netdev.device-include=COLLECTOR.NETDEV.DEVICE-INCLUDE
+#    Regexp of net devices to include (mutually exclusive to device-exclude).
+#  --collector.netstat
+#    Enable the netstat collector (default: enabled).
+#  --collector.netstat.fields="^(.*_(InErrors|InErrs)|Ip_Forwarding|Ip(6|Ext)_(InOctets|OutOctets)|Icmp6?_(InMsgs|OutMsgs)|TcpExt_(Listen.*|Syncookies.*|TCPSynRetrans)|Tcp_(ActiveOpens|InSegs|OutSegs|OutRsts|PassiveOpens|RetransSegs|CurrEstab)|Udp6?_(InDatagrams|OutDatagrams|NoPorts|RcvbufErrors|SndbufErrors))$"
+#    Regexp of fields to return for netstat collector.
+#  --collector.network_route
+#    Enable the network_route collector (default: disabled).
+#  --collector.nfs
+#    Enable the nfs collector (default: enabled).
+#  --collector.nfsd
+#    Enable the nfsd collector (default: enabled).
+#  --collector.ntp
+#    Enable the ntp collector (default: disabled).
+#  --collector.ntp.ip-ttl=1
+#    IP TTL to use while sending NTP query.
+#  --collector.ntp.local-offset-tolerance=1ms
+#    Offset between local clock and local ntpd time to tolerate.
+#  --collector.ntp.max-distance=3.46608s
+#    Max accumulated distance to the root.
+#  --collector.ntp.protocol-version=4
+#    NTP protocol version.
+#  --collector.ntp.server-is-local
+#    Certify that collector.ntp.server address is not a public ntp server.
+#  --collector.ntp.server="127.0.0.1"
+#    NTP server to use for ntp collector.
+#  --collector.perf
+#    Enable the perf collector (default: disabled).
+#  --collector.perf.cpus=""
+#    List of CPUs from which perf metrics should be collected.
+#  --collector.perf.tracepoint=COLLECTOR.PERF.TRACEPOINT...
+#    Perf tracepoint that should be collected.
+#  --collector.powersupply.ignored-supplies="^$"
+#    Regexp of power supplies to ignore for powersupplyclass collector.
+#  --collector.powersupplyclass
+#    Enable the powersupplyclass collector (default: enabled).
+#  --collector.pressure
+#    Enable the pressure collector (default: enabled).
+#  --collector.processes
+#    Enable the processes collector (default: disabled).
+#  --collector.qdisc
+#    Enable the qdisc collector (default: disabled).
+#  --collector.qdisc.fixtures=""
+#    Test fixtures to use for qdisc collector end-to-end testing.
+#  --collector.rapl
+#    Enable the rapl collector (default: enabled).
+#  --collector.runit
+#    Enable the runit collector (default: disabled).
+#  --collector.runit.servicedir="/etc/service"
+#    Path to runit service directory.
+#  --collector.schedstat
+#    Enable the schedstat collector (default: enabled).
+#  --collector.sockstat
+#    Enable the sockstat collector (default: enabled).
+#  --collector.softnet
+#    Enable the softnet collector (default: enabled).
+#  --collector.stat
+#    Enable the stat collector (default: enabled).
+#  --collector.supervisord
+#    Enable the supervisord collector (default: disabled).
+#  --collector.supervisord.url="http://localhost:9001/RPC2"
+#    XML RPC endpoint.
+#  --collector.systemd
+#    Enable the systemd collector (default: enabled).
+#  --collector.systemd.enable-restarts-metrics
+#    Enables service unit metric service_restart_total.
+#  --collector.systemd.enable-start-time-metrics
+#    Enables service unit metric unit_start_time_seconds.
+#  --collector.systemd.enable-task-metrics
+#    Enables service unit tasks metrics unit_tasks_current and unit_tasks_max.
+#  --collector.systemd.unit-exclude=".+\\.(automount|device|mount|scope|slice|target)"
+#    Regexp of systemd units to exclude. Units must both match include and not
+#    match exclude to be included.
+#  --collector.systemd.unit-include=".+"
+#    Regexp of systemd units to include. Units must both match include and not
+#    match exclude to be included.
+#  --collector.tcpstat
+#    Enable the tcpstat collector (default: disabled).
+#  --collector.textfile
+#    Enable the textfile collector (default: enabled).
+#  --collector.textfile.directory="/var/lib/prometheus/node-exporter"
+#    Directory to read text files with metrics from.
+#  --collector.thermal_zone
+#    Enable the thermal_zone collector (default: enabled).
+#  --collector.time
+#    Enable the time collector (default: enabled).
+#  --collector.timex
+#    Enable the timex collector (default: enabled).
+#  --collector.udp_queues
+#    Enable the udp_queues collector (default: enabled).
+#  --collector.uname
+#    Enable the uname collector (default: enabled).
+#  --collector.vmstat
+#    Enable the vmstat collector (default: enabled).
+#  --collector.vmstat.fields="^(oom_kill|pgpg|pswp|pg.*fault).*"
+#    Regexp of fields to return for vmstat collector.
+#  --collector.wifi
+#    Enable the wifi collector (default: disabled).
+#  --collector.wifi.fixtures=""
+#    Test fixtures to use for wifi collector metrics.
+#  --collector.xfs
+#    Enable the xfs collector (default: enabled).
+#  --collector.zfs
+#    Enable the zfs collector (default: enabled).
+#  --collector.zoneinfo
+#    Enable the zoneinfo collector (default: disabled).
+#  --log.format=logfmt
+#    Output format of log messages. One of: [logfmt, json].
+#  --log.level=info
+#    Only log messages with the given severity or above. One of: [debug, info,
+#    warn, error].
+#  --path.procfs="/proc"
+#    Procfs mountpoint.
+#  --path.rootfs="/"
+#    Rootfs mountpoint.
+#  --path.sysfs="/sys"
+#    Sysfs mountpoint.
+#  --web.config=""
+#    [EXPERIMENTAL] Path to config yaml file that can enable TLS or
+#    authentication.
+#  --web.disable-exporter-metrics
+#    Exclude metrics about the exporter itself (promhttp_*, process_*, go_*).
+#  --web.listen-address=":9100"
+#    Address on which to expose metrics and web interface.
+#  --web.max-requests=40
+#    Maximum number of parallel scrape requests. Use 0 to disable.
+#  --web.telemetry-path="/metrics"
+#    Path under which to expose metrics.
diff --git a/filesystem/etc/prometheus/alertmanager.yml b/filesystem/etc/prometheus/alertmanager.yml
new file mode 100644
index 0000000..c171bfc
--- /dev/null
+++ b/filesystem/etc/prometheus/alertmanager.yml
@@ -0,0 +1,21 @@
+# See https://prometheus.io/docs/alerting/configuration/ for documentation.
+
+global:
+  resolve_timeout: 3m
+  smtp_smarthost: mx.iankelling.org:587
+  smtp_from: alerts@iankelling.org
+  smtp_require_tls: False
+  smtp_hello: defaultnn.b8.nz
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+receivers:
+- email_configs:
+  - html: null
+    send_resolved: true
+    text: '{{ template "opsgenie.default.description" . }}'
+    to: alerts@iankelling.org
+  name: defaultreceiver
+
+route:
+  receiver: defaultreceiver
+  repeat_interval: 5d
diff --git a/filesystem/etc/prometheus/file_sd/node.yml b/filesystem/etc/prometheus/file_sd/node.yml
new file mode 100644
index 0000000..8372ddc
--- /dev/null
+++ b/filesystem/etc/prometheus/file_sd/node.yml
@@ -0,0 +1,10 @@
+- targets:
+  - kdwg:9101
+  # - sywg:9101
+  # - bk:9101
+  # - je:9101
+  # - li:9101
+  # - frodo:9101
+  # - kwwg:9101
+  # - x3wg:9101
+  # - x2wg:9101
diff --git a/filesystem/etc/prometheus/file_sd/tlsnode.yml b/filesystem/etc/prometheus/file_sd/tlsnode.yml
new file mode 100644
index 0000000..47f8c7c
--- /dev/null
+++ b/filesystem/etc/prometheus/file_sd/tlsnode.yml
@@ -0,0 +1,4 @@
+- targets:
+  # - bk:9101
+  # - je:9101
+  # - li:9101
diff --git a/filesystem/etc/prometheus/prometheus.yml b/filesystem/etc/prometheus/prometheus.yml
new file mode 100644
index 0000000..9932335
--- /dev/null
+++ b/filesystem/etc/prometheus/prometheus.yml
@@ -0,0 +1,47 @@
+# Sample config for Prometheus.
+
+global:
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+      monitor: kd.b8.nz
+
+# Alertmanager configuration
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets: ['localhost:9093']
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+rule_files:
+  - /etc/prometheus/rules/*.yml
+  # - "first_rules.yml"
+  # - "second_rules.yml"
+
+# Scrape configurations: Prometheus itself, plus the targets listed in the
+# file_sd files below, scraped over plain HTTP (node) and HTTPS (tlsnode).
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+  - job_name: 'prometheus'
+
+    # metrics_path defaults to '/metrics'
+    # scheme defaults to 'http'.
+
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: node
+    basic_auth:
+      username: prom
+      password_file: /etc/prometheus-pass
+    file_sd_configs:
+      - files:
+        - /etc/prometheus/file_sd/node.yml
+  - job_name: tlsnode
+    scheme: https
+    basic_auth:
+      username: prom
+      password_file: /etc/prometheus-pass
+    file_sd_configs:
+      - files:
+        - /etc/prometheus/file_sd/tlsnode.yml
diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml
new file mode 100644
index 0000000..043b64d
--- /dev/null
+++ b/filesystem/etc/prometheus/rules/iank.yml
@@ -0,0 +1,157 @@
+
+groups:
+- name: ansible managed alert rules
+  rules:
+  - alert: NodeFilesystemAlmostOutOfSpace
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available space left.
+      summary: Filesystem has less than 5% space left.
+    expr: |-
+      (
+        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeFilesystemAlmostOutOfSpace
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available space left.
+      summary: Filesystem has less than 3% space left.
+    expr: |-
+      (
+        node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+  - alert: NodeFilesystemFilesFillingUp
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left and is filling up.
+      summary: Filesystem is predicted to run out of inodes within the next 24 hours.
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
+      and
+        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeFilesystemFilesFillingUp
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
+      summary: Filesystem is predicted to run out of inodes within the next 4 hours.
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
+      and
+        predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+  - alert: NodeFilesystemAlmostOutOfFiles
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left.
+      summary: Filesystem has less than 5% inodes left.
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeFilesystemAlmostOutOfFiles
+    annotations:
+      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+        only {{ printf "%.2f" $value }}% available inodes left.
+      summary: Filesystem has less than 3% inodes left.
+    expr: |-
+      (
+        node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
+      and
+        node_filesystem_readonly{job="node",fstype!=""} == 0
+      )
+    for: 1h
+    labels:
+      severity: critical
+  - alert: NodeNetworkReceiveErrs
+    annotations:
+      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+        {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+      summary: Network interface is reporting many receive errors.
+    expr: |-
+      increase(node_network_receive_errs_total[2m]) > 10
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeNetworkTransmitErrs
+    annotations:
+      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+        {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+      summary: Network interface is reporting many transmit errors.
+    expr: |-
+      increase(node_network_transmit_errs_total[2m]) > 10
+    for: 1h
+    labels:
+      severity: warning
+  - alert: NodeHighNumberConntrackEntriesUsed
+    annotations:
+      description: '{{ $value | humanizePercentage }} of conntrack entries are used'
+      summary: Number of conntrack are getting close to the limit
+    expr: |-
+      (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+    labels:
+      severity: warning
+  - alert: NodeClockSkewDetected  # offset beyond +/-0.05s and not converging back
+    annotations:
+      message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure
+        NTP is configured correctly on this host.
+      summary: Clock skew detected.
+    expr: |-
+      (
+        node_timex_offset_seconds > 0.05
+      and
+        deriv(node_timex_offset_seconds[5m]) >= 0
+      )
+      or
+      (
+        node_timex_offset_seconds < -0.05
+      and
+        deriv(node_timex_offset_seconds[5m]) <= 0
+      )
+    for: 10m
+    labels:
+      severity: warning
+  - alert: NodeClockNotSynchronising
+    annotations:
+      message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
+        on this host.
+      summary: Clock not synchronising.
+    expr: |-
+      min_over_time(node_timex_sync_status[5m]) == 0
+    for: 10m
+    labels:
+      severity: warning
+  - alert: ianktest
+    expr: node_systemd_version >= 300
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $labels.instance }} ianktest.'
+      summary: Instance {{ $labels.instance }} - ianktest
diff --git a/mailtest-check b/mailtest-check
index 3fdefff..8136301 100755
--- a/mailtest-check
+++ b/mailtest-check
@@ -76,16 +76,23 @@ esac
 
 getspamdpid() {
   if [[ ! $spamdpid || ! -d /proc/$spamdpid ]]; then
-    spamdpid=$(systemctl status spamassassin| sed -n '/^ *Main PID:/s/[^0-9]//gp' ||:)
+    spamdpid=$(systemctl show --property MainPID --value spamassassin | sed 's/^[01]$//' ||:) # MainPID is 0 when the unit has no main process; blank it so "not found" checks work
   fi
 }
 getspamdpid
+pr() {
+  cat >>/var/lib/prometheus/node-exporter/mailtest-check.prom.$$
+}
+pr <<EOF
+mailtest_check_found_spamd_pid_bool $(( ${spamdpid:-0} > 0 ))
+EOF
 e spamdpid: $spamdpid
 if [[ ! $spamdpid ]]; then
   echo $HOSTNAME mailtest spamd pid not found. systemctl status spamassassin:
   systemctl status spamassassin
 fi
 tmpfile=$(mktemp)
+declare -i unexpected=0
 for folder in ${folders[@]}; do
   for from in ${froms[@]}; do
     latest=
@@ -102,7 +109,10 @@ for folder in ${folders[@]}; do
       fi
     done <$tmpfile
 
-    if [[ $latest ]]; then
+    if [[ ! $latest ]]; then
+      # 10 is an arbitrary bad value
+      unexpected+=10
+    else
       to=$(awk '/^Envelope-to: / {print $2}' $latest)
       last_sec=$(awk '/^Subject: / {print $4}' $latest)
 
@@ -196,5 +206,15 @@ for folder in ${folders[@]}; do
     if (( last_sec <= limit )); then
       echo $HOSTNAME mailtest $folder $from $(date -d @$last_sec +'%a %m-%d %H:%M')
     fi
+    # usec = unix seconds; default to 0 so the sample line always carries a value
+    pr <<EOF
+mailtest_check_last_usec{folder="$folder",from="$from"} ${last_sec:-0}
+EOF
   done
 done
+if $slow; then
+  pr <<EOF
+mailtest_check_unexpected_spamd_results $unexpected
+EOF
+fi
+mv /var/lib/prometheus/node-exporter/mailtest-check.prom.$$ /var/lib/prometheus/node-exporter/mailtest-check.prom