-/t.org
-/old-unused
+/filesystem/etc/prometheus-export-htpasswd
+# other git repos
/.emacs.d
/Arduino
-# 3rd party git repos
-/a/roles/prom
-/a/roles/grafana
-/a/roles/node-exporter
-/a/roles/alertmanager
-/sl/.ianketiona/src/
-/sl/.iankflidas/src/
+++ /dev/null
-[defaults]
-# 2.7.4-1ppa~xenial would use 194M of memory resident in htop
-# and run out of 4g of memory. just ran ansible 2.9.9-1ppa~bionic+9.0trisquel1
-# and with 23 fork limit, it topped out at 1 gig used of memory.
-# we have about 60 hosts, so 100 should allow them all to run in
-# parallell without a problem without having memory problems.
-# oldused=0; while true; do used=$(free -w -t | tail -n1 | awk '{print $3}'); if ((used > oldused )); then oldused=$used; echo $(date) $used | tee used; fi; sleep 1; done
-forks = 100
-
-# Ansible doesnt have have trisquels python path in its os info.
-# Silence a warning
-# https://docs.ansible.com/ansible/2.9/reference_appendices/interpreter_discovery.html
-interpreter_python = auto_silent
-# strategy = free # DO NOT ENABLE.
-#
-# As of 2019-08-07, include_tasks is very broken with the free strategy.
-# tasks will not be run for some hosts, or "when" rules ignored and run
-# for the wrong host, or some hosts a task will be run twice. Even if we
-# switch to import_tasks, I wouldn't trust using this until that bug is
-# found and fixed. repro: tested with 2.7.4, (no bug reports or fixes
-# found), Running just the common role, then searching for which hosts
-# an install.yml included role got run using
-#
-# f() { awk '/xfsprogs/ { x = 1; next }; /^TASK/ { x = 0 }; x && /\[/ { print }' $1 | sort | uniq -c | pee cat wc; }
-# f LOGFILE
-host_key_checking = False
-display_skipped_hosts = False
-retry_files_enabled = False
-# readable output
-stdout_callback = yaml
-# Our logs are already pretty big. You can temporarily uncomment to enable
-# profiling info in the logs.
-#callback_whitelist = timer, profile_tasks, profile_roles
-
-# Ansible suggests using the file module instead of chmod, but then it
-# follows symlinks without an option to turn it off, which is completely
-# braindead and screwed up my system.
-command_warnings=False
-
-[ssh_connection]
-pipelining = True
-retries = 2
-
-[colors]
-# found in color.py. default blue is hard to read on a black background
-verbose = bright blue
+++ /dev/null
-# iank: 1.1 because prometheus is configered elsewhere to use fqdn, which maps
-prometheus_web_listen_address: "127.0.1.1:9090"
+++ /dev/null
-#!/bin/bash
-# Copyright (C) 2019 Ian Kelling
-# SPDX-License-Identifier: AGPL-3.0-or-later
-
-source /a/bin/errhandle/err
-
-[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"
-
-# shellcheck source=/a/bin/ds/.bashrc
-export LC_USEBASHRC=t; if [[ -s ~/.bashrc ]]; then . ~/.bashrc; fi
-
-# dependency of node exporter, per README.md
-pi python3-passlib
-
-# after running ansible, run
-# conflink
-# ser restart prometheus
+++ /dev/null
----
-- hosts: localhost
- roles:
- - role: prom
- tags: a
- prometheus_targets:
- node:
- - targets:
- - "{{ ansible_fqdn }}:9100"
- prometheus_scrape_configs:
- - job_name: "prometheus"
- metrics_path: "{{ prometheus_metrics_path }}"
- static_configs:
- - targets:
- - "{{ ansible_fqdn }}:9090"
- - job_name: "node"
- basic_auth:
- username: prom
- password_file: /etc/prometheus-pass
- #scheme: "https"
- file_sd_configs:
- - files:
- - "{{ prometheus_config_dir }}/file_sd/node.yml"
- # added because of warning in log
- prometheus_alertmanager_config:
- - static_configs:
- - targets:
- - "{{ ansible_fqdn }}:9093"
-
-
- - role: node-exporter
- tags: a
- # node_exporter_tls_server_config:
- # cert_file: /etc/node_exporter/fullchain.pem
- # key_file: /etc/node_exporter/privkey.pem
- node_exporter_web_listen_address: "127.0.1.1:9100"
- node_exporter_basic_auth_users:
- prom: "incarnadine.bloodied.maker"
-
- - role: alertmanager
- alertmanager_smtp:
- smarthost: 'mx.iankelling.org:587'
- from: "alerts@iankelling.org"
- require_tls: false
- hello: 'defaultnn.b8.nz'
- alertmanager_route:
- receiver: defaultreceiver
- repeat_interval: 7d
- alertmanager_receivers:
- - name: defaultreceiver
- email_configs:
- - to: alerts@iankelling.org
- send_resolved: true
- # the html was a bit ugly and just a huge waste of text,
- # https://github.com/prometheus/alertmanager/issues/2232
- # lead me to find a convenient text option to use
- html:
- text: '{% raw -%}{{ template "opsgenie.default.description" . }}{% endraw -%}'
- alertmanager_web_listen_address: '127.0.1.1:9093'
-
- - role: grafana
- grafana_address: "127.0.1.1"
- # iank: playbook will halt if no password is set. this is only
- # available to localhost, so i dont really care, but might as well
- # generate a pass isntead of putting in pw123 etc.
- grafana_security: { admin_user: admin, admin_password: spheroid.recantation.shank }
- grafana_datasources:
- - name: prometheus
- type: prometheus
- access: proxy
- url: 'http://{{ prometheus_web_listen_address }}'
- basicAuth: false
- isDefault: true
- # This is based on looking at highly downloaded dashboards here
- # https://grafana.com/dashboards?dataSource=prometheus&collector=nodeExporter
- # Which is where you are lead to from
- # https://prometheus.io/docs/visualization/grafana/
- grafana_dashboards:
- - dashboard_id: 1860
- revision_id: 21
- datasource: prometheus
- - dashboard_id: 405
- revision_id: 8
- datasource: prometheus
l=$(losetup -j $1 | sed -rn 's/^([^ ]+): .*/\1/p' | head -n1 ||:)
if [[ ! $l ]]; then
l=$(sudo losetup -f)
- sudo losetup $l $1
+ m sudo losetup $l $1
fi
if ! sudo cryptsetup status /dev/mapper/$base &>/dev/null; then
if ! sudo cryptsetup luksOpen $l $base; then
- sudo losetup -d $l
+ m sudo losetup -d $l
return 1
fi
fi
- sudo mkdir -p /mnt/$base
- sudo mount /dev/mapper/$base /mnt/$base
- sudo chown $USER:$USER /mnt/$base
+ m sudo mkdir -p /mnt/$base
+ m sudo mount /dev/mapper/$base /mnt/$base
+ m sudo chown $USER:$USER /mnt/$base
else
base=$1
if mountpoint /mnt/$base &>/dev/null; then
- sudo umount /mnt/$base
+ m sudo umount /mnt/$base
fi
if sudo cryptsetup status /dev/mapper/$base &>/dev/null; then
- if ! sudo cryptsetup luksClose /dev/mapper/$base; then
+ if ! m sudo cryptsetup luksClose /dev/mapper/$base; then
echo lom: failed cryptsetup luksClose /dev/mapper/$base
return 1
fi
fi
- l=$(losetup -j $1 | sed -rn 's/^([^ ]+): .*/\1/p' | head -n1 ||:)
+ l=$(losetup -l --noheadings | awk '$6 ~ /\/'$1'$/ {print $1}')
if [[ $l ]]; then
- sudo losetup -d $l
+ m sudo losetup -d $l
else
echo lom: warning: no loopback device found
fi
otp() {
oathtool --totp -b "$*" | xclip -selection clipboard
}
+j() {
+ "$@" |& pee "xclip -r -selection clipboard"
+}
pakaraoke() {
if [[ -e /var/lib/znc ]] && getent group znc; then
s chown -R znc:znc /var/lib/znc
fi
- f=/etc/prometheus-htpasswd
+ for f in /etc/prometheus-{,export-}htpasswd; do
+ if [[ -e $f ]]; then
+ s chmod 640 $f
+ if getent passwd www-data; then
+ s chown root:www-data $f
+ fi
+ fi
+ done
+ f=/etc/prometheus-pass
if [[ -e $f ]]; then
+ # note: this is duplicative of the file's own permissions
s chmod 640 $f /etc/prometheus-pass
- s chown root:www-data $f
if getent passwd prometheus; then
- s chown root:prometheus /etc/prometheus-pass
+ s chown root:prometheus $f
fi
fi
+
##### end special extra stuff #####
m sudo -H -u user2 "${BASH_SOURCE[0]}"
;;
esac
+### begin prometheus ###
+
+case $HOSTNAME in
+ kd)
+ # I've got these, plus a needed dependency, pinned to bullseye, just to get
+ # versions more in line with the main docs.
+ pi prometheus-alertmanager prometheus prometheus-node-exporter
+ web-conf -p 9091 -f 9090 - apache2 i.b8.nz <<'EOF'
+<Location "/">
+AuthType Basic
+AuthName "basic_auth"
+# created with
+# htpasswd -c prometheus-htpasswd USERNAME
+AuthUserFile "/etc/prometheus-htpasswd"
+Require valid-user
+</Location>
+EOF
+ ;;
+ *)
+ pi prometheus-node-exporter
+ ;;
+esac
+
+case $HOSTNAME in
+ # frodo needs upgrade first.
+ frodo) : ;;
+ # todo, for limiting node exporter http,
+ # either use iptables or, in
+ # /etc/default/prometheus-node-exporter
+ # listen on the wireguard interface
+ ;;
+ li|je|bk)
+ # ex for exporter
+ web-conf -p 9101 -f 9100 - apache2 ${HOSTNAME}ex.b8.nz <<'EOF'
+<Location "/">
+AuthType Basic
+AuthName "basic_auth"
+# created with
+# htpasswd -c prometheus-export-htpasswd USERNAME
+AuthUserFile "/etc/prometheus-export-htpasswd"
+Require valid-user
+</Location>
+EOF
+ ;;
+ *)
+ wgip=$(sudo sed -rn 's,^ *Address *= *([^/]+).*,\1,p' /etc/wireguard/wghole.conf)
+ web-conf -i -a $wgip -p 9101 -f 9100 - apache2 ${HOSTNAME}wg.b8.nz <<'EOF'
+<Location "/">
+AuthType Basic
+AuthName "basic_auth"
+# created with
+# htpasswd -c prometheus-export-htpasswd USERNAME
+AuthUserFile "/etc/prometheus-export-htpasswd"
+Require valid-user
+</Location>
+EOF
+ ;;
+esac
+
+### end prometheus ###
+
+
end_msg <<'EOF'
In mate settings settings, change scrolling to two-finger,
because the default edge scroll doesn\'t work. Originally found this in debian.
--- /dev/null
+Package: prometheus-alertmanager prometheus prometheus-node-exporter libjs-jquery
+Pin: release n=bullseye,o=Debian
+Pin-Priority: 500
--- /dev/null
+# iank: initial file from 2.24, added to empty ARGS.
+
+# Set the command-line arguments to pass to the server.
+
+ARGS="--web.listen-address=127.0.0.1:9090"
+
+
+
+# Prometheus supports the following options:
+# --config.file="/etc/prometheus/prometheus.yml"
+# Prometheus configuration file path.
+# --web.listen-address="0.0.0.0:9090"
+# Address to listen on for UI, API, and telemetry.
+# --web.read-timeout=5m Maximum duration before timing out read of the
+# request, and closing idle connections.
+# --web.max-connections=512 Maximum number of simultaneous connections.
+# --web.external-url=<URL> The URL under which Prometheus is externally
+# reachable (for example, if Prometheus is served
+# via a reverse proxy). Used for generating
+# relative and absolute links back to Prometheus
+# itself. If the URL has a path portion, it will
+# be used to prefix all HTTP endpoints served by
+# Prometheus. If omitted, relevant URL components
+# will be derived automatically.
+# --web.route-prefix=<path> Prefix for the internal routes of web endpoints.
+# Defaults to path of --web.external-url.
+# --web.local-assets="/usr/share/prometheus/web/"
+# Path to static asset/templates directory.
+# --web.user-assets=<path> Path to user asset directory, available at
+# /user.
+# --web.enable-lifecycle Enable shutdown and reload via HTTP request.
+# --web.enable-admin-api Enable API endpoints for admin control actions.
+# --web.console.templates="/etc/prometheus/consoles"
+# Path to the console template directory,
+# available at /consoles.
+# --web.console.libraries="/etc/prometheus/console_libraries"
+# Path to the console library directory.
+# --web.page-title="Prometheus Time Series Collection and Processing Server"
+# Document title of Prometheus instance.
+# --web.cors.origin=".*" Regex for CORS origin. It is fully anchored.
+# Example: 'https?://(domain1|domain2)\.com'
+# --storage.tsdb.path="/var/lib/prometheus/metrics2/"
+# Base path for metrics storage.
+# --storage.tsdb.retention=15d
+# [DEPRECATED] How long to retain samples in
+# storage. This flag has been deprecated, use
+# "storage.tsdb.retention.time" instead
+# --storage.tsdb.retention.time=15d
+# How long to retain samples in storage. When this
+# flag is set it overrides
+# "storage.tsdb.retention".
+# If neither this flag nor "storage.tsdb.retention"
+# nor "storage.tsdb.retention.size" is set, the
+# retention time defaults to 15d.
+# Units Supported: y, w, d, h, m, s, ms.
+# --storage.tsdb.retention.size=
+# [EXPERIMENTAL] Maximum number of bytes that can
+# be stored for blocks. Units supported: KB, MB,
+# GB, TB, PB. This flag is experimental and can be
+# changed in future releases.
+# --storage.tsdb.use-lockfile
+# Create a lockfile in data directory.
+# --storage.tsdb.allow-overlapping-blocks
+# [EXPERIMENTAL] Allow overlapping blocks, which
+# in turn enables vertical compaction and
+# vertical query merge.
+# --storage.tsdb.wal-compression
+# Compress the tsdb WAL.
+# --storage.remote.flush-deadline=<duration>
+# How long to wait flushing sample on shutdown or
+# config reload.
+# --storage.remote.read-sample-limit=5e7
+# Maximum overall number of samples to return via
+# the remote read interface, in a single query. 0
+# means no limit. This limit is ignored for
+# streamed response types.
+# --storage.remote.read-concurrent-limit=10
+# Maximum number of concurrent remote read calls.
+# 0 means no limit.
+# --storage.remote.read-max-bytes-in-frame=1048576
+# Maximum number of bytes in a single frame for
+# streaming remote read response types before
+# marshalling. Note that client might have limit on
+# frame size as well. 1MB as recommended by
+# protobuf by default.
+# --rules.alert.for-outage-tolerance=1h
+# Max time to tolerate prometheus outage for
+# restoring "for" state of alert.
+# --rules.alert.for-grace-period=10m
+# Minimum duration between alert and restored "for"
+# state. This is maintained only for alerts with
+# configured "for" time greater than grace period.
+# --rules.alert.resend-delay=1m
+# Minimum amount of time to wait before resending
+# an alert to Alertmanager.
+# --alertmanager.notification-queue-capacity=10000
+# The capacity of the queue for pending
+# Alertmanager notifications.
+# --alertmanager.timeout=10s
+# Timeout for sending alerts to Alertmanager.
+# --query.lookback-delta=5m The maximum lookback duration for retrieving
+# metrics during expression evaluations and
+# federation.
+# --query.timeout=2m Maximum time a query may take before being
+# aborted.
+# --query.max-concurrency=20
+# Maximum number of queries executed concurrently.
+# --query.max-samples=50000000
+# Maximum number of samples a single query can load
+# into memory. Note that queries will fail if they
+# try to load more samples than this into memory,
+# so this also limits the number of samples a query
+# can return.
+# --log.level=info Only log messages with the given severity or
+# above. One of: [debug, info, warn, error]
+# --log.format=logfmt Output format of log messages. One of: [logfmt,
+# json]
--- /dev/null
+# Set the command-line arguments to pass to the server.
+# default:
+#ARGS=""
+
+# iank:
+ARGS="--web.listen-address=127.0.0.1:9093"
+
+# this file is from version 0.21
+
+# The alert manager supports the following options:
+
+# --config.file="/etc/prometheus/alertmanager.yml"
+# Alertmanager configuration file name.
+# --storage.path="/var/lib/prometheus/alertmanager/"
+# Base path for data storage.
+# --data.retention=120h
+# How long to keep data for.
+# --alerts.gc-interval=30m
+# Interval between alert GC.
+# --log.level=info
+# Only log messages with the given severity or above.
+# --web.external-url=WEB.EXTERNAL-URL
+# The URL under which Alertmanager is externally reachable (for example,
+# if Alertmanager is served via a reverse proxy). Used for generating
+# relative and absolute links back to Alertmanager itself. If the URL has
+# a path portion, it will be used to prefix all HTTP endpoints served by
+# Alertmanager. If omitted, relevant URL components will be derived
+# automatically.
+# --web.route-prefix=WEB.ROUTE-PREFIX
+# Prefix for the internal routes of web endpoints. Defaults to path of
+# --web.external-url.
+# --web.listen-address=":9093"
+# Address to listen on for the web interface and API.
+# --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
+# Path to static UI directory.
+# --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
+# Path to default notification template.
+# --cluster.listen-address="0.0.0.0:9094"
+# Listen address for cluster.
+# --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
+# Explicit address to advertise in cluster.
+# --cluster.peer=CLUSTER.PEER ...
+# Initial peers (may be repeated).
+# --cluster.peer-timeout=15s
+# Time to wait between peers to send notifications.
+# --cluster.gossip-interval=200ms
+# Interval between sending gossip messages. By lowering this value (more
+# frequent) gossip messages are propagated across the cluster more
+# quickly at the expense of increased bandwidth.
+# --cluster.pushpull-interval=1m0s
+# Interval for gossip state syncs. Setting this interval lower (more
+# frequent) will increase convergence speeds across larger clusters at
+# the expense of increased bandwidth usage.
+# --cluster.tcp-timeout=10s Timeout for establishing a stream connection
+# with a remote node for a full state sync, and for stream read and write
+# operations.
+# --cluster.probe-timeout=500ms
+# Timeout to wait for an ack from a probed node before assuming it is
+# unhealthy. This should be set to 99-percentile of RTT (round-trip time)
+# on your network.
+# --cluster.probe-interval=1s
+# Interval between random node probes. Setting this lower (more frequent)
+# will cause the cluster to detect failed nodes more quickly at the
+# expense of increased bandwidth usage.
+# --cluster.settle-timeout=1m0s
+# Maximum time to wait for cluster connections to settle before
+# evaluating notifications.
+# --cluster.reconnect-interval=10s
+# Interval between attempting to reconnect to lost peers.
+# --cluster.reconnect-timeout=6h0m0s
+# Length of time to attempt to reconnect to a lost peer.
--- /dev/null
+# Set the command-line arguments to pass to the server.
+# Due to shell escaping, to pass backslashes for regexes, you need to double
+# them (\\d for \d). If running under systemd, you need to double them again
+# (\\\\d to mean \d), and escape newlines too.
+ARGS="--web.listen-address=127.0.0.1:9100"
+
+# prometheus-node-exporter supports the following options:
+#
+# --collector.arp
+# Enable the arp collector (default: enabled).
+# --collector.bcache
+# Enable the bcache collector (default: enabled).
+# --collector.bcache.priorityStats
+# Expose expensive priority stats.
+# --collector.bonding
+# Enable the bonding collector (default: enabled).
+# --collector.btrfs
+# Enable the btrfs collector (default: enabled).
+# --collector.buddyinfo
+# Enable the buddyinfo collector (default: disabled).
+# --collector.conntrack
+# Enable the conntrack collector (default: enabled).
+# --collector.cpu
+# Enable the cpu collector (default: enabled).
+# --collector.cpu.info
+# Enables metric cpu_info.
+# --collector.cpu.info.bugs-include=COLLECTOR.CPU.INFO.BUGS-INCLUDE
+# Filter the `bugs` field in cpuInfo with a value that must be a regular
+# expression.
+# --collector.cpu.info.flags-include=COLLECTOR.CPU.INFO.FLAGS-INCLUDE
+# Filter the `flags` field in cpuInfo with a value that must be a regular
+# expression.
+# --collector.cpufreq
+# Enable the cpufreq collector (default: enabled).
+# --collector.disable-defaults
+# Set all collectors to disabled by default.
+# --collector.diskstats
+# Enable the diskstats collector (default: enabled).
+# --collector.diskstats.ignored-devices="^(ram|loop|fd|(h|s|v|xv)d[a-z]|nvme\\d+n\\d+p)\\d+$"
+# Regexp of devices to ignore for diskstats.
+# --collector.drbd
+# Enable the drbd collector (default: disabled).
+# --collector.edac
+# Enable the edac collector (default: enabled).
+# --collector.entropy
+# Enable the entropy collector (default: enabled).
+# --collector.fibrechannel
+# Enable the fibrechannel collector (default: enabled).
+# --collector.filefd
+# Enable the filefd collector (default: enabled).
+# --collector.filesystem
+# Enable the filesystem collector (default: enabled).
+# --collector.filesystem.ignored-fs-types="^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
+# Regexp of filesystem types to ignore for filesystem collector.
+# --collector.filesystem.ignored-mount-points="^/(dev|proc|run|sys|mnt|media|var/lib/docker/.+)($|/)"
+# Regexp of mount points to ignore for filesystem collector.
+# --collector.hwmon
+# Enable the hwmon collector (default: enabled).
+# --collector.infiniband
+# Enable the infiniband collector (default: enabled).
+# --collector.interrupts
+# Enable the interrupts collector (default: disabled).
+# --collector.ipvs
+# Enable the ipvs collector (default: enabled).
+# --collector.ipvs.backend-labels="local_address,local_port,remote_address,remote_port,proto,local_mark"
+# Comma separated list for IPVS backend stats labels.
+# --collector.ksmd
+# Enable the ksmd collector (default: disabled).
+# --collector.loadavg
+# Enable the loadavg collector (default: enabled).
+# --collector.logind
+# Enable the logind collector (default: disabled).
+# --collector.mdadm
+# Enable the mdadm collector (default: enabled).
+# --collector.meminfo
+# Enable the meminfo collector (default: enabled).
+# --collector.meminfo_numa
+# Enable the meminfo_numa collector (default: disabled).
+# --collector.mountstats
+# Enable the mountstats collector (default: disabled).
+# --collector.netclass
+# Enable the netclass collector (default: enabled).
+# --collector.netclass.ignored-devices="^$"
+# Regexp of net devices to ignore for netclass collector.
+# --collector.netdev
+# Enable the netdev collector (default: enabled).
+# --collector.netdev.device-exclude="^lo$"
+# Regexp of net devices to exclude (mutually exclusive to device-include).
+# --collector.netdev.device-include=COLLECTOR.NETDEV.DEVICE-INCLUDE
+# Regexp of net devices to include (mutually exclusive to device-exclude).
+# --collector.netstat
+# Enable the netstat collector (default: enabled).
+# --collector.netstat.fields="^(.*_(InErrors|InErrs)|Ip_Forwarding|Ip(6|Ext)_(InOctets|OutOctets)|Icmp6?_(InMsgs|OutMsgs)|TcpExt_(Listen.*|Syncookies.*|TCPSynRetrans)|Tcp_(ActiveOpens|InSegs|OutSegs|OutRsts|PassiveOpens|RetransSegs|CurrEstab)|Udp6?_(InDatagrams|OutDatagrams|NoPorts|RcvbufErrors|SndbufErrors))$"
+# Regexp of fields to return for netstat collector.
+# --collector.network_route
+# Enable the network_route collector (default: disabled).
+# --collector.nfs
+# Enable the nfs collector (default: enabled).
+# --collector.nfsd
+# Enable the nfsd collector (default: enabled).
+# --collector.ntp
+# Enable the ntp collector (default: disabled).
+# --collector.ntp.ip-ttl=1
+# IP TTL to use while sending NTP query.
+# --collector.ntp.local-offset-tolerance=1ms
+# Offset between local clock and local ntpd time to tolerate.
+# --collector.ntp.max-distance=3.46608s
+# Max accumulated distance to the root.
+# --collector.ntp.protocol-version=4
+# NTP protocol version.
+# --collector.ntp.server-is-local
+# Certify that collector.ntp.server address is not a public ntp server.
+# --collector.ntp.server="127.0.0.1"
+# NTP server to use for ntp collector.
+# --collector.perf
+# Enable the perf collector (default: disabled).
+# --collector.perf.cpus=""
+# List of CPUs from which perf metrics should be collected.
+# --collector.perf.tracepoint=COLLECTOR.PERF.TRACEPOINT...
+# Perf tracepoint that should be collected.
+# --collector.powersupply.ignored-supplies="^$"
+# Regexp of power supplies to ignore for powersupplyclass collector.
+# --collector.powersupplyclass
+# Enable the powersupplyclass collector (default: enabled).
+# --collector.pressure
+# Enable the pressure collector (default: enabled).
+# --collector.processes
+# Enable the processes collector (default: disabled).
+# --collector.qdisc
+# Enable the qdisc collector (default: disabled).
+# --collector.qdisc.fixtures=""
+# Test fixtures to use for qdisc collector end-to-end testing.
+# --collector.rapl
+# Enable the rapl collector (default: enabled).
+# --collector.runit
+# Enable the runit collector (default: disabled).
+# --collector.runit.servicedir="/etc/service"
+# Path to runit service directory.
+# --collector.schedstat
+# Enable the schedstat collector (default: enabled).
+# --collector.sockstat
+# Enable the sockstat collector (default: enabled).
+# --collector.softnet
+# Enable the softnet collector (default: enabled).
+# --collector.stat
+# Enable the stat collector (default: enabled).
+# --collector.supervisord
+# Enable the supervisord collector (default: disabled).
+# --collector.supervisord.url="http://localhost:9001/RPC2"
+# XML RPC endpoint.
+# --collector.systemd
+# Enable the systemd collector (default: enabled).
+# --collector.systemd.enable-restarts-metrics
+# Enables service unit metric service_restart_total.
+# --collector.systemd.enable-start-time-metrics
+# Enables service unit metric unit_start_time_seconds.
+# --collector.systemd.enable-task-metrics
+# Enables service unit tasks metrics unit_tasks_current and unit_tasks_max.
+# --collector.systemd.unit-exclude=".+\\.(automount|device|mount|scope|slice|target)"
+# Regexp of systemd units to exclude. Units must both match include and not
+# match exclude to be included.
+# --collector.systemd.unit-include=".+"
+# Regexp of systemd units to include. Units must both match include and not
+# match exclude to be included.
+# --collector.tcpstat
+# Enable the tcpstat collector (default: disabled).
+# --collector.textfile
+# Enable the textfile collector (default: enabled).
+# --collector.textfile.directory="/var/lib/prometheus/node-exporter"
+# Directory to read text files with metrics from.
+# --collector.thermal_zone
+# Enable the thermal_zone collector (default: enabled).
+# --collector.time
+# Enable the time collector (default: enabled).
+# --collector.timex
+# Enable the timex collector (default: enabled).
+# --collector.udp_queues
+# Enable the udp_queues collector (default: enabled).
+# --collector.uname
+# Enable the uname collector (default: enabled).
+# --collector.vmstat
+# Enable the vmstat collector (default: enabled).
+# --collector.vmstat.fields="^(oom_kill|pgpg|pswp|pg.*fault).*"
+# Regexp of fields to return for vmstat collector.
+# --collector.wifi
+# Enable the wifi collector (default: disabled).
+# --collector.wifi.fixtures=""
+# Test fixtures to use for wifi collector metrics.
+# --collector.xfs
+# Enable the xfs collector (default: enabled).
+# --collector.zfs
+# Enable the zfs collector (default: enabled).
+# --collector.zoneinfo
+# Enable the zoneinfo collector (default: disabled).
+# --log.format=logfmt
+# Output format of log messages. One of: [logfmt, json].
+# --log.level=info
+# Only log messages with the given severity or above. One of: [debug, info,
+# warn, error].
+# --path.procfs="/proc"
+# Procfs mountpoint.
+# --path.rootfs="/"
+# Rootfs mountpoint.
+# --path.sysfs="/sys"
+# Sysfs mountpoint.
+# --web.config=""
+# [EXPERIMENTAL] Path to config yaml file that can enable TLS or
+# authentication.
+# --web.disable-exporter-metrics
+# Exclude metrics about the exporter itself (promhttp_*, process_*, go_*).
+# --web.listen-address=":9100"
+# Address on which to expose metrics and web interface.
+# --web.max-requests=40
+# Maximum number of parallel scrape requests. Use 0 to disable.
+# --web.telemetry-path="/metrics"
+# Path under which to expose metrics.
--- /dev/null
+# See https://prometheus.io/docs/alerting/configuration/ for documentation.
+
+global:
+ resolve_timeout: 3m
+ smtp_smarthost: mx.iankelling.org:587
+ smtp_from: alerts@iankelling.org
+ smtp_require_tls: False
+ smtp_hello: defaultnn.b8.nz
+templates:
+- '/etc/prometheus/alertmanager_templates/*.tmpl'
+receivers:
+- email_configs:
+ - html: null
+ send_resolved: true
+ text: '{{ template "opsgenie.default.description" . }}'
+ to: alerts@iankelling.org
+ name: defaultreceiver
+
+route:
+ receiver: defaultreceiver
+ repeat_interval: 5d
--- /dev/null
+- targets:
+ - kdwg:9101
+ # - sywg:9101
+ # - bk:9101
+ # - je:9101
+ # - li:9101
+ # - frodo:9101
+ # - kwwg:9101
+ # - x3wg:9101
+ # - x2wg:9101
--- /dev/null
+- targets:
+ # - bk:9101
+ # - je:9101
+ # - li:9101
--- /dev/null
+# Sample config for Prometheus.
+
+global:
+ # Attach these labels to any time series or alerts when communicating with
+ # external systems (federation, remote storage, Alertmanager).
+ external_labels:
+ monitor: kd.b8.nz
+
+# Alertmanager configuration
+alerting:
+ alertmanagers:
+ - static_configs:
+ - targets: ['localhost:9093']
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+rule_files:
+ - /etc/prometheus/rules/*.yml
+ # - "first_rules.yml"
+ # - "second_rules.yml"
+
+# Scrape configurations: Prometheus itself, plus the node exporter
+# targets listed in the file_sd files below (plain http and https jobs).
+scrape_configs:
+ # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+ - job_name: 'prometheus'
+
+ # metrics_path defaults to '/metrics'
+ # scheme defaults to 'http'.
+
+ static_configs:
+ - targets: ['localhost:9090']
+
+ - job_name: node
+ basic_auth:
+ username: prom
+ password_file: /etc/prometheus-pass
+ file_sd_configs:
+ - files:
+ - /etc/prometheus/file_sd/node.yml
+ - job_name: tlsnode
+ scheme: https
+ basic_auth:
+ username: prom
+ password_file: /etc/prometheus-pass
+ file_sd_configs:
+ - files:
+ - /etc/prometheus/file_sd/tlsnode.yml
--- /dev/null
+
+groups:
+- name: ansible managed alert rules
+ rules:
+ - alert: NodeFilesystemAlmostOutOfSpace
+ annotations:
+ description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+ only {{ printf "%.2f" $value }}% available space left.
+ summary: Filesystem has less than 5% space left.
+ expr: |-
+ (
+ node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
+ and
+ node_filesystem_readonly{job="node",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeFilesystemAlmostOutOfSpace
+ annotations:
+ description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+ only {{ printf "%.2f" $value }}% available space left.
+ summary: Filesystem has less than 3% space left.
+ expr: |-
+ (
+ node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
+ and
+ node_filesystem_readonly{job="node",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: critical
+ - alert: NodeFilesystemFilesFillingUp
+ annotations:
+ description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+ only {{ printf "%.2f" $value }}% available inodes left and is filling up.
+ summary: Filesystem is predicted to run out of inodes within the next 24 hours.
+ expr: |-
+ (
+ node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
+ and
+ predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
+ and
+ node_filesystem_readonly{job="node",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeFilesystemFilesFillingUp
+ annotations:
+ description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+ only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
+ summary: Filesystem is predicted to run out of inodes within the next 4 hours.
+ expr: |-
+ (
+ node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
+ and
+ predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
+ and
+ node_filesystem_readonly{job="node",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: critical
+ - alert: NodeFilesystemAlmostOutOfFiles
+ annotations:
+ description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+ only {{ printf "%.2f" $value }}% available inodes left.
+ summary: Filesystem has less than 5% inodes left.
+ expr: |-
+ (
+ node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
+ and
+ node_filesystem_readonly{job="node",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeFilesystemAlmostOutOfFiles
+ annotations:
+ description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has
+ only {{ printf "%.2f" $value }}% available inodes left.
+ summary: Filesystem has less than 3% inodes left.
+ expr: |-
+ (
+ node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
+ and
+ node_filesystem_readonly{job="node",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: critical
+ - alert: NodeNetworkReceiveErrs
+ annotations:
+ description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+ {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+ summary: Network interface is reporting many receive errors.
+ expr: |-
+ increase(node_network_receive_errs_total[2m]) > 10
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeNetworkTransmitErrs
+ annotations:
+ description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+ {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+ summary: Network interface is reporting many transmit errors.
+ expr: |-
+ increase(node_network_transmit_errs_total[2m]) > 10
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeHighNumberConntrackEntriesUsed
+ annotations:
+ description: '{{ $value | humanizePercentage }} of conntrack entries are used'
+ summary: Number of conntrack are getting close to the limit
+ expr: |-
+ (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+ labels:
+ severity: warning
+ # Fires when the clock offset stays beyond +/-0.05s for 10m without
+ # shrinking back toward zero (see the deriv() terms in expr).
+ - alert: NodeClockSkewDetected
+ annotations:
+ message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure
+ NTP is configured correctly on this host.
+ summary: Clock skew detected.
+ expr: |-
+ (
+ node_timex_offset_seconds > 0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) >= 0
+ )
+ or
+ (
+ node_timex_offset_seconds < -0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) <= 0
+ )
+ for: 10m
+ labels:
+ severity: warning
+ - alert: NodeClockNotSynchronising
+ annotations:
+ message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
+ on this host.
+ summary: Clock not synchronising.
+ expr: |-
+ min_over_time(node_timex_sync_status[5m]) == 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: ianktest
+ expr: node_systemd_version >= 300
+ labels:
+ severity: critical
+ annotations:
+ description: '{{ $labels.instance }} ianktest.'
+ summary: Instance {{ $labels.instance }} - ianktest
getspamdpid() {
if [[ ! $spamdpid || ! -d /proc/$spamdpid ]]; then
- spamdpid=$(systemctl status spamassassin| sed -n '/^ *Main PID:/s/[^0-9]//gp' ||:)
+ # Sets the global spamdpid to the spamassassin unit's main pid, or to
+ # empty when there is none. systemctl prints MainPID as 0 when the unit
+ # has no main process, so blank out 0 (and 1, as before) to keep the
+ # later [[ ! $spamdpid ]] "pid not found" check working.
+ spamdpid=$(systemctl show --property MainPID --value spamassassin | sed 's/^[01]$//' ||:)
fi
}
getspamdpid
+pr() {
+ cat >>/var/lib/prometheus/node-exporter/mailtest-check.prom.$$
+}
+pr <<EOF
+mailtest_check_found_spamd_pid_bool $(( ${spamdpid:-0} > 0 ))
+EOF
e spamdpid: $spamdpid
if [[ ! $spamdpid ]]; then
echo $HOSTNAME mailtest spamd pid not found. systemctl status spamassassin:
systemctl status spamassassin
fi
tmpfile=$(mktemp)
+declare -i unexpected=0
for folder in ${folders[@]}; do
for from in ${froms[@]}; do
latest=
fi
done <$tmpfile
- if [[ $latest ]]; then
+ if [[ ! $latest ]]; then
+ # 10 is an arbitrary bad value
+ unexpected+=10
+ else
to=$(awk '/^Envelope-to: / {print $2}' $latest)
last_sec=$(awk '/^Subject: / {print $4}' $latest)
if (( last_sec <= limit )); then
echo $HOSTNAME mailtest $folder $from $(date -d @$last_sec +'%a %m-%d %H:%M')
fi
+ # usec = unix seconds
+ pr <<EOF
+mailtest_check_last_usec{folder="$folder",from="$from"} $last_sec
+EOF
done
done
+if $slow; then
+ pr <<EOF
+mailtest_check_unexpected_spamd_results $unexpected
+EOF
+fi
+mv /var/lib/prometheus/node-exporter/mailtest-check.prom.$$ /var/lib/prometheus/node-exporter/mailtest-check.prom