/run/user/0 /run/user/0 none rw,bind 0 0
EOF
+# TODO: consider whether this should use the new sysd-prom-fail
sd /etc/systemd/system/schrootupdate.service <<'EOF'
[Unit]
Description=schrootupdate
case $HOSTNAME in
kd)
- # ive got these + a needed dependency pinned to bullseye, just to get
- # versions more in line with the main docs.
+ /a/bin/buildscripts/prometheus
# Font awesome is needed for the alertmanager ui.
pi prometheus-alertmanager prometheus prometheus-node-exporter fonts-font-awesome
web-conf -p 9091 -f 9090 - apache2 i.b8.nz <<'EOF'
Require valid-user
</Location>
EOF
+
+ web-conf -p 9094 -f 9093 - apache2 i.b8.nz <<'EOF'
+<Location "/">
+AuthType Basic
+AuthName "basic_auth"
+# created with
+# htpasswd -c prometheus-htpasswd USERNAME
+AuthUserFile "/etc/prometheus-htpasswd"
+Require valid-user
+</Location>
+EOF
+
# by default, the alertmanager web ui is not enabled other than a page
# that suggests to use the amtool cli. that tool is good, but you cant
# silence things nearly as fast.
sysd-prom-fail-install $ser
done
- ## get upstream because it has the react ui, which has localtime, and general better usability.
- ## begin get latest upstream prometheus ###
- cd /a/opt/promdl
- url=$(curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest | jq -r '.assets[].browser_download_url | match(".*linux-amd64.tar.gz$").string')
- f=${url##*/}
- if [[ -e $f ]]; then
- timestamp=$(stat -c %Y $f)
- else
- timestamp=0
- fi
- m wget -nv -N $url
- new_timestamp=$(stat -c %Y $f)
- if [[ $timestamp != $new_timestamp || ! -e /usr/local/bin/prometheus ]]; then
- ngset
- to_rm=( !($f) )
- ngreset
- if (( ${#to_rm[@]} )); then
- rm -rf ${to_rm[@]}
- fi
- m ex $f
- dir=${f%.tar.gz}
- s install $dir/prometheus $dir/promtool /usr/local/bin
- fi
- ## end get latest upstream prometheus ###
-
;;
*)
pi prometheus-node-exporter
</Location>
EOF
# For work, i think we will just use the firewall for hosts in the main data center, and
- # apache/nginx + tls + basic auth outside of it. or consider stunnel.
-
+ # a VPN for hosts outside it.
# TODO: figure out how to detect the ping failure and try again.
# it doesn't wait for network.target, and gives this error message:
# component=cluster err="couldn't deduce an advertise address: no private IP found, explicit advertise addr not provided"
+# --config.file and --storage.path are set explicitly to the paths the
+# Debian package uses; the upstream defaults are "alertmanager.yml" and "data/".
-ARGS="--cluster.listen-address= --web.listen-address=127.0.0.1:9093"
+ARGS="--cluster.listen-address=
+--config.file=/etc/prometheus/alertmanager.yml
+--storage.path=/var/lib/prometheus/alertmanager/
+--web.listen-address=127.0.0.1:9093"
# this file is from version 0.21
# The alert manager supports the following options:
-# --config.file="/etc/prometheus/alertmanager.yml"
-# Alertmanager configuration file name.
-# --storage.path="/var/lib/prometheus/alertmanager/"
-# Base path for data storage.
-# --data.retention=120h
-# How long to keep data for.
-# --alerts.gc-interval=30m
-# Interval between alert GC.
-# --log.level=info
-# Only log messages with the given severity or above.
-# --web.external-url=WEB.EXTERNAL-URL
-# The URL under which Alertmanager is externally reachable (for example,
-# if Alertmanager is served via a reverse proxy). Used for generating
-# relative and absolute links back to Alertmanager itself. If the URL has
-# a path portion, it will be used to prefix all HTTP endpoints served by
-# Alertmanager. If omitted, relevant URL components will be derived
-# automatically.
-# --web.route-prefix=WEB.ROUTE-PREFIX
-# Prefix for the internal routes of web endpoints. Defaults to path of
-# --web.external-url.
-# --web.listen-address=":9093"
-# Address to listen on for the web interface and API.
-# --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
-# Path to static UI directory.
-# --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
-# Path to default notification template.
-# --cluster.listen-address="0.0.0.0:9094"
-# Listen address for cluster.
-# --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
-# Explicit address to advertise in cluster.
-# --cluster.peer=CLUSTER.PEER ...
-# Initial peers (may be repeated).
-# --cluster.peer-timeout=15s
-# Time to wait between peers to send notifications.
-# --cluster.gossip-interval=200ms
-# Interval between sending gossip messages. By lowering this value (more
-# frequent) gossip messages are propagated across the cluster more
-# quickly at the expense of increased bandwidth.
-# --cluster.pushpull-interval=1m0s
-# Interval for gossip state syncs. Setting this interval lower (more
-# frequent) will increase convergence speeds across larger clusters at
-# the expense of increased bandwidth usage.
-# --cluster.tcp-timeout=10s Timeout for establishing a stream connection
-# with a remote node for a full state sync, and for stream read and write
-# operations.
-# --cluster.probe-timeout=500ms
-# Timeout to wait for an ack from a probed node before assuming it is
-# unhealthy. This should be set to 99-percentile of RTT (round-trip time)
-# on your network.
-# --cluster.probe-interval=1s
-# Interval between random node probes. Setting this lower (more frequent)
-# will cause the cluster to detect failed nodes more quickly at the
-# expense of increased bandwidth usage.
-# --cluster.settle-timeout=1m0s
-# Maximum time to wait for cluster connections to settle before
-# evaluating notifications.
-# --cluster.reconnect-interval=10s
-# Interval between attempting to reconnect to lost peers.
-# --cluster.reconnect-timeout=6h0m0s
-# Length of time to attempt to reconnect to a lost peer.
+
+# --config.file="alertmanager.yml"
+# Alertmanager configuration file name.
+# --storage.path="data/" Base path for data storage.
+# --data.retention=120h How long to keep data for.
+# --alerts.gc-interval=30m Interval between alert GC.
+# --web.external-url=WEB.EXTERNAL-URL
+# The URL under which Alertmanager is externally reachable (for
+# example, if Alertmanager is served via a reverse proxy). Used
+# for generating relative and absolute links back to
+# Alertmanager itself. If the URL has a path portion, it will
+# be used to prefix all HTTP endpoints served by Alertmanager.
+# If omitted, relevant URL components will be derived
+# automatically.
+# --web.route-prefix=WEB.ROUTE-PREFIX
+# Prefix for the internal routes of web endpoints. Defaults to
+# path of --web.external-url.
+# --web.listen-address=":9093"
+# Address to listen on for the web interface and API.
+# --web.get-concurrency=0 Maximum number of GET requests processed concurrently. If
+# negative or zero, the limit is GOMAXPROC or 8, whichever is
+# larger.
+# --web.timeout=0 Timeout for HTTP requests. If negative or zero, no timeout is
+# set.
+# --cluster.listen-address="0.0.0.0:9094"
+# Listen address for cluster. Set to empty string to disable HA
+# mode.
+# --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
+# Explicit address to advertise in cluster.
+# --cluster.peer=CLUSTER.PEER ...
+# Initial peers (may be repeated).
+# --cluster.peer-timeout=15s
+# Time to wait between peers to send notifications.
+# --cluster.gossip-interval=200ms
+# Interval between sending gossip messages. By lowering this
+# value (more frequent) gossip messages are propagated across
+# the cluster more quickly at the expense of increased
+# bandwidth.
+# --cluster.pushpull-interval=1m0s
+# Interval for gossip state syncs. Setting this interval lower
+# (more frequent) will increase convergence speeds across
+# larger clusters at the expense of increased bandwidth usage.
+# --cluster.tcp-timeout=10s Timeout for establishing a stream connection with a remote
+# node for a full state sync, and for stream read and write
+# operations.
+# --cluster.probe-timeout=500ms
+# Timeout to wait for an ack from a probed node before assuming
+# it is unhealthy. This should be set to 99-percentile of RTT
+# (round-trip time) on your network.
+# --cluster.probe-interval=1s
+# Interval between random node probes. Setting this lower (more
+# frequent) will cause the cluster to detect failed nodes more
+# quickly at the expense of increased bandwidth usage.
+# --cluster.settle-timeout=1m0s
+# Maximum time to wait for cluster connections to settle before
+# evaluating notifications.
+# --cluster.reconnect-interval=10s
+# Interval between attempting to reconnect to lost peers.
+# --cluster.reconnect-timeout=6h0m0s
+# Length of time to attempt to reconnect to a lost peer.
+# --log.level=info Only log messages with the given severity or above. One of:
+# [debug, info, warn, error]
+# --log.format=logfmt Output format of log messages. One of: [logfmt, json]
+# --version Show application version.