if [[ $x ]]; then echo "$x"; else echo $l; fi;
done
}
+# Start an interactive shell with no network access: ensure the "nonet"
+# network namespace exists (created empty, so it contains no interfaces
+# except an unconfigured loopback), then exec bash inside it as iank.
+# NOTE(review): "s" is presumably a sudo wrapper defined elsewhere — confirm.
+nonet() {
+ if ! s ip netns list | grep -Fx nonet &>/dev/null; then
+ s ip netns add nonet
+ fi
+ sudo -E env /sbin/ip netns exec nonet sudo -E -u iank /bin/bash
+}
m() { printf "%s\n" "$*"; "$@"; }
a="-ahviSAXPH --specials --devices --delete --relative --exclude-from=/p/c/li-rsync-excludes"
ret=0
for h in li je bk; do
- m s rsync "$@" $a ${p[@]} /p/c/machine_specific/$h root@$h.b8.nz:/ || ret=$?
+ m s rsync "$@" $a ${p[@]} /p/c/machine_specific/$h root@$h.b8.nz:/
## only li is debian11
#p[0]=/a/opt/emacs-trisquel10
#p[1]=/a/opt/emacs-trisquel10-nox
}
vpnf() {
sudo -v
- vpncmd sudo -E -u iank env "PATH=$PATH" abrowser -no-remote -P vpn & r
+ vpncmd sudo -E -u iank env "PATH=$PATH" abrowser -no-remote -P vpn &
+ sleep 5
+ r
}
vpn2f() {
sudo -v
source $f
fi
-
+# Run the Electrum wallet as the bitcoin user, routed through the local
+# Tor socks proxy (port 9050). The .Xauthority copy lets the bitcoin
+# user open windows on our X display.
+electrum() {
+ # https://electrum.readthedocs.io/en/latest/tor.html
+ # https://github.com/spesmilo/electrum-docs/issues/129
+ s rsync -ptog --chown bitcoin:bitcoin ~/.Xauthority /var/lib/bitcoind/.Xauthority
+ sudo -u bitcoin DISPLAY=$DISPLAY XAUTHORITY=/var/lib/bitcoind/.Xauthority /a/opt/electrum-4.2.1-x86_64.AppImage -p socks5:localhost:9050
+}
+# Run the Monero GUI wallet as the bitcoin user on our X display.
+# Relies on /var/lib/bitcoind/.Xauthority already being synced (see
+# the rsync done for electrum).
+monero() {
+ sudo -u bitcoin DISPLAY=$DISPLAY XAUTHORITY=/var/lib/bitcoind/.Xauthority /a/opt/monero-gui-v0.17.3.2/monero-wallet-gui
+}
reset-konsole() {
EOF
}
+# make a page of links found in the files $@. redirect output
+# NOTE(review): "gr" is presumably a grep wrapper (the pattern uses
+# PCRE-style syntax, so likely grep -P) — confirm.
+# The rev | sort -u | rev trick dedupes URLs while ordering them by
+# their reversed text, grouping URLs that share a trailing portion.
+# NOTE(review): sed's '<br\>' replacement likely emits '<br>' (GNU sed
+# passes unknown backslash escapes through as the bare char) — confirm
+# '<br/>' wasn't intended.
+linkhtml() {
+ gr -oh 'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)' "$@" | \
+ rev | sort -u | rev | sed 's,.*,<a href="\0">\0</a><br\>,'
+}
+
reset-xscreensaver() {
# except for splash, i set these by setting gui options in
# xscreensaver-command -demo
while read -r line; do
file="${line:12}"
case $file in
- etc/prometheus/rules/iank.yml)
+ etc/prometheus/rules/iank.yml|etc/prometheus/prometheus.yml)
case $HOSTNAME in
kd)
if systemctl is-active prometheus &>/dev/null; then
--- /dev/null
+- targets:
+ - 10.2.0.1:9100
# Sample config for Prometheus.
global:
+
+
+ ## temporary for quickly testing during development. default is 60s
+ ## for both.
+ # scrape_interval: 5s
+ # evaluation_interval: 5s
+
+
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
static_configs:
- targets: ['localhost:9093']
+ - job_name: simple_node
+ file_sd_configs:
+ - files:
+ - /etc/prometheus/file_sd/simple_node.yml
- job_name: node
basic_auth:
username: prom
###### END MISC NOTES ######
+
# various queries only look at increases, so invert the up metric so we
# can better query on down.
- record: down
expr: up == bool 0
+ # convenience metric to use in multiple alert expressions
+ - record: mailtest_lag_inhibit
+ expr: present_over_time(ALERTS{alertname=~"kd_eth0_down|target_down|cmc_wan_down"}[17m]) or on() count_over_time(up{job="prometheus"}[19m]) <= 18
+
+
+ # the node_network_info here goes away when it is down,
+ # https://www.robustperception.io/absent-alerting-for-scraped-metrics
+ #
+ # What this says is: return the metric where up == 1, if there isnt also
+ # the right hand metric (with the same instance+job).
+ #
+ # aka:
+ # ! exists(operstate=up) && up
+ - alert: cmc_wan_down
+ expr: |-
+ up{instance="10.2.0.1:9100"} == 1 unless on(instance,job) node_network_info{instance="10.2.0.1:9100",device="wan",operstate="up"}
+ labels:
+ severity: day
+
+ - alert: kd_eth0_down
+ expr: |-
+ node_network_up{instance="kdwg:9101",device="eth0"} != 1
+ labels:
+ severity: day
+
# alerting on missing metrics:
# https://www.robustperception.io/absent-alerting-for-scraped-metrics
labels:
severity: warn
+ # todo: at some point, look into making mailtest-check either be resilient to the internet going down,
+ # or inhibit or group this alert with it going down.
- alert: sysd_result_fail
# not sure 30m is really needed, it prevents the alert from flapping
# i guess.
labels:
severity: prod
-# 17 minutes: if we reboot causing 1 send to fail, thats 10 minutes. we
-# test this every 5 minutes, so thats 15 minutes at most.
+# 17 minutes: We try to send every 5 minutes. if we reboot causing 1
+# send to fail, thats 10 minutes between 2 sends. we test this every 5
+# minutes, so thats 15 minutes of time we can expect for 1 failed email,
+# and 1 failed email is expected due to reboots or other tiny issues we
+# dont care about.
+#
+# cmc_wan_down etc, inhibits other alerts, but mailtest_check needs
+# additional time to recover after an outage. We can only inhibit while
+# an alert is actually firing, it doesnt affect the "for:"
+# condition. So, we have those alerts that need to be delayed be
+# conditioned on a query for that alert having not been firing in the
+# last X minutes. However, there is a special case when prometheus
+# itself was down, and so there was no alert. So, I test for missing
+# of metric that gets generated for prometheus itself. If for some
+# reason that has a problem, I could make it more conservative by
+# checking that we booted recently instead, eg:
+# time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17
- alert: mailtest_check_vps
expr: |-
- time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 17
+ time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 17 unless on() mailtest_lag_inhibit
labels:
severity: day
annotations:
summary: '17 minutes down'
- - alert: mailtest_check_unexpected_spamd_vps
- expr: |-
- mailtest_check_unexpected_spamd_results >= 1
- labels:
- severity: day
- annotations:
- summary: 'jr -u mailtest-check -e'
-
- alert: mailtest_check_mailhost
expr: |-
- time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 17
+ time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 17 unless on() mailtest_lag_inhibit
labels:
severity: day
annotations:
# 20 minutes. just allow for more due to prod alert.
- alert: mailtest_check_gnu_mailhost
expr: |-
- time() - max by (folder,from) (mailtest_check_last_usec{folder="/m/md/l/testignore", from="iank@gnu.org"}) >= 60 * 20
+ time() - max by (folder,from) (mailtest_check_last_usec{folder="/m/md/l/testignore", from="iank@gnu.org"}) >= 60 * 20 unless on() mailtest_lag_inhibit
labels:
severity: prod
annotations:
summary: '20 minutes down'
+ - alert: mailtest_check_unexpected_spamd_vps
+ expr: |-
+ mailtest_check_unexpected_spamd_results >= 1
+ labels:
+ severity: day
+ annotations:
+ summary: 'jr -u mailtest-check -e'
+
+
+ # We expect to be getting metrics, if we come up and notice we have
+ # any missing in the past, and it wasn't from a reboot, and we haven't
+ # fired any other alerts, make an alert. In testing, the count is
+ # 19 for 19 minutes, but I make it 18 just to give a bit of slack.
+ - alert: historical_missing_metric
+ expr: |-
+ count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17
+ labels:
+ severity: warn
- alert: 1pmtest
expr: hour() == 17 and minute() < 5
#
## Another way would be to detect an overall downtime:
# avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
- - alert: up_resets
- expr: |-
- resets(up[1d]) - changes(node_boot_time_seconds[1d]) > 12
- labels:
- severity: warn
- annotations:
- summary: "Target has gone down {{ $value }} times in 1 day, > 12"
+
+# However, this seems to just find too many false positives for now, so
+# commenting it out.
+
+ # - alert: up_resets
+ # expr: |-
+ # resets(up[1d]) - changes(node_boot_time_seconds[1d]) > 12
+ # labels:
+ # severity: warn
+ # annotations:
+ # summary: "Target has gone down {{ $value }} times in 1 day, > 12"
summary: Prometheus job missing (instance {{ $labels.instance }})
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
-# TODO: some hosts, notably li and MAIL_HOST, we want to alert sooner than 30m,
-# and severity to day. mail host is tricky since it roams, but I think the
-# right way to do it is to check for absence of this metric:
-# mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
- - alert: target_down
- expr: up == 0
+ - alert: lowpri_target_down
+ expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
for: 30m
labels:
severity: warn
annotations:
summary: Target down for 30m
+ - alert: target_down
+ expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
+ for: 5m
+ labels:
+ severity: day
+ annotations:
+ summary: High priority target down for 5m
- # todo: this should group with the above alert
- - alert: PrometheusAllTargetsMissing
- expr: count by (job) (up) == 0
- for: 10m
+ - alert: target_down
+ expr: absent(present_over_time(mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}[5m]))
+ for: 5m
labels:
severity: day
-# alert-group: local-prom
annotations:
- description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}"
+ summary: MAIL_HOST likely down for 5m
+
+
+# note, the next upstream metric is intentionally omitted:
+# https://github.com/samber/awesome-prometheus-alerts/issues/283
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
bindsym $mod+equal focus parent
-# move firefox to current workspace
+# move firefox to current workspace.
+# https://i3wm.org/docs/userguide.html#keybindings
+# get class with xprop, example output
+# WM_CLASS(STRING) = "irssi", "URxvt"
+# xprop |& grep WM_CLASS
bindsym $mod+w [class="abrowser"] move workspace current
bindsym $mod+e fullscreen toggle
# todo, in newer i3, make this split toggle
bindsym $mod+v split vertical
-bindsym $mod+b split horizontal
+bindsym $mod+Shift+v split horizontal
+# https://faq.i3wm.org/question/7662/reverse-perl-matches-in-criteria-in-i3-config.1.html
+# I found their regex slightly wrong. This is a hacky way to
+# ignore my irc emacs instances, their window titles
+# are irc room names. Another way would be to hack on the
+# window title, or xprop stuff, but I figure I'm switching
+# to wayland soon, lets wait and see how things work there.
+bindsym $mod+b [class="Emacs" title="^(?!#[a-zA-Z][a-zA-Z-]*$)"] move workspace current
bindsym $mod+c kill
# todo: handle errors like this:
# Mar 02 12:44:26 kw systemd[1]: exim4.service: Found left-over process 68210 (exim4) in control group while starting unit. Ignoring.
# Mar 02 12:44:26 kw systemd[1]: This usually indicates unclean termination of a previous run, or service implementation deficiencies.
+#eg: on eggs, on may 1st, ps grep for exim, 2 daemons running. 1 leftover from a month ago
+#Debian-+ 1954 1 0 36231 11560 4 Apr02 ? 00:40:25 /usr/sbin/exim4 -bd -q30m
+#Debian-+ 23058 1954 0 36821 10564 0 20:38 ? 00:00:00 /usr/sbin/exim4 -bd -q30m
# todo: harden dovecot. need to do some research. one way is for it to only listen on a wireguard vpn interface, so only clients that are on the vpn can access it.
# todo: consider hardening cups listening on 0.0.0.0
;;
*)
soff mailtest-check.service
- rm -fv /etc/cron.d/mailtest /var/lib/prometheus/node-exporter/mailtest-check.prom*
+ rm -fv /etc/cron.d/mailtest \
+ /var/lib/prometheus/node-exporter/mailtest-check.prom* \
+ /var/local/cron-errors/check-remote-mailqs*
;;
esac
if [[ $(readlink /proc/$$/ns/net) != "$(readlink /proc/$spamdpid/ns/net)" ]]; then
spamcpre="nsenter -t $spamdpid -n -m"
fi
-
+ unset results
declare -A results
# pyzor fails for our test message, so dont put useless load on their
# servers.
# example line that sed is parsing:
# (-0.1 / 5.0 requ) DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,SPF_HELO_PASS=-0.001,SPF_PASS=-0.001,TVD_SPACE_RATIO=0.001 autolearn=_AUTOLEARN
resultfile=$(mktemp)
- $spamcpre sudo -u Debian-exim spamassassin -t --cf='score PYZOR_CHECK 0' <"$latest" &>$resultfile
+ $spamcpre sudo -u Debian-exim spamassassin -D -t --cf='score PYZOR_CHECK 0' <"$latest" &>$resultfile
- raw_results="$(tail $resultfile | grep -A1 -Fx /usr/local/bin/send-test-forward | tail -n1 | sed -r 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /g')"
+ # note: on some mail, its 1 line after the send-test-forward, on others its 2 with a blank in between.
+ # I use the sed -n to filter this.
+ raw_results="$(tail $resultfile | grep -A2 -Fx /usr/local/bin/send-test-forward | tail -n+2 | sed -nr 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /gp')"
for r in $raw_results; do
case $r in
# got this in an update 2022-01. dun care
cat $resultfile
echo mailtest-check: end of spam debug results
+ # lets just handle 1 failure at a time in interactive mode.
+ if $int; then
+ echo mailtest-check: from: $from, to: $to
+ exit 0
+ fi
+
# less verbose debug output, commented since I might want it another time.
# if $int; then
# echo mailtest-check: cat $latest:
# off is in mail-setup. no reason for this to be in the rss2email block.
m systemctl --now enable btrbk.timer
else
+ files=(/sysd-mail-once/btrbk*)
+ if (( ${#files[@]} )); then
+ rm -f ${files[@]}
+ fi
m systemctl --now disable btrbk.timer
m systemctl stop rss2email.service
bindsym $mod+equal focus parent
-# move firefox to current workspace
+# move firefox to current workspace.
+# https://i3wm.org/docs/userguide.html#keybindings
+# get class with xprop, example output
+# WM_CLASS(STRING) = "irssi", "URxvt"
+# xprop |& grep WM_CLASS
bindsym $mod+w [class="abrowser"] move workspace current
bindsym $mod+e fullscreen toggle
# todo, in newer i3, make this split toggle
bindsym $mod+v split vertical
-bindsym $mod+b split horizontal
+bindsym $mod+Shift+v split horizontal
+# https://faq.i3wm.org/question/7662/reverse-perl-matches-in-criteria-in-i3-config.1.html
+bindsym $mod+b [class="Emacs" title="^(?!#[a-zA-Z][a-zA-Z-]*$)"] move workspace current
bindsym $mod+c kill
loop-file=inf
shuffle
#vo=gpu
+no-resume-playback
+no-save-position-on-quit
[s]
shuffle
bindsym $mod+equal focus parent
-# move firefox to current workspace
+# move firefox to current workspace.
+# https://i3wm.org/docs/userguide.html#keybindings
+# get class with xprop, example output
+# WM_CLASS(STRING) = "irssi", "URxvt"
+# xprop |& grep WM_CLASS
bindsym $mod+w [class="abrowser"] move workspace current
bindsym $mod+e fullscreen toggle
# todo, in newer i3, make this split toggle
bindsym $mod+v split vertical
-bindsym $mod+b split horizontal
+bindsym $mod+Shift+v split horizontal
+# https://faq.i3wm.org/question/7662/reverse-perl-matches-in-criteria-in-i3-config.1.html
+bindsym $mod+b [class="Emacs" title="^(?!#[a-zA-Z][a-zA-Z-]*$)"] move workspace current
bindsym $mod+c kill
header :contains "list-id" "<seabios.seabios.org>",
header :contains "list-id" "<freetype-devel.nongnu.org>",
header :contains "list-id" "<mailman-developers.python.org>",
+ header :contains "list-id" "<emacs-erc.gnu.org>",
header :contains "list-id" "<linux-raid.vger.kernel.org>",
header :contains "list-id" "<mailop.mailop.org>",
header :contains "list-id" "<xmonad.haskell.org>") {
header :contains "list-id" "<seabios.seabios.org>",
header :contains "list-id" "<freetype-devel.nongnu.org>",
header :contains "list-id" "<mailman-developers.python.org>",
+ header :contains "list-id" "<emacs-erc.gnu.org>",
header :contains "list-id" "<linux-raid.vger.kernel.org>",
header :contains "list-id" "<mailop.mailop.org>",
header :contains "list-id" "<xmonad.haskell.org>") {
/usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
fi
}
+# rm glob
+# Remove the given files, but do nothing when called with zero
+# arguments — e.g. when a glob matched nothing under nullglob — so we
+# never run a bare "rm -f".
+rmg() {
+ if (( $# )); then
+ rm -f "$@"
+ fi
+}
# todo, consider migrating some of these alerts into prometheus
write-status() {
# fi
# fi
# fi
-
+ else # end if $MAIL_HOST
+ rmg /home/iank/cron-errors/bounce* \
+ /home/iank/cron-errors/btrbk.timer* \
+ /home/iank/cron-errors/old-snapshot*
fi
if ip l show tunfsf &>/dev/null; then
$MAIL_HOST)
p $qmsg | loday -120 qlen
;;
+ *)
+ rmg /home/iank/cron-errors/qlen*
+ ;;
esac
begin=false