From 01ccff895787ca94ad37d11cb93f0440a29edd7c Mon Sep 17 00:00:00 2001 From: Ian Kelling Date: Tue, 10 May 2022 05:36:36 -0400 Subject: [PATCH] mostly fixes --- brc | 6 + brc2 | 22 ++- conflink | 2 +- .../etc/prometheus/file_sd/simple_node.yml | 2 + filesystem/etc/prometheus/prometheus.yml | 12 ++ filesystem/etc/prometheus/rules/iank.yml | 126 +++++++++++++----- i3-sway/common.conf | 15 ++- mail-setup | 7 +- mailtest-check | 14 +- primary-setup | 4 + subdir_files/.config/i3/config | 10 +- subdir_files/.config/mpv/mpv.conf | 2 + subdir_files/.config/sway/config | 10 +- subdir_files/sieve/lists.sieve | 1 + subdir_files/sieve/liststest.sieve | 1 + system-status | 14 +- 16 files changed, 201 insertions(+), 47 deletions(-) create mode 100644 filesystem/etc/prometheus/file_sd/simple_node.yml diff --git a/brc b/brc index ffb7964..ebbe38e 100644 --- a/brc +++ b/brc @@ -1904,6 +1904,12 @@ psnetns() { if [[ $x ]]; then echo "$x"; else echo $l; fi; done } +nonet() { + if ! s ip netns list | grep -Fx nonet &>/dev/null; then + s ip netns add nonet + fi + sudo -E env /sbin/ip netns exec nonet sudo -E -u iank /bin/bash +} m() { printf "%s\n" "$*"; "$@"; } diff --git a/brc2 b/brc2 index d678668..491dcad 100644 --- a/brc2 +++ b/brc2 @@ -507,7 +507,7 @@ lipush() { a="-ahviSAXPH --specials --devices --delete --relative --exclude-from=/p/c/li-rsync-excludes" ret=0 for h in li je bk; do - m s rsync "$@" $a ${p[@]} /p/c/machine_specific/$h root@$h.b8.nz:/ || ret=$? 
+ m s rsync "$@" $a ${p[@]} /p/c/machine_specific/$h root@$h.b8.nz:/ ## only li is debian11 #p[0]=/a/opt/emacs-trisuqel10 #p[1]=/a/opt/emacs-trisquel10-nox @@ -1758,7 +1758,9 @@ vpncmd() { } vpnf() { sudo -v - vpncmd sudo -E -u iank env "PATH=$PATH" abrowser -no-remote -P vpn & r + vpncmd sudo -E -u iank env "PATH=$PATH" abrowser -no-remote -P vpn & + sleep 5 + r } vpn2f() { sudo -v @@ -1917,7 +1919,15 @@ if [[ -e $f ]]; then source $f fi - +electrum() { + # https://electrum.readthedocs.io/en/latest/tor.html + # https://github.com/spesmilo/electrum-docs/issues/129 + s rsync -ptog --chown bitcoin:bitcoin ~/.Xauthority /var/lib/bitcoind/.Xauthority + sudo -u bitcoin DISPLAY=$DISPLAY XAUTHORITY=/var/lib/bitcoind/.Xauthority /a/opt/electrum-4.2.1-x86_64.AppImage -p socks5:localhost:9050 +} +monero() { + sudo -u bitcoin DISPLAY=$DISPLAY XAUTHORITY=/var/lib/bitcoind/.Xauthority /a/opt/monero-gui-v0.17.3.2/monero-wallet-gui +} reset-konsole() { @@ -1944,6 +1954,12 @@ scrollbar true EOF } +# make a page of links found in the files $@. 
redirect output +linkhtml() { + gr -oh 'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)' "$@" | \ + rev | sort -u | rev | sed 's,.*,\0,' +} + reset-xscreensaver() { # except for spash, i set these by setting gui options in # xscreensaver-command -demo diff --git a/conflink b/conflink index e53605e..0f2e64c 100755 --- a/conflink +++ b/conflink @@ -93,7 +93,7 @@ common-file-setup() { while read -r line; do file="${line:12}" case $file in - etc/prometheus/rules/iank.yml) + etc/prometheus/rules/iank.yml|etc/prometheus/prometheus.yml) case $HOSTNAME in kd) if systemctl is-active prometheus &>/dev/null; then diff --git a/filesystem/etc/prometheus/file_sd/simple_node.yml b/filesystem/etc/prometheus/file_sd/simple_node.yml new file mode 100644 index 0000000..debc981 --- /dev/null +++ b/filesystem/etc/prometheus/file_sd/simple_node.yml @@ -0,0 +1,2 @@ +- targets: + - 10.2.0.1:9100 diff --git a/filesystem/etc/prometheus/prometheus.yml b/filesystem/etc/prometheus/prometheus.yml index 97ac447..fb1e537 100644 --- a/filesystem/etc/prometheus/prometheus.yml +++ b/filesystem/etc/prometheus/prometheus.yml @@ -1,6 +1,14 @@ # Sample config for Prometheus. global: + + + ## temporary for quickly testing during development. default is 60s + ## for both. + # scrape_interval: 5s + # evaluation_interval: 5s + + # Attach these labels to any time series or alerts when communicating with # external systems (federation, remote storage, Alertmanager). 
external_labels: @@ -33,6 +41,10 @@ scrape_configs: static_configs: - targets: ['localhost:9093'] + - job_name: simple_node + file_sd_configs: + - files: + - /etc/prometheus/file_sd/simple_node.yml - job_name: node basic_auth: username: prom diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index bae264e..14d4275 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -43,11 +43,37 @@ groups: ###### END MISC NOTES ###### + # various queries only look at increases, so invert the up metric so we # can better query on down. - record: down expr: up == bool 0 + # convenience metric to use in multiple alert expressions + - record: mailtest_lag_inhibit + expr: present_over_time(ALERTS{alertname=~"kd_eth0_down|target_down|cmc_wan_down"}[17m]) or on() count_over_time(up{job="prometheus"}[19m]) <= 18 + + + # the node_network_info here goes away when it is down, + # https://www.robustperception.io/absent-alerting-for-scraped-metrics + # + # What this says is: return metric if up == 1 if there isnt also + # the right hand metric (with the same instance+job). + # + # aka: + # ! exists(operstate=up) && up + - alert: cmc_wan_down + expr: |- + up{instance="10.2.0.1:9100"} == 1 unless on(instance,job) node_network_info{instance="10.2.0.1:9100",device="wan",operstate="up"} + labels: + severity: day + + - alert: kd_eth0_down + expr: |- + node_network_up{instance="kdwg:9101",device="eth0"} != 1 + labels: + severity: day + # alerting on missing metrics: # https://www.robustperception.io/absent-alerting-for-scraped-metrics @@ -86,6 +112,8 @@ groups: labels: severity: warn + # todo: at some point, look into making mailtest-check either be resilient to the internet going down, + # or inhibit or group this alert with it going down. - alert: sysd_result_fail # not sure 30m is really needed, it prevents the alert from flapping # i guess. 
@@ -106,27 +134,34 @@ groups: labels: severity: prod -# 17 minutes: if we reboot causing 1 send to fail, thats 10 minutes. we -# test this every 5 minutes, so thats 15 minutes at most. +# 17 minutes: We try to send every 5 minutes. if we reboot causing 1 +# send to fail, thats 10 minutes between 2 sends. we test this every 5 +# minutes, so thats 15 minutes of time we can expect for 1 failed email, +# and 1 failed email is expected due to reboots or other tiny issues we +# dont care about. +# +# cmc_wan_down etc, inhibits other alerts, but mailtest_check needs +# additional time to recover after an outage. We can only inhibit while +# an alert is actually firing, it doesnt affect the "for:" +# condition. So, we make the alerts that need to be delayed +# conditional on a query for the inhibiting alert not having fired in the +# last X minutes. However, there is a special case when prometheus +# itself was down, and so there was no alert. So, I test for absence +# of the metric that gets generated for prometheus itself. If for some +# reason that has a problem, I could make it more conservative by +# checking that we booted recently instead, eg: +# time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17 - alert: mailtest_check_vps expr: |- - time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 17 + time() - mailtest_check_last_usec{job="tlsnode"} >= 60 * 17 unless on() mailtest_lag_inhibit labels: severity: day annotations: summary: '17 minutes down' - - alert: mailtest_check_unexpected_spamd_vps - expr: |- - mailtest_check_unexpected_spamd_results >= 1 - labels: - severity: day - annotations: - summary: 'jr -u mailtest-check -e' - - alert: mailtest_check_mailhost expr: |- - time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 17 + time() - max by (folder,from) (mailtest_check_last_usec{job="node"}) >= 60 * 17 unless on() mailtest_lag_inhibit labels: severity: day annotations: @@ -135,12 +170,30 @@ groups: # 20 minutes. 
just allow for more due to prod alert. - alert: mailtest_check_gnu_mailhost expr: |- - time() - max by (folder,from) (mailtest_check_last_usec{folder="/m/md/l/testignore", from="iank@gnu.org"}) >= 60 * 20 + time() - max by (folder,from) (mailtest_check_last_usec{folder="/m/md/l/testignore", from="iank@gnu.org"}) >= 60 * 20 unless on() mailtest_lag_inhibit labels: severity: prod annotations: summary: '20 minutes down' + - alert: mailtest_check_unexpected_spamd_vps + expr: |- + mailtest_check_unexpected_spamd_results >= 1 + labels: + severity: day + annotations: + summary: 'jr -u mailtest-check -e' + + + # We expect to be getting metrics, if we come up and notice we have + # any missing in the past, and it wasn't from a reboot, and we haven't + # fired any other alerts, make an alert. In testing, the count is + # 19 for 19 minutes, but I make it 18 just to give a bit of slack. + - alert: historical_missing_metric + expr: |- + count_over_time(up{job="prometheus"}[19m]) <= 18 unless on() present_over_time(ALERTS[19m]) unless on() time() - node_boot_time_seconds{instance="kdwg:9101"} <= 60 * 17 + labels: + severity: warn - alert: 1pmtest expr: hour() == 17 and minute() < 5 @@ -176,13 +229,17 @@ groups: # ## Another way would be to detect an overall downtime: # avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95 - - alert: up_resets - expr: |- - resets(up[1d]) - changes(node_boot_time_seconds[1d]) > 12 - labels: - severity: warn - annotations: - summary: "Target has gone down {{ $value }} times in 1 day, > 12" + +# However, this seems to just find too many false positives for now, so +# commenting it out. 
+ + # - alert: up_resets + # expr: |- + # resets(up[1d]) - changes(node_boot_time_seconds[1d]) > 12 + # labels: + # severity: warn + # annotations: + # summary: "Target has gone down {{ $value }} times in 1 day, > 12" @@ -205,28 +262,33 @@ groups: summary: Prometheus job missing (instance {{ $labels.instance }}) description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" -# TODO: some hosts, notably li and MAIL_HOST, we want to alert sooner than 30m, -# and severity to day. mail host is tricky since it roams, but I think the -# right way to do it is to check for absence of this metric: -# mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"} - - alert: target_down - expr: up == 0 + - alert: lowpri_target_down + expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0 for: 30m labels: severity: warn annotations: summary: Target down for 30m + - alert: target_down + expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0 + for: 5m + labels: + severity: day + annotations: + summary: High priority target down for 5m - # todo: this should group with the above alert - - alert: PrometheusAllTargetsMissing - expr: count by (job) (up) == 0 - for: 10m + - alert: target_down + expr: absent(present_over_time(mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}[5m])) + for: 5m labels: severity: day -# alert-group: local-prom annotations: - description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}" + summary: MAIL_HOST likely down for 5m + + +# note, the next upstream metric is intentionally omitted: +# https://github.com/samber/awesome-prometheus-alerts/issues/283 - alert: PrometheusConfigurationReloadFailure expr: prometheus_config_last_reload_successful != 1 diff --git a/i3-sway/common.conf b/i3-sway/common.conf index 39ee9b4..0963109 100644 --- a/i3-sway/common.conf +++ b/i3-sway/common.conf @@ -19,7 +19,11 @@ bindsym $mod+6 
exec "/a/bin/redshift.sh" bindsym $mod+equal focus parent -# move firefox to current workspace +# move firefox to current workspace. +# https://i3wm.org/docs/userguide.html#keybindings +# get class with xprop, example output +# WM_CLASS(STRING) = "irssi", "URxvt" +# xprop |& grep WM_CLASS bindsym $mod+w [class="abrowser"] move workspace current bindsym $mod+e fullscreen toggle @@ -64,7 +68,14 @@ bindsym $mod+x workspace 6 # todo, in newer i3, make this split toggle bindsym $mod+v split vertical -bindsym $mod+b split horizontal +bindsym $mod+Shift+v split horizontal +# https://faq.i3wm.org/question/7662/reverse-perl-matches-in-criteria-in-i3-config.1.html +# I found their regex slightly wrong. This is a hacky way to +# ignore my irc emacs instances, their window titles +# are irc room names. Another way would be to hack on the +# window title, or xprop stuff, but I figure I'm switching +# to wayland soon, lets wait and see how things work there. +bindsym $mod+b [class="Emacs" title="^(?!#[a-zA-Z][a-zA-Z-]*$)"] move workspace current bindsym $mod+c kill diff --git a/mail-setup b/mail-setup index abd8633..9872346 100755 --- a/mail-setup +++ b/mail-setup @@ -14,6 +14,9 @@ # todo: handle errors like this: # Mar 02 12:44:26 kw systemd[1]: exim4.service: Found left-over process 68210 (exim4) in control group while starting unit. Ignoring. # Mar 02 12:44:26 kw systemd[1]: This usually indicates unclean termination of a previous run, or service implementation deficiencies. +#eg: on eggs, on may 1st, ps grep for exim, 2 daemons running. 1 leftover from a month ago +#Debian-+ 1954 1 0 36231 11560 4 Apr02 ? 00:40:25 /usr/sbin/exim4 -bd -q30m +#Debian-+ 23058 1954 0 36821 10564 0 20:38 ? 00:00:00 /usr/sbin/exim4 -bd -q30m # todo: harden dovecot. need to do some research. one way is for it to only listen on a wireguard vpn interface, so only clients that are on the vpn can access it. 
# todo: consider hardening cups listening on 0.0.0.0 @@ -3301,7 +3304,9 @@ EOFOUTER ;; *) soff mailtest-check.service - rm -fv /etc/cron.d/mailtest /var/lib/prometheus/node-exporter/mailtest-check.prom* + rm -fv /etc/cron.d/mailtest \ + /var/lib/prometheus/node-exporter/mailtest-check.prom* \ + /var/local/cron-errors/check-remote-mailqs* ;; esac diff --git a/mailtest-check b/mailtest-check index 02cdfcb..5cf79fe 100755 --- a/mailtest-check +++ b/mailtest-check @@ -148,16 +148,18 @@ EOF if [[ $(readlink /proc/$$/ns/net) != "$(readlink /proc/$spamdpid/ns/net)" ]]; then spamcpre="nsenter -t $spamdpid -n -m" fi - + unset results declare -A results # pyzor fails for our test message, so dont put useless load on their # servers. # example line that sed is parsing: # (-0.1 / 5.0 requ) DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,SPF_HELO_PASS=-0.001,SPF_PASS=-0.001,TVD_SPACE_RATIO=0.001 autolearn=_AUTOLEARN resultfile=$(mktemp) - $spamcpre sudo -u Debian-exim spamassassin -t --cf='score PYZOR_CHECK 0' <"$latest" &>$resultfile + $spamcpre sudo -u Debian-exim spamassassin -D -t --cf='score PYZOR_CHECK 0' <"$latest" &>$resultfile - raw_results="$(tail $resultfile | grep -A1 -Fx /usr/local/bin/send-test-forward | tail -n1 | sed -r 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /g')" + # note: on some mail, its 1 line after the send-test-forward, on others its 2 with a blank inbetween. + # I use the sed -n to filter this. + raw_results="$(tail $resultfile | grep -A2 -Fx /usr/local/bin/send-test-forward | tail -n+2 | sed -nr 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /gp')" for r in $raw_results; do case $r in # got this in an update 2022-01. dun care @@ -219,6 +221,12 @@ EOF cat $resultfile echo mailtest-check: end of spam debug results + # lets just handle 1 failure at a time in interactive mode. + if $int; then + echo mailtest-check: from: $from, to: $to + exit 0 + fi + # less verbose debug output, commented since I might want it another time. 
# if $int; then # echo mailtest-check: cat $latest: diff --git a/primary-setup b/primary-setup index 01ddd53..8ea0d06 100755 --- a/primary-setup +++ b/primary-setup @@ -48,6 +48,10 @@ if dpkg -s rss2email &>/dev/null; then # off is in mail-setup. no reason for this to be in the rss2email block. m systemctl --now enable btrbk.timer else + files=(/sysd-mail-once/btrbk*) + if (( ${#files[@]} )); then + rm -f ${files[@]} + fi m systemctl --now disable btrbk.timer m systemctl stop rss2email.service diff --git a/subdir_files/.config/i3/config b/subdir_files/.config/i3/config index d185b50..a99c8c3 100644 --- a/subdir_files/.config/i3/config +++ b/subdir_files/.config/i3/config @@ -19,7 +19,11 @@ bindsym $mod+6 exec "/a/bin/redshift.sh" bindsym $mod+equal focus parent -# move firefox to current workspace +# move firefox to current workspace. +# https://i3wm.org/docs/userguide.html#keybindings +# get class with xprop, example output +# WM_CLASS(STRING) = "irssi", "URxvt" +# xprop |& grep WM_CLASS bindsym $mod+w [class="abrowser"] move workspace current bindsym $mod+e fullscreen toggle @@ -64,7 +68,9 @@ bindsym $mod+x workspace 6 # todo, in newer i3, make this split toggle bindsym $mod+v split vertical -bindsym $mod+b split horizontal +bindsym $mod+Shift+v split horizontal +# https://faq.i3wm.org/question/7662/reverse-perl-matches-in-criteria-in-i3-config.1.html +bindsym $mod+b [class="Emacs" title="^(?!#[a-zA-Z][a-zA-Z-]*$)"] move workspace current bindsym $mod+c kill diff --git a/subdir_files/.config/mpv/mpv.conf b/subdir_files/.config/mpv/mpv.conf index 4de0cbe..4fc20de 100644 --- a/subdir_files/.config/mpv/mpv.conf +++ b/subdir_files/.config/mpv/mpv.conf @@ -11,6 +11,8 @@ player-operation-mode=pseudo-gui loop-file=inf shuffle #vo=gpu +no-resume-playback +no-save-position-on-quit [s] shuffle diff --git a/subdir_files/.config/sway/config b/subdir_files/.config/sway/config index fbebd2f..04ecbbd 100644 --- a/subdir_files/.config/sway/config +++ 
b/subdir_files/.config/sway/config @@ -19,7 +19,11 @@ bindsym $mod+6 exec "/a/bin/redshift.sh" bindsym $mod+equal focus parent -# move firefox to current workspace +# move firefox to current workspace. +# https://i3wm.org/docs/userguide.html#keybindings +# get class with xprop, example output +# WM_CLASS(STRING) = "irssi", "URxvt" +# xprop |& grep WM_CLASS bindsym $mod+w [class="abrowser"] move workspace current bindsym $mod+e fullscreen toggle @@ -64,7 +68,9 @@ bindsym $mod+x workspace 6 # todo, in newer i3, make this split toggle bindsym $mod+v split vertical -bindsym $mod+b split horizontal +bindsym $mod+Shift+v split horizontal +# https://faq.i3wm.org/question/7662/reverse-perl-matches-in-criteria-in-i3-config.1.html +bindsym $mod+b [class="Emacs" title="^(?!#[a-zA-Z][a-zA-Z-]*$)"] move workspace current bindsym $mod+c kill diff --git a/subdir_files/sieve/lists.sieve b/subdir_files/sieve/lists.sieve index 01f5349..aa7354c 100644 --- a/subdir_files/sieve/lists.sieve +++ b/subdir_files/sieve/lists.sieve @@ -90,6 +90,7 @@ if anyof ( header :contains "list-id" "", header :contains "list-id" "", header :contains "list-id" "", + header :contains "list-id" "", header :contains "list-id" "", header :contains "list-id" "", header :contains "list-id" "") { diff --git a/subdir_files/sieve/liststest.sieve b/subdir_files/sieve/liststest.sieve index 01f5349..aa7354c 100644 --- a/subdir_files/sieve/liststest.sieve +++ b/subdir_files/sieve/liststest.sieve @@ -90,6 +90,7 @@ if anyof ( header :contains "list-id" "", header :contains "list-id" "", header :contains "list-id" "", + header :contains "list-id" "", header :contains "list-id" "", header :contains "list-id" "", header :contains "list-id" "") { diff --git a/system-status b/system-status index 3ccde7c..47103ac 100755 --- a/system-status +++ b/system-status @@ -46,6 +46,12 @@ loday() { /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org fi } +# rm glob +rmg() { + if (( $# )); 
then + rm -f "$@" + fi +} # todo, consider migrating some of these alerts into prometheus write-status() { @@ -177,7 +183,10 @@ write-status() { # fi # fi # fi - + else # end if $MAIL_HOST + rmg /home/iank/cron-errors/bounce* \ + /home/iank/cron-errors/btrbk.timer* \ + /home/iank/cron-errors/old-snapshot* fi if ip l show tunfsf &>/dev/null; then @@ -233,6 +242,9 @@ write-status() { $MAIL_HOST) p $qmsg | loday -120 qlen ;; + *) + rmg /home/iank/cron-errors/qlen* + ;; esac begin=false -- 2.30.2