some fixes, and dns debugging on bk
authorIan Kelling <ian@iankelling.org>
Sun, 12 Jun 2022 18:10:43 +0000 (14:10 -0400)
committerIan Kelling <ian@iankelling.org>
Sun, 12 Jun 2022 18:10:43 +0000 (14:10 -0400)
filesystem/etc/prometheus/rules/iank.yml
machine_specific/bitfolk/filesystem/etc/logrotate-fast.conf [new file with mode: 0644]
machine_specific/bitfolk/filesystem/etc/systemd/system/logrotate-fast.service [new file with mode: 0644]
machine_specific/bitfolk/filesystem/etc/systemd/system/logrotate-fast.timer [new file with mode: 0644]
machine_specific/bitfolk/filesystem/etc/unbound/unbound.conf.d/ian.conf
mail-setup
mailtest-check

index 651eb00de164a8a41b84fe53d6429a5821f8f3d9..f64322b2c98ef4f48755daec9fe81f185b8c9488 100644 (file)
@@ -182,7 +182,7 @@ groups:
     labels:
       severity: day
     annotations:
-      summary: 'jr -u mailtest-check -e'
+      summary: 'jr -u mailtest-check -e -n 10000'
 
   - alert: mailtest_check_missing_dnswl
     expr: |-
@@ -191,7 +191,7 @@ groups:
     labels:
       severity: day
     annotations:
-      summary: 'jr -u mailtest-check -e'
+      summary: 'jr -u mailtest-check -e -n 10000'
 
   # We expect to be getting metrics, if we come up and notice we have
   # any missing in the past, and it wasn't from a reboot, and we haven't
@@ -203,13 +203,16 @@ groups:
     labels:
       severity: warn
 
-  - alert: 1pmtest
-    expr: hour() == 17 and minute() < 5
+  # 10 am friday US eastern daylight time: hour() is UTC, so 14:00 UTC.
+  # Start 1 minute early so it is closer to actually firing at 10 am.
+  - alert: dead_man_test
+    expr: |-
+      ( hour() == 13 and minute() >= 59 or hour() == 14 and minute() < 3 )  and day_of_week() == 5
     for: 0m
     labels:
       severity: daytest
     annotations:
-      summary: Prometheus daily test alert
+      summary: Prometheus weekly test alert
 
 
 #### Inhibit notes ####
@@ -278,6 +281,9 @@ groups:
     annotations:
       summary: Target down for 30m
 
+  # note PrometheusAllTargetsMissing is intentionally omitted because it
+  # is redundant to the above.
+
   - alert: target_down
     expr: up{instance=~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0
     for: 5m
@@ -294,10 +300,6 @@ groups:
     annotations:
       summary: MAIL_HOST likely down for 5m
 
-
-# note, the next upstream metric is intentionally omitted:
-# https://github.com/samber/awesome-prometheus-alerts/issues/283
-
   - alert: PrometheusConfigurationReloadFailure
     expr: prometheus_config_last_reload_successful != 1
     for: 30m
diff --git a/machine_specific/bitfolk/filesystem/etc/logrotate-fast.conf b/machine_specific/bitfolk/filesystem/etc/logrotate-fast.conf
new file mode 100644 (file)
index 0000000..d1170e6
--- /dev/null
@@ -0,0 +1,12 @@
+compress
+/dev/shm/u.log {
+  # dunno if this is needed but it can avoid problems.
+  delaycompress
+  su unbound unbound
+  rotate 20
+  size 10M
+  # copied from clamav
+  postrotate
+      systemctl -q is-active unbound && systemctl kill --signal=SIGHUP unbound || true
+  endscript
+}
diff --git a/machine_specific/bitfolk/filesystem/etc/systemd/system/logrotate-fast.service b/machine_specific/bitfolk/filesystem/etc/systemd/system/logrotate-fast.service
new file mode 100644 (file)
index 0000000..54bb56f
--- /dev/null
@@ -0,0 +1,31 @@
+# modified from
+# /lib/systemd/system/logrotate.service
+[Unit]
+Description=logrotate-fast
+Documentation=man:logrotate(8) man:logrotate.conf(5)
+ConditionACPower=true
+
+[Service]
+Type=oneshot
+ExecStart=/usr/sbin/logrotate /etc/logrotate-fast.conf
+
+# performance options
+Nice=19
+IOSchedulingClass=best-effort
+IOSchedulingPriority=7
+
+# hardening options
+#  details: https://www.freedesktop.org/software/systemd/man/systemd.exec.html
+#  no ProtectHome for userdir logs
+#  no PrivateNetwork for mail delivery
+#  no ProtectKernelTunables for working SELinux with systemd older than 235
+#  no MemoryDenyWriteExecute for gzip on i686
+
+# iank, commented, we need /dev
+#PrivateDevices=true
+
+PrivateTmp=true
+ProtectControlGroups=true
+ProtectKernelModules=true
+ProtectSystem=full
+RestrictRealtime=true
diff --git a/machine_specific/bitfolk/filesystem/etc/systemd/system/logrotate-fast.timer b/machine_specific/bitfolk/filesystem/etc/systemd/system/logrotate-fast.timer
new file mode 100644 (file)
index 0000000..962c7e8
--- /dev/null
@@ -0,0 +1,14 @@
+[Unit]
+Description=logrotate-fast
+
+[Timer]
+# we could programmatically get the timezone via:
+# timedatectl show --property=Timezone | sed 's/^[^=]*=//'
+# or
+# readlink /etc/localtime | sed -r 's,^.*/([^/]+/[^/]+)$,\1,'
+
+# every 5 minutes
+OnCalendar=*-*-* *:00/5:00
+
+[Install]
+WantedBy=timers.target
index 5b1cfeff0fe3511512c5823a4b25cd1fc1c79483..7117bf9eff265d76d19ad94a94895bc4a0189523 100644 (file)
@@ -13,8 +13,11 @@ server:
 ## This is very verbose, fills up 4g of logs in 8 hours on bk.b8.nz. I think
 ## it leads to spamassassin dns timeout (1 second) when the system first
 ## starts.
-#verbosity: 4
-#
+verbosity: 4
+logfile: /dev/shm/u.log
+log-time-ascii: yes
+
+
 
 interface: 127.0.0.1
 interface: ::1
index de9db482acad97f7c8cb4b23e4b856c7a8d87994..5d184932481fdbafee2d9c5e63be3584446df8dc 100755 (executable)
@@ -1249,13 +1249,6 @@ warn
   add_header = X-Spam_report: $spam_report
   add_header = X-Spam_action: $spam_action
 
-warn
-  !hosts = +iank_trusted
-  !authenticated = plain_server:login_server
-  condition = ${if def:malware_name}
-  remove_header = Subject:
-  add_header = Subject: [Clamav warning: $malware_name] $h_subject
-  log_message = heuristic malware warning: $malware_name
 
 #accept
 #  spf = pass:fail:softfail:none:neutral:permerror:temperror
@@ -2281,8 +2274,10 @@ fwrite(STDOUT, "<?php\n\\\$CONFIG = ");
 var_export(\$CONFIG);
 fwrite(STDOUT, ";\n");
 EOF
-    m php tmp.php >config.php
-    m rm -f tmp.php
+    e running php tmp.php
+    php tmp.php >config.php
+    # leave in place for debugging
+    #m rm -f tmp.php
     m sudo -u www-data php $ncdir/occ maintenance:update:htaccess
     list=$(sudo -u www-data php $ncdir/occ --output=json_pretty app:list)
     # user_external not compaible with nc 23
@@ -2316,14 +2311,12 @@ EOF
     systemctl enable --now $ncbase.timer
     i /usr/local/bin/ncup <<'EOFOUTER'
 #!/bin/bash
-if ! test "$BASH_VERSION"; then echo "error: shell is not bash" >&2; exit 1; fi
-shopt -s inherit_errexit 2>/dev/null ||: # ignore fail in bash < 4.4
-set -eE -o pipefail
-trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" exit status: $?, PIPESTATUS: ${PIPESTATUS[*]}" >&2' ERR
 
-ncbase=$1
-if ! php /var/www/$ncbase/updater/updater.phar -n; then
-  echo failed nextcloud update for $ncbase >&2
+source /usr/local/lib/err
+
+m() { printf "%s\n" "$*";  "$@"; }
+err-cleanup() {
+echo failed nextcloud update for $ncbase >&2
   /sbin/exim -t <<EOF
 To: alerts@iankelling.org
 From: root@$(hostname -f)
@@ -2331,7 +2324,23 @@ Subject: failed nextcloud update for $ncbase
 
 For logs, run: jr -u $ncbase
 EOF
+}
+
+if [[ $(id -u -n) != www-data ]]; then
+  echo error: running as wrong user: $(id -u -n), expected www-data
+  exit 1
 fi
+
+if [[ ! $1 ]]; then
+  echo error: expected an arg, nextcloud relative base dir
+  exit 1
+fi
+
+ncbase=$1
+echo running: php /var/www/$ncbase/updater/updater.phar -n
+m php /var/www/$ncbase/updater/updater.phar -n
+cd /var/www/$ncbase
+m php occ -n upgrade
 EOFOUTER
     chmod +x /usr/local/bin/ncup
 
@@ -2523,6 +2532,18 @@ EOF
   # ** $MAIL_HOST|bk)
   $MAIL_HOST|bk)
 
+
+    # no clamav on je, it has 1.5g memory and clamav uses most of it
+    i /etc/exim4/conf.d/clamav_data_acl <<'EOF'
+warn
+!hosts = +iank_trusted
+!authenticated = plain_server:login_server
+condition = ${if def:malware_name}
+remove_header = Subject:
+add_header = Subject: [Clamav warning: $malware_name] $h_subject
+log_message = heuristic malware warning: $malware_name
+EOF
+
     cat >>/etc/exim4/conf.d/main/000_local <<EOF
 # je.b8.nz will run out of memory with freshclam
 av_scanner = clamd:/var/run/clamav/clamd.ctl
@@ -2867,6 +2888,7 @@ EOF
     echo|i /etc/exim4/conf.d/rcpt_local_acl
     echo|i /etc/exim4/conf.d/router/890_backup_copy
     echo|i /etc/exim4/conf.d/main/000_local-nn
+    echo|i /etc/exim4/conf.d/clamav_data_acl
 
 
     if $bhost_t; then
@@ -3166,6 +3188,13 @@ case $HOSTNAME in
     ;;&
 esac
 
+# for debugging dns issues. NOTE(review): confirm je also has logrotate-fast.timer installed.
+case $HOSTNAME in
+  je|bk)
+    systemctl enable --now logrotate-fast.timer
+    ;;
+esac
+
 # last use of $reload happens in previous block
 rm -f /var/local/mail-setup-reload
 
@@ -3242,7 +3271,8 @@ EOF
     test_tos=(testignore@expertpathologyreview.com testignore@je.b8.nz testignore@amnimal.ninja jtuttle@gnu.org)
 
     cat >>/etc/cron.d/mailtest <<EOF
-0   13 * * *  root echo "1pm alert. You are not in the matrix."
+# 10 am friday
+0   10 * * 5  root echo "weekly alert. You are not in the matrix."
 2   * * * *   root check-remote-mailqs |& log-once check-remote-mailqs
 EOF
     ;;&
index c7d40c17502e967b808e2e251a6b50b63cb1464c..cce5908fb78267d53c06a8bb224df0248b40daca 100755 (executable)
@@ -57,10 +57,11 @@ if [[ $1 == nonint ]]; then
 fi
 #### end arg processing ####
 
-
-if ! $int; then
-  sleep 60
-fi
+# we put this in to avoid dns errors that happen on reboot,
+# but I want to debug them.
+# if ! $int; then
+#   sleep 60
+# fi
 
 
 # TODO, get je to deliver the local mailbox: /m/md/INBOX
@@ -145,6 +146,7 @@ EOF
   tmpfile=$(mktemp)
   declare -i unexpected=0
   declare -i missing_dnswl=0
+  declare -i dnsfail=0
   for folder in ${folders[@]}; do
     for from in ${froms[@]}; do
       latest=
@@ -264,9 +266,10 @@ EOF
           rm -f $resultfile
           for r in ${results[@]}; do
             case $r in
-              DKIM_INVALID|T_SPF_TEMPERROR|T_SPF_HELO_TEMPERROR)
-                missing_dnswl+=1
-                ;;
+              # iank: for when we want to handle dns errors differently
+              # DKIM_INVALID|T_SPF_TEMPERROR|T_SPF_HELO_TEMPERROR)
+              #   dnsfail+=1
+              #   ;;
               *)
                 unexpected=$(( unexpected + 1 ))
                 ;;
@@ -276,7 +279,9 @@ EOF
             # We expect dns failures from time to time, so
             # we count them separately and alert differently.
             case $miss in
-              DKIM_VALID|DKIM_VALID_AU|DKIM_VALID_EF|SPF_HELO_PASS|SPF_PASS|RCVD_IN_DNSWL_MED|DKIMWL_WL_HIGH)
+              # iank: dns fail
+              # DKIM_VALID|DKIM_VALID_AU|DKIM_VALID_EF|SPF_HELO_PASS|SPF_PASS|
+              RCVD_IN_DNSWL_MED|DKIMWL_WL_HIGH)
                 missing_dnswl+=1
                 ;;
               *)