use rspamd, speed up mailtest-check

author Ian Kelling <ian@iankelling.org>

Mon, 24 Jun 2024 10:09:57 +0000 (06:09 -0400)

committer Ian Kelling <ian@iankelling.org>

Mon, 24 Jun 2024 10:48:24 +0000 (06:48 -0400)
author Ian Kelling <ian@iankelling.org>
Mon, 24 Jun 2024 10:09:57 +0000 (06:09 -0400)
committer Ian Kelling <ian@iankelling.org>
Mon, 24 Jun 2024 10:48:24 +0000 (06:48 -0400)
diff --git a/mail-setup b/mail-setup

index 0e0c6d01cbc05f27e1e80a42991224b2225bf841..9ad7eea2cef3e8a220d06cc8b551b02a60afb103 100755 (executable)
--- a/mail-setup
+++ b/mail-setup
@@ -476,7 +476,7 @@ fi
  
  # light version of exim does not have sasl auth support.
  # note: for bitfolk hosts, unbound has important config with conflink.
-pi-nostart exim4 exim4-daemon-heavy spamassassin unbound clamav-daemon wireguard
+pi-nostart exim4 exim4-daemon-heavy spamassassin unbound clamav-daemon wireguard rspamd
  
  spamd_remove=spamassassin
  spamd_ser=spamd
@@ -881,7 +881,7 @@ nn_progs=(exim4)
  if mailhost; then
    # Note dovecots lmtp doesnt need to be in the same nn to accept delivery.
    # Its in the nn so remote clients can connect to it.
-  nn_progs+=($spamd_ser dovecot)
+  nn_progs+=($spamd_ser rspamd dovecot)
  fi
  
  case $HOSTNAME in
@@ -961,7 +961,7 @@ EOF
      done
      ;;
    *)
-    for unit in exim4 $spamd_ser $spamd_remove dovecot unbound; do
+    for unit in exim4 $spamd_ser rspamd $spamd_remove dovecot unbound; do
        f=/etc/systemd/system/$unit.service.d/nn.conf
        if [[ -s $f ]]; then
          rm -fv $f
@@ -984,6 +984,19 @@ RestartSec=20
  EOF
  fi
  
+# * rspamd config
+
+#/a/exe/cedit /etc/redis/redis.conf <<'EOF'
+# redis config is only readable by redis. if we wanted to not do
+# that for our modifications, we could add this.
+# include /etc/redis-local.conf
+
+# if we wanted to, we could run redis outside the mail nn by adding to
+# its bind config option like this, and then tell rspamd to connect to
+# this address. But it is slightly simpler to not do that.
+# bind 127.0.0.1 -::1 10.173.8.1
+#EOF
+
  # * spamassassin config
  u /etc/sysctl.d/80-iank-mail.conf <<'EOF'
  # see exim spec
@@ -1480,6 +1493,7 @@ acl_not_smtp = acl_check_not_smtp
  
  
  DEBBUGS_DOMAIN = b.b8.nz
+spamd_address = 127.0.0.1 11333 variant=rspamd
  EOF
  
  if dpkg --compare-versions "$(dpkg-query -f='${Version}\n' --show exim4)" ge 4.94; then
@@ -1525,6 +1539,14 @@ EOF
  rm -fv /etc/exim4/data_local_acl # old path
  
  u /etc/exim4/conf.d/data_local_acl <<'EOF'
+
+warn
+  remove_header = X-Spam_score: X-Spam_score_int : X-Spam_bar : X-Spam_report
+
+warn
+  !hosts = +iank_trusted
+  # Smarthosts connect with residential ips and thus get flagged as spam if we do a spam check.
+  !authenticated = plain_server:login_server
  # Except for the "condition =", this was
  # a comment in the check_data acl. The comment about this not
  # being suitable has been changed in newer exim versions. The only thing
@@ -1535,14 +1557,19 @@ u /etc/exim4/conf.d/data_local_acl <<'EOF'
  # suggested in official docs, and 100k in the wiki example because
  # those docs are rather old and I see a 110k spam message
  # pretty quickly looking through my spam folder.
+  condition = ${if < {$message_size}{5000K}}
+  spam = Debian-exim:true
+  add_header = X-Spam_score_int: $spam_score_int
+  add_header = X-Spam_score: $spam_score
+  add_header = X-Spam_bar: $spam_bar
+  add_header = X-Spam_report: $spam_report
+  add_header = X-Spam_action: $spam_action
  
+# i don't want mail to myself getting wastefully scanned or
+# mistakenly flagged as spam, but I do want to scan my spam test emails.
  warn
-  !hosts = +iank_trusted
-  remove_header = X-Spam_score: X-Spam_score_int : X-Spam_bar : X-Spam_report
-
-warn
-  !hosts = +iank_trusted
-  # Smarthosts connect with residential ips and thus get flagged as spam if we do a spam check.
+  condition = ${if forany{<, $recipients}{match{$item}{\N^testignore@\N}}}
+  hosts = +iank_trusted
    !authenticated = plain_server:login_server
    condition = ${if < {$message_size}{5000K}}
    spam = Debian-exim:true
@@ -4081,7 +4108,7 @@ case $HOSTNAME in
      ;;&
    $MAIL_HOST|bk|je)
      # start spamassassin/dovecot before exim.
-    sre dovecot $spamd_ser mailtest-check
+    sre dovecot rspamd mailtest-check
      # Wait a bit before restarting exim, else I get a paniclog entry
      # like: spam acl condition: all spamd servers failed. But I'm tired
      # of waiting. I'll deal with this some other way.
@@ -4115,7 +4142,7 @@ case $HOSTNAME in
      :
      ;;
    *)
-    soff radicale mailclean.timer dovecot $spamd_ser $vpnser mailnn clamav-daemon
+    soff radicale mailclean.timer dovecot $spamd_ser rspamd $vpnser mailnn clamav-daemon
      ;;
  esac
  
diff --git a/mailtest-check b/mailtest-check

index 9f37cb94ab6513352295c0e5debdcf8a24bba540..aa9b776e49437550199c945a34f7b336670bccff 100755 (executable)
--- a/mailtest-check
+++ b/mailtest-check
@@ -52,8 +52,136 @@ getspamdpid() {
    fi
  }
  
+parse-rspamd() {
+  # The score is attached to $2 in the X-Spam_report header; rspamc prints it separately as $3.
+  awk '$1 == "Symbol:" && $2 !~ /\(0\.00\)/ && $3 !~ /\(0\.00\)/ {print $2}' | sed 's/(.*//'
+}
+
+rspamc-process() {
+
+  # note, this could in theory break since we aren't limiting it to the
+  # specific header. but that is unlikely, I'm doing all the header generation.
+  # example header:
+  # X-Spam_report: Action: no action
+  # Symbol: HFILTER_HOSTNAME_UNKNOWN(2.50)
+  # Symbol: RCVD_COUNT_TWO(0.00)
+  # Symbol: FROM_EQ_ENVFROM(0.00)
+  # Symbol: DMARC_POLICY_ALLOW(-0.50)
+  # Symbol: TO_DN_NONE(0.00)
+  # Symbol: TO_MATCH_ENVRCPT_SOME(0.00)
+  # Symbol: RCVD_TLS_LAST(0.00)
+  # Symbol: RBL_SENDERSCORE_FAIL(0.00)
+  # Symbol: R_DKIM_ALLOW(-0.20)
+  # Symbol: MIME_GOOD(-0.10)
+  # Symbol: MID_RHS_MATCH_FROM(0.00)
+  # Symbol: RCVD_IN_DNSWL_FAIL(0.00)
+  # Symbol: SINGLE_SHORT_PART(0.00)
+  # Symbol: R_SPF_ALLOW(-0.20)
+  # Symbol: ARC_NA(0.00)
+  # Symbol: ASN(0.00)
+  # Symbol: FROM_NO_DN(0.00)
+  # Symbol: MIME_TRACE(0.00)
+  # Symbol: MISSING_XM_UA(0.00)
+  # Symbol: RCPT_COUNT_THREE(0.00)
+  # Symbol: DKIM_TRACE(0.00)
+  # Message-ID: E1sLckD-004Ucv-P2@je.b8.nz
+
+  if [[ $to == jtuttle@gnu.org ]]; then
+    raw_results=$($spamcpre sudo -u _rspamd rspamc --helo=mail.iankelling.org --hostname=mail.iankelling.org  <"$latest" |& parse-rspamd)
+  else
+    raw_results=$( parse-rspamd <"$latest")
+  fi
+  for r in $raw_results; do
+    case $r in
+      # based on my spamassassin experience, these may change and are not important.
+      RCVD_IN_DNSWL_MED|RCVD_DKIM_ARC_DNSWL_MED) : ;;
+      *)
+        results[$r]=t
+        ;;
+    esac
+  done
+  keys=(DMARC_POLICY_ALLOW R_DKIM_ALLOW MIME_GOOD R_SPF_ALLOW)
+  for t in  ${keys[@]}; do
+    if [[ ${results[$t]} ]]; then
+      unset "results[$t]"
+    else
+      missing+=($t)
+    fi
+  done
+}
+
+spamc-process() {
+  # pyzor fails for our test message, so dont put useless load on their
+  # servers.
+  # example line that sed is parsing:
+  # (-0.1 / 5.0 requ) DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,SPF_HELO_PASS=-0.001,SPF_PASS=-0.001,TVD_SPACE_RATIO=0.001 autolearn=_AUTOLEARN
+  # add -D for debug info. i haven't found it to be useful so it is off by default
+  resultstr=$($spamcpre sudo -u Debian-exim spamassassin -t --cf='score PYZOR_CHECK 0' <"$latest" 2>&1)
+  #resultstr=$($spamcpre sudo -u _rspamd rspamc  <"$latest" 2>&1)
+
+  # note: on some mail, its 1 line after the send-test-forward,
+  # on others its 2 with a blank in between.  I use the sed -n to
+  # filter this.
+  ## spamassassin parsing. only runs when do_spama is true; rspamd is the default.
+  raw_results="$(printf "%s\n" "$resultstr"| tail | grep -A2 -Fx /usr/local/bin/send-test-forward | tail -n+2 | sed -nr 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /gp')"
+
+  # consider results we want to ignore or pre-process in some way.
+  for r in $raw_results; do
+    case $r in
+      # This came in t12, but its just dkim + spf, and my
+      # systems aren't all t12, so ignore it for now.
+      DMARC_PASS) : ;;
+      # got this in an update 2022-01. dun care
+      T_SCC_BODY_TEXT_LINE|SCC_BODY_SINGLE_WORD) : ;;
+      # we have a new domain, ignore this.
+      # it seems like some versions of spamassassin do BODY_SINGLE_WORD, others dont, we dun care.
+      # bayes_00 is a new one indicating ham, we dont care if its missing.
+      BAYES_00|BODY_SINGLE_WORD|FROM_FMBLA_NEWDOM*|autolearn) : ;;
+
+      # These have somewhat randomly been added and removed, resulting in useless alerts, so ignore them.
+      RCVD_IN_DNSWL_MED|DKIMWL_WL_HIGH) : ;;
+
+      SPF_HELO_NEUTRAL)
+        # some of my domains use neutral spf, treat them the same.
+        results[SPF_HELO_PASS]=t
+        ;;
+      *)
+        results[$r]=t
+        ;;
+    esac
+  done
+  # debugging
+  # e results = ${!results[@]}
+
+  keys=(DKIM_SIGNED DKIM_VALID{,_AU,_EF} SPF_HELO_PASS SPF_PASS TVD_SPACE_RATIO)
+  if [[ $to == *@gnu.org && $from == *@gnu.org ]]; then
+    keys=(ALL_TRUSTED TVD_SPACE_RATIO)
+    # from eggs had DKIMWL_WL_HIGH sometime in 2022, then DKIMWL_WL_MED until March 2023
+  fi
+
+  for t in  ${keys[@]}; do
+    if [[ ${results[$t]} ]]; then
+      unset "results[$t]"
+    elif [[ $t == DKIM_VALID_EF && $from == *@[^.]*.[^.]*.[^.]* ]]; then
+      :
+      # third level domains dont hit this. its because
+      # /usr/share/perl5/Mail/SpamAssassin/Plugin/DKIM.pm checks
+      # if its signed with the registryboundaries domain. afaik:
+      # we need the actual domain to sign it, this would result in
+      # a second signature. I only use second level domains for
+      # testing atm, fsf doesnt use them for anything but the
+      # forum and I dont expect that to have any deliverability
+      # problems.  So, not bothering atm.
+    else
+      missing+=($t)
+    fi
+  done
+}
  
  #### begin arg processing ####
+
+do_spama=false
+
  # spamassassin checking takes about 8 seconds.
  slow=false
  if [[ $1 == slow ]]; then
@@ -87,10 +215,11 @@ fi
  
  maini=0
  
-spamd_ser=spamd
-if systemctl cat spamassassin &>/dev/null; then
-  spamd_ser=spamassassin
-fi
+# spamd_ser=spamd
+# if systemctl cat spamassassin &>/dev/null; then
+#   spamd_ser=spamassassin
+# fi
+spamd_ser=rspamd
  
  source /a/bin/bash_unpublished/source-state
  
@@ -119,10 +248,7 @@ main() {
      *)
        folders=(/m/md/l/testignore)
        # save some cpu cycles
-      froms=(testignore@je.b8.nz ian@iankelling.org)
-      if (( maini % 10 == 0 )); then
-        froms=(testignore@je.b8.nz testignore@expertpathologyreview.com testignore@amnimal.ninja ian@iankelling.org z@zroe.org)
-      fi
+      froms=(testignore@je.b8.nz testignore@expertpathologyreview.com testignore@amnimal.ninja ian@iankelling.org z@zroe.org)
        if ! $int; then
          ### begin rsyncing fencepost email ###
          # We dont want to exit if rsync fails, that will get caught by
@@ -177,26 +303,20 @@ EOF
    tmpfile=$(mktemp)
    declare -i unexpected=0
    for folder in ${folders[@]}; do
+    awk '/^Subject: / {t=$4}; /^From: / {f=$2}; ENDFILE {print t, f, FILENAME}' $folder/new/* $folder/cur/* | sort -rn >$tmpfile
      for from in ${froms[@]}; do
        declare -i missing_dnswl=0
        #declare -i dnsfail=0
        declare -i unexpected=0
        latest=
        last_sec=0
+      tmp=$(awk '$2 == "'$from'" {print $1,$3; exit}' $tmpfile)
+      read -r last_sec latest <<<"$tmp"
+      if [[ ! $latest ]]; then
  
-      if ! grep -rlFx "From: $from" $folder/{new,cur} >$tmpfile; then
          echo "no message found from: $from"
          continue
        fi
-      # webmail sends them to cur it seems
-      while read -r file; do
-        file_sec=$(awk '/^Subject: / {print $4}' $file)
-        if [[ $file_sec ]] && (( file_sec > last_sec )); then
-          latest=$file
-          last_sec="$file_sec"
-        fi
-      done <$tmpfile
-      rm -f $tmpfile
  
        to=$(awk '/^Envelope-to: / {print $2}' $latest)
  
@@ -209,70 +329,20 @@ EOF
            if [[ $(readlink /proc/$$/ns/net) != "$(readlink /proc/$spamdpid/ns/net)" ]]; then
              spamcpre="nsenter -t $spamdpid -n -m"
            fi
+          missing=()
            unset results
            declare -A results
-          # pyzor fails for our test message, so dont put useless load on their
-          # servers.
-          # example line that sed is parsing:
-          # (-0.1 / 5.0 requ) DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,SPF_HELO_PASS=-0.001,SPF_PASS=-0.001,TVD_SPACE_RATIO=0.001 autolearn=_AUTOLEARN
-          resultfile=$(mktemp)
-          # add -D for debug info. usually it
-          $spamcpre sudo -u Debian-exim spamassassin -t --cf='score PYZOR_CHECK 0' <"$latest" &>$resultfile
-
-          # note: on some mail, its 1 line after the send-test-forward, on others its 2 with a blank inbetween.
-          # I use the sed -n to filter this.
-          raw_results="$(tail $resultfile | grep -A2 -Fx /usr/local/bin/send-test-forward | tail -n+2 | sed -nr 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /gp')"
-          for r in $raw_results; do
-            case $r in
-              # This came in t12, but its just dkim + spf, and my
-              # systems aren't all t12, so ignore it for now.
-              DMARC_PASS) : ;;
-              # got this in an update 2022-01. dun care
-              T_SCC_BODY_TEXT_LINE|SCC_BODY_SINGLE_WORD) : ;;
-              # we have a new domain, ignore this.
-              # it seems like some versions of spamassassin do BODY_SINGLE_WORD, others dont, we dun care.
-              # bayes_00 is a new one indicating ham, we dont care if its missing.
-              BAYES_00|BODY_SINGLE_WORD|FROM_FMBLA_NEWDOM*|autolearn) : ;;
-
-              # These have somewhat randomly been added and removed, resulting in useless alerts, so ignore them.
-              RCVD_IN_DNSWL_MED|DKIMWL_WL_HIGH) : ;;
-
-              SPF_HELO_NEUTRAL)
-                # some of my domains use neutral spf, treat them the same.
-                results[SPF_HELO_PASS]=t
-                ;;
-              *)
-                results[$r]=t
-                ;;
-            esac
-          done
-          # debugging
-          # e results = ${!results[@]}
-          missing=()
+          # It would be useful for debugging & development to optionally
+          # run rspamc here but I haven't totally figured out
+          # rspamc, i might need to pass --helo=helo_string to avoid
+          # hostname_unknown result.
  
-          keys=(DKIM_SIGNED DKIM_VALID{,_AU,_EF} SPF_HELO_PASS SPF_PASS TVD_SPACE_RATIO)
-          if [[ $to == *@gnu.org && $from == *@gnu.org ]]; then
-            keys=(ALL_TRUSTED TVD_SPACE_RATIO)
-            # from eggs had DKIMWL_WL_HIGH sometime in 2022, then DKIMWL_WL_MED unti march 2023
+          if $do_spama; then
+            spamc-process
+          else
+            rspamc-process
            fi
  
-          for t in  ${keys[@]}; do
-            if [[ ${results[$t]} ]]; then
-              unset "results[$t]"
-            elif [[ $t == DKIM_VALID_EF && $from == *@[^.]*.[^.]*.[^.]* ]]; then
-              :
-              # third level domains dont hit this. its because
-              # /usr/share/perl5/Mail/SpamAssassin/Plugin/DKIM.pm checks
-              # if its signed with the registryboundaries domain. afaik:
-              # we need the actual domain to sign it, this would result in
-              # a second signature. I only use second level domains for
-              # testing atm, fsf doesnt use them for anything but the
-              # forum and I dont expect that to have any deliverability
-              # problems.  So, not bothering atm.
-            else
-              missing+=($t)
-            fi
-          done
            if (( ${#results[@]} || ${#missing[@]} )); then
              printf "$HOSTNAME spamtest %s\n" "$latest"
              if (( ${#results[@]} )); then
@@ -282,7 +352,7 @@ EOF
                printf "missing %s" "${missing[*]}"
              fi
              echo # ends our printf string buildup
-            cat $resultfile
+            if [[ $resultstr ]]; then printf "%s\n" "$resultstr"; fi
              echo mailtest-check: end of spam debug results
              # lets just handle 1 failure at a time in interactive mode.
              if $int; then
@@ -297,7 +367,6 @@ EOF
              #   echo mailtest-check: end of cat
              #fi
            fi
-          rm -f $resultfile
            for r in ${results[@]}; do
              case $r in
                # iank: for when we want to handle dns errors differently.
@@ -339,6 +408,7 @@ mailtest_check_last_usec{folder="$folder",from="$from"} $last_sec
  EOF
      done # end for from in ${froms[@]}
    done # end for folder in ${folders[@]}
+  rm -f $tmpfile
  
    dir=/var/lib/prometheus/node-exporter
    path=$dir/mailtest-check.prom.$$
diff --git a/subdir_files/.local/share/konsole/profileian.profile b/subdir_files/.local/share/konsole/profileian.profile

index 55cd61e18db730d5864aca5d15a834b79a7d6b88..2893b268741486d2504367c74a647e032044c3cd 100644 (file)
--- a/subdir_files/.local/share/konsole/profileian.profile
+++ b/subdir_files/.local/share/konsole/profileian.profile
@@ -15,7 +15,7 @@ SemanticInputClick=true
  SemanticUpDown=false
  
  [Interaction Options]
-OpenLinksByDirectClickEnabled=true
+OpenLinksByDirectClickEnabled=false
  TextEditorCmd=6
  TextEditorCmdCustom=/a/exe/g +LINE:COLUMN PATH
  UnderlineFilesEnabled=true
diff --git a/subdir_files/sieve/maintest.sieve b/subdir_files/sieve/maintest.sieve

index fefbd71ccef6a9291971be95934dc9a4a3cc72b0..73f409d17783176586588454326c11b5858b315a 100644 (file)
--- a/subdir_files/sieve/maintest.sieve
+++ b/subdir_files/sieve/maintest.sieve
@@ -3,15 +3,6 @@
  ##
  require [ "regex", "variables", "fileinto", "envelope", "mailbox", "imap4flags", "include" ];
  
-# many examples out there check for "X-Spam-Status" "^Yes", but we do
-# this in exim, which doesn't add that by default.  We could modify it's
-# config to add $spam_action to a header, like other headers, but simply
-# using an integer threshold here is simpler: the default threshold for
-# spamassassin is 5, so we have 5 plus symbols here.
-if header :regex "x-spam_bar" "^\\+{5}" {
-    fileinto :create "Junk";
-    stop;
-}
  
  include :personal "personaltest";
  include :personal "liststest";
author	Ian Kelling <ian@iankelling.org>
	Mon, 24 Jun 2024 10:09:57 +0000 (06:09 -0400)
committer	Ian Kelling <ian@iankelling.org>
	Mon, 24 Jun 2024 10:48:24 +0000 (06:48 -0400)
mail-setup		patch \| blob \| history
mailtest-check		patch \| blob \| history
subdir_files/.local/share/konsole/profileian.profile		patch \| blob \| history
subdir_files/sieve/maintest.sieve		patch \| blob \| history