From d7551546ac323c5d4b49370c885646bcf96e959f Mon Sep 17 00:00:00 2001 From: Ian Kelling Date: Mon, 7 Mar 2022 22:04:19 -0500 Subject: [PATCH] various fixes --- brc2 | 16 + check-stale-alerts | 14 +- conflink | 5 + distro-end | 3 +- filesystem/etc/default/prometheus | 2 +- filesystem/etc/prometheus/file_sd/node.yml | 4 +- filesystem/etc/prometheus/prometheus.yml | 3 + filesystem/etc/prometheus/rules/iank.yml | 394 +++++++++++------- .../etc/systemd/system/btrfsmaintstop.service | 10 +- .../systemd/system/dynamicipupdate.service | 10 +- install-my-scripts | 11 +- mailtest-check | 12 +- rootsshsync | 4 +- system-status | 105 +++-- ziva-backup-check | 7 +- 15 files changed, 388 insertions(+), 212 deletions(-) mode change 100644 => 100755 system-status diff --git a/brc2 b/brc2 index ddfe816..6b97783 100644 --- a/brc2 +++ b/brc2 @@ -381,6 +381,22 @@ bigclock() { nnn() { /a/opt/nnn -H "$@"; } +locat() { # log-once cat + local files + ngset + files=(/var/local/cron-errors/* /home/iank/cron-errors/* /sysd-mail-once-state/*) + case ${#files[@]} in + 0) : ;; + 1) + echo ${files[0]} + head ${files[0]} + ;; + *) + head ${files[@]} + ;; + esac + ngreset +} # duplicated somewhat below. jrun() { # journal run. run args, log to journal, tail and grep the journal. diff --git a/check-stale-alerts b/check-stale-alerts index a6d8c82..1ecb58b 100755 --- a/check-stale-alerts +++ b/check-stale-alerts @@ -1,25 +1,19 @@ #!/bin/bash -time_arg="-ctime +4" -case $1 in - now) - time_arg= - ;; -esac if [[ ! -e /dev/shm/iank-status ]]; then exit 0 fi eval $(< /dev/shm/iank-status) -if [[ $HOSTNAME != "$MAIL_HOST" ]]; then - exit 0 -fi -out=$(find /var/local/cron-errors /home/iank/cron-errors /sysd-mail-once-state -type f -ctime +4) +out=$(find /var/local/cron-errors /home/iank/cron-errors /sysd-mail-once-state -type f) if [[ $out ]]; then echo HOSTNAME: $HOSTNAME printf "%s\n" "$out" fi +if [[ $HOSTNAME != "$MAIL_HOST" ]]; then + exit 0 +fi for h in {li,bk,je}.b8.nz; do out=$(ssh $h find /m/md/bounces/new /var/local/cron-errors /home/iank/cron-errors /sysd-mail-once-state -type f) if [[ $out ]]; then diff --git a/conflink b/conflink index d7323fd..d3a7b2a 100755 --- a/conflink +++ b/conflink @@ -90,6 +90,11 @@ common-file-setup() { while read -r line; do file="${line:12}" case $file in + etc/prometheus/rules/iank.yml) + case $HOSTNAME in + kd) m s systemctl reload prometheus ;; + esac + ;; etc/systemd/system/*) reload_systemd=true ;; diff --git a/distro-end b/distro-end index ff974f9..407987a 100755 --- a/distro-end +++ b/distro-end @@ -1881,7 +1881,6 @@ case $HOSTNAME in # either use iptables or, in # /etc/default/prometheus-node-exporter # listen on the wireguard interface - ;; li|je|bk) # ex for exporter web-conf -p 9101 -f 9100 - apache2 ${HOSTNAME}ex.b8.nz <<'EOF' @@ -1896,7 +1895,7 @@ Require valid-user EOF ;; *) - wgip=$(sudo sed -rn 's,^ *Address *= *([^/]+).*,\1,p' /etc/wireguard/wghole.conf) + wgip=$(command sudo sed -rn 's,^ *Address *= *([^/]+).*,\1,p' /etc/wireguard/wghole.conf) web-conf -i -a $wgip -p 9101 -f 9100 - apache2 ${HOSTNAME}wg.b8.nz <<'EOF' AuthType Basic diff --git a/filesystem/etc/default/prometheus b/filesystem/etc/default/prometheus index 63d1ee3..e733d59 100644 --- a/filesystem/etc/default/prometheus +++ b/filesystem/etc/default/prometheus @@ -2,7 +2,7 @@ # Set the command-line arguments to pass to the server. 
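# (Hedged note, added in editing, not part of the original patch:
# --web.external-url is the standard Prometheus flag for the URL it
# advertises in generated links such as alert notifications; pointing it at
# https://i.b8.nz:9091 presumably matches a reverse proxy sitting in front of
# the local 127.0.0.1:9090 listener configured below.)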
-ARGS="--web.listen-address=127.0.0.1:9090" +ARGS="--web.listen-address=127.0.0.1:9090 --web.external-url=https://i.b8.nz:9091" diff --git a/filesystem/etc/prometheus/file_sd/node.yml b/filesystem/etc/prometheus/file_sd/node.yml index 8372ddc..e58d520 100644 --- a/filesystem/etc/prometheus/file_sd/node.yml +++ b/filesystem/etc/prometheus/file_sd/node.yml @@ -1,10 +1,10 @@ - targets: - kdwg:9101 - # - sywg:9101 + - sywg:9101 # - bk:9101 # - je:9101 # - li:9101 # - frodo:9101 # - kwwg:9101 # - x3wg:9101 - # - x2wg:9101 + - x2wg:9101 diff --git a/filesystem/etc/prometheus/prometheus.yml b/filesystem/etc/prometheus/prometheus.yml index 9932335..97ac447 100644 --- a/filesystem/etc/prometheus/prometheus.yml +++ b/filesystem/etc/prometheus/prometheus.yml @@ -29,6 +29,9 @@ scrape_configs: static_configs: - targets: ['localhost:9090'] + - job_name: 'alertmanager' + static_configs: + - targets: ['localhost:9093'] - job_name: node basic_auth: diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index 72b0701..4439c41 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -1,6 +1,12 @@ +# other rules to consider: +# filesystem, network, ntp rules: +# https://github.com/cloudalchemy/ansible-prometheus defaults/main.yml +# on my system, the interpolated values are in /a/opt/ansible-prometheus/rules.yml +# + groups: -- name: standard alerts +- name: standard rules: - alert: mailtest-check expr: |- @@ -9,170 +15,258 @@ groups: severity: day annotations: description: '{{ $labels.instance }} mailtest-check' - summary: {{ $labels.instance }} mailtest-check + summary: '{{ $labels.instance }} mailtest-check' + # 42 mins: enough for a 30 min queue run plus 12 - alert: mailtest-check expr: |- - # 42 mins: enough for a 30 min queue run plus 12 time() - mailtest_check_last_usec > 60 * 42 labels: severity: prod annotations: description: '{{ $labels.instance }} mailtest-check' - summary: {{ $labels.instance }} mailtest-check + summary: '{{ $labels.instance }} mailtest-check' + - alert: 1pmtest + expr: hour() == 18 and minute() < 5 + for: 0m + labels: + severity: daytest + annotations: + summary: Prometheus daily test alert (instance {{ $labels.instance }}) + description: "Prometheus daily test alert if no other alerts. It + is an end to end test.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # - alert: NodeFilesystemAlmostOutOfSpace - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available space left. - # summary: Filesystem has less than 5% space left. - # expr: |- - # ( - # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: warning - # - alert: NodeFilesystemAlmostOutOfSpace - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available space left. - # summary: Filesystem has less than 3% space left. 
- # expr: |- - # ( - # node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeFilesystemFilesFillingUp - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left and is filling up. - # summary: Filesystem is predicted to run out of inodes within the next 24 hours. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 - # and - # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: warning - # - alert: NodeFilesystemFilesFillingUp - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - # summary: Filesystem is predicted to run out of inodes within the next 4 hours. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 - # and - # predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeFilesystemAlmostOutOfFiles - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left. - # summary: Filesystem has less than 5% inodes left. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: warning - # - alert: NodeFilesystemAlmostOutOfFiles - # annotations: - # description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - # only {{ printf "%.2f" $value }}% available inodes left. - # summary: Filesystem has less than 3% inodes left. - # expr: |- - # ( - # node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 - # and - # node_filesystem_readonly{job="node",fstype!=""} == 0 - # ) - # for: 1h - # labels: - # severity: critical - # - alert: NodeNetworkReceiveErrs - # annotations: - # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - # {{ printf "%.0f" $value }} receive errors in the last two minutes.' - # summary: Network interface is reporting many receive errors. - # expr: |- - # increase(node_network_receive_errs_total[2m]) > 10 - # for: 1h - # labels: - # severity: warning - # - alert: NodeNetworkTransmitErrs - # annotations: - # description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - # {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - # summary: Network interface is reporting many transmit errors. 
- # expr: |- - # increase(node_network_transmit_errs_total[2m]) > 10 - # for: 1h - # labels: - # severity: warning - # - alert: NodeHighNumberConntrackEntriesUsed - # annotations: - # description: '{{ $value | humanizePercentage }} of conntrack entries are used' - # summary: Number of conntrack are getting close to the limit - # expr: |- - # (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 +# https://awesome-prometheus-alerts.grep.to/rules + + +# todo, we should probably group the prometheus alerts that indicate a +# host-local problem. +# eg, set a label alert-group: local-prom, then make a receiver that +# groups by it when the alert-group is local-prom. + +- name: awesome prometheus alerts + rules: + + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 30m + labels: + severity: day + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissing + expr: up == 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # todo: this should supress the above alert + # - alert: PrometheusAllTargetsMissing + # expr: count by (job) (up) == 0 + # for: 30m # labels: - # severity: warning - # - alert: NodeClockSkewDetected + # severity: day + # alert-group: local-prom # annotations: - # message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure - # NTP is configured correctly on this host. - # summary: Clock skew detected. - # expr: |- - # ( - # node_timex_offset_seconds > 0.05 - # and - # deriv(node_timex_offset_seconds[5m]) >= 0 - # ) - # or - # ( - # node_timex_offset_seconds < -0.05 - # and - # deriv(node_timex_offset_seconds[5m]) <= 0 - # ) - # for: 10m + # summary: Prometheus all targets missing (instance {{ $labels.instance }}) + # description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # I have an out of band alert to make sure prometheus is up. this + # looks like it would generate false positives. todo: think + # through what a valid crash loop detection would look like. + # - alert: PrometheusTooManyRestarts + # expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 10 + # for: 0m # labels: # severity: warning - # - alert: NodeClockNotSynchronising # annotations: - # message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured - # on this host. - # summary: Clock not synchronising. - # expr: |- - # min_over_time(node_timex_sync_status[5m]) == 0 - # for: 10m - # labels: - # severity: warning - # - alert: ianktest - # expr: node_systemd_version >= 300 + # summary: Prometheus too many restarts (instance {{ $labels.instance }}) + # description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerJobMissing + expr: absent(up{job="alertmanager"}) + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 30m + labels: + severity: day + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warn + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0 + for: 0m + labels: + severity: warn + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # file_sd doesnt count as service discovery, so 0 is expected. + # - alert: PrometheusTargetEmpty + # expr: prometheus_sd_discovered_targets == 0 + # for: 30m # labels: - # severity: critical + # severity: day # annotations: - # description: '{{ $labels.instance }} ianktest.' 
- # summary: Instance {{ $labels.instance }} - ianktest + # summary: Prometheus target empty (instance {{ $labels.instance }}) + # description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapeDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbHeadTruncationsFailed + expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbReloadFailures + expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 30m + labels: + severity: warn + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus 
encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 30m
+    labels:
+      severity: warn
+    annotations:
+      summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/filesystem/etc/systemd/system/btrfsmaintstop.service b/filesystem/etc/systemd/system/btrfsmaintstop.service
index 31b4a65..5e8160d 100644
--- a/filesystem/etc/systemd/system/btrfsmaintstop.service
+++ b/filesystem/etc/systemd/system/btrfsmaintstop.service
@@ -1,7 +1,13 @@
 [Unit]
 Description=btrfsmaintstop
-After=multi-user.target
+After=local-fs.target
+StartLimitIntervalSec=0
 
 [Service]
-Type=oneshot
+Type=simple
 ExecStart=/usr/local/bin/sysd-mail-once -10 btrfsmaintstop /usr/local/bin/btrfsmaint check
+Restart=always
+RestartSec=600
+
+[Install]
+WantedBy=graphical.target
diff --git a/filesystem/etc/systemd/system/dynamicipupdate.service b/filesystem/etc/systemd/system/dynamicipupdate.service
index 48c3d44..54b04f9 100644
--- a/filesystem/etc/systemd/system/dynamicipupdate.service
+++ b/filesystem/etc/systemd/system/dynamicipupdate.service
@@ -1,7 +1,13 @@
 [Unit]
 Description=dynamicipupdate
-After=multi-user.target
+After=local-fs.target
+StartLimitIntervalSec=0
 
 [Service]
-Type=oneshot
+Type=simple
 ExecStart=/usr/local/bin/sysd-mail-once -40 dynamicipupdate /usr/local/bin/dynamic-ip-update
+Restart=always
+RestartSec=600
+
+[Install]
+WantedBy=graphical.target
diff --git a/install-my-scripts b/install-my-scripts
index 5833373..cb54350 100755
--- a/install-my-scripts
+++ b/install-my-scripts
@@ -38,7 +38,7 @@ x="$(readlink -f -- "${BASH_SOURCE[0]}")"; cd ${x%/*} # directory of this file
 rsync -t --chmod=755 --chown=root:root switch-mail-host btrbk-run mount-latest-subvol \
   check-subvol-stale myi3status mailtest-check \
   mailbindwatchdog \
-  /a/bin/log-quiet/sysd-mail-once hssh \
+  /a/bin/log-quiet/sysd-mail-once \
   check-mailq \
   unsaved-buffers.el \
   mail-backup-clean \
@@ -57,20 +57,21 @@ cmd=( rsync -aiSAX --chown=root:root --chmod=g-s
 sre() {
   service=$1
   if [[ $(systemctl is-active $1.service ||:) != inactive ]]; then
-    systemctl restart $service
+    # just fire and forget. sometimes a script restart can fail, but then
+    # the auto restart mechanism makes it succeed.
+    systemctl restart $service ||: &
   fi
 }
 while read -r line; do
   file="${line:12}"
-  echo $file
   case $file in
     btrfsmaint)
-      sre btrfsmaintstop
+      sre btrfsmaintstop &
       ;;
     *)
-      sre ${file//-/}
+      sre ${file//-/} &
       ;;
   esac
 done < <("${cmd[@]}")
diff --git a/mailtest-check b/mailtest-check
index befdbbf..7454086 100755
--- a/mailtest-check
+++ b/mailtest-check
@@ -78,7 +78,14 @@ esac
 
 getspamdpid() {
   if [[ ! $spamdpid || ! -d /proc/$spamdpid ]]; then
-    spamdpid=$(systemctl show --property MainPID --value spamassassin | sed 's/^1$//' ||:)
+    # try twice in case we are restarting, it happens.
+    for i in 1 2; do
+      spamdpid=$(systemctl show --property MainPID --value spamassassin | sed 's/^[10]$//' ||:)
+      if [[ $spamdpid ]]; then
+        break
+      fi
+      sleep 30
+    done
   fi
 }
 getspamdpid
@@ -135,7 +142,8 @@ for folder in ${folders[@]}; do
   # servers.
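  # (Hedged illustration, added in editing, not from the original script:
  # after the sed below strips the leading "(score / required)" prefix and
  # the =value suffixes, raw_results is roughly just the space-separated
  # rule names, e.g. the example line in the next comment becomes something
  # like
  #   DKIM_SIGNED DKIM_VALID DKIM_VALID_AU SPF_HELO_PASS SPF_PASS TVD_SPACE_RATIO autolearn
  # which is what the case statement then filters.)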
# example line that sed is parsing: # (-0.1 / 5.0 requ) DKIM_SIGNED=0.1,DKIM_VALID=-0.1,DKIM_VALID_AU=-0.1,SPF_HELO_PASS=-0.001,SPF_PASS=-0.001,TVD_SPACE_RATIO=0.001 autolearn=_AUTOLEARN - for r in $($spamcpre sudo -u Debian-exim spamassassin -t --cf='score PYZOR_CHECK 0' <"$latest" | tail -n2 | head -n1 | sed -r 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /g'); do + raw_results="$($spamcpre sudo -u Debian-exim spamassassin -t --cf='score PYZOR_CHECK 0' <"$latest" | tail -n2 | head -n1 | sed -r 's/^\([^)]*\) *//;s/=[^, ]*([, ]|$)/ /g')" + for r in $raw_results; do case $r in # got this in an update 2022-01. dun care T_SCC_BODY_TEXT_LINE|SCC_BODY_SINGLE_WORD) : ;; diff --git a/rootsshsync b/rootsshsync index 6b28fcb..fa36a56 100755 --- a/rootsshsync +++ b/rootsshsync @@ -59,8 +59,8 @@ if [[ -e $user_ssh_dir/config ]]; then fi chown -R root:root /root/.ssh -# notably: installs hssh -/a/exe/install-my-scripts +rsync -t --chmod=755 --chown=root:root /b/ds/hssh /usr/local/bin + if [[ -e /a/opt/btrbk/ssh_filter_btrbk.sh ]]; then install /a/opt/btrbk/ssh_filter_btrbk.sh /usr/local/bin fi diff --git a/system-status b/system-status old mode 100644 new mode 100755 index 28c0035..07c730d --- a/system-status +++ b/system-status @@ -37,10 +37,48 @@ loday() { /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylerts@iankelling.org } - +# todo, consider migrating some of these alerts into prometheus write-status() { chars=("${first_chars[@]}") + services=( + epanicclean + systemstatus + btrfsmaintstop + dynamicipupdate + ) + bads=() + if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then + for s in ${services[@]}; do + if [[ $(systemctl show -p SubState --value $s) != running ]]; then + bads+=($s) + fi + done + chars+=(MYSERS) + + fi + lo -240 mysers ${bads[*]} + + services=( + prometheus-node-exporter + prometheus-alertmanager + prometheus + ) + case $HOSTNAME in + kd) + bads=() + if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then + for s in ${services[@]}; do + if [[ $(systemctl show -p SubState --value $s) != running ]]; then + bads+=($s) + fi + done + chars+=(PROM) + fi + lo -240 prom ${bads[*]} + ;; + esac + # clock us out in timetrap if are idle too long if [[ -e /p/.timetrap.db ]]; then export DISPLAY=:0 @@ -83,15 +121,18 @@ write-status() { glob=(/nocow/btrfs-stale/*) if [[ -e ${glob[0]} ]]; then - chars+=("STALE") + chars+=(STALE) fi + var_mail_msg= if [[ $(find /var/mail -type f \! -empty -print -quit) ]]; then var_mail_msg="message in /var/mail" fi loday -1 var_mail $var_mail_msg + + bouncemsg= glob=(/m/md/bounces/new/*) if [[ -e ${glob[0]} ]]; then - chars+=("BOUNCE") + chars+=(BOUNCE) bouncemsg="message in /m/md/bounces/new" fi loday -1 bounce $bouncemsg @@ -99,27 +140,28 @@ write-status() { # but its good enough for me. glob=(/m/md/alerts/{new,cur}/!(*,S)) if [[ -e ${glob[0]} ]]; then - chars+=("A") + chars+=(A) fi glob=(/m/md/daylerts/{new,cur}/!(*,S)) if [[ -e ${glob[0]} ]]; then - chars+=("L") + chars+=(DAY) fi tmp=(/var/local/cron-errors/mailtest-check*) if (( ${#tmp[@]} )); then - chars+=("MAILPING") + chars+=(MAILPING) fi tmp=(/var/local/cron-errors/mailtest-slow*) if (( ${#tmp[@]} )); then - chars+=("SPAMD") + chars+=(SPAMD) fi # early in install process, we dont have permission yet for exiqgrep. 
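  # (Hedged sketch, added in editing, not from the original script: exiqgrep
  # -o SECS matches queued messages older than SECS seconds and -c prints a
  # count line, roughly "N matches out of M messages", so the awk below
  # reduces an illustrative run like
  #   /usr/sbin/exiqgrep -o 1100 -c -b   # -> "2 matches out of 14 messages"
  # to qlen=2.)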
# 1100 helps allow for system restarts qlen=$(/usr/sbin/exiqgrep -o 1100 -c -b | awk '{print $1}') ||: + qmsg= if ((qlen)); then qmsg="queue length $qlen" chars+=("q $qlen") @@ -144,11 +186,11 @@ write-status() { # these conditions are so we dont have an overly verbose prompt if $begin && $end; then - chars+=("D") + chars+=(D) elif $begin; then - chars+=("DB") + chars+=(DB) elif $end; then - chars+=("DE") + chars+=(DE) else f=~/.local/conflink # shellcheck disable=SC2043 @@ -175,7 +217,7 @@ write-status() { # Just because i forget a lot, -mmin -NUM means files modified <= NUM minutes ago if (( fmin < 0 )) && [[ $(find ${all_dirs[@]} -mmin $fmin -type f -print -quit 2>/dev/null) ]]; then v conflink newer filesystem files - chars+=("CONFLINK") + chars+=(CONFLINK) break fi @@ -188,7 +230,7 @@ write-status() { fi if (( $(date -d "$(git log --diff-filter=ACR --format=%aD -1)" +%s) > fsec )); then v conflink: newer files checked in to git - chars+=("CONFLINK") + chars+=(CONFLINK) break fi @@ -198,7 +240,7 @@ write-status() { done < <(git ls-files -o --exclude-standard) if [[ ${untracked[0]} && $(find "${untracked[@]}" -mmin $fminplus -type f -print -quit) ]]; then v conflink: untracked in $d - chars+=("CONFLINK") + chars+=(CONFLINK) break fi done @@ -207,13 +249,13 @@ write-status() { fi if [[ ! -e $f || $(<$f) != 0 ]]; then v conflink: last run not found or failed - chars+=("CONFLINK") + chars+=(CONFLINK) break fi done fi -# if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then + # if [[ $(grep -v "exim user lost privilege for using -C option" /var/log/exim4/paniclog 2>/dev/null ||:) ]]; then if [[ -s /var/log/exim4/paniclog ]]; then chars+=("PANIC!") # leave it up to epanic-clean to send email notification @@ -223,10 +265,10 @@ write-status() { if [[ $MAIL_HOST == "$HOSTNAME" ]]; then bbkmsg= if [[ $(systemctl is-active btrbk.timer) != active ]]; then - chars+=("BTRBK.TIMER") - bbkmsg="btrbk.timer not enabled" + chars+=(BTRBK.TIMER) + bbkmsg="not enabled" fi - lo -48 btrbk.timer $bbkmsg + lo -480 btrbk.timer $bbkmsg ## check if last snapshot was within an hour vol=o @@ -254,8 +296,9 @@ write-status() { maxtime=$t fi done + snapshotmsg= if (( maxtime < now - 4*60*60 )); then - chars+=("OLD-SNAP") + chars+=(OLD-SNAP) snapshotmsg="/o snapshot older than 4 hours" fi lo -1 old-snapshot $snapshotmsg @@ -279,19 +322,19 @@ if [[ $1 ]]; then fi main-loop() { -while true; do - power=true - if [[ -e /sys/class/power_supply/AC/online && $(