From 9a1c9e1c16555a4158cfd4044a615d89ab877abc Mon Sep 17 00:00:00 2001 From: Ian Kelling Date: Fri, 3 Mar 2023 10:48:44 -0500 Subject: [PATCH] fix alerts for down hosts --- brc2 | 2 +- check-remote-mailqs | 3 ++- distro-begin | 9 ++++++++- filesystem/etc/prometheus/rules/iank.yml | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/brc2 b/brc2 index 1a69562..fa84ca3 100644 --- a/brc2 +++ b/brc2 @@ -349,7 +349,7 @@ alerts() { ralerts() { # remote alerts local ret shell # this list is duplicated in check-remote-mailqs - for h in bk je li frodo kwwg x3wg x2wg kdwg sywg; do + for h in bk je li frodo x3wg kdwg sywg; do echo $h: shell="ssh $h" if [[ $HOSTNAME == "${h%wg}" ]]; then diff --git a/check-remote-mailqs b/check-remote-mailqs index df67a04..2e98521 100755 --- a/check-remote-mailqs +++ b/check-remote-mailqs @@ -9,8 +9,9 @@ source /a/bin/errhandle/err shopt -s nullglob shopt -s dotglob +# temp disabled: x2wg kwwg # this list duplicated in brc2 ralerts -for h in bk je li frodo kwwg x3wg x2wg kdwg sywg; do +for h in bk je li frodo x3wg kdwg sywg; do statedir=/b/bash_unpublished/mailq-state statefile=$statedir/$h [[ -d $statedir ]] || continue diff --git a/distro-begin b/distro-begin index 84ba7c4..c5d10cc 100755 --- a/distro-begin +++ b/distro-begin @@ -13,10 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -# for setting up a new machine +#### for setting up a new machine # usage: $0 [-r] [HOSTNAME] # HOSTNAME changes the machine's hostname +# Update target_down alerts in +# /a/bin/ds/filesystem/etc/prometheus/rules/iank.yml +# +# Update hostnames in /b/ds/check-remote-mailqs + +### end new machine setup + # tips: # run any sudo command first so your pass is cached # set the scrollback to unlimited in case something goes wrong diff --git a/filesystem/etc/prometheus/rules/iank.yml b/filesystem/etc/prometheus/rules/iank.yml index f64322b..0049743 100644 --- a/filesystem/etc/prometheus/rules/iank.yml +++ b/filesystem/etc/prometheus/rules/iank.yml @@ -274,7 +274,7 @@ groups: description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" - alert: lowpri_target_down - expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100"} == 0 + expr: up{instance!~"kdwg:9101|bkex.b8.nz:9101|liex.b8.nz:9101|10.2.0.1:9100|kwwg:9101"} == 0 for: 30m labels: severity: warn -- 2.30.2