From c95baa6003490dcbbda39f5b6340ab68ba121224 Mon Sep 17 00:00:00 2001
From: Ian Kelling
Date: Sun, 27 Mar 2022 06:05:24 -0400
Subject: [PATCH] add script for prometheus logging of systemd service failures

---
 sysd-prom-fail         | 73 ++++++++++++++++++++++++++++++++++++++++++
 sysd-prom-fail-install | 47 +++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100755 sysd-prom-fail
 create mode 100755 sysd-prom-fail-install

diff --git a/sysd-prom-fail b/sysd-prom-fail
new file mode 100755
index 0000000..20e7c9b
--- /dev/null
+++ b/sysd-prom-fail
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Copyright (C) 2022 Ian Kelling
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Usage: sysd-prom-fail SERVICE_NAME, with $SERVICE_RESULT set in the environment.
+set -eE -o pipefail
+trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
+
+# Usage errors go to stderr, like the ERR trap's message.
+if [[ ! $SERVICE_RESULT ]]; then
+  echo "$0: error, no SERVICE_RESULT var." >&2
+  exit 1
+fi
+ser_name="$1"
+if [[ ! $ser_name ]]; then
+  echo "$0: error, no arg service name passed." >&2
+  exit 1
+fi
+
+# Quietly do nothing when the output directory does not exist.
+dir=/var/lib/prometheus/node-exporter
+if [[ ! -e $dir ]]; then
+  exit 0
+fi
+
+# We have to merge files due to this:
+# https://github.com/prometheus/node_exporter/issues/1885
+# or else we could put the label in the metric name, but that is a bad practice.
+# Note, I found this https://github.com/hansmi/prometheus-textformat-merge
+# but it seems overkill.
+f=$dir/${ser_name}-result-fail.premerge
+ftmp=$f.$$
+
+# Rewrite $f only when the stored count changes or $f is missing/empty.
+write_count=false
+if [[ -s $f ]]; then
+  # https://www.freedesktop.org/software/systemd/man/systemd.exec.html
+  if [[ $SERVICE_RESULT != success ]]; then
+    write_count=true
+    # First line of $f is "metric{labels} COUNT"; the rest after field 1 is COUNT.
+    read -r _ count <"$f"
+    case $count in
+      # NOTE(review): an unparsable count restarts at 0 even on a failure; intended?
+      ''|*[!0-9]*) count=0 ;;
+      *) count=$(( count + 1 )) ;;
+    esac
+  fi
+else
+  # NOTE(review): also 0 when the first recorded result is a failure; confirm.
+  count=0
+  write_count=true
+fi
+
+if $write_count; then
+  # Write a temp file in the same directory, then rename it over the target.
+  printf 'node_systemd_unit_result_fail_count{name="%s"} %s\n' "$ser_name" "$count" >"$ftmp"
+  mv "$ftmp" "$f"
+fi
+
+finaltmp=$dir/sysd-result-fail.prom.$$
+cat "$dir"/*-result-fail.premerge >"$finaltmp"
+mv "$finaltmp" "$dir/sysd-result-fail.prom"
# NOTE(review): the sysd-prom-fail-install hunk below is incomplete in this copy
# of the patch. Everything between "cat >>$tmp <" and "/dev/null; then" (the
# here-document and the statements that followed it) is missing, so the hunk
# carries fewer than the 47 lines its header declares and will not apply.
# Restore the missing text from the original commit before using this patch.
diff --git a/sysd-prom-fail-install b/sysd-prom-fail-install
new file mode 100755
index 0000000..244e0a5
--- /dev/null
+++ b/sysd-prom-fail-install
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Copyright (C) 2022 Ian Kelling
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source /a/bin/errhandle/err
+[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"
+
+ser_name=$1
+
+mkdir -p /var/lib/prometheus/node-exporter
+
+f=/var/local/sysd-prom-fail/${ser_name}-result-fail.prom
+
+
+if [[ ! -s /var/lib/prometheus/node-exporter/${ser_name}-result-fail.premerge ]]; then
+  SERVICE_RESULT=success /usr/local/bin/sysd-prom-fail $ser_name
+fi
+
+tmp=$(mktemp)
+
+cat >>$tmp </dev/null; then
+    echo $0: systemctl restart $ser_name
+    systemctl restart $ser_name ||:
+  fi
+fi
-- 
2.30.2