add script for prometheus logging of systemd service failures
author Ian Kelling <iank@fsf.org>
Sun, 27 Mar 2022 10:05:24 +0000 (06:05 -0400)
committer Ian Kelling <iank@fsf.org>
Sun, 27 Mar 2022 10:05:24 +0000 (06:05 -0400)
sysd-prom-fail [new file with mode: 0755]
sysd-prom-fail-install [new file with mode: 0755]

diff --git a/sysd-prom-fail b/sysd-prom-fail
new file mode 100755 (executable)
index 0000000..20e7c9b
--- /dev/null
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Copyright (C) 2022 Ian Kelling
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eE -o pipefail
+trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
+
+if [[ ! $SERVICE_RESULT ]]; then
+  echo "$0: error, no SERVICE_RESULT var."
+  exit 1
+fi
+
+ser_name="$1"
+
+if [[ ! $ser_name ]]; then
+  echo "$0: error, no arg service name passed."
+  exit 1
+fi
+
+dir=/var/lib/prometheus/node-exporter
+if [[ ! -e $dir  ]]; then
+  exit 0
+fi
+
+# we have to merge files due to this:
+# https://github.com/prometheus/node_exporter/issues/1885
+# or else we could put the label in the metric name, but that
+# is a bad practice.
+# Note, i found this https://github.com/hansmi/prometheus-textformat-merge
+# but it seems overkill.
+f=$dir/${ser_name}-result-fail.premerge
+ftmp=$f.$$
+
+
+write_count=false
+if [[ -s $f ]]; then
+  # https://www.freedesktop.org/software/systemd/man/systemd.exec.html
+  if [[ $SERVICE_RESULT != success ]]; then
+    write_count=true
+    read -r _ count <$f
+    case $count in
+      ''|*[!0-9]*)
+        count=0
+        ;;
+      *)
+        count=$(( count + 1 ))
+        ;;
+    esac
+  fi
+else
+  count=0
+  write_count=true
+fi
+
+if $write_count; then
+  printf 'node_systemd_unit_result_fail_count{name="%s"} %s\n' "$ser_name" "$count" >$ftmp
+  mv $ftmp $f
+fi
+
+finaltmp=$dir/sysd-result-fail.prom.$$
+cat $dir/*-result-fail.premerge >$finaltmp
+mv $finaltmp $dir/sysd-result-fail.prom
diff --git a/sysd-prom-fail-install b/sysd-prom-fail-install
new file mode 100755 (executable)
index 0000000..244e0a5
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Copyright (C) 2022 Ian Kelling
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source /a/bin/errhandle/err
+[[ $EUID == 0 ]] || exec sudo -E "${BASH_SOURCE[0]}" "$@"
+
+ser_name=$1
+
+mkdir -p /var/lib/prometheus/node-exporter
+
+f=/var/local/sysd-prom-fail/${ser_name}-result-fail.prom
+
+
+if [[ ! -s /var/lib/prometheus/node-exporter/${ser_name}-result-fail.premerge ]]; then
+  SERVICE_RESULT=success /usr/local/bin/sysd-prom-fail $ser_name
+fi
+
+tmp=$(mktemp)
+
+cat >>$tmp <<EOF
+[Service]
+ExecStopPost=+/usr/local/bin/sysd-prom-fail $ser_name
+EOF
+
+dir=/etc/systemd/system/$ser_name.service.d
+mkdir -p $dir
+out=$(rsync -cipgo --chmod=644 --chown=root:root $tmp $dir/sysd-prom-fail.conf)
+if [[ $out ]]; then
+  echo $0: systemctl daemon-reload
+  systemctl daemon-reload
+  if systemctl is-active $ser_name &>/dev/null; then
+    echo $0: systemctl restart $ser_name
+    systemctl restart $ser_name ||:
+  fi
+fi