cam deal with filesystem bug
[log-quiet] / sysd-prom-fail
1 #!/bin/bash
2 # Copyright (C) 2022 Ian Kelling
3
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7
8 # http://www.apache.org/licenses/LICENSE-2.0
9
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
# Abort on any failing command (errexit) or failing pipeline stage (pipefail);
# errtrace makes the ERR trap below apply inside functions and subshells too.
set -o errexit -o errtrace -o pipefail
# On an error, print the script name, line number, failing command and its
# exit status to stderr.
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR
18
# SERVICE_RESULT must be present (and non-empty) in the environment; it is
# compared against "success" later in the script.
if [[ ! $SERVICE_RESULT ]]; then
  # diagnostics belong on stderr, not stdout
  echo "$0: error, no SERVICE_RESULT var." >&2
  exit 1
fi

# First positional argument: the service name, used both in the per-service
# file name and as the metric's name="" label.
ser_name="$1"

if [[ ! $ser_name ]]; then
  echo "$0: error, no arg service name passed." >&2
  exit 1
fi

# Directory the metric files are written into. If it does not exist there is
# nothing to do, and that is not treated as an error.
dir=/var/lib/prometheus/node-exporter
if [[ ! -e $dir ]]; then
  exit 0
fi
35
# We have to merge files due to this:
# https://github.com/prometheus/node_exporter/issues/1885
# or else we could put the label in the metric name, but that
# is a bad practice.
# Note, I found this https://github.com/hansmi/prometheus-textformat-merge
# but it seems overkill.
#
# f is this service's own one-line metric file; ftmp is a pid-suffixed
# scratch name used when rewriting it.
f=$dir/${ser_name}-result-fail.premerge
ftmp=$f.$$

# Print the failure count to record next, given an existing premerge file.
# Arguments: $1 - path to the premerge file
# Outputs:   previous count + 1, or 0 if the stored count is missing or
#            not a plain decimal number
# Returns:   non-zero only if nothing at all could be read from the file
next_fail_count() {
  local -a fields=()
  local prev=''
  # read returns non-zero when the line has no trailing newline but still
  # fills the array, so only give up when nothing was read at all.
  read -r -a fields <"$1" || (( ${#fields[@]} )) || return 1
  if (( ${#fields[@]} )); then
    # The value is the last whitespace-separated field; taking the last one
    # (rather than "everything after the first blank") keeps working when
    # the label value itself contains a space.
    prev=${fields[${#fields[@]}-1]}
  fi
  case $prev in
    ''|*[!0-9]*)
      printf '0\n'
      ;;
    *)
      # 10# forces base 10 so a value with leading zeros is not read as octal
      printf '%s\n' "$(( 10#$prev + 1 ))"
      ;;
  esac
}

# Decide whether the metric needs (re)writing and with what value:
# - file exists and is non-empty, service did not succeed: bump the count
# - file exists and is non-empty, service succeeded: leave everything alone
# - file missing or empty: initialize it with 0, whatever the result was
write_count=false
if [[ -s $f ]]; then
  # https://www.freedesktop.org/software/systemd/man/systemd.exec.html
  if [[ $SERVICE_RESULT != success ]]; then
    write_count=true
    count=$(next_fail_count "$f")
  fi
else
  count=0
  write_count=true
fi
65
# Format a whole number of milliseconds (0-999) as a fractional-seconds
# argument for sleep, e.g. 7 -> 0.007.
# The zero padding matters: without it 7 would become "0.7", which is
# 700ms rather than 7ms.
# Arguments: $1 - milliseconds, 0-999
# Outputs:   "0.NNN" on stdout
ms_to_sleep_arg() {
  printf '0.%03d\n' "$1"
}

# Compare against the literal instead of executing the variable's value as
# a command; an unset or unexpected value then simply means "do nothing".
if [[ $write_count == true ]]; then
  # Write this service's metric line to the pid-suffixed temp name, then
  # move it over the real per-service file.
  printf 'node_systemd_unit_result_fail_count{name="%s"} %s\n' "$ser_name" "$count" >"$ftmp"
  mv "$ftmp" "$f"
  finaltmp=$dir/sysd-result-fail.prom.$$
  # I had a minor problem that multiple of these scripts executed at the same time, and ended up with
  # multiple instances of the same file
  # ls -lai
  # ...
  # 3896242 -rw-r--r-- 1 root root 439 Apr 4 00:02 sysd-result-fail.prom
  # 3896242 -rw-r--r-- 1 root root 439 Apr 4 00:02 sysd-result-fail.prom
  # 3896242 -rw-r--r-- 1 root root 439 Apr 4 00:02 sysd-result-fail.prom
  # this seems harmless, https://www.spinics.net/lists/linux-btrfs/msg111245.html
  # however, let's do a bit to avoid it: sleep a random amount of time
  # from 0 to 0.499 seconds.
  sleep "$(ms_to_sleep_arg "$(( RANDOM % 500 ))")"
  # Concatenate every service's premerge file into one temp file, then move
  # it into place as the merged .prom file.
  cat "$dir"/*-result-fail.premerge >"$finaltmp"
  mv "$finaltmp" "$dir/sysd-result-fail.prom"
fi