--- /dev/null
+#!/bin/bash
+
+if ! test "$BASH_VERSION"; then echo "error: shell is not bash" >&2; exit 1; fi
+shopt -s inherit_errexit 2>/dev/null ||: # ignore fail in bash < 4.4
+set -eE -o pipefail
+trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" exit status: $?, PIPESTATUS: ${PIPESTATUS[*]}" >&2' ERR
+
+
+host=$1
+
+case $host in
+ je)
+ ip6=2001:ba8:1f1:f09d
+ ip4=85.119.82.128
+ ;;
+ bk)
+ ip6=2001:ba8:1f1:f0c9
+ ip4=85.119.83.50
+ ;;
+esac
+
+debconf-set-selections <<'EOF'
+locales locales/default_environment_locale select en_US.UTF-8
+locales locales/locales_to_be_generated multiselect en_US.UTF-8 UTF-8
+EOF
+
+# /a/bin/fai/fai/config/hooks/updatebase.UBUNTU
+debconf --owner=locales sh -c '
+ . /usr/share/debconf/confmodule
+ db_version 2.0
+ db_get locales/locales_to_be_generated &&
+ mkdir -p /var/lib/locales/supported.d &&
+ echo "$RET" > /var/lib/locales/supported.d/local'
+dpkg-reconfigure -fnoninteractive locales
+
+apt -y remove --purge --auto-remove netplan.io libnetplan0
+apt update
+apt -y install linux-virtual-hwe-20.04 grub-pc-bin openssh-server ifupdown rsync
+mkdir -p /root/.ssh
+chmod 700 /root/.ssh
+cat >/root/.ssh/authorized_keys <<'EOF'
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDX42yru/h6r6UDRy/VwVZjcYEmNLG5/SUjv7xwu43OaW0wL+uHYg2rkfn4Ygh5o1I5pgBh2SWg8TeWuVGhgL1SCuBzzeai/+58Sny065Qak+D2WjVOuRonRelh+CBA5EpNZPuiWQkoWdf9NACTBCbS2Zu7r8OOgRqu/ruaDNePlG5+U0Wlpy3oBnpbzQiuSA3AKMW30fsCJtOBjz5qQaiPbYEKJy3AOvtbq10wliKx9TpsTzrq8dKWs7PLhZnzqVCsaq6D95IzjqXcSpx4Cga5bn+YEuAnJQ53PGA5eO+hpz6HDmawTbJlaV/Dufb9bJ/ZZy1DXzs07yWRtTEY54/X ian@iankelling.org
+EOF
+
+
+# todo update this and hostname depending on host
+cat >/etc/network/interfaces <<EOF
+auto lo
+iface lo inet loopback
+
+# The primary network interface
+auto eth0
+iface eth0 inet static
+ address $ip4/21
+ gateway 85.119.80.1
+
+iface eth0 inet6 static
+ address $ip6::2
+ netmask 64
+ gateway $ip6::1
+ post-up echo 0 > /proc/sys/net/ipv6/conf/default/accept_ra
+ post-up echo 0 > /proc/sys/net/ipv6/conf/all/accept_ra
+ post-up echo 0 > /proc/sys/net/ipv6/conf/$IFACE/accept_ra
+ post-up echo 0 > /proc/sys/net/ipv6/conf/default/autoconf
+ post-up echo 0 > /proc/sys/net/ipv6/conf/all/autoconf
+ post-up echo 0 > /proc/sys/net/ipv6/conf/$IFACE/autoconf
+EOF
+
+cat >/etc/fstab <<'EOF'
+/dev/xvda1 / ext4 noatime,nodiratime 0 1
+/dev/xvdb1 none swap nofail,x-systemd.device-timeout=30s,x-systemd.mount-timeout=30s,sw 0 0
+EOF
+
+cat >> /etc/default/grub <<'EOF'
+GRUB_CMDLINE_LINUX_DEFAULT=""
+GRUB_CMDLINE_LINUX="console=hvc0"
+EOF
+
+update-grub
+
+cat >/etc/systemd/resolved.conf.d/servers.conf <<'EOF'
+[Resolve]
+DNS=85.119.80.232 85.119.80.233
+Domains=~.
+EOF
+
+cat >/etc/hostname <<EOF
+$host
+EOF
--- /dev/null
+#!/bin/bash
+
+# meant to be copy/pasted
+
+ssh bk@bk.console.bitfolk.com
+# or
+ssh iankelling@iankelling.console.bitfolk.com
+
+destroy
+rescue
+sudo -i
+mkdir -p /root/.ssh
+chmod 700 /root/.ssh
+cat >/root/.ssh/authorized_keys <<'EOF'
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDX42yru/h6r6UDRy/VwVZjcYEmNLG5/SUjv7xwu43OaW0wL+uHYg2rkfn4Ygh5o1I5pgBh2SWg8TeWuVGhgL1SCuBzzeai/+58Sny065Qak+D2WjVOuRonRelh+CBA5EpNZPuiWQkoWdf9NACTBCbS2Zu7r8OOgRqu/ruaDNePlG5+U0Wlpy3oBnpbzQiuSA3AKMW30fsCJtOBjz5qQaiPbYEKJy3AOvtbq10wliKx9TpsTzrq8dKWs7PLhZnzqVCsaq6D95IzjqXcSpx4Cga5bn+YEuAnJQ53PGA5eO+hpz6HDmawTbJlaV/Dufb9bJ/ZZy1DXzs07yWRtTEY54/X ian@iankelling.org
+EOF
+apt update
+apt -y install openssh-server
+
+
+##### in another terminal ######
+
+host=je
+scp /b/ds/bitfolk* root@$host.b8.nz:
+ssh root@$host ./bitfolk-rescue-install $host
+
+### back to the 1st terminal
+
+poweroff
+boot
+
+# press ctrl ]
+exit
+
+jepush
+# todo: lets copy the host keys around so we dont have to do this.
+khfix je
+sl root@je /a/bin/ds/distro-begin
+
+# todo, fix it so i can ssh to
+sl je /a/bin/ds/distro-begin
+sl je /a/bin/ds/distro-end
--- /dev/null
+#!/bin/bash
+
+# assumes we've partitioned /dev/xvda1
+#
+# Usage: $0 HOST
+#   HOST  passed on to the chroot install step at the end of this script.
+
+if ! test "$BASH_VERSION"; then echo "error: shell is not bash" >&2; exit 1; fi
+shopt -s inherit_errexit 2>/dev/null ||: # ignore fail in bash < 4.4
+set -eE -o pipefail
+trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" exit status: $?, PIPESTATUS: ${PIPESTATUS[*]}" >&2' ERR
+
+# Check the argument before the destructive wipefs/mkfs below, instead of
+# finding out it is missing only after the disk has been wiped.
+if [[ ! $1 ]]; then
+  echo "$0: error: missing host argument" >&2
+  exit 1
+fi
+
+# already did this to ssh in
+#apt update
+apt install -y mmdebstrap
+
+wipefs -a /dev/xvda1
+
+mkfs.ext4 /dev/xvda1
+
+
+#apt-key export B138CA450C05112F
+cat >trisquel.key <<'EOF'
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQINBFhxcQMBEADaT9jTxXNbmjx7kZdTK7JCFx2OAeSS0+XltJnGOPnd6Vj4W1u2
+QLReYD1rVVYA8kneT3VjvZpKO/Ho3TbQZ4E8hNDEwaVmKnTfrmptIzv44cHJexsZ
+eTol74rac/pC/oXCHGINWLflfyQt8iB0dGSEPjtDXvYNFTwBYrQDI9apO5JVWvCB
+qLaQdXU+MCsTaD5OZ4bJ2dQleI34UGea+NrrnAZP68d5hsLL+WTa65fhB3Bws8zc
+1v+JhVJhLYNQOcTHRXU7ieUN2zy+JzbD/10XV64YZQseEkhXG8LMRvfbTDD/SigD
+wKViVFkDa7NFfmpOE9kaF6Nh/XrWmMJjG49KtYUln/G2XCV4TpJrbrWW0OxGqki+
+HH/36N63CZR4lj0EENFQduikTq1LeyQBXQSccwH9FJEI/4Uz5uuVM+tviScmgWEC
+YXo7PdCoUUJDiR6Uma4COAYOTHM/7C5wVofkH1mq5fSz6rbBNIDIoy/W6GrN41s0
+WOl+z9ZgNlY1qtvpwSDenY9WERLajUAJKVTAwI2mvIETo+q8b2J8iIlfI8KkW/+q
+qt87BkFmo2Acgh9l0O1j/Ysp5p7OtCjz8uuO+WzbBo6RlXafx/9e6QoatbtwSBUp
+4W3w2/VANRYtL5DyDCbcuwMk77r9jXp1913sTFlin4xIs840gcVahetA+QARAQAB
+tDFUcmlzcXVlbCBHTlUvTGludXggPHRyaXNxdWVsLWRldmVsQHRyaXNxdWVsLmlu
+Zm8+iQI2BBMBCgAhBQJYcXEDAhsDBQsJCAcDBRUKCQgLBRYDAgEAAh4BAheAAAoJ
+ELE4ykUMBREv6NUP9jTl1CZKHqL3NmF2Df/ID+za7YO5IebxbzRC9vkjrWSuoMpw
+xJ/U5pBCsz0bDewJXMx0XeSNgo/WAzWoPmh3PTUXKhLjlGS2DII96XXbMy2zi+3r
++apIY3wedYkC/BiK9w8pGXGGlTXTo9zitWZC2/yWC4I9W2818mMJPXpQjvkzqdwU
+UlV54fpnqoMNsLFEa1w1ahDerdCTe0Azrr+3YrKaQ287MpkWwO/Cf/yYg0UhbDih
+FFMZ4Wa9aywvxQ86khghOafLLkIHcakMBdVRodym00bGeAjeNHnMffDi3k8tfejk
+g6iLVrZf21+KsVfV+PLX4QQsPCR/dlneKKCAEPh+awserncssizx2/ujhvTd7z3l
+tXGG7UcQP7fYTBWNkU7+ddMOWp26hOsINt0NyxhiGT2ZPEy1vpZ1H80rlaMkiISH
+Z56SCfcHGuEHlkDKdz7ZsS8gU+zqMAnNEDb6UrqZZbhJcR3N7DsTQC/okoF8egDM
+nHD9pUdDakPungnt6j7eLA6Ogca58mbIvwsQ+Qn9Urcd0m30to6WCTlj4jKsrMy/
+QtbyVSc/G3PZXVqP8xWIpuZtu0eMx+NjCKWmOYvTgIjbjLp0z801Weexn9uva+1z
+5nRy+00aOddoLhtXqNFxNS94gXvH3D6ZJ2ejADooEBiqk1M+KWFtOsW01QG5Ag0E
+WHFxAwEQALAKjsSSREoYjswMG1/znBkoNz199DkKJ3DnOk5NulkZcAoHeQVLnv2M
+/1qycG4ndoIkINdz37fKdFziEJd4cBSQ+3gNollaxM2x7KdF7M13Z4YgFgg40qxO
+8id8CSLga2klnFU8aa5PtRPYd4XZ5azpxzCRF8u+1ojM+rLAO0hKLGDhBqjKFvG4
+ASeX14F1R2yiGvZU2lQKQu2ZIk9IqN7M0IsCEh1O8+GNd7lCTFyvAYK0ai0dg9Q5
+F6X2YvQVYDik7rOuP6D6oUmGXufi2vc0OxFX5dBHa4z7XR0BRzg9VtkUerHSbVPI
+c+3mgG5+QmlD+3NKYqiTZvKOWQbgKD/Pg0E0hqw8IjSThmge8XQcTh6qhW8ww0Gz
+ha6HN0At5kMGbQqsTARjfgjhJZdyjA09NGYu1KVKDrKMrN9le9tO05ztZeP3y5My
+S9LaWDE6Flm0BBqkkrHDk+9ID/qDixe/3ZCppu3dJsCF8aaG+sIQjxlMAeXtKOLl
+ZuQbPaVJbQXElwZo3Nz20N2RAZJLXycCev7EbC8Afpg6TYjlJyJX9uyKxDv+QORG
+RJ5vFA4evNCmUrS0PpcodJxk5TMSuR9vRuvT1jVVMe18T1F74XRqTW6xizC7EEM8
+X5QLDuVMRErSUPfcNYLTGJAvPTQ/EgU5aK+H1qv1EEbXeMiuksdvABEBAAGJAh8E
+GAEKAAkFAlhxcQMCGwwACgkQsTjKRQwFES/C0A//aT9JDbwF4JYgyxQuPuxb8G/e
+9thHNBhPmGL7gpyGzUW3q/c6HHnFxT7YPA37fsN/JD9Mcdx2rRFhz0XVR6cfdQZy
+299s2/aX4Tu4FbMnmM+Du5uFFgStJA7LjaacHn6MxEohUeZAL7LMYiUovbwnsaiP
+0sPhLaMrOQkRL/9mEKJiNbn6r/xX4xegzYNqoNdDKbcARaAzm5AH03Mmbc7Ss+OZ
+4v/7vlcUnyEZ2c4jazP7W+pGWIw9f3SqnIxuCeDrCD35IFsUrE27dbtaNpkKw9zF
+lfaEC+6PAI7M78gg2RNvaurCJR5B7bENrobf0lxbYGLGFcOIqTXkbuWjjO3eI/5Q
+rmnO8Uy41Zos03Gsa6QkQ4p6OtVN4hHLxXkirs31cIocPqiJ7Vi+OH8stMNukvVT
+dgnuw4dbPEhDnrFREDNSuRtV+2Lxl4JLr7gQUQDZKEf8cYZUAdN69dcW48Ugdvgu
+6cRDVWakfim6kvZiQ0vxGxGM02V3RdhhZqrwXXYUPyyWMW230IjYc9cYQ+3C/1K4
+MUUeMjKDMPQ/jlUiMjZeE+X0W/TaUj8uCOJ4M6+oYMqwUECPSFe9Of7VTKhB3+Ex
+wGEtYWJUfhuYu8Tph2GZmud0vz4+ugpkliFVliGJfPPJ1EfgAAiUUvomoIXKsynV
+McDbwCjFQn2iazszZsg=
+=UAIm
+-----END PGP PUBLIC KEY BLOCK-----
+EOF
+
+apt-key add trisquel.key
+
+mount /dev/xvda1 /mnt
+
+# Bootstrap nabia into /mnt. The "-" makes mmdebstrap read the apt
+# sources from the heredoc on stdin.
+mmdebstrap nabia /mnt - <<'EOF'
+deb http://archive.trisquel.org/trisquel/ nabia main
+deb-src http://archive.trisquel.org/trisquel/ nabia main
+
+deb http://archive.trisquel.org/trisquel/ nabia-updates main
+deb-src http://archive.trisquel.org/trisquel/ nabia-updates main
+
+deb http://archive.trisquel.info/trisquel/ nabia-security main
+deb-src http://archive.trisquel.info/trisquel/ nabia-security main
+
+deb http://archive.trisquel.org/trisquel/ nabia-backports main
+deb-src http://archive.trisquel.org/trisquel/ nabia-backports main
+EOF
+
+cd /mnt
+
+# Bind mount /dev, /proc, /sys and /dev/pts onto the same relative paths
+# under the current directory, skipping any that are already mountpoints.
+chrbind() {
+  local d
+  # dev/pts needed for pacman signature check
+  for d in dev proc sys dev/pts; do
+    # acts as an assertion: with set -e, a missing directory aborts here
+    [[ -d $d ]]
+    if ! mountpoint "$d" &>/dev/null; then
+      mount -o bind "/$d" "$d"
+    fi
+  done
+}
+chrbind
+
+host=$1
+cp /root/bitfolk-chroot-install /mnt
+chroot . /bitfolk-chroot-install "$host"
+
+poweroff
+
+# NOTE(review): "boot" used to be run here as a shell command. It is
+# presumably a console command to type after this machine has powered
+# off, not a program available in this shell, where it would only fail
+# with command not found and trip the ERR trap -- confirm.
+#boot
if [[ $HOSTNAME == $MAIL_HOST ]]; then
mkdir -p /p/bkbackup
for ncdir in /var/www/ncexpertpath /var/www/ncninja; do
+ if [[ ! -d $ncdir ]]; then
+ continue
+ fi
ncbase=${ncdir##*/}
mkdir -p /p/bkbackup/$ncbase
ssh root@$host sudo -u www-data php $ncdir/occ -q maintenance:mode --on
rsync -ra --exclude=testignore --delete root@$host:$ncdir/{config,data,themes} /p/bkbackup/$ncbase || ret=$?
ssh root@$host sudo -u www-data php $ncdir/occ -q maintenance:mode --off
if (( ret )); then
- echo "$0: error: failed rsync $ncdir"
- ret=1
+ echo "$0: error: failed rsync $ncdir"
+ ret=1
fi
done
rsync -ra --delete root@$host:/m /p/bkbackup
c -
}
+# Two pass encode of a video to vp9 video scaled to 720 high at 750K,
+# with vorbis audio.
+#
+# usage: vp9 INPUT [OUTPUT]
+# INPUT and OUTPUT are relative to the current directory. OUTPUT defaults
+# to vp9/INPUT.
+# Prints the scratch directory used, and the original directory on the
+# way back. Returns the exit status of the encode.
+vp9() {
+  local src dst ret=0
+  src=$PWD/$1
+
+  if [[ $2 ]]; then
+    dst=$PWD/$2
+  else
+    dst=$PWD/vp9/$1
+  fi
+  # ffmpeg does not create a missing output directory, eg the default vp9/
+  mkdir -p "${dst%/*}" || return
+  # Work in a scratch directory, presumably so the first pass log file
+  # does not land in the directory we were called from.
+  cd "$(mktemp -d)" || return
+  pwd
+  ffmpeg -threads 0 -i "$src" -g 192 -vcodec libvpx-vp9 -vf scale=-1:720 -max_muxing_queue_size 9999 -b:v 750K -pass 1 -an -f null /dev/null && \
+    ffmpeg -y -threads 0 -i "$src" -g 192 -vcodec libvpx-vp9 -vf scale=-1:720 -max_muxing_queue_size 9999 -b:v 750K -pass 2 -c:a libvorbis -qscale:a 5 "$dst" || ret=$?
+  cd - || return
+  return $ret
+}
+
+# usage: utcl HOUR
+# Prints HOUR shifted by the whole hours of the local utc offset, modulo 24.
+# The minutes of the offset are dropped.
+utcl() { # utc 24 hour time to local hour 24 hour time
+  local offset
+  # date +%z prints +hhmm or -hhmm. Drop the minutes and a leading zero of
+  # the hours but keep the sign, eg: -0500 -> -5, +0100 -> +1, +0000 -> +0.
+  # The zero has to go for + offsets too: python3 rejects literals like 01.
+  offset=$(date +%z | sed -r 's/..$//;s/^([+-])0*([0-9])/\1\2/')
+  echo "print( ($1 $offset) % 24)"|python3
+}
# c. better cd
if type -p wcd &>/dev/null; then
done
}
bindpushb8() {
- dsign iankelling.org expertpathologyreview.com zroe.org amnimal.ninja
lipush
for h in li bk; do
m sl $h <<'EOF'
source ~/.bashrc
-m dnsup
m dnsb8
EOF
done
}
dnsup() {
- conflink
+ conflink -f
m ser reload bind9
}
dnsb8() {
local f=/var/lib/bind/db.b8.nz
ser stop bind9
+ sleep 1
sudo rm -fv $f.jnl
sudo install -m 644 -o bind -g bind /p/c/machine_specific/vps/bind-initial/db.b8.nz $f
ser restart bind9
PostUp = ping -c1 10.8.0.1 ||:
[Peer]
-# li
-PublicKey = zePGl7LoS3iv6ziTI/k8BMh4L3iL3K2t9xJheMR4hQA=
+# li. called wgmail on that server
+PublicKey = CTFsje45qLAU44AbX71Vo+xFJ6rt7Cu6+vdMGyWjBjU=
AllowedIPs = 10.8.0.0/24
Endpoint = 72.14.176.105:1194
PersistentKeepalive = 25
umask $umask_orig
# old approach. systemd seems to work fine and cleaner.
rm -f ../network/interfaces.d/wghole
- cedit -q $host /p/c/machine_specific/li/filesystem/etc/wireguard/wghole.conf <<EOF || [[ $? == 1 ]]
+ cedit -q $host /p/c/machine_specific/li/filesystem/etc/wireguard/wgmail.conf <<EOF || [[ $? == 1 ]]
[Peer]
PublicKey = $(cat hole-pub.key)
AllowedIPs = 10.8.0.$ipsuf/32
if [[ -t 0 ]]; then
exim -t <<EOF
From: alertme@b8.nz
-To: daylerts@iankelling.org
+To: daylert@iankelling.org
Subject: $*
EOF
else
read sub
{ cat <<EOF
From: alertme@b8.nz
-To: daylerts@iankelling.org
+To: daylert@iankelling.org
Subject: $sub
EOF
if torsocks wget -q "$url"; then
alertme $tmpdir
fi
- sleep 600 + $(( RANDOM % 300 ))
+ sleep $(( 600 + RANDOM % 300 ))
done
}
sudo systemd-tty-ask-password-agent
}
+# List /run/user/1000, then set it to mode 700 with owner and group iank.
+ufix() {
+  ls -lad /run/user/1000
+  # owner:group. The owner.group form is obsolescent and newer chown
+  # versions warn about it.
+  s chmod 700 /run/user/1000; s chown iank:iank /run/user/1000
+}
+
# systemctl is-enabled / status / cat says nothing, instead theres
# some obscure symlink. paths copied from man systemd.unit.
# possibly also usefull, but incomplete, doesnt show units not loaded in memory:
dusage="5 10"
musage="5"
-e() { echo "cron: $*"; "$@"; }
+# Print the given command line prefixed with "cron: ", then run it,
+# unless $dryrun (which holds the command name true or false) succeeds.
+e() {
+ echo "cron: $*"
+ if ! $dryrun; then
+ "$@"
+ fi
+}
check-idle() {
type -p xprintidle &>/dev/null || return 0
usage() {
cat <<EOF
-Usage: ${0##*/} args
+Usage: ${0##*/} [ARGS]
Do btrfs maintence or stop if xprintidle shows a user
force Run regardless of user idle status
force=false
check=false
+dryrun=false
if [[ $1 ]]; then
case $1 in
check)
force)
force=true
;;
+ dryrun)
+ dryrun=true
+ ;;
*)
echo "$0: error: unexpected arg" >&2
usage 1
fi
if ! $idle; then
- btrfs scrub cancel $mnt &>/dev/null ||:
- continue
+ if $dryrun; then
+ echo "$0: not idle. if this wasnt a dry run, btrfs scrub cancel $mnt"
+ else
+ btrfs scrub cancel $mnt &>/dev/null ||:
+ continue
+ fi
fi
if $check; then
continue
e ionice -c 3 btrfs balance start -musage=$usage $mnt
done
fi
- # e btrfs filesystem df $mnt
- # e df -H $mnt
- date=$(
- btrfs scrub status $mnt | \
- sed -rn 's/^\s*scrub started at (.*) and finished.*/\1/p'
- )
+ date=
+ scrub_status=$(btrfs scrub status $mnt)
+ if printf "%s\n" "$scrub_status" | grep -i '^status:[[:space:]]*finished$' &>/dev/null; then
+ date=$(printf "%s\n" "$scrub_status" | sed -rn 's/^Scrub started:[[:space:]]*(.*)/\1/p')
+ fi
+ if [[ ! $date ]]; then
+ # output from older versions, at least btrfs v4.15.1
+ date=$(
+ printf "%s\n" "$scrub_status" | \
+ sed -rn 's/^\s*scrub started at (.*) and finished.*/\1/p'
+ )
+ fi
if [[ $date ]]; then
+ if $dryrun; then
+ echo "$0: last scrub finish for $mnt: $date"
+ fi
date=$(date --date="$date" +%s)
# if date is sooner than 90 days ago
# the wiki recommends 30 days or so, but
# it makes the comp lag like shit for a day,
# so I'm going with 90 days.
if (( date > $(date +%s) - 60*60*24*30 )); then
- echo "cron: skiping scrub of $mnt"
+ if $dryrun; then
+ echo "$0: skiping scrub of $mnt, last was $(( ($(date +%s) - date) / 60/60/24 )) days ago, < 30 days"
+ fi
continue
fi
fi
if [[ -s $statefile ]]; then
logsec=$(date +%s -d "$(head -n1 $statefile | awk '{print $1,$2}')")
nowsec=$(date +%s)
- if (( logsec < nowsec - 60*60*48 )); then
- echo $0: host $h ssh /usr/local/bin/check-mailq fail for over 48 hours
+ if (( logsec < nowsec - 60*60*20 )); then
+ echo $0: host $h ssh /usr/local/bin/check-mailq fail for over 20 hours
fi
fi
printf "%s\n" "$c" | ts "%F %T" >> $statefile
for dir in "$@"; do
fs=$dir/filesystem
if [[ -e $fs && $user =~ ^iank?$ ]]; then
- cmd=( s rsync -aiSAX --chown=root:root --chmod=g-s
+ # we dont want t, instead c for checksum.
+ # That way we dont set times on directories.
+ # -a = -rlptgoD
+ cmd=( s rsync -rclpgoDiSAX --chown=root:root --chmod=g-s
--exclude=/etc/dovecot/users
--exclude='/etc/exim4/passwd*'
--exclude='/etc/exim4/*.pem'
case $file in
etc/prometheus/rules/iank.yml)
case $HOSTNAME in
- kd) m s systemctl reload prometheus ;;
+ kd)
+ if systemctl is-active prometheus &>/dev/null; then
+ m s systemctl reload prometheus
+ fi
+ ;;
esac
;;
etc/systemd/system/*)
done < <("${cmd[@]}")
fi
- if [[ -e $dir/subdir_files ]]; then
+ if ! $fast && [[ -e $dir/subdir_files ]]; then
m subdir-link-r $dir/subdir_files
fi
local x=( $dir/!(binds|subdir_files|filesystem|machine_specific|..|.|.#*) )
for f in /etc/prometheus-{,export-}htpasswd; do
if [[ -e $f ]]; then
s chmod 640 $f
- if getent passwd www-data; then
+ if getent passwd www-data &>/dev/null; then
s chown root:www-data $f
fi
fi
if [[ -e $f ]]; then
# note: this is duplicative of the file's own permissions
s chmod 640 $f /etc/prometheus-pass
- if getent passwd prometheus; then
+ if getent passwd prometheus &>/dev/null; then
s chown root:prometheus $f
fi
fi
-
-
##### end special extra stuff #####
- m sudo -H -u user2 "${BASH_SOURCE[0]}"
+ if ! $fast; then
+ m sudo -H -u user2 "${BASH_SOURCE[0]}"
+ fi
f=/a/bin/distro-setup/system-status
if [[ -x $f ]]; then
### arg parsing
recompile=false
-emacs=true
+emacs=false
+if [[ -e /a/opt/emacs ]]; then
+ emacs=true
+fi
while [[ $1 == -* ]]; do
case $1 in
-r) recompile=true; shift ;;
set +x
source /a/bin/distro-functions/src/identify-distros
$interactive || set -x
-for f in kd x2 x3 frodo tp li bk je demohost kw; do
+for f in kd x2 x3 frodo tp li bk je demohost kw sy bo; do
eval "$f() { [[ $HOSTNAME == $f ]]; }"
done
codename=$(debian-codename)
/^127\.0\.1\.1/d
EOF
+if bitfolk; then
+ sudo systemctl disable systemd-networkd
+fi
+
##### exit first stage if running as root
if [[ $EUID == 0 ]]; then
echo "$0: running as root. exiting now that users are setup"
EOF
done
+###### link files
+# convenient to just do all file linking in one place
+sudo /a/exe/lnf -T /a/bin /b
+sudo /a/exe/lnf -T /a/f /f
+sudo /a/exe/lnf -T /var/log/exim4 /el
+sudo /a/exe/lnf -T /a/f/ans /c
+sudo /a/exe/lnf -T /nocow/t /t
+if has_p; then
+ lnf -T /p/News ~/News
+fi
+dirs=(/q/root /q/root/.editor-backups /q/root/.undo-tree-history)
+sudo mkdir -p ${dirs[@]}
+sudo chmod 600 ${dirs[@]}
+sudo /a/exe/lnf /q/root/.editor-backups /q/root/.undo-tree-history \
+ /a/opt /a/c/.emacs.d $HOME/mw_vars /k/backup /root
+/a/bin/ds/install-my-scripts # needed for rootsshsync cronjob
+sudo /a/exe/lnf /a/c/.vim /a/c/.vimrc /a/c/.gvimrc /root
+
+
###### do conflink
# vps needs bind group before conflink
if vps; then
source /etc/rootsudoenv
fi
+
+
###### bash environment setup
set +x
err-allow
esac
-###### link files
-# convenient to just do all file linking in one place
-sudo /a/exe/lnf -T /a/bin /b
-sudo /a/exe/lnf -T /a/f /f
-sudo /a/exe/lnf -T /var/log/exim4 /el
-sudo /a/exe/lnf -T /a/f/ans /c
-sudo /a/exe/lnf -T /nocow/t /t
-if has_p; then
- lnf -T /p/News ~/News
-fi
-dirs=(/q/root /q/root/.editor-backups /q/root/.undo-tree-history)
-sudo mkdir -p ${dirs[@]}
-sudo chmod 600 ${dirs[@]}
-sudo /a/exe/lnf /q/root/.editor-backups /q/root/.undo-tree-history \
- /a/opt /a/c/.emacs.d $HOME/mw_vars /k/backup /root
-/a/bin/ds/install-my-scripts # needed for rootsshsync cronjob
-sudo /a/exe/lnf /a/c/.vim /a/c/.vimrc /a/c/.gvimrc /root
-
-
-
#### arch specific early packages
case $(distro-name) in
Pin: release n=bionic
Pin-Priority: 500
EOF
-
+ ;;
+ nabia)
+ sd /etc/apt/preferences.d/chromium-bullseye <<EOF
+Package: chromium chromium-* libicu67 libjpeg62-turbo libjsoncpp24 libre2-9 libwebpmux3
+Pin: release o=Debian*,n=bullseye*
+Pin-Priority: 500
+EOF
;;
esac
if [[ ! -e $f ]]; then
dnsb8
fi
+
+ pi prometheus-node-exporter
+
+ # ex for exporter
+ web-conf -p 9101 -f 9100 - apache2 ${HOSTNAME}ex.b8.nz <<'EOF'
+<Location "/">
+AuthType Basic
+AuthName "basic_auth"
+# created with
+# htpasswd -c prometheus-export-htpasswd USERNAME
+AuthUserFile "/etc/prometheus-export-htpasswd"
+Require valid-user
+</Location>
+EOF
;;&
+
bk)
sgo wg-quick@wgmail
m /a/bin/buildscripts/misc
m /a/bin/buildscripts/pithosfly
#m /a/bin/buildscripts/alacritty
-m /a/bin/buildscripts/kitty
+#m /a/bin/buildscripts/kitty
pi-nostart virtinst virt-manager
soff libvirtd
grub-pc grub-pc/install_devices multiselect ${devs[*]}
EOF
-# btrfs maintenance
+
+sysd-prom-fail-install dynamicipupdate
+sysd-prom-fail-install systemstatus
+sysd-prom-fail-install btrfsmaintstop
sgo btrfsmaint.timer
sgo btrfsmaintstop
sgo systemstatus
kd)
# ive got these + a needed dependency pinned to bullseye, just to get
# versions more in line with the main docs.
- pi prometheus-alertmanager prometheus prometheus-node-exporter
+ # Font awesome is needed for the alertmanager ui.
+ pi prometheus-alertmanager prometheus prometheus-node-exporter fonts-font-awesome
web-conf -p 9091 -f 9090 - apache2 i.b8.nz <<'EOF'
<Location "/">
AuthType Basic
Require valid-user
</Location>
EOF
+    # by default, the alertmanager web ui is not enabled other than a page
+    # that suggests to use the amtool cli. that tool is good, but you cant
+    # silence things nearly as fast.
+    if [[ ! -e /usr/share/prometheus/alertmanager/ui/index.html ]]; then
+      # NOTE(review): this used to be "chroot DIR prometheus-alertmanager",
+      # which executes the daemon inside the chroot: it either fails as not
+      # found or blocks in the foreground. Presumably the intent was to
+      # install the package so that generate-ui.sh exists there -- confirm.
+      sudo chroot /nocow/schroot/bullseye apt-get -y install prometheus-alertmanager
+      sudo chroot /nocow/schroot/bullseye /usr/share/prometheus/alertmanager/generate-ui.sh
+      # copy the generated ui out of the chroot to the same path on this system
+      sudo rsync -avih /nocow/schroot/bullseye/usr/share/prometheus/alertmanager/ui/ /usr/share/prometheus/alertmanager/ui
+      ser restart prometheus-alertmanager
+    fi
+
+ for ser in prometheus-node-exporter prometheus-alertmanager prometheus; do
+ sysd-prom-fail-install $ser
+ done
+
;;
*)
pi prometheus-node-exporter
# either use iptables or, in
# /etc/default/prometheus-node-exporter
# listen on the wireguard interface
- li|je|bk)
- # ex for exporter
- web-conf -p 9101 -f 9100 - apache2 ${HOSTNAME}ex.b8.nz <<'EOF'
-<Location "/">
-AuthType Basic
-AuthName "basic_auth"
-# created with
-# htpasswd -c prometheus-export-htpasswd USERNAME
-AuthUserFile "/etc/prometheus-export-htpasswd"
-Require valid-user
-</Location>
-EOF
- ;;
*)
wgip=$(command sudo sed -rn 's,^ *Address *= *([^/]+).*,\1,p' /etc/wireguard/wghole.conf)
+ # old filename. remove once all hosts are updated.
+ s rm -fv /etc/apache2/sites-enabled/${HOSTNAME}wg.b8.nz.conf
web-conf -i -a $wgip -p 9101 -f 9100 - apache2 ${HOSTNAME}wg.b8.nz <<'EOF'
<Location "/">
AuthType Basic
Require valid-user
</Location>
EOF
- ;;
+ # For work, i think we will just use the firewall for hosts in the main data center, and
+ # apache/nginx + tls + basic auth outside of it. or consider stunnel.
+
+
+ # TODO: figure out how to detect the ping failure and try again.
+
+    # Binding to the wg interface, it might go down, so always restart, and wait for it on boot.
+    # -p so that running this again does not fail once the directory exists.
+    s mkdir -p /etc/systemd/system/apache2.service.d
+    sd /etc/systemd/system/apache2.service.d/restart.conf <<EOF
+[Unit]
+After=wg-quick@wghole.service
+StartLimitIntervalSec=0
+
+[Service]
+Restart=always
+RestartSec=30
+EOF
+
+ ;;
esac
### end prometheus ###
#!/bin/bash
source ~/.bashrc
-
-
main() {
fqdn=$(hostname -f)
if $athome; then
- cur4="$(dig +short $dynhost @iankelling.org | tail -1)"
+ if ! cur4="$(dig +short $dynhost @iankelling.org | tail -1)"; then
+ if [[ ! $INVOCATION_ID ]]; then
+ echo "$0: dig failed. internet looks down. giving up"
+ fi
+ return 0
+ fi
if ip4=$(curl -s4 https://iankelling.org/cgi/pubip); then
if [[ $cur4 && $ip4 && $cur4 != $ip4 ]]; then
up4=true # update ipv4
# Set the command-line arguments to pass to the server.
-ARGS="--web.listen-address=127.0.0.1:9090 --web.external-url=https://i.b8.nz:9091"
+ARGS="--web.listen-address=127.0.0.1:9090 --web.external-url=https://i.b8.nz:9091 --log.level=info"
# default:
#ARGS=""
-# iank:
-ARGS="--web.listen-address=127.0.0.1:9093"
+# from its README: If running Alertmanager in high availability mode is not
+# desired, setting --cluster.listen-address= prevents Alertmanager from
+# listening to incoming peer requests.
+# Why the fuck is that in the readme, not the docs below?
+# If you don't add that, it will fail to start on boot because
+# it doesn't wait for network.target, and gives this error message:
+# component=cluster err="couldn't deduce an advertise address: no private IP found, explicit advertise addr not provided"
+
+
+ARGS="--cluster.listen-address= --web.listen-address=127.0.0.1:9093"
# this file is from version 0.21
--- /dev/null
+{{/* iank.default.description: a section for firing alerts and a section for resolved alerts. Each section is emitted only when its list is non-empty and is rendered with the __text_alert_list template, which is not defined in this file. */}}
+{{ define "iank.default.description" }}
+{{ if gt (len .Alerts.Firing) 0 -}}
+Alerts Firing:
+{{ template "__text_alert_list" .Alerts.Firing }}
+{{- end }}
+{{ if gt (len .Alerts.Resolved) 0 -}}
+Alerts Resolved:
+{{ template "__text_alert_list" .Alerts.Resolved }}
+{{- end }}
+{{- end }}
- targets:
- kdwg:9101
- sywg:9101
- # - bk:9101
- # - je:9101
- # - li:9101
# - frodo:9101
- # - kwwg:9101
- # - x3wg:9101
- - x2wg:9101
+ - kwwg:9101
+ - x3wg:9101
+# - x2wg:9101
- targets:
- # - bk:9101
- # - je:9101
- # - li:9101
+ - bkex.b8.nz:9101
+ - jeex.b8.nz:9101
+ - liex.b8.nz:9101
groups:
- name: standard
rules:
- - alert: mailtest-check
+
+## uncomment for testing an alert firing
+# - alert: test-alert4
+# expr: vector(1)
+# # expr: nonexistent_metric
+# for: 0m
+# labels:
+# severity: day
+# annotations:
+# description: "always-firing alert VALUE = {{ $value }}"
+
+
+
+###### BEGIN MISC NOTES ######
+
+#
+# other interesting exporters
+# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
+#
+
+# interesting post: https://www.metricfire.com/blog/top-5-prometheus-alertmanager-gotchas/
+
+# interesting promql query that could be useful later.
+# changes(ALERTS_FOR_STATE[24h])
+#
+#
+#
+# alert flap strategy.
+# https://roidelapluie.be/blog/2019/02/21/prometheus-last/
+#
+# Another idea generally is to make an alert that fires for 24 hours and
+# inhibits another alert for the same thing, which we want at most
+# 1 alert per 24 hours.
+
+###### END MISC NOTES ######
+
+
+
+
+# alerting on missing metrics:
+# https://www.robustperception.io/absent-alerting-for-scraped-metrics
+# that doesnt work if we want to alert across multiple hosts, eg
+# up{job="node"} == 1 unless node_systemd_unit_state{name="systemstatus.service",state="active",job="node"}
+# however, google lead me to a solution here
+# https://www.linkedin.com/pulse/prometheus-alert-missing-metrics-labels-nirav-shah
+# there is also the absent() function, but i didnt see a way to make that work
+ - alert: mysers_units_missing
+ expr: |-
+ count(up{job="node"}) by (instance) * 3 unless count(node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"}) by (instance)
+ for: 20m
+ labels:
+ severity: warn
+
+ - alert: mysers_not_active
+ expr: |-
+ node_systemd_unit_state{name=~"(systemstatus|btrfsmaintstop|dynamicipupdate).service",state="active"} != 1
+ for: 20m
+ labels:
+ severity: warn
+
+ - alert: sysd_result_fail
+ expr: |-
+ rate(node_systemd_unit_result_fail_count[30m]) > 0
+ labels:
+ severity: day
+
+
+ - alert: mailtest_check
expr: |-
time() - mailtest_check_last_usec > 60 * 12
labels:
severity: day
annotations:
- description: '{{ $labels.instance }} mailtest-check'
- summary: '{{ $labels.instance }} mailtest-check'
+ summary: '12 minutes down'
# 42 mins: enough for a 30 min queue run plus 12
+# the summary states the same threshold as the expression: 60 * 42 seconds
- - alert: mailtest-check
+ - alert: mailtest_check
expr: |-
time() - mailtest_check_last_usec > 60 * 42
labels:
severity: prod
annotations:
- description: '{{ $labels.instance }} mailtest-check'
- summary: '{{ $labels.instance }} mailtest-check'
+ summary: '42 minutes down'
- alert: 1pmtest
- expr: hour() == 18 and minute() < 5
+ expr: hour() == 17 and minute() < 5
for: 0m
labels:
severity: daytest
annotations:
- summary: Prometheus daily test alert (instance {{ $labels.instance }})
- description: "Prometheus daily test alert if no other alerts. It
- is an end to end test.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Prometheus daily test alert
+
+
+
+# alternate expression, to calculate if the alert would have fired is:
+# min_over_time(sum_over_time(up[30m])[1d:]) == 0
+# where 30m matches the for: time in target_down
+#
+# sum_over_time is not needed, just convenience for graphing
+ - alert: target_down_inhibitor
+ expr: |-
+ sum_over_time(ALERTS{alertname="target_down"}[1d])
+ labels:
+ severity: ignore
+ annotations:
+ summary: alert that indicates target_down alert fired in the last day
+ description: "VALUE = {{ $value }}"
+
+# For targets where we alert except for longer downtimes, we
+# still want to know if it is going down many times for short times over
+# a long period of time. But ignore reboots.
+#
+## Another way would be to detect an overall downtime:
+# avg_over_time(node_systemd_unit_state{name="dynamicipupdate.service",state="active"}[1d]) < .95
+ - alert: up_resets
+ expr: |-
+ resets(up[3d]) - changes(node_boot_time_seconds[3d]) > 15
+ labels:
+ severity: warn
+ annotations:
+ summary: "Target has gone down {{ $value }} times in 3 days, > 15"
+
+
# https://awesome-prometheus-alerts.grep.to/rules
severity: day
annotations:
summary: Prometheus job missing (instance {{ $labels.instance }})
- description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
- - alert: PrometheusTargetMissing
+# TODO: some hosts, notably li and MAIL_HOST, we want to alert sooner than 30m,
+# and severity to day. mail host is tricky since it roams, but I think the
+# right way to do it is to check for absence of this metric:
+# mailtest_check_last_usec{folder="/m/md/l/testignore",from="ian@iankelling.org"}
+ - alert: target_down
expr: up == 0
for: 30m
labels:
severity: warn
annotations:
- summary: Prometheus target missing (instance {{ $labels.instance }})
- description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Target down for 30m
- # todo: this should supress the above alert
- # - alert: PrometheusAllTargetsMissing
- # expr: count by (job) (up) == 0
- # for: 30m
- # labels:
- # severity: day
- # alert-group: local-prom
- # annotations:
- # summary: Prometheus all targets missing (instance {{ $labels.instance }})
- # description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ # todo: this should group with the above alert
+ - alert: PrometheusAllTargetsMissing
+ expr: count by (job) (up) == 0
+ for: 10m
+ labels:
+ severity: day
+# alert-group: local-prom
+ annotations:
+ description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}"
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
labels:
severity: day
annotations:
- summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
- description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
- # I have an out of band alert to make sure prometheus is up. this
- # looks like it would generate false positives. todo: think
- # through what a valid crash loop detection would look like.
- # - alert: PrometheusTooManyRestarts
- # expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 10
- # for: 0m
- # labels:
- # severity: warning
- # annotations:
- # summary: Prometheus too many restarts (instance {{ $labels.instance }})
- # description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus configuration reload error\n VALUE = {{ $value }}"
+
+ # Fires when the process start time of a prometheus, pushgateway or
+ # alertmanager job changed more than 10 times within 30 minutes.
+ # severity is warn, like the other rules in this file; "warning" is not
+ # a value any of them use.
+ - alert: PrometheusTooManyRestarts
+ expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[30m]) > 10
+ for: 0m
+ labels:
+ severity: warn
+ annotations:
+ description: "Prometheus has restarted more than ten times in the last 30 minutes. It might be crashlooping.\n VALUE = {{ $value }}"
- alert: PrometheusAlertmanagerJobMissing
expr: absent(up{job="alertmanager"})
labels:
severity: warn
annotations:
- summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
- description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}"
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
labels:
severity: day
annotations:
- summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
- description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "AlertManager configuration reload error\n VALUE = {{ $value }}"
- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
labels:
severity: day
annotations:
- summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
- description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}"
- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}"
- alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}"
- alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
labels:
severity: warn
annotations:
- summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
- description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}"
- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[30m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus notifications backlog (instance {{ $labels.instance }})
- description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "The Prometheus notification queue has not been empty for 30 minutes\n VALUE = {{ $value }}"
- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
- description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}"
# file_sd doesnt count as service discovery, so 0 is expected.
# - alert: PrometheusTargetEmpty
# labels:
# severity: day
# annotations:
- # summary: Prometheus target empty (instance {{ $labels.instance }})
- # description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ # description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}"
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 90
labels:
severity: warn
annotations:
- summary: Prometheus target scraping slow (instance {{ $labels.instance }})
- description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}"
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
labels:
severity: warn
annotations:
- summary: Prometheus large scrape (instance {{ $labels.instance }})
- description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}"
- alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
- description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}"
- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}"
- alert: PrometheusTsdbCheckpointDeletionFailures
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}"
- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}"
- alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}"
- alert: PrometheusTsdbReloadFailures
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}"
- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}"
- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
labels:
severity: warn
annotations:
- summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
- description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}"
[Service]
Type=simple
-ExecStart=/usr/local/bin/sysd-mail-once -10 btrfsmaintstop /usr/local/bin/btrfsmaint check
+ExecStart=/usr/local/bin/btrfsmaint check
Restart=always
RestartSec=600
[Install]
-WantedBy=grahical.target
+WantedBy=graphical.target
[Service]
Type=simple
-ExecStart=/usr/local/bin/sysd-mail-once -40 dynamicipupdate /usr/local/bin/dynamic-ip-update
+ExecStart=/usr/local/bin/dynamic-ip-update
Restart=always
RestartSec=600
[Install]
-WantedBy=grahical.target
+WantedBy=graphical.target
RestartSec=600
[Install]
-WantedBy=grahical.target
+WantedBy=graphical.target
--- /dev/null
+# apparently alertmanager fails to start when the internet is down:
+# Mar 10 13:20:09 kd prometheus-alertmanager[2719]: level=error ts=2022-03-10T18:20:09.907Z caller=main.go:243 msg="unable to initialize gossip mesh" err="create memberlist: Failed to get final advertise address: No private IP address found, and explicit IP not provided"
+
+[Unit]
+# needed to continually restart
+StartLimitIntervalSec=0
+
+[Service]
+Restart=always
+# time to sleep before restarting a service
+RestartSec=300
--- /dev/null
+[Unit]
+# needed to continually restart
+StartLimitIntervalSec=0
+
+[Service]
+Restart=always
+# time to sleep before restarting a service
+RestartSec=600
[Service]
Type=simple
Environment=XDG_RUNTIME_DIR=/run/user/1000
-ExecStart=/usr/local/bin/sysd-mail-once -3 systemstatus /usr/local/bin/system-status
+ExecStart=/usr/local/bin/system-status
IOSchedulingClass=idle
CPUSchedulingPolicy=idle
User=iank
[Install]
-WantedBy=grahical.target
+WantedBy=graphical.target
--- /dev/null
+#!/bin/sh
+# using bin/sh for speed since the script is very simpl.e
+
+# these system76 systems have garbled display with konsole
+# and some other apps like mumble. something about the intel
+# graphics i think.
+case $HOSTNAME in
+ sy|bo)
+ exec sakura "$@"
+ ;;
+ *)
+ exec konsole "$@"
+ ;;
+esac
if [[ $DEBUG ]]; then
pee cat "wall -n"
else
- echo ok | sed 's/^/myupgrade /' | pee logger "wall -n"
+ sed 's/^/myupgrade /' | pee logger "wall -n"
fi
}
myreboot() {
# We should figure some workaround.
if ! out=$(/sbin/needrestart -p 2>&1); then
- printf "%s\n\n" "$out"
- if [[ $HOSTNAME != "$MAIL_HOST" ]]; then
+ if [[ $HOSTNAME == "$MAIL_HOST" ]]; then
+ needrestart -r l
+ else
myreboot
fi
fi
bindsym $mod+Shift+m border toggle
bindsym $mod+j exec emacsclient -c
-bindsym $mod+k exec kitty
+bindsym $mod+k exec myterm
bindsym $mod+l exec dmenu_run
# note default is 27% on my system76. not sure if these
# keybinds will screw up other laptop brightness keys.
rsync -t --chmod=755 --chown=root:root switch-mail-host btrbk-run mount-latest-subvol \
check-subvol-stale myi3status mailtest-check \
mailbindwatchdog \
- /a/bin/log-quiet/sysd-mail-once \
check-mailq \
unsaved-buffers.el \
mail-backup-clean \
case $HOSTNAME in
$MAIL_HOST)
+ # todo, should this be after vpn service
i /etc/systemd/system/unbound.service.d/nn.conf <<EOF
[Unit]
After=mailnn.service
je.b8.nz * F,1d,10m;F,14d,1h
zroe.org * F,1d,10m;F,14d,1h
eximbackup.b8.nz * F,1d,4m;F,14d,1h
+
+# The spec says the target domain will be used for temporary host errors,
+# but i've found that isn't correct, the hostname is required
+# at least sometimes.
+nn.b8.nz * F,1d,4m;F,14d,1h
+defaultnn.b8.nz * F,1d,4m;F,14d,1h
+mx.iankelling.org * F,1d,4m;F,14d,1h
+bk.b8.nz * F,1d,4m;F,14d,1h
+eggs.gnu.org * F,1d,4m;F,14d,1h
+mail.fsf.org * F,1d,15m;F,14d,1h
EOF
# sieve has the benefit of being supported in postfix and
# proprietary/weird environments, so there is more examples on the
# internet.
- pi dovecot-core dovecot-imapd dovecot-sieve dovecot-lmtpd dovecot-sqlite sqlite3
+ pi-nostart dovecot-core dovecot-imapd dovecot-sieve dovecot-lmtpd dovecot-sqlite sqlite3
for f in /p/c{/machine_specific/$HOSTNAME,}/filesystem/etc/dovecot/users; do
if [[ -e $f ]]; then
### end composer install
rcdirs=(/usr/local/lib/rcexpertpath /usr/local/lib/rcninja)
+ ncdirs=(/var/www/ncninja)
ncdirs=(/var/www/ncexpertpath /var/www/ncninja)
# point debian cronjob to our local install, preventing daily cron error
m usermod -u 608 Debian-exim
m groupmod -g 608 Debian-exim
m usermod -g 608 Debian-exim
- m find / /nocow -path ./var/tmp -prune -o -xdev -uid $uid -execdir chown -h 608 {} +
- m find / /nocow -path ./var/tmp -prune -o -xdev -gid $gid -execdir chgrp -h 608 {} +
+ # find tests -path against the path as built from the starting point, so
+ # with the absolute start "/" the pattern has to be /var/tmp; ./var/tmp
+ # can never match and nothing would be pruned.
+ m find / /nocow -xdev -path /var/tmp -prune -o -uid "$uid" -execdir chown -h 608 {} +
+ m find / /nocow -xdev -path /var/tmp -prune -o -gid "$gid" -execdir chgrp -h 608 {} +
fi
# * start / stop services
*/5 * * * * root timeout 290 mailtest-check slow |& log-once -4 mailtest-check
# if a bounce happened yesterday, dont let it slip through the cracks
8 1 * * * root export MAILTO=alerts@iankelling.org; awk '\$5 == "**"' /var/log/exim4/mainlog.1
-0 13 * * * root echo "If the 1pm doesnt happen, you are in the matrix. Wake up."
EOF
m sudo rsync -ahhi --chown=root:root --chmod=0755 \
/b/ds/mailtest-check /b/ds/check-remote-mailqs /usr/local/bin/
test_to="testignore@expertpathologyreview.com, testignore@je.b8.nz, testignore@amnimal.ninja, jtuttle@gnu.org"
cat >>/etc/cron.d/mailtest <<EOF
+0 13 * * * root echo "1pm alert. You are not in the matrix."
2 * * * * root check-remote-mailqs |& log-once check-remote-mailqs
EOF
;;&
-for dir in /mnt/r7/amy/{root,boot}_ubuntubionic /mnt/{root2/root,boot2/boot}_ubuntubionic; do
+for dir in /mnt/r7/amy/{root/root,boot/boot}_ubuntubionic /mnt/{root2/root,boot2/boot}_ubuntubionic; do
vol=${dir##*/}
root_dir=${dir%/*}
if [[ ! -d $root_dir ]]; then
cryptsetup
lvm2
mbuffer
+ moreutils
screen
)
p2=(
bash-completion
curl
- ethtool
eatmydata
+ etckeeper
+ ethtool
fping
git
haveged
python3-dnspython
duplicity
elinks
- etckeeper
evince
exim4-doc-html
exfat-fuse
memtester
metastore
mhonarc
- moreutils
+ mmdebstrap
mps-youtube
mpv
mumble
ncftp
nginx-doc
nmap
+ nyancat
obs-studio
offlineimap
oathtool
tmate
transmission-remote-gtk
trash-cli
+ tty-clock
vlc
wamerican-huge
wireless-tools
fi
source /a/bin/bash_unpublished/source-state
+# fixing up a bad state that servers got in.
+# Use ":" between user and group: the "." separator is a deprecated GNU
+# extension and is ambiguous when a user name itself contains a dot.
+if [[ -e /dev/shm/iank-status ]]; then
+  chown iank:iank /dev/shm/iank-status
+fi
+
if dpkg -s rss2email &>/dev/null; then
if [[ $HOSTNAME == "$MAIL_HOST" ]]; then
fi
chown -R root:root /root/.ssh
-rsync -t --chmod=755 --chown=root:root /b/ds/hssh /usr/local/bin
+rsync -t --chmod=755 --chown=root:root /a/bin/ds/hssh /usr/local/bin
if [[ -e /a/opt/btrbk/ssh_filter_btrbk.sh ]]; then
install /a/opt/btrbk/ssh_filter_btrbk.sh /usr/local/bin
bindsym $mod+Shift+m border toggle
bindsym $mod+j exec emacsclient -c
-bindsym $mod+k exec kitty
+bindsym $mod+k exec myterm
bindsym $mod+l exec dmenu_run
# note default is 27% on my system76. not sure if these
# keybinds will screw up other laptop brightness keys.
use_fading=false
scrollable_tabs=true
word_chars=-,./?%&#_~:
+search_accelerator=5
+search_key=F
bindsym $mod+Shift+m border toggle
bindsym $mod+j exec emacsclient -c
-bindsym $mod+k exec kitty
+bindsym $mod+k exec myterm
bindsym $mod+l exec dmenu_run
# note default is 27% on my system76. not sure if these
# keybinds will screw up other laptop brightness keys.
}
# log-once COUNT NAME [MESSAGE]
lo() {
- /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
+ if type -p ifne &>/dev/null; then
+ /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" root@localhost
+ fi
}
loday() {
- /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylerts@iankelling.org
+ if type -p ifne &>/dev/null; then
+ /usr/local/bin/log-once "$@" | ifne mail -s "$HOSTNAME: system-status $2" daylert@iankelling.org
+ fi
}
# todo, consider migrating some of these alerts into prometheus
write-status() {
chars=("${first_chars[@]}")
- services=(
- epanicclean
- systemstatus
- btrfsmaintstop
- dynamicipupdate
- )
+
+ services=( epanicclean )
+ case $HOSTNAME in
+ bk|je|li) : ;;
+ *)
+ services+=(
+ systemstatus
+ btrfsmaintstop
+ dynamicipupdate
+ )
+ ;;
+ esac
+
bads=()
- if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then
+ if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
for s in ${services[@]}; do
- if [[ $(systemctl show -p SubState --value $s) != running ]]; then
+ if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
bads+=($s)
fi
done
case $HOSTNAME in
kd)
bads=()
- if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$'; then
+ if systemctl show -p SubState --value ${services[@]} | egrep -v '^(running|)$' &>/dev/null; then
for s in ${services[@]}; do
- if [[ $(systemctl show -p SubState --value $s) != running ]]; then
+ if [[ $(systemctl show -p SubState --value $s 2>&1) != running ]]; then
bads+=($s)
fi
done
chars+=(A)
fi
- glob=(/m/md/daylerts/{new,cur}/!(*,S))
+ glob=(/m/md/daylert/{new,cur}/!(*,S))
if [[ -e ${glob[0]} ]]; then
chars+=(DAY)
fi
esac
begin=false
- if ! make -C /b/ds -q ~/.local/distro-begin || [[ $(<~/.local/distro-begin) != 0 ]]; then
+
+ if ! make -C /b/ds -q ~/.local/distro-begin 2>/dev/null || [[ $(<~/.local/distro-begin) != 0 ]]; then
begin=true
fi
end=false
- if ! make -C /b/ds -q ~/.local/distro-end || [[ $(<~/.local/distro-end) != 0 ]]; then
+ if ! make -C /b/ds -q ~/.local/distro-end 2>/dev/null || [[ $(<~/.local/distro-end) != 0 ]]; then
end=true
fi
# leave it up to epanic-clean to send email notification
fi
- source /a/bin/bash_unpublished/source-state
+ if [[ -e /a/bin/bash_unpublished/source-state ]]; then
+ # /a gets remounted due to btrbk, ignore error code for file doesnt exist
+ source /a/bin/bash_unpublished/source-state || [[ $? == 1 ]]
+ fi
if [[ $MAIL_HOST == "$HOSTNAME" ]]; then
bbkmsg=
if [[ $(systemctl is-active btrbk.timer) != active ]]; then
lo -1 old-snapshot $snapshotmsg
fi
- cat /a/bin/bash_unpublished/source-state >$status_file
+ if [[ ! -e $status_file || -w $status_file ]]; then
+ if [[ -e /a/bin/bash_unpublished/source-state ]]; then
+ cat /a/bin/bash_unpublished/source-state >$status_file
+ fi
- if [[ ${chars[*]} ]]; then
- echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
+ if [[ ${chars[*]} ]]; then
+ echo "ps_char=\"${chars[*]} \$ps_char\"" >>$status_file
+ fi
fi
}