#!/bin/bash


# Everything below needs root: when started unprivileged, replace this
# process with a sudo invocation of the same script and arguments, keeping
# the caller's environment (-E).
if [[ $EUID != 0 ]]; then
  exec sudo -E "${BASH_SOURCE[0]}" "$@"
fi

# Source the shared err library; refuse to run without it.
# NOTE(review): its contents are not visible here -- the ||: guards used
# throughout this script suggest it enables exit-on-error; confirm.
f=/usr/local/lib/err
if ! test -r $f; then
  echo "error: $0 no $f" >&2
  exit 1
fi
. $f

# Inspired by
# https://github.com/kdave/btrfsmaintenance

# INVOCATION_ID is set by systemd for processes it starts as part of a unit,
# so the hook below is only defined for service runs, not interactive ones.
# NOTE(review): err-cleanup is never called in this file; presumably the
# sourced /usr/local/lib/err invokes it when a command fails -- confirm.
if [[ $INVOCATION_ID ]]; then
  # Mail root the last 50 journal lines of the btrfsmaint unit.
  err-cleanup() {
    exim -odf -i root <<EOF
From: root@$(hostname -f)
To: root@$(hostname -f)
Subject: btrfsmaint automatically exited on command error

journalctl -u btrfsmaint -n 50:
$(journalctl -u btrfsmaint -n 50)
EOF
  }
fi

# Space-separated usage thresholds handed to `btrfs balance start`, one
# balance run per value: dusage feeds -dusage= (and also -musage= on
# filesystems reporting mixed Data+Metadata), musage feeds -musage=.
dusage="5 10"
musage="5"

# Log a command line to stdout, then execute it unless this is a dry run.
#
# Globals:   dryrun (read) - holds the command name `true` or `false`
# Arguments: the command to run, followed by its arguments
# Returns:   the command's status, or 0 when only logging
e() {
  printf '%s\n' "btrfsmaint: $*"
  $dryrun || "$@"
}

# Decide whether every logged-in user has been idle for at least 2 hours.
#
# Asks xprintidle (run as each logged-in user against display :0) for the
# idle time and keeps the smallest answer; users for whom xprintidle fails
# or prints nothing are ignored.
#
# Globals written: idle    - set to `true` or `false`; left untouched when
#                            xprintidle is not installed
#                  DISPLAY - exported as :0
# Returns: 0
check-idle() {
  local user idle_limit idle_time new_idle_time
  type -p xprintidle &>/dev/null || return 0
  export DISPLAY=:0
  # 2 hours, in milliseconds; a movie could run that long.
  idle_limit=$((1000 * 60 * 60 * 2))
  idle_time=$idle_limit
  while read -r user; do
    # `users` prints nothing when nobody is logged in, which reaches us as
    # a single blank line.
    [[ $user ]] || continue
    # </dev/null keeps the child from swallowing the rest of the user list.
    new_idle_time=$(sudo -u "$user" xprintidle 2>/dev/null </dev/null) ||:
    if [[ $new_idle_time && $new_idle_time -lt $idle_time ]]; then
      idle_time=$new_idle_time
    fi
  done < <(users | tr " " "\n" | sort -u)
  if (( idle_time < idle_limit )); then
    idle=false
  else
    idle=true
  fi
}

# Print the help text to stdout and exit.
#
# Arguments: $1 - exit status (optional, defaults to 0)
usage() {
  cat <<EOF
Usage: ${0##*/} [OPTIONS]
Do btrfs maintenance, or stop it if we have X and xprintidle shows a user
is not idle.

Normally, no options are needed.

--check  Only check if an existing maintenance should be cancelled due to
         nonidle user and run in a loop every 60 seconds.

--dryrun Just print out what we would do.

--force  Run regardless of user idle status on all disks and do scrub
         regardless of when it was last run.

--no-stats  Avoid checking error statistics. Use this to avoid a rare race
            condition when running --check concurrently with normal run.


-h|--help   Show help

Note: Uses util-linux getopt option parsing: spaces between args and
options, short options can be combined, options before args.
EOF
  exit "${1:-0}"
}

##### begin command line parsing ########

# Only enhanced getopt copes with empty args and args containing spaces;
# it identifies itself by exiting with status 4 from `getopt -T`.
ret=0
getopt -T || ret=$?
if [[ $ret != 4 ]]; then
  echo "Install util-linux for enhanced getopt" >&2
  exit 1
fi

# Option defaults. Each flag holds the command name `true` or `false` and is
# later executed directly as a condition.
stats=true
force=false
dryrun=false
check=false

if ! temp=$(getopt -l help,check,dryrun,force,no-stats h "$@"); then
  usage 1
fi
eval set -- "$temp"
# getopt terminates the normalized option list with `--`.
until [[ $1 == -- ]]; do
  case $1 in
    -h|--help) usage ;;
    --no-stats) stats=false ;;
    --force) force=true ;;
    --dryrun) dryrun=true ;;
    --check) check=true ;;
    *) echo "$0: unexpected args: $*" >&2 ; usage 1 ;;
  esac
  shift
done
shift
readonly check dryrun force stats
##### end command line parsing ########


# Run one maintenance pass over every mounted btrfs filesystem.
#
# Unless --force, first waits (up to 300 minutes, polling every minute) for
# check-idle to report that users are idle, and gives up if they never are;
# with --check no waiting happens. Then, per filesystem:
#   1. unless --no-stats, compares `btrfs dev stats -c` against the reading
#      saved in <mountpoint>/btrfs-dev-stats and mails root a diff on change;
#   2. with --check: cancels a running scrub when a user is active, then
#      moves on to the next filesystem;
#   3. otherwise runs the balance passes and, when forced or when the last
#      finished scrub is older than the scrub interval, a scrub.
#
# Globals read:    check dryrun force stats dusage musage
# Globals written: idle (also set by check-idle)
# Returns: 0 after giving up on idle, after the first scrub when not
#          --force, or once every filesystem has been visited.
main() {
  local min max_min src mnt tmp stats_path stats_diff pct
  local scrub_status last_scrub last_scrub_epoch days_ago
  local -a fnd sources
  # The wiki recommends 30 days or so between scrubs, but I'm going with 60.
  local -i scrub_interval_days=60

  idle=true
  if ! $force; then
    check-idle
    if ! $check; then
      min=0
      max_min=300
      # When the cron kicks in, we may not be idle (physically sleeping) yet, so
      # wait.
      while ! $idle && (( min < max_min )); do
        min=$(( min + 1 ))
        sleep 60
        check-idle
      done
      # If we've waited a really long time for idle, just give up.
      if (( min == max_min )); then
        return 0
      fi
    fi
  fi


  fnd=(findmnt --types btrfs --noheading)
  # Collect the device list up front so that nothing run inside the loop can
  # consume it through stdin.
  mapfile -t sources < <("${fnd[@]}" --output "SOURCE" --nofsroot | sort -u)
  for src in "${sources[@]}"; do
    [[ $src ]] || continue
    mnt=$("${fnd[@]}" --output "TARGET" --first-only --source "$src")
    [[ $mnt ]] || continue

    #### begin look for diff in stats, eg: increasing error count ####
    if $stats; then
      tmp=$(mktemp)
      # ${mnt%/} so that if mnt is / we avoid making a buggy looking path
      stats_path=${mnt%/}/btrfs-dev-stats
      if [[ ! -e $stats_path ]]; then
        btrfs dev stats -c "$mnt" >"$stats_path" ||: # populate initial reading
      # With -c, btrfs exits non-zero when some counter is non-zero (or the
      # stats could not be read), so we only compare once there is something
      # to report.
      elif ! btrfs dev stats -c "$mnt" >"$tmp"; then
        if ! diff -q "$stats_path" "$tmp"; then
          # Take the diff BEFORE rotating the files: once the saved reading
          # is overwritten it is identical to $tmp and the diff is empty.
          stats_diff=$(diff -u "$stats_path" "$tmp" 2>&1 ||:)
          mv "$stats_path" "$stats_path.1"
          cat "$tmp" >"$stats_path"
          printf "diff of: btrfs dev stats -c %s\n%s\n" "$mnt" "$stats_diff"
          exim -odf -i root <<EOF
From: root@$(hostname -f)
To: root@$(hostname -f)
Subject: btrfsmaint: device stats changed for $mnt

diff of: btrfs dev stats -c $mnt
$stats_diff
EOF
        fi
      fi
      rm -f "$tmp"
    fi
    #### end look for diff in stats, eg: increasing error count ####

    if $check; then
      if ! $idle; then
        if $dryrun; then
          echo "$0: not idle. if this wasnt a dry run, btrfs scrub cancel $mnt"
        else
          # Best effort: fails when no scrub is running, which is fine.
          btrfs scrub cancel "$mnt" &>/dev/null ||:
        fi
      fi
      continue
    fi

    # for comparing before and after balance.
    # the log is already fairly verbose, so commented.
    # e btrfs filesystem df $mnt
    # e df -H $mnt
    # $dusage / $musage are deliberately unquoted: they are space-separated
    # lists of thresholds.
    if btrfs filesystem df "$mnt" | grep -q "Data+Metadata"; then
      for pct in $dusage; do
        e ionice -c 3 btrfs balance start -dusage="$pct" -musage="$pct" "$mnt"
      done
    else
      e ionice -c 3 btrfs balance start -dusage=0 "$mnt"
      for pct in $dusage; do
        e ionice -c 3 btrfs balance start -dusage="$pct" "$mnt"
      done
      e ionice -c 3 btrfs balance start -musage=0 "$mnt"
      for pct in $musage; do
        e ionice -c 3 btrfs balance start -musage="$pct" "$mnt"
      done
    fi

    # Find the start time of the last finished scrub, if any.
    last_scrub=
    scrub_status=$(btrfs scrub status "$mnt")
    if grep -qi '^status:[[:space:]]*finished$' <<<"$scrub_status"; then
      last_scrub=$(
        sed -rn 's/^Scrub started:[[:space:]]*(.*)/\1/p' <<<"$scrub_status"
      )
    fi
    if [[ ! $last_scrub ]]; then
      # output from older versions, at least btrfs v4.15.1
      last_scrub=$(
        sed -rn 's/^\s*scrub started at (.*) and finished.*/\1/p' \
          <<<"$scrub_status"
      )
    fi
    if ! $force && [[ $last_scrub ]]; then
      if $dryrun; then
        echo "$0: last scrub finish for $mnt: $last_scrub"
      fi
      last_scrub_epoch=$(date --date="$last_scrub" +%s)
      # Skip the scrub when the last one is more recent than the interval.
      if (( last_scrub_epoch > EPOCHSECONDS - scrub_interval_days*60*60*24 )); then
        if $dryrun; then
          days_ago=$(( (EPOCHSECONDS - last_scrub_epoch) / 60/60/24 ))
          echo "$0: skipping scrub of $mnt, last was $days_ago days ago, < $scrub_interval_days days"
        fi
        continue
      fi
    fi
    # btrfsmaintenance does -c 2 -n 4, but I want lowest pri.
    e btrfs scrub start -Bd -c 3 "$mnt"

    # We normally only do one disk since this is meant to be run in
    # downtime and if we try to do all disks, we invariably end up doing
    # a scrub after downtime. So, just do one disk per day.
    if ! $force; then
      return 0
    fi
  done
}

# Call main over and over, pausing 60 seconds after each pass. The loop has
# no exit condition of its own.
loop-main() {
  for (( ;; )); do
    main
    sleep 60
  done
}

# Entry point: a single maintenance pass by default, or the repeating
# monitor loop when --check was given.
if ! $check; then
  main
else
  loop-main
fi