#!/bin/bash -x
# Copyright (C) 2016 Ian Kelling

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Prefix every trace line (the shebang enables -x) with its line number.
PS4='+ $LINENO '
# Abort on unhandled failures, also inside pipelines; errtrace (-E) lets
# functions and subshells inherit the ERR trap below.
set -o errexit -o errtrace -o pipefail
trap 'echo "$0:$LINENO:error: \"$BASH_COMMAND\" returned $?" >&2' ERR

# Refuse to go any further unless we are root.
[[ $EUID == 0 ]] || {
  echo "$0: error: need to run as root" >&2
  exit 1
}

# for calling outside of FAI:
# fai-redep
#
# source /b/fai/fai-wrapper
# - set any appropriate classes with: fai-setclass OPT1... which sets CLASS_OPT1=true...
#   or run eval-fai-classfile FILE.
# - Set a VOL_DISTROVER (if not doing mkroot2) eg:
#   fai-setclass VOL_NABIA
#
# OPTIONS:
#
# mkroot2: for running outside of fai and setting up the root2/boot2 luks and btrfs
#
# environment variables:
#
# HOSTNAME: if frodo, we exclude 2 devices from the /boot array, which
# the bios does not see. if demohost, we set the luks password to just
# 'x'.
#
# SPECIAL_DISK: For use outside of fai. A base disk name like
# /dev/sdk. If set, we just cryptsetup and partition this one disk then
# exit. This is useful for partitioning a disk in preparation to replace
# a failed or failing disk from a raid10 array.
#
# classes:
#
# REPARTITION: forces repartitioning even if we detect the proper amount
# of partitions already exist.
#
# NOWIPE: use existing subvolumes if they exist
#
# ROTATIONAL: forces to install onto hdds instead of ssds. normally ssds
# are chosen if they exist.
#
# PARTITION_PROMPT: command line prompt before partitioning
#
# RAID0: forces raid0 filesystem. Normally with 4+ devices, we use
# raid10.
# RAID1: forces raid1 filesystem.

# Parse the single optional positional argument. mkroot2 ends up as the
# command name "true" or "false" so it can be run directly as a condition.
mkroot2=false
case $1 in
  '')
    # no argument: regular install
    ;;
  mkroot2)
    mkroot2=true
    ;;
  *)
    echo "$0: error: unsupported arg: $1" >&2
    exit 1
    ;;
esac


# Preparing a single SPECIAL_DISK always forces the REPARTITION class.
[[ ! $SPECIAL_DISK ]] || export CLASS_REPARTITION=true

# # fai's setup-storage won't do btrfs on luks,
# # so we do it ourself :)
# inspiration taken from files in fai-setup-storage package

# Call skiptask for the "partition" task (skiptask is not defined in this
# file; presumably it is provided by the FAI environment - TODO confirm).
# If we are not running in fai, skiptask won't be defined, so carry on: the
# call fails and `! type skiptask` then succeeds.
# NOTE(review): a status inverted with `!` is exempt from set -e, so this
# line does not abort the script even when skiptask exists and fails.
skiptask partition || ! type skiptask

# Make a devbyid command available. If none is on PATH, look for a devbyid
# file in a few candidate directories and wrap the first one found in a
# shell function of the same name. Give up when nothing is found.
if ! type -p devbyid; then
  for d in $FAI/distro-install-common \
               /a/bin/fai/fai/config/distro-install-common $FAI $PWD; do
    if [[ -d $d && -e $d/devbyid ]]; then
      devbyid=$d/devbyid
      devbyid() { $devbyid "$@"; }
      break
    fi
  done
  [[ $devbyid ]] || {
    echo "$0: error: failed to find devbyid script" >&2
    exit 1
  }
fi


#### begin configuration

# Partition numbers, i.e. the N in /dev/sdaN. The numbering is not the
# on-disk order; the layout created by the partitioning code goes like this:
# bios_grub
# grub_ext
# efi
# root
# root2
# swap
# boot
# boot2
# even_big (optional)

rootn=1      # luks encrypted root
root2n=2     # luks encrypted root2, set up by the mkroot2 mode
swapn=3      # encrypted swap
bootn=4      # btrfs /boot
boot2n=5     # btrfs boot2, set up by the mkroot2 mode
efin=6       # fat32 efi system partition
# ext partition so grub can write persistent variables, which lets it do
# a one time boot. grub can't write to btrfs or any cow fs because it's
# more complicated to do and they don't want to.
grub_extn=7
# bios boot partition, see https://wiki.archlinux.org/index.php/GRUB
bios_grubn=8
# optional extra partition at the end of the bigger of two unequal disks
even_bign=9
# highest partition number that is always created
lastn=$bios_grubn


##### end configuration


# Print a device path with a -partN suffix.
# Usage: add-part PART        -> $dev-partPART (uses the global $dev)
#        add-part DEVICE PART -> DEVICE-partPART
add-part() {
  local d=$dev part=$1
  if (( $# != 1 )); then
    d=$1
    part=$2
  fi
  echo $d-part$part
}

# One helper per partition: print the path of that partition on the
# optional base device argument, or on the global $dev when called
# without arguments (add-part handles both forms).
rootdev() {
  add-part $@ $rootn
}
root2dev() {
  add-part $@ $root2n
}
swapdev() {
  add-part $@ $swapn
}
bootdev() {
  add-part $@ $bootn
}
boot2dev() {
  add-part $@ $boot2n
}
efidev() {
  add-part $@ $efin
}
grub_extdev() {
  add-part $@ $grub_extn
}
bios_grubdev() {
  add-part $@ $bios_grubn
}
even_bigdev() {
  add-part $@ $even_bign
}

# Device-mapper naming for opened luks devices. The mapping name is
# crypt_dev_ followed by the last path component of the backing partition.

# crypt-name PARTITION: print the mapping name.
crypt-name() {
  local base=${1##*/}
  echo crypt_dev_$base
}
# crypt-dev PARTITION: print the /dev/mapper path of the mapping.
crypt-dev() {
  local base=${1##*/}
  echo /dev/mapper/crypt_dev_$base
}

# Shortcuts for the root, root2 and swap partitions. They take the same
# optional base device argument as rootdev, root2dev and swapdev.
root-cryptdev() {
  crypt-dev $(rootdev $@)
}
root2-cryptdev() {
  crypt-dev $(root2dev $@)
}
swap-cryptdev() {
  crypt-dev $(swapdev $@)
}
root-cryptname() {
  crypt-name $(rootdev $@)
}
root2-cryptname() {
  crypt-name $(root2dev $@)
}
swap-cryptname() {
  crypt-name $(swapdev $@)
}

# Print the size of a disk in MiB, minus one.
# Usage: dev-mib [DEVICE]   (DEVICE defaults to the global $dev)
# The size is the number in the second field of the /dev/... line of
# parted's machine readable (-m) output.
dev-mib() {
  local target=${1:-$dev}
  local size_re='s#^/dev/[^:]+:([0-9]+).*#\1#p'
  echo $(( $(parted -m $target unit MiB print | sed -nr "$size_re") - 1 ))
}

# Format a partition as luks, add the password as a second key, and open it.
# Arguments: $1 - partition to encrypt
# Globals read: luks_file (key file used to format and open the device),
#               lukspw (passphrase added as an additional key)
# The mapping is opened under the name printed by crypt-name.
luks-setup() {
  local luksdev="$1"
  # when we move to newer than trisquel 9, we can remove
  # --type luks1. We can also check on cryptsetup --help | less /compil
  # to see about the other settings. Default in debian 9 is luks2.
  # You can convert from luks2 to luks 1 by adding a temporary key:
  # cryptsetup luksAddKey --pbkdf pbkdf2
  # then remove the new format keys with cryptsetup luksRemoveKey
  # then cryptsetup convert DEV --type luks1, then readd old keys and remove temp.
  # NOTE(review): the luksFormat call below does not pass --type, so the
  # remark above about removing --type luks1 looks already done - confirm.
  #
  # `yes` answers the confirmation prompt. Status 141 is tolerated:
  # presumably it is `yes` being killed by SIGPIPE (128 + 13) once
  # cryptsetup stops reading, which pipefail reports as the pipeline status.
  yes YES | cryptsetup luksFormat $luksdev $luks_file || [[ $? == 141 ]]
  # Add $lukspw as a key, authenticating with the key file.
  yes "$lukspw" | \
    cryptsetup luksAddKey --key-file $luks_file \
               $luksdev || [[ $? == 141 ]]
  # background: Keyfile and password are treated just
  # like 2 ways to input a passphrase, so we don't actually need to have
  # different contents of keyfile and passphrase, but it makes some
  # security sense to use a really big randomly generated passphrase
  # as much as possible, so we have both.
  #
  # This would remove the keyfile.
  #    yes 'test' | cryptsetup luksRemoveKey /dev/... \
    #                            /key/file || [[ $? == 141 ]]
  cryptsetup luksOpen $luksdev $(crypt-name $luksdev) --key-file $luks_file
}

##### end function defs

# partition: whether to repartition (full wipe). Starts false unless the
# REPARTITION class is set; change the default to true to force a full wipe.
partition=false
if ifclass REPARTITION; then
  partition=true
fi

# wipe: whether to recreate existing subvolumes; the NOWIPE class turns
# it off.
wipe=true
if ifclass NOWIPE; then
  wipe=false
fi

# Extra btrfs mount options: zstd compression when there are more than
# 2 processing units.
if (( $(nproc) > 2 )); then mopts=,compress=zstd; fi

# Disks to leave alone, keyed by kernel name. Unless we are in mkroot2
# mode or the USE_MOUNTED class is set, every disk that has something
# mounted is excluded, eg when running from fai-cd.
declare -A disk_excludes
if ! $mkroot2 && ! ifclass USE_MOUNTED; then
  # lsblk -P prints KEY="value" pairs; eval turns each row into the
  # variables KNAME, MOUNTPOINT and PKNAME.
  while read -r l; do
    eval "$l"
    # Rows with an empty parent name (presumably whole disks) are recorded
    # under their own name.
    : "${PKNAME:=$KNAME}"
    [[ ! $MOUNTPOINT ]] || disk_excludes[$PKNAME]=true
  done < <(lsblk -nP -o KNAME,MOUNTPOINT,PKNAME)
fi

# Sort the candidate disks into ssds and hdds based on the kernel's
# rotational flag, skipping anything listed in disk_excludes.
hdds=()
ssds=()
# this excludes usb. note: i may encounter some other type in the future.
for disk in $(lsblk -do name,tran -n | awk '$2 ~ "^(sata|nvme)$" { print $1 }'); do
  if [[ ${disk_excludes[$disk]} ]]; then
    continue
  fi
  rotational_file=/sys/block/$disk/queue/rotational
  case $(cat $rotational_file) in
    0) ssds+=(/dev/$disk) ;;
    1) hdds+=(/dev/$disk) ;;
    *)
      # Report the value from the same sysfs file we just tested, on stderr
      # like every other error in this script.
      echo "$0: error: unknown $rotational_file: $(cat $rotational_file)" >&2
      exit 1
      ;;
  esac
done

# Pick the install targets: the hdds by default, but all ssds when there
# are any and the ROTATIONAL class is not set.
# Note, usb flash disks are seen as rotational, which is
# very odd, but convenient for ignoring them here.
# TODO: find a reliable way to ignore them.
short_devs=( ${hdds[@]} )
if ! ifclass ROTATIONAL && (( ${#ssds[@]} > 0 )); then
  short_devs=( ${ssds[@]} )
fi

# Check whether every target disk already has enough partitions. A disk
# with fewer than $lastn entries matching <disk>?* triggers repartitioning.
#blkid="$(blkid -s TYPE)"
for dev in ${short_devs[@]}; do
  # once repartitioning is decided there is nothing left to check
  $partition && break
  real_dev=$(readlink -f $dev)
  found_parts=($real_dev?*)
  (( ${#found_parts[@]} >= lastn )) || partition=true
  # On one system, blkid is missing some partitions.
  # maybe we need a flag, like FUZZY_BLKID or something, so we
  # can check that at least some exist.
  # for x in "`rootdev`: TYPE=\"crypto_LUKS\"" "`bootdev`: TYPE=\"btrfs\""; do
  #     echo "$blkid" | grep -Fx "$x" &>/dev/null || partition=true
  # done
done

# With the PARTITION_PROMPT class, wait for confirmation before touching
# the disks.
if $partition && ifclass PARTITION_PROMPT; then
  printf '%s\n' \
         "Press any key except ctrl-c to continue and partition these drives:" \
         "  ${short_devs[*]}"
  read -r
fi

# Translate each short device name through devbyid into the name used
# for the rest of the script, and insist on getting at least one.
shopt -s extglob
devs=()
for short_dev in ${short_devs[@]}; do
  devs+=($(devbyid $short_dev))
done
[[ ${devs[0]} ]] || {
  echo "$0: error: failed to detect devs" >&2
  exit 1
}

# Collect the boot and boot2 partitions that make up the /boot arrays, add
# up the disk space they sit on (boot_space, MiB), and remember the efi and
# grub_ext partitions of the first disk that contributes a boot partition.
boot_space=0
first=true
boot_devs=()
boot2_devs=()
for dev in ${devs[@]}; do
  bad_disk=false
  if ifclass frodo; then
    # I ran into a machine where the bios doesn't know about some disks,
    # so 1st stage of grub also doesn't know about them.
    # Also, grub does not support mounting degraded btrfs as far as
    # I can tell with some googling.
    # From within an arch install env, I could detect them by noting
    # their partitions were mixed with the next disk in /dev/disk/by-path,
    # and I have mixed model disks, and I could see the 8 models which showed
    # up in the bios, and thus see which 2 models were missing.
    # hdparm -I /dev/sdh will give model info in linux.
    # However, in fai on jessie, /dev/disk/by-path dir doesn't exist,
    # and I don't see another way, so I'm hardcoding them.
    # We still put grub on them and partition them the same, for uniformity
    # and in case they get moved to a system that can recognize them,
    # we just exclude them from the boot filesystem.
    cd /dev/disk/by-id/
    for id in ata-TOSHIBA_MD04ACA500_8539K4TQFS9A \
                ata-TOSHIBA_MD04ACA500_Y5IFK6IJFS9A; do
      if [[ $(readlink -f $id) == "$(readlink -f $dev)" ]]; then
        bad_disk=true
        break
      fi
    done
  else
    # Only non-frodo machines add the disk size (minus 1 MiB) to boot_space.
    boot_space=$(( boot_space + $(parted -m $dev unit MiB print | \
                                    sed -nr "s#^/dev/[^:]+:([0-9]+).*#\1#p") - 1))
  fi
  if ! $bad_disk; then
    boot_devs+=($(bootdev))
    boot2_devs+=($(boot2dev))
  fi
  if $first && [[ $boot_devs ]]; then
    first_efi=$(efidev)
    first_grub_extdev=$(grub_extdev)
    first=false
  fi
done
first_boot_dev=${boot_devs[0]}

# Choose the btrfs raid level from the number of boot devices, unless the
# RAID0 or RAID1 class forces one: 1 device -> raid0, up to 3 -> raid1,
# otherwise raid10. even_raid marks the exactly-two-devices raid1 case.
boot_dev_count=${#boot_devs[@]}
even_raid=false
if ifclass RAID0 || (( boot_dev_count == 1 )); then
  raid_level=0
elif ifclass RAID1 || (( boot_dev_count <= 3 )); then
  (( boot_dev_count != 2 )) || even_raid=true
  raid_level=1
else
  raid_level=10
fi

### Begin calculate boot partition space

# Set boot_mib, root2_mib and boot2_mib: the total MiB, summed over all
# disks, to give to the boot, root2 and boot2 partitions. The per-disk
# sizes are derived from these later by dividing by the number of devices.
# Globals read:    raid_level, boot_space (summed disk space in MiB)
# Globals written: boot_space (halved for raid 1/10), boot_mib, root2_mib,
#                  boot2_mib
calc-boot-sizes() {
  # due to raid duplication
  case $raid_level in
    1*) boot_space=$(( boot_space / 2 )) ;;
  esac
  if (( boot_space > 60000 )); then
    # this is larger than needed for several /boot subvols,
    # because I keep a minimal debian install on it for
    # recovery needs and for doing pxe-kexec.
    boot_mib=10000
    root2_mib=200000
    boot2_mib=2000
  elif (( boot_space > 30000 )); then
    # Medium tier: /boot grows from 5000 MiB towards the big tier as the
    # available space grows.
    boot_mib=$(( 5000 + (boot_space - 30000) / 2 ))
    root2_mib=100
    boot2_mib=100
  else
    # Small vms don't have room for /boot recovery.  With 3 kernels
    # installed, i'm using 132M on t8, so this seems like plenty of
    # room. note: rhel 8 recommends 1g for /boot.
    boot_mib=500
    root2_mib=100
    boot2_mib=100
  fi
  # raid 1 and raid 10 store everything twice, so double the raw totals.
  case $raid_level in
    1*)
      boot_mib=$(( boot_mib * 2 ))
      boot2_mib=$(( boot2_mib * 2 ))
      root2_mib=$(( root2_mib * 2 ))
      ;;
  esac
}
calc-boot-sizes
### end calculate boot partition space


# Work out DISTRO from the VOL_* classes unless it was passed in. Only the
# mkroot2 mode is allowed to continue without one.
if [[ ! $DISTRO ]]; then
  # class:distro pairs, checked in this order; the first class set wins.
  vol_distros=(
    VOL_BULLSEYE_BOOTSTRAP:debianbullseye_bootstrap
    VOL_STRETCH:debianstretch
    VOL_BUSTER:debianbuster
    VOL_BULLSEYE:debianbullseye
    VOL_BOOKWORM:debianbookworm
    VOL_TESTING:debiantesting
    VOL_XENIAL:ubuntuxenial
    VOL_BIONIC:ubuntubionic
    VOL_FOCAL:ubuntufocal
    VOL_FLIDAS:trisquelflidas
    VOL_ETIONA:trisqueletiona
    VOL_NABIA:trisquelnabia
  )
  for pair in "${vol_distros[@]}"; do
    if ifclass ${pair%%:*}; then
      DISTRO=${pair#*:}
      break
    fi
  done
  if [[ ! $DISTRO ]] && ! $mkroot2; then
    echo "PARTITIONER ERROR: no distro class/var set" >&2
    exit 1
  fi
fi


# Create a btrfs filesystem across the given devices, using the metadata
# and data profile that matches $raid_level (0, 1 or 10). Any other
# raid_level does nothing.
# Arguments: the devices to format
bpart() {
  local profile=()
  case $raid_level in
    0) ;;
    1 | 10) profile=(-m raid$raid_level -d raid$raid_level) ;;
    *) return 0 ;;
  esac
  mkfs.btrfs -f "${profile[@]}" $@
}


# Find the directory holding the luks keys, unless luks_dir was passed in:
# the first of these that is a directory wins.
if [[ ! $luks_dir ]]; then
  # see README for docs about how to create these
  for luks_dir in "$FAI/distro-install-common/luks" /q/root/luks; do
    [[ ! -d $luks_dir ]] || break
  done
  if [[ ! -d $luks_dir ]]; then
    echo "$0: error: no luks_dir found" >&2
    exit 1
  fi
fi

# Key file for this host.
luks_file=$luks_dir/host-$HOSTNAME
if [[ ! -e $luks_file ]]; then
  hostkeys=($luks_dir/host-*)
  # if there is only one key, we might be deploying somewhere
  # where dhcp doesnt give us a proper hostname, so use that.
  if [[ ${#hostkeys[@]} != 1 || ! -e ${hostkeys[0]} ]]; then
    echo "$0: error: no key for hostname at $luks_file" >&2
    exit 1
  fi
  luks_file=${hostkeys[0]}
fi

# Passphrase: fixed for the demohost class, else the per-host file when
# there is one, else the default file.
# # note, corresponding changes in /b/ds/keyscript-{on,off}
# NOTE(review): the shebang enables -x, so the passphrase read here shows
# up in the trace output.
if ifclass demohost; then
  lukspw=x
else
  pw_file=$luks_dir/iank
  [[ ! -e $luks_dir/$HOSTNAME ]] || pw_file=$luks_dir/$HOSTNAME
  lukspw=$(cat $pw_file)
fi


# Opened luks root device of the first disk; used for mounting and fstab.
first_root_crypt=$(root-cryptdev ${devs[0]})

# Swap per disk in MiB: 1.5 x ram (MemTotal is in KiB), split evenly
# across the disks.
# 1.5 x based on https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/7/html/Installation_Guide/sect-disk-partitioning-setup-x86.html#sect-custom-partitioning-x86
swap_mib=$(( $(awk '/^MemTotal:/ {print $2}' /proc/meminfo) * 3/(${#devs[@]} * 2 ) / 1024 ))

mkdir -p /tmp/fai

# Root partition of every disk.
root_devs=()
for dev in ${devs[@]}; do root_devs+=($(rootdev)); done

shopt -s nullglob

# We write to these files instead of just /etc/fstab, /etc/crypttab,
# because these are filesystems created after our current root, and so
# this allows us to update other root filesystems too.
rm -f /mnt/root/root2-{fs,crypt}tab

# mkroot2 mode: set up luks on every root2 partition, create the root2 and
# boot2 btrfs filesystems, record them in the root2-crypttab / root2-fstab
# files, then exit. It refuses to run when repartitioning was requested.
if $mkroot2; then
  if $partition; then
    # errors go to stderr, like everywhere else in this script
    echo "$0: error: found partition=true but have mkroot2 arg" >&2
    exit 1
  fi
  for dev in ${devs[@]}; do
    # root2 always uses the key file and passphrase of host amy.
    luks_file=$luks_dir/host-amy
    lukspw=$(cat $luks_dir/amy)
    luks-setup $(root2dev)
    cat >>/mnt/root/root2-crypttab <<EOF
$(root2-cryptname) $(root2dev)  $luks_file  discard,luks,initramfs
EOF
  done
  bpart $(for dev in ${devs[@]}; do root2-cryptdev; done)
  bpart ${boot2_devs[@]}
  mkdir -p /mnt/root2 /mnt/boot2
  cat >>/mnt/root/root2-fstab <<EOF
$(root2-cryptdev ${devs[0]}) /mnt/root2  btrfs  nofail,x-systemd.device-timeout=30s,x-systemd.mount-timeout=30s,noatime,subvolid=0$mopts  0 0
${boot2_devs[0]} /mnt/boot2  btrfs    nofail,x-systemd.device-timeout=30s,x-systemd.mount-timeout=30s,noatime,subvolid=0  0 0
EOF
  exit 0
fi


# Either repartition and reformat every disk (partition=true), or just
# open the existing luks root devices (partition=false).
if $partition; then
  ### begin wipefs
  # Skipped for SPECIAL_DISK: the loop below walks ${devs[@]}, not the
  # special disk.
  if [[ ! $SPECIAL_DISK ]]; then
    for dev in ${devs[@]}; do
      # if we repartition to the same as an old partition,
      # we don't want any old fses hanging around.
      # NOTE(review): this stops at lastn, so an existing even_big
      # partition (number even_bign) is not wiped here - confirm intended.
      for (( i=1; i <= lastn; i++ )); do
        x=$(add-part $i)
        [[ -e $x ]] || continue
        count_down=10
        # wipefs has failed, manual run works, google suggests timing issue.
        # So retry every 2 seconds, giving up after 10 failed attempts.
        while ! wipefs -a $x; do
          sleep 2
          count_down=$((count_down - 1))
          (( count_down > 0 )) || exit 1
        done
      done
    done
  fi
  ### end wipefs


  # When we have 2 disks of at least 100g difference in size,
  # make an extra partition on the end of the bigger one.
  even_big_part=false
  even_diff_min=100000
  if $even_raid; then
    # Find which of the two disks is smaller. disk_mib is then fixed to
    # the smaller disk's size, so both disks get the same layout.
    smalli=0
    bigi=1
    if (( $(dev-mib ${devs[0]}) >= $(dev-mib ${devs[1]}) )); then
      smalli=1
      bigi=0
    fi
    disk_mib=$(dev-mib ${devs[smalli]})
    even_big_dev=${devs[bigi]}
    even_big_mib=$(dev-mib $even_big_dev)
    if (( even_big_mib - disk_mib > even_diff_min )); then
      even_big_part=true
    fi
  fi

  for dev in ${devs[@]}; do
    # With SPECIAL_DISK we partition only that disk: the loop body runs
    # once for it and exits at the bottom.
    if [[ $SPECIAL_DISK ]]; then
      dev=$(devbyid $SPECIAL_DISK)
    fi

    # parted will round up the disk size. Do -1 so we can have
    # fully 1MiB unit partitions for easy resizing of the last partition.
    # Otherwise we would pass in -0 for the end argument for the last partition.
    #
    # Note: parted print error output is expected. example:
    # Error: /dev/vda: unrecognised disk label
    if ! $even_raid; then
      disk_mib=$(dev-mib)
    fi

    # Per-disk sizes: the totals are split evenly across the devices.
    # The partition ends are computed backwards from the end of the disk,
    # so root gets whatever is left after the fixed size partitions.
    boot_part_mib=$(( boot_mib / ${#boot_devs[@]} ))
    boot2_part_mib=$(( boot2_mib / ${#boot_devs[@]} ))
    root2_part_mib=$(( root2_mib / ${#root_devs[@]} ))
    root_end=$(( disk_mib - root2_part_mib - swap_mib - boot_part_mib - boot2_part_mib ))
    root2_end=$(( root_end + root2_part_mib ))
    swap_end=$(( root2_end + swap_mib ))
    boot_end=$(( swap_end + boot_part_mib ))

    parted -s $dev mklabel gpt
    # MiB because parted complains about alignment otherwise.
    pcmd="parted -a optimal -s -- $dev"
    # The partitions are created in partition number order (rootn first),
    # which is what makes the numbers match the *n variables.
    # root partition, the main big one
    $pcmd mkpart primary ext3 524MiB ${root_end}MiB
    # without naming, systemd gives us misc errors like:
    # dev-disk-by\x2dpartlabel-primary.device: Dev dev-disk-by\x2dpartlabel-primary.device appeared twice
    $pcmd name $rootn root
    # root2 partition
    $pcmd mkpart primary ext3 ${root_end}MiB ${root2_end}MiB
    $pcmd name $root2n root2
    # normally a swap is type "linux-swap", but this is encrypted swap. using that
    # label will confuse systemd.
    # swap partition
    $pcmd mkpart primary "" ${root2_end}MiB ${swap_end}MiB
    $pcmd name $swapn swap
    # boot partition
    $pcmd mkpart primary "" ${swap_end}MiB ${boot_end}MiB
    $pcmd name $bootn boot
    # boot2 partition
    $pcmd mkpart primary "" ${boot_end}MiB ${disk_mib}MiB
    $pcmd name $boot2n boot2
    # uefi partition. efi sucks, half a gig, ridiculous.
    $pcmd mkpart primary "fat32" 12MiB 524MiB
    $pcmd name $efin efi
    $pcmd set $efin esp on
    # note, this is shown here: https://support.system76.com/articles/bootloader/
    # but not mentioned https://wiki.archlinux.org/index.php/EFI_system_partition
    # probably not needed
    $pcmd set $bootn boot on
    $pcmd set $boot2n boot on
    # i only need a few k, but googling min size,
    # I found someone saying that gparted
    # required at least 8 because of their hard drive cylinder size.
    # And 8 is still very tiny.
    # grub_ext partition
    $pcmd mkpart primary "ext2" 4MiB 12MiB
    $pcmd name $grub_extn grubext
    # gpt ubuntu cloud image uses ~4 mb for this partition. fai uses 1 MiB.
    # so, I use 3, whatever.
    # note: parted manual says cheap flash media
    # should start at 4.
    # bios grub partition
    $pcmd mkpart primary "" 1MiB 4MiB
    $pcmd name $bios_grubn biosgrub
    $pcmd set $bios_grubn bios_grub on
    # Optional extra partition from the end of the common layout to the
    # end of the bigger disk.
    if $even_big_part  && [[ $dev == "$even_big_dev" ]]; then
      $pcmd mkpart primary ext3 ${disk_mib}MiB ${even_big_mib}MiB
      $pcmd name $even_bign even_big
    fi

    # the mkfs failed before on a vm, which prompted me to add
    # sleep .1
    # then it failed again on a physical machine
    # with:
    # Device /dev/disk/by-id/foo doesn't exist or access denied,
    # so I added a wait until it existed.
    # Then I added the mkfs.ext2, which claimed to succeed,
    # but then couldn't be found upon reboot. In that case we didn't
    # wait at all. So I've added a 3 second minimum wait.
    secs=0
    while [[ ! -e $(bios_grubdev) ]] && (( secs < 10 )); do
      sleep 1
      secs=$((secs +1))
    done
    sleep 3

    mkfs.fat -F32 $(efidev)

    if $even_big_part  && [[ $dev == "$even_big_dev" ]]; then
      luks-setup $(even_bigdev)
      mkfs.btrfs -f $(crypt-dev $(even_bigdev))
    fi

    # Holds just a single file, rarely written, so
    # use ext2, like was often used for the /boot partition.
    # This exists because grub can only persist data to a non-cow fs.
    # And we use persisting a var in grub to do a one time boot.
    # We could pass the data on the kernel command line and persist it
    # to grubenv after booting, but that relies on the boot always succeeding.
    # This is just a bit more robust, and it could work for booting
    # into ipxe which can't persist data, if we ever got that working.
    mkfs.ext2 $(grub_extdev)
    luks-setup $(rootdev)

    if [[ $SPECIAL_DISK ]]; then
      exit 0
    fi
  done
  ls -la /dev/btrfs-control # this was probably for debugging...
  sleep 1
  # One btrfs filesystem across all opened luks roots, one across all
  # boot partitions.
  bpart $(for dev in ${devs[@]}; do root-cryptdev; done)
  bpart ${boot_devs[@]}
else
  # Keep the existing partitions: just make sure every luks root device
  # is open, using the key file.
  for dev in ${devs[@]}; do
    if [[ -e /dev/mapper/$(root-cryptname) ]]; then
      continue
    fi
    cryptsetup luksOpen $(rootdev) $(root-cryptname) \
               --key-file $luks_file
  done
  sleep 1
fi


# Recreate the root_$DISTRO subvolume on the encrypted root filesystem.
# Skipped when wipe is false and for the bootstrap distro.
if $wipe && [[ $DISTRO != debianbullseye_bootstrap ]]; then
  # bootstrap distro doesn't use separate encrypted root.
  mount -o subvolid=0 $first_root_crypt /mnt
  # systemd creates subvolumes we want to delete.
  # List the subvolumes nested under root_$DISTRO/, sorted by descending
  # path, and delete them before deleting root_$DISTRO itself.
  s=($(btrfs subvolume list --sort=-path /mnt |
         sed -rn "s#^.*path\s*(root_$DISTRO/\S+)\s*\$#\1#p"))
  for subvol in ${s[@]}; do btrfs subvolume delete /mnt/$subvol; done
  btrfs subvolume set-default 0 /mnt
  [[ ! -e /mnt/root_$DISTRO ]] || btrfs subvolume delete /mnt/root_$DISTRO

  ## create subvols ##
  cd /mnt

  btrfs subvolume create root_$DISTRO

  # could set default subvol like this, but no reason to.
  # btrfs subvolume set-default \
    #       $(btrfs subvolume list . | grep "root_$DISTRO$" | awk '{print $2}') .

  # For raid systems, cow allows for error correction, for non-raid systems,
  # protects root fs from having the plug pulled. Reprovisioning a root
  # subvol is not my favorite thing to do.
  # # no cow on the root filesystem. its setup is fully scripted,
  # # if it's messed up, we will just recreate it,
  # # and we can get better perf with this.
  # # I can't remember exactly why, but this is preferable to mounting with
  # # -o nodatacow, I think because subvolumes inherit that.
  # chattr -Rf +C root_$DISTRO
  # Leave /mnt before unmounting it.
  cd /
  umount /mnt
fi

# Prepare the boot filesystem: install the libreboot grub config, then
# (re)create the boot subvolume for this distro.
mount -o subvolid=0 $first_boot_dev /mnt
cd /mnt
btrfs subvolume set-default 0 /mnt # already default, just ensuring it.

# for libreboot systems. grub2 only reads from subvolid=0
mkdir -p /mnt/grub2
cp $FAI/distro-install-common/libreboot_grub.cfg /mnt/grub2

if [[ $DISTRO == debianbullseye_bootstrap ]]; then
  # this is just convenience for the libreboot_grub config
  # so we can glob the other ones easier.
  boot_vol=$DISTRO
else
  boot_vol=boot_$DISTRO
fi
# With wipe, an existing boot subvolume is deleted and recreated; without
# it, the subvolume is only created when missing.
if $wipe && [[ -e /mnt/$boot_vol ]]; then
  btrfs subvolume delete /mnt/$boot_vol
fi
if [[ ! -e /mnt/$boot_vol ]]; then
  btrfs subvolume create $boot_vol
fi
# Leave /mnt before unmounting it.
cd /
umount /mnt
## end create subvols ##

# NOTE(review): dev does not appear to be read again before the next loop
# reassigns it - confirm whether this assignment is still needed.
dev=${boot_devs[0]}
# Record in the grub environment file on the ext2 grub_ext partition that
# the fai check was done and which boot subvolume was set up last.
mount $first_grub_extdev /mnt
grub-editenv /mnt/grubenv set did_fai_check=true
grub-editenv /mnt/grubenv set last_boot=/$boot_vol
umount /mnt

# Write the files that the later install steps pick up from /tmp/fai:
# fstab, crypttab (non-bootstrap only) and disk_var.sh.

# systemd timeouts shared by the fstab entries below.
fstabstd=x-systemd.device-timeout=30s,x-systemd.mount-timeout=30s
if [[ $DISTRO == debianbullseye_bootstrap ]]; then
  # The bootstrap distro lives directly in its subvolume on the boot
  # filesystem: no encrypted root, no swap, no crypttab.
  cat > /tmp/fai/fstab <<EOF
$first_boot_dev  /  btrfs  noatime,subvol=$boot_vol  0 0
$first_efi  /boot/efi  vfat          nofail,$fstabstd  0 0
EOF
  cat >/tmp/fai/disk_var.sh <<EOF
BOOT_DEVICE="${short_devs[@]}"
ROOT_PARTITION=$first_boot_dev
EOF
else
  # note, fai creates the mountpoints listed here.
  # The / entry uses $fstabstd like the other entries, so its option list
  # does not start with a stray comma.
  cat > /tmp/fai/fstab <<EOF
$first_root_crypt  /  btrfs  $fstabstd,noatime,subvol=root_$DISTRO$mopts  0 0
$first_root_crypt  /mnt/root  btrfs  nofail,$fstabstd,noatime,subvolid=0$mopts  0 0
$first_boot_dev  /boot  btrfs        nofail,$fstabstd,noatime,subvol=$boot_vol  0 0
$first_efi  /boot/efi  vfat          nofail,$fstabstd  0 0
$first_boot_dev  /mnt/boot  btrfs    nofail,$fstabstd,noatime,subvolid=0  0 0
EOF
  # One luks root and one randomly keyed swap entry per disk.
  swaps=()
  rm -f /tmp/fai/crypttab
  for dev in ${devs[@]}; do
    swaps+=($(swap-cryptname))
    cat >>/tmp/fai/crypttab <<EOF
$(root-cryptname) $(rootdev)  none  keyscript=/root/keyscript,discard,luks,initramfs
$(swap-cryptname) $(swapdev)  /dev/urandom  swap,cipher=aes-xts-plain64,size=256,hash=ripemd160
EOF
    cat >> /tmp/fai/fstab <<EOF
$(swap-cryptdev)  none  swap  nofail,$fstabstd,sw  0 0
EOF
  done

  # fai would do this:
  #BOOT_DEVICE=\${BOOT_DEVICE:-"${devs[0]}"}

  # note: swaplist seems to do nothing.
  cat >/tmp/fai/disk_var.sh <<EOF
BOOT_DEVICE="${short_devs[@]}"
BOOT_PARTITION=\${BOOT_PARTITION:-$first_boot_dev}
# ROOT_PARTITIONS is added by me, used in arch setup.
ROOT_PARTITIONS="${root_devs[@]}"
ROOT_PARTITION=\${ROOT_PARTITION:-$first_root_crypt}
SWAPLIST=\${SWAPLIST:-"${swaps[@]}"}
EOF

  if [[ $HOSTNAME == kd ]]; then
    # note, having these with keyscript and initramfs causes a luks error in fai.log,
    # but it is safely ignorable and gets us the ability to just type our password
    # in once at boot. A downside is that they are probably needed to be plugged in to boot.
    #
    # The first entry is the even_big partition: the mapping name and the
    # backing device must both use partition number $even_bign.
    cat >>/tmp/fai/crypttab <<EOF
crypt_dev_ata-Samsung_SSD_870_QVO_8TB_S5VUNG0N900656V-part${even_bign} /dev/disk/by-id/ata-Samsung_SSD_870_QVO_8TB_S5VUNG0N900656V-part${even_bign}  none  keyscript=decrypt_keyctl,discard,luks,initramfs
crypt_dev_ata-TOSHIBA_MD04ACA500_84R2K773FS9A-part1 /dev/disk/by-id/ata-TOSHIBA_MD04ACA500_84R2K773FS9A-part1  none  keyscript=decrypt_keyctl,discard,luks,initramfs
crypt_dev_ata-ST6000DM001-1XY17Z_Z4D29EBL-part1 /dev/disk/by-id/ata-ST6000DM001-1XY17Z_Z4D29EBL-part1  none  keyscript=decrypt_keyctl,discard,luks,initramfs
EOF
    cat >> /tmp/fai/fstab <<EOF
# r7 = root partition7. it isnt actually #7 anymore, not a great name, but whatever
/dev/mapper/crypt_dev_ata-Samsung_SSD_870_QVO_8TB_S5VUNG0N900656V-part${even_bign}  /mnt/r7  btrfs  nofail,$fstabstd,noatime,compress=zstd,subvolid=0  0 0
/dev/mapper/crypt_dev_ata-TOSHIBA_MD04ACA500_84R2K773FS9A-part1  /mnt/rust1  btrfs  nofail,$fstabstd,noatime,compress=zstd,subvolid=0  0 0
/dev/mapper/crypt_dev_ata-ST6000DM001-1XY17Z_Z4D29EBL-part1  /mnt/rust2  btrfs  nofail,$fstabstd,noatime,compress=zstd,subvolid=0  0 0
EOF
  fi

fi

# initial setup of extra data fs, mounted,
# btrfs subvol create nocow
# chattr +C nocow
# chown iank.iank nocow