#!/bin/bash
# SPDX-License-Identifier: GPL-2.0-only
#
# vm-run: Run command under QEMU emulated root
#
# Copyright (C) 2019-2024 Vitaly Chikunov <vt@altlinux.org>
#

set -efu

# Save original env first: exported functions (declare -fx) and exported
# variables (export) are dumped into a temporary script before this script
# defines anything of its own. LOGNAME and USER are overridden to root only
# inside the dumping subshell.
SCRIPT=$(mktemp "${TMPDIR:-/tmp}/vm.XXXXXXXXXX")
( LOGNAME=root
  USER=root
  declare -fx
  export ) > "$SCRIPT"

# Additional options to qemu run
OPTS=
APPEND=
NOCMD=
# `time` is also a shell keyword, so a plain `type time` always succeeds.
# $TIME is expanded in front of the qemu command, where the keyword is not
# recognized and only an external binary can run; search PATH explicitly.
type -P time >/dev/null 2>&1 && TIME=time || TIME=
SBIN=
VERBOSE=
QUIET=quiet	# no boot messages
DEF=
DEPMOD=
BIOS=
KVM=try		# try to use kvm by default
VIRTIOBUS=pci	# virtio bus to use
ARMH_KVM_OK=/usr/bin/qemu-system-aarch64-bundle-kvm-ok
SANDBOX=on,spawn=deny
MULTIDEVS=remap
MODULES=
RDADD=
ROOTFS_DEF_IMAGE=/tmp/vm-ext4.img
ROOTFS=
ROOTFSCREATE=
NOBIND=
DRIVE_IF=virtio
FAKESUDO=
RUNUSER=
BASHMINUS=-xe
RDMODE=
# Files removed after a successful non-interactive run (unless --no-rm).
ARTIFACTS=("$SCRIPT" "$SCRIPT.ret")
RMARTIFACTS=y
# These are tested with `-v` later, so they must start out unset even if
# the caller's environment defines them.
unset NRCPU MAXCPU RSYNC SWAP MEM MAXMEM SOFTMEM TMPFS FSFEAT STDOUT HVC HEREDOC RDSHELL
unset RDBREAK UMAXCPU UMAXMEM VIRTIOFS NOSTDIN
DEF_SHELL=${SHELL:-bash}

# Print the option summary to stdout and terminate the script with status 0.
usage() {
	# Parameter expansion instead of `basename $0`, which word-splits a
	# script path that contains whitespace.
	echo "Usage: ${0##*/} [OPTIONS...] [COMMAND...]"
	echo "Run COMMAND in VM booted kernel."
	echo
	echo "QEMU options:"
	echo "    --bios=          Use particular firmware (ex. uefi)"
	echo "    --cpu=           Pass '-smp ...' to qemu (cpus)"
	echo "    --maxcpu=        Limit cpu count to this value (or use NPROCS=)."
	echo "    --drive=         Shortcut to pass '-drive file=' to qemu"
	echo "    --ext4[=feat]    Automatic --rootfs [with fs feature enabled] and --no-bind."
	echo "    --fat=DIR        Shortcut to pass DIR as rw FAT partition"
	echo "    --hvc            Enable high speed serial virtconsole"
	echo "    --kvm=cond       Only run if KVM is present otherwise exit 0"
	echo "    --mem=           Pass '-m ...' to qemu (memory size)"
	echo "                     Set to 'max' to exceed default soft limit."
	echo "    --maxmem=        Limit auto-requested memory to this value."
	echo "    --microvm        Use microvm machine type"
	echo "    --multidevs=     9p multidevs mode (remap, forbid, warn)"
	echo "    --overlay=FS[,SIZE=][:PATH]"
	echo "                     Auto create overlay FS over PATH"
	echo "    --qemu='...'     Pass additional options to qemu. There are also other, more"
	echo "                     specific shortcuts: -device, -blockdev, -object,"
	echo "                     -machine, -global, -netdev, and -chardev"
	echo "    --rootfs=IMAGE   Use existing ext4 IMAGE for rootfs instead of 9p"
	echo "                     To generate new image each time use --create-rootfs=IMAGE"
	echo "                     or vm-create-image tool."
	echo "    --rootfs         As above, but use default image '/tmp/vm-ext4.img'"
	echo "    --secureboot     Use OVMF SecureBoot fw (alias --bios=secureboot)"
	echo "    -s, --silent     Do not show qemu command"
	echo "    --swap           Add SwapFree to the available memory"
	echo "    --tcg            Do not try to enable KVM"
	echo "    --uefi           Use OVMF UEFI firmware (alias --bios=uefi)"
	echo "Kernel options:"
	echo "    --append='...'   Append to kernel cmdline"
	echo "    --depmod         Auto-depmod %buildroot installed kernel"
	echo "    --kernel=        Kernel version to use (--kernels to list)"
	echo "    --no-quiet       Do not boot kernel too quietly"
	echo "    --loglevel=      Set loglevel= (implies --no-quiet), when argument isn't a"
	echo "                     number it enables 'earlyprintk=serial ignore_loglevel' instead."
	echo "    --klog           Enable kernel.printk=8 just before executing the command."
	echo "Initrd options:"
	echo "    --initrd         Run script in minimal initrd environment under busybox."
	echo "    --modules='...'  Load named modules (space separated) in initrd."
	echo "    --rdadd=X[:Y]    Add file/dir X into initrd at Y."
	echo "Boot options:"
	echo "    --no-bind        Don't bind mount /usr/src from 9p (in rootfs mode)"
	echo "    --rsync=DIR      rsync DIR over rootfs (for %buildroot)"
	echo "    --sbin           Append sbin dirs to the PATH"
	echo "    --sudo           Install fake sudo into rootfs (implies creation of rootfs)"
	echo "    --user           Run under user instead of root (implies --sudo)"
	echo "    --udevd          Start udevd"
	# The original `echo # Other options` printed only a blank line because
	# the heading was parsed as a comment; emit it like the other headings.
	echo "Other options:"
	echo "    --heredoc        Read script from stdin (for here-documents)"
	echo "    --stdout         Output everything to stderr except command's stdout"
	echo "    -h, --help       This help"
	echo "    --verbose        Show vm actions verbosely"
	echo "    --               No options after this marker"
	echo "    command...       Will be eval'ed on the target"
	exit 0
}

# Parse vm-run's own options. The loop stops at `--` or at the first word
# that does not start with `-`; whatever is left in "$@" is the command.
# Most arms only record a flag or append to OPTS (qemu arguments) or
# APPEND (kernel command line) for the code further down.
while [ $# -gt 0 ]; do # opt
	opt=$1
	case "$opt" in
		-h|--help)   usage ;;
		-s|--silent) NOCMD=y TIME= ;;
		--verbose)   VERBOSE=1 APPEND+=" VERBOSE" ;;
		--no-quiet)  QUIET= ;;
		--loglevel=*) LOGLEVEL="${opt#*=}" ;;
		--sbin)      SBIN=y ;;
		--udevd)     APPEND+=" UDEVD=y" ;;
		--qemu=*)    OPTS+=" ${opt#*=}" ;;
		--append=*)  APPEND+=" ${opt#*=}" ;;
		--drive=*)   OPTS+=" -drive file=${opt#*=}" ;;
		--fat=*)     OPTS+=" -drive format=raw,file=fat:rw:${opt#*=}" ;;
		--overlay=*) APPEND+=" OVERLAY=${opt#*=}" ;;
		--uefi)	     BIOS=uefi ;;
		--secureboot) BIOS=secureboot ;;
		--bios=*)    BIOS="${opt#*=}" ;;
		--tcg)	     KVM= ;;
		--kvm=*)     KVM="${opt#*=}" ;;
		--microvm)   VIRTIOBUS=device OPTS+=" -M microvm" BIOS=microvm ;;
		--sandbox=*) SANDBOX=${opt#*=} ;;
		--mem=*)     MEM="${opt#*=}" ;;
		--cpu=*)     NRCPU="${opt#*=}" ;;
		--maxcpu=*)  UMAXCPU="${opt#*=}" ;;
		--maxmem=*)  UMAXMEM="${opt#*=}" ;;
		--kernel=*)  KERNEL="${opt#*=}" ;;
		# Any other spelling starting with --kernel (for example --kernels)
		# sets KERNEL to the empty string.
		--kernel*)   KERNEL= ;;
		--depmod)    DEPMOD=y ;;
		--multidevs=*) MULTIDEVS="${opt#*=}" ;;
		--modules=*) MODULES+=" ${opt#*=}" ;;
		--rdadd=*)   RDADD+=" ${opt#*=}" ;;
		--create-rootfs=*) ROOTFS="${opt#*=}"; ROOTFSCREATE=y ;;
		--rootfs=*)  ROOTFS="${opt#*=}" ;;
		--rootfs)    ROOTFS=$ROOTFS_DEF_IMAGE ;;
		--ext4)      NOBIND=y; IMPLYROOTFS=y; ;;
		--ext4=*)    NOBIND=y; IMPLYROOTFS=y; FSFEAT="${opt#*=}" ;;
		--virtiofs)  VIRTIOFS=y ;;
		--no-bind)   NOBIND=y ;;
		--no-virtio) DRIVE_IF= ;;
		--scsi)      DRIVE_IF=scsi ;;
		--sudo)      FAKESUDO=y ;;
		--user)      RUNUSER=builder ;;
		# Defined but empty: the variable is assigned here without a value.
		--no-snapshot) NOSNAPSHOT= ;;
		--hvc)       HVC=y ;;
		--rsync=*)   RSYNC="${opt#*=}" ;;
		--swap)      SWAP=y ;;
		--tmp=*)     TMPFS="${opt#*=}" ;;
		--stdout)    STDOUT=y ;;
		# Accepted and ignored.
		--stub-exit=*) ;;
		--heredoc)   HEREDOC=y ;;
		--rdshell)   RDSHELL=y ;;
		--rdbreak=*) RDBREAK="${opt#*=}" ;;
		--initrd)    RDMODE=RD; DEF_SHELL=sh ;;
		--klog)      PRINTK=8 ;;
		--no-rm)     unset RMARTIFACTS ;;
		# Shell flags such as -x, +e or -xe are accumulated in BASHMINUS.
		[-+][xe] | [-+][xe][xe]) BASHMINUS+=" $opt" ;;
		-n)          NOSTDIN=y ;;
		# Options appended to OPTS verbatim, without an argument.
		-audio-help|-daemonize|-enable-kvm|-full-screen|-jitdump|-mem-prealloc| \
		-no-acpi|-nodefaults|-no-fd-bootchk|-nographic|-no-hpet|-no-reboot| \
		-no-shutdown|-old-param|-only-migratable|-perfmap|-portrait|--preconfig|-S| \
		-semihosting|-singlestep|-snapshot|-usb|-version|-win2k-hack|-xen-attach| \
		-xen-domid-restrict \
			) OPTS+=" $opt" ;;
		# Options appended to OPTS together with the next word; the
		# ${1:?...} expansion aborts with a message when that word is
		# missing or empty.
		-accel|-acpitable|-action|-add-fd|-append|-async-teardown|-audio|-audiodev| \
		-bios|-blockdev|-boot|-cdrom|-chardev|-chroot|-compat|-cpu|-d|-D|-debugcon| \
		-device|-dfilter|-display|-drive|-dtb|-dump-vmstate|-echr|-fda|-fdb|-fsdev| \
		-fw_cfg|-g|-gdb|-global|-hda|-hdb|-hdc|-hdd|-icount|-incoming|-initrd| \
		-iscsi|-k|-kernel|-L|-loadvm|-m|-machine|-mem-path|-mon|-monitor|-msg| \
		-mtdblock|-name|-net|-netdev|-nic|-numa|-object|-option-rom|-overcommit| \
		-parallel|-pflash|-pidfile|-plugin|-prom-env|-qmp|-qmp-pretty|-readconfig| \
		-rotate|-rtc|-runas|-run-with|-sandbox|-sd|-seed|-semihosting-config| \
		-serial|-set|-smbios|-smp|-spice|-tpmdev|-trace|-usbdevice|-uuid|-vga| \
		-virtfs|-vnc|-watchdog-action|-xen-domid \
			) shift; OPTS+=" $opt ${1:?"$opt requires argument"}" ;;
		--) shift; break ;;
		-*) echo "Error: Unknown option $opt" >&2; exit 1 ;;
		*)  break ;;
	esac
	shift
done

# If no command run shell
if [ -z "$*" ] && [ ! -v HEREDOC ]; then
	set -- $DEF_SHELL
	SBIN=y
fi

# If we already have root just run the command
if [ "$(env - whoami)" = root ]; then
	# The environment dump is only needed for booting a VM; do not leave
	# the temporary file behind when the command is run directly.
	rm -f -- "$SCRIPT"
	exec "$@"
	exit
fi

# Hack to prevent SIGTTOU when QEMU tries to set up echo on stdio,
# and when we are in hsh-shell with pts and manually running rpmbuild.
[ -t 0 ] && [ -v RPM_BUILD_DIR ] && NOSTDIN=y
[ -v NOSTDIN ] && exec </dev/null

# Signal to vm-init that we don't have tty on stdio,
# so it will not try to determine terminal size
if [ ! -t 0 ] || [ ! -t 1 ]; then
	printf "#  CMD: NOTTY=y\n" >> "$SCRIPT"
fi

if [ -n "${PRINTK-}" ]; then
	printf 'echo %q > /proc/sys/kernel/printk\n' "$PRINTK" >> "$SCRIPT"
fi

# libfakeroot tries to communicate with faked and hangs.
[ -v LD_PRELOAD ] && echo 'unset LD_PRELOAD' >> "$SCRIPT"
# Prepare env
[ -n "$SBIN" ] && echo "PATH=/sbin:/usr/sbin:\$PATH" >> "$SCRIPT"
# Because 9p cannot handle redirection from deleted files and as
# a consequence cannot handle here-documents move TMPDIR to tmpfs
if [ -v TMPDIR ] && [ "$TMPDIR" = '/usr/src/tmp' ]; then
	printf "TMPDIR=/tmp\n" >> "$SCRIPT"
fi
printf "cd %q\n" "$PWD" >> "$SCRIPT"
# The command is wrapped as `(eval "set FLAGS;" WORDS...)` so that the
# shell flags apply only to the user's command.
printf "(eval" >> "$SCRIPT"
printf ' "set %s;"' "$BASHMINUS" >> "$SCRIPT"
if [ -v HEREDOC ]; then
	if [ $# -gt 0 ]; then
		echo "Error: In heredoc mode, there should not be commands on command-line." >&2
		exit 1
	fi
	printf " %q" "$(cat)" >> "$SCRIPT"
elif [ $# -eq 1 ]; then
	# `bash -c` style command.
	printf " %q" "$@" >> "$SCRIPT"
else
	# Exact command. eval joins its arguments with spaces and parses the
	# result a second time, so each argument is quoted twice: the inner
	# %q survives the eval, the outer %q survives parsing of the script.
	# Wrapping a single %q in '...' instead would break on arguments that
	# contain a single quote and would drop empty arguments.
	for arg in "$@"; do
		printf -v arg '%q' "$arg"
		printf ' %q' "$arg" >> "$SCRIPT"
	done
	unset arg
fi
printf ")\n" >> "$SCRIPT"
# Flush console.
printf  "RET=\$?\nstty sane\nexit \$RET\n" >> "$SCRIPT"
chmod a+rx "$SCRIPT"

# Succeed when /dev/kvm is a writable character device that can actually
# be used. Under the aarch32 personality the check is delegated to the
# $ARMH_KVM_OK helper when it is executable; otherwise /dev/kvm is opened
# for reading. All diagnostics are discarded.
kvm_ok() {
	{
		if ! [ -c /dev/kvm ] || ! [ -w /dev/kvm ]; then
			return 1
		fi
		if [ "$HOSTTYPE" = "aarch32" ] && [ -x $ARMH_KVM_OK ]; then
			$ARMH_KVM_OK
			return
		fi
		true < /dev/kvm
	} 2>/dev/null
}

# Succeed unless $KVM holds one of the values that disable KVM:
# no, off, false, tcg or the empty string.
kvm_need() {
	local disabled
	for disabled in no off false tcg ""; do
		if [ "$KVM" = "$disabled" ]; then
			return 1
		fi
	done
	return 0
}

# Print the qemu machine/cpu options for a powerpc64le host on stdout.
ppc_opts() {
	# Use `power8' and not `host', because we will transparently
	# fallback from kvm to tcg. These common options do not break
	# the fallback to tcg.
	local common=" -M cap-ibs=broken,cap-cfpc=broken,cap-sbbc=broken -cpu power8"

	if ! { kvm_need && kvm_ok; }; then
		# KVM doesn't support this.
		printf '%s' "$common -M max-cpu-compat=power7"
		return
	fi

	# In absence of `/usr/sbin/ppc64_cpu' determine SMT by presence of
	# online cores 1-7 (while cores 0,8,.. are not SMT cores).
	if grep -q -P '^processor\s+:\s[1234567]$' /proc/cpuinfo; then
		# If SMT enabled use slower PR (problem state) KVM.
		printf '%s\n' "$common -M kvm-type=PR,cap-ccf-assist=off,cap-fwnmi=off"
	else
		# KVM HV is faster, but incompatible with SMT.
		printf '%s\n' "$common -M kvm-type=HV,cap-ccf-assist=off"
	fi
}

# Special case for aarch64 host running in aarch32 personality: when KVM is
# wanted and the bundled helper reports that it works, switch HOSTTYPE to
# the pseudo architecture `aarch32'.
if [ "$HOSTTYPE" = armh ] || [ "$HOSTTYPE" = armv7l ]; then
	if kvm_need && [ -x $ARMH_KVM_OK ] && $ARMH_KVM_OK; then
		HOSTTYPE=aarch32
	fi
fi

# Per-architecture settings. Each arm selects the qemu binary (QEMU), the
# package name (PACKAGE), the guest console device (CONSOLE) and, where
# applicable, default machine/cpu options plus the limits consumed by the
# cpu and memory sizing code: SOFTCPU/SOFTMEM, MAXCPU/MAXMEM (memory values
# are compared against megabytes there). Unknown architectures are fatal.
case "$HOSTTYPE" in
	powerpc64le)
		PACKAGE=qemu-system-ppc-core
		CONSOLE=hvc0
		QEMU=qemu-system-ppc64
		OPTS+=$(ppc_opts)
		# Avoid `CPU time limit exceeded'
		SOFTMEM=32768
		SOFTCPU=8
		# qemu-system-ppc64: kvm_init_vcpu: kvm_get_vcpu failed (1014): Too many open files
		MAXCPU=1014
		;;
	aarch64)
		PACKAGE=qemu-system-aarch64-core
		CONSOLE=ttyAMA0
		QEMU=qemu-system-aarch64
		# Values that can work well both for kvm and tcg
		OPTS+=" -M virt,gic-version=3 -cpu max"
		# More cpu and memory is slower launch, so limit them
		# to sane big values
		SOFTMEM=4096
		# Maximum available CPU count is 128.
		SOFTCPU=8
		# qemu-system-aarch64: Invalid SMP CPUs 1234. The max CPUs supported by machine 'virt-7.2' is 512
		MAXCPU=512
		# Firmware directory and candidate image names for --uefi.
		EDK2=/usr/share/AAVMF
		EFI_CODE=( QEMU_EFI.fd AAVMF_CODE.fd QEMU_EFI.kernel.fd QEMU_EFI.silent.fd )
		;;
	aarch32)
		PACKAGE=qemu-system-aarch64-bundle-core
		CONSOLE=ttyAMA0
		QEMU=qemu-system-aarch64-bundle
		OPTS+=" -M virt,highmem=off -cpu host,aarch64=off"
		APPEND+=" watchdog_thresh=60"
		# qemu-system-aarch64: Addressing limited to 32 bits, but memory exceeds it by 1073741824 bytes
		MAXMEM=3072
		# Maximum available CPU count is 16.
		# CPUs are booted sequentially so higher number slower boot.
		SOFTCPU=6
		# qemu-system-aarch64: Capacity of the redist regions(123) does not match the number of vcpus(512)
		MAXCPU=123
		;;
	armh | armv7l)
		PACKAGE=qemu-system-arm-core
		CONSOLE=ttyAMA0
		QEMU=qemu-system-arm
		OPTS+=" -M virt,highmem=off -cpu max"
		APPEND+=" watchdog_thresh=60"
		MAXMEM=2047
		# Maximum available CPU count is 8.
		SOFTCPU=2
		# qemu-system-arm: Number of SMP CPUs requested (512) exceeds max CPUs supported by machine 'mach-virt' (8)
		MAXCPU=8
		;;
	i586)
		PACKAGE=qemu-system-x86-core
		CONSOLE=ttyS0
		QEMU=qemu-system-i386
		OPTS+=" -cpu max"
		# qemu-system-i386: at most 2047 MB RAM can be simulated
		# qemu-system-i386: ram size too large
		MAXMEM=2047
		# Maximum available CPU count is 8.
		SOFTCPU=8
		# Boot error if above 225: qemu: qemu_thread_create: Resource temporarily unavailable
		MAXCPU=225
		;;
	x86_64)
		PACKAGE=qemu-system-x86-core
		CONSOLE=ttyS0
		QEMU=qemu-system-x86_64
		SOFTMEM=787455
		# qemu-system-x86_64: Invalid SMP CPUs 1234. The max CPUs supported by machine 'pc-i440fx-7.2' is 255
		# qemu-system-x86_64: Invalid SMP CPUs 2553. The max CPUs supported by machine 'pc-q35-7.2' is 288
		# But even for q35: smpboot: native_cpu_up: bad cpu 256
		MAXCPU=255
		# Firmware directory and image names for --uefi and --secureboot.
		EDK2=/usr/share/OVMF
		EFI_CODE=OVMF_CODE.fd
		SB_PFLASH="OVMF_CODE.secboot.fd OVMF_VARS.secboot.fd"
		;;
	*)
		echo "Error: architecture $HOSTTYPE is unknown." >&2
		exit 1
esac

# Extra kernel command line parameter for both x86 flavours.
case "$HOSTTYPE" in
	i586|x86_64)
		APPEND+=" no_timer_check" ;;
esac

# Set up BIOS. $BIOS is either one of the keywords handled below (uefi,
# secureboot, microvm, u-boot) or a firmware name passed to `-bios' as is.
if [ "$BIOS" = "uefi" ]; then
	# The leading space matters: OPTS is one space-separated string and
	# "-M q35" would otherwise be glued to the previous option.
	[ "$HOSTTYPE" = x86_64 ] && OPTS+=" -M q35"
	# Pick the first candidate image that exists; EFI_CODE ends up unset
	# when none does.
	for EFI_CODE in "${EFI_CODE[@]}"; do
		if [ -e "$EDK2/$EFI_CODE" ]; then
			break
		fi
		unset EFI_CODE
	done
	if [ -n "${EFI_CODE-}" ]; then
		OPTS+=" -bios $EDK2/$EFI_CODE"
	else
		echo "Error: UEFI is not available for this platform." >&2
		exit 1
	fi
elif [ "$BIOS" = "secureboot" ]; then
	if [ -n "${SB_PFLASH-}" ]; then
		OPTS+=" -global ICH9-LPC.disable_s3=1 -M q35,smm=on"
		OPTS+=" -global driver=cfi.pflash01,property=secure,value=on"
		for f in ${SB_PFLASH-}; do
			OPTS+=" -drive file=$EDK2/$f,if=pflash,format=raw"
			# The variable store is attached as a snapshot, the code
			# image read-only.
			if [[ "$f" =~ VARS ]]; then
				OPTS+=",snapshot=on"
			else
				OPTS+=",readonly=on"
			fi
		done
	else
		echo "Error: Secure Boot is not available for this platform." >&2
		exit 1
	fi
elif [ "$BIOS" = "microvm" ]; then
	BIOS=bios-microvm.bin
	OPTS+=" -bios $BIOS"
	# grep for optimized out string if !CONFIG_SERCON.
	if grep -sq "sercon: using ioport" "/usr/share/qemu/$BIOS"; then
		# Will print warning but nevertheless do not mess terminal.
		OPTS+=" -fw_cfg etc/sercon-port,string=0"
	fi
elif [ "$BIOS" = "u-boot" ]; then
	# Globbing is disabled script-wide (set -f); enable it just for this
	# lookup.
	set +f
	u_boot=( /usr/share/u-boot/qemu*/u-boot.bin )
	set -f
	if [ ! -e "$u_boot" ]; then
		echo "Error: Das U-Boot is not available for this system (install u-boot-qemu)." >&2
		exit 1
	fi
	OPTS+=" -bios $u_boot"
	unset u_boot
elif [ -n "$BIOS" ]; then
	# Any other value is handed to qemu unchanged.
	OPTS+=" -bios $BIOS"
elif [ -e /usr/share/qemu/bios.bin ] && [[ "$HOSTTYPE" =~ [356x]86 ]]; then
	# x86 BIOS (seabios-128k) that doesn't clear screen unlike default bios-256k.bin
	OPTS+=" -bios bios.bin"
fi
unset BIOS

# Translate the --kvm= mode into qemu accelerator options (appended to
# DEF). NOKVM is set when the guest is known to run without KVM.
unset NOKVM
if kvm_need; then
	case "$KVM" in
		try|detect)
			# Avoid qemu warning:
			#   Could not access KVM kernel module: No such file or directory
			#   qemu-system-x86_64: failed to initialize kvm: No such file or directory
			#   qemu-system-x86_64: falling back to tcg
			if kvm_ok; then
				DEF+=" -M accel=kvm:tcg"
			else
				DEF+=" -M accel=tcg"
				NOKVM=y
			fi
			;;
		cond|if)
			if ! kvm_ok; then
				# Diagnostics go to stderr like everywhere else in this
				# script, so the caller's stdout stays clean.
				echo "Warning: Not running due to no KVM support (exit 0)." >&2
				exit 0
			fi
			DEF+=" -enable-kvm"
			;;
		only|force|enable)
			DEF+=" -enable-kvm"
			;;
		any|all)
			DEF+=" -M accel=kvm:tcg"
			;;
		default)
			# Leave the accelerator choice to qemu.
			;;
		*)
			echo "Error: Unknown --kvm=$KVM option." >&2
			exit 1
			;;
	esac
else
	# Forcefully disable KVM.
	DEF+=" -accel tcg"
	NOKVM=y
fi
unset KVM

if ! mountpoint -q /proc; then
	echo >&2 "  Warning: /proc is not mounted!"
	echo >&2 "  Try to enter hasher with: hsh-shell --mountpoints=/proc,/dev/kvm"
fi

# Choose the guest CPU count. `--cpu=max' lifts the soft limit; any other
# --cpu= value is passed to qemu as is. NPROCS is left set for the memory
# sizing code.
[ "${NRCPU-}" = max ] && unset NRCPU SOFTCPU
if [ -v NRCPU ]; then
	# What user requested explicitly.
	DEF+=" -smp cores=$NRCPU"
	NPROCS=$NRCPU
else
	# No point to set cpu count to host nproc value if there is no kvm support,
	# or this will cause long (lpj=) calibration delays.
	[ -v NOKVM ] && [ "${MAXCPU:-0}" -gt 4 ] && MAXCPU=4

	[ -v NPROCS ] || NPROCS=$(nproc)
	if [ "$NPROCS" -gt 1 ]; then
		# rpm may be missing or may leave the macro unexpanded. Only a
		# positive integer is used as a limit; `declare -i' would turn
		# empty output into 0 and clamp the guest to zero cores.
		nprocs_macro=$(rpm --eval '%__nprocs') || nprocs_macro=
		[[ "$nprocs_macro" =~ ^[1-9][0-9]*$ ]] && [ "$NPROCS" -gt "$nprocs_macro" ] && NPROCS=$nprocs_macro
		# SOFTCPU is high enough for boot times around 1 second.
		[ -v SOFTCPU ] && [ "$NPROCS" -gt "$SOFTCPU" ] && NPROCS=$SOFTCPU
		# Exceeding MAXCPU will produce QEMU error.
		[ -v MAXCPU ] && [ "$NPROCS" -gt "$MAXCPU" ] && NPROCS=$MAXCPU
		[ -v UMAXCPU ] && [ "$NPROCS" -gt "$UMAXCPU" ] && NPROCS=$UMAXCPU
		DEF+=" -smp cores=$NPROCS"
		unset nprocs_macro
	fi
fi

# Convert a memory size with an optional K, M or G suffix (either case)
# into whole mebibytes; a bare number is passed through unchanged.
mem_to_mib() {
	case "$1" in
		*[Kk]) echo $(( ${1%[Kk]} / 1024 )) ;;
		*[Mm]) echo "${1%[Mm]}" ;;
		*[Gg]) echo $(( ${1%[Gg]} * 1024 )) ;;
		*) echo "$1" ;;
	esac
}

# Choose the guest memory size M (in megabytes when computed here).
# An explicit --mem= wins; `--mem=max' lifts the soft limit.
unset M
[ "${MEM-}" = max ] && unset MEM SOFTMEM
if [ -v MEM ]; then
	M=$MEM
elif [ -e /proc/meminfo ]; then
	# /proc/meminfo values are in kB.
	M=$(awk '$1 == "MemAvailable:" {print int($2 / 1024)}' /proc/meminfo)
	[ -v SWAP ] && M=$((M + $(awk '$1 == "SwapFree:" {print int($2 / 1024)}' /proc/meminfo)))
	# SOFTMEM is just to speed up boot times.
	[ -v SOFTMEM ] && [ "$M" -gt "$SOFTMEM" ] && M=$SOFTMEM
	# MAXMEM is hard QEMU limit (on 32-bit arches).
	[ -v MAXMEM ] && [ "$M" -gt "$MAXMEM" ] && M=$MAXMEM
	# In case rlimit_soft_as is used.
	AS=$(ulimit -v)
	if [ "$AS" != unlimited ]; then
		# Leave headroom out of the address-space limit: 512 MB plus
		# 80 MB per guest CPU.
		AS=$((AS/1024-512-$NPROCS*80))
		if [ "$AS" -lt 0 ]; then
			echo >&2 "Warning: RLIMIT_AS would cause negative memory setting (try --maxmem= or --maxcpu=)."
		elif [ "$AS" -lt "$M" ]; then
			M=$AS
		fi
	fi
	# User specified memory limit. M is in megabytes, so the limit is
	# normalized to megabytes before comparing (K is divided, G is
	# multiplied, M only loses its suffix).
	if [ -v UMAXMEM ]; then
		UMAXMEM=$(mem_to_mib "$UMAXMEM")
		[ "$M" -gt "$UMAXMEM" ] && M=$UMAXMEM
	fi
	[ "$HOSTTYPE" = x86_64 ] && [ "$M" -gt 787455 ] && OPTS+=" -cpu qemu64,host-phys-bits=true"
	# Round to 256M for ppc
	[ "$HOSTTYPE" = powerpc64le ] && M=$((M / 256 * 256))
	# Too small to be useful: leave the size to qemu's default.
	[ $M -le 256 ] && unset M
	unset AS
fi
[ -n "${M:-}" ] && DEF+=" -m $M"
unset NPROCS

# Enable sandbox unless it is empty or explicitly `off'; the option is put
# in front of everything already collected in OPTS.
case "$SANDBOX" in
	'' | off) ;;
	*) OPTS="-sandbox $SANDBOX $OPTS" ;;
esac

# Give the guest a user-mode network device when /proc/net/dev has more
# than three lines.
if [ -e /proc/net/dev ] && [ $(wc -l < /proc/net/dev) -gt 3 ]; then
	case "$VIRTIOBUS" in
		pci) DEF+=" -nic user,model=virtio-net-pci" ;;
		*) DEF+=" -device virtio-net-$VIRTIOBUS,netdev=user -netdev user,id=user" ;;
	esac
	# Warn when neither `ip' nor `ifconfig' can be found.
	type ip >/dev/null 2>&1 || type ifconfig >/dev/null 2>&1 || {
		echo "Warning: To use share_network inside vm-run you may" >&2
		echo "  need to hsh-install iproute2 or net-tools" >&2
	}
fi

# Optional virtiofs setup (--virtiofs). Turns the VIRTIOFS flag into the
# complete virtiofsd command line and adds the matching qemu devices.
unset VFSD_SOCK
if [ -v VIRTIOFS ]; then
	VIRTIOFS=/usr/libexec/virtiofsd
	if [ ! -x "$VIRTIOFS" ]; then
		echo >&2 "Error: $VIRTIOFS command not found, install 'virtiofsd'."
		exit 1
	fi
	# Socket shared between virtiofsd and qemu.
	VFSD_SOCK=${TMPDIR:-/tmp}/vm-vfsd.sock
	VIRTIOFS+=" --shared-dir=/ --sandbox=none --socket-path=$VFSD_SOCK --announce-submounts --log-level=error"
	# size= expects value in bytes, but -m is in megabytes, unless qualified.
	SZ=${M:-256m}
	# A value ending in a digit has no unit yet; append `m'.
	[[ $SZ =~ [[:digit:]]$ ]] && SZ+=m
	DEF+=" -chardev socket,id=vfsd,path=$VFSD_SOCK"
	DEF+=" -device vhost-user-fs-$VIRTIOBUS,queue-size=1024,chardev=vfsd,tag=vfsd"
	# Guest RAM is backed by a shared memfd object sized like -m.
	DEF+=" -object memory-backend-memfd,id=mem,size=$SZ,share=on"
	DEF+=" -M memory-backend=mem"
	MODULES+=" virtiofs fuse"
fi
unset M SZ

# --rsync=DIR: requires the rsync binary, defaults the destination to `/'
# when no `:' is present, and implies a rootfs image.
if [ -v RSYNC ]; then
	if ! type -p rsync >/dev/null; then
		echo "Error: rsync binary not found." >&2
		exit 1
	fi
	case "$RSYNC" in
		*:*) ;;
		*) RSYNC+=:/ ;;
	esac
	printf "#  CMD: RSYNC=%q\n" "$RSYNC" >> $SCRIPT
	IMPLYROOTFS=y
fi

# --user implies --sudo.
[ -z "$RUNUSER" ] || {
	printf "#  CMD: RUNUSER=$RUNUSER\n" >> $SCRIPT
	FAKESUDO=y
}

# --sudo implies a rootfs image.
[ -z "$FAKESUDO" ] || {
	printf '#  CMD: FAKESUDO=y\n' >> $SCRIPT
	IMPLYROOTFS=y
}

# Fall back to the default image, created automatically when missing,
# if a rootfs is implied but none was named.
if [ -z "$ROOTFS" ] && [ -n "${IMPLYROOTFS-}" ]; then
	ROOTFS=$ROOTFS_DEF_IMAGE
	ROOTFSCREATE=auto
fi

# Run /sbin/blkid with BLKID_FILE pointing into a throw-away temporary
# directory, which is removed afterwards.
# Arguments: passed to /sbin/blkid unchanged.
# Returns:   the exit status of /sbin/blkid (or of mktemp when the
#            directory cannot be created).
blkid() {
	local tmpdir rc=0
	# Declared separately so a mktemp failure is not masked by `local'.
	tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/blkid.XXXXXXXXXX") || return
	BLKID_FILE="$tmpdir" /sbin/blkid "$@" || rc=$?
	rm -rf -- "$tmpdir"
	return "$rc"
}

# Boot from a disk image instead of 9p when a rootfs was requested.
if [ -n "$ROOTFS" ]; then
	# ROOTFS is "FILE[,drive-options]": split at the first comma.
	ROOTFSFILE=${ROOTFS%%,*}
	ROOTFSOPTS=${ROOTFS#"$ROOTFSFILE"}
	# An automatically implied image is only created when it is missing.
	[ "$ROOTFSCREATE" = "auto" ] && [ -e "$ROOTFS" ] && ROOTFSCREATE=
	[ -z "$ROOTFSCREATE" ] || {
		vm-create-image "$ROOTFSFILE" $([[ "$ROOTFS" =~ qcow2 ]] && echo --qcow2) ${FSFEAT:+--mkfs="-O$FSFEAT"}
		ARTIFACTS+=("$ROOTFSFILE")
	}
	FORMAT=$(qemu-img info "$ROOTFSFILE" | grep -Po '^file format: \K.*')
	# blkid's `-o export' output is KEY=value lines; eval turns them into
	# shell variables (TYPE, UUID and PTTYPE are used below).
	if [ "$FORMAT" = "qcow2" ]; then
		tmpfile=$(mktemp)
		qemu-img dd if="$ROOTFSFILE" of="$tmpfile" count=4
		eval "$(blkid -o export -- "$tmpfile")"
		rm -- "$tmpfile"
	elif [ "$FORMAT" = "raw" ]; then
		eval "$(blkid -o export -- "$ROOTFSFILE")"
	else
		echo "Error: Image '$ROOTFSFILE' in unsupported format '$FORMAT'." >&2
		exit 1
	fi
	if [ -n "${PTTYPE-}" ]; then
		# Disk partition detected.
		tmpfile=$(mktemp)
		# 'qemu-img dd's skip= applies to the value of count= (or after count= is started)
		# in contrast to dd(1) where it applies before count= begins.
		qemu-img dd if="$ROOTFSFILE" of="$tmpfile" count=2052 skip=2048
		eval "$(blkid -o export -- "$tmpfile")"
		rm -- "$tmpfile"
	fi
	if [[ ! "${TYPE-}" =~ ^ext[234]$ ]]; then
		echo "Error: ext4 filesystem not found in '$ROOTFSFILE'." >&2
		exit 1
	fi
	MODULES+=" $TYPE"
	OPTS+=" -drive file=$ROOTFSFILE"
	if [ ! -w "$ROOTFSFILE" ]; then
		if [ -v NOSNAPSHOT ]; then
			# Create user-writable qcow2 image over non-writable one created by rooter.
			# Filename integrates fs UUID, so in case it's regenerated after hsh-install
			# it will be created again.
			# As a hack SIZE can be passed externally via NOSNAPSHOT= env, resize2fs will
			# be called inside vm.
			_EXT=".$UUID.qcow2"
			# The overlay path is derived from ROOTFSFILE, not ROOTFS: the
			# -drive option below is "file=$ROOTFSFILE$_EXT", and ROOTFS may
			# carry ",options" that must not become part of the file name.
			if [ ! -e "$ROOTFSFILE$_EXT" ]; then
				[ -n "$VERBOSE" ] && echo >&2 "Creating writable overlay qcow2 image."
				qemu-img create -b "$ROOTFSFILE" -F "$FORMAT" -f qcow2 "$ROOTFSFILE$_EXT" $NOSNAPSHOT
				ARTIFACTS+=("$ROOTFSFILE$_EXT")
				[ -n "$NOSNAPSHOT" ] && printf '#  CMD: RESIZE2FS=%s\n' "$UUID" >> "$SCRIPT"
			fi
			# Extends the "file=$ROOTFSFILE" just appended to OPTS.
			OPTS+="$_EXT"
			FORMAT=qcow2
			unset _EXT
		else
			OPTS+=",snapshot=on"
		fi
	fi
	OPTS+="$ROOTFSOPTS"
	[ "$FORMAT" = "raw" ] && OPTS+=",format=raw"
	# Pick the drive interface unless the user's options already name one.
	[[ "$ROOTFSOPTS" =~ ,if= ]] || \
	case "$DRIVE_IF" in
		virtio)
			if [ "$VIRTIOBUS" = pci ]; then
				OPTS+=",if=virtio"
			else
				OPTS+=",if=none,id=vd0 -device virtio-blk-device,drive=vd0"
			fi
			MODULES+=" virtio_blk"
			;;
		scsi)
			OPTS+=",if=none,id=sd0 -device virtio-scsi-$VIRTIOBUS -device scsi-hd,drive=sd0"
			MODULES+=" virtio_scsi sd_mod"
			;;
		'')
			if [ "$VIRTIOBUS" = device ]; then
				OPTS+=",if=none,id=hd0 -device isa-ide -device ide-hd,drive=hd0"
			fi
			MODULES+=" sd_mod ata_piix"
			[ "$HOSTTYPE" = powerpc64le ] && MODULES+=" ibmvscsi"
			;;
	esac
	APPEND+=" root=UUID=$UUID"
	[ -n "$NOBIND" ] || printf '#  CMD: V mount --bind /mnt/9p/usr/src /usr/src\n' >> "$SCRIPT"
	unset FORMAT ROOTFSFILE FSFEAT
fi
# We shall load these modules even though we're using ext4 for rootfs
# (but supposedly we can mount 9p later and modprobe before that).
# It's because we need to access SCRIPT= (to read MODPROBE_OPTIONS)
# which could be (and will be) under /usr/src, but we bind mount it
# from 9p:/, thus 9p modules should be already loaded.
MODULES+=" 9p 9pnet_virtio"

unset ODEV ODEVEC
unset rd_etc_profile
[ -v RDSHELL ] && APPEND+=" rdshell"
[ -v RDBREAK ] && APPEND+=" rdbreak=$RDBREAK"
# Any initrd shell feature needs a busybox or toybox binary in the initrd.
if [ -v RDSHELL ] || [ -v RDBREAK ] || [ -n "$RDMODE" ]; then
	for i in busybox toybox; do
		CMD=$(type -p "$i") && break
	done
	if [ -n "${CMD:-}" ]; then
		# If it's somewhere else in PATH put it in /bin so initrd can find it.
		RDADD+=" $CMD:/bin/$(basename "$CMD")"
		# /etc/profile for the initrd shell: power off when PID 1 exits.
		rd_etc_profile=$(mktemp --suffix=.rdshell)
		printf '%s\n' '[ $$ -ne 1 ] || trap "poweroff -f" EXIT' > "$rd_etc_profile"
		RDADD+=" $rd_etc_profile:/etc/profile"
	else
		echo "Error: busybox/toybox executable not found in PATH." >&2
		exit 1
	fi
	unset CMD
fi
if [ -n "$RDMODE" ]; then
	# 'declare' is not portable.
	sed -i 's/^declare -x /export /' "$SCRIPT"
	RDADD+=" $SCRIPT"
	# Perhaps, no shared fs in initrd. Prepare exitcode device.
	ODEVEC=" -device virtserialport,chardev=ec,name=exitcode -chardev file,id=ec,path=$SCRIPT.ret"
	# Defined (even if empty) so that the virtio-serial bus gets added.
	ODEV=
fi

# Prepare swap device for tmpfs in guest.
# TMPFS (from --tmp=) is "SIZE[,drive-options]".
if [ -v TMPFS ]; then
	SWAPIMG=/tmp/vm-tmpfs.qcow2
	ARTIFACTS+=("$SWAPIMG")
	# Split at the first comma: size first, the rest are drive options.
	SWAPSZ=${TMPFS%%,*}
	SWAPOPT=${TMPFS#"$SWAPSZ"}
	# Clean up used space from previous run.
	rm -f -- "$SWAPIMG"
	# `max' means everything df reports as available on /tmp.
	[ "$SWAPSZ" = max ] && SWAPSZ=$(df /tmp --output=avail -BM 2>/dev/null | tail -n+2)
	# A size without a unit suffix gets `M' appended.
	[ "${SWAPSZ%[BKMGTbkmgt]}" = "$SWAPSZ" ] && SWAPSZ+='M'
	# mkswap wants secure permissions.
	(umask 077; truncate -s "$SWAPSZ" "$SWAPIMG"~)
	/sbin/mkswap -L tmpfs "$SWAPIMG"~ >/dev/null
	# Convert the raw swap file to qcow2 and drop the raw copy.
	qemu-img convert "$SWAPIMG"~ "$SWAPIMG" -O qcow2
	rm -- "$SWAPIMG"~
	OPTS+=" -drive file=$SWAPIMG$SWAPOPT"
	unset SWAPSZ SWAPIMG SWAPOPT TMPFS
fi

# Setup output devices.
# ODEV collects virtio-serial based devices; it is only defined when at
# least one of them is requested.
if [ -v HVC ]; then
	CONSOLE=hvc0
	# It seems 'signal=off' handles ^C differently than '-serial mon:stdio'.
	# For example, "vm-run --hvc cat /dev/zero | pv >/dev/null" is not stoppable
	# with ^C when 'signal=off' (you still can stop it with ^A x).
	ODEV=" -device virtconsole,chardev=con -chardev stdio,mux=on,signal=off,id=con -mon chardev=con"
	unset HVC
else
	DEF+=" -serial mon:stdio"
fi

if [ -v STDOUT ]; then
	printf "#  CMD: STDOUT=y\n" >> $SCRIPT
	# Keep the original stdout on fd 3 and send this script's own stdout
	# to stderr; fd 3 is handed to qemu via -add-fd.
	exec 3>&1 1>&2
	# NOTE(review): this assignment replaces an ODEV set by --hvc above.
	ODEV=" -add-fd fd=3,set=2 -chardev pipe,path=/dev/fdset/2,id=c1 -device virtserialport,chardev=c1,name=stdout"
	unset STDOUT
fi

# Add the virtio-serial bus with the collected devices, if any.
if [ -v ODEV ]; then
	MODULES+=" virtio_console"
	DEF+=" -device virtio-serial-$VIRTIOBUS $ODEV ${ODEVEC-}"
	unset ODEV ODEVEC
fi

# Print candidate kernel images, one per line, on stdout.
# Sorted by preference (%buildroot first) and mtime (newer first)
# Runs in a subshell with errexit off and globbing on; all error output
# (for example from globs that match nothing) is discarded.
list_kernels() {
	(
		set +ef
		# Installed kernels have highest priority.
		ls -t ${RPM_BUILD_ROOT-/usr/src/tmp/*-buildroot}/boot/vmlinu[xz]-*

		# Just built kernels in build dir have medium priority.
		builddir=${RPM_BUILD_DIR-/usr/src/RPM/BUILD}
		# The kernel tree is taken to be the directory of the first
		# MAINTAINERS file found at most three levels deep.
		kbuilddir=$(find "$builddir" -depth -maxdepth 3 -type f -name MAINTAINERS -print -quit)
		kbuilddir=${kbuilddir%/*}
		KBZ=$(find $kbuilddir/arch/*/boot -name '*Image' -type f)
		# Raw vmlinux frequently do not work, with error:
		# qemu-system-x86_64: Error loading uncompressed kernel without PVH ELF Note
		KBD=$(find $kbuilddir -depth -maxdepth 3 -type f -name vmlinux)
		# KBD and KBZ are left unquoted on purpose so that every found
		# path becomes its own argument to ls.
		[ -z "$KBD$KBZ" ] || ls -t $KBD $KBZ

		# Kernels from packages have lowest priority.
		ls -t /boot/vmlinu[xz]-*
	) 2>/dev/null
}

# Print the list of available kernels (tab-indented) on stderr, or a hint
# that none could be found.
list_kernels_ui() {
	if ! list_kernels | grep -q .; then
		echo "  No kernels found, try to install a kernel first." >&2
		return
	fi
	echo "  List of available kernels:" >&2
	list_kernels | sed 's/^/\t/' >&2
}

# Print the first kernel from list_kernels that matches $1 (a regular
# expression; empty when omitted). Always returns 0, printing nothing
# when no kernel matches.
guess_kernel() {
	local wanted=${1-}
	local -a flags=(-x -w '')
	local -a patterns=(".*/$wanted" "$wanted" "$wanted")
	local n

	# Better match first: whole file name, then whole word, then anywhere.
	for n in 0 1 2; do
		list_kernels | grep -s -m1 ${flags[n]} -e "${patterns[n]}" && return
	done
	return 0
}

# Resolve the kernel image to boot. KERNEL may be a file, a pattern for
# guess_kernel, empty (list kernels and fail) or unset (pick the best one).
if [ -f "${KERNEL-}" ]; then
	# Quoted so that a path with whitespace stays one argument.
	KERN=$(realpath -- "$KERNEL")
elif [ ! -v KERNEL ]; then
	# No --kernel= option.
	KERN=$(guess_kernel)
elif [ -n "$KERNEL" ]; then
	# Non-empty --kernel= option.
	KERN=$(guess_kernel "$KERNEL")
else
	# Empty --kernel= option for easy listing.
	list_kernels_ui
	exit 1
fi

if [ ! -f "$KERN" ]; then
	if [ -n "${KERNEL-}" ]; then
		echo "Error: Cannot find the kernel matching '$KERNEL'." >&2
	else
		echo "Error: Cannot find a kernel, try to use '--kernel=' option." >&2
	fi
	list_kernels_ui
	exit 1
fi
KERNEL=$KERN
unset KERN

# Everything in front of /boot/ is the root that holds the kernel's
# modules; it is empty for kernels installed under /boot.
ROOT=${KERNEL%%/boot/*}
if [ -n "$ROOT" ]; then
	if [ ! -d "$ROOT/lib/modules" ] &&
	   [ -d "$ROOT/usr/lib/modules" ]; then
		ROOT="$ROOT/usr"
	fi
	# Two spaces to defend against accidental injection, since
	# command is '%q'-escaped and cannot contain consecutive spaces.
	printf '#  MODPROBE_OPTIONS=%q\n' "--dirname=$ROOT" >> "$SCRIPT"
fi

# EXIT handler: restore the modules.* files saved in the $TMP_DEPMOD
# archive (if there is one), remove the archive, then exit.
# Arguments: passed to `exit' unchanged (normally the exit status).
cleanup_depmod() {
	# Two separate tests: with `[ -v X -a -e "$X" ]' the shell expands
	# "$X" before `[' runs, which is fatal under `set -u' when X is unset.
	if [ -v TMP_DEPMOD ] && [ -e "$TMP_DEPMOD" ]; then
		[ -n "${VERBOSE-}" ] && echo >&2 "Undoing temporary depmod."
		tar xf "$TMP_DEPMOD" -C / 2>/dev/null
		rm -- "$TMP_DEPMOD"
		unset TMP_DEPMOD
	fi
	exit "$@"
}

# Generate initramfs.
# Skipped when INITRD already names an existing file.
if [ ! -f "${INITRD-}" ]; then
	# For kernel in build dir.
	if [ -z "${KERNEL##/usr/src/RPM/BUILD/*}" ]; then
		ROOT=${TMPDIR-/tmp}
		# Kernel source tree: the image path without its file name and
		# without everything from /arch/ on.
		KSRC=${KERNEL%/*}
		KSRC=${KSRC%/arch/*}
		cd $KSRC
		unset KSRC
		# Override bogus MAKEFLAGS options (such as `-w`).
		KVER=$(MAKEFLAGS=-s make kernelrelease | tail -1)
		mkdir -p "$ROOT/lib/modules"
		# Make the build tree reachable as $ROOT/lib/modules/$KVER.
		if [ "$(readlink "$ROOT/lib/modules/$KVER")" != "$PWD" ]; then
			( set -x; ln -snf "$PWD" "$ROOT/lib/modules/$KVER" )
		fi
		( set -x; /sbin/depmod -b "$ROOT" "$KVER" )
		cd $OLDPWD
		printf '#  MODPROBE_OPTIONS=%q\n' "--dirname=$ROOT" >> $SCRIPT
	else
		# Kernel version.
		KVER=${KERNEL##*/vmlinu[xz]-}
		if [ ! -d "$ROOT/lib/modules/$KVER" ]; then
			echo "Error: Cannot find $ROOT/lib/modules/$KVER for your kernel." >&2
			echo "       Please, install the kernel or make modules_install!"  >&2
			exit 1
		fi
	fi
	# Without a non-empty modules.dep.bin depmod has to be run first.
	if [ ! -s "$ROOT/lib/modules/$KVER/modules.dep.bin" ]; then
		if [ -w "$ROOT/lib/modules/$KVER" ]; then
			# In a buildroot without --depmod the existing modules.*
			# files are archived first and restored by the EXIT trap.
			if [ -z "$DEPMOD" ] && [[ "$ROOT" =~ buildroot ]]; then
				[ -n "$VERBOSE" ] && echo >&2 "Running temporary depmod (will undo later)."
				TMP_DEPMOD="${TMPDIR-/tmp}/depmod-$KVER-$$.tgz"
				trap 'cleanup_depmod $?' EXIT
				(set +f;
				tar zcf $TMP_DEPMOD "$ROOT/lib/modules/$KVER"/modules.* 2>/dev/null)
			fi
			( set -x; /sbin/depmod ${ROOT:+-b $ROOT} $KVER )
		else
			echo "Error: No valid 'modules.dep.bin' in $ROOT/lib/modules/$KVER"  >&2
			echo "       which is also not writable directory. As a result 'modprobe' will not work!"  >&2
			exit 1
		fi
	fi
	INITRD=${TMPDIR:-/tmp}/initramfs-$KVER.img
	# The virtio transport module follows the selected virtio bus.
	[ "$VIRTIOBUS" = "pci" ] && MODULES+=" virtio_pci" || MODULES+=" virtio_mmio"
	vm-initrd --modules="$MODULES" --basedir="$ROOT" --add="$RDADD" ${VERBOSE:+-v} "$INITRD" "$KVER" || {
		echo "Error: Some errors ($?) while generating initrd." >&2
		exit 1
	}
	ARTIFACTS+=("$INITRD")
	unset CPIO MODULES KVER skip f bn un
fi
unset ROOT
# The temporary /etc/profile is no longer needed at this point.
[ -v rd_etc_profile ] && rm "$rd_etc_profile"

# Run without a graphical window unless DISPLAY is defined.
if [ -v DISPLAY ]; then
	nographic=
else
	nographic=-nographic
fi

# --loglevel= (or LOGLEVEL) disables quiet boot. A purely numeric value
# becomes loglevel=N, anything else enables early serial output; the
# keywords debug, xtrace and initcall add further parameters.
if [ -v LOGLEVEL ]; then
	QUIET=
	case "$LOGLEVEL" in
		'' | *[![:digit:]]*) APPEND+=" earlyprintk=serial ignore_loglevel" ;;
		*) APPEND+=" loglevel=$LOGLEVEL" ;;
	esac
	case "$LOGLEVEL" in
		*debug*) APPEND+=" debug rddebug" ;;
	esac
	case "$LOGLEVEL" in
		*xtrace*) APPEND+=" SHELLOPTS=xtrace" ;;
	esac
	case "$LOGLEVEL" in
		*initcall*) APPEND+=" initcall_debug" ;;
	esac
fi

# Final kernel command line: fixed parameters first, then everything
# collected in APPEND so far.
APPEND="console=$CONSOLE mitigations=off nokaslr $QUIET panic=-1 ${RDMODE}SCRIPT=$SCRIPT$APPEND"

# QEMU Monitor is available (through character backend multiplexer), keys:
#   C-a h    print this help
#   C-a x    exit emulator
#   C-a s    save disk data back to file (if -snapshot)
#   C-a t    toggle console timestamps
#   C-a b    send break (magic sysrq)
#   C-a c    switch between console and monitor
#   C-a C-a  sends C-a
#
# QEMU Monitor help: https://qemu-project.gitlab.io/qemu/system/monitor.html
#
# For example, to send Magic-SysRq-Help press: C-a b h
# or using monitor command: sendkey alt-sysrq-h
#   sysrq: HELP : loglevel(0-9) reboot(b) crash(c) terminate-all-tasks(e)
#   memory-full-oom-kill(f) kill-all-tasks(i) thaw-filesystems(j) sak(k)
#   show-backtrace-all-active-cpus(l) show-memory-usage(m) nice-all-RT-tasks(n)
#   poweroff(o) show-registers(p) show-all-timers(q) unraw(r) sync(s)
#   show-task-states(t) unmount(u) show-blocked-tasks(w) dump-ftrace-buffer(z)
#

# Start virtiofsd in the background when requested. It runs in a subshell
# so that clearing LD_PRELOAD and enabling tracing stay local to it.
[ -v VIRTIOFS ] && (
	unset LD_PRELOAD
	test "$NOCMD" || set -x
	$VIRTIOFS &
)

# From here on a failing command must not abort the script: qemu's exit
# status is examined explicitly below.
set +e
# -serial mon:stdio  to make ^C not break qemu
# -device virtio-rng-pci  for virtio-rng
# The variables are expanded unquoted on purpose: each holds a
# space-separated list of qemu arguments.
(
	# Provide more debugging info on qemu crashes.
	grep -sq systemd-coredump /proc/sys/kernel/core_pattern && ulimit -c unlimited
	# Show the qemu command line unless --silent was given.
	test "$NOCMD" || set -x
	$TIME \
	$QEMU \
		$DEF \
		-nodefaults \
		$nographic \
		-no-reboot \
		-fsdev local,id=root,path=/,security_model=none,multidevs=$MULTIDEVS \
		-device virtio-9p-$VIRTIOBUS,fsdev=root,mount_tag=virtio-9p:/ \
		-device virtio-rng-$VIRTIOBUS \
		-kernel $KERNEL \
		-initrd $INITRD \
		$OPTS \
		-append "$APPEND"
)
# Exit status of the subshell above, i.e. of the qemu command.
declare -i ret=$?

# Stop virtiofsd (its pid is read from "$VFSD_SOCK.pid") and remove its
# socket and pid file.
if [ -v VFSD_SOCK ]; then
	if read pid < "$VFSD_SOCK.pid"; then
		kill "$pid" 2>/dev/null
	fi
	rm -f -- "$VFSD_SOCK" "$VFSD_SOCK.pid"
fi

set -e
# Statuses 132..159 are reported as a crash of qemu itself.
if (( ret >= 132 && ret <= 159 )); then
	echo >&2 "  NOTICE: This is crash of $QEMU, not of the Linux kernel!"
	if grep -sq systemd-coredump /proc/sys/kernel/core_pattern; then
		echo >&2 "  NOTICE: Perhaps, issue 'coredumpctl info' for details."
	fi
fi
# Exit due to qemu reasons.
if (( ret > 0 )); then
	exit $ret
fi

# Without a non-empty exit code file the command's result is unknown.
if [ ! -s $SCRIPT.ret ]; then
	# Possible kernel crash or C-a x
	echo "vm-run: Error: no exit code is found, failure." >&2
	exit 1
fi

# Exit due to script reasons. The artifacts are removed only after a
# successful run with stdin not on a terminal and without --no-rm; in
# every other case they are kept.
read ret < $SCRIPT.ret
[ "$ret" = 0 ] && [ -v RMARTIFACTS ] && [ ! -t 0 ] && rm -- "${ARTIFACTS[@]}"
exit $ret
