#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2024 Red Hat, Inc. All Rights Reserved.
#
# Run all tests in parallel
#
# This is a massive resource bomb of a script. For each test runner, it
# creates a pair of sparse loop devices to use as the test and scratch
# devices, creates mount points for them, and runs the runner's share of the
# tests in the background. When a runner completes, it tears down its loop
# devices.
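#
# The first argument is the base working directory for the runners; all
# remaining arguments are passed straight through to ./check.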
export SRC_DIR="tests"
basedir=$1
shift
check_args="$*"
runners=64
runner_list=()
runtimes=()
# tests in auto group
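# (group.list lines look like "001 auto quick rw"; the awk picks the
# sequence number of every test tagged with the "auto" group.)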
test_list=$(awk '/^[0-9].*auto/ { print "generic/" $1 }' tests/generic/group.list)
test_list+=" $(awk '/^[0-9].*auto/ { print "xfs/" $1 }' tests/xfs/group.list)"
# grab all previously run tests and order them from highest runtime to lowest
# We are going to try to run the longer tests first, hopefully so we can avoid
# massive thundering herds trying to run lots of really short tests in parallel
# right off the bat. This will also tend to vary the order of tests from run to
# run somewhat.
#
# If we have tests in the test list that don't have runtimes recorded, then
# append them to be run last.
build_runner_list()
{
	local runtimes
	local run_list=()
	local prev_results=`ls -tr $basedir/runner-0/ | grep results | tail -1`

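	# check.time lines are "<test> <seconds>" (e.g. "generic/475 287"),
	# so sort on the runtime field and cut out the test names to get a
	# list ordered from longest runtime to shortest.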
	runtimes=$(cat $basedir/*/$prev_results/check.time | sort -k 2 -nr | cut -d " " -f 1)

	# Iterate the timed list first. For every timed list entry that
	# is found in the test_list, add it to the local runner list.
	local -a _list=( $runtimes )
	local -a _tlist=( $test_list )
	local rx=0
	local ix

	#set -x
	for ((ix = 0; ix < ${#_list[*]}; ix++)); do
		if echo $test_list | grep -q ${_list[$ix]}; then
			# add the test to the new run list and remove
			# it from the remaining test list.
			run_list[rx++]=${_list[$ix]}
			_tlist=( ${_tlist[*]/${_list[$ix]}/} )
		fi
	done

	# The final test list is all the time ordered tests followed by
	# all the tests we didn't find time records for.
	test_list="${run_list[*]} ${_tlist[*]}"
}

# Results directories are timestamped (results-<date>), so look for any
# previous check.time rather than a bare "results" directory.
if ls $basedir/runner-0/results-*/check.time > /dev/null 2>&1; then
	build_runner_list
fi

# split the list amongst N runners
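# Tests are dealt out round-robin: runner N gets ordered-list entries N,
# N + runners, N + 2 * runners, and so on. For example, with 4 runners,
# runner 0 would run tests 0, 4, 8, ... of the ordered list.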
split_runner_list()
{
	local ix
	local rx
	local -a _list=( $test_list )

	for ((ix = 0; ix < ${#_list[*]}; ix++)); do
		seq="${_list[$ix]}"
		rx=$((ix % $runners))
		runner_list[$rx]+="${_list[$ix]} "
		#echo $seq
	done
}
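
# This file doesn't source the fstests common helpers, so provide a minimal
# _fail for the loop device helpers below: report the error and abort this
# runner.
_fail()
{
	echo "$*" 1>&2
	exit 1
}
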
_create_loop_device()
{
	local file=$1 dev
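	# losetup -f attaches the file to the first unused loop device and
	# --show prints the device name that was allocated.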
	dev=`losetup -f --show $file` || _fail "Cannot assign $file to a loop device"

	# Using buffered IO for the loop devices seems to run quite a bit
	# faster. There are a lot of tests that hit the same regions of the
	# filesystems, so avoiding read IO seems to really help. Results can
	# vary, though, because many tests drop all caches unconditionally.
	# Uncomment to use AIO+DIO loop devices instead.
	#test -b "$dev" && losetup --direct-io=on $dev 2> /dev/null

	echo $dev
}

_destroy_loop_device()
{
	local dev=$1

	blockdev --flushbufs $dev
	umount $dev > /dev/null 2>&1
	losetup -d $dev || _fail "Cannot destroy loop device $dev"
}
runner_go()
{
	local id=$1
	local me=$basedir/runner-$id
	local _test=$me/test.img
	local _scratch=$me/scratch.img
	local _results=$me/results-$2

	mkdir -p $me
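	# Truncation makes the test and scratch images sparse, so the 10GB
	# of image files per runner only consumes disk space as tests write
	# to them.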
	xfs_io -f -c 'truncate 2g' $_test
	xfs_io -f -c 'truncate 8g' $_scratch
	mkfs.xfs -f $_test > /dev/null 2>&1

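	# The standard fstests configuration variables, normally set in
	# local.config; each runner gets a private set pointing at its own
	# devices and mount points.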
	export TEST_DEV=$(_create_loop_device $_test)
	export TEST_DIR=$me/test
	export SCRATCH_DEV=$(_create_loop_device $_scratch)
	export SCRATCH_MNT=$me/scratch
	export FSTYP=xfs
	export RESULT_BASE=$_results

	mkdir -p $TEST_DIR
	mkdir -p $SCRATCH_MNT
	mkdir -p $RESULT_BASE
	rm -f $RESULT_BASE/check.*

	# export DUMP_CORRUPT_FS=1
	# Run the tests in their own mount namespace, as per the comment
	# below that precedes making the basedir a private mount.
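	# nsexec is built from the fstests src/ directory; -m gives check a
	# private mount namespace, much as "unshare --mount" would.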
	./src/nsexec -m ./check $check_args -x unreliable_in_parallel --exact-order ${runner_list[$id]} > $me/log 2>&1
	wait
	sleep 1

	umount -R $TEST_DIR 2> /dev/null
	umount -R $SCRATCH_MNT 2> /dev/null
	_destroy_loop_device $TEST_DEV
	_destroy_loop_device $SCRATCH_DEV

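	# check emits a summary line of the form "Failures: <test> <test> ..."
	# when anything failed; scrape it out of this runner's log.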
	if grep -q Failures: $me/log; then
		echo -n "Runner $id Failures: "
		grep Failures: $me/log | uniq | sed -e "s/^.*Failures://"
	fi
}
cleanup()
{
	killall -INT -q check
	wait
	umount -R $basedir/*/test 2> /dev/null
	umount -R $basedir/*/scratch 2> /dev/null
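	# Beware: --detach-all tears down every loop device on the system,
	# not just the ones this script created.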
	losetup --detach-all
}
trap "cleanup; exit" HUP INT QUIT TERM
# Each parallel test runner needs to see only its own mount points. If we
# leave the basedir as a shared mount, then all tests see all mounts and we
# get mount propagation issues cropping up. For example, cloning a new mount
# namespace will take a reference to all visible shared mounts and hold them
# while the mount namespace is active. This can cause an unmount in the test
# that controls the mount to succeed without actually unmounting the
# filesystem, because a mount namespace still holds a reference to it. That
# makes subsequent operations on the block device fail because the device is
# still busy (e.g. fsck, mkfs, etc). Hence we make the basedir private here
# and then run each check instance in its own mount namespace so that they
# cannot see the mounts that other tests are performing.
mount --make-private $basedir
split_runner_list
now=`date +%Y-%m-%d-%H:%M:%S`
for ((i = 0; i < $runners; i++)); do
	runner_go $i $now &
done
wait
echo -n "Tests run: "
grep Ran $basedir/*/log | sed -e 's,^.*:,,' -e 's, ,\n,g' | sort | uniq | wc -l
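# Each runner log has a single "Failures: <list>" line; insert a newline
# between one test name and the next (a digit followed by the "g" or "x" of
# generic/xfs) so that wc -l counts individual test failures across runners.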
echo -n "Failure count: "
grep Failures: $basedir/*/log | uniq | sed -e "s/^.*Failures://" -e "s,\([0-9]\) \([gx]\),\1\n \2,g" | wc -l
echo
echo Ten slowest tests - runtime in seconds:
cat $basedir/*/results-$now/check.time | sort -k 2 -nr | head -10
echo
echo Cleanup on Aisle 5?
echo
losetup --list
ls -l /dev/mapper
df -h | grep xfs