blob: 29d1b470bded4a59b269521fe38c9e94a566a4c3 [file] [log] [blame]
#! /bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2021 Oracle, Inc. All Rights Reserved.
#
# FS QA Test No. 648
#
# Test nested log recovery with repeated (simulated) disk failures. We kick
# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
# out the underlying scratch device with dm-error to see what happens when the
# disk goes down. Having taken down both fses in this manner, remount them and
# repeat. This test simulates VM hosts crashing to try to shake out CoW bugs
# in writeback on the host that cause VM guests to fail to recover.
#
# Pull in the shared test preamble, then start the test; the words after
# _begin_fstest appear to be the test groups this test belongs to.
. ./common/preamble
_begin_fstest shutdown auto log metadata eio recoveryloop
# Tear down whatever the test left behind: stop the background workers,
# unmount and remove the loop mount point, delete the temp files, and
# release the dm-error device.
# Globals read: scratch_aliveflag, loopmnt, tmp, KILLALL_PROG, UMOUNT_PROG
_cleanup()
{
	cd /
	# snap_loop_fs keeps running for as long as this flag file exists, so
	# drop it before the wait below; otherwise cleanup can block forever
	# when it is entered while a soak iteration is still in flight.
	if [ -n "$scratch_aliveflag" ]; then
		rm -f "$scratch_aliveflag"
	fi
	$KILLALL_PROG -9 fsstress > /dev/null 2>&1
	wait
	# loopmnt is only set once the loop image has been created.
	if [ -n "$loopmnt" ]; then
		$UMOUNT_PROG $loopmnt 2>/dev/null
		rm -r -f $loopmnt
	fi
	rm -f $tmp.*
	_dmerror_unmount
	_dmerror_cleanup
}
# Import common functions.
. ./common/dmerror
. ./common/reflink
# Things this test needs: reflink on the scratch fs and in cp, the dm "error"
# target, killall, and loop device support.
_require_scratch_reflink
_require_cp_reflink
_require_dm_target error
_require_command "$KILLALL_PROG" "killall"
_require_loop
echo "Silence is golden."
# Format the scratch device; mkfs output is appended to $seqres.full.
_scratch_mkfs >> $seqres.full 2>&1
_require_metadata_journaling $SCRATCH_DEV
# Set up and mount the dm-error device (presumably layered on the scratch
# device, per the test description above -- see common/dmerror).
_dmerror_init
_dmerror_mount
# Create a fs image consuming 1/3 of the scratch fs
scratch_freesp_bytes=$(_get_available_space $SCRATCH_MNT)
loopimg_bytes=$((scratch_freesp_bytes / 3))
loopimg=$SCRATCH_MNT/testfs
truncate -s $loopimg_bytes $loopimg
_mkfs_dev $loopimg
# Mount point for the loop image. It lives under $tmp, so _cleanup removes it.
loopmnt=$tmp.mount
mkdir -p $loopmnt
# Flag files used to coordinate with snap_loop_fs:
#   scratch_aliveflag - exists while the main loop wants snapshots to continue
#   snap_aliveflag    - exists while snap_loop_fs is still running
scratch_aliveflag=$tmp.runsnap
snap_aliveflag=$tmp.snapping
# Keep re-creating a reflink copy of the loop image ($loopimg.a) once a
# second for as long as $scratch_aliveflag exists. $snap_aliveflag is created
# on entry and removed on exit so the caller can tell when this has stopped.
snap_loop_fs() {
	local snapshot=$loopimg.a

	touch "$snap_aliveflag"
	while :; do
		# The main loop removes this flag to ask us to stop.
		[ -e "$scratch_aliveflag" ] || break
		rm -f $snapshot
		_cp_reflink $loopimg $snapshot
		sleep 1
	done
	rm -f "$snap_aliveflag"
}
# fsstress command line aimed at the loop-mounted fs. The process count scales
# with LOAD_FACTOR; the op count is large because each run is ended by
# killall below rather than by running to completion.
fsstress=($FSSTRESS_PROG $FSSTRESS_AVOID -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))")

# Each iteration: start snapshotting and stressing the loop fs, yank the
# underlying device out from under both filesystems, then bring the scratch
# fs back so the next iteration (or the final check) can recover it.
while _soak_loop_running $((25 * TIME_FACTOR)); do
	touch $scratch_aliveflag
	snap_loop_fs >> $seqres.full 2>&1 &

	if ! _mount $loopimg $loopmnt -o loop; then
		rm -f $scratch_aliveflag
		# Name the dump after the soak iteration so it matches the
		# failure message.
		_metadump_dev $loopimg $seqres.loop.$SOAK_LOOPIDX.md
		_fail "iteration $SOAK_LOOPIDX loopimg mount failed"
		break
	fi

	# fsstress is started from a subshell, so it is not a job of this
	# shell; it has to be found and killed by name further down.
	("${fsstress[@]}" >> $seqres.full &) > /dev/null 2>&1

	# purposely include 0 second sleeps to test shutdown immediately after
	# recovery
	sleep $((RANDOM % (3 * TIME_FACTOR) ))
	rm -f $scratch_aliveflag

	# This test aims to simulate sudden disk failure, which means that we
	# do not want to quiesce the filesystem or otherwise give it a chance
	# to flush its logs. Therefore we want to call dmsetup with the
	# --nolockfs parameter; to make this happen we must call the load
	# error table helper *without* 'lockfs'.
	_dmerror_load_error_table

	# Keep killing until no fsstress process shows up in the process list.
	while ps -e | grep fsstress > /dev/null 2>&1; do
		$KILLALL_PROG -9 fsstress > /dev/null 2>&1
		wait > /dev/null 2>&1
	done

	# Give snap_loop_fs up to 10 seconds to notice the flag is gone.
	for ((j = 0; j < 10; j++)); do
		test -e "$snap_aliveflag" || break
		sleep 1
	done

	# Unmount both filesystems, then remount the scratch fs on the working
	# table so that its log is replayed and we have a consistent fs after
	# the test.
	$UMOUNT_PROG $loopmnt

	# is_unmounted holds the status of the last unmount attempt: it stays
	# non-zero until an unmount succeeds.
	is_unmounted=1
	# The dmerror device must be unmounted here, otherwise everything that
	# follows breaks. Retry up to 100 times before giving up.
	for ((j = 0; j < 100; j++)); do
		sleep 1
		if _dmerror_unmount > $tmp.unmount.err 2>&1; then
			is_unmounted=0
			break
		fi
	done
	if [ $is_unmounted -ne 0 ]; then
		cat $tmp.unmount.err
		_fail "iteration $SOAK_LOOPIDX scratch unmount failed"
	fi

	_dmerror_load_working_table
	if ! _dmerror_mount; then
		_metadump_dev $DMERROR_DEV $seqres.scratch.$SOAK_LOOPIDX.md
		_fail "iteration $SOAK_LOOPIDX scratch mount failed"
	fi
done
# Final pass over the loop image, if it still exists: mount it once, unmount
# it again on success or capture a metadump on failure, then check it.
if [[ -f "$loopimg" ]]; then
	if ! _mount $loopimg $loopmnt -o loop; then
		_metadump_dev $DMERROR_DEV $seqres.scratch.final.md
		echo "final scratch mount failed"
	else
		$UMOUNT_PROG $loopmnt > /dev/null 2>&1
	fi
	# Clear the realtime and log device settings for this one check of
	# the image file.
	SCRATCH_RTDEV= SCRATCH_LOGDEV= _check_scratch_fs $loopimg
fi

# success, all done; let the test harness check the scratch fs
status=0
exit