| #! /bin/bash |
| # SPDX-License-Identifier: GPL-2.0 |
| # Copyright (C) 2021 SUSE Linux Products GmbH. All Rights Reserved. |
| # |
| # FS QA Test No. 252 |
| # |
| # Test that send and balance can run in parallel, without failures and producing |
| # correct results. |
| # |
| # Before kernel 5.3 it was possible to run both operations in parallel, however |
| # it was buggy and caused sporadic failures due to races, so it was disabled in |
| # kernel 5.3 by commit 9e967495e0e0ae ("Btrfs: prevent send failures and crashes |
# due to concurrent relocation"). There is now a patch that enables both
| # operations to safely run in parallel, and it has the following subject: |
| # |
| # "btrfs: make send work with concurrent block group relocation" |
| # |
| # This also serves the purpose of testing a succession of incremental send |
| # operations, where we have a bunch of snapshots of the same subvolume and we |
| # keep doing an incremental send using the previous snapshot as the parent. |
| # |
| . ./common/preamble |
| _begin_fstest auto send balance stress |
| |
| _cleanup() |
| { |
	if [ -n "$balance_pid" ]; then
| kill $balance_pid &> /dev/null |
| wait $balance_pid |
| fi |
| cd / |
| rm -r -f $tmp.* |
| } |
| |
| . ./common/filter |
| |
| |
| # The size needed is variable as it depends on the specific randomized |
| # operations from fsstress and on the value of $LOAD_FACTOR. But require |
| # at least $LOAD_FACTOR * 6G, as we do the receive operations to the same |
| # filesystem, do relocation and store snapshot checksums from fssum in the |
| # filesystem as well (can be hundreds of megabytes for $LOAD_FACTOR > 3). |
| _require_scratch_size $(($LOAD_FACTOR * 6 * 1024 * 1024)) |
| _require_fssum |
| |
| balance_loop() |
| { |
| trap "wait; exit" SIGTERM |
| |
| while true; do |
		_run_btrfs_balance_start $SCRATCH_MNT > /dev/null
		ret=$?
		[ $ret -eq 0 ] || echo "Balance failed: $ret"
| done |
| } |
| |
| _scratch_mkfs >> $seqres.full 2>&1 |
| _scratch_mount |
| |
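# Scale the number of snapshots and the total number of fsstress operations
# with $LOAD_FACTOR.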
| num_snapshots=$((10 + $LOAD_FACTOR * 2)) |
| avg_ops_per_snapshot=$((1000 * LOAD_FACTOR)) |
| total_fsstress_ops=$((num_snapshots * avg_ops_per_snapshot)) |
| |
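# Snapshots of the data subvolume go to $snapshots_dir, their received copies
# to $dest_dir and the fssum checksum files to $fssum_dir.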
| data_subvol="$SCRATCH_MNT/data" |
| snapshots_dir="$SCRATCH_MNT/snapshots" |
| dest_dir="$SCRATCH_MNT/received" |
| fssum_dir="$SCRATCH_MNT/fssum" |
| |
| mkdir -p "$snapshots_dir" |
| mkdir -p "$dest_dir" |
| mkdir -p "$fssum_dir" |
| $BTRFS_UTIL_PROG subvolume create "$data_subvol" >> $seqres.full |
| |
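# Command used by fsstress (through -x/-X below) to create read-only snapshots
# of the data subvolume while it runs. Each snapshot is named after the current
# timestamp, so that names sort in creation order.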
| snapshot_cmd="$BTRFS_UTIL_PROG subvolume snapshot -r \"$data_subvol\"" |
| snapshot_cmd="$snapshot_cmd \"$snapshots_dir/snap_\`date +'%s%N'\`\"" |
| |
| # Use a single fsstress process so that in case of a failure we can grab the seed |
| # number from the .full log file and deterministically reproduce a failure. |
# Also disable subvolume and snapshot creation, since send is not recursive and
# having them would therefore be pointless.
| # |
| echo "Running fsstress..." >> $seqres.full |
| $FSSTRESS_PROG $FSSTRESS_AVOID -d "$data_subvol" -p 1 -w \ |
| -f subvol_create=0 -f subvol_delete=0 -f snapshot=0 \ |
| -x "$snapshot_cmd" -X $num_snapshots \ |
| -n $total_fsstress_ops >> $seqres.full |
| |
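# Collect the snapshot names. Since each name ends with a timestamp, the sorted
# output of ls lists the snapshots in creation order.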
| snapshots=(`IFS=$'\n' ls -1 "$snapshots_dir"`) |
| |
| # Compute the checksums for every snapshot. |
| for i in "${!snapshots[@]}"; do |
| snap="${snapshots_dir}/${snapshots[$i]}" |
| echo "Computing checksum for snapshot: ${snapshots[$i]}" >> $seqres.full |
| $FSSUM_PROG -A -f -w "${fssum_dir}/${i}.fssum" "$snap" |
| done |
| |
| # Now leave a process constantly running balance. |
| balance_loop & |
| balance_pid=$! |
| |
| # Now send and receive all snapshots to our destination directory. |
# We send and receive from/to the same filesystem using a pipe, because this is
# the most stressful scenario and it could lead (and has led, during
# development) to deadlocks - the sending task blocking on a full pipe while
| # holding some lock, while the receiving side was not reading from the pipe |
| # because it was waiting for a transaction commit, which could not happen due |
| # to the lock held by the sending task. |
| # |
| for i in "${!snapshots[@]}"; do |
| snap="${snapshots_dir}/${snapshots[$i]}" |
| prev_snap="${snapshots_dir}/${snapshots[$i - 1]}" |
| |
	# For the first snapshot we do a full send, while for all the others we
	# do an incremental send, using the previous snapshot as the parent.
| # |
	# We redirect stderr of the send command because the command prints to
	# stderr a message like "At subvol ...", and the number of snapshots and
	# snapshot names we have depend on LOAD_FACTOR and timestamps, so we
| # don't use them in the golden output. Recent versions of btrfs-progs |
| # have the --quiet option to eliminate these messages. |
| # |
| # We also redirect stderr and stdout of the receive command. Note that |
| # when receiving a full stream, the command prints a message like |
| # "At subvol ..." to stderr, but when receiving an incremental stream it |
| # prints a message to stdout like "At snapshot ...". Just like for the |
| # send command, new versions of btrfs-progs have the --quiet option to |
| # eliminate these messages. |
| # |
	# Further, the send command prints the messages with a full snapshot path,
| # while receive prints only the snapshot name. |
| # |
| # We redirect all these messages to $seqres.full and then manually check |
| # if the commands succeeded. This is just so that the test is able to |
| # run with older versions of btrfs-progs. |
| # |
| if [ $i -eq 0 ]; then |
| echo "Full send of the snapshot at: $snap" >>$seqres.full |
| $BTRFS_UTIL_PROG send "$snap" 2>>$seqres.full | \ |
| $BTRFS_UTIL_PROG receive "$dest_dir" 2>>$seqres.full |
| else |
| echo "Incremental send of the snapshot at: $snap" >>$seqres.full |
| $BTRFS_UTIL_PROG send -p "$prev_snap" "$snap" 2>>$seqres.full | \ |
| $BTRFS_UTIL_PROG receive "$dest_dir" >>$seqres.full |
| fi |
| |
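	# Save the exit status of both ends of the pipeline before running any
	# other command, as that would reset $PIPESTATUS.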
| retvals=( "${PIPESTATUS[@]}" ) |
| |
| [ ${retvals[0]} -eq 0 ] || \ |
| echo "Send of snapshot $snap failed: ${retvals[0]}" |
| [ ${retvals[1]} -eq 0 ] || \ |
| echo "Receive of snapshot $snap failed: ${retvals[1]}" |
| |
| if [ $i -gt 0 ]; then |
| # We don't need the previous snapshot anymore, so delete it. |
		# This keeps balance from getting too slow and triggers the
		# cleaner kthread to run and delete the snapshot tree in
		# parallel, while we are also running balance and send/receive,
		# adding additional test coverage and stress.
| $BTRFS_UTIL_PROG subvolume delete "$prev_snap" >> $seqres.full |
| fi |
| done |
| |
# We are done with send/receive, so signal the balance job to stop and verify
# the snapshot checksums while it terminates. The wait for it happens in
# _cleanup(), so that we do some useful work in the meanwhile.
| kill $balance_pid |
| |
| # Now verify that received snapshots have the expected checksums. |
| for i in "${!snapshots[@]}"; do |
| snap_csum="${fssum_dir}/${i}.fssum" |
| snap_copy="${dest_dir}/${snapshots[$i]}" |
| |
| echo "Verifying checksum for snapshot at: $snap_copy" >> $seqres.full |
| # On success, fssum outputs only a single line with "OK" to stdout, and |
| # on error it outputs several lines to stdout. Since the number of |
| # snapshots in the test depends on $LOAD_FACTOR, filter out the success |
| # case, so we don't have a mismatch with the golden output in case we |
	# run with a non-default $LOAD_FACTOR (default is 1). We only want the
| # mismatch with the golden output in case there's a checksum failure. |
| $FSSUM_PROG -r "$snap_csum" "$snap_copy" | grep -E -v '^OK$' |
| done |
| |
| echo "Silence is golden" |
| |
| # success, all done |
| status=0 |
| exit |