| From 78f84c182316db7b839b51657e97322914404a40 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Mon, 19 Apr 2021 21:48:09 -0700 |
| Subject: Drivers: hv: vmbus: Increase wait time for VMbus unload |
| |
| From: Michael Kelley <mikelley@microsoft.com> |
| |
| [ Upstream commit 77db0ec8b7764cb9b09b78066ebfd47b2c0c1909 ] |
| |
| When running in Azure, disks may be connected to a Linux VM with |
| read/write caching enabled. If a VM panics and issues a VMbus |
| UNLOAD request to Hyper-V, the response is delayed until all dirty |
| data in the disk cache is flushed. In extreme cases, this flushing |
can take tens of seconds, depending on the disk speed and the amount
| of dirty data. If kdump is configured for the VM, the current 10 second |
| timeout in vmbus_wait_for_unload() may be exceeded, and the UNLOAD |
| complete message may arrive well after the kdump kernel is already |
| running, causing problems. Note that no problem occurs if kdump is |
| not enabled because Hyper-V waits for the cache flush before doing |
| a reboot through the BIOS/UEFI code. |
| |
| Fix this problem by increasing the timeout in vmbus_wait_for_unload() |
| to 100 seconds. Also output periodic messages so that if anyone is |
| watching the serial console, they won't think the VM is completely |
| hung. |
| |
| Fixes: 911e1987efc8 ("Drivers: hv: vmbus: Add timeout to vmbus_wait_for_unload") |
| Signed-off-by: Michael Kelley <mikelley@microsoft.com> |
| Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com> |
| Link: https://lore.kernel.org/r/1618894089-126662-1-git-send-email-mikelley@microsoft.com |
| Signed-off-by: Wei Liu <wei.liu@kernel.org> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| drivers/hv/channel_mgmt.c | 30 +++++++++++++++++++++++++----- |
| 1 file changed, 25 insertions(+), 5 deletions(-) |
| |
| diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c |
| index 6be9f56cb627..6476bfe193af 100644 |
| --- a/drivers/hv/channel_mgmt.c |
| +++ b/drivers/hv/channel_mgmt.c |
| @@ -725,6 +725,12 @@ static void init_vp_index(struct vmbus_channel *channel) |
| free_cpumask_var(available_mask); |
| } |
| |
| +#define UNLOAD_DELAY_UNIT_MS 10 /* 10 milliseconds */ |
| +#define UNLOAD_WAIT_MS (100*1000) /* 100 seconds */ |
| +#define UNLOAD_WAIT_LOOPS (UNLOAD_WAIT_MS/UNLOAD_DELAY_UNIT_MS) |
| +#define UNLOAD_MSG_MS (5*1000) /* Every 5 seconds */ |
| +#define UNLOAD_MSG_LOOPS (UNLOAD_MSG_MS/UNLOAD_DELAY_UNIT_MS) |
| + |
| static void vmbus_wait_for_unload(void) |
| { |
| int cpu; |
| @@ -742,12 +748,17 @@ static void vmbus_wait_for_unload(void) |
| * vmbus_connection.unload_event. If not, the last thing we can do is |
| * read message pages for all CPUs directly. |
| * |
| - * Wait no more than 10 seconds so that the panic path can't get |
| - * hung forever in case the response message isn't seen. |
| + * Wait up to 100 seconds since an Azure host must writeback any dirty |
| + * data in its disk cache before the VMbus UNLOAD request will |
| + * complete. This flushing has been empirically observed to take up |
| + * to 50 seconds in cases with a lot of dirty data, so allow additional |
| + * leeway and for inaccuracies in mdelay(). But eventually time out so |
| + * that the panic path can't get hung forever in case the response |
| + * message isn't seen. |
| */ |
| - for (i = 0; i < 1000; i++) { |
| + for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) { |
| if (completion_done(&vmbus_connection.unload_event)) |
| - break; |
| + goto completed; |
| |
| for_each_online_cpu(cpu) { |
| struct hv_per_cpu_context *hv_cpu |
| @@ -770,9 +781,18 @@ static void vmbus_wait_for_unload(void) |
| vmbus_signal_eom(msg, message_type); |
| } |
| |
| - mdelay(10); |
| + /* |
| + * Give a notice periodically so someone watching the |
| + * serial output won't think it is completely hung. |
| + */ |
| + if (!(i % UNLOAD_MSG_LOOPS)) |
| + pr_notice("Waiting for VMBus UNLOAD to complete\n"); |
| + |
| + mdelay(UNLOAD_DELAY_UNIT_MS); |
| } |
| + pr_err("Continuing even though VMBus UNLOAD did not complete\n"); |
| |
| +completed: |
| /* |
| * We're crashing and already got the UNLOAD_RESPONSE, cleanup all |
| * maybe-pending messages on all CPUs to be able to receive new |
| -- |
| 2.30.2 |
| |
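For readers who want to see the bounded-poll pattern the patch adds without the
diff markers, here is a minimal, hypothetical userspace sketch of the same idea:
poll a condition every 10 ms, print a progress notice every 5 seconds, and give
up after 100 seconds. unload_completed() and wait_for_unload() are stand-in
names, and usleep()/printf()/fprintf() replace the driver's mdelay(),
pr_notice() and pr_err(); none of this is part of the actual patch.

/*
 * Userspace illustration (not kernel code) of the wait loop shape after
 * the patch: fixed-size delay units, a periodic notice, and a hard cap.
 */
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define UNLOAD_DELAY_UNIT_MS 10            /* 10 milliseconds per poll      */
#define UNLOAD_WAIT_MS       (100 * 1000)  /* give up after 100 seconds     */
#define UNLOAD_WAIT_LOOPS    (UNLOAD_WAIT_MS / UNLOAD_DELAY_UNIT_MS)
#define UNLOAD_MSG_MS        (5 * 1000)    /* progress notice every 5 secs  */
#define UNLOAD_MSG_LOOPS     (UNLOAD_MSG_MS / UNLOAD_DELAY_UNIT_MS)

static bool unload_completed(void)
{
    /* Placeholder for completion_done(&vmbus_connection.unload_event). */
    return false;
}

static void wait_for_unload(void)
{
    int i;

    for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) {
        if (unload_completed())
            return;                        /* "goto completed" in the driver */

        /* Periodic note so a watcher doesn't think the VM is hung. */
        if (!(i % UNLOAD_MSG_LOOPS))
            printf("Waiting for VMBus UNLOAD to complete\n");

        usleep(UNLOAD_DELAY_UNIT_MS * 1000);   /* mdelay() in the driver */
    }
    fprintf(stderr, "Continuing even though VMBus UNLOAD did not complete\n");
}

int main(void)
{
    wait_for_unload();
    return 0;
}

With unload_completed() always returning false, this sketch runs for the full
100 seconds and prints a notice every 5 seconds, mirroring the timeout path of
the patched vmbus_wait_for_unload().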