releases/4.4.81/sparc64-measure-receiver-forward-progress-to-avoid-send-mondo-timeout.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From foo@baz Tue Aug  8 16:56:08 PDT 2017
 From: Jane Chu <jane.chu@oracle.com>
 Date: Tue, 11 Jul 2017 12:00:54 -0600
 Subject: sparc64: Measure receiver forward progress to avoid send mondo timeout

 From: Jane Chu <jane.chu@oracle.com>


 [ Upstream commit 9d53caec84c7c5700e7c1ed744ea584fff55f9ac ]

 A large sun4v SPARC system may have moments of intensive xcall activities,
 usually caused by unmapping many pages on many CPUs concurrently. This can
 flood receivers with CPU mondo interrupts for an extended period, causing
 some unlucky senders to hit send-mondo timeout. This problem gets worse
 as cpu count increases because sometimes mappings must be invalidated on
 all CPUs, and sometimes all CPUs may gang up on a single CPU.

 But a busy system is not a broken system. In the above scenario, as long
 as the receiver is making forward progress processing mondo interrupts,
 the sender should continue to retry.

 This patch implements the receiver's forward progress meter by introducing
 a per cpu counter 'cpu_mondo_counter[cpu]' where 'cpu' is in the range
 of 0..NR_CPUS-1. The receiver increments its counter as soon as it receives
 a mondo and the sender tracks the receiver's counter. If the receiver has
 stopped making forward progress when the retry limit is reached, the sender
 declares send-mondo-timeout and panics; otherwise, the receiver is allowed
 to keep making forward progress.

 In addition, it's been observed that PCIe hotplug events generate Correctable
 Errors that are handled by hypervisor and then OS. Hypervisor 'borrows'
 a guest cpu strand briefly to provide the service. If the cpu strand is
 simultaneously the only cpu targeted by a mondo, it may not be available
 for the mondo in 20msec, causing SUN4V mondo timeout. It appears that 1 second
 is the agreed wait time between hypervisor and guest OS, so this patch makes
 the adjustment.

 Orabug: 25476541
 Orabug: 26417466

 Signed-off-by: Jane Chu <jane.chu@oracle.com>
 Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
 Reviewed-by: Anthony Yznaga <anthony.yznaga@oracle.com>
 Reviewed-by: Rob Gardner <rob.gardner@oracle.com>
 Reviewed-by: Thomas Tai <thomas.tai@oracle.com>
 Signed-off-by: David S. Miller <davem@davemloft.net>
 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 ---
  arch/sparc/include/asm/trap_block.h |    1
  arch/sparc/kernel/smp_64.c          |  189 ++++++++++++++++++++++--------------
  arch/sparc/kernel/sun4v_ivec.S      |   15 ++
  arch/sparc/kernel/traps_64.c        |    1
  4 files changed, 134 insertions(+), 72 deletions(-)

 --- a/arch/sparc/include/asm/trap_block.h
 +++ b/arch/sparc/include/asm/trap_block.h
 @@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR
  void init_cur_cpu_trap(struct thread_info *);
  void setup_tba(void);
  extern int ncpus_probed;
 +extern u64 cpu_mondo_counter[NR_CPUS];

  unsigned long real_hard_smp_processor_id(void);

 --- a/arch/sparc/kernel/smp_64.c
 +++ b/arch/sparc/kernel/smp_64.c
 @@ -617,22 +617,48 @@ retry:
  	}
  }

 -/* Multi-cpu list version.  */
 +#define	CPU_MONDO_COUNTER(cpuid)	(cpu_mondo_counter[cpuid])
 +#define	MONDO_USEC_WAIT_MIN		2
 +#define	MONDO_USEC_WAIT_MAX		100
 +#define	MONDO_RETRY_LIMIT		500000
 +
 +/* Multi-cpu list version.
 + *
 + * Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
 + * Sometimes not all cpus receive the mondo, requiring us to re-send
 + * the mondo until all cpus have received, or cpus are truly stuck
 + * unable to receive mondo, and we timeout.
 + * Occasionally a target cpu strand is borrowed briefly by hypervisor to
 + * perform guest service, such as PCIe error handling. Consider the
 + * service time, 1 second overall wait is reasonable for 1 cpu.
 + * Here two in-between mondo check wait time are defined: 2 usec for
 + * single cpu quick turn around and up to 100usec for large cpu count.
 + * Deliver mondo to large number of cpus could take longer, we adjusts
 + * the retry count as long as target cpus are making forward progress.
 + */
  static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
  {
 -	int retries, this_cpu, prev_sent, i, saw_cpu_error;
 +	int this_cpu, tot_cpus, prev_sent, i, rem;
 +	int usec_wait, retries, tot_retries;
 +	u16 first_cpu = 0xffff;
 +	unsigned long xc_rcvd = 0;
  	unsigned long status;
 +	int ecpuerror_id = 0;
 +	int enocpu_id = 0;
  	u16 *cpu_list;
 +	u16 cpu;

  	this_cpu = smp_processor_id();
 -
  	cpu_list = __va(tb->cpu_list_pa);
 -
 -	saw_cpu_error = 0;
 -	retries = 0;
 +	usec_wait = cnt * MONDO_USEC_WAIT_MIN;
 +	if (usec_wait > MONDO_USEC_WAIT_MAX)
 +		usec_wait = MONDO_USEC_WAIT_MAX;
 +	retries = tot_retries = 0;
 +	tot_cpus = cnt;
  	prev_sent = 0;
 +
  	do {
 -		int forward_progress, n_sent;
 +		int n_sent, mondo_delivered, target_cpu_busy;

  		status = sun4v_cpu_mondo_send(cnt,
  					      tb->cpu_list_pa,
 @@ -640,94 +666,113 @@ static void hypervisor_xcall_deliver(str

  		/* HV_EOK means all cpus received the xcall, we're done.  */
  		if (likely(status == HV_EOK))
 -			break;
 +			goto xcall_done;
 +
 +		/* If not these non-fatal errors, panic */
 +		if (unlikely((status != HV_EWOULDBLOCK) &&
 +			(status != HV_ECPUERROR) &&
 +			(status != HV_ENOCPU)))
 +			goto fatal_errors;

  		/* First, see if we made any forward progress.
  		 *
 +		 * Go through the cpu_list, count the target cpus that have
 +		 * received our mondo (n_sent), and those that did not (rem).
 +		 * Re-pack cpu_list with the cpus remain to be retried in the
 +		 * front - this simplifies tracking the truly stalled cpus.
 +		 *
  		 * The hypervisor indicates successful sends by setting
  		 * cpu list entries to the value 0xffff.
 +		 *
 +		 * EWOULDBLOCK means some target cpus did not receive the
 +		 * mondo and retry usually helps.
 +		 *
 +		 * ECPUERROR means at least one target cpu is in error state,
 +		 * it's usually safe to skip the faulty cpu and retry.
 +		 *
 +		 * ENOCPU means one of the target cpu doesn't belong to the
 +		 * domain, perhaps offlined which is unexpected, but not
 +		 * fatal and it's okay to skip the offlined cpu.
  		 */
 +		rem = 0;
  		n_sent = 0;
  		for (i = 0; i < cnt; i++) {
 -			if (likely(cpu_list[i] == 0xffff))
 +			cpu = cpu_list[i];
 +			if (likely(cpu == 0xffff)) {
  				n_sent++;
 +			} else if ((status == HV_ECPUERROR) &&
 +				(sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
 +				ecpuerror_id = cpu + 1;
 +			} else if (status == HV_ENOCPU && !cpu_online(cpu)) {
 +				enocpu_id = cpu + 1;
 +			} else {
 +				cpu_list[rem++] = cpu;
 +			}
  		}

 -		forward_progress = 0;
 -		if (n_sent > prev_sent)
 -			forward_progress = 1;
 +		/* No cpu remained, we're done. */
 +		if (rem == 0)
 +			break;

 -		prev_sent = n_sent;
 +		/* Otherwise, update the cpu count for retry. */
 +		cnt = rem;

 -		/* If we get a HV_ECPUERROR, then one or more of the cpus
 -		 * in the list are in error state.  Use the cpu_state()
 -		 * hypervisor call to find out which cpus are in error state.
 +		/* Record the overall number of mondos received by the
 +		 * first of the remaining cpus.
  		 */
 -		if (unlikely(status == HV_ECPUERROR)) {
 -			for (i = 0; i < cnt; i++) {
 -				long err;
 -				u16 cpu;
 -
 -				cpu = cpu_list[i];
 -				if (cpu == 0xffff)
 -					continue;
 -
 -				err = sun4v_cpu_state(cpu);
 -				if (err == HV_CPU_STATE_ERROR) {
 -					saw_cpu_error = (cpu + 1);
 -					cpu_list[i] = 0xffff;
 -				}
 -			}
 -		} else if (unlikely(status != HV_EWOULDBLOCK))
 -			goto fatal_mondo_error;
 +		if (first_cpu != cpu_list[0]) {
 +			first_cpu = cpu_list[0];
 +			xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
 +		}

 -		/* Don't bother rewriting the CPU list, just leave the
 -		 * 0xffff and non-0xffff entries in there and the
 -		 * hypervisor will do the right thing.
 -		 *
 -		 * Only advance timeout state if we didn't make any
 -		 * forward progress.
 +		/* Was any mondo delivered successfully? */
 +		mondo_delivered = (n_sent > prev_sent);
 +		prev_sent = n_sent;
 +
 +		/* or, was any target cpu busy processing other mondos? */
 +		target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
 +		xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
 +
 +		/* Retry count is for no progress. If we're making progress,
 +		 * reset the retry count.
  		 */
 -		if (unlikely(!forward_progress)) {
 -			if (unlikely(++retries > 10000))
 -				goto fatal_mondo_timeout;
 -
 -			/* Delay a little bit to let other cpus catch up
 -			 * on their cpu mondo queue work.
 -			 */
 -			udelay(2 * cnt);
 +		if (likely(mondo_delivered || target_cpu_busy)) {
 +			tot_retries += retries;
 +			retries = 0;
 +		} else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
 +			goto fatal_mondo_timeout;
  		}
 -	} while (1);

 -	if (unlikely(saw_cpu_error))
 -		goto fatal_mondo_cpu_error;
 +		/* Delay a little bit to let other cpus catch up on
 +		 * their cpu mondo queue work.
 +		 */
 +		if (!mondo_delivered)
 +			udelay(usec_wait);

 -	return;
 +		retries++;
 +	} while (1);

 -fatal_mondo_cpu_error:
 -	printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
 -	       "(including %d) were in error state\n",
 -	       this_cpu, saw_cpu_error - 1);
 +xcall_done:
 +	if (unlikely(ecpuerror_id > 0)) {
 +		pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
 +		       this_cpu, ecpuerror_id - 1);
 +	} else if (unlikely(enocpu_id > 0)) {
 +		pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
 +		       this_cpu, enocpu_id - 1);
 +	}
  	return;

 +fatal_errors:
 +	/* fatal errors include bad alignment, etc */
 +	pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
 +	       this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
 +	panic("Unexpected SUN4V mondo error %lu\n", status);
 +
  fatal_mondo_timeout:
 -	printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
 -	       " progress after %d retries.\n",
 -	       this_cpu, retries);
 -	goto dump_cpu_list_and_out;
 -
 -fatal_mondo_error:
 -	printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
 -	       this_cpu, status);
 -	printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
 -	       "mondo_block_pa(%lx)\n",
 -	       this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
 -
 -dump_cpu_list_and_out:
 -	printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
 -	for (i = 0; i < cnt; i++)
 -		printk("%u ", cpu_list[i]);
 -	printk("]\n");
 +	/* some cpus being non-responsive to the cpu mondo */
 +	pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
 +	       this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
 +	panic("SUN4V mondo timeout panic\n");
  }

  static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
 --- a/arch/sparc/kernel/sun4v_ivec.S
 +++ b/arch/sparc/kernel/sun4v_ivec.S
 @@ -26,6 +26,21 @@ sun4v_cpu_mondo:
  	ldxa	[%g0] ASI_SCRATCHPAD, %g4
  	sub	%g4, TRAP_PER_CPU_FAULT_INFO, %g4

 +	/* Get smp_processor_id() into %g3 */
 +	sethi	%hi(trap_block), %g5
 +	or	%g5, %lo(trap_block), %g5
 +	sub	%g4, %g5, %g3
 +	srlx	%g3, TRAP_BLOCK_SZ_SHIFT, %g3
 +
 +	/* Increment cpu_mondo_counter[smp_processor_id()] */
 +	sethi	%hi(cpu_mondo_counter), %g5
 +	or	%g5, %lo(cpu_mondo_counter), %g5
 +	sllx	%g3, 3, %g3
 +	add	%g5, %g3, %g5
 +	ldx	[%g5], %g3
 +	add	%g3, 1, %g3
 +	stx	%g3, [%g5]
 +
  	/* Get CPU mondo queue base phys address into %g7.  */
  	ldx	[%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7

 --- a/arch/sparc/kernel/traps_64.c
 +++ b/arch/sparc/kernel/traps_64.c
 @@ -2659,6 +2659,7 @@ void do_getpsr(struct pt_regs *regs)
  	}
  }

 +u64 cpu_mondo_counter[NR_CPUS] = {0};
  struct trap_per_cpu trap_block[NR_CPUS];
  EXPORT_SYMBOL(trap_block);
	From foo@baz Tue Aug 8 16:56:08 PDT 2017
	From: Jane Chu <jane.chu@oracle.com>
	Date: Tue, 11 Jul 2017 12:00:54 -0600
	Subject: sparc64: Measure receiver forward progress to avoid send mondo timeout

	From: Jane Chu <jane.chu@oracle.com>


	[ Upstream commit 9d53caec84c7c5700e7c1ed744ea584fff55f9ac ]

	A large sun4v SPARC system may have moments of intensive xcall activities,
	usually caused by unmapping many pages on many CPUs concurrently. This can
	flood receivers with CPU mondo interrupts for an extended period, causing
	some unlucky senders to hit send-mondo timeout. This problem gets worse
	as cpu count increases because sometimes mappings must be invalidated on
	all CPUs, and sometimes all CPUs may gang up on a single CPU.

	But a busy system is not a broken system. In the above scenario, as long
	as the receiver is making forward progress processing mondo interrupts,
	the sender should continue to retry.

	This patch implements the receiver's forward progress meter by introducing
	a per cpu counter 'cpu_mondo_counter[cpu]' where 'cpu' is in the range
	of 0..NR_CPUS-1. The receiver increments its counter as soon as it receives
	a mondo and the sender tracks the receiver's counter. If the receiver has
	stopped making forward progress when the retry limit is reached, the sender
	declares send-mondo-timeout and panics; otherwise, the receiver is allowed
	to keep making forward progress.

	In addition, it's been observed that PCIe hotplug events generate Correctable
	Errors that are handled by hypervisor and then OS. Hypervisor 'borrows'
	a guest cpu strand briefly to provide the service. If the cpu strand is
	simultaneously the only cpu targeted by a mondo, it may not be available
	for the mondo in 20msec, causing SUN4V mondo timeout. It appears that 1 second
	is the agreed wait time between hypervisor and guest OS, so this patch makes
	the adjustment.

	Orabug: 25476541
	Orabug: 26417466

	Signed-off-by: Jane Chu <jane.chu@oracle.com>
	Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
	Reviewed-by: Anthony Yznaga <anthony.yznaga@oracle.com>
	Reviewed-by: Rob Gardner <rob.gardner@oracle.com>
	Reviewed-by: Thomas Tai <thomas.tai@oracle.com>
	Signed-off-by: David S. Miller <davem@davemloft.net>
	Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	---
	arch/sparc/include/asm/trap_block.h | 1
	arch/sparc/kernel/smp_64.c | 189 ++++++++++++++++++++++--------------
	arch/sparc/kernel/sun4v_ivec.S | 15 ++
	arch/sparc/kernel/traps_64.c | 1
	4 files changed, 134 insertions(+), 72 deletions(-)

	--- a/arch/sparc/include/asm/trap_block.h
	+++ b/arch/sparc/include/asm/trap_block.h
	@@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR
	void init_cur_cpu_trap(struct thread_info *);
	void setup_tba(void);
	extern int ncpus_probed;
	+extern u64 cpu_mondo_counter[NR_CPUS];

	unsigned long real_hard_smp_processor_id(void);

	--- a/arch/sparc/kernel/smp_64.c
	+++ b/arch/sparc/kernel/smp_64.c
	@@ -617,22 +617,48 @@ retry:
	}
	}

	-/* Multi-cpu list version. */
	+#define CPU_MONDO_COUNTER(cpuid) (cpu_mondo_counter[cpuid])
	+#define MONDO_USEC_WAIT_MIN 2
	+#define MONDO_USEC_WAIT_MAX 100
	+#define MONDO_RETRY_LIMIT 500000
	+
	+/* Multi-cpu list version.
	+ *
	+ * Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
	+ * Sometimes not all cpus receive the mondo, requiring us to re-send
	+ * the mondo until all cpus have received, or cpus are truly stuck
	+ * unable to receive mondo, and we timeout.
	+ * Occasionally a target cpu strand is borrowed briefly by hypervisor to
	+ * perform guest service, such as PCIe error handling. Consider the
	+ * service time, 1 second overall wait is reasonable for 1 cpu.
	+ * Here two in-between mondo check wait time are defined: 2 usec for
	+ * single cpu quick turn around and up to 100usec for large cpu count.
	+ * Deliver mondo to large number of cpus could take longer, we adjusts
	+ * the retry count as long as target cpus are making forward progress.
	+ */
	static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
	{
	- int retries, this_cpu, prev_sent, i, saw_cpu_error;
	+ int this_cpu, tot_cpus, prev_sent, i, rem;
	+ int usec_wait, retries, tot_retries;
	+ u16 first_cpu = 0xffff;
	+ unsigned long xc_rcvd = 0;
	unsigned long status;
	+ int ecpuerror_id = 0;
	+ int enocpu_id = 0;
	u16 *cpu_list;
	+ u16 cpu;

	this_cpu = smp_processor_id();
	-
	cpu_list = __va(tb->cpu_list_pa);
	-
	- saw_cpu_error = 0;
	- retries = 0;
	+ usec_wait = cnt * MONDO_USEC_WAIT_MIN;
	+ if (usec_wait > MONDO_USEC_WAIT_MAX)
	+ usec_wait = MONDO_USEC_WAIT_MAX;
	+ retries = tot_retries = 0;
	+ tot_cpus = cnt;
	prev_sent = 0;
	+
	do {
	- int forward_progress, n_sent;
	+ int n_sent, mondo_delivered, target_cpu_busy;

	status = sun4v_cpu_mondo_send(cnt,
	tb->cpu_list_pa,
	@@ -640,94 +666,113 @@ static void hypervisor_xcall_deliver(str

	/* HV_EOK means all cpus received the xcall, we're done. */
	if (likely(status == HV_EOK))
	- break;
	+ goto xcall_done;
	+
	+ /* If not these non-fatal errors, panic */
	+ if (unlikely((status != HV_EWOULDBLOCK) &&
	+ (status != HV_ECPUERROR) &&
	+ (status != HV_ENOCPU)))
	+ goto fatal_errors;

	/* First, see if we made any forward progress.
	*
	+ * Go through the cpu_list, count the target cpus that have
	+ * received our mondo (n_sent), and those that did not (rem).
	+ * Re-pack cpu_list with the cpus remain to be retried in the
	+ * front - this simplifies tracking the truly stalled cpus.
	+ *
	* The hypervisor indicates successful sends by setting
	* cpu list entries to the value 0xffff.
	+ *
	+ * EWOULDBLOCK means some target cpus did not receive the
	+ * mondo and retry usually helps.
	+ *
	+ * ECPUERROR means at least one target cpu is in error state,
	+ * it's usually safe to skip the faulty cpu and retry.
	+ *
	+ * ENOCPU means one of the target cpu doesn't belong to the
	+ * domain, perhaps offlined which is unexpected, but not
	+ * fatal and it's okay to skip the offlined cpu.
	*/
	+ rem = 0;
	n_sent = 0;
	for (i = 0; i < cnt; i++) {
	- if (likely(cpu_list[i] == 0xffff))
	+ cpu = cpu_list[i];
	+ if (likely(cpu == 0xffff)) {
	n_sent++;
	+ } else if ((status == HV_ECPUERROR) &&
	+ (sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
	+ ecpuerror_id = cpu + 1;
	+ } else if (status == HV_ENOCPU && !cpu_online(cpu)) {
	+ enocpu_id = cpu + 1;
	+ } else {
	+ cpu_list[rem++] = cpu;
	+ }
	}

	- forward_progress = 0;
	- if (n_sent > prev_sent)
	- forward_progress = 1;
	+ /* No cpu remained, we're done. */
	+ if (rem == 0)
	+ break;

	- prev_sent = n_sent;
	+ /* Otherwise, update the cpu count for retry. */
	+ cnt = rem;

	- /* If we get a HV_ECPUERROR, then one or more of the cpus
	- * in the list are in error state. Use the cpu_state()
	- * hypervisor call to find out which cpus are in error state.
	+ /* Record the overall number of mondos received by the
	+ * first of the remaining cpus.
	*/
	- if (unlikely(status == HV_ECPUERROR)) {
	- for (i = 0; i < cnt; i++) {
	- long err;
	- u16 cpu;
	-
	- cpu = cpu_list[i];
	- if (cpu == 0xffff)
	- continue;
	-
	- err = sun4v_cpu_state(cpu);
	- if (err == HV_CPU_STATE_ERROR) {
	- saw_cpu_error = (cpu + 1);
	- cpu_list[i] = 0xffff;
	- }
	- }
	- } else if (unlikely(status != HV_EWOULDBLOCK))
	- goto fatal_mondo_error;
	+ if (first_cpu != cpu_list[0]) {
	+ first_cpu = cpu_list[0];
	+ xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
	+ }

	- /* Don't bother rewriting the CPU list, just leave the
	- * 0xffff and non-0xffff entries in there and the
	- * hypervisor will do the right thing.
	- *
	- * Only advance timeout state if we didn't make any
	- * forward progress.
	+ /* Was any mondo delivered successfully? */
	+ mondo_delivered = (n_sent > prev_sent);
	+ prev_sent = n_sent;
	+
	+ /* or, was any target cpu busy processing other mondos? */
	+ target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
	+ xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
	+
	+ /* Retry count is for no progress. If we're making progress,
	+ * reset the retry count.
	*/
	- if (unlikely(!forward_progress)) {
	- if (unlikely(++retries > 10000))
	- goto fatal_mondo_timeout;
	-
	- /* Delay a little bit to let other cpus catch up
	- * on their cpu mondo queue work.
	- */
	- udelay(2 * cnt);
	+ if (likely(mondo_delivered || target_cpu_busy)) {
	+ tot_retries += retries;
	+ retries = 0;
	+ } else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
	+ goto fatal_mondo_timeout;
	}
	- } while (1);

	- if (unlikely(saw_cpu_error))
	- goto fatal_mondo_cpu_error;
	+ /* Delay a little bit to let other cpus catch up on
	+ * their cpu mondo queue work.
	+ */
	+ if (!mondo_delivered)
	+ udelay(usec_wait);

	- return;
	+ retries++;
	+ } while (1);

	-fatal_mondo_cpu_error:
	- printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
	- "(including %d) were in error state\n",
	- this_cpu, saw_cpu_error - 1);
	+xcall_done:
	+ if (unlikely(ecpuerror_id > 0)) {
	+ pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
	+ this_cpu, ecpuerror_id - 1);
	+ } else if (unlikely(enocpu_id > 0)) {
	+ pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
	+ this_cpu, enocpu_id - 1);
	+ }
	return;

	+fatal_errors:
	+ /* fatal errors include bad alignment, etc */
	+ pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
	+ this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
	+ panic("Unexpected SUN4V mondo error %lu\n", status);
	+
	fatal_mondo_timeout:
	- printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
	- " progress after %d retries.\n",
	- this_cpu, retries);
	- goto dump_cpu_list_and_out;
	-
	-fatal_mondo_error:
	- printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
	- this_cpu, status);
	- printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
	- "mondo_block_pa(%lx)\n",
	- this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
	-
	-dump_cpu_list_and_out:
	- printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
	- for (i = 0; i < cnt; i++)
	- printk("%u ", cpu_list[i]);
	- printk("]\n");
	+ /* some cpus being non-responsive to the cpu mondo */
	+ pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
	+ this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
	+ panic("SUN4V mondo timeout panic\n");
	}

	static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
	--- a/arch/sparc/kernel/sun4v_ivec.S
	+++ b/arch/sparc/kernel/sun4v_ivec.S
	@@ -26,6 +26,21 @@ sun4v_cpu_mondo:
	ldxa [%g0] ASI_SCRATCHPAD, %g4
	sub %g4, TRAP_PER_CPU_FAULT_INFO, %g4

	+ /* Get smp_processor_id() into %g3 */
	+ sethi %hi(trap_block), %g5
	+ or %g5, %lo(trap_block), %g5
	+ sub %g4, %g5, %g3
	+ srlx %g3, TRAP_BLOCK_SZ_SHIFT, %g3
	+
	+ /* Increment cpu_mondo_counter[smp_processor_id()] */
	+ sethi %hi(cpu_mondo_counter), %g5
	+ or %g5, %lo(cpu_mondo_counter), %g5
	+ sllx %g3, 3, %g3
	+ add %g5, %g3, %g5
	+ ldx [%g5], %g3
	+ add %g3, 1, %g3
	+ stx %g3, [%g5]
	+
	/* Get CPU mondo queue base phys address into %g7. */
	ldx [%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7

	--- a/arch/sparc/kernel/traps_64.c
	+++ b/arch/sparc/kernel/traps_64.c
	@@ -2659,6 +2659,7 @@ void do_getpsr(struct pt_regs *regs)
	}
	}

	+u64 cpu_mondo_counter[NR_CPUS] = {0};
	struct trap_per_cpu trap_block[NR_CPUS];
	EXPORT_SYMBOL(trap_block);