releases/2.6.32.12/x86-64-rwsem-avoid-store-forwarding-hazard-in-__downgrade_write.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From 0d1622d7f526311d87d7da2ee7dd14b73e45d3fc Mon Sep 17 00:00:00 2001
 From: Avi Kivity <avi@redhat.com>
 Date: Sat, 13 Feb 2010 10:33:12 +0200
 Subject: x86-64, rwsem: Avoid store forwarding hazard in __downgrade_write

 From: Avi Kivity <avi@redhat.com>

 commit 0d1622d7f526311d87d7da2ee7dd14b73e45d3fc upstream.

 The Intel Architecture Optimization Reference Manual states that a short
 load that follows a long store to the same object will suffer a store
 forwading penalty, particularly if the two accesses use different addresses.
 Trivially, a long load that follows a short store will also suffer a penalty.

 __downgrade_write() in rwsem incurs both penalties:  the increment operation
 will not be able to reuse a recently-loaded rwsem value, and its result will
 not be reused by any recently-following rwsem operation.

 A comment in the code states that this is because 64-bit immediates are
 special and expensive; but while they are slightly special (only a single
 instruction allows them), they aren't expensive: a test shows that two loops,
 one loading a 32-bit immediate and one loading a 64-bit immediate, both take
 1.5 cycles per iteration.

 Fix this by changing __downgrade_write to use the same add instruction on
 i386 and on x86_64, so that it uses the same operand size as all the other
 rwsem functions.

 Signed-off-by: Avi Kivity <avi@redhat.com>
 LKML-Reference: <1266049992-17419-1-git-send-email-avi@redhat.com>
 Signed-off-by: H. Peter Anvin <hpa@zytor.com>
 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

 ---
  arch/x86/include/asm/rwsem.h |   25 +++++--------------------
  1 file changed, 5 insertions(+), 20 deletions(-)

 --- a/arch/x86/include/asm/rwsem.h
 +++ b/arch/x86/include/asm/rwsem.h
 @@ -232,34 +232,19 @@ static inline void __up_write(struct rw_
   */
  static inline void __downgrade_write(struct rw_semaphore *sem)
  {
 -#ifdef CONFIG_X86_64
 -# if RWSEM_WAITING_BIAS != -0x100000000
 -#  error "This code assumes RWSEM_WAITING_BIAS == -2^32"
 -# endif
 -
 -	/* 64-bit immediates are special and expensive, and not needed here */
 -	asm volatile("# beginning __downgrade_write\n\t"
 -		     LOCK_PREFIX "incl 4(%1)\n\t"
 -		     /* transitions 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 */
 -		     "  jns       1f\n\t"
 -		     "  call call_rwsem_downgrade_wake\n"
 -		     "1:\n\t"
 -		     "# ending __downgrade_write\n"
 -		     : "+m" (sem->count)
 -		     : "a" (sem)
 -		     : "memory", "cc");
 -#else
  	asm volatile("# beginning __downgrade_write\n\t"
  		     LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t"
 -		     /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
 +		     /*
 +		      * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
 +		      *     0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
 +		      */
  		     "  jns       1f\n\t"
  		     "  call call_rwsem_downgrade_wake\n"
  		     "1:\n\t"
  		     "# ending __downgrade_write\n"
  		     : "+m" (sem->count)
 -		     : "a" (sem), "i" (-RWSEM_WAITING_BIAS)
 +		     : "a" (sem), "er" (-RWSEM_WAITING_BIAS)
  		     : "memory", "cc");
 -#endif
  }

  /*
	From 0d1622d7f526311d87d7da2ee7dd14b73e45d3fc Mon Sep 17 00:00:00 2001
	From: Avi Kivity <avi@redhat.com>
	Date: Sat, 13 Feb 2010 10:33:12 +0200
	Subject: x86-64, rwsem: Avoid store forwarding hazard in __downgrade_write

	From: Avi Kivity <avi@redhat.com>

	commit 0d1622d7f526311d87d7da2ee7dd14b73e45d3fc upstream.

	The Intel Architecture Optimization Reference Manual states that a short
	load that follows a long store to the same object will suffer a store
	forwading penalty, particularly if the two accesses use different addresses.
	Trivially, a long load that follows a short store will also suffer a penalty.

	__downgrade_write() in rwsem incurs both penalties: the increment operation
	will not be able to reuse a recently-loaded rwsem value, and its result will
	not be reused by any recently-following rwsem operation.

	A comment in the code states that this is because 64-bit immediates are
	special and expensive; but while they are slightly special (only a single
	instruction allows them), they aren't expensive: a test shows that two loops,
	one loading a 32-bit immediate and one loading a 64-bit immediate, both take
	1.5 cycles per iteration.

	Fix this by changing __downgrade_write to use the same add instruction on
	i386 and on x86_64, so that it uses the same operand size as all the other
	rwsem functions.

	Signed-off-by: Avi Kivity <avi@redhat.com>
	LKML-Reference: <1266049992-17419-1-git-send-email-avi@redhat.com>
	Signed-off-by: H. Peter Anvin <hpa@zytor.com>
	Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

	---
	arch/x86/include/asm/rwsem.h \| 25 +++++--------------------
	1 file changed, 5 insertions(+), 20 deletions(-)

	--- a/arch/x86/include/asm/rwsem.h
	+++ b/arch/x86/include/asm/rwsem.h
	@@ -232,34 +232,19 @@ static inline void __up_write(struct rw_
	*/
	static inline void __downgrade_write(struct rw_semaphore *sem)
	{
	-#ifdef CONFIG_X86_64
	-# if RWSEM_WAITING_BIAS != -0x100000000
	-# error "This code assumes RWSEM_WAITING_BIAS == -2^32"
	-# endif
	-
	- /* 64-bit immediates are special and expensive, and not needed here */
	- asm volatile("# beginning __downgrade_write\n\t"
	- LOCK_PREFIX "incl 4(%1)\n\t"
	- /* transitions 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 */
	- " jns 1f\n\t"
	- " call call_rwsem_downgrade_wake\n"
	- "1:\n\t"
	- "# ending __downgrade_write\n"
	- : "+m" (sem->count)
	- : "a" (sem)
	- : "memory", "cc");
	-#else
	asm volatile("# beginning __downgrade_write\n\t"
	LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t"
	- /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
	+ /*
	+ * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
	+ * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
	+ */
	" jns 1f\n\t"
	" call call_rwsem_downgrade_wake\n"
	"1:\n\t"
	"# ending __downgrade_write\n"
	: "+m" (sem->count)
	- : "a" (sem), "i" (-RWSEM_WAITING_BIAS)
	+ : "a" (sem), "er" (-RWSEM_WAITING_BIAS)
	: "memory", "cc");
	-#endif
	}

	/*