releases/2.6.32.23/x86-add-memory-modify-constraints-to-xchg-and-cmpxchg.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 Mon Sep 17 00:00:00 2001
 From: H. Peter Anvin <hpa@zytor.com>
 Date: Tue, 27 Jul 2010 17:01:49 -0700
 Subject: x86: Add memory modify constraints to xchg() and cmpxchg()

 From: H. Peter Anvin <hpa@zytor.com>

 commit 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 upstream.

 [ Backport to .32 by Tomáš Janoušek <tomi@nomi.cz> ]

 xchg() and cmpxchg() modify their memory operands, not merely read
 them.  For some versions of gcc the "memory" clobber has apparently
 dealt with the situation, but not for all.

 Originally-by: Linus Torvalds <torvalds@linux-foundation.org>
 Signed-off-by: H. Peter Anvin <hpa@zytor.com>
 Cc: Glauber Costa <glommer@redhat.com>
 Cc: Avi Kivity <avi@redhat.com>
 Cc: Peter Palfrader <peter@palfrader.org>
 Cc: Greg KH <gregkh@suse.de>
 Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
 Cc: Zachary Amsden <zamsden@redhat.com>
 Cc: Marcelo Tosatti <mtosatti@redhat.com>
 LKML-Reference: <4C4F7277.8050306@zytor.com>
 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

 ---
  arch/x86/include/asm/cmpxchg_32.h |   65 +++++++++++---------------------------
  arch/x86/include/asm/cmpxchg_64.h |    4 --
  2 files changed, 20 insertions(+), 49 deletions(-)

 --- a/arch/x86/include/asm/cmpxchg_32.h
 +++ b/arch/x86/include/asm/cmpxchg_32.h
 @@ -17,60 +17,33 @@ struct __xchg_dummy {
  #define __xg(x) ((struct __xchg_dummy *)(x))

  /*
 - * The semantics of XCHGCMP8B are a bit strange, this is why
 - * there is a loop and the loading of %%eax and %%edx has to
 - * be inside. This inlines well in most cases, the cached
 - * cost is around ~38 cycles. (in the future we might want
 - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
 - * might have an implicit FPU-save as a cost, so it's not
 - * clear which path to go.)
 + * CMPXCHG8B only writes to the target if we had the previous
 + * value in registers, otherwise it acts as a read and gives us the
 + * "new previous" value.  That is why there is a loop.  Preloading
 + * EDX:EAX is a performance optimization: in the common case it means
 + * we need only one locked operation.
   *
 - * cmpxchg8b must be used with the lock prefix here to allow
 - * the instruction to be executed atomically, see page 3-102
 - * of the instruction set reference 24319102.pdf. We need
 - * the reader side to see the coherent 64bit value.
 + * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
 + * least an FPU save and/or %cr0.ts manipulation.
 + *
 + * cmpxchg8b must be used with the lock prefix here to allow the
 + * instruction to be executed atomically.  We need to have the reader
 + * side to see the coherent 64bit value.
   */
 -static inline void __set_64bit(unsigned long long *ptr,
 -			       unsigned int low, unsigned int high)
 +static inline void set_64bit(volatile u64 *ptr, u64 value)
  {
 +	u32 low  = value;
 +	u32 high = value >> 32;
 +	u64 prev = *ptr;
 +
  	asm volatile("\n1:\t"
 -		     "movl (%1), %%eax\n\t"
 -		     "movl 4(%1), %%edx\n\t"
  		     LOCK_PREFIX "cmpxchg8b %0\n\t"
  		     "jnz 1b"
 -		     : "=m"(*ptr)
 -		     : "D" (ptr),
 -		       "b"(low),
 -		       "c"(high)
 -		     : "ax", "dx", "memory");
 -}
 -
 -static inline void __set_64bit_constant(unsigned long long *ptr,
 -					unsigned long long value)
 -{
 -	__set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
 +		     : "=m" (*ptr), "+A" (prev)
 +		     : "b" (low), "c" (high)
 +		     : "memory");
  }

 -#define ll_low(x)	*(((unsigned int *)&(x)) + 0)
 -#define ll_high(x)	*(((unsigned int *)&(x)) + 1)
 -
 -static inline void __set_64bit_var(unsigned long long *ptr,
 -				   unsigned long long value)
 -{
 -	__set_64bit(ptr, ll_low(value), ll_high(value));
 -}
 -
 -#define set_64bit(ptr, value)			\
 -	(__builtin_constant_p((value))		\
 -	 ? __set_64bit_constant((ptr), (value))	\
 -	 : __set_64bit_var((ptr), (value)))
 -
 -#define _set_64bit(ptr, value)						\
 -	(__builtin_constant_p(value)					\
 -	 ? __set_64bit(ptr, (unsigned int)(value),			\
 -		       (unsigned int)((value) >> 32))			\
 -	 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
 -
  /*
   * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
   * Note 2: xchg has side effect, so that attribute volatile is necessary,
 --- a/arch/x86/include/asm/cmpxchg_64.h
 +++ b/arch/x86/include/asm/cmpxchg_64.h
 @@ -8,13 +8,11 @@

  #define __xg(x) ((volatile long *)(x))

 -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
 +static inline void set_64bit(volatile u64 *ptr, u64 val)
  {
  	*ptr = val;
  }

 -#define _set_64bit set_64bit
 -
  /*
   * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
   * Note 2: xchg has side effect, so that attribute volatile is necessary,
	From 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 Mon Sep 17 00:00:00 2001
	From: H. Peter Anvin <hpa@zytor.com>
	Date: Tue, 27 Jul 2010 17:01:49 -0700
	Subject: x86: Add memory modify constraints to xchg() and cmpxchg()

	From: H. Peter Anvin <hpa@zytor.com>

	commit 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 upstream.

	[ Backport to .32 by Tomáš Janoušek <tomi@nomi.cz> ]

	xchg() and cmpxchg() modify their memory operands, not merely read
	them. For some versions of gcc the "memory" clobber has apparently
	dealt with the situation, but not for all.

	Originally-by: Linus Torvalds <torvalds@linux-foundation.org>
	Signed-off-by: H. Peter Anvin <hpa@zytor.com>
	Cc: Glauber Costa <glommer@redhat.com>
	Cc: Avi Kivity <avi@redhat.com>
	Cc: Peter Palfrader <peter@palfrader.org>
	Cc: Greg KH <gregkh@suse.de>
	Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
	Cc: Zachary Amsden <zamsden@redhat.com>
	Cc: Marcelo Tosatti <mtosatti@redhat.com>
	LKML-Reference: <4C4F7277.8050306@zytor.com>
	Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

	---
	arch/x86/include/asm/cmpxchg_32.h | 65 +++++++++++---------------------------
	arch/x86/include/asm/cmpxchg_64.h | 4 --
	2 files changed, 20 insertions(+), 49 deletions(-)

	--- a/arch/x86/include/asm/cmpxchg_32.h
	+++ b/arch/x86/include/asm/cmpxchg_32.h
	@@ -17,60 +17,33 @@ struct __xchg_dummy {
	#define __xg(x) ((struct __xchg_dummy *)(x))

	/*
	- * The semantics of XCHGCMP8B are a bit strange, this is why
	- * there is a loop and the loading of %%eax and %%edx has to
	- * be inside. This inlines well in most cases, the cached
	- * cost is around ~38 cycles. (in the future we might want
	- * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
	- * might have an implicit FPU-save as a cost, so it's not
	- * clear which path to go.)
	+ * CMPXCHG8B only writes to the target if we had the previous
	+ * value in registers, otherwise it acts as a read and gives us the
	+ * "new previous" value. That is why there is a loop. Preloading
	+ * EDX:EAX is a performance optimization: in the common case it means
	+ * we need only one locked operation.
	*
	- * cmpxchg8b must be used with the lock prefix here to allow
	- * the instruction to be executed atomically, see page 3-102
	- * of the instruction set reference 24319102.pdf. We need
	- * the reader side to see the coherent 64bit value.
	+ * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
	+ * least an FPU save and/or %cr0.ts manipulation.
	+ *
	+ * cmpxchg8b must be used with the lock prefix here to allow the
	+ * instruction to be executed atomically. We need to have the reader
	+ * side to see the coherent 64bit value.
	*/
	-static inline void __set_64bit(unsigned long long *ptr,
	- unsigned int low, unsigned int high)
	+static inline void set_64bit(volatile u64 *ptr, u64 value)
	{
	+ u32 low = value;
	+ u32 high = value >> 32;
	+ u64 prev = *ptr;
	+
	asm volatile("\n1:\t"
	- "movl (%1), %%eax\n\t"
	- "movl 4(%1), %%edx\n\t"
	LOCK_PREFIX "cmpxchg8b %0\n\t"
	"jnz 1b"
	- : "=m"(*ptr)
	- : "D" (ptr),
	- "b"(low),
	- "c"(high)
	- : "ax", "dx", "memory");
	-}
	-
	-static inline void __set_64bit_constant(unsigned long long *ptr,
	- unsigned long long value)
	-{
	- __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
	+ : "=m" (*ptr), "+A" (prev)
	+ : "b" (low), "c" (high)
	+ : "memory");
	}

	-#define ll_low(x) *(((unsigned int *)&(x)) + 0)
	-#define ll_high(x) *(((unsigned int *)&(x)) + 1)
	-
	-static inline void __set_64bit_var(unsigned long long *ptr,
	- unsigned long long value)
	-{
	- __set_64bit(ptr, ll_low(value), ll_high(value));
	-}
	-
	-#define set_64bit(ptr, value) \
	- (__builtin_constant_p((value)) \
	- ? __set_64bit_constant((ptr), (value)) \
	- : __set_64bit_var((ptr), (value)))
	-
	-#define _set_64bit(ptr, value) \
	- (__builtin_constant_p(value) \
	- ? __set_64bit(ptr, (unsigned int)(value), \
	- (unsigned int)((value) >> 32)) \
	- : __set_64bit(ptr, ll_low((value)), ll_high((value))))
	-
	/*
	* Note: no "lock" prefix even on SMP: xchg always implies lock anyway
	* Note 2: xchg has side effect, so that attribute volatile is necessary,
	--- a/arch/x86/include/asm/cmpxchg_64.h
	+++ b/arch/x86/include/asm/cmpxchg_64.h
	@@ -8,13 +8,11 @@

	#define __xg(x) ((volatile long *)(x))

	-static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
	+static inline void set_64bit(volatile u64 *ptr, u64 val)
	{
	*ptr = val;
	}

	-#define _set_64bit set_64bit
	-
	/*
	* Note: no "lock" prefix even on SMP: xchg always implies lock anyway
	* Note 2: xchg has side effect, so that attribute volatile is necessary,