RISC-V Atomic Cleanups

This patch set is the result of some feedback that filtered through
after our original patch set was reviewed, some of which was the result
of me missing some email.  It contains:

* A new READ_ONCE in arch_spin_is_locked()
* __test_and_op_bit_ord() is now actually ordered
* Improvements to various comments
* Removal of some dead code
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index e2e37c5..e65d1cd 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -50,30 +50,30 @@
  * have the AQ or RL bits set.  These don't return anything, so there's only
  * one version to worry about.
  */
-#define ATOMIC_OP(op, asm_op, c_op, I, asm_type, c_type, prefix)				\
-static __always_inline void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)		\
-{												\
-	__asm__ __volatile__ (									\
-		"amo" #asm_op "." #asm_type " zero, %1, %0"					\
-		: "+A" (v->counter)								\
-		: "r" (I)									\
-		: "memory");									\
+#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)				\
+static __always_inline void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)	\
+{											\
+	__asm__ __volatile__ (								\
+		"amo" #asm_op "." #asm_type " zero, %1, %0"				\
+		: "+A" (v->counter)							\
+		: "r" (I)								\
+		: "memory");								\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, c_op, I)			\
-        ATOMIC_OP (op, asm_op, c_op, I, w,  int,   )
+#define ATOMIC_OPS(op, asm_op, I)			\
+        ATOMIC_OP (op, asm_op, I, w,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, c_op, I)			\
-        ATOMIC_OP (op, asm_op, c_op, I, w,  int,   )	\
-        ATOMIC_OP (op, asm_op, c_op, I, d, long, 64)
+#define ATOMIC_OPS(op, asm_op, I)			\
+        ATOMIC_OP (op, asm_op, I, w,  int,   )	\
+        ATOMIC_OP (op, asm_op, I, d, long, 64)
 #endif
 
-ATOMIC_OPS(add, add, +,  i)
-ATOMIC_OPS(sub, add, +, -i)
-ATOMIC_OPS(and, and, &,  i)
-ATOMIC_OPS( or,  or, |,  i)
-ATOMIC_OPS(xor, xor, ^,  i)
+ATOMIC_OPS(add, add,  i)
+ATOMIC_OPS(sub, add, -i)
+ATOMIC_OPS(and, and,  i)
+ATOMIC_OPS( or,  or,  i)
+ATOMIC_OPS(xor, xor,  i)
 
 #undef ATOMIC_OP
 #undef ATOMIC_OPS
@@ -83,7 +83,7 @@
  * There's two flavors of these: the arithmatic ops have both fetch and return
  * versions, while the logical ops only have fetch versions.
  */
-#define ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, asm_type, c_type, prefix)			\
+#define ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, asm_type, c_type, prefix)				\
 static __always_inline c_type atomic##prefix##_fetch_##op##c_or(c_type i, atomic##prefix##_t *v)	\
 {													\
 	register c_type ret;										\
@@ -103,13 +103,13 @@
 
 #ifdef CONFIG_GENERIC_ATOMIC64
 #define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, w,  int,   )	\
+        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )	\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )
 #else
 #define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, w,  int,   )	\
+        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )	\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, d, long, 64)	\
+        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, d, long, 64)	\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, d, long, 64)
 #endif
 
@@ -126,28 +126,28 @@
 #undef ATOMIC_OPS
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )
+#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)				\
+        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )		\
-        ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, d, long, 64)
+#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)				\
+        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )	\
+        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, d, long, 64)
 #endif
 
-ATOMIC_OPS(and, and, &,  i,      , _relaxed)
-ATOMIC_OPS(and, and, &,  i, .aq  , _acquire)
-ATOMIC_OPS(and, and, &,  i, .rl  , _release)
-ATOMIC_OPS(and, and, &,  i, .aqrl,         )
+ATOMIC_OPS(and, and, i,      , _relaxed)
+ATOMIC_OPS(and, and, i, .aq  , _acquire)
+ATOMIC_OPS(and, and, i, .rl  , _release)
+ATOMIC_OPS(and, and, i, .aqrl,         )
 
-ATOMIC_OPS( or,  or, |,  i,      , _relaxed)
-ATOMIC_OPS( or,  or, |,  i, .aq  , _acquire)
-ATOMIC_OPS( or,  or, |,  i, .rl  , _release)
-ATOMIC_OPS( or,  or, |,  i, .aqrl,         )
+ATOMIC_OPS( or,  or, i,      , _relaxed)
+ATOMIC_OPS( or,  or, i, .aq  , _acquire)
+ATOMIC_OPS( or,  or, i, .rl  , _release)
+ATOMIC_OPS( or,  or, i, .aqrl,         )
 
-ATOMIC_OPS(xor, xor, ^,  i,      , _relaxed)
-ATOMIC_OPS(xor, xor, ^,  i, .aq  , _acquire)
-ATOMIC_OPS(xor, xor, ^,  i, .rl  , _release)
-ATOMIC_OPS(xor, xor, ^,  i, .aqrl,         )
+ATOMIC_OPS(xor, xor, i,      , _relaxed)
+ATOMIC_OPS(xor, xor, i, .aq  , _acquire)
+ATOMIC_OPS(xor, xor, i, .rl  , _release)
+ATOMIC_OPS(xor, xor, i, .aqrl,         )
 
 #undef ATOMIC_OPS
 
@@ -182,13 +182,13 @@
 #undef ATOMIC_OP
 #undef ATOMIC_OPS
 
-#define ATOMIC_OP(op, func_op, c_op, I, c_type, prefix)				\
+#define ATOMIC_OP(op, func_op, I, c_type, prefix)				\
 static __always_inline void atomic##prefix##_##op(atomic##prefix##_t *v)	\
 {										\
 	atomic##prefix##_##func_op(I, v);					\
 }
 
-#define ATOMIC_FETCH_OP(op, func_op, c_op, I, c_type, prefix)				\
+#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)					\
 static __always_inline c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v)	\
 {											\
 	return atomic##prefix##_fetch_##func_op(I, v);					\
@@ -202,16 +202,16 @@
 
 #ifdef CONFIG_GENERIC_ATOMIC64
 #define ATOMIC_OPS(op, asm_op, c_op, I)						\
-        ATOMIC_OP       (op, asm_op, c_op, I,  int,   )				\
-        ATOMIC_FETCH_OP (op, asm_op, c_op, I,  int,   )				\
+        ATOMIC_OP       (op, asm_op,       I,  int,   )				\
+        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )				\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )
 #else
 #define ATOMIC_OPS(op, asm_op, c_op, I)						\
-        ATOMIC_OP       (op, asm_op, c_op, I,  int,   )				\
-        ATOMIC_FETCH_OP (op, asm_op, c_op, I,  int,   )				\
+        ATOMIC_OP       (op, asm_op,       I,  int,   )				\
+        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )				\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )				\
-        ATOMIC_OP       (op, asm_op, c_op, I, long, 64)				\
-        ATOMIC_FETCH_OP (op, asm_op, c_op, I, long, 64)				\
+        ATOMIC_OP       (op, asm_op,       I, long, 64)				\
+        ATOMIC_FETCH_OP (op, asm_op,       I, long, 64)				\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I, long, 64)
 #endif
 
@@ -300,8 +300,13 @@
 
 /*
  * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as
- * {cmp,}xchg and the operations that return, so they need a barrier.  We just
- * use the other implementations directly.
+ * {cmp,}xchg and the operations that return, so they need a barrier.
+ */
+/*
+ * FIXME: atomic_cmpxchg_{acquire,release,relaxed} are all implemented by
+ * assigning the same barrier to both the LR and SC operations, but that might
+ * not make any sense.  We're waiting on a memory model specification to
+ * determine exactly what the right thing to do is here.
  */
 #define ATOMIC_OP(c_t, prefix, c_or, size, asm_or)						\
 static __always_inline c_t atomic##prefix##_cmpxchg##c_or(atomic##prefix##_t *v, c_t o, c_t n) 	\
diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h
index 183534b..773c4e0 100644
--- a/arch/riscv/include/asm/barrier.h
+++ b/arch/riscv/include/asm/barrier.h
@@ -38,29 +38,6 @@
 #define smp_rmb()	RISCV_FENCE(r,r)
 #define smp_wmb()	RISCV_FENCE(w,w)
 
-/*
- * These fences exist to enforce ordering around the relaxed AMOs.  The
- * documentation defines that
- * "
- *     atomic_fetch_add();
- *   is equivalent to:
- *     smp_mb__before_atomic();
- *     atomic_fetch_add_relaxed();
- *     smp_mb__after_atomic();
- * "
- * So we emit full fences on both sides.
- */
-#define __smb_mb__before_atomic()	smp_mb()
-#define __smb_mb__after_atomic()	smp_mb()
-
-/*
- * These barriers prevent accesses performed outside a spinlock from being moved
- * inside a spinlock.  Since RISC-V sets the aq/rl bits on our spinlock only
- * enforce release consistency, we need full fences here.
- */
-#define smb_mb__before_spinlock()	smp_mb()
-#define smb_mb__after_spinlock()	smp_mb()
-
 #include <asm-generic/barrier.h>
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index 7c281ef..f30daf2 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -67,7 +67,7 @@
 		: "memory");
 
 #define __test_and_op_bit(op, mod, nr, addr) 			\
-	__test_and_op_bit_ord(op, mod, nr, addr, )
+	__test_and_op_bit_ord(op, mod, nr, addr, .aqrl)
 #define __op_bit(op, mod, nr, addr)				\
 	__op_bit_ord(op, mod, nr, addr, )
 
diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
index 04c71d9..2fd27e8 100644
--- a/arch/riscv/include/asm/spinlock.h
+++ b/arch/riscv/include/asm/spinlock.h
@@ -24,7 +24,7 @@
 
 /* FIXME: Replace this with a ticket lock, like MIPS. */
 
-#define arch_spin_is_locked(x)	((x)->lock != 0)
+#define arch_spin_is_locked(x)	(READ_ONCE((x)->lock) != 0)
 
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
@@ -58,15 +58,6 @@
 	}
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_rmb();
-	do {
-		cpu_relax();
-	} while (arch_spin_is_locked(lock));
-	smp_acquire__after_ctrl_dep();
-}
-
 /***********************************************************/
 
 static inline void arch_read_lock(arch_rwlock_t *lock)
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 5ee4ae3..c79fab3 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -17,7 +17,10 @@
 
 #ifdef CONFIG_MMU
 
-/* Flush entire local TLB */
+/*
+ * Flush entire local TLB.  'sfence.vma' implicitly fences with the instruction
+ * cache as well, so a 'fence.i' is not necessary.
+ */
 static inline void local_flush_tlb_all(void)
 {
 	__asm__ __volatile__ ("sfence.vma" : : : "memory");