lib/test: add locking-benchmark module

Add a module that measures, in TSC cycles, the cost of various
primitives and schemes usable for locking or atomically updating
per-cpu (pcp) structures: this_cpu counter ops, per-cpu and global
cmpxchg (including the 128-bit variants), bit_spin_lock,
percpu_counter, the local_lock and localtry_lock families in their
plain, _irq and _irqsave flavors, and a page allocator style pcp
spin_trylock. Results are printed to the kernel log on insmod, and
module init intentionally returns an error so the benchmark can be
rerun without an rmmod in between.

Export this_cpu_cmpxchg16b_emu so the per-cpu cmpxchg128 benchmarks
work when the module is built modular on x86.
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 4fb4489..d722d6d 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -52,3 +52,5 @@
 	RET
 
 SYM_FUNC_END(this_cpu_cmpxchg16b_emu)
+
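+/* Export for the locking-benchmark module (per-cpu cmpxchg128 tests). */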
+EXPORT_SYMBOL(this_cpu_cmpxchg16b_emu)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 35796c2..f67f132 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2574,6 +2574,15 @@
 
 	  If unsure, say N.
 
+config LOCKING_BENCHMARK
+	tristate "Benchmark (pcp) locking primitives"
+	depends on X86
+	help
+	  This builds the "locking-benchmark" module, which measures the
+	  performance of various primitives and schemes for locking or
+	  atomically updating per-cpu (pcp) structures. Results are printed
+	  to the kernel log when the module is loaded.
+
+	  If unsure, say N.
+
 config TEST_FIRMWARE
 	tristate "Test firmware loading via userspace interface"
 	depends on FW_LOADER
diff --git a/lib/Makefile b/lib/Makefile
index d5cfc7a..7f111fb 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -138,6 +138,7 @@
 obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
+obj-$(CONFIG_LOCKING_BENCHMARK) += locking-benchmark.o
 
 lib-y += logic_pio.o
 
diff --git a/lib/locking-benchmark.c b/lib/locking-benchmark.c
new file mode 100644
index 0000000..2e699ee
--- /dev/null
+++ b/lib/locking-benchmark.c
@@ -0,0 +1,427 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bit_spinlock.h>
+#include <linux/local_lock.h>
+#include <linux/page-flags.h>
+#include <linux/percpu_counter.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <asm/tsc.h>
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
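+
+/*
+ * The prepare/finish pair must bracket the trylock section, as in the last
+ * benchmark loop below:
+ *
+ *	pcp_trylock_prepare(UP_flags);
+ *	pcp = pcpu_spin_trylock(struct test_pcp, slock, &test_pcps);
+ *	...
+ *	pcpu_spin_unlock(slock, pcp);
+ *	pcp_trylock_finish(UP_flags);
+ */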
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin()		preempt_disable()
+#define pcpu_task_unpin()	preempt_enable()
+#else
+#define pcpu_task_pin()		migrate_disable()
+#define pcpu_task_unpin()	migrate_enable()
+#endif
+
+/*
+ * Generic helpers to look up and lock a per-cpu variable with an embedded
+ * spinlock. The return value must be passed to the matching unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr)				\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	spin_lock(&_ret->member);					\
+	_ret;								\
+})
+
+#define pcpu_spin_trylock(type, member, ptr)				\
+({									\
+	type *_ret;							\
+	pcpu_task_pin();						\
+	_ret = this_cpu_ptr(ptr);					\
+	if (!spin_trylock(&_ret->member)) {				\
+		pcpu_task_unpin();					\
+		_ret = NULL;						\
+	}								\
+	_ret;								\
+})
+
+#define pcpu_spin_unlock(member, ptr)					\
+({									\
+	spin_unlock(&ptr->member);					\
+	pcpu_task_unpin();						\
+})
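+
+/*
+ * Usage sketch: the pointer returned by the lock/trylock helpers is the one
+ * to operate on and to pass to the unlock helper:
+ *
+ *	struct test_pcp *p = pcpu_spin_lock(struct test_pcp, slock, &test_pcps);
+ *	p->counter++;
+ *	pcpu_spin_unlock(slock, p);
+ */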
+
+/* struct per_cpu_pages specific helpers; not used by this benchmark itself. */
+#define pcp_spin_lock(ptr)						\
+	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_trylock(ptr)						\
+	pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_unlock(ptr)						\
+	pcpu_spin_unlock(lock, ptr)
+
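+/*
+ * A counter and pointer packed into a single 128-bit operand, so both can be
+ * updated together with one cmpxchg128. The pointer is never changed here;
+ * it only widens the operand.
+ */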
+typedef union {
+	struct {
+		unsigned long counter;
+		void *dummy;
+	};
+	u128 full;
+} counter_ptr_t;
+
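+/* Per-CPU instance; each benchmark below exercises one of these fields. */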
+struct test_pcp {
+	local_lock_t llock;
+	localtry_lock_t ltlock;
+	spinlock_t slock;
+	unsigned long counter;
+	counter_ptr_t counter_ptr;
+};
+
+static bool __dummy;
+
+static DEFINE_PER_CPU(struct test_pcp, test_pcps) = {
+	.llock = INIT_LOCAL_LOCK(llock),
+	.ltlock = INIT_LOCALTRY_LOCK(ltlock),
+	.slock = __SPIN_LOCK_UNLOCKED(slock),
+};
+
+static counter_ptr_t counter_ptr;
+
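+/* Global (not per-cpu) word whose PG_locked bit serves as a bit spinlock. */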
+struct test_bsl {
+	unsigned long page_flags;
+	unsigned long counter;
+};
+
+static struct test_bsl bsl = {};
+
+#define TIMING_ITERATIONS 1000000000
+
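+/* Prints the cycle delta between the "before" and "after" TSC samples. */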
+#define print_result(name) \
+	pr_info("%-35s %12llu cycles\n", name, after - before)
+
+static int __init locking_bench(void)
+{
+	unsigned long long before, after;
+	unsigned long __maybe_unused UP_flags;
+	struct test_pcp *pcp;
+	struct percpu_counter pcpc;
+	unsigned long flags;
+
+	percpu_counter_init(&pcpc, 0, GFP_KERNEL);
+
+	before = rdtsc_ordered();
+
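+	/* Consume the return value so the increment cannot be optimized away. */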
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		if (this_cpu_inc_return(test_pcps.counter) == 0)
+			__dummy = true;
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("this_cpu_inc_return");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		unsigned long old, new;
+		do {
+			old = this_cpu_read(test_pcps.counter);
+			new = old + 1;
+		} while (!this_cpu_try_cmpxchg(test_pcps.counter, &old, new));
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("this_cpu_try_cmpxchg");
+
+	before = rdtsc_ordered();
+
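+	/*
+	 * As above, but the old value is read with raw_cpu_read(); a stale
+	 * read is simply caught by the cmpxchg failing and retrying.
+	 */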
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		unsigned long old, new;
+		do {
+			old = raw_cpu_read(test_pcps.counter);
+			new = old + 1;
+		} while (!this_cpu_try_cmpxchg(test_pcps.counter, &old, new));
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("raw+this_cpu_try_cmpxchg");
+
+	before = rdtsc_ordered();
+
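+	/* One 128-bit per-cpu cmpxchg updates the counter+pointer pair. */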
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		counter_ptr_t old, new;
+		do {
+			struct test_pcp *p = raw_cpu_ptr(&test_pcps);
+			old.full = p->counter_ptr.full;
+			new.counter = old.counter + 1;
+			new.dummy = old.dummy;
+		} while (!this_cpu_try_cmpxchg128(test_pcps.counter_ptr.full,
+						  &old.full, new.full));
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("this_cpu_try_cmpxchg128");
+
+	before = rdtsc_ordered();
+
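+	/* The same 128-bit update on a single global pair, for comparison. */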
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		counter_ptr_t *test;
+		counter_ptr_t old, new;
+		do {
+			test = &counter_ptr;
+			old.full = test->full;
+			new.counter = old.counter + 1;
+			new.dummy = old.dummy;
+		} while (!try_cmpxchg128(&test->full,
+					 &old.full, new.full));
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("try_cmpxchg128");
+
+	before = rdtsc_ordered();
+
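+	/* Serialize on the PG_locked bit of a global page-flags style word. */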
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		struct test_bsl *test = &bsl;
+
+		bit_spin_lock(PG_locked, &test->page_flags);
+		test->counter++;
+		bit_spin_unlock(PG_locked, &test->page_flags);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("bit_spin_lock");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		percpu_counter_inc(&pcpc);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("percpu_counter_inc");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		percpu_counter_add_local(&pcpc, 1);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("percpu_counter_add_local");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		local_lock(&test_pcps.llock);
+
+		pcp = this_cpu_ptr(&test_pcps);
+
+		pcp->counter++;
+
+		local_unlock(&test_pcps.llock);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("local_lock");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		local_lock_irq(&test_pcps.llock);
+
+		pcp = this_cpu_ptr(&test_pcps);
+
+		pcp->counter++;
+
+		local_unlock_irq(&test_pcps.llock);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("local_lock_irq");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		local_lock_irqsave(&test_pcps.llock, flags);
+
+		pcp = this_cpu_ptr(&test_pcps);
+
+		pcp->counter++;
+
+		local_unlock_irqrestore(&test_pcps.llock, flags);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("local_lock_irqsave");
+
+	before = rdtsc_ordered();
+
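+	/*
+	 * Measure the irqsave/irqrestore path while interrupts are already
+	 * disabled, so the flags save/restore is exercised without actually
+	 * toggling IRQs.
+	 */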
+	for (unsigned int j = 0; j < 10; j++) {
+		local_irq_disable();
+
+		for (unsigned long i = 0; i < TIMING_ITERATIONS/10; i++) {
+			local_lock_irqsave(&test_pcps.llock, flags);
+
+			pcp = this_cpu_ptr(&test_pcps);
+
+			pcp->counter++;
+
+			local_unlock_irqrestore(&test_pcps.llock, flags);
+		}
+
+		local_irq_enable();
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("irq_dis(local_lock_irqsave)");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		localtry_lock(&test_pcps.ltlock);
+
+		pcp = this_cpu_ptr(&test_pcps);
+
+		pcp->counter++;
+
+		localtry_unlock(&test_pcps.ltlock);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("localtry_lock");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		localtry_lock_irq(&test_pcps.ltlock);
+
+		pcp = this_cpu_ptr(&test_pcps);
+
+		pcp->counter++;
+
+		localtry_unlock_irq(&test_pcps.ltlock);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("localtry_lock_irq");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		localtry_lock_irqsave(&test_pcps.ltlock, flags);
+
+		pcp = this_cpu_ptr(&test_pcps);
+
+		pcp->counter++;
+
+		localtry_unlock_irqrestore(&test_pcps.ltlock, flags);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("localtry_lock_irqsave");
+
+	before = rdtsc_ordered();
+
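+	/* As above: the localtry irqsave path with interrupts already disabled. */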
+	for (unsigned int j = 0; j < 10; j++) {
+		local_irq_disable();
+
+		for (unsigned long i = 0; i < TIMING_ITERATIONS/10; i++) {
+			localtry_lock_irqsave(&test_pcps.ltlock, flags);
+
+			pcp = this_cpu_ptr(&test_pcps);
+
+			pcp->counter++;
+
+			localtry_unlock_irqrestore(&test_pcps.ltlock, flags);
+		}
+
+		local_irq_enable();
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("irq_dis(localtry_lock_irqsave)");
+
+	before = rdtsc_ordered();
+
+	for (unsigned long i = 0; i < TIMING_ITERATIONS; i++) {
+		pcp_trylock_prepare(UP_flags);
+
+		/* Uncontended in this benchmark, so the trylock cannot fail. */
+		pcp = pcpu_spin_trylock(struct test_pcp, slock, &test_pcps);
+
+		pcp->counter++;
+
+		pcpu_spin_unlock(slock, pcp);
+		pcp_trylock_finish(UP_flags);
+	}
+
+	after = rdtsc_ordered();
+
+	cond_resched();
+	print_result("pcpu_spin_trylock");
+
+	percpu_counter_destroy(&pcpc);
+
+	/*
+	 * Everything is OK. Return an error anyway so the module is not left
+	 * loaded and the benchmark can be rerun without an rmmod in between.
+	 */
+	return -EINVAL;
+}
+
+module_init(locking_bench);
+
+MODULE_DESCRIPTION("Benchmark for (pcp) locking schemes");
+MODULE_LICENSE("GPL");