/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Resilient Queued Spin Lock
*
* (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
*
* Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
*/

#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>

#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
union {
atomic_t val;
u32 locked;
};
};

/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};
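
/*
 * Illustrative sketch (assumption, not defined in this header): BPF
 * programs are expected to operate on this type through the resilient
 * spinlock kfuncs; the names below come from the BPF side of this feature
 * and are shown here only as an example:
 *
 *	struct val_t {
 *		struct bpf_res_spin_lock lock;
 *		int data;
 *	};
 *
 *	if (!bpf_res_spin_lock(&val->lock)) {
 *		val->data++;		// critical section
 *		bpf_res_spin_unlock(&val->lock);
 *	}
 */
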
struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif
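
/*
 * An architecture can override the two stubs above before this point. A
 * hypothetical sketch, modeled on the qspinlock virt_spin_lock pattern
 * (the static key below is an assumption for illustration):
 *
 *	#define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled
 *	static __always_inline bool resilient_virt_spin_lock_enabled(void)
 *	{
 *		return static_branch_likely(&virt_spin_lock_key);
 *	}
 */
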
/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned: on 64-bit, the
 * int counter pads out to 8 bytes, so 8 + 31 * 8 = 256 bytes, a whole
 * multiple of the common 64-byte cacheline.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);

static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

if (unlikely(cnt > RES_NR_HELD)) {
/* Still keep the inc so we decrement later. */
return;
	}

	/*
	 * There is an implied compiler barrier in per-CPU operations; without
	 * it, the compiler could reorder the inc with the write to the table,
	 * allowing interrupts to overwrite and erase our entry (on interrupt
	 * exit, the slot will have been reset to NULL).
	 *
	 * It is fine for the cnt inc to be reordered wrt remote readers,
	 * though; they won't observe our entry until the cnt update is
	 * visible, that's all.
	 */
this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs, if this function pops entries
 * out of order (due to a lock A, lock B, unlock A, unlock B pattern). The
 * correct way to preserve the right entries in the table would be to walk the
 * array of held locks, swapping and clearing out-of-order entries, but that is
 * too complicated, and we don't have a compelling use case for out-of-order
 * unlocking.
 */
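
/*
 * Illustrative walk-through (not from the original source): after lock A,
 * lock B, the table is { A, B } with cnt == 2. An out-of-order unlock of A
 * clears locks[cnt - 1], i.e. B's slot, and drops cnt to 1, leaving { A }:
 * the table now claims A (already released, so a later attempt on A looks
 * like a false AA deadlock) and omits B (still held, so remote ABBA
 * detection misses it).
 */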
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

if (unlikely(rqh->cnt > RES_NR_HELD))
goto dec;
WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * Reordering the clearing above with the inc and table write in
	 * grab_held_lock_entry that came before us (in the same acquisition
	 * attempt) is ok; remote readers either see a valid entry or NULL once
	 * it becomes visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path, which constitutes a release store after
	 * we clear the entry, we need to emit a write barrier here. Otherwise,
	 * we may have a situation as follows:
	 *
	 * <error> for lock B
	 * release_held_lock_entry
	 *
	 * try_cmpxchg_acquire for lock A
	 * grab_held_lock_entry
	 *
	 * Without any ordering, the dec and inc may be hoisted above the point
	 * where the entry is overwritten with NULL. This permits a remote CPU
	 * holding lock B (which this CPU failed to acquire) to still observe B
	 * in this CPU's table; if that remote CPU is itself waiting on a lock
	 * this CPU now holds, it reaches a false ABBA diagnosis.
	 *
	 * In case of unlock, we will always do a release on the lock word after
	 * clearing the entry, ensuring that other CPUs cannot hold the lock
	 * (and make conclusions about deadlocks) until the entry has been
	 * cleared on the local CPU, preventing any anomalies. Reordering is
	 * still possible there, but a remote CPU cannot observe a lock in our
	 * table which it is already holding: for the entry to be visible, our
	 * release store on that lock word must not have retired yet, which
	 * contradicts that CPU owning the lock.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other; we either notice an empty NULL entry on
	 * top (if the dec follows the WRITE_ONCE), or a potentially stale entry
	 * which cannot be observed (if the dec precedes the WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec; this permits dec-inc
	 * reordering, but that is harmless, as the new entry slot has already
	 * been set to NULL, i.e. neither the dec nor the inc can precede the
	 * NULL store above.
	 */
smp_wmb();
this_cpu_dec(rqspinlock_held_locks.cnt);
}
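
/*
 * Illustrative sketch of the unwind pattern this helper supports; the real
 * slowpath lives in the rqspinlock implementation (kernel/bpf/rqspinlock.c
 * upstream), and the helpers below (acquired(), check_deadlock_timeout())
 * are hypothetical names for exposition only:
 *
 *	grab_held_lock_entry(lock);
 *	while (!acquired(lock)) {
 *		ret = check_deadlock_timeout(lock);
 *		if (ret) {
 *			release_held_lock_entry();
 *			return ret;	// -EDEADLK or -ETIMEDOUT
 *		}
 *	}
 *	return 0;	// entry remains until res_spin_unlock()
 */
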
#ifdef CONFIG_QUEUED_SPINLOCKS
/**
 * res_spin_lock - acquire a resilient queued spinlock
* @lock: Pointer to queued spinlock structure
*
* Return:
* * 0 - Lock was acquired successfully.
* * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
* * -ETIMEDOUT - Lock acquisition failed because of timeout.
*/
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
grab_held_lock_entry(lock);
return 0;
}
return resilient_queued_spin_lock_slowpath(lock, val);
}
#else
#define res_spin_lock(lock) resilient_tas_spin_lock(lock)
#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

if (unlikely(rqh->cnt > RES_NR_HELD))
goto unlock;
WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
unlock:
	/*
	 * Release barrier, ensures correct ordering. See release_held_lock_entry
	 * for details. Perform a release store instead of queued_spin_unlock,
	 * since we use this function for the test-and-set fallback as well. When
	 * we have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
	 *
	 * Like release_held_lock_entry, we can do the release before the dec.
	 * We simply care about a remote CPU not seeing the 'lock' in our table
	 * once the lock has been released, which doesn't rely on the dec.
	 *
	 * Unlike smp_wmb(), a release is not a two-way fence, hence it is
	 * possible for an inc to move up and reorder with our clearing of the
	 * entry. This isn't a problem, however: for a misdiagnosis of ABBA, the
	 * remote CPU needs to hold this lock, which won't be released until the
	 * store below is done, and that store also ensures the entry has been
	 * overwritten to NULL by then.
	 */
smp_store_release(&lock->locked, 0);
this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif

#define raw_res_spin_lock(lock) \
({ \
int __ret; \
preempt_disable(); \
__ret = res_spin_lock(lock); \
if (__ret) \
preempt_enable(); \
__ret; \
})
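
/*
 * Minimal usage sketch (illustrative; 'obj' is hypothetical): unlike
 * raw_spin_lock(), acquisition can fail, so callers must check the return
 * value. On failure the lock is not held and preemption has already been
 * re-enabled by the macro:
 *
 *	int ret = raw_res_spin_lock(&obj->lock);
 *
 *	if (ret)
 *		return ret;	// -EDEADLK or -ETIMEDOUT
 *	// ... critical section ...
 *	raw_res_spin_unlock(&obj->lock);
 */
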
#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags) \
({ \
int __ret; \
local_irq_save(flags); \
__ret = raw_res_spin_lock(lock); \
if (__ret) \
local_irq_restore(flags); \
__ret; \
})
#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
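
/*
 * The irqsave variants follow the same pattern (illustrative; 'obj' is
 * hypothetical). 'flags' is an unsigned long, as with
 * raw_spin_lock_irqsave(); on failure, the IRQ state has already been
 * restored by the macro:
 *
 *	unsigned long flags;
 *	int ret;
 *
 *	ret = raw_res_spin_lock_irqsave(&obj->lock, flags);
 *	if (ret)
 *		return ret;
 *	// ... critical section ...
 *	raw_res_spin_unlock_irqrestore(&obj->lock, flags);
 */
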
#endif /* __ASM_GENERIC_RQSPINLOCK_H */