/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */
#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
	union {
		atomic_t val;
		u32 locked;
	};
};

/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};
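
/*
 * Illustrative sketch, not part of this header's API surface: a BPF program
 * is expected to embed struct bpf_res_spin_lock in a map value and operate on
 * it through the resilient spin lock kfuncs, checking the return value since
 * acquisition can fail. The struct/map names below are made up for the
 * example, and the kfunc names are assumptions about the BPF side of this
 * feature rather than something defined here.
 *
 *	struct elem {
 *		struct bpf_res_spin_lock lock;
 *		int data;
 *	};
 *
 *	struct elem *e = bpf_map_lookup_elem(&array_map, &key);
 *
 *	if (e && !bpf_res_spin_lock(&e->lock)) {
 *		e->data++;
 *		bpf_res_spin_unlock(&e->lock);
 *	}
 */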

struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
	return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif

/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
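
/*
 * Minimal sketch of how the per-CPU table above supports deadlock detection;
 * the real checks live in the slow path, not in this header, and the helper
 * name below is made up for illustration. An AA deadlock on the local CPU can
 * be flagged by scanning the entries recorded by grab_held_lock_entry() for
 * the lock we are about to wait on:
 *
 *	static bool example_aa_check(void *lock)
 *	{
 *		struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
 *		int i, cnt = min(rqh->cnt, RES_NR_HELD);
 *
 *		for (i = 0; i < cnt; i++)
 *			if (rqh->locks[i] == lock)
 *				return true;	// already held on this CPU: AA
 *		return false;
 *	}
 *
 * ABBA detection additionally has remote CPUs read this table, which is why
 * the ordering rules documented below matter.
 */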

static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

	if (unlikely(cnt > RES_NR_HELD)) {
		/* Still keep the inc so we decrement later. */
		return;
	}

	/*
	 * Per-CPU operations imply a compiler barrier; without it, the compiler
	 * could reorder the inc with the write to the table, allowing an
	 * interrupt to overwrite and erase our write to the table (as on
	 * interrupt exit the slot will be reset to NULL).
	 *
	 * It is fine for the cnt inc to be reordered w.r.t. remote readers:
	 * they simply won't observe our entry until the cnt update is visible.
	 */
	this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * If this function popped entries out of order (due to a lock A, lock B,
 * unlock A, unlock B pattern), we could misdetect AA deadlocks on the same CPU
 * and miss ABBA deadlocks on remote CPUs, as shown in the sketch that follows.
 * The correct logic to preserve the right entries in the table would be to
 * walk the array of held locks, swapping and clearing out-of-order entries,
 * but that is too complicated and we don't have a compelling use case for
 * out-of-order unlocking.
 */
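
/*
 * Illustrative trace (not taken from the implementation) of why top-of-stack
 * popping breaks down for out-of-order unlocks, given that release always
 * clears locks[cnt - 1]:
 *
 *	lock A    -> locks[0] = A, cnt = 1
 *	lock B    -> locks[1] = B, cnt = 2
 *	unlock A  -> clears locks[1] (B's entry!), cnt = 1
 *	unlock B  -> clears locks[0] (A's stale entry), cnt = 0
 *
 * Between the two unlocks, B is still held but no longer visible in the table
 * (a remote CPU may miss an ABBA deadlock involving B), while A is released
 * but still listed (a later attempt to take A on this CPU would be flagged as
 * an AA deadlock).
 */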
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto dec;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * It is OK for the clearing above to be reordered with the inc and its
	 * slot write in the grab_held_lock_entry() that came before us (in the
	 * same acquisition attempt): a remote reader either sees a valid entry
	 * or NULL once the write is visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path, which performs a release store after we
	 * clear the entry, we need to emit a write barrier here. Otherwise, we
	 * may have a situation as follows:
	 *
	 * <error> for lock B
	 * release_held_lock_entry
	 *
	 * try_cmpxchg_acquire for lock A
	 * grab_held_lock_entry
	 *
	 * With no ordering, the dec and inc may both be done before the entry
	 * is overwritten. This permits a remote holder of lock B (which this
	 * CPU failed to acquire) to observe B as still being attempted on this
	 * CPU, and may lead to misdetection (if this CPU holds a lock the
	 * remote CPU is attempting to acquire, a false ABBA deadlock is
	 * diagnosed).
	 *
	 * In case of unlock, we will always do a release on the lock word after
	 * clearing the entry, ensuring that other CPUs cannot hold the lock
	 * (and draw conclusions about deadlocks) until the entry has been
	 * cleared on the local CPU, preventing any anomalies. Reordering is
	 * still possible there, but a remote CPU cannot observe a lock in our
	 * table which it is already holding, since observing such a stale entry
	 * entails that our release store for said lock has not yet become
	 * visible to it.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other: a remote reader either notices a NULL
	 * entry on top (if the dec follows the WRITE_ONCE), or a potentially
	 * stale entry above the updated cnt which will not be considered (if
	 * the dec precedes the WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec: this still permits the dec
	 * to be reordered with a later inc, but that is harmless, since the
	 * entry has already been set to NULL by then, i.e. neither the dec, the
	 * inc, nor the new slot write can precede the NULL store above.
	 */
	smp_wmb();
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS

/**
 * res_spin_lock - acquire a resilient queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -EDEADLK	- Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT	- Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
		grab_held_lock_entry(lock);
		return 0;
	}
	return resilient_queued_spin_lock_slowpath(lock, val);
}

#else

#define res_spin_lock(lock) resilient_tas_spin_lock(lock)

#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto unlock;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
unlock:
	/*
	 * Release barrier, ensures correct ordering. See release_held_lock_entry
	 * for details. Perform a release store instead of queued_spin_unlock,
	 * since we use this function for the test-and-set fallback as well. When
	 * we have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lock word.
	 *
	 * Like release_held_lock_entry, we can do the release before the dec.
	 * We simply care that a remote CPU does not see 'lock' in our table once
	 * the lock has been released, which doesn't rely on the dec.
	 *
	 * Unlike smp_wmb(), a release is not a two-way fence, hence it is
	 * possible for an inc to move up and reorder with our clearing of the
	 * entry. This isn't a problem, however: for a misdiagnosis of ABBA, the
	 * remote CPU needs to hold this lock, and it cannot do so until the
	 * store below becomes visible to it, by which point (release ordering)
	 * the entry has already been overwritten with NULL.
	 */
	smp_store_release(&lock->locked, 0);
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif

#define raw_res_spin_lock(lock)                    \
	({                                         \
		int __ret;                         \
		preempt_disable();                 \
		__ret = res_spin_lock(lock);       \
		if (__ret)                         \
			preempt_enable();          \
		__ret;                             \
	})

#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags)     \
	({                                         \
		int __ret;                         \
		local_irq_save(flags);             \
		__ret = raw_res_spin_lock(lock);   \
		if (__ret)                         \
			local_irq_restore(flags);  \
		__ret;                             \
	})

#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
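
/*
 * Illustrative in-kernel usage sketch; the lock and function names below are
 * made up for the example. Unlike a plain raw_spin_lock(), acquisition can
 * fail with -EDEADLK or -ETIMEDOUT, so callers must check the return value
 * and only unlock on success.
 *
 *	static rqspinlock_t demo_lock;	// initialized with raw_res_spin_lock_init()
 *
 *	static int demo_update(void)
 *	{
 *		unsigned long flags;
 *		int ret;
 *
 *		ret = raw_res_spin_lock_irqsave(&demo_lock, flags);
 *		if (ret)
 *			return ret;	// lock not held, do not unlock
 *		// ... critical section ...
 *		raw_res_spin_unlock_irqrestore(&demo_lock, flags);
 *		return 0;
 *	}
 */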

#endif /* __ASM_GENERIC_RQSPINLOCK_H */