/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Resilient Queued Spin Lock
*
* (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
*
* Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
*/

#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>

#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
union {
atomic_t val;
u32 locked;
};
};

/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};
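
/*
 * Illustrative sketch (assumption, not defined in this header): BPF
 * programs are expected to operate on this type through the resilient
 * spinlock kfuncs; the names below come from the BPF side of this feature
 * and are shown here only as an example:
 *
 *	struct val_t {
 *		struct bpf_res_spin_lock lock;
 *		int data;
 *	};
 *
 *	if (!bpf_res_spin_lock(&val->lock)) {
 *		val->data++;		// critical section
 *		bpf_res_spin_unlock(&val->lock);
 *	}
 */
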
struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif
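
/*
 * An architecture can override the two stubs above before this point. A
 * hypothetical sketch, modeled on the qspinlock virt_spin_lock pattern
 * (the static key below is an assumption for illustration):
 *
 *	#define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled
 *	static __always_inline bool resilient_virt_spin_lock_enabled(void)
 *	{
 *		return static_branch_likely(&virt_spin_lock_key);
 *	}
 */
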
/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned: on 64-bit, the
 * int counter pads out to 8 bytes, so 8 + 31 * 8 = 256 bytes, a whole
 * multiple of the common 64-byte cacheline.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);

static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

if (unlikely(cnt > RES_NR_HELD)) {
/* Still keep the inc so we decrement later. */
return;
	}

	/*
	 * There is an implied compiler barrier in per-CPU operations; without
	 * it, the compiler could reorder the inc with the write to the table,
	 * allowing interrupts to overwrite and erase our entry (on interrupt
	 * exit, the slot will have been reset to NULL).
	 *
	 * It is fine for the cnt inc to be reordered wrt remote readers,
	 * though; they won't observe our entry until the cnt update is
	 * visible, that's all.
	 */
this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs, if this function pops entries
 * out of order (due to a lock A, lock B, unlock A, unlock B pattern). The
 * correct way to preserve the right entries in the table would be to walk the
 * array of held locks, swapping and clearing out-of-order entries, but that is
 * too complicated, and we don't have a compelling use case for out-of-order
 * unlocking.
 */
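
/*
 * Illustrative walk-through (not from the original source): after lock A,
 * lock B, the table is { A, B } with cnt == 2. An out-of-order unlock of A
 * clears locks[cnt - 1], i.e. B's slot, and drops cnt to 1, leaving { A }:
 * the table now claims A (already released, so a later attempt on A looks
 * like a false AA deadlock) and omits B (still held, so remote ABBA
 * detection misses it).
 */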
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

if (unlikely(rqh->cnt > RES_NR_HELD))
goto dec;
WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * Reordering the clearing above with the inc and table write in
	 * grab_held_lock_entry that came before us (in the same acquisition
	 * attempt) is ok; remote readers either see a valid entry or NULL once
	 * it becomes visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path, which constitutes a release store after
	 * we clear the entry, we need to emit a write barrier here. Otherwise,
	 * we may have a situation as follows:
	 *
	 * <error> for lock B
	 * release_held_lock_entry
	 *
	 * try_cmpxchg_acquire for lock A
	 * grab_held_lock_entry
	 *
	 * Without any ordering, the dec and inc may be hoisted above the point
	 * where the entry is overwritten with NULL. This permits a remote CPU
	 * holding lock B (which this CPU failed to acquire) to still observe B
	 * in this CPU's table; if that remote CPU is itself waiting on a lock
	 * this CPU now holds, it reaches a false ABBA diagnosis.
	 *
	 * In case of unlock, we will always do a release on the lock word after
	 * clearing the entry, ensuring that other CPUs cannot hold the lock
	 * (and make conclusions about deadlocks) until the entry has been
	 * cleared on the local CPU, preventing any anomalies. Reordering is
	 * still possible there, but a remote CPU cannot observe a lock in our
	 * table which it is already holding: for the entry to be visible, our
	 * release store on that lock word must not have retired yet, which
	 * contradicts that CPU owning the lock.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other; we either notice an empty NULL entry on
	 * top (if the dec follows the WRITE_ONCE), or a potentially stale entry
	 * which cannot be observed (if the dec precedes the WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec; this permits dec-inc
	 * reordering, but that is harmless, as the new entry slot has already
	 * been set to NULL, i.e. neither the dec nor the inc can precede the
	 * NULL store above.
	 */
smp_wmb();
this_cpu_dec(rqspinlock_held_locks.cnt);
}
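
/*
 * Illustrative sketch of the unwind pattern this helper supports; the real
 * slowpath lives in the rqspinlock implementation (kernel/bpf/rqspinlock.c
 * upstream), and the helpers below (acquired(), check_deadlock_timeout())
 * are hypothetical names for exposition only:
 *
 *	grab_held_lock_entry(lock);
 *	while (!acquired(lock)) {
 *		ret = check_deadlock_timeout(lock);
 *		if (ret) {
 *			release_held_lock_entry();
 *			return ret;	// -EDEADLK or -ETIMEDOUT
 *		}
 *	}
 *	return 0;	// entry remains until res_spin_unlock()
 */
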
#ifdef CONFIG_QUEUED_SPINLOCKS
/**
 * res_spin_lock - acquire a resilient queued spinlock
* @lock: Pointer to queued spinlock structure
*
* Return:
* * 0 - Lock was acquired successfully.
* * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
* * -ETIMEDOUT - Lock acquisition failed because of timeout.
*/
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
grab_held_lock_entry(lock);
return 0;
}
return resilient_queued_spin_lock_slowpath(lock, val);
}
#else
#define res_spin_lock(lock) resilient_tas_spin_lock(lock)
#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

if (unlikely(rqh->cnt > RES_NR_HELD))
goto unlock;
WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
unlock:
	/*
	 * Release barrier, ensures correct ordering. See release_held_lock_entry
	 * for details. Perform a release store instead of queued_spin_unlock,
	 * since we use this function for the test-and-set fallback as well. When
	 * we have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
	 *
	 * Like release_held_lock_entry, we can do the release before the dec.
	 * We simply care about a remote CPU not seeing the 'lock' in our table
	 * once the lock has been released, which doesn't rely on the dec.
	 *
	 * Unlike smp_wmb(), a release is not a two-way fence, hence it is
	 * possible for an inc to move up and reorder with our clearing of the
	 * entry. This isn't a problem, however: for a misdiagnosis of ABBA, the
	 * remote CPU needs to hold this lock, which won't be released until the
	 * store below is done, and that store also ensures the entry has been
	 * overwritten to NULL by then.
	 */
smp_store_release(&lock->locked, 0);
this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif

#define raw_res_spin_lock(lock) \
({ \
int __ret; \
preempt_disable(); \
__ret = res_spin_lock(lock); \
if (__ret) \
preempt_enable(); \
__ret; \
})
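
/*
 * Minimal usage sketch (illustrative; 'obj' is hypothetical): unlike
 * raw_spin_lock(), acquisition can fail, so callers must check the return
 * value. On failure the lock is not held and preemption has already been
 * re-enabled by the macro:
 *
 *	int ret = raw_res_spin_lock(&obj->lock);
 *
 *	if (ret)
 *		return ret;	// -EDEADLK or -ETIMEDOUT
 *	// ... critical section ...
 *	raw_res_spin_unlock(&obj->lock);
 */
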
#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags) \
({ \
int __ret; \
local_irq_save(flags); \
__ret = raw_res_spin_lock(lock); \
if (__ret) \
local_irq_restore(flags); \
__ret; \
})
#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
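
/*
 * The irqsave variants follow the same pattern (illustrative; 'obj' is
 * hypothetical). 'flags' is an unsigned long, as with
 * raw_spin_lock_irqsave(); on failure, the IRQ state has already been
 * restored by the macro:
 *
 *	unsigned long flags;
 *	int ret;
 *
 *	ret = raw_res_spin_lock_irqsave(&obj->lock, flags);
 *	if (ret)
 *		return ret;
 *	// ... critical section ...
 *	raw_res_spin_unlock_irqrestore(&obj->lock, flags);
 */
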
#endif /* __ASM_GENERIC_RQSPINLOCK_H */