Ritesh reported in [0] that rqspinlock acquisitions frequently hit timeouts upon
reentrancy on the same lock on the same CPU, instead of being diagnosed as AA
deadlocks. This patch closes one of the races leading to this behavior, and
reduces the frequency of timeouts.
We currently have a tiny window between the fast-path cmpxchg and the
grabbing of the lock entry where an NMI could land, attempt the same
lock that was just acquired, and end up timing out. This is not ideal.
Instead, move the lock entry acquisition from the fast path to before
the cmpxchg, and remove the grabbing of the lock entry in the slow path,
assuming it was already taken by the fast path. The TAS fallback is
invoked directly, without being preceded by the typical fast path, so we
must continue to grab the deadlock detection entry in that case.
Case on lock leading to missed AA:

  cmpxchg lock A
  <NMI>
    ... rqspinlock acquisition of A
    ... timeout
  </NMI>
  grab_held_lock_entry(A)
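With the fix, the held-lock entry is taken before the cmpxchg, so an NMI
landing right after a successful cmpxchg already observes the entry and flags
the AA deadlock instead of timing out. A simplified sketch of the fixed fast
path, mirroring res_spin_lock() in the header below:

static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

	/* Publish the deadlock detection entry before the lock can be
	 * observed as taken, so a reentrant NMI sees it.
	 */
	grab_held_lock_entry(lock);
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;
	return resilient_queued_spin_lock_slowpath(lock, val);
}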
There is a similar case when unlocking the lock. If the NMI lands
between the WRITE_ONCE and smp_store_release, it is possible that we end
up in a situation where the NMI fails to diagnose the AA condition,
leading to a timeout.
Case on unlock leading to missed AA:

  WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
  <NMI>
    ... rqspinlock acquisition of A
    ... timeout
  </NMI>
  smp_store_release(A->locked, 0)
The patch changes the order on unlock to smp_store_release() followed by
WRITE_ONCE() of NULL. This avoids the missed AA detection described
above, but may lead to a false positive if the NMI lands between these
two statements, which is acceptable (and preferred over a timeout).
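The resulting unlock sequence, mirroring res_spin_unlock() in the header below
(sketch):

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	/* Release the lock first; an NMI landing before the entry is
	 * cleared still sees it and may report a (tolerable) false AA.
	 */
	smp_store_release(&lock->locked, 0);
	if (likely(rqh->cnt <= RES_NR_HELD))
		WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
	this_cpu_dec(rqspinlock_held_locks.cnt);
}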
The original intention of the reverse order on unlock was to prevent the
following possible misdiagnosis of an ABBA scenario:
  CPU 0                                       CPU 1
  grab entry A
  lock A
  grab entry B
  lock B
  unlock B
    smp_store_release(B->locked, 0)
                                              grab entry B
                                              lock B
                                              grab entry A
                                              lock A
                                              ! <detect ABBA>
    WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
If the store release were after the WRITE_ONCE, the other CPU would not
observe B in the table of the CPU unlocking the lock B. However, since
the threads are obviously participating in an ABBA deadlock, it is no
longer appealing to use the order above, since it may lead to a 250 ms
timeout due to missed AA detection.
[0]: https://lore.kernel.org/bpf/CAH6OuBTjG+N=+GGwcpOUbeDN563oz4iVcU3rbse68egp9wj9_A@mail.gmail.com
Fixes: 0d80e7f951 ("rqspinlock: Choose trylock fallback for NMI waiters")
Reported-by: Ritesh Oedayrajsingh Varma <ritesh@superluminal.eu>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251128232802.1031906-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */
#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
	union {
		atomic_t val;
		u32 locked;
	};
};

/* Even though this is same as struct rqspinlock, we need to emit a distinct
 * type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};

struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
	return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif

/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);

static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

	if (unlikely(cnt > RES_NR_HELD)) {
		/* Still keep the inc so we decrement later. */
		return;
	}

	/*
	 * Implied compiler barrier in per-CPU operations; otherwise we can have
	 * the compiler reorder inc with write to table, allowing interrupts to
	 * overwrite and erase our write to the table (as on interrupt exit it
	 * will be reset to NULL).
	 *
	 * It is fine for cnt inc to be reordered wrt remote readers though,
	 * they won't observe our entry until the cnt update is visible, that's
	 * all.
	 */
	this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries
 * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct
 * logic to preserve right entries in the table would be to walk the array of
 * held locks and swap and clear out-of-order entries, but that's too
 * complicated and we don't have a compelling use case for out of order unlocking.
 */
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto dec;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * Reordering of clearing above with inc and its write in
	 * grab_held_lock_entry that came before us (in same acquisition
	 * attempt) is ok, we either see a valid entry or NULL when it's
	 * visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path which constitutes a release store after
	 * we clear the entry, we need to emit a write barrier here. Otherwise,
	 * we may have a situation as follows:
	 *
	 * <error> for lock B
	 * release_held_lock_entry
	 *
	 * grab_held_lock_entry
	 * try_cmpxchg_acquire for lock A
	 *
	 * Lack of any ordering means reordering may occur such that dec, inc
	 * are done before entry is overwritten. This permits a remote lock
	 * holder of lock B (which this CPU failed to acquire) to now observe it
	 * as being attempted on this CPU, and may lead to misdetection (if this
	 * CPU holds a lock it is attempting to acquire, leading to false ABBA
	 * diagnosis).
	 *
	 * The case of unlock is treated differently due to NMI reentrancy, see
	 * comments in res_spin_unlock.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other, we either notice an empty NULL entry on
	 * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which
	 * cannot be observed (if dec precedes WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec, this permits dec-inc
	 * reordering but that is harmless as we'd have new entry set to NULL
	 * already, i.e. they cannot precede the NULL store above.
	 */
	smp_wmb();
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS

/**
 * res_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Return:
 * * 0 - Lock was acquired successfully.
 * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

	/*
	 * Grab the deadlock detection entry before doing the cmpxchg, so that
	 * reentrancy due to NMIs between the succeeding cmpxchg and creation of
	 * held lock entry can correctly detect an acquisition attempt in the
	 * interrupted context.
	 *
	 * cmpxchg lock A
	 * <NMI>
	 *   res_spin_lock(A) --> missed AA, leads to timeout
	 * </NMI>
	 * grab_held_lock_entry(A)
	 */
	grab_held_lock_entry(lock);

	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;
	return resilient_queued_spin_lock_slowpath(lock, val);
}

#else

#define res_spin_lock(lock) resilient_tas_spin_lock(lock)

#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	/*
	 * Release barrier, ensures correct ordering. Perform release store
	 * instead of queued_spin_unlock, since we use this function for the TAS
	 * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
	 * the full 4-byte lockword.
	 *
	 * Perform the smp_store_release before clearing the lock entry so that
	 * NMIs landing in the unlock path can correctly detect AA issues. The
	 * opposite order shown below may lead to missed AA checks:
	 *
	 * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
	 * <NMI>
	 *   res_spin_lock(A) --> missed AA, leads to timeout
	 * </NMI>
	 * smp_store_release(A->locked, 0)
	 */
	smp_store_release(&lock->locked, 0);
	if (likely(rqh->cnt <= RES_NR_HELD))
		WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif

#define raw_res_spin_lock(lock)                    \
	({                                         \
		int __ret;                         \
		preempt_disable();                 \
		__ret = res_spin_lock(lock);       \
		if (__ret)                         \
			preempt_enable();          \
		__ret;                             \
	})

#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags)     \
	({                                         \
		int __ret;                         \
		local_irq_save(flags);             \
		__ret = raw_res_spin_lock(lock);   \
		if (__ret)                         \
			local_irq_restore(flags);  \
		__ret;                             \
	})

#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })

#endif /* __ASM_GENERIC_RQSPINLOCK_H */
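For reference, a minimal usage sketch of the raw_res_spin_lock_irqsave() /
raw_res_spin_unlock_irqrestore() API defined above. The struct my_data,
my_data_init() and my_data_inc() names are hypothetical and only illustrate
that, unlike a regular spinlock, acquisition can fail and the return value
must be checked:

struct my_data {
	rqspinlock_t lock;	/* hypothetical embedding of an rqspinlock */
	u64 counter;
};

static void my_data_init(struct my_data *d)
{
	raw_res_spin_lock_init(&d->lock);
	d->counter = 0;
}

static int my_data_inc(struct my_data *d)
{
	unsigned long flags;
	int ret;

	ret = raw_res_spin_lock_irqsave(&d->lock, flags);
	if (ret)
		return ret;	/* -EDEADLK or -ETIMEDOUT, lock was not taken */
	d->counter++;
	raw_res_spin_unlock_irqrestore(&d->lock, flags);
	return 0;
}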