diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/asm-generic/Kbuild | 1 | ||||
-rw-r--r-- | include/asm-generic/mcs_spinlock.h | 6 | ||||
-rw-r--r-- | include/asm-generic/rqspinlock.h | 250 | ||||
-rw-r--r-- | include/linux/bpf.h | 10 | ||||
-rw-r--r-- | include/linux/bpf_verifier.h | 19 |
5 files changed, 283 insertions, 3 deletions
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 1b43c3a77012..8675b7b4ad23 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -45,6 +45,7 @@ mandatory-y += pci.h mandatory-y += percpu.h mandatory-y += pgalloc.h mandatory-y += preempt.h +mandatory-y += rqspinlock.h mandatory-y += runtime-const.h mandatory-y += rwonce.h mandatory-y += sections.h diff --git a/include/asm-generic/mcs_spinlock.h b/include/asm-generic/mcs_spinlock.h index 10cd4ffc6ba2..39c94012b88a 100644 --- a/include/asm-generic/mcs_spinlock.h +++ b/include/asm-generic/mcs_spinlock.h @@ -1,6 +1,12 @@ #ifndef __ASM_MCS_SPINLOCK_H #define __ASM_MCS_SPINLOCK_H +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ + int count; /* nesting count, see qspinlock.c */ +}; + /* * Architectures can define their own: * diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h new file mode 100644 index 000000000000..6d4244d643df --- /dev/null +++ b/include/asm-generic/rqspinlock.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Resilient Queued Spin Lock + * + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. + * + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ +#ifndef __ASM_GENERIC_RQSPINLOCK_H +#define __ASM_GENERIC_RQSPINLOCK_H + +#include <linux/types.h> +#include <vdso/time64.h> +#include <linux/percpu.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#endif + +struct rqspinlock { + union { + atomic_t val; + u32 locked; + }; +}; + +/* Even though this is same as struct rqspinlock, we need to emit a distinct + * type in BTF for BPF programs. + */ +struct bpf_res_spin_lock { + u32 val; +}; + +struct qspinlock; +#ifdef CONFIG_QUEUED_SPINLOCKS +typedef struct qspinlock rqspinlock_t; +#else +typedef struct rqspinlock rqspinlock_t; +#endif + +extern int resilient_tas_spin_lock(rqspinlock_t *lock); +#ifdef CONFIG_QUEUED_SPINLOCKS +extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val); +#endif + +#ifndef resilient_virt_spin_lock_enabled +static __always_inline bool resilient_virt_spin_lock_enabled(void) +{ + return false; +} +#endif + +#ifndef resilient_virt_spin_lock +static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock) +{ + return 0; +} +#endif + +/* + * Default timeout for waiting loops is 0.25 seconds + */ +#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4) + +/* + * Choose 31 as it makes rqspinlock_held cacheline-aligned. + */ +#define RES_NR_HELD 31 + +struct rqspinlock_held { + int cnt; + void *locks[RES_NR_HELD]; +}; + +DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); + +static __always_inline void grab_held_lock_entry(void *lock) +{ + int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt); + + if (unlikely(cnt > RES_NR_HELD)) { + /* Still keep the inc so we decrement later. */ + return; + } + + /* + * Implied compiler barrier in per-CPU operations; otherwise we can have + * the compiler reorder inc with write to table, allowing interrupts to + * overwrite and erase our write to the table (as on interrupt exit it + * will be reset to NULL). + * + * It is fine for cnt inc to be reordered wrt remote readers though, + * they won't observe our entry until the cnt update is visible, that's + * all. + */ + this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock); +} + +/* + * We simply don't support out-of-order unlocks, and keep the logic simple here. + * The verifier prevents BPF programs from unlocking out-of-order, and the same + * holds for in-kernel users. + * + * It is possible to run into misdetection scenarios of AA deadlocks on the same + * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries + * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct + * logic to preserve right entries in the table would be to walk the array of + * held locks and swap and clear out-of-order entries, but that's too + * complicated and we don't have a compelling use case for out of order unlocking. + */ +static __always_inline void release_held_lock_entry(void) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + + if (unlikely(rqh->cnt > RES_NR_HELD)) + goto dec; + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); +dec: + /* + * Reordering of clearing above with inc and its write in + * grab_held_lock_entry that came before us (in same acquisition + * attempt) is ok, we either see a valid entry or NULL when it's + * visible. + * + * But this helper is invoked when we unwind upon failing to acquire the + * lock. Unlike the unlock path which constitutes a release store after + * we clear the entry, we need to emit a write barrier here. Otherwise, + * we may have a situation as follows: + * + * <error> for lock B + * release_held_lock_entry + * + * try_cmpxchg_acquire for lock A + * grab_held_lock_entry + * + * Lack of any ordering means reordering may occur such that dec, inc + * are done before entry is overwritten. This permits a remote lock + * holder of lock B (which this CPU failed to acquire) to now observe it + * as being attempted on this CPU, and may lead to misdetection (if this + * CPU holds a lock it is attempting to acquire, leading to false ABBA + * diagnosis). + * + * In case of unlock, we will always do a release on the lock word after + * releasing the entry, ensuring that other CPUs cannot hold the lock + * (and make conclusions about deadlocks) until the entry has been + * cleared on the local CPU, preventing any anomalies. Reordering is + * still possible there, but a remote CPU cannot observe a lock in our + * table which it is already holding, since visibility entails our + * release store for the said lock has not retired. + * + * In theory we don't have a problem if the dec and WRITE_ONCE above get + * reordered with each other, we either notice an empty NULL entry on + * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which + * cannot be observed (if dec precedes WRITE_ONCE). + * + * Emit the write barrier _before_ the dec, this permits dec-inc + * reordering but that is harmless as we'd have new entry set to NULL + * already, i.e. they cannot precede the NULL store above. + */ + smp_wmb(); + this_cpu_dec(rqspinlock_held_locks.cnt); +} + +#ifdef CONFIG_QUEUED_SPINLOCKS + +/** + * res_spin_lock - acquire a queued spinlock + * @lock: Pointer to queued spinlock structure + * + * Return: + * * 0 - Lock was acquired successfully. + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. + * * -ETIMEDOUT - Lock acquisition failed because of timeout. + */ +static __always_inline int res_spin_lock(rqspinlock_t *lock) +{ + int val = 0; + + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) { + grab_held_lock_entry(lock); + return 0; + } + return resilient_queued_spin_lock_slowpath(lock, val); +} + +#else + +#define res_spin_lock(lock) resilient_tas_spin_lock(lock) + +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +static __always_inline void res_spin_unlock(rqspinlock_t *lock) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + + if (unlikely(rqh->cnt > RES_NR_HELD)) + goto unlock; + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); +unlock: + /* + * Release barrier, ensures correct ordering. See release_held_lock_entry + * for details. Perform release store instead of queued_spin_unlock, + * since we use this function for test-and-set fallback as well. When we + * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword. + * + * Like release_held_lock_entry, we can do the release before the dec. + * We simply care about not seeing the 'lock' in our table from a remote + * CPU once the lock has been released, which doesn't rely on the dec. + * + * Unlike smp_wmb(), release is not a two way fence, hence it is + * possible for a inc to move up and reorder with our clearing of the + * entry. This isn't a problem however, as for a misdiagnosis of ABBA, + * the remote CPU needs to hold this lock, which won't be released until + * the store below is done, which would ensure the entry is overwritten + * to NULL, etc. + */ + smp_store_release(&lock->locked, 0); + this_cpu_dec(rqspinlock_held_locks.cnt); +} + +#ifdef CONFIG_QUEUED_SPINLOCKS +#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; }) +#else +#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; }) +#endif + +#define raw_res_spin_lock(lock) \ + ({ \ + int __ret; \ + preempt_disable(); \ + __ret = res_spin_lock(lock); \ + if (__ret) \ + preempt_enable(); \ + __ret; \ + }) + +#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); }) + +#define raw_res_spin_lock_irqsave(lock, flags) \ + ({ \ + int __ret; \ + local_irq_save(flags); \ + __ret = raw_res_spin_lock(lock); \ + if (__ret) \ + local_irq_restore(flags); \ + __ret; \ + }) + +#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); }) + +#endif /* __ASM_GENERIC_RQSPINLOCK_H */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 111bea4e507f..d67490dc3a2b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -30,6 +30,7 @@ #include <linux/static_call.h> #include <linux/memcontrol.h> #include <linux/cfi.h> +#include <asm/rqspinlock.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -204,6 +205,7 @@ enum btf_field_type { BPF_REFCOUNT = (1 << 9), BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), + BPF_RES_SPIN_LOCK = (1 << 12), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -239,6 +241,7 @@ struct btf_record { u32 cnt; u32 field_mask; int spin_lock_off; + int res_spin_lock_off; int timer_off; int wq_off; int refcount_off; @@ -314,6 +317,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) switch (type) { case BPF_SPIN_LOCK: return "bpf_spin_lock"; + case BPF_RES_SPIN_LOCK: + return "bpf_res_spin_lock"; case BPF_TIMER: return "bpf_timer"; case BPF_WORKQUEUE: @@ -346,6 +351,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) switch (type) { case BPF_SPIN_LOCK: return sizeof(struct bpf_spin_lock); + case BPF_RES_SPIN_LOCK: + return sizeof(struct bpf_res_spin_lock); case BPF_TIMER: return sizeof(struct bpf_timer); case BPF_WORKQUEUE: @@ -376,6 +383,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) switch (type) { case BPF_SPIN_LOCK: return __alignof__(struct bpf_spin_lock); + case BPF_RES_SPIN_LOCK: + return __alignof__(struct bpf_res_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); case BPF_WORKQUEUE: @@ -419,6 +428,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_RB_ROOT: /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_KPTR_UNREF: diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d6cfc4ee6820..9734544b6957 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,6 +115,14 @@ struct bpf_reg_state { int depth:30; } iter; + /* For irq stack slots */ + struct { + enum { + IRQ_NATIVE_KFUNC, + IRQ_LOCK_KFUNC, + } kfunc_class; + } irq; + /* Max size from any of the above. */ struct { unsigned long raw1; @@ -255,9 +263,12 @@ struct bpf_reference_state { * default to pointer reference on zero initialization of a state. */ enum ref_state_type { - REF_TYPE_PTR = 1, - REF_TYPE_IRQ = 2, - REF_TYPE_LOCK = 3, + REF_TYPE_PTR = (1 << 1), + REF_TYPE_IRQ = (1 << 2), + REF_TYPE_LOCK = (1 << 3), + REF_TYPE_RES_LOCK = (1 << 4), + REF_TYPE_RES_LOCK_IRQ = (1 << 5), + REF_TYPE_LOCK_MASK = REF_TYPE_LOCK | REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, } type; /* Track each reference created with a unique id, even if the same * instruction creates the reference multiple times (eg, via CALL). @@ -424,6 +435,8 @@ struct bpf_verifier_state { u32 active_locks; u32 active_preempt_locks; u32 active_irq_id; + u32 active_lock_id; + void *active_lock_ptr; bool active_rcu_lock; bool speculative; |