diff options
-rw-r--r-- | arch/powerpc/Kconfig | 3 | ||||
-rw-r--r-- | arch/powerpc/include/asm/qspinlock.h | 192 | ||||
-rw-r--r-- | arch/powerpc/include/asm/qspinlock_paravirt.h | 7 | ||||
-rw-r--r-- | arch/powerpc/include/asm/qspinlock_types.h | 72 | ||||
-rw-r--r-- | arch/powerpc/include/asm/spinlock.h | 2 | ||||
-rw-r--r-- | arch/powerpc/include/asm/spinlock_types.h | 2 | ||||
-rw-r--r-- | arch/powerpc/lib/Makefile | 4 | ||||
-rw-r--r-- | arch/powerpc/lib/qspinlock.c | 996 |
8 files changed, 1215 insertions, 63 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1a134c9769f8..fe2aa445b654 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -99,7 +99,7 @@ config LOCKDEP_SUPPORT config GENERIC_LOCKBREAK bool default y - depends on SMP && PREEMPTION + depends on SMP && PREEMPTION && !PPC_QUEUED_SPINLOCKS config GENERIC_HWEIGHT bool @@ -158,7 +158,6 @@ config PPC select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS - select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index b676c4fb90fd..28a53fb69b38 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -2,83 +2,173 @@ #ifndef _ASM_POWERPC_QSPINLOCK_H #define _ASM_POWERPC_QSPINLOCK_H -#include <asm-generic/qspinlock_types.h> +#include <linux/compiler.h> +#include <asm/qspinlock_types.h> #include <asm/paravirt.h> -#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */ +#ifdef CONFIG_PPC64 +/* + * Use the EH=1 hint for accesses that result in the lock being acquired. + * The hardware is supposed to optimise this pattern by holding the lock + * cacheline longer, and releasing when a store to the same memory (the + * unlock) is performed. + */ +#define _Q_SPIN_EH_HINT 1 +#else +#define _Q_SPIN_EH_HINT 0 +#endif -#ifdef CONFIG_PARAVIRT_SPINLOCKS -extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_queued_spin_unlock(struct qspinlock *lock); +/* + * The trylock itself may steal. This makes trylocks slightly stronger, and + * makes locks slightly more efficient when stealing. + * + * This is compile-time, so if true then there may always be stealers, so the + * nosteal paths become unused. + */ +#define _Q_SPIN_TRY_LOCK_STEAL 1 -static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) -{ - if (!is_shared_processor()) - native_queued_spin_lock_slowpath(lock, val); - else - __pv_queued_spin_lock_slowpath(lock, val); -} +/* + * Put a speculation barrier after testing the lock/node and finding it + * busy. Try to prevent pointless speculation in slow paths. + * + * Slows down the lockstorm microbenchmark with no stealing, where locking + * is purely FIFO through the queue. May have more benefit in real workload + * where speculating into the wrong place could have a greater cost. + */ +#define _Q_SPIN_SPEC_BARRIER 0 -#define queued_spin_unlock queued_spin_unlock -static inline void queued_spin_unlock(struct qspinlock *lock) -{ - if (!is_shared_processor()) - smp_store_release(&lock->locked, 0); - else - __pv_queued_spin_unlock(lock); -} +#ifdef CONFIG_PPC64 +/* + * Execute a miso instruction after passing the MCS lock ownership to the + * queue head. Miso is intended to make stores visible to other CPUs sooner. + * + * This seems to make the lockstorm microbenchmark nospin test go slightly + * faster on POWER10, but disable for now. + */ +#define _Q_SPIN_MISO 0 +#else +#define _Q_SPIN_MISO 0 +#endif +#ifdef CONFIG_PPC64 +/* + * This executes miso after an unlock of the lock word, having ownership + * pass to the next CPU sooner. This will slow the uncontended path to some + * degree. Not evidence it helps yet. + */ +#define _Q_SPIN_MISO_UNLOCK 0 #else -extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +#define _Q_SPIN_MISO_UNLOCK 0 #endif -static __always_inline void queued_spin_lock(struct qspinlock *lock) +/* + * Seems to slow down lockstorm microbenchmark, suspect queue node just + * has to become shared again right afterwards when its waiter spins on + * the lock field. + */ +#define _Q_SPIN_PREFETCH_NEXT 0 + +static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { - u32 val = 0; + return READ_ONCE(lock->val); +} - if (likely(arch_atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL))) - return; +static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) +{ + return !lock.val; +} - queued_spin_lock_slowpath(lock, val); +static __always_inline int queued_spin_is_contended(struct qspinlock *lock) +{ + return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); } -#define queued_spin_lock queued_spin_lock -#ifdef CONFIG_PARAVIRT_SPINLOCKS -#define SPIN_THRESHOLD (1<<15) /* not tuned */ +static __always_inline u32 queued_spin_encode_locked_val(void) +{ + /* XXX: make this use lock value in paca like simple spinlocks? */ + return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET); +} -static __always_inline void pv_wait(u8 *ptr, u8 val) +static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock *lock) { - if (*ptr != val) - return; - yield_to_any(); - /* - * We could pass in a CPU here if waiting in the queue and yield to - * the previous CPU in the queue. - */ + u32 new = queued_spin_encode_locked_val(); + u32 prev; + + /* Trylock succeeds only when unlocked and no queued nodes */ + asm volatile( +"1: lwarx %0,0,%1,%3 # __queued_spin_trylock_nosteal \n" +" cmpwi 0,%0,0 \n" +" bne- 2f \n" +" stwcx. %2,0,%1 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (new), + "i" (_Q_SPIN_EH_HINT) + : "cr0", "memory"); + + return likely(prev == 0); } -static __always_inline void pv_kick(int cpu) +static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock) { - prod_cpu(cpu); + u32 new = queued_spin_encode_locked_val(); + u32 prev, tmp; + + /* Trylock may get ahead of queued nodes if it finds unlocked */ + asm volatile( +"1: lwarx %0,0,%2,%5 # __queued_spin_trylock_steal \n" +" andc. %1,%0,%4 \n" +" bne- 2f \n" +" and %1,%0,%4 \n" +" or %1,%1,%3 \n" +" stwcx. %1,0,%2 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev), "=&r" (tmp) + : "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK), + "i" (_Q_SPIN_EH_HINT) + : "cr0", "memory"); + + return likely(!(prev & ~_Q_TAIL_CPU_MASK)); } -extern void __pv_init_lock_hash(void); +static __always_inline int queued_spin_trylock(struct qspinlock *lock) +{ + if (!_Q_SPIN_TRY_LOCK_STEAL) + return __queued_spin_trylock_nosteal(lock); + else + return __queued_spin_trylock_steal(lock); +} -static inline void pv_spinlocks_init(void) +void queued_spin_lock_slowpath(struct qspinlock *lock); + +static __always_inline void queued_spin_lock(struct qspinlock *lock) { - __pv_init_lock_hash(); + if (!queued_spin_trylock(lock)) + queued_spin_lock_slowpath(lock); } -#endif +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release(&lock->locked, 0); + if (_Q_SPIN_MISO_UNLOCK) + asm volatile("miso" ::: "memory"); +} -/* - * Queued spinlocks rely heavily on smp_cond_load_relaxed() to busy-wait, - * which was found to have performance problems if implemented with - * the preferred spin_begin()/spin_end() SMT priority pattern. Use the - * generic version instead. - */ +#define arch_spin_is_locked(l) queued_spin_is_locked(l) +#define arch_spin_is_contended(l) queued_spin_is_contended(l) +#define arch_spin_value_unlocked(l) queued_spin_value_unlocked(l) +#define arch_spin_lock(l) queued_spin_lock(l) +#define arch_spin_trylock(l) queued_spin_trylock(l) +#define arch_spin_unlock(l) queued_spin_unlock(l) -#include <asm-generic/qspinlock.h> +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void pv_spinlocks_init(void); +#else +static inline void pv_spinlocks_init(void) { } +#endif #endif /* _ASM_POWERPC_QSPINLOCK_H */ diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h deleted file mode 100644 index 6b60e7736a47..000000000000 --- a/arch/powerpc/include/asm/qspinlock_paravirt.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_POWERPC_QSPINLOCK_PARAVIRT_H -#define _ASM_POWERPC_QSPINLOCK_PARAVIRT_H - -EXPORT_SYMBOL(__pv_queued_spin_unlock); - -#endif /* _ASM_POWERPC_QSPINLOCK_PARAVIRT_H */ diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h new file mode 100644 index 000000000000..4766a7aa03cb --- /dev/null +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ASM_POWERPC_QSPINLOCK_TYPES_H +#define _ASM_POWERPC_QSPINLOCK_TYPES_H + +#include <linux/types.h> +#include <asm/byteorder.h> + +typedef struct qspinlock { + union { + u32 val; + +#ifdef __LITTLE_ENDIAN + struct { + u16 locked; + u8 reserved[2]; + }; +#else + struct { + u8 reserved[2]; + u16 locked; + }; +#endif + }; +} arch_spinlock_t; + +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } } + +/* + * Bitfields in the lock word: + * + * 0: locked bit + * 1-14: lock holder cpu + * 15: lock owner or queuer vcpus observed to be preempted bit + * 16: must queue bit + * 17-31: tail cpu (+1) + */ +#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ + << _Q_ ## type ## _OFFSET) +/* 0x00000001 */ +#define _Q_LOCKED_OFFSET 0 +#define _Q_LOCKED_BITS 1 +#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) + +/* 0x00007ffe */ +#define _Q_OWNER_CPU_OFFSET 1 +#define _Q_OWNER_CPU_BITS 14 +#define _Q_OWNER_CPU_MASK _Q_SET_MASK(OWNER_CPU) + +#if CONFIG_NR_CPUS > (1U << _Q_OWNER_CPU_BITS) +#error "qspinlock does not support such large CONFIG_NR_CPUS" +#endif + +/* 0x00008000 */ +#define _Q_SLEEPY_OFFSET 15 +#define _Q_SLEEPY_BITS 1 +#define _Q_SLEEPY_VAL (1U << _Q_SLEEPY_OFFSET) + +/* 0x00010000 */ +#define _Q_MUST_Q_OFFSET 16 +#define _Q_MUST_Q_BITS 1 +#define _Q_MUST_Q_VAL (1U << _Q_MUST_Q_OFFSET) + +/* 0xfffe0000 */ +#define _Q_TAIL_CPU_OFFSET 17 +#define _Q_TAIL_CPU_BITS 15 +#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) + +#if CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS) +#error "qspinlock does not support such large CONFIG_NR_CPUS" +#endif + +#endif /* _ASM_POWERPC_QSPINLOCK_TYPES_H */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index bd75872a6334..7dafca8e3f02 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -13,7 +13,7 @@ /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() -#ifndef CONFIG_PARAVIRT_SPINLOCKS +#ifndef CONFIG_PPC_QUEUED_SPINLOCKS static inline void pv_spinlocks_init(void) { } #endif diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index d5f8a74ed2e8..40b01446cf75 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -7,7 +7,7 @@ #endif #ifdef CONFIG_PPC_QUEUED_SPINLOCKS -#include <asm-generic/qspinlock_types.h> +#include <asm/qspinlock_types.h> #include <asm-generic/qrwlock_types.h> #else #include <asm/simple_spinlock_types.h> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 8560c912186d..4de71cbf6e8e 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -52,7 +52,9 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ memcpy_64.o copy_mc_64.o -ifndef CONFIG_PPC_QUEUED_SPINLOCKS +ifdef CONFIG_PPC_QUEUED_SPINLOCKS +obj-$(CONFIG_SMP) += qspinlock.o +else obj64-$(CONFIG_SMP) += locks.o endif diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c new file mode 100644 index 000000000000..1cf5d3e75250 --- /dev/null +++ b/arch/powerpc/lib/qspinlock.c @@ -0,0 +1,996 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/export.h> +#include <linux/percpu.h> +#include <linux/smp.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <asm/qspinlock.h> +#include <asm/paravirt.h> + +#define MAX_NODES 4 + +struct qnode { + struct qnode *next; + struct qspinlock *lock; + int cpu; + int yield_cpu; + u8 locked; /* 1 if lock acquired */ +}; + +struct qnodes { + int count; + struct qnode nodes[MAX_NODES]; +}; + +/* Tuning parameters */ +static int steal_spins __read_mostly = (1 << 5); +static int remote_steal_spins __read_mostly = (1 << 2); +#if _Q_SPIN_TRY_LOCK_STEAL == 1 +static const bool maybe_stealers = true; +#else +static bool maybe_stealers __read_mostly = true; +#endif +static int head_spins __read_mostly = (1 << 8); + +static bool pv_yield_owner __read_mostly = true; +static bool pv_yield_allow_steal __read_mostly = false; +static bool pv_spin_on_preempted_owner __read_mostly = false; +static bool pv_sleepy_lock __read_mostly = true; +static bool pv_sleepy_lock_sticky __read_mostly = false; +static u64 pv_sleepy_lock_interval_ns __read_mostly = 0; +static int pv_sleepy_lock_factor __read_mostly = 256; +static bool pv_yield_prev __read_mostly = true; +static bool pv_yield_propagate_owner __read_mostly = true; +static bool pv_prod_head __read_mostly = false; + +static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); +static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock); + +#if _Q_SPIN_SPEC_BARRIER == 1 +#define spec_barrier() do { asm volatile("ori 31,31,0" ::: "memory"); } while (0) +#else +#define spec_barrier() do { } while (0) +#endif + +static __always_inline bool recently_sleepy(void) +{ + /* pv_sleepy_lock is true when this is called */ + if (pv_sleepy_lock_interval_ns) { + u64 seen = this_cpu_read(sleepy_lock_seen_clock); + + if (seen) { + u64 delta = sched_clock() - seen; + if (delta < pv_sleepy_lock_interval_ns) + return true; + this_cpu_write(sleepy_lock_seen_clock, 0); + } + } + + return false; +} + +static __always_inline int get_steal_spins(bool paravirt, bool sleepy) +{ + if (paravirt && sleepy) + return steal_spins * pv_sleepy_lock_factor; + else + return steal_spins; +} + +static __always_inline int get_remote_steal_spins(bool paravirt, bool sleepy) +{ + if (paravirt && sleepy) + return remote_steal_spins * pv_sleepy_lock_factor; + else + return remote_steal_spins; +} + +static __always_inline int get_head_spins(bool paravirt, bool sleepy) +{ + if (paravirt && sleepy) + return head_spins * pv_sleepy_lock_factor; + else + return head_spins; +} + +static inline u32 encode_tail_cpu(int cpu) +{ + return (cpu + 1) << _Q_TAIL_CPU_OFFSET; +} + +static inline int decode_tail_cpu(u32 val) +{ + return (val >> _Q_TAIL_CPU_OFFSET) - 1; +} + +static inline int get_owner_cpu(u32 val) +{ + return (val & _Q_OWNER_CPU_MASK) >> _Q_OWNER_CPU_OFFSET; +} + +/* + * Try to acquire the lock if it was not already locked. If the tail matches + * mytail then clear it, otherwise leave it unchnaged. Return previous value. + * + * This is used by the head of the queue to acquire the lock and clean up + * its tail if it was the last one queued. + */ +static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail) +{ + u32 newval = queued_spin_encode_locked_val(); + u32 prev, tmp; + + asm volatile( +"1: lwarx %0,0,%2,%7 # trylock_clean_tail \n" + /* This test is necessary if there could be stealers */ +" andi. %1,%0,%5 \n" +" bne 3f \n" + /* Test whether the lock tail == mytail */ +" and %1,%0,%6 \n" +" cmpw 0,%1,%3 \n" + /* Merge the new locked value */ +" or %1,%1,%4 \n" +" bne 2f \n" + /* If the lock tail matched, then clear it, otherwise leave it. */ +" andc %1,%1,%6 \n" +"2: stwcx. %1,0,%2 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"3: \n" + : "=&r" (prev), "=&r" (tmp) + : "r" (&lock->val), "r"(tail), "r" (newval), + "i" (_Q_LOCKED_VAL), + "r" (_Q_TAIL_CPU_MASK), + "i" (_Q_SPIN_EH_HINT) + : "cr0", "memory"); + + return prev; +} + +/* + * Publish our tail, replacing previous tail. Return previous value. + * + * This provides a release barrier for publishing node, this pairs with the + * acquire barrier in get_tail_qnode() when the next CPU finds this tail + * value. + */ +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail) +{ + u32 prev, tmp; + + asm volatile( +"\t" PPC_RELEASE_BARRIER " \n" +"1: lwarx %0,0,%2 # publish_tail_cpu \n" +" andc %1,%0,%4 \n" +" or %1,%1,%3 \n" +" stwcx. %1,0,%2 \n" +" bne- 1b \n" + : "=&r" (prev), "=&r"(tmp) + : "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK) + : "cr0", "memory"); + + return prev; +} + +static __always_inline u32 set_mustq(struct qspinlock *lock) +{ + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1 # set_mustq \n" +" or %0,%0,%2 \n" +" stwcx. %0,0,%1 \n" +" bne- 1b \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (_Q_MUST_Q_VAL) + : "cr0", "memory"); + + return prev; +} + +static __always_inline u32 clear_mustq(struct qspinlock *lock) +{ + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1 # clear_mustq \n" +" andc %0,%0,%2 \n" +" stwcx. %0,0,%1 \n" +" bne- 1b \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (_Q_MUST_Q_VAL) + : "cr0", "memory"); + + return prev; +} + +static __always_inline bool try_set_sleepy(struct qspinlock *lock, u32 old) +{ + u32 prev; + u32 new = old | _Q_SLEEPY_VAL; + + BUG_ON(!(old & _Q_LOCKED_VAL)); + BUG_ON(old & _Q_SLEEPY_VAL); + + asm volatile( +"1: lwarx %0,0,%1 # try_set_sleepy \n" +" cmpw 0,%0,%2 \n" +" bne- 2f \n" +" stwcx. %3,0,%1 \n" +" bne- 1b \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r"(old), "r" (new) + : "cr0", "memory"); + + return likely(prev == old); +} + +static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val) +{ + if (pv_sleepy_lock) { + if (pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); + if (!(val & _Q_SLEEPY_VAL)) + try_set_sleepy(lock, val); + } +} + +static __always_inline void seen_sleepy_lock(void) +{ + if (pv_sleepy_lock && pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); +} + +static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val) +{ + if (pv_sleepy_lock) { + if (pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); + if (val & _Q_LOCKED_VAL) { + if (!(val & _Q_SLEEPY_VAL)) + try_set_sleepy(lock, val); + } + } +} + +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) +{ + int cpu = decode_tail_cpu(val); + struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu); + int idx; + + /* + * After publishing the new tail and finding a previous tail in the + * previous val (which is the control dependency), this barrier + * orders the release barrier in publish_tail_cpu performed by the + * last CPU, with subsequently looking at its qnode structures + * after the barrier. + */ + smp_acquire__after_ctrl_dep(); + + for (idx = 0; idx < MAX_NODES; idx++) { + struct qnode *qnode = &qnodesp->nodes[idx]; + if (qnode->lock == lock) + return qnode; + } + + BUG(); +} + +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq) +{ + int owner; + u32 yield_count; + bool preempted = false; + + BUG_ON(!(val & _Q_LOCKED_VAL)); + + if (!paravirt) + goto relax; + + if (!pv_yield_owner) + goto relax; + + owner = get_owner_cpu(val); + yield_count = yield_count_of(owner); + + if ((yield_count & 1) == 0) + goto relax; /* owner vcpu is running */ + + spin_end(); + + seen_sleepy_owner(lock, val); + preempted = true; + + /* + * Read the lock word after sampling the yield count. On the other side + * there may a wmb because the yield count update is done by the + * hypervisor preemption and the value update by the OS, however this + * ordering might reduce the chance of out of order accesses and + * improve the heuristic. + */ + smp_rmb(); + + if (READ_ONCE(lock->val) == val) { + if (mustq) + clear_mustq(lock); + yield_to_preempted(owner, yield_count); + if (mustq) + set_mustq(lock); + spin_begin(); + + /* Don't relax if we yielded. Maybe we should? */ + return preempted; + } + spin_begin(); +relax: + spin_cpu_relax(); + + return preempted; +} + +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +{ + return __yield_to_locked_owner(lock, val, paravirt, false); +} + +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +{ + bool mustq = false; + + if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal) + mustq = true; + + return __yield_to_locked_owner(lock, val, paravirt, mustq); +} + +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt) +{ + struct qnode *next; + int owner; + + if (!paravirt) + return; + if (!pv_yield_propagate_owner) + return; + + owner = get_owner_cpu(val); + if (*set_yield_cpu == owner) + return; + + next = READ_ONCE(node->next); + if (!next) + return; + + if (vcpu_is_preempted(owner)) { + next->yield_cpu = owner; + *set_yield_cpu = owner; + } else if (*set_yield_cpu != -1) { + next->yield_cpu = owner; + *set_yield_cpu = owner; + } +} + +/* Called inside spin_begin() */ +static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) +{ + int prev_cpu = decode_tail_cpu(val); + u32 yield_count; + int yield_cpu; + bool preempted = false; + + if (!paravirt) + goto relax; + + if (!pv_yield_propagate_owner) + goto yield_prev; + + yield_cpu = READ_ONCE(node->yield_cpu); + if (yield_cpu == -1) { + /* Propagate back the -1 CPU */ + if (node->next && node->next->yield_cpu != -1) + node->next->yield_cpu = yield_cpu; + goto yield_prev; + } + + yield_count = yield_count_of(yield_cpu); + if ((yield_count & 1) == 0) + goto yield_prev; /* owner vcpu is running */ + + spin_end(); + + preempted = true; + seen_sleepy_node(lock, val); + + smp_rmb(); + + if (yield_cpu == node->yield_cpu) { + if (node->next && node->next->yield_cpu != yield_cpu) + node->next->yield_cpu = yield_cpu; + yield_to_preempted(yield_cpu, yield_count); + spin_begin(); + return preempted; + } + spin_begin(); + +yield_prev: + if (!pv_yield_prev) + goto relax; + + yield_count = yield_count_of(prev_cpu); + if ((yield_count & 1) == 0) + goto relax; /* owner vcpu is running */ + + spin_end(); + + preempted = true; + seen_sleepy_node(lock, val); + + smp_rmb(); /* See __yield_to_locked_owner comment */ + + if (!node->locked) { + yield_to_preempted(prev_cpu, yield_count); + spin_begin(); + return preempted; + } + spin_begin(); + +relax: + spin_cpu_relax(); + + return preempted; +} + +static __always_inline bool steal_break(u32 val, int iters, bool paravirt, bool sleepy) +{ + if (iters >= get_steal_spins(paravirt, sleepy)) + return true; + + if (IS_ENABLED(CONFIG_NUMA) && + (iters >= get_remote_steal_spins(paravirt, sleepy))) { + int cpu = get_owner_cpu(val); + if (numa_node_id() != cpu_to_node(cpu)) + return true; + } + return false; +} + +static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt) +{ + bool seen_preempted = false; + bool sleepy = false; + int iters = 0; + u32 val; + + if (!steal_spins) { + /* XXX: should spin_on_preempted_owner do anything here? */ + return false; + } + + /* Attempt to steal the lock */ + spin_begin(); + do { + bool preempted = false; + + val = READ_ONCE(lock->val); + if (val & _Q_MUST_Q_VAL) + break; + spec_barrier(); + + if (unlikely(!(val & _Q_LOCKED_VAL))) { + spin_end(); + if (__queued_spin_trylock_steal(lock)) + return true; + spin_begin(); + } else { + preempted = yield_to_locked_owner(lock, val, paravirt); + } + + if (paravirt && pv_sleepy_lock) { + if (!sleepy) { + if (val & _Q_SLEEPY_VAL) { + seen_sleepy_lock(); + sleepy = true; + } else if (recently_sleepy()) { + sleepy = true; + } + } + if (pv_sleepy_lock_sticky && seen_preempted && + !(val & _Q_SLEEPY_VAL)) { + if (try_set_sleepy(lock, val)) + val |= _Q_SLEEPY_VAL; + } + } + + if (preempted) { + seen_preempted = true; + sleepy = true; + if (!pv_spin_on_preempted_owner) + iters++; + /* + * pv_spin_on_preempted_owner don't increase iters + * while the owner is preempted -- we won't interfere + * with it by definition. This could introduce some + * latency issue if we continually observe preempted + * owners, but hopefully that's a rare corner case of + * a badly oversubscribed system. + */ + } else { + iters++; + } + } while (!steal_break(val, iters, paravirt, sleepy)); + + spin_end(); + + return false; +} + +static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt) +{ + struct qnodes *qnodesp; + struct qnode *next, *node; + u32 val, old, tail; + bool seen_preempted = false; + bool sleepy = false; + bool mustq = false; + int idx; + int set_yield_cpu = -1; + int iters = 0; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + qnodesp = this_cpu_ptr(&qnodes); + if (unlikely(qnodesp->count >= MAX_NODES)) { + spec_barrier(); + while (!queued_spin_trylock(lock)) + cpu_relax(); + return; + } + + idx = qnodesp->count++; + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node = &qnodesp->nodes[idx]; + node->next = NULL; + node->lock = lock; + node->cpu = smp_processor_id(); + node->yield_cpu = -1; + node->locked = 0; + + tail = encode_tail_cpu(node->cpu); + + old = publish_tail_cpu(lock, tail); + + /* + * If there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_CPU_MASK) { + struct qnode *prev = get_tail_qnode(lock, old); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + /* Wait for mcs node lock to be released */ + spin_begin(); + while (!node->locked) { + spec_barrier(); + + if (yield_to_prev(lock, node, old, paravirt)) + seen_preempted = true; + } + spec_barrier(); + spin_end(); + + /* Clear out stale propagated yield_cpu */ + if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1) + node->yield_cpu = -1; + + smp_rmb(); /* acquire barrier for the mcs lock */ + + /* + * Generic qspinlocks have this prefetch here, but it seems + * like it could cause additional line transitions because + * the waiter will keep loading from it. + */ + if (_Q_SPIN_PREFETCH_NEXT) { + next = READ_ONCE(node->next); + if (next) + prefetchw(next); + } + } + + /* We're at the head of the waitqueue, wait for the lock. */ +again: + spin_begin(); + for (;;) { + bool preempted; + + val = READ_ONCE(lock->val); + if (!(val & _Q_LOCKED_VAL)) + break; + spec_barrier(); + + if (paravirt && pv_sleepy_lock && maybe_stealers) { + if (!sleepy) { + if (val & _Q_SLEEPY_VAL) { + seen_sleepy_lock(); + sleepy = true; + } else if (recently_sleepy()) { + sleepy = true; + } + } + if (pv_sleepy_lock_sticky && seen_preempted && + !(val & _Q_SLEEPY_VAL)) { + if (try_set_sleepy(lock, val)) + val |= _Q_SLEEPY_VAL; + } + } + + propagate_yield_cpu(node, val, &set_yield_cpu, paravirt); + preempted = yield_head_to_locked_owner(lock, val, paravirt); + if (!maybe_stealers) + continue; + + if (preempted) + seen_preempted = true; + + if (paravirt && preempted) { + sleepy = true; + + if (!pv_spin_on_preempted_owner) + iters++; + } else { + iters++; + } + + if (!mustq && iters >= get_head_spins(paravirt, sleepy)) { + mustq = true; + set_mustq(lock); + val |= _Q_MUST_Q_VAL; + } + } + spec_barrier(); + spin_end(); + + /* If we're the last queued, must clean up the tail. */ + old = trylock_clean_tail(lock, tail); + if (unlikely(old & _Q_LOCKED_VAL)) { + BUG_ON(!maybe_stealers); + goto again; /* Can only be true if maybe_stealers. */ + } + + if ((old & _Q_TAIL_CPU_MASK) == tail) + goto release; /* We were the tail, no next. */ + + /* There is a next, must wait for node->next != NULL (MCS protocol) */ + next = READ_ONCE(node->next); + if (!next) { + spin_begin(); + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + spin_end(); + } + spec_barrier(); + + /* + * Unlock the next mcs waiter node. Release barrier is not required + * here because the acquirer is only accessing the lock word, and + * the acquire barrier we took the lock with orders that update vs + * this store to locked. The corresponding barrier is the smp_rmb() + * acquire barrier for mcs lock, above. + */ + if (paravirt && pv_prod_head) { + int next_cpu = next->cpu; + WRITE_ONCE(next->locked, 1); + if (_Q_SPIN_MISO) + asm volatile("miso" ::: "memory"); + if (vcpu_is_preempted(next_cpu)) + prod_cpu(next_cpu); + } else { + WRITE_ONCE(next->locked, 1); + if (_Q_SPIN_MISO) + asm volatile("miso" ::: "memory"); + } + +release: + qnodesp->count--; /* release the node */ +} + +void queued_spin_lock_slowpath(struct qspinlock *lock) +{ + /* + * This looks funny, but it induces the compiler to inline both + * sides of the branch rather than share code as when the condition + * is passed as the paravirt argument to the functions. + */ + if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) { + if (try_to_steal_lock(lock, true)) { + spec_barrier(); + return; + } + queued_spin_lock_mcs_queue(lock, true); + } else { + if (try_to_steal_lock(lock, false)) { + spec_barrier(); + return; + } + queued_spin_lock_mcs_queue(lock, false); + } +} +EXPORT_SYMBOL(queued_spin_lock_slowpath); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void pv_spinlocks_init(void) +{ +} +#endif + +#include <linux/debugfs.h> +static int steal_spins_set(void *data, u64 val) +{ +#if _Q_SPIN_TRY_LOCK_STEAL == 1 + /* MAYBE_STEAL remains true */ + steal_spins = val; +#else + static DEFINE_MUTEX(lock); + + /* + * The lock slow path has a !maybe_stealers case that can assume + * the head of queue will not see concurrent waiters. That waiter + * is unsafe in the presence of stealers, so must keep them away + * from one another. + */ + + mutex_lock(&lock); + if (val && !steal_spins) { + maybe_stealers = true; + /* wait for queue head waiter to go away */ + synchronize_rcu(); + steal_spins = val; + } else if (!val && steal_spins) { + steal_spins = val; + /* wait for all possible stealers to go away */ + synchronize_rcu(); + maybe_stealers = false; + } else { + steal_spins = val; + } + mutex_unlock(&lock); +#endif + + return 0; +} + +static int steal_spins_get(void *data, u64 *val) +{ + *val = steal_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n"); + +static int remote_steal_spins_set(void *data, u64 val) +{ + remote_steal_spins = val; + + return 0; +} + +static int remote_steal_spins_get(void *data, u64 *val) +{ + *val = remote_steal_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_remote_steal_spins, remote_steal_spins_get, remote_steal_spins_set, "%llu\n"); + +static int head_spins_set(void *data, u64 val) +{ + head_spins = val; + + return 0; +} + +static int head_spins_get(void *data, u64 *val) +{ + *val = head_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, "%llu\n"); + +static int pv_yield_owner_set(void *data, u64 val) +{ + pv_yield_owner = !!val; + + return 0; +} + +static int pv_yield_owner_get(void *data, u64 *val) +{ + *val = pv_yield_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n"); + +static int pv_yield_allow_steal_set(void *data, u64 val) +{ + pv_yield_allow_steal = !!val; + + return 0; +} + +static int pv_yield_allow_steal_get(void *data, u64 *val) +{ + *val = pv_yield_allow_steal; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_allow_steal, pv_yield_allow_steal_get, pv_yield_allow_steal_set, "%llu\n"); + +static int pv_spin_on_preempted_owner_set(void *data, u64 val) +{ + pv_spin_on_preempted_owner = !!val; + + return 0; +} + +static int pv_spin_on_preempted_owner_get(void *data, u64 *val) +{ + *val = pv_spin_on_preempted_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n"); + +static int pv_sleepy_lock_set(void *data, u64 val) +{ + pv_sleepy_lock = !!val; + + return 0; +} + +static int pv_sleepy_lock_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock, pv_sleepy_lock_get, pv_sleepy_lock_set, "%llu\n"); + +static int pv_sleepy_lock_sticky_set(void *data, u64 val) +{ + pv_sleepy_lock_sticky = !!val; + + return 0; +} + +static int pv_sleepy_lock_sticky_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_sticky; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_sticky, pv_sleepy_lock_sticky_get, pv_sleepy_lock_sticky_set, "%llu\n"); + +static int pv_sleepy_lock_interval_ns_set(void *data, u64 val) +{ + pv_sleepy_lock_interval_ns = val; + + return 0; +} + +static int pv_sleepy_lock_interval_ns_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_interval_ns; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_interval_ns, pv_sleepy_lock_interval_ns_get, pv_sleepy_lock_interval_ns_set, "%llu\n"); + +static int pv_sleepy_lock_factor_set(void *data, u64 val) +{ + pv_sleepy_lock_factor = val; + + return 0; +} + +static int pv_sleepy_lock_factor_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_factor; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_factor, pv_sleepy_lock_factor_get, pv_sleepy_lock_factor_set, "%llu\n"); + +static int pv_yield_prev_set(void *data, u64 val) +{ + pv_yield_prev = !!val; + + return 0; +} + +static int pv_yield_prev_get(void *data, u64 *val) +{ + *val = pv_yield_prev; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n"); + +static int pv_yield_propagate_owner_set(void *data, u64 val) +{ + pv_yield_propagate_owner = !!val; + + return 0; +} + +static int pv_yield_propagate_owner_get(void *data, u64 *val) +{ + *val = pv_yield_propagate_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n"); + +static int pv_prod_head_set(void *data, u64 val) +{ + pv_prod_head = !!val; + + return 0; +} + +static int pv_prod_head_get(void *data, u64 *val) +{ + *val = pv_prod_head; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, pv_prod_head_set, "%llu\n"); + +static __init int spinlock_debugfs_init(void) +{ + debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); + debugfs_create_file("qspl_remote_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_remote_steal_spins); + debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins); + if (is_shared_processor()) { + debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); + debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal); + debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner); + debugfs_create_file("qspl_pv_sleepy_lock", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock); + debugfs_create_file("qspl_pv_sleepy_lock_sticky", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_sticky); + debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns); + debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor); + debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev); + debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner); + debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head); + } + + return 0; +} +device_initcall(spinlock_debugfs_init); |