| author | Alexei Starovoitov <ast@kernel.org> | 2025-11-29 09:35:36 -0800 |
|---|---|---|
| committer | Alexei Starovoitov <ast@kernel.org> | 2025-11-29 09:35:36 -0800 |
| commit | 34235a3544f20291819c20d1d6c4ba07784045a2 (patch) | |
| tree | 145058be55b4ecf4916b0ac3df96b49e3d78d9c6 /kernel/bpf | |
| parent | bd5bdd200c9e981cd5e2495966968cb26010573c (diff) | |
| parent | 3448375e71a49cc29cc62cc941bea137d723956e (diff) | |
Merge branch 'limited-queueing-in-nmi-for-rqspinlock'
Kumar Kartikeya Dwivedi says:
====================
Limited queueing in NMI for rqspinlock
Ritesh reported that he was frequently seeing timeouts in cases that
should have been covered by the AA heuristics. This led to the discovery
of multiple gaps in the current code that could cause timeouts in
situations the AA heuristics could have prevented. More details and the
full investigation are available in the original threads. [0][1]
This set restores the ability for NMI waiters to queue in the slow path,
and reduces the cases where they fall back to a trylock. However, such
queueing must not happen when the NMI interrupts waiters that it itself
depends upon for forward progress; in those cases the trylock fallback
remains, but limited to a single attempt to avoid aimlessly retrying the
lock.
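To make the resulting policy concrete, below is a small, self-contained userspace model of the queueing decision, condensed from the slow-path hunk further down. The helper may_queue(), Q_MAX_NODES, and the parameter names are illustrative stand-ins, not kernel APIs; aa_on_this_cpu stands in for the result of the check_deadlock_AA() call taken before queueing.

```c
#include <stdbool.h>
#include <stdio.h>

#define Q_MAX_NODES 4	/* mirrors _Q_MAX_NODES: per-CPU MCS nodes */

/*
 * Toy model of the "may this waiter queue?" decision: in_nmi stands in
 * for the kernel's in_nmi(), idx for the per-CPU MCS node index, and
 * aa_on_this_cpu for the AA check taken before queueing.
 */
static bool may_queue(bool in_nmi, int idx, bool aa_on_this_cpu)
{
	if (aa_on_this_cpu)
		return false;	/* this CPU already holds or is attempting the lock */
	if (idx >= Q_MAX_NODES)
		return false;	/* per-CPU MCS nodes exhausted */
	if (in_nmi && idx > 0)
		return false;	/* NMI sits above a waiter that owns an MCS node */
	return true;		/* safe to take an MCS node and queue */
}

int main(void)
{
	/* Task context, no nesting: queues as before. */
	printf("task, idx=0 -> queue=%d\n", may_queue(false, 0, false));
	/* NMI that did not interrupt a waiter on this CPU: may now queue. */
	printf("NMI,  idx=0 -> queue=%d\n", may_queue(true, 0, false));
	/* NMI on top of an active waiter: no queueing, a single trylock instead. */
	printf("NMI,  idx=1 -> queue=%d\n", may_queue(true, 1, false));
	return 0;
}
```

In the actual slow path the two "no queue" outcomes differ: an AA hit returns -EDEADLK outright, while node exhaustion or NMI-over-waiter nesting makes exactly one queued_spin_trylock() attempt and fails with -EDEADLK only if that attempt misses.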
It also closes a possible window in the lock fast path and the unlock
path where an NMI landing between the cmpxchg and the creation of the
held-lock entry, or between the deletion of the entry and the unlock,
would miss the detection of an AA scenario and end up timing out.
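The window can be illustrated with a minimal userspace model, assuming a single-word lock and a per-CPU table of held-lock entries. The table layout, aa_check(), and the exact "fixed" ordering shown are assumptions for illustration (the fast-path change itself lives outside this file; the slow-path comments below only say the entry is already held after a failed fast path), and grab_held_lock_entry() borrows the kernel function's name but not its implementation.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_HELD 16

/* Stand-in for the per-CPU held-locks table used by the AA heuristic. */
static _Thread_local void *held_entries[MAX_HELD];
static _Thread_local int nr_held;

static atomic_int lock_word;	/* 0 = unlocked, 1 = locked */

static void grab_held_lock_entry(void *lock) { held_entries[nr_held++] = lock; }
static void release_held_lock_entry(void)    { held_entries[--nr_held] = NULL; }

/* AA heuristic: is this lock already recorded on the current CPU? */
static bool aa_check(void *lock)
{
	for (int i = 0; i < nr_held; i++)
		if (held_entries[i] == lock)
			return true;
	return false;
}

/*
 * Old fast-path ordering: acquire first, record second. An NMI arriving
 * between the two steps sees the lock held but no entry, so aa_check()
 * misses the AA scenario and the NMI spins until it times out.
 */
static void lock_fastpath_old(void *lock)
{
	int expected = 0;

	if (atomic_compare_exchange_strong(&lock_word, &expected, 1)) {
		/* <-- NMI here: lock held, yet invisible to aa_check() */
		grab_held_lock_entry(lock);
	}
}

/*
 * Closing the window means recording the entry before the acquisition is
 * attempted, so any NMI taken after the cmpxchg can detect the AA case;
 * a failed cmpxchg hands the already-grabbed entry to the slow path.
 */
static void lock_fastpath_new(void *lock)
{
	int expected = 0;

	grab_held_lock_entry(lock);
	(void)atomic_compare_exchange_strong(&lock_word, &expected, 1);
}

/*
 * The unlock path has the mirror-image window: dropping the entry before
 * the releasing store leaves a span where the lock is still held but no
 * longer visible to aa_check().
 */
static void unlock_old(void)
{
	release_held_lock_entry();
	/* <-- NMI here: lock_word is still 1, aa_check() returns false */
	atomic_store(&lock_word, 0);
}

int main(void)
{
	static int lock;

	lock_fastpath_new(&lock);
	printf("new ordering, AA visible after acquire: %d\n", aa_check(&lock));
	unlock_old();

	lock_fastpath_old(&lock);
	printf("old ordering, AA visible after acquire: %d\n", aa_check(&lock));
	return 0;
}
```

Single-threaded, both orderings print the same result; the difference only matters for an NMI that fires between the two marked steps, which is exactly the window the series closes.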
This virtually eliminates timeouts in all the cases where the existing
heuristics can detect the deadlock and recover from it quickly. More
details are available in the commit logs for each patch.
[0]: https://lore.kernel.org/bpf/CAH6OuBTjG+N=+GGwcpOUbeDN563oz4iVcU3rbse68egp9wj9_A@mail.gmail.com
[1]: https://lore.kernel.org/bpf/20251125203253.3287019-1-memxor@gmail.com
====================
Link: https://patch.msgid.link/20251128232802.1031906-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/rqspinlock.c | 69 |
1 file changed, 33 insertions, 36 deletions
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 3cc23d79a9fc..f7d0c8d4644e 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -196,32 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
         return 0;
 }
 
-static noinline int check_deadlock(rqspinlock_t *lock, u32 mask)
-{
-        int ret;
-
-        ret = check_deadlock_AA(lock);
-        if (ret)
-                return ret;
-        ret = check_deadlock_ABBA(lock, mask);
-        if (ret)
-                return ret;
-
-        return 0;
-}
-
 static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
                                   struct rqspinlock_timeout *ts)
 {
-        u64 time = ktime_get_mono_fast_ns();
         u64 prev = ts->cur;
+        u64 time;
 
         if (!ts->timeout_end) {
-                ts->cur = time;
-                ts->timeout_end = time + ts->duration;
+                if (check_deadlock_AA(lock))
+                        return -EDEADLK;
+                ts->cur = ktime_get_mono_fast_ns();
+                ts->timeout_end = ts->cur + ts->duration;
                 return 0;
         }
 
+        time = ktime_get_mono_fast_ns();
         if (time > ts->timeout_end)
                 return -ETIMEDOUT;
 
@@ -231,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
          */
         if (prev + NSEC_PER_MSEC < time) {
                 ts->cur = time;
-                return check_deadlock(lock, mask);
+                return check_deadlock_ABBA(lock, mask);
         }
 
         return 0;
@@ -275,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
         int val, ret = 0;
 
         RES_INIT_TIMEOUT(ts);
+        /*
+         * The fast path is not invoked for the TAS fallback, so we must grab
+         * the deadlock detection entry here.
+         */
         grab_held_lock_entry(lock);
 
         /*
@@ -397,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
                 goto queue;
         }
 
-        /*
-         * Grab an entry in the held locks array, to enable deadlock detection.
-         */
-        grab_held_lock_entry(lock);
+        /* Deadlock detection entry already held after failing fast path. */
 
         /*
          * We're pending, wait for the owner to go away.
@@ -447,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
          * queuing.
          */
 queue:
-        lockevent_inc(lock_slowpath);
         /*
-         * Grab deadlock detection entry for the queue path.
+         * Do not queue if we're a waiter and someone is attempting this lock on
+         * the same CPU. In case of NMIs, this prevents long timeouts where we
+         * interrupt the pending waiter, and the owner, that will eventually
+         * signal the head of our queue, both of which are logically but not
+         * physically part of the queue, hence outside the scope of the idx > 0
+         * check above for the trylock fallback.
          */
-        grab_held_lock_entry(lock);
+        if (check_deadlock_AA(lock)) {
+                ret = -EDEADLK;
+                goto err_release_entry;
+        }
 
+        lockevent_inc(lock_slowpath);
+        /* Deadlock detection entry already held after failing fast path. */
         node = this_cpu_ptr(&rqnodes[0].mcs);
         idx = node->count++;
         tail = encode_tail(smp_processor_id(), idx);
@@ -464,19 +463,17 @@ queue:
          * not be nested NMIs taking spinlocks. That may not be true in
          * some architectures even though the chance of needing more than
          * 4 nodes will still be extremely unlikely. When that happens,
-         * we fall back to spinning on the lock directly without using
-         * any MCS node. This is not the most elegant solution, but is
-         * simple enough.
+         * we fall back to attempting a trylock operation without using
+         * any MCS node. Unlike qspinlock which cannot fail, we have the
+         * option of failing the slow path, and under contention, such a
+         * trylock spinning will likely be treated unfairly due to lack of
+         * queueing, hence do not spin.
          */
-        if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
+        if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
                 lockevent_inc(lock_no_node);
-                RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
-                while (!queued_spin_trylock(lock)) {
-                        if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
-                                lockevent_inc(rqspinlock_lock_timeout);
-                                goto err_release_node;
-                        }
-                        cpu_relax();
+                if (!queued_spin_trylock(lock)) {
+                        ret = -EDEADLK;
+                        goto err_release_node;
                 }
                 goto release;
         }
