Merge tag 'kernel-6.15-rc1.tasklist_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull tasklist_lock optimizations from Christian Brauner: "According to the performance testbots this brings a 23% performance increase when creating new processes: - Reduce tasklist_lock hold time on exit: - Perform add_device_randomness() without tasklist_lock - Perform free_pid() calls outside of tasklist_lock - Drop irq disablement around pidmap_lock - Add some tasklist_lock asserts - Call flush_sigqueue() lockless by changing release_task() - Don't pointlessly clear TIF_SIGPENDING in __exit_signal() -> clear_tsk_thread_flag()" * tag 'kernel-6.15-rc1.tasklist_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: pid: drop irq disablement around pidmap_lock pid: perform free_pid() calls outside of tasklist_lock pid: sprinkle tasklist_lock asserts exit: hoist get_pid() in release_task() outside of tasklist_lock exit: perform add_device_randomness() without tasklist_lock exit: kill the pointless __exit_signal()->clear_tsk_thread_flag(TIF_SIGPENDING) exit: change the release_task() paths to call flush_sigqueue() lockless
author: Linus Torvalds <torvalds@linux-foundation.org> 2025-03-24 13:39:27 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2025-03-24 13:39:27 -0700
commit: b0cb56cbbdb4754918c28d6d7c294d56e28a3dd5 (patch)
tree: 0ea5d93e6fb3b90e4af055ded02376f851e8fc4f /kernel/exit.c
parent: 56e7a8b051512da8e7eb5c5a00dcc8e2065a89bb (diff)
parent: 0a7713ac0d98d02f2c69145754c93715ab07b307 (diff)
1 files changed, 35 insertions, 21 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 683766316a3d..c2e6c7b7779f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -123,14 +123,22 @@ static __init int kernel_exit_sysfs_init(void)
 late_initcall(kernel_exit_sysfs_init);
 #endif
 
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+	struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+			     bool group_dead)
 {
 	nr_threads--;
-	detach_pid(p, PIDTYPE_PID);
+	detach_pid(post->pids, p, PIDTYPE_PID);
 	if (group_dead) {
-		detach_pid(p, PIDTYPE_TGID);
-		detach_pid(p, PIDTYPE_PGID);
-		detach_pid(p, PIDTYPE_SID);
+		detach_pid(post->pids, p, PIDTYPE_TGID);
+		detach_pid(post->pids, p, PIDTYPE_PGID);
+		detach_pid(post->pids, p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
@@ -142,7 +150,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 /*
  * This function expects the tasklist_lock write-locked.
  */
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	bool group_dead = thread_group_leader(tsk);
@@ -175,9 +183,6 @@ static void __exit_signal(struct task_struct *tsk)
 			sig->curr_target = next_thread(tsk);
 	}
 
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
-			      sizeof(unsigned long long));
-
 	/*
 	 * Accumulate here the counters for all threads as they die. We could
 	 * skip the group leader because it is the last user of signal_struct,
@@ -198,23 +203,15 @@ static void __exit_signal(struct task_struct *tsk)
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
 	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
-	__unhash_process(tsk, group_dead);
+	__unhash_process(post, tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
 
-	/*
-	 * Do this under ->siglock, we can race with another thread
-	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
-	 */
-	flush_sigqueue(&tsk->pending);
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 
 	__cleanup_sighand(sighand);
-	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
-	if (group_dead) {
-		flush_sigqueue(&sig->shared_pending);
+	if (group_dead)
 		tty_kref_put(tty);
-	}
 }
 
 static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -240,10 +237,13 @@ void __weak release_thread(struct task_struct *dead_task)
 
 void release_task(struct task_struct *p)
 {
+	struct release_task_post post;
 	struct task_struct *leader;
 	struct pid *thread_pid;
 	int zap_leader;
 repeat:
+	memset(&post, 0, sizeof(post));
+
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
@@ -253,10 +253,11 @@ repeat:
 	pidfs_exit(p);
 	cgroup_release(p);
 
+	thread_pid = get_pid(p->thread_pid);
+
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	thread_pid = get_pid(p->thread_pid);
-	__exit_signal(p);
+	__exit_signal(&post, p);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -280,7 +281,20 @@ repeat:
 	write_unlock_irq(&tasklist_lock);
 	proc_flush_pid(thread_pid);
 	put_pid(thread_pid);
+	add_device_randomness(&p->se.sum_exec_runtime,
+			      sizeof(p->se.sum_exec_runtime));
+	free_pids(post.pids);
 	release_thread(p);
+	/*
+	 * This task was already removed from the process/thread/pid lists
+	 * and lock_task_sighand(p) can't succeed. Nobody else can touch
+	 * ->pending or, if group dead, signal->shared_pending. We can call
+	 * flush_sigqueue() lockless.
+	 */
+	flush_sigqueue(&p->pending);
+	if (thread_group_leader(p))
+		flush_sigqueue(&p->signal->shared_pending);
+
 	put_task_struct_rcu_user(p);
 
 	p = leader;
author	Linus Torvalds <torvalds@linux-foundation.org>	2025-03-24 13:39:27 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2025-03-24 13:39:27 -0700
commit	b0cb56cbbdb4754918c28d6d7c294d56e28a3dd5 (patch)
tree	0ea5d93e6fb3b90e4af055ded02376f851e8fc4f /kernel/exit.c
parent	56e7a8b051512da8e7eb5c5a00dcc8e2065a89bb (diff)
parent	0a7713ac0d98d02f2c69145754c93715ab07b307 (diff)