| author | Tejun Heo <tj@kernel.org> | 2025-10-28 20:19:17 -1000 |
|---|---|---|
| committer | Tejun Heo <tj@kernel.org> | 2025-11-03 11:46:18 -1000 |
| commit | d245698d727ab8f5420b3e28d1243f96a5234851 (patch) | |
| tree | df5b6a4cdf01bd423c9c42b871b323a2b0105884 | |
| parent | 260fbcb92bbeacfcd050410fdc2d24ab15044400 (diff) | |
cgroup: Defer task cgroup unlink until after the task is done switching out
When a task exits, css_set_move_task(tsk, cset, NULL, false) unlinks the task
from its cgroup. From the cgroup's perspective, the task is now gone. If this
makes the cgroup empty, it can be removed, triggering ->css_offline() callbacks
that notify controllers the cgroup is going offline resource-wise.
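As a rough sketch of the notification side (the subsystem and callback names below are made up for illustration; only the ->css_offline() member of struct cgroup_subsys is the real hook being referred to):

  /* Illustrative controller stub, assuming the <linux/cgroup-defs.h> types. */
  static void example_css_offline(struct cgroup_subsys_state *css)
  {
          /*
           * The controller is told this cgroup is going offline
           * resource-wise. Before this patch, an exiting task that had
           * just been unlinked from the cgroup could still be running
           * when this was invoked.
           */
  }

  struct cgroup_subsys example_cgrp_subsys = {
          .css_offline    = example_css_offline,
          /* other callbacks elided */
  };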
However, the exiting task can still run, perform memory operations, and schedule
until the final context switch in finish_task_switch(). This creates a confusing
situation where controllers are told a cgroup is offline while resource
activities are still happening in it. While this hasn't broken existing
controllers, it has caused direct confusion for sched_ext schedulers.
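Sketched as a timeline (simplified, not the literal call chain), the old ordering was roughly:

  /*
   * Old ordering (simplified):
   *
   *   do_exit()
   *     cgroup_task_exit(tsk)
   *       css_set_move_task(tsk, cset, NULL, false)  // task unlinked
   *       // cgroup may now look empty -> can be removed
   *       //   -> ->css_offline() tells controllers it is going offline
   *     ...
   *     // tsk keeps running: memory operations, scheduling, ...
   *   finish_task_switch()                           // final switch-out
   *     // only here is the task truly done running
   */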
Split cgroup_task_exit() into two functions. cgroup_task_exit() now only calls
the subsystem exit callbacks and continues to be called from do_exit(). The
css_set cleanup is moved to the new cgroup_task_dead(), which is called from
finish_task_switch() after the final context switch, so that the cgroup only
appears empty once the task is truly done running.
This also reorders operations so that subsys->exit() is now called before
unlinking from the cgroup, which shouldn't break anything.
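For comparison, the resulting ordering (again simplified) is:

  /*
   * New ordering (simplified):
   *
   *   do_exit()
   *     cgroup_task_exit(tsk)        // subsys->exit() callbacks only
   *     ...
   *     // task may still run, touch memory, schedule ...
   *   finish_task_switch(prev)       // after the final context switch
   *     prev->sched_class->task_dead(prev)
   *     cgroup_task_dead(prev)       // css_set unlink; the cgroup can
   *                                  // only look empty from this point
   */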
Cc: Dan Schatzberg <dschatzberg@meta.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
 include/linux/cgroup.h |  2 ++
 kernel/cgroup/cgroup.c | 23 ++++++++++++++---------
 kernel/sched/core.c    |  2 ++
 3 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4068035176c4..bc892e3b37ee 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
 extern void cgroup_post_fork(struct task_struct *p,
                              struct kernel_clone_args *kargs);
 void cgroup_task_exit(struct task_struct *p);
+void cgroup_task_dead(struct task_struct *p);
 void cgroup_task_release(struct task_struct *p);
 void cgroup_task_free(struct task_struct *p);
 
@@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
 static inline void cgroup_post_fork(struct task_struct *p,
                                     struct kernel_clone_args *kargs) {}
 static inline void cgroup_task_exit(struct task_struct *p) {}
+static inline void cgroup_task_dead(struct task_struct *p) {}
 static inline void cgroup_task_release(struct task_struct *p) {}
 static inline void cgroup_task_free(struct task_struct *p) {}
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b3c27900c5d2..aae180d56c8c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -944,7 +944,7 @@ static void css_set_move_task(struct task_struct *task,
                /*
                 * We are synchronized through cgroup_threadgroup_rwsem
                 * against PF_EXITING setting such that we can't race
-                * against cgroup_task_exit()/cgroup_task_free() dropping
+                * against cgroup_task_dead()/cgroup_task_free() dropping
                 * the css_set.
                 */
                WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -6982,10 +6982,20 @@ void cgroup_post_fork(struct task_struct *child,
 void cgroup_task_exit(struct task_struct *tsk)
 {
        struct cgroup_subsys *ss;
-       struct css_set *cset;
        int i;
 
-       spin_lock_irq(&css_set_lock);
+       /* see cgroup_post_fork() for details */
+       do_each_subsys_mask(ss, i, have_exit_callback) {
+               ss->exit(tsk);
+       } while_each_subsys_mask();
+}
+
+void cgroup_task_dead(struct task_struct *tsk)
+{
+       struct css_set *cset;
+       unsigned long flags;
+
+       spin_lock_irqsave(&css_set_lock, flags);
 
        WARN_ON_ONCE(list_empty(&tsk->cg_list));
        cset = task_css_set(tsk);
@@ -7003,12 +7013,7 @@ void cgroup_task_exit(struct task_struct *tsk)
                     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
                cgroup_update_frozen(task_dfl_cgroup(tsk));
 
-       spin_unlock_irq(&css_set_lock);
-
-       /* see cgroup_post_fork() for details */
-       do_each_subsys_mask(ss, i, have_exit_callback) {
-               ss->exit(tsk);
-       } while_each_subsys_mask();
+       spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
 void cgroup_task_release(struct task_struct *task)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1ebf67b48e2..40f12e37f60f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5222,6 +5222,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
        if (prev->sched_class->task_dead)
                prev->sched_class->task_dead(prev);
 
+       cgroup_task_dead(prev);
+
        /* Task is done with its stack. */
        put_task_stack(prev);
