author     Tejun Heo <tj@kernel.org>    2025-11-06 08:12:36 -1000
committer  Tejun Heo <tj@kernel.org>    2025-11-06 12:52:26 -1000
commit     9311e6c29b348b005e79228ef6facd38ebcc73f9 (patch)
tree       31cdf288b7857629bd4fa42ac216af01f2a87a31
parent     be04e96ba911fac1dc4c7f89ebb42018d167043f (diff)
cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
cgroup_task_dead() is called from finish_task_switch() which runs with
preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The
function needs to acquire css_set_lock which is a regular spinlock that can
sleep on RT kernels, leading to "sleeping function called from invalid
context" warnings.
css_set_lock is too large in scope to convert to a raw_spinlock. However,
the unlinking operations don't need to run synchronously - they just need
to complete after the task is done running.
On PREEMPT_RT, defer the work through irq_work. While the work doesn't need
to happen immediately, it can't be delayed indefinitely either as the dead
task pins the cgroup and task_struct can be pinned indefinitely. Use the
lazy version of irq_work to allow batching and lower impact while ensuring
timely completion.
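As a rough illustration of what the lazy variant changes (flush_fn and the two work items below are illustrative names, not from the patch): a regular irq_work raises the irq_work interrupt immediately, while a lazy one is normally run from the next timer tick, which is what lets several dead-task unlinks batch up.

```c
#include <linux/irq_work.h>

static void flush_fn(struct irq_work *work)
{
	/* runs later, asynchronously, from irq_work context */
}

/* Immediate: irq_work_queue() raises the irq_work interrupt right away. */
static struct irq_work immediate_work = IRQ_WORK_INIT(flush_fn);

/* Lazy: normally run from the next timer tick (an interrupt is only
 * raised if the tick is stopped), so queued items can batch cheaply. */
static struct irq_work lazy_work = IRQ_WORK_INIT_LAZY(flush_fn);
```

The patch below pairs this with a per-CPU llist, so the non-preemptible path only has to do an llist_add() plus irq_work_queue().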
v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add explanation
for why the work can't be delayed indefinitely (Sebastian Andrzej Siewior).
Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo <tj@kernel.org>
-rw-r--r--  include/linux/sched.h   |  5
-rw-r--r--  kernel/cgroup/cgroup.c  | 55
2 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866..5e80d48488ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1324,7 +1324,10 @@ struct task_struct {
 	struct css_set __rcu *cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
 	struct list_head cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+	struct llist_node cg_dead_lnode;
+#endif	/* CONFIG_PREEMPT_RT */
+#endif	/* CONFIG_CGROUPS */
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32 closid;
 	u32 rmid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index aae180d56c8c..48019a661c08 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_rt_init(void);
 
 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS	noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
 	BUG_ON(ss_rstat_init(NULL));
 
 	get_user_ns(init_cgroup_ns.user_ns);
+	cgroup_rt_init();
 
 	cgroup_lock();
 
@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }
 
-void cgroup_task_dead(struct task_struct *tsk)
+static void do_cgroup_task_dead(struct task_struct *tsk)
 {
 	struct css_set *cset;
 	unsigned long flags;
@@ -7016,6 +7018,57 @@ void cgroup_task_dead(struct task_struct *tsk)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this leads to sleeping-in-invalid-context warnings. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+	struct llist_node *lnode;
+	struct task_struct *task, *next;
+
+	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+		do_cgroup_task_dead(task);
+		put_task_struct(task);
+	}
+}
+
+static void __init cgroup_rt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+		per_cpu(cgrp_dead_tasks_iwork, cpu) =
+			IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+	}
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	get_task_struct(task);
+	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else	/* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	do_cgroup_task_dead(task);
+}
+#endif	/* CONFIG_PREEMPT_RT */
+
 void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;
