Diffstat (limited to 'kernel/cgroup/cgroup.c')
-rw-r--r--  kernel/cgroup/cgroup.c  91
1 file changed, 76 insertions(+), 15 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ae1eb7a85eb4..fa08ea288737 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -60,6 +60,7 @@
#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <linux/nstree.h>
+#include <linux/irq_work.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -287,6 +288,7 @@ static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+static void cgroup_rt_init(void);
#ifdef CONFIG_DEBUG_CGROUP_REF
#define CGROUP_REF_FN_ATTRS noinline
@@ -941,7 +943,8 @@ static void css_set_move_task(struct task_struct *task,
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
- * against cgroup_exit()/cgroup_free() dropping the css_set.
+ * against cgroup_task_dead()/cgroup_task_free() dropping
+ * the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -6354,6 +6357,7 @@ int __init cgroup_init(void)
BUG_ON(ss_rstat_init(NULL));
get_user_ns(init_cgroup_ns.user_ns);
+ cgroup_rt_init();
cgroup_lock();
@@ -6967,19 +6971,29 @@ void cgroup_post_fork(struct task_struct *child,
}
/**
- * cgroup_exit - detach cgroup from exiting task
+ * cgroup_task_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk.
*
*/
-void cgroup_exit(struct task_struct *tsk)
+void cgroup_task_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
- struct css_set *cset;
int i;
- spin_lock_irq(&css_set_lock);
+ /* see cgroup_post_fork() for details */
+ do_each_subsys_mask(ss, i, have_exit_callback) {
+ ss->exit(tsk);
+ } while_each_subsys_mask();
+}
+
+static void do_cgroup_task_dead(struct task_struct *tsk)
+{
+ struct css_set *cset;
+ unsigned long flags;
+
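+	/* can be called from irq_work context with IRQs already disabled */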
+ spin_lock_irqsave(&css_set_lock, flags);
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
@@ -6997,15 +7011,61 @@ void cgroup_exit(struct task_struct *tsk)
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
- /* see cgroup_post_fork() for details */
- do_each_subsys_mask(ss, i, have_exit_callback) {
- ss->exit(tsk);
- } while_each_subsys_mask();
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch(), which doesn't allow
+ * scheduling even on RT. As the task_dead path requires grabbing css_set_lock,
+ * this leads to sleep-in-invalid-context warnings. css_set_lock is too big to
+ * become a raw_spinlock. The task_dead path doesn't need to run synchronously,
+ * but it can't be delayed indefinitely either, as the dead task pins the
+ * cgroup and the task_struct would stay pinned along with it. Bounce through
+ * lazy irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+ struct llist_node *lnode;
+ struct task_struct *task, *next;
+
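+	/* atomically take over everything queued on this CPU so far */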
+ lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+ llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+ do_cgroup_task_dead(task);
+ put_task_struct(task);
+ }
+}
+
+static void __init cgroup_rt_init(void)
+{
+ int cpu;
+
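+	/* a LAZY irq_work runs from the next timer tick instead of forcing an IPI */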
+ for_each_possible_cpu(cpu) {
+ init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+ per_cpu(cgrp_dead_tasks_iwork, cpu) =
+ IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+ }
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
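+	/* hold a ref so the task_struct survives until the irq_work runs */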
+ get_task_struct(task);
+ llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+ irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
-void cgroup_release(struct task_struct *task)
+void cgroup_task_dead(struct task_struct *task)
+{
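+	/* !RT: spinlocks are fine in finish_task_switch(), run inline */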
+ do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+void cgroup_task_release(struct task_struct *task)
{
struct cgroup_subsys *ss;
int ssid;
@@ -7013,6 +7073,11 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
+}
+
+void cgroup_task_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
if (!list_empty(&task->cg_list)) {
spin_lock_irq(&css_set_lock);
@@ -7020,11 +7085,7 @@ void cgroup_release(struct task_struct *task)
list_del_init(&task->cg_list);
spin_unlock_irq(&css_set_lock);
}
-}
-void cgroup_free(struct task_struct *task)
-{
- struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
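
For context, this change splits the old cgroup_exit()/cgroup_release()/cgroup_free() hooks across the stages of task teardown. A rough sketch of how the call sites outside this file would be expected to line up (hypothetical sketch: the kernel/exit.c and kernel/fork.c hunks are not part of this diff, and the variable names are illustrative):

	/* do_exit(): the task is exiting but still running */
	cgroup_task_exit(tsk);		/* subsys ->exit() callbacks */

	/* finish_task_switch(): after the dead task's final schedule;
	 * sleeping is not allowed here, hence the irq_work bounce on RT */
	cgroup_task_dead(prev);		/* unlink the task from its css_set */

	/* release_task(): the task has been reaped */
	cgroup_task_release(p);		/* subsys ->release() callbacks */

	/* free_task(): the last task_struct reference is dropped */
	cgroup_task_free(tsk);		/* final put_css_set() */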