Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup/cgroup.c | 44
-rw-r--r--  kernel/sched/core.c    |  2
-rw-r--r--  kernel/sched/ext.c     |  6
3 files changed, 39 insertions, 13 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb..77d02f87f3f1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -126,8 +126,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
  * of concurrent destructions. Use a separate workqueue so that cgroup
  * destruction work items don't end up filling up max_active of system_wq
  * which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ *    which can never complete as it's behind in the same queue and
+ *    workqueue's max_active is 1.
  */
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
 
 /* generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -4159,6 +4182,7 @@ static void cgroup_file_release(struct kernfs_open_file *of)
                 cft->release(of);
         put_cgroup_ns(ctx->ns);
         kfree(ctx);
+        of->priv = NULL;
 }
 
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
@@ -5558,7 +5582,7 @@ static void css_release_work_fn(struct work_struct *work)
         cgroup_unlock();
 
         INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
-        queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+        queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
 }
 
 static void css_release(struct percpu_ref *ref)
@@ -5567,7 +5591,7 @@ static void css_release(struct percpu_ref *ref)
                 container_of(ref, struct cgroup_subsys_state, refcnt);
 
         INIT_WORK(&css->destroy_work, css_release_work_fn);
-        queue_work(cgroup_destroy_wq, &css->destroy_work);
+        queue_work(cgroup_release_wq, &css->destroy_work);
 }
 
 static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5701,7 +5725,7 @@ err_list_del:
         list_del_rcu(&css->sibling);
 err_free_css:
         INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
-        queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+        queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
         return ERR_PTR(err);
 }
 
@@ -5939,7 +5963,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 
         if (atomic_dec_and_test(&css->online_cnt)) {
                 INIT_WORK(&css->destroy_work, css_killed_work_fn);
-                queue_work(cgroup_destroy_wq, &css->destroy_work);
+                queue_work(cgroup_offline_wq, &css->destroy_work);
         }
 }
 
@@ -6325,8 +6349,14 @@ static int __init cgroup_wq_init(void)
          * We would prefer to do this in cgroup_init() above, but that
          * is called before init_workqueues(): so leave this until after.
          */
-        cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
-        BUG_ON(!cgroup_destroy_wq);
+        cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
+        BUG_ON(!cgroup_offline_wq);
+
+        cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
+        BUG_ON(!cgroup_release_wq);
+
+        cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
+        BUG_ON(!cgroup_free_wq);
         return 0;
 }
 core_initcall(cgroup_wq_init);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4..ccba6fc3c3fe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9551,7 +9551,7 @@ static unsigned long tg_weight(struct task_group *tg)
 #ifdef CONFIG_FAIR_GROUP_SCHED
         return scale_load_down(tg->shares);
 #else
-        return sched_weight_from_cgroup(tg->scx_weight);
+        return sched_weight_from_cgroup(tg->scx.weight);
 #endif
 }
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4ae32ef179dd..088ceff38c8a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6788,12 +6788,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
                  * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
                  * the current local DSQ for running tasks and thus are not
                  * visible to the BPF scheduler.
-                 *
-                 * Also skip re-enqueueing tasks that can only run on this
-                 * CPU, as they would just be re-added to the same local
-                 * DSQ without any benefit.
                  */
-                if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
+                if (p->migration_pending)
                         continue;
 
                 dispatch_dequeue(rq, p);
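For readers outside the cgroup code, below is a minimal kernel-module-style sketch of the dependency shape the new comment block describes: a "free" work item that must wait for an "offline" item queued after it. Everything here (the demo_* names, offline_fn, free_fn) is hypothetical illustration, not part of the patch; only the workqueue and completion APIs are the real kernel ones. With both items on a single max_active=1 workqueue, the wait in free_fn would never return, which is the deadlock the patch avoids by giving each destruction stage its own queue.

/* Hypothetical illustration only -- not part of the patch above. */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/completion.h>

static struct workqueue_struct *demo_offline_wq;
static struct workqueue_struct *demo_free_wq;

static DECLARE_COMPLETION(offline_done);

/* Stands in for a css offline work item (the perf_event CSS in the example). */
static void offline_fn(struct work_struct *work)
{
        complete(&offline_done);
}
static DECLARE_WORK(offline_work, offline_fn);

/*
 * Stands in for the root-destruction path, which must wait for offline work
 * to finish (cgroup_lock_and_drain_offline() in the real code).
 */
static void free_fn(struct work_struct *work)
{
        /*
         * This wait is safe only because offline_work runs on a different
         * workqueue.  If both items shared one max_active=1 workqueue with
         * free_fn queued first, offline_fn would sit behind us forever:
         * the single worker slot is occupied by free_fn itself.
         */
        wait_for_completion(&offline_done);
        pr_info("free work finished after offline work\n");
}
static DECLARE_WORK(free_work, free_fn);

static int __init demo_init(void)
{
        /* max_active = 1 on each queue, mirroring cgroup_wq_init(). */
        demo_offline_wq = alloc_workqueue("demo_offline", 0, 1);
        demo_free_wq = alloc_workqueue("demo_free", 0, 1);
        if (!demo_offline_wq || !demo_free_wq)
                goto err;

        /* Queue the waiting item first, as in the umount race above. */
        queue_work(demo_free_wq, &free_work);
        queue_work(demo_offline_wq, &offline_work);
        return 0;
err:
        if (demo_offline_wq)
                destroy_workqueue(demo_offline_wq);
        if (demo_free_wq)
                destroy_workqueue(demo_free_wq);
        return -ENOMEM;
}

static void __exit demo_exit(void)
{
        /* destroy_workqueue() drains pending work before tearing down. */
        destroy_workqueue(demo_free_wq);
        destroy_workqueue(demo_offline_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The patch applies the same separation at a larger scale: each destruction stage (offline, release, free) keeps max_active = 1 but gets its own queue, so a stage that waits can never be queued in front of the work it is waiting for.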
