Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--  kernel/sched/ext.c  163
1 file changed, 53 insertions, 110 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ecb251e883ea..6827689a0966 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root;
* guarantee system safety. Maintain a dedicated task list which contains every
* task between its fork and eventual free.
*/
-static DEFINE_SPINLOCK(scx_tasks_lock);
+static DEFINE_RAW_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
@@ -476,7 +476,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
@@ -507,14 +507,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter)
__scx_task_iter_rq_unlock(iter);
if (iter->list_locked) {
iter->list_locked = false;
- spin_unlock_irq(&scx_tasks_lock);
+ raw_spin_unlock_irq(&scx_tasks_lock);
}
}
static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
if (!iter->list_locked) {
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
iter->list_locked = true;
}
}
@@ -1474,7 +1474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
static void yield_task_scx(struct rq *rq)
{
struct scx_sched *sch = scx_root;
- struct task_struct *p = rq->curr;
+ struct task_struct *p = rq->donor;
if (SCX_HAS_OP(sch, yield))
SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
@@ -1485,7 +1485,7 @@ static void yield_task_scx(struct rq *rq)
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
struct scx_sched *sch = scx_root;
- struct task_struct *from = rq->curr;
+ struct task_struct *from = rq->donor;
if (SCX_HAS_OP(sch, yield))
return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
@@ -2047,7 +2047,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
+ rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
@@ -2153,42 +2153,6 @@ has_tasks:
return true;
}
-static int balance_scx(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
-{
- int ret;
-
- rq_unpin_lock(rq, rf);
-
- ret = balance_one(rq, prev);
-
-#ifdef CONFIG_SCHED_SMT
- /*
- * When core-sched is enabled, this ops.balance() call will be followed
- * by pick_task_scx() on this CPU and the SMT siblings. Balance the
- * siblings too.
- */
- if (sched_core_enabled(rq)) {
- const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
- int scpu;
-
- for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
- struct rq *srq = cpu_rq(scpu);
- struct task_struct *sprev = srq->curr;
-
- WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
- update_rq_clock(srq);
- balance_one(srq, sprev);
- }
- }
-#endif
- rq_repin_lock(rq, rf);
-
- maybe_queue_balance_callback(rq);
-
- return ret;
-}
-
static void process_ddsp_deferred_locals(struct rq *rq)
{
struct task_struct *p;
@@ -2368,41 +2332,23 @@ static struct task_struct *first_local_task(struct rq *rq)
struct task_struct, scx.dsq_list.node);
}
-static struct task_struct *pick_task_scx(struct rq *rq)
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *prev = rq->curr;
+ bool keep_prev, kick_idle = false;
struct task_struct *p;
- bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
- bool kick_idle = false;
- /*
- * WORKAROUND:
- *
- * %SCX_RQ_BAL_KEEP should be set iff @prev is on SCX as it must just
- * have gone through balance_scx(). Unfortunately, there currently is a
- * bug where fair could say yes on balance() but no on pick_task(),
- * which then ends up calling pick_task_scx() without preceding
- * balance_scx().
- *
- * Keep running @prev if possible and avoid stalling from entering idle
- * without balancing.
- *
- * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
- * if pick_task_scx() is called without preceding balance_scx().
- */
- if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
- if (prev->scx.flags & SCX_TASK_QUEUED) {
- keep_prev = true;
- } else {
- keep_prev = false;
- kick_idle = true;
- }
- } else if (unlikely(keep_prev &&
- prev->sched_class != &ext_sched_class)) {
- /*
- * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
- * conditional on scx_enabled() and may have been skipped.
- */
+ rq_modified_clear(rq);
+ rq_unpin_lock(rq, rf);
+ balance_one(rq, prev);
+ rq_repin_lock(rq, rf);
+ maybe_queue_balance_callback(rq);
+ if (rq_modified_above(rq, &ext_sched_class))
+ return RETRY_TASK;
+
+ keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ if (unlikely(keep_prev &&
+ prev->sched_class != &ext_sched_class)) {
WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -2940,9 +2886,9 @@ void scx_post_fork(struct task_struct *p)
}
}
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
list_add_tail(&p->scx.tasks_node, &scx_tasks);
- spin_unlock_irq(&scx_tasks_lock);
+ raw_spin_unlock_irq(&scx_tasks_lock);
percpu_up_read(&scx_fork_rwsem);
}
@@ -2966,9 +2912,9 @@ void sched_ext_free(struct task_struct *p)
{
unsigned long flags;
- spin_lock_irqsave(&scx_tasks_lock, flags);
+ raw_spin_lock_irqsave(&scx_tasks_lock, flags);
list_del_init(&p->scx.tasks_node);
- spin_unlock_irqrestore(&scx_tasks_lock, flags);
+ raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
* @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
@@ -2997,7 +2943,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p, p->scx.weight);
}
-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}
@@ -3270,6 +3216,8 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
+ .queue_mask = 1,
+
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
@@ -3277,7 +3225,6 @@ DEFINE_SCHED_CLASS(ext) = {
.wakeup_preempt = wakeup_preempt_scx,
- .balance = balance_scx,
.pick_task = pick_task_scx,
.put_prev_task = put_prev_task_scx,
@@ -3818,11 +3765,10 @@ static void scx_bypass(bool bypass)
*/
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) {
- struct sched_enq_and_set_ctx ctx;
-
/* cycling deq/enq is enough, see the function comment */
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /* nothing */ ;
+ }
}
/* resched to restore ticks and idle state */
@@ -3972,22 +3918,20 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ update_rq_clock(task_rq(p));
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
-
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, queue_flags) {
+ p->sched_class = new_class;
+ }
- check_class_changed(task_rq(p), p, old_class, p->prio);
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4276,7 +4220,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
size_t avail, used;
bool idle;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
idle = list_empty(&rq->scx.runnable_list) &&
rq->curr->sched_class == &idle_sched_class;
@@ -4345,7 +4289,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
scx_dump_task(&s, &dctx, p, ' ');
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
}
dump_newline(&s);
@@ -4479,8 +4423,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
goto err_free_gdsqs;
sch->helper = kthread_run_worker(0, "sched_ext_helper");
- if (!sch->helper)
+ if (IS_ERR(sch->helper)) {
+ ret = PTR_ERR(sch->helper);
goto err_free_pcpu;
+ }
+
sched_set_fifo(sch->helper->task);
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
@@ -4748,26 +4695,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
if (!tryget_task_struct(p))
continue;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-
- p->scx.slice = SCX_SLICE_DFL;
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, queue_flags) {
+ p->scx.slice = SCX_SLICE_DFL;
+ p->sched_class = new_class;
+ }
- check_class_changed(task_rq(p), p, old_class, p->prio);
put_task_struct(p);
}
scx_task_iter_stop(&sti);
@@ -5321,8 +5264,8 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
- init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
- init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+ rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
+ rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
if (cpu_online(cpu))
cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
@@ -6401,7 +6344,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
guard(rcu)();
- sch = rcu_dereference(sch);
+ sch = rcu_dereference(scx_root);
if (unlikely(!sch))
return;