summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/autogroup.c4
-rw-r--r--kernel/sched/core.c838
-rw-r--r--kernel/sched/deadline.c54
-rw-r--r--kernel/sched/ext.c1067
-rw-r--r--kernel/sched/ext_idle.c43
-rw-r--r--kernel/sched/ext_internal.h29
-rw-r--r--kernel/sched/idle.c12
-rw-r--r--kernel/sched/isolation.c23
-rw-r--r--kernel/sched/membarrier.c8
-rw-r--r--kernel/sched/sched.h396
-rw-r--r--kernel/sched/syscalls.c13
11 files changed, 1530 insertions, 957 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index cdea931aae30..954137775f38 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
* this process can already run with task_group() == prev->tg or we can
* race with cgroup code which can read autogroup = prev under rq->lock.
* In the latter case for_each_thread() can not miss a migrating thread,
- * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
- * can't be removed from thread list, we hold ->siglock.
+ * cpu_cgroup_attach() must not be possible after cgroup_task_exit()
+ * and it can't be removed from thread list, we hold ->siglock.
*
* If an exiting thread was already removed from thread list we rely on
* sched_autogroup_exit_task().
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0c4ff93eeb78..b7801cd05d5a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2131,8 +2131,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
- if (flags & ENQUEUE_MIGRATED)
- sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
@@ -2643,6 +2641,8 @@ out_unlock:
return 0;
}
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask);
+
/*
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
@@ -2656,6 +2656,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
cpumask_copy(&p->cpus_mask, ctx->new_mask);
p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+ mm_update_cpus_allowed(p->mm, ctx->new_mask);
/*
* Swap in a new user_cpus_ptr if SCA_USER flag set
@@ -2667,10 +2668,8 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
static void
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- scoped_guard (sched_change, p, DEQUEUE_SAVE) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
- }
}
/*
@@ -3263,8 +3262,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- rseq_migrate(p);
- sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -4415,7 +4412,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
- init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4691,7 +4687,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
p->sched_task_group = tg;
}
#endif
- rseq_migrate(p);
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -4755,7 +4750,6 @@ void wake_up_new_task(struct task_struct *p)
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
- rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -5049,7 +5043,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
- rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5150,6 +5143,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
+ /*
+ * sched_ext_dead() must come before cgroup_task_dead() to
+ * prevent cgroups from being removed while its member tasks are
+ * visible to SCX schedulers.
+ */
+ sched_ext_dead(prev);
+ cgroup_task_dead(prev);
+
/* Task is done with its stack. */
put_task_stack(prev);
@@ -5212,19 +5213,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
*
* kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
- *
- * switch_mm_cid() needs to be updated if the barriers provided
- * by context_switch() are modified.
*/
- if (!next->mm) { // to kernel
+ if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
- if (prev->mm) // from user
+ if (prev->mm) // from user
mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
- } else { // to user
+ } else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
@@ -5237,15 +5235,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
- if (!prev->mm) { // from kernel
+ if (!prev->mm) { // from kernel
/* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
- /* switch_mm_cid() requires the memory barriers above. */
- switch_mm_cid(rq, prev, next);
+ mm_cid_switch_to(prev, next);
+
+ /*
+ * Tell rseq that the task was scheduled in. Must be after
+ * switch_mm_cid() to get the TIF flag set.
+ */
+ rseq_sched_switch_event(next);
prepare_lock_switch(rq, next, rf);
@@ -5530,7 +5533,6 @@ void sched_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- task_tick_mm_cid(rq, donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -10260,525 +10262,501 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
}
#ifdef CONFIG_SCHED_MM_CID
-
/*
- * @cid_lock: Guarantee forward-progress of cid allocation.
+ * Concurrency IDentifier management
*
- * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
- * is only used when contention is detected by the lock-free allocation so
- * forward progress can be guaranteed.
- */
-DEFINE_RAW_SPINLOCK(cid_lock);
-
-/*
- * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
- *
- * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
- * detected, it is set to 1 to ensure that all newly coming allocations are
- * serialized by @cid_lock until the allocation which detected contention
- * completes and sets @use_cid_lock back to 0. This guarantees forward progress
- * of a cid allocation.
- */
-int use_cid_lock;
-
-/*
- * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
- * concurrently with respect to the execution of the source runqueue context
- * switch.
- *
- * There is one basic properties we want to guarantee here:
- *
- * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
- * used by a task. That would lead to concurrent allocation of the cid and
- * userspace corruption.
- *
- * Provide this guarantee by introducing a Dekker memory ordering to guarantee
- * that a pair of loads observe at least one of a pair of stores, which can be
- * shown as:
+ * Serialization rules:
*
- * X = Y = 0
+ * mm::mm_cid::mutex: Serializes fork() and exit() and therefore
+ * protects mm::mm_cid::users.
*
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
+ * mm::mm_cid::lock: Serializes mm_update_max_cids() and
+ * mm_update_cpus_allowed(). Nests in mm_cid::mutex
+ * and runqueue lock.
*
- * Which guarantees that x==0 && y==0 is impossible. But rather than using
- * values 0 and 1, this algorithm cares about specific state transitions of the
- * runqueue current task (as updated by the scheduler context switch), and the
- * per-mm/cpu cid value.
+ * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks
+ * and can only be modified with atomic operations.
*
- * Let's introduce task (Y) which has task->mm == mm and task (N) which has
- * task->mm != mm for the rest of the discussion. There are two scheduler state
- * transitions on context switch we care about:
+ * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue
+ * lock.
*
- * (TSA) Store to rq->curr with transition from (N) to (Y)
+ * CID ownership:
*
- * (TSB) Store to rq->curr with transition from (Y) to (N)
+ * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
+ * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
+ * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
+ * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
+ * task needs to drop the CID into the pool when scheduling out. Both bits
+ * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
+ * actually handed over to user space in the RSEQ memory.
*
- * On the remote-clear side, there is one transition we care about:
+ * Mode switching:
*
- * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ * Switching to per CPU mode happens when the user count becomes greater
+ * than the maximum number of CIDs, which is calculated by:
*
- * There is also a transition to UNSET state which can be performed from all
- * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
- * guarantees that only a single thread will succeed:
+ * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
+ * max_cids = min(1.25 * opt_cids, num_possible_cpus());
*
- * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ * The +25% allowance is useful for tight CPU masks in scenarios where only
+ * a few threads are created and destroyed to avoid frequent mode
+ * switches. Though this allowance shrinks, the closer opt_cids becomes to
+ * num_possible_cpus(), which is the (unfortunate) hard ABI limit.
*
- * Just to be clear, what we do _not_ want to happen is a transition to UNSET
- * when a thread is actively using the cid (property (1)).
+ * At the point of switching to per CPU mode the new user is not yet
+ * visible in the system, so the task which initiated the fork() runs the
+ * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
+ * either transfers each tasks owned CID to the CPU the task runs on or
+ * drops it into the CID pool if a task is not on a CPU at that point in
+ * time. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
+ * it's guaranteed that no task related to that MM owns a CID anymore.
*
- * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ * Switching back to task mode happens when the user count goes below the
+ * threshold which was recorded on the per CPU mode switch:
*
- * Scenario A) (TSA)+(TMA) (from next task perspective)
+ * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2);
*
- * CPU0 CPU1
+ * This threshold is updated when a affinity change increases the number of
+ * allowed CPUs for the MM, which might cause a switch back to per task
+ * mode.
*
- * Context switch CS-1 Remote-clear
- * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
- * (implied barrier after cmpxchg)
- * - switch_mm_cid()
- * - memory barrier (see switch_mm_cid()
- * comment explaining how this barrier
- * is combined with other scheduler
- * barriers)
- * - mm_cid_get (next)
- * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ * If the switch back was initiated by a exiting task, then that task runs
+ * the fixup function. If it was initiated by a affinity change, then it's
+ * run either in the deferred update function in context of a workqueue or
+ * by a task which forks a new one or by a task which exits. Whatever
+ * happens first. mm_cid_fixup_cpus_to_task() walks through the possible
+ * CPUs and either transfers the CPU owned CIDs to a related task which
+ * runs on the CPU or drops it into the pool. Tasks which schedule in on a
+ * CPU which the walk did not cover yet do the handover themself.
*
- * This Dekker ensures that either task (Y) is observed by the
- * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
- * observed.
+ * This transition from CPU to per task ownership happens in two phases:
*
- * If task (Y) store is observed by rcu_dereference(), it means that there is
- * still an active task on the cpu. Remote-clear will therefore not transition
- * to UNSET, which fulfills property (1).
+ * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
+ * CID and denotes that the CID is only temporarily owned by the
+ * task. When it schedules out the task drops the CID back into the
+ * pool if this bit is set.
*
- * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
- * it will move its state to UNSET, which clears the percpu cid perhaps
- * uselessly (which is not an issue for correctness). Because task (Y) is not
- * observed, CPU1 can move ahead to set the state to UNSET. Because moving
- * state to UNSET is done with a cmpxchg expecting that the old state has the
- * LAZY flag set, only one thread will successfully UNSET.
+ * 2) The initiating context walks the per CPU space and after completion
+ * clears mm:mm_cid.transit. So after that point the CIDs are strictly
+ * task owned again.
*
- * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
- * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
- * CPU1 will observe task (Y) and do nothing more, which is fine.
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail if
+ * two tasks are scheduled in on the same CPU before the fixup freed per
+ * CPU CIDs.
*
- * What we are effectively preventing with this Dekker is a scenario where
- * neither LAZY flag nor store (Y) are observed, which would fail property (1)
- * because this would UNSET a cid which is actively used.
+ * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
+ * related to that MM is owned by a CPU anymore.
*/
-void sched_mm_cid_migrate_from(struct task_struct *t)
-{
- t->migrate_from_cpu = task_cpu(t);
-}
-
-static
-int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid)
+/*
+ * Update the CID range properties when the constraints change. Invoked via
+ * fork(), exit() and affinity changes
+ */
+static void __mm_update_max_cids(struct mm_mm_cid *mc)
{
- struct mm_struct *mm = t->mm;
- struct task_struct *src_task;
- int src_cid, last_mm_cid;
+ unsigned int opt_cids, max_cids;
- if (!mm)
- return -1;
+ /* Calculate the new optimal constraint */
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
- last_mm_cid = t->last_mm_cid;
- /*
- * If the migrated task has no last cid, or if the current
- * task on src rq uses the cid, it means the source cid does not need
- * to be moved to the destination cpu.
- */
- if (last_mm_cid == -1)
- return -1;
- src_cid = READ_ONCE(src_pcpu_cid->cid);
- if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
- return -1;
+ /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */
+ max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus());
+ WRITE_ONCE(mc->max_cids, max_cids);
+}
- /*
- * If we observe an active task using the mm on this rq, it means we
- * are not the last task to be migrated from this cpu for this mm, so
- * there is no need to move src_cid to the destination cpu.
- */
- guard(rcu)();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- t->last_mm_cid = -1;
- return -1;
- }
+static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+{
+ unsigned int opt_cids;
- return src_cid;
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
+ /* Has to be at least 1 because 0 indicates PCPU mode off */
+ return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
}
-static
-int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid,
- int src_cid)
+static bool mm_update_max_cids(struct mm_struct *mm)
{
- struct task_struct *src_task;
- struct mm_struct *mm = t->mm;
- int lazy_cid;
-
- if (src_cid == -1)
- return -1;
+ struct mm_mm_cid *mc = &mm->mm_cid;
- /*
- * Attempt to clear the source cpu cid to move it to the destination
- * cpu.
- */
- lazy_cid = mm_cid_set_lazy_put(src_cid);
- if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
- return -1;
+ lockdep_assert_held(&mm->mm_cid.lock);
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
+ /* Clear deferred mode switch flag. A change is handled by the caller */
+ mc->update_deferred = false;
+ __mm_update_max_cids(mc);
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, this task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
- }
+ /* Check whether owner mode must be changed */
+ if (!mc->percpu) {
+ /* Enable per CPU mode when the number of users is above max_cids */
+ if (mc->users > mc->max_cids)
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ } else {
+ /* Switch back to per task if user count under threshold */
+ if (mc->users < mc->pcpu_thrs)
+ mc->pcpu_thrs = 0;
}
- /*
- * The src_cid is unused, so it can be unset.
- */
- if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- return -1;
- WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
- return src_cid;
+ /* Mode change required? */
+ if (!!mc->percpu == !!mc->pcpu_thrs)
+ return false;
+ /* When switching back to per TASK mode, set the transition flag */
+ if (!mc->pcpu_thrs)
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+ WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+ return true;
}
-/*
- * Migration to dst cpu. Called with dst_rq lock held.
- * Interrupts are disabled, which keeps the window of cid ownership without the
- * source rq lock held small.
- */
-void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
- struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
- struct mm_struct *mm = t->mm;
- int src_cid, src_cpu;
- bool dst_cid_is_set;
- struct rq *src_rq;
-
- lockdep_assert_rq_held(dst_rq);
+ struct cpumask *mm_allowed;
+ struct mm_mm_cid *mc;
+ unsigned int weight;
- if (!mm)
+ if (!mm || !READ_ONCE(mm->mm_cid.users))
return;
- src_cpu = t->migrate_from_cpu;
- if (src_cpu == -1) {
- t->last_mm_cid = -1;
- return;
- }
/*
- * Move the src cid if the dst cid is unset. This keeps id
- * allocation closest to 0 in cases where few threads migrate around
- * many CPUs.
- *
- * If destination cid or recent cid is already set, we may have
- * to just clear the src cid to ensure compactness in frequent
- * migrations scenarios.
- *
- * It is not useful to clear the src cid when the number of threads is
- * greater or equal to the number of allowed CPUs, because user-space
- * can expect that the number of allowed cids can reach the number of
- * allowed CPUs.
- */
- dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
- dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
- !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
- if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
+ * mm::mm_cid::mm_cpus_allowed is the superset of each threads
+ * allowed CPUs mask which means it can only grow.
+ */
+ mc = &mm->mm_cid;
+ guard(raw_spinlock)(&mc->lock);
+ mm_allowed = mm_cpus_allowed(mm);
+ weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
+ if (weight == mc->nr_cpus_allowed)
return;
- src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
- src_rq = cpu_rq(src_cpu);
- src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
- if (src_cid == -1)
+
+ WRITE_ONCE(mc->nr_cpus_allowed, weight);
+ __mm_update_max_cids(mc);
+ if (!mc->percpu)
return;
- src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
- src_cid);
- if (src_cid == -1)
+
+ /* Adjust the threshold to the wider set */
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ /* Switch back to per task mode? */
+ if (mc->users >= mc->pcpu_thrs)
return;
- if (dst_cid_is_set) {
- __mm_cid_put(mm, src_cid);
+
+ /* Don't queue twice */
+ if (mc->update_deferred)
return;
- }
- /* Move src_cid to dst cpu. */
- mm_cid_snapshot_time(dst_rq, mm);
- WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
- WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
+
+ /* Queue the irq work, which schedules the real work */
+ mc->update_deferred = true;
+ irq_work_queue(&mc->irq_work);
}
-static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
- int cpu)
+static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
- struct rq *rq = cpu_rq(cpu);
- struct task_struct *t;
- int cid, lazy_cid;
+ if (cid_on_cpu(t->mm_cid.cid)) {
+ unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid))
- return;
+ t->mm_cid.cid = cid_to_transit_cid(cid);
+ pcp->cid = t->mm_cid.cid;
+ }
+}
- /*
- * Clear the cpu cid if it is set to keep cid allocation compact. If
- * there happens to be other tasks left on the source cpu using this
- * mm, the next task using this mm will reallocate its cid on context
- * switch.
- */
- lazy_cid = mm_cid_set_lazy_put(cid);
- if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
- return;
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+{
+ unsigned int cpu;
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
+ /* Walk the CPUs and fixup all stale CIDs */
+ for_each_possible_cpu(cpu) {
+ struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+ struct rq *rq = cpu_rq(cpu);
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, that task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
- return;
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(rq_lock_irq)(rq);
+ /* Is the CID still owned by the CPU? */
+ if (cid_on_cpu(pcp->cid)) {
+ /*
+ * If rq->curr has @mm, transfer it with the
+ * transition bit set. Otherwise drop it.
+ */
+ if (rq->curr->mm == mm && rq->curr->mm_cid.active)
+ mm_cid_transit_to_task(rq->curr, pcp);
+ else
+ mm_drop_cid_on_cpu(mm, pcp);
+
+ } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) {
+ unsigned int cid = rq->curr->mm_cid.cid;
+
+ /* Ensure it has the transition bit set */
+ if (!cid_in_transit(cid)) {
+ cid = cid_to_transit_cid(cid);
+ rq->curr->mm_cid.cid = cid;
+ pcp->cid = cid;
+ }
+ }
}
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
- /*
- * The cid is unused, so it can be unset.
- * Disable interrupts to keep the window of cid ownership without rq
- * lock small.
- */
- scoped_guard (irqsave) {
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
+static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+ if (cid_on_task(t->mm_cid.cid)) {
+ t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ pcp->cid = t->mm_cid.cid;
}
}
-static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
{
- struct rq *rq = cpu_rq(cpu);
- struct mm_cid *pcpu_cid;
- struct task_struct *curr;
- u64 rq_clock;
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(task_rq_lock)(t);
+ /* If the task is not active it is not in the users count */
+ if (!t->mm_cid.active)
+ return false;
+ if (cid_on_task(t->mm_cid.cid)) {
+ /* If running on the CPU, transfer the CID, otherwise drop it */
+ if (task_rq(t)->curr == t)
+ mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ else
+ mm_unset_cid_on_task(t);
+ }
+ return true;
+}
- /*
- * rq->clock load is racy on 32-bit but one spurious clear once in a
- * while is irrelevant.
- */
- rq_clock = READ_ONCE(rq->clock);
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct task_struct *p, *t;
+ unsigned int users;
/*
- * In order to take care of infrequently scheduled tasks, bump the time
- * snapshot associated with this cid if an active task using the mm is
- * observed on this rq.
+ * This can obviously race with a concurrent affinity change, which
+ * increases the number of allowed CPUs for this mm, but that does
+ * not affect the mode and only changes the CID constraints. A
+ * possible switch back to per task mode happens either in the
+ * deferred handler function or in the next fork()/exit().
+ *
+ * The caller has already transferred. The newly incoming task is
+ * already accounted for, but not yet visible.
*/
- scoped_guard (rcu) {
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- return;
- }
+ users = mm->mm_cid.users - 2;
+ if (!users)
+ return;
+
+ guard(rcu)();
+ for_other_threads(current, t) {
+ if (mm_cid_fixup_task_to_cpu(t, mm))
+ users--;
}
- if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+ if (!users)
return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+
+ /* Happens only for VM_CLONE processes. */
+ for_each_process_thread(p, t) {
+ if (t == current || t->mm != mm)
+ continue;
+ if (mm_cid_fixup_task_to_cpu(t, mm)) {
+ if (--users == 0)
+ return;
+ }
+ }
}
-static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
- int weight)
+static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
- struct mm_cid *pcpu_cid;
- int cid;
-
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid) || cid < weight)
- return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+ t->mm_cid.active = 1;
+ mm->mm_cid.users++;
+ return mm_update_max_cids(mm);
}
-static void task_mm_cid_work(struct callback_head *work)
+void sched_mm_cid_fork(struct task_struct *t)
{
- unsigned long now = jiffies, old_scan, next_scan;
- struct task_struct *t = current;
- struct cpumask *cidmask;
- struct mm_struct *mm;
- int weight, cpu;
+ struct mm_struct *mm = t->mm;
+ bool percpu;
- WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
+ WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
- work->next = work; /* Prevent double-add */
- if (t->flags & PF_EXITING)
- return;
- mm = t->mm;
- if (!mm)
- return;
- old_scan = READ_ONCE(mm->mm_cid_next_scan);
- next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
- if (!old_scan) {
- unsigned long res;
-
- res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
- if (res != old_scan)
- old_scan = res;
+ guard(mutex)(&mm->mm_cid.mutex);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+ /* First user ? */
+ if (!mm->mm_cid.users) {
+ sched_mm_cid_add_user(t, mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ /* Required for execve() */
+ pcp->cid = t->mm_cid.cid;
+ return;
+ }
+
+ if (!sched_mm_cid_add_user(t, mm)) {
+ if (!mm->mm_cid.percpu)
+ t->mm_cid.cid = mm_get_cid(mm);
+ return;
+ }
+
+ /* Handle the mode change and transfer current's CID */
+ percpu = !!mm->mm_cid.percpu;
+ if (!percpu)
+ mm_cid_transit_to_task(current, pcp);
else
- old_scan = next_scan;
+ mm_cid_transfer_to_cpu(current, pcp);
}
- if (time_before(now, old_scan))
- return;
- if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
- return;
- cidmask = mm_cidmask(mm);
- /* Clear cids that were not recently used. */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_old(mm, cpu);
- weight = cpumask_weight(cidmask);
- /*
- * Clear cids that are greater or equal to the cidmask weight to
- * recompact it.
- */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_weight(mm, cpu, weight);
-}
-
-void init_sched_mm_cid(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- int mm_users = 0;
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1)
- mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ if (percpu) {
+ mm_cid_fixup_tasks_to_cpus();
+ } else {
+ mm_cid_fixup_cpus_to_tasks(mm);
+ t->mm_cid.cid = mm_get_cid(mm);
}
- t->cid_work.next = &t->cid_work; /* Protect against double add */
- init_task_work(&t->cid_work, task_mm_cid_work);
}
-void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+static bool sched_mm_cid_remove_user(struct task_struct *t)
{
- struct callback_head *work = &curr->cid_work;
- unsigned long now = jiffies;
-
- if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
- work->next != work)
- return;
- if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
- return;
-
- /* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME);
+ t->mm_cid.active = 0;
+ scoped_guard(preempt) {
+ /* Clear the transition bit */
+ t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+ mm_unset_cid_on_task(t);
+ }
+ t->mm->mm_cid.users--;
+ return mm_update_max_cids(t->mm);
}
-void sched_mm_cid_exit_signals(struct task_struct *t)
+static bool __sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
- return;
-
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
+ if (!sched_mm_cid_remove_user(t))
+ return false;
+ /*
+ * Contrary to fork() this only deals with a switch back to per
+ * task mode either because the above decreased users or an
+ * affinity change increased the number of allowed CPUs and the
+ * deferred fixup did not run yet.
+ */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return false;
/*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
+ * A failed fork(2) cleanup never gets here, so @current must have
+ * the same MM as @t. That's true for exit() and the failed
+ * pthread_create() cleanup case.
*/
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ if (WARN_ON_ONCE(current->mm != mm))
+ return false;
+ return true;
}
-void sched_mm_cid_before_execve(struct task_struct *t)
+/*
+ * When a task exits, the MM CID held by the task is not longer required as
+ * the task cannot return to user space.
+ */
+void sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
+ if (!mm || !t->mm_cid.active)
return;
+ /*
+ * Ensure that only one instance is doing MM CID operations within
+ * a MM. The common case is uncontended. The rare fixup case adds
+ * some overhead.
+ */
+ scoped_guard(mutex, &mm->mm_cid.mutex) {
+ /* mm_cid::mutex is sufficient to protect mm_cid::users */
+ if (likely(mm->mm_cid.users > 1)) {
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (!__sched_mm_cid_exit(t))
+ return;
+ /* Mode change required. Transfer currents CID */
+ mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+ return;
+ }
+ /* Last user */
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Required across execve() */
+ if (t == current)
+ mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ /* Ignore mode change. There is nothing to do. */
+ sched_mm_cid_remove_user(t);
+ }
+ }
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
/*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
+ * As this is the last user (execve(), process exit or failed
+ * fork(2)) there is no concurrency anymore.
+ *
+ * Synchronize eventually pending work to ensure that there are no
+ * dangling references left. @t->mm_cid.users is zero so nothing
+ * can queue this work anymore.
*/
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ irq_work_sync(&mm->mm_cid.irq_work);
+ cancel_work_sync(&mm->mm_cid.work);
+}
+
+/* Deactivate MM CID allocation across execve() */
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ sched_mm_cid_exit(t);
}
+/* Reactivate MM CID after successful execve() */
void sched_mm_cid_after_execve(struct task_struct *t)
{
- struct mm_struct *mm = t->mm;
- struct rq *rq;
+ sched_mm_cid_fork(t);
+}
+
+static void mm_cid_work_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
- if (!mm)
+ guard(mutex)(&mm->mm_cid.mutex);
+ /* Did the last user task exit already? */
+ if (!mm->mm_cid.users)
return;
- preempt_disable();
- rq = this_rq();
- scoped_guard (rq_lock_irqsave, rq) {
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Have fork() or exit() handled it already? */
+ if (!mm->mm_cid.update_deferred)
+ return;
+ /* This clears mm_cid::update_deferred */
+ if (!mm_update_max_cids(mm))
+ return;
+ /* Affinity changes can only switch back to task mode */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return;
}
+ mm_cid_fixup_cpus_to_tasks(mm);
}
-void sched_mm_cid_fork(struct task_struct *t)
+static void mm_cid_irq_work(struct irq_work *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
+
+ /*
+ * Needs to be unconditional because mm_cid::lock cannot be held
+ * when scheduling work as mm_update_cpus_allowed() nests inside
+ * rq::lock and schedule_work() might end up in wakeup...
+ */
+ schedule_work(&mm->mm_cid.work);
+}
+
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
- WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
- t->mm_cid_active = 1;
+ mm->mm_cid.max_cids = 0;
+ mm->mm_cid.percpu = 0;
+ mm->mm_cid.transit = 0;
+ mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ mm->mm_cid.users = 0;
+ mm->mm_cid.pcpu_thrs = 0;
+ mm->mm_cid.update_deferred = 0;
+ raw_spin_lock_init(&mm->mm_cid.lock);
+ mutex_init(&mm->mm_cid.mutex);
+ mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+ INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+ bitmap_zero(mm_cidmask(mm), num_possible_cpus());
}
-#endif /* CONFIG_SCHED_MM_CID */
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+#endif /* !CONFIG_SCHED_MM_CID */
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 67f540c23717..319439fe1870 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2675,6 +2675,7 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu
return NULL;
}
+/* Access rule: must be called on local CPU with preemption disabled */
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
@@ -3117,11 +3118,43 @@ void __init init_sched_dl_class(void)
GFP_KERNEL, cpu_to_node(i));
}
+/*
+ * This function always returns a non-empty bitmap in @cpus. This is because
+ * if a root domain has reserved bandwidth for DL tasks, the DL bandwidth
+ * check will prevent CPU hotplug from deactivating all CPUs in that domain.
+ */
+static void dl_get_task_effective_cpus(struct task_struct *p, struct cpumask *cpus)
+{
+ const struct cpumask *hk_msk;
+
+ hk_msk = housekeeping_cpumask(HK_TYPE_DOMAIN);
+ if (housekeeping_enabled(HK_TYPE_DOMAIN)) {
+ if (!cpumask_intersects(p->cpus_ptr, hk_msk)) {
+ /*
+ * CPUs isolated by isolcpu="domain" always belong to
+ * def_root_domain.
+ */
+ cpumask_andnot(cpus, cpu_active_mask, hk_msk);
+ return;
+ }
+ }
+
+ /*
+ * If a root domain holds a DL task, it must have active CPUs. So
+ * active CPUs can always be found by walking up the task's cpuset
+ * hierarchy up to the partition root.
+ */
+ cpuset_cpus_allowed_locked(p, cpus);
+}
+
+/* The caller should hold cpuset_mutex */
void dl_add_task_root_domain(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
struct dl_bw *dl_b;
+ unsigned int cpu;
+ struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
@@ -3129,16 +3162,25 @@ void dl_add_task_root_domain(struct task_struct *p)
return;
}
- rq = __task_rq_lock(p, &rf);
-
+ /*
+ * Get an active rq, whose rq->rd traces the correct root
+ * domain.
+ * Ideally this would be under cpuset reader lock until rq->rd is
+ * fetched. However, sleepable locks cannot nest inside pi_lock, so we
+ * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex'
+ * to guarantee the CPU stays in the cpuset.
+ */
+ dl_get_task_effective_cpus(p, msk);
+ cpu = cpumask_first_and(cpu_active_mask, msk);
+ BUG_ON(cpu >= nr_cpu_ids);
+ rq = cpu_rq(cpu);
dl_b = &rq->rd->dl_bw;
- raw_spin_lock(&dl_b->lock);
+ /* End of fetching rd */
+ raw_spin_lock(&dl_b->lock);
__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
-
raw_spin_unlock(&dl_b->lock);
-
- task_rq_unlock(rq, p, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
}
void dl_clear_root_domain(struct root_domain *rd)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6827689a0966..05f5a49e9649 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -33,9 +33,10 @@ static DEFINE_MUTEX(scx_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
-static unsigned long scx_in_softlockup;
-static atomic_t scx_breather_depth = ATOMIC_INIT(0);
static int scx_bypass_depth;
+static cpumask_var_t scx_bypass_lb_donee_cpumask;
+static cpumask_var_t scx_bypass_lb_resched_cpumask;
+static bool scx_aborting;
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -68,18 +69,18 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
/*
- * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence
* numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
* allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
* lazily when enabling and freed when disabling to avoid waste when sched_ext
* isn't active.
*/
-struct scx_kick_pseqs {
+struct scx_kick_syncs {
struct rcu_head rcu;
- unsigned long seqs[];
+ unsigned long syncs[];
};
-static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
+static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
/*
* Direct dispatch marker.
@@ -143,26 +144,70 @@ static struct scx_dump_data scx_dump_data = {
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
+/*
+ * Parameters that can be adjusted through /sys/module/sched_ext/parameters.
+ * There usually is no reason to modify these as normal scheduler operation
+ * shouldn't be affected by them. The knobs are primarily for debugging.
+ */
+static u64 scx_slice_dfl = SCX_SLICE_DFL;
+static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
+
+static int set_slice_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
+}
+
+static const struct kernel_param_ops slice_us_param_ops = {
+ .set = set_slice_us,
+ .get = param_get_uint,
+};
+
+static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
+}
+
+static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
+ .set = set_bypass_lb_intv_us,
+ .get = param_get_uint,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "sched_ext."
+
+module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
+MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
+MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");
+
+#undef MODULE_PARAM_PREFIX
+
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
+static u32 reenq_local(struct rq *rq);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
-static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
-static __printf(4, 5) void scx_exit(struct scx_sched *sch,
+static __printf(4, 5) bool scx_exit(struct scx_sched *sch,
enum scx_exit_kind kind, s64 exit_code,
const char *fmt, ...)
{
va_list args;
+ bool ret;
va_start(args, fmt);
- scx_vexit(sch, kind, exit_code, fmt, args);
+ ret = scx_vexit(sch, kind, exit_code, fmt, args);
va_end(args);
+
+ return ret;
}
#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args)
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
@@ -200,7 +245,15 @@ static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
- return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
+ return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
+}
+
+static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class)
+ return &stop_sched_class;
+
+ return __setscheduler_class(p->policy, p->prio);
}
/*
@@ -469,19 +522,16 @@ struct scx_task_iter {
* RCU read lock or obtaining a reference count.
*
* All tasks which existed when the iteration started are guaranteed to be
- * visited as long as they still exist.
+ * visited as long as they are not dead.
*/
static void scx_task_iter_start(struct scx_task_iter *iter)
{
- BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
- ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
+ memset(iter, 0, sizeof(*iter));
raw_spin_lock_irq(&scx_tasks_lock);
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
- iter->locked_task = NULL;
- iter->cnt = 0;
iter->list_locked = true;
}
@@ -547,14 +597,13 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- __scx_task_iter_maybe_relock(iter);
-
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
- __scx_task_iter_maybe_relock(iter);
}
+ __scx_task_iter_maybe_relock(iter);
+
list_for_each_entry(pos, cursor, tasks_node) {
if (&pos->tasks_node == &scx_tasks)
return NULL;
@@ -755,6 +804,11 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err
static void run_deferred(struct rq *rq)
{
process_ddsp_deferred_locals(rq);
+
+ if (local_read(&rq->scx.reenq_local_deferred)) {
+ local_set(&rq->scx.reenq_local_deferred, 0);
+ reenq_local(rq);
+ }
}
static void deferred_bal_cb_workfn(struct rq *rq)
@@ -775,12 +829,28 @@ static void deferred_irq_workfn(struct irq_work *irq_work)
* schedule_deferred - Schedule execution of deferred actions on an rq
* @rq: target rq
*
- * Schedule execution of deferred actions on @rq. Must be called with @rq
- * locked. Deferred actions are executed with @rq locked but unpinned, and thus
- * can unlock @rq to e.g. migrate tasks to other rqs.
+ * Schedule execution of deferred actions on @rq. Deferred actions are executed
+ * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks
+ * to other rqs.
*/
static void schedule_deferred(struct rq *rq)
{
+ /*
+ * Queue an irq work. They are executed on IRQ re-enable which may take
+ * a bit longer than the scheduler hook in schedule_deferred_locked().
+ */
+ irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
+/**
+ * schedule_deferred_locked - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Equivalent to
+ * schedule_deferred() but requires @rq to be locked and can be more efficient.
+ */
+static void schedule_deferred_locked(struct rq *rq)
+{
lockdep_assert_rq_held(rq);
/*
@@ -812,12 +882,11 @@ static void schedule_deferred(struct rq *rq)
}
/*
- * No scheduler hooks available. Queue an irq work. They are executed on
- * IRQ re-enable which may take a bit longer than the scheduler hooks.
- * The above WAKEUP and BALANCE paths should cover most of the cases and
- * the time to IRQ re-enable shouldn't be long.
+ * No scheduler hooks available. Use the generic irq_work path. The
+ * above WAKEUP and BALANCE paths should cover most of the cases and the
+ * time to IRQ re-enable shouldn't be long.
*/
- irq_work_queue(&rq->scx.deferred_irq_work);
+ schedule_deferred(rq);
}
/**
@@ -902,7 +971,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
- p->scx.slice = SCX_SLICE_DFL;
+ p->scx.slice = READ_ONCE(scx_slice_dfl);
__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
@@ -916,7 +985,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
!RB_EMPTY_NODE(&p->scx.dsq_priq));
if (!is_local) {
- raw_spin_lock(&dsq->lock);
+ raw_spin_lock_nested(&dsq->lock,
+ (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);
+
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
@@ -965,8 +1036,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
container_of(rbp, struct task_struct,
scx.dsq_priq);
list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
+ /* first task unchanged - no update needed */
} else {
list_add(&p->scx.dsq_list.node, &dsq->list);
+ /* not builtin and new task is at head - use fastpath */
+ rcu_assign_pointer(dsq->first_task, p);
}
} else {
/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
@@ -974,10 +1048,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
dsq->id);
- if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
list_add(&p->scx.dsq_list.node, &dsq->list);
- else
+ /* new task inserted at head - use fastpath */
+ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ rcu_assign_pointer(dsq->first_task, p);
+ } else {
+ bool was_empty;
+
+ was_empty = list_empty(&dsq->list);
list_add_tail(&p->scx.dsq_list.node, &dsq->list);
+ if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ rcu_assign_pointer(dsq->first_task, p);
+ }
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
@@ -1034,6 +1117,13 @@ static void task_unlink_from_dsq(struct task_struct *p,
list_del_init(&p->scx.dsq_list.node);
dsq_mod_nr(dsq, -1);
+
+ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
+ struct task_struct *first_task;
+
+ first_task = nldsq_next_task(dsq, NULL, false);
+ rcu_assign_pointer(dsq->first_task, first_task);
+ }
}
static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
@@ -1041,6 +1131,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
struct scx_dispatch_q *dsq = p->scx.dsq;
bool is_local = dsq == &rq->scx.local_dsq;
+ lockdep_assert_rq_held(rq);
+
if (!dsq) {
/*
* If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
@@ -1087,6 +1179,20 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
+/*
+ * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
+ * and dsq are locked.
+ */
+static void dispatch_dequeue_locked(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
+ lockdep_assert_rq_held(task_rq(p));
+ lockdep_assert_held(&dsq->lock);
+
+ task_unlink_from_dsq(p, dsq);
+ p->scx.dsq = NULL;
+}
+
static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
struct rq *rq, u64 dsq_id,
struct task_struct *p)
@@ -1192,7 +1298,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
list_add_tail(&p->scx.dsq_list.node,
&rq->scx.ddsp_deferred_locals);
- schedule_deferred(rq);
+ schedule_deferred_locked(rq);
return;
}
@@ -1217,6 +1323,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
{
struct scx_sched *sch = scx_root;
struct task_struct **ddsp_taskp;
+ struct scx_dispatch_q *dsq;
unsigned long qseq;
WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
@@ -1235,7 +1342,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (scx_rq_bypassing(rq)) {
__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
- goto global;
+ goto bypass;
}
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1284,8 +1391,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
direct:
direct_dispatch(sch, p, enq_flags);
return;
-
+local_norefill:
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
+ return;
local:
+ dsq = &rq->scx.local_dsq;
+ goto enqueue;
+global:
+ dsq = find_global_dsq(sch, p);
+ goto enqueue;
+bypass:
+ dsq = &task_rq(p)->scx.bypass_dsq;
+ goto enqueue;
+
+enqueue:
/*
* For task-ordering, slice refill must be treated as implying the end
* of the current slice. Otherwise, the longer @p stays on the CPU, the
@@ -1293,14 +1412,7 @@ local:
*/
touch_core_sched(rq, p);
refill_task_slice_dfl(sch, p);
-local_norefill:
- dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
- return;
-
-global:
- touch_core_sched(rq, p); /* see the comment in local: */
- refill_task_slice_dfl(sch, p);
- dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags);
+ dispatch_enqueue(sch, dsq, p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -1741,8 +1853,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
* @p is going from a non-local DSQ to a non-local DSQ. As
* $src_dsq is already locked, do an abbreviated dequeue.
*/
- task_unlink_from_dsq(p, src_dsq);
- p->scx.dsq = NULL;
+ dispatch_dequeue_locked(p, src_dsq);
raw_spin_unlock(&src_dsq->lock);
dispatch_enqueue(sch, dst_dsq, p, enq_flags);
@@ -1751,49 +1862,12 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
return dst_rq;
}
-/*
- * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
- * banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artificial delays while the
- * bypass mode is switching to guarantee timely completion.
- */
-static void scx_breather(struct rq *rq)
-{
- u64 until;
-
- lockdep_assert_rq_held(rq);
-
- if (likely(!atomic_read(&scx_breather_depth)))
- return;
-
- raw_spin_rq_unlock(rq);
-
- until = ktime_get_ns() + NSEC_PER_MSEC;
-
- do {
- int cnt = 1024;
- while (atomic_read(&scx_breather_depth) && --cnt)
- cpu_relax();
- } while (atomic_read(&scx_breather_depth) &&
- time_before64(ktime_get_ns(), until));
-
- raw_spin_rq_lock(rq);
-}
-
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_bypass() dequeueing
- * tasks from @dsq trying to put the system into the bypass mode. On
- * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
- * the machine into soft lockups. Give a breather.
- */
- scx_breather(rq);
-
- /*
* The caller can't expect to successfully consume a task if the task's
* addition to @dsq isn't guaranteed to be visible somehow. Test
* @dsq->list without locking and skip if it seems empty.
@@ -1806,6 +1880,17 @@ retry:
nldsq_for_each_task(p, dsq) {
struct rq *task_rq = task_rq(p);
+ /*
+ * This loop can lead to multiple lockup scenarios, e.g. the BPF
+ * scheduler can put an enormous number of affinitized tasks into
+ * a contended DSQ, or the outer retry loop can repeatedly race
+ * against scx_bypass() dequeueing tasks from @dsq trying to put
+ * the system into the bypass mode. This can easily live-lock the
+ * machine. If aborting, exit from all non-bypass DSQs.
+ */
+ if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
+ break;
+
if (rq == task_rq) {
task_unlink_from_dsq(p, dsq);
move_local_task_to_local_dsq(p, 0, dsq, rq);
@@ -2089,8 +2174,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (consume_global_dsq(sch, rq))
goto has_tasks;
- if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
- scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ if (scx_rq_bypassing(rq)) {
+ if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+ goto has_tasks;
+ else
+ goto no_tasks;
+ }
+
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
goto no_tasks;
dspc->rq = rq;
@@ -2241,12 +2332,6 @@ static void switch_class(struct rq *rq, struct task_struct *next)
struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
- /*
- * Pairs with the smp_load_acquire() issued by a CPU in
- * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
- * resched.
- */
- smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
@@ -2286,6 +2371,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
struct task_struct *next)
{
struct scx_sched *sch = scx_root;
+
+ /* see kick_cpus_irq_workfn() */
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
@@ -2332,18 +2421,32 @@ static struct task_struct *first_local_task(struct rq *rq)
struct task_struct, scx.dsq_list.node);
}
-static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
+static struct task_struct *
+do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
{
struct task_struct *prev = rq->curr;
bool keep_prev, kick_idle = false;
struct task_struct *p;
+ /* see kick_cpus_irq_workfn() */
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+
rq_modified_clear(rq);
+
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
rq_repin_lock(rq, rf);
maybe_queue_balance_callback(rq);
- if (rq_modified_above(rq, &ext_sched_class))
+
+ /*
+ * If any higher-priority sched class enqueued a runnable task on
+ * this rq during balance_one(), abort and return RETRY_TASK, so
+ * that the scheduler loop can restart.
+ *
+ * If @force_scx is true, always try to pick a SCHED_EXT task,
+ * regardless of any higher-priority sched classes activity.
+ */
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2386,6 +2489,11 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
return p;
}
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
+{
+ return do_pick_task_scx(rq, rf, false);
+}
+
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -2842,7 +2950,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
INIT_LIST_HEAD(&scx->runnable_node);
scx->runnable_at = jiffies;
scx->ddsp_dsq_id = SCX_DSQ_INVALID;
- scx->slice = SCX_SLICE_DFL;
+ scx->slice = READ_ONCE(scx_slice_dfl);
}
void scx_pre_fork(struct task_struct *p)
@@ -2908,7 +3016,7 @@ void scx_cancel_fork(struct task_struct *p)
percpu_up_read(&scx_fork_rwsem);
}
-void sched_ext_free(struct task_struct *p)
+void sched_ext_dead(struct task_struct *p)
{
unsigned long flags;
@@ -3012,6 +3120,7 @@ void scx_tg_init(struct task_group *tg)
tg->scx.weight = CGROUP_WEIGHT_DFL;
tg->scx.bw_period_us = default_bw_period_us();
tg->scx.bw_quota_us = RUNTIME_INF;
+ tg->scx.idle = false;
}
int scx_tg_online(struct task_group *tg)
@@ -3160,7 +3269,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
void scx_group_set_idle(struct task_group *tg, bool idle)
{
- /* TODO: Implement ops->cgroup_set_idle() */
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_ops_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
+ tg_cgrp(tg), idle);
+
+ /* Update the task group's idle state */
+ tg->scx.idle = idle;
+
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
void scx_group_set_bandwidth(struct task_group *tg,
@@ -3575,38 +3695,55 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
}
/**
- * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ * handle_lockup - sched_ext common lockup handler
+ * @fmt: format string
*
- * While there are various reasons why RCU CPU stalls can occur on a system
- * that may not be caused by the current BPF scheduler, try kicking out the
- * current scheduler in an attempt to recover the system to a good state before
- * issuing panics.
+ * Called on system stall or lockup condition and initiates abort of sched_ext
+ * if enabled, which may resolve the reported lockup.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the lockup. %false if sched_ext is not enabled or abort was already
+ * initiated by someone else.
*/
-bool scx_rcu_cpu_stall(void)
+static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
{
struct scx_sched *sch;
+ va_list args;
+ bool ret;
- rcu_read_lock();
+ guard(rcu)();
sch = rcu_dereference(scx_root);
- if (unlikely(!sch)) {
- rcu_read_unlock();
+ if (unlikely(!sch))
return false;
- }
switch (scx_enable_state()) {
case SCX_ENABLING:
case SCX_ENABLED:
- break;
+ va_start(args, fmt);
+ ret = scx_verror(sch, fmt, args);
+ va_end(args);
+ return ret;
default:
- rcu_read_unlock();
return false;
}
+}
- scx_error(sch, "RCU CPU stall detected!");
- rcu_read_unlock();
-
- return true;
+/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
+ * else already initiated abort.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ return handle_lockup("RCU CPU stall detected!");
}
/**
@@ -3617,50 +3754,240 @@ bool scx_rcu_cpu_stall(void)
* live-lock the system by making many CPUs target the same DSQ to the point
* where soft-lockup detection triggers. This function is called from
* soft-lockup watchdog when the triggering point is close and tries to unjam
- * the system by enabling the breather and aborting the BPF scheduler.
+ * the system and aborting the BPF scheduler.
*/
void scx_softlockup(u32 dur_s)
{
- struct scx_sched *sch;
+ if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
+ return;
- rcu_read_lock();
+ printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
+ smp_processor_id(), dur_s);
+}
- sch = rcu_dereference(scx_root);
- if (unlikely(!sch))
- goto out_unlock;
+/**
+ * scx_hardlockup - sched_ext hardlockup handler
+ *
+ * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
+ * numerous affinitized tasks in a single queue and directing all CPUs at it.
+ * Try kicking out the current scheduler in an attempt to recover the system to
+ * a good state before taking more drastic actions.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported hardlockdup. %false if sched_ext is not enabled or
+ * someone else already initiated abort.
+ */
+bool scx_hardlockup(int cpu)
+{
+ if (!handle_lockup("hard lockup - CPU %d", cpu))
+ return false;
- switch (scx_enable_state()) {
- case SCX_ENABLING:
- case SCX_ENABLED:
- break;
- default:
- goto out_unlock;
+ printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+ cpu);
+ return true;
+}
+
+static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+ struct cpumask *donee_mask, struct cpumask *resched_mask,
+ u32 nr_donor_target, u32 nr_donee_target)
+{
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+ struct task_struct *p, *n;
+ struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+ s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
+ u32 nr_balanced = 0, min_delta_us;
+
+ /*
+ * All we want to guarantee is reasonable forward progress. No reason to
+ * fine tune. Assuming every task on @donor_dsq runs their full slice,
+ * consider offloading iff the total queued duration is over the
+ * threshold.
+ */
+ min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ return 0;
+
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ list_add(&cursor.node, &donor_dsq->list);
+resume:
+ n = container_of(&cursor, struct task_struct, scx.dsq_list);
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ while ((p = n)) {
+ struct rq *donee_rq;
+ struct scx_dispatch_q *donee_dsq;
+ int donee;
+
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ if (donor_dsq->nr <= nr_donor_target)
+ break;
+
+ if (cpumask_empty(donee_mask))
+ break;
+
+ donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
+ if (donee >= nr_cpu_ids)
+ continue;
+
+ donee_rq = cpu_rq(donee);
+ donee_dsq = &donee_rq->scx.bypass_dsq;
+
+ /*
+ * $p's rq is not locked but $p's DSQ lock protects its
+ * scheduling properties making this test safe.
+ */
+ if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+ continue;
+
+ /*
+ * Moving $p from one non-local DSQ to another. The source rq
+ * and DSQ are already locked. Do an abbreviated dequeue and
+ * then perform enqueue without unlocking $donor_dsq.
+ *
+ * We don't want to drop and reacquire the lock on each
+ * iteration as @donor_dsq can be very long and potentially
+ * highly contended. Donee DSQs are less likely to be contended.
+ * The nested locking is safe as only this LB moves tasks
+ * between bypass DSQs.
+ */
+ dispatch_dequeue_locked(p, donor_dsq);
+ dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+
+ /*
+ * $donee might have been idle and need to be woken up. No need
+ * to be clever. Kick every CPU that receives tasks.
+ */
+ cpumask_set_cpu(donee, resched_mask);
+
+ if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
+ cpumask_clear_cpu(donee, donee_mask);
+
+ nr_balanced++;
+ if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
+ list_move_tail(&cursor.node, &n->scx.dsq_list.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
+ cpu_relax();
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ goto resume;
+ }
}
- /* allow only one instance, cleared at the end of scx_bypass() */
- if (test_and_set_bit(0, &scx_in_softlockup))
- goto out_unlock;
+ list_del_init(&cursor.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
- printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
- smp_processor_id(), dur_s, scx_root->ops.name);
+ return nr_balanced;
+}
+
+static void bypass_lb_node(struct scx_sched *sch, int node)
+{
+ const struct cpumask *node_mask = cpumask_of_node(node);
+ struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
+ struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+ u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
+ u32 nr_target, nr_donor_target;
+ u32 before_min = U32_MAX, before_max = 0;
+ u32 after_min = U32_MAX, after_max = 0;
+ int cpu;
+
+ /* count the target tasks and CPUs */
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ nr_tasks += nr;
+ nr_cpus++;
+
+ before_min = min(nr, before_min);
+ before_max = max(nr, before_max);
+ }
+
+ if (!nr_cpus)
+ return;
/*
- * Some CPUs may be trapped in the dispatch paths. Enable breather
- * immediately; otherwise, we might even be able to get to scx_bypass().
+ * We don't want CPUs to have more than $nr_donor_target tasks and
+ * balancing to fill donee CPUs upto $nr_target. Once targets are
+ * calculated, find the donee CPUs.
*/
- atomic_inc(&scx_breather_depth);
+ nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
+ nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);
- scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
-out_unlock:
- rcu_read_unlock();
+ cpumask_clear(donee_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+ cpumask_set_cpu(cpu, donee_mask);
+ }
+
+ /* iterate !donee CPUs and see if they should be offloaded */
+ cpumask_clear(resched_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ struct rq *rq = cpu_rq(cpu);
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+
+ if (cpumask_empty(donee_mask))
+ break;
+ if (cpumask_test_cpu(cpu, donee_mask))
+ continue;
+ if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+ continue;
+
+ nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+ nr_donor_target, nr_target);
+ }
+
+ for_each_cpu(cpu, resched_mask) {
+ struct rq *rq = cpu_rq(cpu);
+
+ raw_spin_rq_lock_irq(rq);
+ resched_curr(rq);
+ raw_spin_rq_unlock_irq(rq);
+ }
+
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ after_min = min(nr, after_min);
+ after_max = max(nr, after_max);
+
+ }
+
+ trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
+ before_min, before_max, after_min, after_max);
}
-static void scx_clear_softlockup(void)
+/*
+ * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
+ * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some
+ * bypass DSQs can be overloaded. If there are enough tasks to saturate other
+ * lightly loaded CPUs, such imbalance can lead to very high execution latency
+ * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such
+ * outcomes, a simple load balancing mechanism is implemented by the following
+ * timer which runs periodically while bypass mode is in effect.
+ */
+static void scx_bypass_lb_timerfn(struct timer_list *timer)
{
- if (test_and_clear_bit(0, &scx_in_softlockup))
- atomic_dec(&scx_breather_depth);
+ struct scx_sched *sch;
+ int node;
+ u32 intv_us;
+
+ sch = rcu_dereference_all(scx_root);
+ if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
+ return;
+
+ for_each_node_with_cpus(node)
+ bypass_lb_node(sch, node);
+
+ intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ if (intv_us)
+ mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
}
+static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
+
/**
* scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
@@ -3704,25 +4031,34 @@ static void scx_bypass(bool bypass)
sch = rcu_dereference_bh(scx_root);
if (bypass) {
- scx_bypass_depth++;
+ u32 intv_us;
+
+ WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
+ WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
bypass_timestamp = ktime_get_ns();
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+
+ intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
+ scx_bypass_lb_timer.expires =
+ jiffies + usecs_to_jiffies(intv_us);
+ add_timer_global(&scx_bypass_lb_timer);
+ }
} else {
- scx_bypass_depth--;
+ WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
WARN_ON_ONCE(scx_bypass_depth < 0);
if (scx_bypass_depth != 0)
goto unlock;
+ WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL);
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_DURATION,
ktime_get_ns() - bypass_timestamp);
}
- atomic_inc(&scx_breather_depth);
-
/*
* No task property is changing. We just need to make sure all currently
* queued tasks are re-queued according to the new scx_rq_bypassing()
@@ -3778,10 +4114,8 @@ static void scx_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
- atomic_dec(&scx_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
- scx_clear_softlockup();
}
static void free_exit_info(struct scx_exit_info *ei)
@@ -3834,24 +4168,17 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
-static void free_kick_pseqs_rcu(struct rcu_head *rcu)
-{
- struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
-
- kvfree(pseqs);
-}
-
-static void free_kick_pseqs(void)
+static void free_kick_syncs(void)
{
int cpu;
for_each_possible_cpu(cpu) {
- struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
- struct scx_kick_pseqs *to_free;
+ struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
+ struct scx_kick_syncs *to_free;
- to_free = rcu_replace_pointer(*pseqs, NULL, true);
+ to_free = rcu_replace_pointer(*ksyncs, NULL, true);
if (to_free)
- call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
+ kvfree_rcu(to_free, rcu);
}
}
@@ -3876,6 +4203,7 @@ static void scx_disable_workfn(struct kthread_work *work)
/* guarantee forward progress by bypassing scx_ops */
scx_bypass(true);
+ WRITE_ONCE(scx_aborting, false);
switch (scx_set_enable_state(SCX_DISABLING)) {
case SCX_DISABLING:
@@ -3920,8 +4248,7 @@ static void scx_disable_workfn(struct kthread_work *work)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
update_rq_clock(task_rq(p));
@@ -3989,7 +4316,7 @@ static void scx_disable_workfn(struct kthread_work *work)
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_kick_pseqs();
+ free_kick_syncs();
mutex_unlock(&scx_enable_mutex);
@@ -3998,9 +4325,24 @@ done:
scx_bypass(false);
}
-static void scx_disable(enum scx_exit_kind kind)
+static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+
+ if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
+ return false;
+
+ /*
+ * Some CPUs may be trapped in the dispatch paths. Set the aborting
+ * flag to break potential live-lock scenarios, ensuring we can
+ * successfully reach scx_bypass().
+ */
+ WRITE_ONCE(scx_aborting, true);
+ return true;
+}
+
+static void scx_disable(enum scx_exit_kind kind)
+{
struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
@@ -4009,7 +4351,7 @@ static void scx_disable(enum scx_exit_kind kind)
rcu_read_lock();
sch = rcu_dereference(scx_root);
if (sch) {
- atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ scx_claim_exit(sch, kind);
kthread_queue_work(sch->helper, &sch->disable_work);
}
rcu_read_unlock();
@@ -4238,10 +4580,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
seq_buf_init(&ns, buf, avail);
dump_newline(&ns);
- dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu",
+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
cpu, rq->scx.nr_running, rq->scx.flags,
rq->scx.cpu_released, rq->scx.ops_qseq,
- rq->scx.pnt_seq);
+ rq->scx.kick_sync);
dump_line(&ns, " curr=%s[%d] class=%ps",
rq->curr->comm, rq->curr->pid,
rq->curr->sched_class);
@@ -4325,15 +4667,14 @@ static void scx_error_irq_workfn(struct irq_work *irq_work)
kthread_queue_work(sch->helper, &sch->disable_work);
}
-static void scx_vexit(struct scx_sched *sch,
+static bool scx_vexit(struct scx_sched *sch,
enum scx_exit_kind kind, s64 exit_code,
const char *fmt, va_list args)
{
struct scx_exit_info *ei = sch->exit_info;
- int none = SCX_EXIT_NONE;
- if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
- return;
+ if (!scx_claim_exit(sch, kind))
+ return false;
ei->exit_code = exit_code;
#ifdef CONFIG_STACKTRACE
@@ -4350,9 +4691,10 @@ static void scx_vexit(struct scx_sched *sch,
ei->reason = scx_exit_reason(ei->kind);
irq_work_queue(&sch->error_irq_work);
+ return true;
}
-static int alloc_kick_pseqs(void)
+static int alloc_kick_syncs(void)
{
int cpu;
@@ -4361,19 +4703,19 @@ static int alloc_kick_pseqs(void)
* can exceed percpu allocator limits on large machines.
*/
for_each_possible_cpu(cpu) {
- struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
- struct scx_kick_pseqs *new_pseqs;
+ struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
+ struct scx_kick_syncs *new_ksyncs;
- WARN_ON_ONCE(rcu_access_pointer(*pseqs));
+ WARN_ON_ONCE(rcu_access_pointer(*ksyncs));
- new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!new_pseqs) {
- free_kick_pseqs();
+ new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!new_ksyncs) {
+ free_kick_syncs();
return -ENOMEM;
}
- rcu_assign_pointer(*pseqs, new_pseqs);
+ rcu_assign_pointer(*ksyncs, new_ksyncs);
}
return 0;
@@ -4460,7 +4802,7 @@ err_free_sch:
return ERR_PTR(ret);
}
-static void check_hotplug_seq(struct scx_sched *sch,
+static int check_hotplug_seq(struct scx_sched *sch,
const struct sched_ext_ops *ops)
{
unsigned long long global_hotplug_seq;
@@ -4477,8 +4819,11 @@ static void check_hotplug_seq(struct scx_sched *sch,
SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
"expected hotplug seq %llu did not match actual %llu",
ops->hotplug_seq, global_hotplug_seq);
+ return -EBUSY;
}
}
+
+ return 0;
}
static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
@@ -4505,6 +4850,9 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
+ if (ops->cpu_acquire || ops->cpu_release)
+ pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
+
return 0;
}
@@ -4529,14 +4877,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock;
}
- ret = alloc_kick_pseqs();
+ ret = alloc_kick_syncs();
if (ret)
goto err_unlock;
sch = scx_alloc_and_add_sched(ops);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
- goto err_free_pseqs;
+ goto err_free_ksyncs;
}
/*
@@ -4545,6 +4893,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
WARN_ON_ONCE(scx_root);
+ if (WARN_ON_ONCE(READ_ONCE(scx_aborting)))
+ WRITE_ONCE(scx_aborting, false);
atomic_long_set(&scx_nr_rejected, 0);
@@ -4580,7 +4930,11 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
- check_hotplug_seq(sch, ops);
+ ret = check_hotplug_seq(sch, ops);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
scx_idle_update_selcpu_topology(ops);
cpus_read_unlock();
@@ -4697,21 +5051,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
- if (!tryget_task_struct(p))
+ if (scx_get_task_state(p) != SCX_TASK_READY)
continue;
if (old_class != new_class)
queue_flags |= DEQUEUE_CLASS;
scoped_guard (sched_change, p, queue_flags) {
- p->scx.slice = SCX_SLICE_DFL;
+ p->scx.slice = READ_ONCE(scx_slice_dfl);
p->sched_class = new_class;
}
-
- put_task_struct(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -4735,8 +5086,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return 0;
-err_free_pseqs:
- free_kick_pseqs();
+err_free_ksyncs:
+ free_kick_syncs();
err_unlock:
mutex_unlock(&scx_enable_mutex);
return ret;
@@ -4953,6 +5304,7 @@ static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *fro
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
+static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
#endif
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -4991,6 +5343,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
.cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
.cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
+ .cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
#endif
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
@@ -5064,29 +5417,38 @@ static bool can_skip_idle_kick(struct rq *rq)
return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
}
-static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs)
+static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
{
struct rq *rq = cpu_rq(cpu);
struct scx_rq *this_scx = &this_rq->scx;
+ const struct sched_class *cur_class;
bool should_wait = false;
unsigned long flags;
raw_spin_rq_lock_irqsave(rq, flags);
+ cur_class = rq->curr->sched_class;
/*
* During CPU hotplug, a CPU may depend on kicking itself to make
- * forward progress. Allow kicking self regardless of online state.
+ * forward progress. Allow kicking self regardless of online state. If
+ * @cpu is running a higher class task, we have no control over @cpu.
+ * Skip kicking.
*/
- if (cpu_online(cpu) || cpu == cpu_of(this_rq)) {
+ if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) &&
+ !sched_class_above(cur_class, &ext_sched_class)) {
if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
- if (rq->curr->sched_class == &ext_sched_class)
+ if (cur_class == &ext_sched_class)
rq->curr->scx.slice = 0;
cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
}
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
- pseqs[cpu] = rq->scx.pnt_seq;
- should_wait = true;
+ if (cur_class == &ext_sched_class) {
+ ksyncs[cpu] = rq->scx.kick_sync;
+ should_wait = true;
+ } else {
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ }
}
resched_curr(rq);
@@ -5118,20 +5480,20 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
{
struct rq *this_rq = this_rq();
struct scx_rq *this_scx = &this_rq->scx;
- struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
+ struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
bool should_wait = false;
- unsigned long *pseqs;
+ unsigned long *ksyncs;
s32 cpu;
- if (unlikely(!pseqs_pcpu)) {
- pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
+ if (unlikely(!ksyncs_pcpu)) {
+ pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs");
return;
}
- pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+ ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
for_each_cpu(cpu, this_scx->cpus_to_kick) {
- should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
+ should_wait |= kick_one_cpu(cpu, this_rq, ksyncs);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
}
@@ -5145,20 +5507,21 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
return;
for_each_cpu(cpu, this_scx->cpus_to_wait) {
- unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
+ unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
- if (cpu != cpu_of(this_rq)) {
- /*
- * Pairs with smp_store_release() issued by this CPU in
- * switch_class() on the resched path.
- *
- * We busy-wait here to guarantee that no other task can
- * be scheduled on our core before the target CPU has
- * entered the resched path.
- */
- while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
- cpu_relax();
- }
+ /*
+ * Busy-wait until the task running at the time of kicking is no
+ * longer running. This can be used to implement e.g. core
+ * scheduling.
+ *
+ * smp_cond_load_acquire() pairs with store_releases in
+ * pick_task_scx() and put_prev_task_scx(). The former breaks
+ * the wait if SCX's scheduling path is entered even if the same
+ * task is picked subsequently. The latter is necessary to break
+ * the wait when $cpu is taken by a higher sched class.
+ */
+ if (cpu != cpu_of(this_rq))
+ smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
@@ -5257,6 +5620,7 @@ void __init init_sched_ext_class(void)
int n = cpu_to_node(cpu);
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+ init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
@@ -5362,19 +5726,23 @@ __bpf_kfunc_start_defs();
* exhaustion. If zero, the current residual slice is maintained. If
* %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
* scx_bpf_kick_cpu() to trigger scheduling.
+ *
+ * Returns %true on successful insertion, %false on failure. On the root
+ * scheduler, %false return triggers scheduler abort and the caller doesn't need
+ * to check the return value.
*/
-__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
- u64 enq_flags)
+__bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 enq_flags)
{
struct scx_sched *sch;
guard(rcu)();
sch = rcu_dereference(scx_root);
if (unlikely(!sch))
- return;
+ return false;
if (!scx_dsq_insert_preamble(sch, p, enq_flags))
- return;
+ return false;
if (slice)
p->scx.slice = slice;
@@ -5382,56 +5750,114 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
p->scx.slice = p->scx.slice ?: 1;
scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
+
+ return true;
+}
+
+/*
+ * COMPAT: Will be removed in v6.23 along with the ___v2 suffix.
+ */
+__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 enq_flags)
+{
+ scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags);
+}
+
+static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags)
+{
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
+ return false;
+
+ if (slice)
+ p->scx.slice = slice;
+ else
+ p->scx.slice = p->scx.slice ?: 1;
+
+ p->scx.dsq_vtime = vtime;
+
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+
+ return true;
}
+struct scx_bpf_dsq_insert_vtime_args {
+ /* @p can't be packed together as KF_RCU is not transitive */
+ u64 dsq_id;
+ u64 slice;
+ u64 vtime;
+ u64 enq_flags;
+};
+
/**
- * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion
* @p: task_struct to insert
- * @dsq_id: DSQ to insert into
- * @slice: duration @p can run for in nsecs, 0 to keep the current value
- * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
- * @enq_flags: SCX_ENQ_*
+ * @args: struct containing the rest of the arguments
+ * @args->dsq_id: DSQ to insert into
+ * @args->slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @args->enq_flags: SCX_ENQ_*
+ *
+ * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
+ * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided
+ * as an inline wrapper in common.bpf.h.
*
- * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id.
- * Tasks queued into the priority queue are ordered by @vtime. All other aspects
- * are identical to scx_bpf_dsq_insert().
+ * Insert @p into the vtime priority queue of the DSQ identified by
+ * @args->dsq_id. Tasks queued into the priority queue are ordered by
+ * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert().
*
- * @vtime ordering is according to time_before64() which considers wrapping. A
- * numerically larger vtime may indicate an earlier position in the ordering and
- * vice-versa.
+ * @args->vtime ordering is according to time_before64() which considers
+ * wrapping. A numerically larger vtime may indicate an earlier position in the
+ * ordering and vice-versa.
*
* A DSQ can only be used as a FIFO or priority queue at any given time and this
* function must not be called on a DSQ which already has one or more FIFO tasks
* queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
* SCX_DSQ_GLOBAL) cannot be used as priority queues.
+ *
+ * Returns %true on successful insertion, %false on failure. On the root
+ * scheduler, %false return triggers scheduler abort and the caller doesn't need
+ * to check the return value.
*/
-__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 vtime, u64 enq_flags)
+__bpf_kfunc bool
+__scx_bpf_dsq_insert_vtime(struct task_struct *p,
+ struct scx_bpf_dsq_insert_vtime_args *args)
{
struct scx_sched *sch;
guard(rcu)();
+
sch = rcu_dereference(scx_root);
if (unlikely(!sch))
- return;
+ return false;
- if (!scx_dsq_insert_preamble(sch, p, enq_flags))
- return;
+ return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice,
+ args->vtime, args->enq_flags);
+}
- if (slice)
- p->scx.slice = slice;
- else
- p->scx.slice = p->scx.slice ?: 1;
+/*
+ * COMPAT: Will be removed in v6.23.
+ */
+__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
+{
+ struct scx_sched *sch;
- p->scx.dsq_vtime = vtime;
+ guard(rcu)();
- scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
@@ -5455,6 +5881,13 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
return false;
/*
+ * If the BPF scheduler keeps calling this function repeatedly, it can
+ * cause similar live-lock conditions as consume_dispatch_q().
+ */
+ if (unlikely(READ_ONCE(scx_aborting)))
+ return false;
+
+ /*
* Can be called from either ops.dispatch() locking this_rq() or any
* context where no rq lock is held. If latter, lock @p's task_rq which
* we'll likely need anyway.
@@ -5474,13 +5907,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
raw_spin_rq_lock(src_rq);
}
- /*
- * If the BPF scheduler keeps calling this function repeatedly, it can
- * cause similar live-lock conditions as consume_dispatch_q(). Insert a
- * breather if necessary.
- */
- scx_breather(src_rq);
-
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -5685,8 +6111,9 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
* Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
* lock (e.g. BPF timers or SYSCALL programs).
*
- * Returns %true if @p has been consumed, %false if @p had already been consumed
- * or dequeued.
+ * Returns %true if @p has been consumed, %false if @p had already been
+ * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local
+ * DSQ.
*/
__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
struct task_struct *p, u64 dsq_id,
@@ -5738,32 +6165,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
.set = &scx_kfunc_ids_dispatch,
};
-__bpf_kfunc_start_defs();
-
-/**
- * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
- *
- * Iterate over all of the tasks currently enqueued on the local DSQ of the
- * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
- * processed tasks. Can only be called from ops.cpu_release().
- */
-__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+static u32 reenq_local(struct rq *rq)
{
- struct scx_sched *sch;
LIST_HEAD(tasks);
u32 nr_enqueued = 0;
- struct rq *rq;
struct task_struct *p, *n;
- guard(rcu)();
- sch = rcu_dereference(scx_root);
- if (unlikely(!sch))
- return 0;
-
- if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
- return 0;
-
- rq = cpu_rq(smp_processor_id());
lockdep_assert_rq_held(rq);
/*
@@ -5800,6 +6207,37 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
return nr_enqueued;
}
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
+ * processed tasks. Can only be called from ops.cpu_release().
+ *
+ * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void
+ * returning variant that can be called from anywhere.
+ */
+__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
+ return 0;
+
+ rq = cpu_rq(smp_processor_id());
+ lockdep_assert_rq_held(rq);
+
+ return reenq_local(rq);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
@@ -5872,6 +6310,34 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
__bpf_kfunc_start_defs();
+/**
+ * scx_bpf_task_set_slice - Set task's time slice
+ * @p: task of interest
+ * @slice: time slice to set in nsecs
+ *
+ * Set @p's time slice to @slice. Returns %true on success, %false if the
+ * calling scheduler doesn't have authority over @p.
+ */
+__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
+{
+ p->scx.slice = slice;
+ return true;
+}
+
+/**
+ * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering
+ * @p: task of interest
+ * @vtime: virtual time to set
+ *
+ * Set @p's virtual time to @vtime. Returns %true on success, %false if the
+ * calling scheduler doesn't have authority over @p.
+ */
+__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
+{
+ p->scx.dsq_vtime = vtime;
+ return true;
+}
+
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
{
struct rq *this_rq;
@@ -6029,6 +6495,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
sizeof(struct bpf_iter_scx_dsq));
BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
__alignof__(struct bpf_iter_scx_dsq));
+ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
+ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
/*
* next() and destroy() will be called regardless of the return value.
@@ -6047,9 +6515,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
if (!kit->dsq)
return -ENOENT;
- INIT_LIST_HEAD(&kit->cursor.node);
- kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags;
- kit->cursor.priv = READ_ONCE(kit->dsq->seq);
+ kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags,
+ READ_ONCE(kit->dsq->seq));
return 0;
}
@@ -6123,6 +6590,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
kit->dsq = NULL;
}
+/**
+ * scx_bpf_dsq_peek - Lockless peek at the first element.
+ * @dsq_id: DSQ to examine.
+ *
+ * Read the first element in the DSQ. This is semantically equivalent to using
+ * the DSQ iterator, but is lockfree. Of course, like any lockless operation,
+ * this provides only a point-in-time snapshot, and the contents may change
+ * by the time any subsequent locking operation reads the queue.
+ *
+ * Returns the pointer, or NULL indicates an empty queue OR internal error.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
+{
+ struct scx_sched *sch;
+ struct scx_dispatch_q *dsq;
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) {
+ scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
+ return NULL;
+ }
+
+ dsq = find_user_dsq(sch, dsq_id);
+ if (unlikely(!dsq)) {
+ scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
+ return NULL;
+ }
+
+ return rcu_dereference(dsq->first_task);
+}
+
__bpf_kfunc_end_defs();
static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
@@ -6277,6 +6778,24 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
}
/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
+ * anywhere.
+ */
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
+{
+ struct rq *rq;
+
+ guard(preempt)();
+
+ rq = this_rq();
+ local_set(&rq->scx.reenq_local_deferred, 1);
+ schedule_deferred(rq);
+}
+
+/**
* scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
* @cpu: CPU of interest
*
@@ -6677,15 +7196,19 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
+BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU);
+BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU);
BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
@@ -6776,6 +7299,12 @@ static int __init scx_init(void)
return ret;
}
+ if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
+ !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
+ pr_err("sched_ext: Failed to allocate cpumasks\n");
+ return -ENOMEM;
+ }
+
return 0;
}
__initcall(scx_init);
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index d2434c954848..3d9d404d5cd2 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
return prev_cpu;
}
+struct scx_bpf_select_cpu_and_args {
+ /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */
+ s32 prev_cpu;
+ u64 wake_flags;
+ u64 flags;
+};
+
/**
- * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
- * prioritizing those in @cpus_allowed
+ * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask
* @p: task_struct to select a CPU for
- * @prev_cpu: CPU @p was on previously
- * @wake_flags: %SCX_WAKE_* flags
* @cpus_allowed: cpumask of allowed CPUs
- * @flags: %SCX_PICK_IDLE* flags
+ * @args: struct containing the rest of the arguments
+ * @args->prev_cpu: CPU @p was on previously
+ * @args->wake_flags: %SCX_WAKE_* flags
+ * @args->flags: %SCX_PICK_IDLE* flags
+ *
+ * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
+ * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
+ * as an inline wrapper in common.bpf.h.
*
* Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
* context such as a BPF test_run() call, as long as built-in CPU selection
* is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
* is set.
*
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu().
*
* Returns the selected idle CPU, which will be automatically awakened upon
* returning from ops.select_cpu() and can be used for direct dispatch, or
* a negative value if no idle CPU is available.
*/
+__bpf_kfunc s32
+__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
+ struct scx_bpf_select_cpu_and_args *args)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags,
+ cpus_allowed, args->flags);
+}
+
+/*
+ * COMPAT: Will be removed in v6.22.
+ */
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags)
{
@@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index b3617abed510..386c677e4c9a 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -23,6 +23,11 @@ enum scx_consts {
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/
SCX_TASK_ITER_BATCH = 32,
+
+ SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
+ SCX_BYPASS_LB_DONOR_PCT = 125,
+ SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
+ SCX_BYPASS_LB_BATCH = 256,
};
enum scx_exit_kind {
@@ -697,12 +702,23 @@ struct sched_ext_ops {
* 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
* interpreted in the same fashion and specifies how much @cgrp can
* burst temporarily. The specific control mechanism and thus the
- * interpretation of @period_us and burstiness is upto to the BPF
+ * interpretation of @period_us and burstiness is up to the BPF
* scheduler.
*/
void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
u64 period_us, u64 quota_us, u64 burst_us);
+ /**
+ * @cgroup_set_idle: A cgroup's idle state is being changed
+ * @cgrp: cgroup whose idle state is being updated
+ * @idle: whether the cgroup is entering or exiting idle state
+ *
+ * Update @cgrp's idle state to @idle. This callback is invoked when
+ * a cgroup transitions between idle and non-idle states, allowing the
+ * BPF scheduler to adjust its behavior accordingly.
+ */
+ void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
+
#endif /* CONFIG_EXT_GROUP_SCHED */
/*
@@ -884,6 +900,10 @@ struct scx_sched {
struct scx_dispatch_q **global_dsqs;
struct scx_sched_pcpu __percpu *pcpu;
+ /*
+ * Updates to the following warned bitfields can race causing RMW issues
+ * but it doesn't really matter.
+ */
bool warned_zero_slice:1;
bool warned_deprecated_rq:1;
@@ -948,6 +968,7 @@ enum scx_enq_flags {
SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+ SCX_ENQ_NESTED = 1LLU << 58,
};
enum scx_deq_flags {
@@ -986,8 +1007,10 @@ enum scx_kick_flags {
SCX_KICK_PREEMPT = 1LLU << 1,
/*
- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
- * return after the target CPU finishes picking the next task.
+ * The scx_bpf_kick_cpu() call will return after the current SCX task of
+ * the target CPU switches out. This can be used to implement e.g. core
+ * scheduling. This has no effect if the current task on the target CPU
+ * is not on SCX.
*/
SCX_KICK_WAIT = 1LLU << 2,
};
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1cb7a3d70e65..c174afe1dd17 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -131,12 +131,13 @@ void __cpuidle default_idle_call(void)
}
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev,
+ u64 max_latency_ns)
{
if (current_clr_polling_and_test())
return -EBUSY;
- return cpuidle_enter_s2idle(drv, dev);
+ return cpuidle_enter_s2idle(drv, dev, max_latency_ns);
}
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
@@ -205,12 +206,13 @@ static void cpuidle_idle_call(void)
u64 max_latency_ns;
if (idle_should_enter_s2idle()) {
+ max_latency_ns = cpu_wakeup_latency_qos_limit() *
+ NSEC_PER_USEC;
- entered_state = call_cpuidle_s2idle(drv, dev);
+ entered_state = call_cpuidle_s2idle(drv, dev,
+ max_latency_ns);
if (entered_state > 0)
goto exit_idle;
-
- max_latency_ns = U64_MAX;
} else {
max_latency_ns = dev->forced_idle_latency_limit_ns;
}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index a4cf17b1fab0..3ad0d6df6a0a 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -167,6 +167,29 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
}
}
+ /*
+ * Check the combination of nohz_full and isolcpus=domain,
+ * necessary to avoid problems with the timer migration
+ * hierarchy. managed_irq is ignored by this check since it
+ * isn't considered in the timer migration logic.
+ */
+ iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
+ type = find_first_bit(&iter_flags, HK_TYPE_MAX);
+ /*
+ * Pass the check if none of these flags were previously set or
+ * are not in the current selection.
+ */
+ iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
+ first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
+ cpumask_first_and_and(cpu_present_mask,
+ housekeeping_staging, housekeeping.cpumasks[type]);
+ if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
+ pr_warn("Housekeeping: must include one present CPU "
+ "neither in nohz_full= nor in isolcpus=domain, "
+ "ignoring setting %s\n", str);
+ goto free_housekeeping_staging;
+ }
+
iter_flags = flags & ~housekeeping.flags;
for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 62fba83b7bb1..623445603725 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
* is negligible.
*/
smp_mb();
- rseq_preempt(current);
+ rseq_sched_switch_event(current);
}
static void ipi_sync_rq_state(void *info)
@@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id)
* membarrier, we will end up with some thread in the mm
* running without a core sync.
*
- * For RSEQ, don't rseq_preempt() the caller. User code
- * is not supposed to issue syscalls at all from inside an
- * rseq critical section.
+ * For RSEQ, don't invoke rseq_sched_switch_event() on the
+ * caller. User code is not supposed to issue syscalls at
+ * all from inside an rseq critical section.
*/
if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
preempt_disable();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b419a4d98461..bbf513b3e76c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -803,10 +803,12 @@ struct scx_rq {
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
- unsigned long pnt_seq;
+ unsigned long kick_sync;
+ local_t reenq_local_deferred;
struct balance_callback deferred_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
+ struct scx_dispatch_q bypass_dsq;
};
#endif /* CONFIG_SCHED_CLASS_EXT */
@@ -2223,6 +2225,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
+ rseq_sched_set_ids_changed(p);
#endif /* CONFIG_SMP */
}
@@ -3679,283 +3682,212 @@ extern const char *preempt_modes[];
#ifdef CONFIG_SCHED_MM_CID
-#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
-#define MM_CID_SCAN_DELAY 100 /* 100ms */
+static __always_inline bool cid_on_cpu(unsigned int cid)
+{
+ return cid & MM_CID_ONCPU;
+}
-extern raw_spinlock_t cid_lock;
-extern int use_cid_lock;
+static __always_inline bool cid_in_transit(unsigned int cid)
+{
+ return cid & MM_CID_TRANSIT;
+}
-extern void sched_mm_cid_migrate_from(struct task_struct *t);
-extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
-extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
-extern void init_sched_mm_cid(struct task_struct *t);
+static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_ONCPU;
+}
-static inline void __mm_cid_put(struct mm_struct *mm, int cid)
+static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid)
{
- if (cid < 0)
- return;
- cpumask_clear_cpu(cid, mm_cidmask(mm));
+ return cid | MM_CID_ONCPU;
}
-/*
- * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
- * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
- * be held to transition to other states.
- *
- * State transitions synchronized with cmpxchg or try_cmpxchg need to be
- * consistent across CPUs, which prevents use of this_cpu_cmpxchg.
- */
-static inline void mm_cid_put_lazy(struct task_struct *t)
+static __always_inline unsigned int cid_to_transit_cid(unsigned int cid)
{
- struct mm_struct *mm = t->mm;
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid;
+ return cid | MM_CID_TRANSIT;
+}
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- if (!mm_cid_is_lazy_put(cid) ||
- !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+static __always_inline unsigned int cid_from_transit_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_TRANSIT;
}
-static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
+static __always_inline bool cid_on_task(unsigned int cid)
{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, res;
+ /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */
+ return cid < MM_CID_TRANSIT;
+}
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- for (;;) {
- if (mm_cid_is_unset(cid))
- return MM_CID_UNSET;
- /*
- * Attempt transition from valid or lazy-put to unset.
- */
- res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
- if (res == cid)
- break;
- cid = res;
- }
- return cid;
+static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid)
+{
+ clear_bit(cid, mm_cidmask(mm));
}
-static inline void mm_cid_put(struct mm_struct *mm)
+static __always_inline void mm_unset_cid_on_task(struct task_struct *t)
{
- int cid;
+ unsigned int cid = t->mm_cid.cid;
- lockdep_assert_irqs_disabled();
- cid = mm_cid_pcpu_unset(mm);
- if (cid == MM_CID_UNSET)
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ t->mm_cid.cid = MM_CID_UNSET;
+ if (cid_on_task(cid))
+ mm_drop_cid(t->mm, cid);
}
-static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
+static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp)
{
- struct cpumask *cidmask = mm_cidmask(mm);
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, max_nr_cid, allowed_max_nr_cid;
+ /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */
+ pcp->cid = cpu_cid_to_cid(pcp->cid);
+ mm_drop_cid(mm, pcp->cid);
+}
- /*
- * After shrinking the number of threads or reducing the number
- * of allowed cpus, reduce the value of max_nr_cid so expansion
- * of cid allocation will preserve cache locality if the number
- * of threads or allowed cpus increase again.
- */
- max_nr_cid = atomic_read(&mm->max_nr_cid);
- while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed),
- atomic_read(&mm->mm_users))),
- max_nr_cid > allowed_max_nr_cid) {
- /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */
- if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) {
- max_nr_cid = allowed_max_nr_cid;
- break;
- }
- }
- /* Try to re-use recent cid. This improves cache locality. */
- cid = __this_cpu_read(pcpu_cid->recent_cid);
- if (!mm_cid_is_unset(cid) && cid < max_nr_cid &&
- !cpumask_test_and_set_cpu(cid, cidmask))
- return cid;
- /*
- * Expand cid allocation if the maximum number of concurrency
- * IDs allocated (max_nr_cid) is below the number cpus allowed
- * and number of threads. Expanding cid allocation as much as
- * possible improves cache locality.
- */
- cid = max_nr_cid;
- while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
- /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */
- if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
- continue;
- if (!cpumask_test_and_set_cpu(cid, cidmask))
- return cid;
- }
- /*
- * Find the first available concurrency id.
- * Retry finding first zero bit if the mask is temporarily
- * filled. This only happens during concurrent remote-clear
- * which owns a cid without holding a rq lock.
- */
- for (;;) {
- cid = cpumask_first_zero(cidmask);
- if (cid < READ_ONCE(mm->nr_cpus_allowed))
- break;
- cpu_relax();
- }
- if (cpumask_test_and_set_cpu(cid, cidmask))
- return -1;
+static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
+{
+ unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids);
+ if (cid >= max_cids)
+ return MM_CID_UNSET;
+ if (test_and_set_bit(cid, mm_cidmask(mm)))
+ return MM_CID_UNSET;
return cid;
}
-/*
- * Save a snapshot of the current runqueue time of this cpu
- * with the per-cpu cid value, allowing to estimate how recently it was used.
- */
-static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
+static inline unsigned int mm_get_cid(struct mm_struct *mm)
{
- struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+ unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
- lockdep_assert_rq_held(rq);
- WRITE_ONCE(pcpu_cid->time, rq->clock);
+ while (cid == MM_CID_UNSET) {
+ cpu_relax();
+ cid = __mm_get_cid(mm, num_possible_cpus());
+ }
+ return cid;
}
-static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
- struct mm_struct *mm)
+static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid,
+ unsigned int max_cids)
{
- int cid;
+ unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid);
- /*
- * All allocations (even those using the cid_lock) are lock-free. If
- * use_cid_lock is set, hold the cid_lock to perform cid allocation to
- * guarantee forward progress.
- */
- if (!READ_ONCE(use_cid_lock)) {
- cid = __mm_cid_try_get(t, mm);
- if (cid >= 0)
- goto end;
- raw_spin_lock(&cid_lock);
- } else {
- raw_spin_lock(&cid_lock);
- cid = __mm_cid_try_get(t, mm);
- if (cid >= 0)
- goto unlock;
+ /* Is it in the optimal CID space? */
+ if (likely(cid < max_cids))
+ return orig_cid;
+
+ /* Try to find one in the optimal space. Otherwise keep the provided. */
+ new_cid = __mm_get_cid(mm, max_cids);
+ if (new_cid != MM_CID_UNSET) {
+ mm_drop_cid(mm, cid);
+ /* Preserve the ONCPU mode of the original CID */
+ return new_cid | (orig_cid & MM_CID_ONCPU);
}
+ return orig_cid;
+}
- /*
- * cid concurrently allocated. Retry while forcing following
- * allocations to use the cid_lock to ensure forward progress.
- */
- WRITE_ONCE(use_cid_lock, 1);
- /*
- * Set use_cid_lock before allocation. Only care about program order
- * because this is only required for forward progress.
- */
- barrier();
- /*
- * Retry until it succeeds. It is guaranteed to eventually succeed once
- * all newcoming allocations observe the use_cid_lock flag set.
- */
- do {
- cid = __mm_cid_try_get(t, mm);
- cpu_relax();
- } while (cid < 0);
- /*
- * Allocate before clearing use_cid_lock. Only care about
- * program order because this is for forward progress.
- */
- barrier();
- WRITE_ONCE(use_cid_lock, 0);
-unlock:
- raw_spin_unlock(&cid_lock);
-end:
- mm_cid_snapshot_time(rq, mm);
+static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid)
+{
+ if (t->mm_cid.cid != cid) {
+ t->mm_cid.cid = cid;
+ rseq_sched_set_ids_changed(t);
+ }
+}
- return cid;
+static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+{
+ __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
}
-static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
- struct mm_struct *mm)
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid;
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
- lockdep_assert_rq_held(rq);
- cid = __this_cpu_read(pcpu_cid->cid);
- if (mm_cid_is_valid(cid)) {
- mm_cid_snapshot_time(rq, mm);
- return cid;
- }
- if (mm_cid_is_lazy_put(cid)) {
- if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case where both have the ONCPU bit set */
+ if (likely(cid_on_cpu(cpu_cid & tcid))) {
+ if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) {
+ mm_cid_update_task_cid(t, cpu_cid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids);
+ } else {
+ /* Hand over or drop the task owned CID */
+ if (cid_on_task(tcid)) {
+ if (cid_on_cpu(cpu_cid))
+ mm_unset_cid_on_task(t);
+ else
+ cpu_cid = cid_to_cpu_cid(tcid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_cpu(cpu_cid))
+ cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
}
- cid = __mm_cid_get(rq, t, mm);
- __this_cpu_write(pcpu_cid->cid, cid);
- __this_cpu_write(pcpu_cid->recent_cid, cid);
-
- return cid;
+ mm_cid_update_pcpu_cid(mm, cpu_cid);
+ mm_cid_update_task_cid(t, cpu_cid);
}
-static inline void switch_mm_cid(struct rq *rq,
- struct task_struct *prev,
- struct task_struct *next)
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
{
- /*
- * Provide a memory barrier between rq->curr store and load of
- * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
- *
- * Should be adapted if context_switch() is modified.
- */
- if (!next->mm) { // to kernel
- /*
- * user -> kernel transition does not guarantee a barrier, but
- * we can use the fact that it performs an atomic operation in
- * mmgrab().
- */
- if (prev->mm) // from user
- smp_mb__after_mmgrab();
- /*
- * kernel -> kernel transition does not change rq->curr->mm
- * state. It stays NULL.
- */
- } else { // to user
- /*
- * kernel -> user transition does not provide a barrier
- * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
- * Provide it here.
- */
- if (!prev->mm) { // from kernel
- smp_mb();
- } else { // from user
- /*
- * user->user transition relies on an implicit
- * memory barrier in switch_mm() when
- * current->mm changes. If the architecture
- * switch_mm() does not have an implicit memory
- * barrier, it is emitted here. If current->mm
- * is unchanged, no barrier is needed.
- */
- smp_mb__after_switch_mm();
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case, where both have the ONCPU bit clear */
+ if (likely(cid_on_task(tcid | cpu_cid))) {
+ if (likely(tcid < max_cids)) {
+ mm_cid_update_pcpu_cid(mm, tcid);
+ return;
}
+ /* Try to converge into the optimal CID space */
+ tcid = mm_cid_converge(mm, tcid, max_cids);
+ } else {
+ /* Hand over or drop the CPU owned CID */
+ if (cid_on_cpu(cpu_cid)) {
+ if (cid_on_task(tcid))
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+ else
+ tcid = cpu_cid_to_cid(cpu_cid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_task(tcid))
+ tcid = mm_get_cid(mm);
+ /* Set the transition mode flag if required */
+ tcid |= READ_ONCE(mm->mm_cid.transit);
}
- if (prev->mm_cid_active) {
- mm_cid_snapshot_time(rq, prev->mm);
- mm_cid_put_lazy(prev);
- prev->mm_cid = -1;
- }
- if (next->mm_cid_active)
- next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
+ mm_cid_update_pcpu_cid(mm, tcid);
+ mm_cid_update_task_cid(t, tcid);
+}
+
+static __always_inline void mm_cid_schedin(struct task_struct *next)
+{
+ struct mm_struct *mm = next->mm;
+ unsigned int cpu_cid;
+
+ if (!next->mm_cid.active)
+ return;
+
+ cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+ if (likely(!READ_ONCE(mm->mm_cid.percpu)))
+ mm_cid_from_task(next, cpu_cid);
+ else
+ mm_cid_from_cpu(next, cpu_cid);
+}
+
+static __always_inline void mm_cid_schedout(struct task_struct *prev)
+{
+ /* During mode transitions CIDs are temporary and need to be dropped */
+ if (likely(!cid_in_transit(prev->mm_cid.cid)))
+ return;
+
+ mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
+ prev->mm_cid.cid = MM_CID_UNSET;
+}
+
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+ mm_cid_schedout(prev);
+ mm_cid_schedin(next);
}
#else /* !CONFIG_SCHED_MM_CID: */
-static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
-static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
-static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
-static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
-static inline void init_sched_mm_cid(struct task_struct *t) { }
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 807879131add..0496dc29ed0f 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -824,6 +824,19 @@ void sched_set_fifo_low(struct task_struct *p)
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+/*
+ * Used when the primary interrupt handler is forced into a thread, in addition
+ * to the (always threaded) secondary handler. The secondary handler gets a
+ * slightly lower priority so that the primary handler can preempt it, thereby
+ * emulating the behavior of a non-PREEMPT_RT system where the primary handler
+ * runs in hard interrupt context.
+ */
+void sched_set_fifo_secondary(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+
void sched_set_normal(struct task_struct *p, int nice)
{
struct sched_attr attr = {