-rw-r--r--  arch/x86/include/asm/topology.h    2
-rw-r--r--  arch/x86/kernel/smpboot.c         70
-rw-r--r--  include/linux/cleanup.h            5
-rw-r--r--  include/linux/sched.h             11
-rw-r--r--  kernel/cgroup/cpuset.c             2
-rw-r--r--  kernel/kthread.c                  15
-rw-r--r--  kernel/sched/core.c              376
-rw-r--r--  kernel/sched/cpudeadline.c        34
-rw-r--r--  kernel/sched/cpudeadline.h         4
-rw-r--r--  kernel/sched/deadline.c           48
-rw-r--r--  kernel/sched/ext.c               131
-rw-r--r--  kernel/sched/fair.c               64
-rw-r--r--  kernel/sched/idle.c               13
-rw-r--r--  kernel/sched/rt.c                 11
-rw-r--r--  kernel/sched/sched.h             252
-rw-r--r--  kernel/sched/stats.h               2
-rw-r--r--  kernel/sched/stop_task.c          13
-rw-r--r--  kernel/sched/syscalls.c           82
-rw-r--r--  kernel/sched/topology.c          108
19 files changed, 683 insertions, 560 deletions
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21041898157a..df943881f15b 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick
+extern int arch_sched_node_distance(int from, int to);
+
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index eb289abece23..5709c9cab195 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -515,6 +515,76 @@ static void __init build_sched_topology(void)
set_sched_topology(topology);
}
+#ifdef CONFIG_NUMA
+static int sched_avg_remote_distance;
+static int avg_remote_numa_distance(void)
+{
+ int i, j;
+ int distance, nr_remote, total_distance;
+
+ if (sched_avg_remote_distance > 0)
+ return sched_avg_remote_distance;
+
+ nr_remote = 0;
+ total_distance = 0;
+ for_each_node_state(i, N_CPU) {
+ for_each_node_state(j, N_CPU) {
+ distance = node_distance(i, j);
+
+ if (distance >= REMOTE_DISTANCE) {
+ nr_remote++;
+ total_distance += distance;
+ }
+ }
+ }
+ if (nr_remote)
+ sched_avg_remote_distance = total_distance / nr_remote;
+ else
+ sched_avg_remote_distance = REMOTE_DISTANCE;
+
+ return sched_avg_remote_distance;
+}
+
+int arch_sched_node_distance(int from, int to)
+{
+ int d = node_distance(from, to);
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_ATOM_DARKMONT_X:
+
+ if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ d < REMOTE_DISTANCE)
+ return d;
+
+ /*
+ * With SNC enabled there can be many distinct remote NUMA node
+ * distances, which would create extra NUMA domain levels that mix
+ * local nodes with partial sets of remote nodes.
+ *
+ * For the purpose of building sched domains, drop the finer
+ * distance distinctions between NUMA nodes in a remote package and
+ * put all NUMA nodes of the remote package in the same sched
+ * group. This keeps the NUMA domains simple and avoids extra NUMA
+ * levels that span different remote NUMA nodes and local nodes.
+ *
+ * GNR and CWF are not expected in systems with more than 2
+ * packages or more than 2 hops between packages. A single average
+ * remote distance would not be appropriate with more than 2
+ * packages, because the average distance to each remote package
+ * could differ.
+ */
+ WARN_ONCE(topology_max_packages() > 2,
+ "sched: Expect only up to 2 packages for GNR or CWF, "
+ "but saw %d packages when building sched domains.",
+ topology_max_packages());
+
+ d = avg_remote_numa_distance();
+ }
+ return d;
+}
+#endif /* CONFIG_NUMA */
+
void set_cpu_sibling_map(int cpu)
{
bool has_smt = __max_threads_per_core > 1;
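
The trimming above reduces every remote distance to a single average. A standalone sketch (not part of the patch) of the same reduction, using a hypothetical SLIT for a 2-package system with SNC enabled:

#include <stdio.h>

#define REMOTE_DISTANCE 20	/* same threshold the kernel uses */

/* Hypothetical SLIT: 2 packages x 2 SNC nodes each. */
static const int node_distance[4][4] = {
	{ 10, 12, 21, 23 },
	{ 12, 10, 23, 21 },
	{ 21, 23, 10, 12 },
	{ 23, 21, 12, 10 },
};

int main(void)
{
	int nr_remote = 0, total = 0;

	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			if (node_distance[i][j] >= REMOTE_DISTANCE) {
				nr_remote++;
				total += node_distance[i][j];
			}

	/* All remote distances collapse to their average: (21+23)*4/8 = 22. */
	printf("avg remote distance: %d\n", total / nr_remote);
	return 0;
}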
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585b7f06..ae381675455d 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -340,6 +340,11 @@ _label: \
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \
+ __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
+ static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+ { return (void *)1; }
+
#define __GUARD_IS_ERR(_ptr) \
({ \
unsigned long _rc = (__force unsigned long)(_ptr); \
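
The new helper stamps out the unconditional variant of a guard class: its lock_ptr trivially returns a non-NULL token so success checks always pass. A rough userspace analogue of that convention (a simplification, not the kernel's actual macro expansion):

#include <stdio.h>
#include <stdbool.h>

/* Per-class helpers, mirroring the cleanup.h naming convention. */
struct myguard { bool acquired; };

static const bool class_myguard_is_conditional = false;

static struct myguard class_myguard_constructor(void)
{
	/* An unconditional guard can never fail to "acquire". */
	return (struct myguard){ .acquired = true };
}

static void *class_myguard_lock_ptr(struct myguard *g)
{
	/* Non-NULL token: callers testing for success always see success. */
	(void)g;
	return (void *)1;
}

int main(void)
{
	struct myguard scope = class_myguard_constructor();

	/* Generic guard check: run the body if the lock pointer is
	 * non-NULL or the class is unconditional. */
	if (class_myguard_lock_ptr(&scope) || !class_myguard_is_conditional)
		printf("guarded body runs\n");

	return 0;
}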
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866..07576479c0ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -637,8 +637,8 @@ struct sched_rt_entity {
#endif
} __randomize_layout;
-typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
-typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+struct rq_flags;
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf);
struct sched_dl_entity {
struct rb_node rb_node;
@@ -730,9 +730,6 @@ struct sched_dl_entity {
* dl_server_update().
*
* @rq the runqueue this server is for
- *
- * @server_has_tasks() returns true if @server_pick return a
- * runnable task.
*/
struct rq *rq;
dl_server_pick_f server_pick_task;
@@ -1861,8 +1858,8 @@ extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
-/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
-extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
+extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);
/**
* set_cpus_allowed_ptr - set CPU affinity mask of a task
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 52468d2c178a..185e820cd1df 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs_mask = task_cs(tsk)->cpus_allowed;
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
- do_set_cpus_allowed(tsk, cs_mask);
+ set_cpus_allowed_force(tsk, cs_mask);
changed = true;
}
rcu_read_unlock();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 31b072e8d427..99a3808d086f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
- unsigned long flags;
-
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ set_cpus_allowed_force(p, mask);
+
/* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
@@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
struct kthread *kthread = to_kthread(p);
cpumask_var_t affinity;
- unsigned long flags;
int ret = 0;
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
@@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
kthread_fetch_affinity(kthread, affinity);
- /* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, affinity);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ set_cpus_allowed_force(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
out:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 198d2dd45f59..096e8d03d85e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -583,8 +583,8 @@ EXPORT_SYMBOL(__trace_set_current_state);
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
- * is set by activate_task() and cleared by deactivate_task(), under
- * rq->lock. Non-zero indicates the task is runnable, the special
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
@@ -2089,6 +2089,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
+ rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2121,6 +2122,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
+ rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2169,37 +2171,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class)
-{
- if (prev_class != p->sched_class && p->sched_class->switching_to)
- p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
-{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
-
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
@@ -2362,7 +2333,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
@@ -2377,10 +2348,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
if (p->cpus_ptr != &p->cpus_mask)
return;
- /*
- * Violates locking rules! See comment in __do_set_cpus_allowed().
- */
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
}
void ___migrate_enable(void)
@@ -2613,7 +2582,8 @@ static int migration_cpu_stop(void *data)
*/
WARN_ON_ONCE(!pending->stop_pending);
preempt_disable();
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
preempt_enable();
@@ -2622,7 +2592,8 @@ static int migration_cpu_stop(void *data)
out:
if (pending)
pending->stop_pending = false;
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
if (complete)
complete_all(&pending->done);
@@ -2693,56 +2664,19 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- struct rq *rq = task_rq(p);
- bool queued, running;
-
- /*
- * This here violates the locking rules for affinity, since we're only
- * supposed to change these variables while holding both rq->lock and
- * p->pi_lock.
- *
- * HOWEVER, it magically works, because ttwu() is the only code that
- * accesses these variables under p->pi_lock and only does so after
- * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
- * before finish_task().
- *
- * XXX do further audits, this smells like something putrid.
- */
- if (ctx->flags & SCA_MIGRATE_DISABLE)
- WARN_ON_ONCE(!p->on_cpu);
- else
- lockdep_assert_held(&p->pi_lock);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued) {
- /*
- * Because __kthread_bind() calls this on blocked tasks without
- * holding rq->lock.
- */
- lockdep_assert_rq_held(rq);
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
+ p->sched_class->set_cpus_allowed(p, ctx);
+ mm_set_cpus_allowed(p->mm, ctx->new_mask);
}
- if (running)
- put_prev_task(rq, p);
-
- p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
}
/*
* Used for kthread_bind() and select_fallback_rq(), in both cases the user
* affinity (if any) should be destroyed too.
*/
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
@@ -2754,7 +2688,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
struct rcu_head rcu;
};
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (__task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -2792,7 +2727,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
* Use pi_lock to protect content of user_cpus_ptr
*
* Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
- * do_set_cpus_allowed().
+ * set_cpus_allowed_force().
*/
raw_spin_lock_irqsave(&src->pi_lock, flags);
if (src->user_cpus_ptr) {
@@ -3120,7 +3055,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
goto out;
}
- __do_set_cpus_allowed(p, ctx);
+ do_set_cpus_allowed(p, ctx);
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
@@ -3529,13 +3464,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
- /*
- * XXX When called from select_task_rq() we only
- * hold p->pi_lock and again violate locking order.
- *
- * More yuck to audit.
- */
- do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
+ set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
@@ -3777,7 +3706,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
ttwu_do_wakeup(p);
ret = 1;
}
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
return ret;
}
@@ -4231,7 +4160,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * schedule()'s block_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
@@ -4370,7 +4299,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
ret = func(p, arg);
if (rq)
- rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
@@ -5916,19 +5845,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
-#ifdef CONFIG_SCHED_CLASS_EXT
- /*
- * SCX requires a balance() call before every pick_task() including when
- * waking up from SCHED_IDLE. If @start_class is below SCX, start from
- * SCX instead. Also, set a flag to detect missing balance() call.
- */
- if (scx_enabled()) {
- rq->scx.flags |= SCX_RQ_BAL_PENDING;
- if (sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
- }
-#endif
-
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
@@ -5972,7 +5888,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* Assume the next prioritized class is idle_sched_class */
if (!p) {
- p = pick_task_idle(rq);
+ p = pick_task_idle(rq, rf);
put_prev_set_next_task(rq, prev, p);
}
@@ -5984,11 +5900,15 @@ restart:
for_each_active_class(class) {
if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev);
+ p = class->pick_next_task(rq, prev, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
if (p)
return p;
} else {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
if (p) {
put_prev_set_next_task(rq, prev, p);
return p;
@@ -6018,7 +5938,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}
-static inline struct task_struct *pick_task(struct rq *rq)
+/*
+ * Careful; this can return RETRY_TASK, it does not include the retry-loop
+ * itself due to the whole SMT pick retry thing below.
+ */
+static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -6026,7 +5950,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
rq->dl_server = NULL;
for_each_active_class(class) {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
if (p)
return p;
}
@@ -6041,7 +5965,7 @@ static void queue_core_balance(struct rq *rq);
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct task_struct *next, *p, *max = NULL;
+ struct task_struct *next, *p, *max;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
@@ -6126,7 +6050,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* and there are no cookied tasks running on siblings.
*/
if (!need_sync) {
- next = pick_task(rq);
+restart_single:
+ next = pick_task(rq, rf);
+ if (unlikely(next == RETRY_TASK))
+ goto restart_single;
if (!next->core_cookie) {
rq->core_pick = NULL;
rq->core_dl_server = NULL;
@@ -6146,6 +6073,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* Tie-break prio towards the current CPU
*/
+restart_multi:
+ max = NULL;
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
@@ -6157,7 +6086,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
- rq_i->core_pick = p = pick_task(rq_i);
+ p = pick_task(rq_i, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart_multi;
+
+ rq_i->core_pick = p;
rq_i->core_dl_server = rq_i->dl_server;
if (!max || prio_less(max, p, fi_before))
@@ -6179,7 +6112,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (cookie)
p = sched_core_find(rq_i, cookie);
if (!p)
- p = idle_sched_class.pick_task(rq_i);
+ p = idle_sched_class.pick_task(rq_i, rf);
}
rq_i->core_pick = p;
@@ -6812,6 +6745,7 @@ static void __sched notrace __schedule(int sched_mode)
local_irq_disable();
rcu_note_context_switch(preempt);
+ migrate_disable_switch(rq, prev);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -6918,7 +6852,6 @@ keep_resched:
*/
++*switch_count;
- migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
prev->se.sched_delayed);
@@ -7326,7 +7259,7 @@ void rt_mutex_post_schedule(void)
*/
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int prio, oldprio, queued, running, queue_flag =
+ int prio, oldprio, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
@@ -7388,64 +7321,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
prev_class = p->sched_class;
next_class = __setscheduler_class(p->policy, prio);
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
+ if (prev_class != next_class)
+ queue_flag |= DEQUEUE_CLASS;
- /*
- * Boosting condition are:
- * 1. -rt task is running and holds mutex A
- * --> -dl task blocks on mutex A
- *
- * 2. -dl task is running and holds mutex A
- * --> -dl task blocks on mutex A and could preempt the
- * running task
- */
- if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_prio(pi_task->prio) &&
- dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.pi_se = pi_task->dl.pi_se;
- queue_flag |= ENQUEUE_REPLENISH;
+ scoped_guard (sched_change, p, queue_flag) {
+ /*
+ * Boosting conditions are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ scope->flags |= ENQUEUE_HEAD;
} else {
- p->dl.pi_se = &p->dl;
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
}
- } else if (rt_prio(prio)) {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (oldprio < prio)
- queue_flag |= ENQUEUE_HEAD;
- } else {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (rt_prio(oldprio))
- p->rt.timeout = 0;
- }
-
- p->sched_class = next_class;
- p->prio = prio;
-
- check_class_changing(rq, p, prev_class);
- if (queued)
- enqueue_task(rq, p, queue_flag);
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
+ p->sched_class = next_class;
+ p->prio = prio;
+ }
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_rq_unlock(rq);
+ rq_repin_lock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
preempt_enable();
}
@@ -8084,26 +8004,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- bool queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
- task_rq_unlock(rq, p, &rf);
+ guard(task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->numa_preferred_nid = nid;
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -9205,38 +9108,23 @@ static void sched_change_group(struct task_struct *tsk)
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- int queued, running, queue_flags =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- update_rq_clock(rq);
-
- running = task_current_donor(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, queue_flags);
- if (running)
- put_prev_task(rq, tsk);
-
- sched_change_group(tsk);
- if (!for_autogroup)
- scx_cgroup_move_task(tsk);
+ scoped_guard (sched_change, tsk, queue_flags) {
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+ if (scope->running)
+ resched = true;
+ }
- if (queued)
- enqueue_task(rq, tsk, queue_flags);
- if (running) {
- set_next_task(rq, tsk);
- /*
- * After changing group, the running task may have joined a
- * throttled one but it's still the running task. Trigger a
- * resched to make sure that task can still run.
- */
+ if (resched)
resched_curr(rq);
- }
}
static struct cgroup_subsys_state *
@@ -10892,37 +10780,75 @@ void sched_mm_cid_fork(struct task_struct *t)
}
#endif /* CONFIG_SCHED_MM_CID */
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx)
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
+ struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
+ /*
+ * Must exclusively use matched flags since this is both dequeue and
+ * enqueue.
+ */
+ WARN_ON_ONCE(flags & 0xFFFF0000);
+
lockdep_assert_rq_held(rq);
- *ctx = (struct sched_enq_and_set_ctx){
+ if (!(flags & DEQUEUE_NOCLOCK)) {
+ update_rq_clock(rq);
+ flags |= DEQUEUE_NOCLOCK;
+ }
+
+ if (flags & DEQUEUE_CLASS) {
+ if (p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
+ }
+
+ *ctx = (struct sched_change_ctx){
.p = p,
- .queue_flags = queue_flags,
+ .flags = flags,
.queued = task_on_rq_queued(p),
- .running = task_current(rq, p),
+ .running = task_current_donor(rq, p),
};
- update_rq_clock(rq);
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
if (ctx->queued)
- dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ dequeue_task(rq, p, flags);
if (ctx->running)
put_prev_task(rq, p);
+
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ p->sched_class->switched_from(rq, p);
+
+ return ctx;
}
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
{
- struct rq *rq = task_rq(ctx->p);
+ struct task_struct *p = ctx->p;
+ struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+
if (ctx->queued)
- enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags);
if (ctx->running)
- set_next_task(rq, ctx->p);
+ set_next_task(rq, p);
+
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
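
The sched_change_begin()/sched_change_end() pair above brackets a property change with the usual dequeue/put_prev ... enqueue/set_next dance. A toy userspace model of that bracket (illustrative only; locking, classes and clocks elided):

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the sched_change bracket: save queue state, modify, restore. */
struct task { bool queued; bool running; int prio; };

struct change_ctx { struct task *t; bool queued, running; };

static struct change_ctx change_begin(struct task *t)
{
	struct change_ctx ctx = { .t = t, .queued = t->queued, .running = t->running };

	if (ctx.queued)
		t->queued = false;	/* dequeue_task() */
	if (ctx.running)
		t->running = false;	/* put_prev_task() */
	return ctx;
}

static void change_end(struct change_ctx *ctx)
{
	if (ctx->queued)
		ctx->t->queued = true;	/* enqueue_task() */
	if (ctx->running)
		ctx->t->running = true;	/* set_next_task() */
}

int main(void)
{
	struct task t = { .queued = true, .running = true, .prio = 120 };
	struct change_ctx ctx = change_begin(&t);

	t.prio = 100;		/* property change while fully dequeued */
	change_end(&ctx);

	printf("prio=%d queued=%d running=%d\n", t.prio, t.queued, t.running);
	return 0;
}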
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index cdd740b3f774..37b572cc8aca 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
* cpudl_clear - remove a CPU from the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target CPU
+ * @online: the online state of the deadline runqueue
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
-void cpudl_clear(struct cpudl *cp, int cpu)
+void cpudl_clear(struct cpudl *cp, int cpu, bool online)
{
int old_idx, new_cpu;
unsigned long flags;
@@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu)
if (old_idx == IDX_INVALID) {
/*
* Nothing to remove if old_idx was invalid.
- * This could happen if a rq_offline_dl is
+ * This could happen if rq_online_dl or rq_offline_dl is
* called for a CPU without -dl tasks running.
*/
} else {
@@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu)
cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID;
cpudl_heapify(cp, old_idx);
-
- cpumask_set_cpu(cpu, cp->free_cpus);
}
+ if (likely(online))
+ __cpumask_set_cpu(cpu, cp->free_cpus);
+ else
+ __cpumask_clear_cpu(cpu, cp->free_cpus);
+
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
@@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = new_idx;
cpudl_heapify_up(cp, new_idx);
- cpumask_clear_cpu(cpu, cp->free_cpus);
+ __cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
cp->elements[old_idx].dl = dl;
cpudl_heapify(cp, old_idx);
@@ -238,26 +242,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
}
/*
- * cpudl_set_freecpu - Set the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached CPU
- */
-void cpudl_set_freecpu(struct cpudl *cp, int cpu)
-{
- cpumask_set_cpu(cpu, cp->free_cpus);
-}
-
-/*
- * cpudl_clear_freecpu - Clear the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached CPU
- */
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
-{
- cpumask_clear_cpu(cpu, cp->free_cpus);
-}
-
-/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context
*/
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 11c0f1faa7e1..d7699468eedd 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -19,8 +19,6 @@ struct cpudl {
int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
-void cpudl_clear(struct cpudl *cp, int cpu);
+void cpudl_clear(struct cpudl *cp, int cpu, bool online);
int cpudl_init(struct cpudl *cp);
-void cpudl_set_freecpu(struct cpudl *cp, int cpu);
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 615411a0a881..48357d4609bf 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -405,7 +405,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
* up, and checks if the task is still in the "ACTIVE non contending"
* state or not (in the second case, it updates running_bw).
*/
-static void task_non_contending(struct sched_dl_entity *dl_se)
+static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task)
{
struct hrtimer *timer = &dl_se->inactive_timer;
struct rq *rq = rq_of_dl_se(dl_se);
@@ -444,10 +444,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
} else {
struct task_struct *p = dl_task_of(dl_se);
- if (dl_task(p))
+ if (dl_task)
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (READ_ONCE(p->__state) == TASK_DEAD)
@@ -1808,7 +1808,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
- cpudl_clear(&rq->rd->cpudl, rq->cpu);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online);
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
@@ -2045,7 +2045,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
* or "inactive")
*/
if (flags & DEQUEUE_SLEEP)
- task_non_contending(dl_se);
+ task_non_contending(dl_se, true);
}
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -2352,7 +2352,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
* __pick_next_task_dl - Helper to pick the next -deadline task to run.
* @rq: The runqueue to pick the next task from.
*/
-static struct task_struct *__pick_task_dl(struct rq *rq)
+static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
@@ -2366,7 +2366,7 @@ again:
WARN_ON_ONCE(!dl_se);
if (dl_server(dl_se)) {
- p = dl_se->server_pick_task(dl_se);
+ p = dl_se->server_pick_task(dl_se, rf);
if (!p) {
dl_server_stop(dl_se);
goto again;
@@ -2379,9 +2379,9 @@ again:
return p;
}
-static struct task_struct *pick_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf)
{
- return __pick_task_dl(rq);
+ return __pick_task_dl(rq, rf);
}
static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
@@ -2880,9 +2880,10 @@ static void rq_online_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_set_overload(rq);
- cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
+ else
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, true);
}
/* Assumes rq->lock is held */
@@ -2891,8 +2892,7 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_clear_overload(rq);
- cpudl_clear(&rq->rd->cpudl, rq->cpu);
- cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, false);
}
void __init init_sched_dl_class(void)
@@ -2970,7 +2970,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* will reset the task parameters.
*/
if (task_on_rq_queued(p) && p->dl.dl_runtime)
- task_non_contending(&p->dl);
+ task_non_contending(&p->dl, false);
/*
* In case a task is setscheduled out from SCHED_DEADLINE we need to
@@ -3042,23 +3042,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
}
}
+static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
+{
+ return p->dl.deadline;
+}
+
/*
* If the scheduling parameters of a -deadline task changed,
* a push or pull operation might be needed.
*/
-static void prio_changed_dl(struct rq *rq, struct task_struct *p,
- int oldprio)
+static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline)
{
if (!task_on_rq_queued(p))
return;
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
+ if (p->dl.deadline == old_deadline)
+ return;
+
+ if (dl_time_before(old_deadline, p->dl.deadline))
deadline_queue_pull_task(rq);
if (task_current_donor(rq, p)) {
@@ -3091,6 +3092,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
DEFINE_SCHED_CLASS(dl) = {
+ .queue_mask = 8,
+
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -3113,6 +3116,7 @@ DEFINE_SCHED_CLASS(dl) = {
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
+ .get_prio = get_prio_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
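
prio_changed_dl() now pulls only when the deadline actually moved later. dl_time_before() is the wrap-safe u64 comparison used for -dl deadlines; a standalone check of that logic (assuming the usual (s64)(a - b) < 0 definition):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Wrap-safe "a is earlier than b" comparison, as used for -dl deadlines. */
static bool dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t old_deadline = 1000, new_deadline = 1500;

	/* Deadline moved later: this CPU may no longer be the earliest,
	 * so a pull is worth attempting. */
	if (dl_time_before(old_deadline, new_deadline))
		printf("deadline relaxed -> queue pull\n");

	/* Comparison stays correct across u64 wrap-around. */
	printf("%d\n", dl_time_before(UINT64_MAX - 5, 10));	/* prints 1 */
	return 0;
}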
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5d6fdb31f18a..adff739b396c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2066,7 +2066,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
+ rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
@@ -2172,42 +2172,6 @@ has_tasks:
return true;
}
-static int balance_scx(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
-{
- int ret;
-
- rq_unpin_lock(rq, rf);
-
- ret = balance_one(rq, prev);
-
-#ifdef CONFIG_SCHED_SMT
- /*
- * When core-sched is enabled, this ops.balance() call will be followed
- * by pick_task_scx() on this CPU and the SMT siblings. Balance the
- * siblings too.
- */
- if (sched_core_enabled(rq)) {
- const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
- int scpu;
-
- for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
- struct rq *srq = cpu_rq(scpu);
- struct task_struct *sprev = srq->curr;
-
- WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
- update_rq_clock(srq);
- balance_one(srq, sprev);
- }
- }
-#endif
- rq_repin_lock(rq, rf);
-
- maybe_queue_balance_callback(rq);
-
- return ret;
-}
-
static void process_ddsp_deferred_locals(struct rq *rq)
{
struct task_struct *p;
@@ -2387,41 +2351,26 @@ static struct task_struct *first_local_task(struct rq *rq)
struct task_struct, scx.dsq_list.node);
}
-static struct task_struct *pick_task_scx(struct rq *rq)
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *prev = rq->curr;
+ bool keep_prev, kick_idle = false;
struct task_struct *p;
- bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
- bool kick_idle = false;
- /*
- * WORKAROUND:
- *
- * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
- * have gone through balance_scx(). Unfortunately, there currently is a
- * bug where fair could say yes on balance() but no on pick_task(),
- * which then ends up calling pick_task_scx() without preceding
- * balance_scx().
- *
- * Keep running @prev if possible and avoid stalling from entering idle
- * without balancing.
- *
- * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
- * if pick_task_scx() is called without preceding balance_scx().
- */
- if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
- if (prev->scx.flags & SCX_TASK_QUEUED) {
- keep_prev = true;
- } else {
- keep_prev = false;
- kick_idle = true;
- }
- } else if (unlikely(keep_prev &&
- prev->sched_class != &ext_sched_class)) {
- /*
- * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
- * conditional on scx_enabled() and may have been skipped.
- */
+ rq_modified_clear(rq);
+
+ rq_unpin_lock(rq, rf);
+ balance_one(rq, prev);
+ rq_repin_lock(rq, rf);
+
+ maybe_queue_balance_callback(rq);
+
+ if (rq_modified_above(rq, &ext_sched_class))
+ return RETRY_TASK;
+
+ keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ if (unlikely(keep_prev &&
+ prev->sched_class != &ext_sched_class)) {
WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -3016,7 +2965,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p, p->scx.weight);
}
-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}
@@ -3301,6 +3250,8 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
+ .queue_mask = 1,
+
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
@@ -3308,7 +3259,6 @@ DEFINE_SCHED_CLASS(ext) = {
.wakeup_preempt = wakeup_preempt_scx,
- .balance = balance_scx,
.pick_task = pick_task_scx,
.put_prev_task = put_prev_task_scx,
@@ -3849,11 +3799,10 @@ static void scx_bypass(bool bypass)
*/
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) {
- struct sched_enq_and_set_ctx ctx;
-
/* cycling deq/enq is enough, see the function comment */
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /* nothing */ ;
+ }
}
/* resched to restore ticks and idle state */
@@ -4003,22 +3952,20 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ update_rq_clock(task_rq(p));
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
-
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, queue_flags) {
+ p->sched_class = new_class;
+ }
- check_class_changed(task_rq(p), p, old_class, p->prio);
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4786,26 +4733,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
if (!tryget_task_struct(p))
continue;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
- p->scx.slice = SCX_SLICE_DFL;
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
-
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, queue_flags) {
+ p->scx.slice = SCX_SLICE_DFL;
+ p->sched_class = new_class;
+ }
- check_class_changed(task_rq(p), p, old_class, p->prio);
put_task_struct(p);
}
scx_task_iter_stop(&sti);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc0b7ce8a65d..2554055c1ba1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8705,15 +8705,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context
set_task_max_allowed_capacity(p);
}
-static int
-balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-{
- if (sched_fair_runnable(rq))
- return 1;
-
- return sched_balance_newidle(rq, rf) != 0;
-}
-
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
@@ -8822,7 +8813,7 @@ preempt:
resched_curr_lazy(rq);
}
-static struct task_struct *pick_task_fair(struct rq *rq)
+static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
@@ -8866,7 +8857,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
- p = pick_task_fair(rq);
+ p = pick_task_fair(rq, rf);
if (!p)
goto idle;
se = &p->se;
@@ -8945,14 +8936,10 @@ idle:
return NULL;
}
-static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+static struct task_struct *
+fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
{
- return pick_next_task_fair(rq, prev, NULL);
-}
-
-static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
-{
- return pick_task_fair(dl_se->rq);
+ return pick_task_fair(dl_se->rq, rf);
}
void fair_server_init(struct rq *rq)
@@ -9007,7 +8994,19 @@ static void yield_task_fair(struct rq *rq)
*/
rq_clock_skip_update(rq);
- se->deadline += calc_delta_fair(se->slice, se);
+ /*
+ * Forfeit the remaining vruntime, but only if the entity is eligible. This
+ * condition is necessary because in core scheduling we prefer to run
+ * ineligible tasks rather than force idling. If this happens we may
+ * end up in a loop where the core scheduler picks the yielding task,
+ * which yields immediately again; without the condition the vruntime
+ * ends up quickly running away.
+ */
+ if (entity_eligible(cfs_rq, se)) {
+ se->vruntime = se->deadline;
+ se->deadline += calc_delta_fair(se->slice, se);
+ update_min_vruntime(cfs_rq);
+ }
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -10671,7 +10670,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (sd->flags & SD_ASYM_CPUCAPACITY)
sgs->group_misfit_task_load = 1;
- for_each_cpu(i, sched_group_span(group)) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -12829,6 +12828,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
}
rcu_read_unlock();
+ rq_modified_clear(this_rq);
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
@@ -12886,8 +12886,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
+ /* If a higher prio class was modified, restart the pick */
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -13138,11 +13138,14 @@ static void task_fork_fair(struct task_struct *p)
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (rq->cfs.nr_queued == 1)
return;
@@ -13154,8 +13157,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
+ } else {
wakeup_preempt(rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13237,6 +13241,12 @@ static void attach_task_cfs_rq(struct task_struct *p)
attach_entity_cfs_rq(se);
}
+static void switching_from_fair(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+}
+
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -13611,6 +13621,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
*/
DEFINE_SCHED_CLASS(fair) = {
+ .queue_mask = 2,
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
@@ -13619,11 +13631,10 @@ DEFINE_SCHED_CLASS(fair) = {
.wakeup_preempt = check_preempt_wakeup_fair,
.pick_task = pick_task_fair,
- .pick_next_task = __pick_next_task_fair,
+ .pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
- .balance = balance_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -13638,6 +13649,7 @@ DEFINE_SCHED_CLASS(fair) = {
.reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
+ .switching_from = switching_from_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
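
The yield change forfeits the remaining vruntime only for an eligible entity. A much-simplified, standalone EEVDF model of that rule (absolute vruntime values, equal weights; the kernel uses relative keys and min_vruntime):

#include <stdio.h>
#include <stdbool.h>

/* Toy eligibility check: an entity is eligible when its vruntime is not
 * ahead of the load-weighted average vruntime of the queue. */
struct entity { long vruntime; long deadline; long slice; long weight; };

static long avg_vruntime(const struct entity *e, int n)
{
	long sum = 0, w = 0;

	for (int i = 0; i < n; i++) {
		sum += e[i].vruntime * e[i].weight;
		w += e[i].weight;
	}
	return sum / w;
}

static bool entity_eligible(const struct entity *e, long avg)
{
	return e->vruntime <= avg;
}

int main(void)
{
	struct entity rq[2] = {
		{ .vruntime = 100, .deadline = 130, .slice = 30, .weight = 1024 },
		{ .vruntime = 160, .deadline = 190, .slice = 30, .weight = 1024 },
	};
	long avg = avg_vruntime(rq, 2);	/* 130 */

	for (int i = 0; i < 2; i++) {
		struct entity *se = &rq[i];

		/* Yield: forfeit remaining vruntime only when eligible;
		 * an ineligible core-sched pick that yields in a loop
		 * must not run its vruntime away. */
		if (entity_eligible(se, avg)) {
			se->vruntime = se->deadline;
			se->deadline += se->slice;
		}
		printf("se%d: vruntime=%ld deadline=%ld\n", i, se->vruntime, se->deadline);
	}
	return 0;
}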
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c39b089d4f09..7fa0b593bcff 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -466,7 +466,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
next->se.exec_start = rq_clock_task(rq);
}
-struct task_struct *pick_task_idle(struct rq *rq)
+struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
{
scx_update_idle(rq, true, false);
return rq->idle;
@@ -498,14 +498,17 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG();
}
@@ -518,6 +521,8 @@ static void update_curr_idle(struct rq *rq)
*/
DEFINE_SCHED_CLASS(idle) = {
+ .queue_mask = 0,
+
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
@@ -536,6 +541,6 @@ DEFINE_SCHED_CLASS(idle) = {
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
- .switched_to = switched_to_idle,
+ .switching_to = switching_to_idle,
.update_curr = update_curr_idle,
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7936d4333731..1fd97f2d7ec6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *pick_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *p;
@@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* us to initiate a push or pull.
*/
static void
-prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (task_current_donor(rq, p)) {
/*
* If our priority decreases while running, we
@@ -2566,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
DEFINE_SCHED_CLASS(rt) = {
+ .queue_mask = 4,
+
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
@@ -2589,8 +2594,8 @@ DEFINE_SCHED_CLASS(rt) = {
.get_rr_interval = get_rr_interval_rt,
- .prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+ .prio_changed = prio_changed_rt,
.update_curr = update_curr_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f7fab3d7960..63ffb3eafd05 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -20,7 +20,6 @@
#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
#include <linux/sched/topology.h>
-
#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/bug.h>
@@ -780,7 +779,6 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
- SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
@@ -1120,6 +1118,8 @@ struct rq {
/* runqueue lock: */
raw_spinlock_t __lock;
+ /* Per class runqueue modification mask; bits in class order. */
+ unsigned int queue_mask;
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -1827,7 +1827,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
-static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void
+__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
@@ -1839,8 +1840,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
- rq_unpin_lock(rq, rf);
- raw_spin_rq_unlock(rq);
+ __task_rq_unlock(rq, p, rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1849,6 +1849,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+ _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+ __task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
@@ -2342,8 +2347,7 @@ extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
*
- * DEQUEUE_SLEEP - task is no longer runnable
- * ENQUEUE_WAKEUP - task just became runnable
+ * SLEEP/WAKEUP - task is no-longer/just-became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs
@@ -2356,34 +2360,46 @@ extern const u32 sched_prio_to_wmult[40];
*
* MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
*
+ * DELAYED - de/re-queue a sched_delayed task
+ *
+ * CLASS - going to update p->sched_class; makes sched_change call the
+ * various switch methods.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
+ * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but
+ * SCHED_DEADLINE seems to rely on this for now.
*/
-#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */
-#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
-#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
-#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
-#define DEQUEUE_SPECIAL 0x10
-#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
-#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
-#define DEQUEUE_THROTTLE 0x800
-
-#define ENQUEUE_WAKEUP 0x01
-#define ENQUEUE_RESTORE 0x02
-#define ENQUEUE_MOVE 0x04
-#define ENQUEUE_NOCLOCK 0x08
-
-#define ENQUEUE_HEAD 0x10
-#define ENQUEUE_REPLENISH 0x20
-#define ENQUEUE_MIGRATED 0x40
-#define ENQUEUE_INITIAL 0x80
-#define ENQUEUE_MIGRATING 0x100
-#define ENQUEUE_DELAYED 0x200
-#define ENQUEUE_RQ_SELECTED 0x400
+#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */
+#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */
+
+#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */
+
+#define DEQUEUE_SPECIAL 0x00010000
+#define DEQUEUE_THROTTLE 0x00020000
+
+#define ENQUEUE_WAKEUP 0x0001
+#define ENQUEUE_RESTORE 0x0002
+#define ENQUEUE_MOVE 0x0004
+#define ENQUEUE_NOCLOCK 0x0008
+
+#define ENQUEUE_MIGRATING 0x0010
+#define ENQUEUE_DELAYED 0x0020
+#define ENQUEUE_CLASS 0x0040
+
+#define ENQUEUE_HEAD 0x00010000
+#define ENQUEUE_REPLENISH 0x00020000
+#define ENQUEUE_MIGRATED 0x00040000
+#define ENQUEUE_INITIAL 0x00080000
+#define ENQUEUE_RQ_SELECTED 0x00100000
#define RETRY_TASK ((void *)-1UL)
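
With this layout the matched flags occupy the low 16 bits and the unmatched ones the high 16 bits, which is what lets sched_change_begin() reject callers that pass enqueue-only or dequeue-only bits. A standalone illustration using the values defined above:

#include <stdio.h>

/* Matched (low 16 bits): meaningful for both dequeue and enqueue. */
#define DEQUEUE_SAVE		0x0002
#define DEQUEUE_MOVE		0x0004
#define DEQUEUE_CLASS		0x0040

/* Unmatched (high 16 bits): enqueue-only or dequeue-only. */
#define ENQUEUE_HEAD		0x00010000
#define ENQUEUE_REPLENISH	0x00020000

static void check(unsigned int flags)
{
	/* Mirrors WARN_ON_ONCE(flags & 0xFFFF0000) in sched_change_begin():
	 * callers may only pass matched flags. */
	printf("flags %#x: %s\n", flags,
	       (flags & 0xFFFF0000) ? "rejected" : "ok");
}

int main(void)
{
	check(DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_CLASS);	/* ok */
	check(DEQUEUE_SAVE | ENQUEUE_HEAD);			/* rejected */
	return 0;
}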
@@ -2400,16 +2416,61 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
+ /*
+ * idle: 0
+ * ext: 1
+ * fair: 2
+ * rt: 4
+ * dl: 8
+ * stop: 16
+ */
+ unsigned int queue_mask;
+ /*
+ * move_queued_task/activate_task/enqueue_task: rq->lock
+ * ttwu_do_activate/activate_task/enqueue_task: rq->lock
+ * wake_up_new_task/activate_task/enqueue_task: task_rq_lock
+ * ttwu_runnable/enqueue_task: task_rq_lock
+ * proxy_task_current: rq->lock
+ * sched_change_end
+ */
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * move_queued_task/deactivate_task/dequeue_task: rq->lock
+ * __schedule/block_task/dequeue_task: rq->lock
+ * proxy_task_current: rq->lock
+ * wait_task_inactive: task_rq_lock
+ * sched_change_begin
+ */
bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * do_sched_yield: rq->lock
+ */
void (*yield_task) (struct rq *rq);
+ /*
+ * yield_to: rq->lock (double)
+ */
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
+ /*
+ * move_queued_task: rq->lock
+ * __migrate_swap_task: rq->lock
+ * ttwu_do_activate: rq->lock
+ * ttwu_runnable: task_rq_lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * schedule/pick_next_task/prev_balance: rq->lock
+ */
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
- struct task_struct *(*pick_task)(struct rq *rq);
+
+ /*
+ * schedule/pick_next_task: rq->lock
+ */
+ struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
*
@@ -2419,55 +2480,123 @@ struct sched_class {
* set_next_task_first(next);
* }
*/
- struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf);
+ /*
+ * sched_change:
+ * __schedule: rq->lock
+ */
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+ /*
+ * select_task_rq: p->pi_lock
+ * sched_exec: p->pi_lock
+ */
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+ /*
+ * set_task_cpu: p->pi_lock || rq->lock (ttwu like)
+ */
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
+ /*
+ * ttwu_do_activate: rq->lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
+ /*
+ * do_set_cpus_allowed: task_rq_lock + sched_change
+ */
void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
+ /*
+ * sched_set_rq_{on,off}line: rq->lock
+ */
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+ /*
+ * push_cpu_stop: p->pi_lock && rq->lock
+ */
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
+ /*
+ * hrtick: rq->lock
+ * sched_tick: rq->lock
+ * sched_tick_remote: rq->lock
+ */
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ /*
+ * sched_cgroup_fork: p->pi_lock
+ */
void (*task_fork)(struct task_struct *p);
+ /*
+ * finish_task_switch: no locks
+ */
void (*task_dead)(struct task_struct *p);
/*
- * The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serialized by
- * rq->lock. They are however serialized by p->pi_lock.
+ * sched_change
+ */
+ void (*switching_from)(struct rq *this_rq, struct task_struct *task);
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
+ void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+ u64 oldprio);
+
+ /*
+ * set_load_weight: task_rq_lock + sched_change
+	 * __setscheduler_params: task_rq_lock + sched_change
*/
- void (*switching_to) (struct rq *this_rq, struct task_struct *task);
- void (*switched_from)(struct rq *this_rq, struct task_struct *task);
- void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
- void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio);
+ /*
+ * sched_rr_get_interval: task_rq_lock
+ */
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
+ /*
+ * task_sched_runtime: task_rq_lock
+ */
void (*update_curr)(struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * sched_change_group: task_rq_lock + sched_change
+ */
void (*task_change_group)(struct task_struct *p);
#endif
#ifdef CONFIG_SCHED_CORE
+ /*
+ * pick_next_task: rq->lock
+ * try_steal_cookie: rq->lock (double)
+ */
int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
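
The per-callback comments above record the lock context each hook runs under. As a hedged reading aid (not part of the patch): 'task_rq_lock' denotes p->pi_lock nested outside the task's rq->lock, while plain 'rq->lock' callers hold only the runqueue lock. A minimal sketch, with an invented example_* wrapper:

	/* Illustrative only: the lock context the 'task_rq_lock' annotations refer to. */
	static void example_task_rq_lock_context(struct task_struct *p)
	{
		struct rq_flags rf;
		struct rq *rq;

		/* takes p->pi_lock, then the task's rq->lock */
		rq = task_rq_lock(p, &rf);
		/* callbacks annotated 'task_rq_lock' may be invoked here */
		task_rq_unlock(rq, p, &rf);
	}
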
+/*
+ * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
+ */
+static inline void rq_modified_clear(struct rq *rq)
+{
+ rq->queue_mask = 0;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class)
+{
+ unsigned int mask = class->queue_mask;
+ return rq->queue_mask & ~((mask << 1) - 1);
+}
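
A minimal, self-contained sketch of the rq_modified_above() arithmetic, using the per-class queue_mask values from the comment in struct sched_class above; the QM_* names and the userspace harness are invented for illustration:

	#include <assert.h>

	#define QM_FAIR		2u	/* values mirror the struct sched_class comment */
	#define QM_RT		4u
	#define QM_DL		8u
	#define QM_STOP		16u

	/* bits strictly above 'mask' that are set in 'queue_mask' */
	static unsigned int modified_above(unsigned int queue_mask, unsigned int mask)
	{
		return queue_mask & ~((mask << 1) - 1);
	}

	int main(void)
	{
		unsigned int queue_mask = 0;

		queue_mask |= QM_RT;	/* an RT task was enqueued during a lock break */
		assert(modified_above(queue_mask, QM_FAIR));	/* fair must repick */
		assert(!modified_above(queue_mask, QM_DL));	/* dl is unaffected */
		return 0;
	}

So after fair's pick_task() drops and re-takes rq->lock, a set bit above fair's own mask signals that a higher class became runnable in the meantime.
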
+
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
@@ -2579,8 +2708,9 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_queued > 0;
}
-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-extern struct task_struct *pick_task_idle(struct rq *rq);
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf);
+extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
@@ -2610,7 +2740,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
/*
- * See do_set_cpus_allowed() above for the rcu_head usage.
+ * See set_cpus_allowed_force() above for the rcu_head usage.
*/
int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
@@ -3877,32 +4007,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class);
-extern void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio);
-
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#ifdef CONFIG_SCHED_CLASS_EXT
/*
- * Used by SCX in the enable/disable paths to move tasks between sched_classes
- * and establish invariants.
+ * The 'sched_change' pattern is the safe, easy and slow way of changing a
+ * task's scheduling properties. It dequeues the task so that the scheduler
+ * is fully unaware of it, at which point its properties can be modified;
+ * afterwards it is enqueued again.
+ *
+ * Typically this must be called while holding task_rq_lock, since most/all
+ * properties are serialized under those locks. There is currently one
+ * exception to this rule in sched/ext which only holds rq->lock.
+ */
+
+/*
+ * This structure is a temporary; it preserves/conveys the queueing state of
+ * the task between sched_change_begin() and sched_change_end(), ensuring
+ * the task's queueing state is idempotent across the operation.
*/
-struct sched_enq_and_set_ctx {
+struct sched_change_ctx {
+ u64 prio;
struct task_struct *p;
- int queue_flags;
+ int flags;
bool queued;
bool running;
};
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx);
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
-#endif /* CONFIG_SCHED_CLASS_EXT */
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+ sched_change_end(_T),
+ sched_change_begin(p, flags),
+ struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
#include "ext.h"
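
For orientation, a hedged sketch of how callers are expected to use the new guard; it mirrors the set_user_nice() conversion further down in this patch rather than adding anything beyond it (the example_* name is invented):

	/* Illustrative only: change a scheduling property under sched_change. */
	static void example_set_static_prio(struct task_struct *p, int static_prio)
	{
		guard(task_rq_lock)(p);		/* properties are serialized by these locks */

		scoped_guard (sched_change, p, DEQUEUE_SAVE) {
			/* task is dequeued/put here; safe to change its properties */
			p->static_prio = static_prio;
			set_load_weight(p, true);
		}	/* sched_change_end() re-enqueues and restores the task */
	}
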
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 26f3fd4d34ce..cbf7206b3f9d 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
rq = __task_rq_lock(p, &rf);
psi_task_change(p, p->psi_flags, 0);
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
}
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 2d4e279f05ee..4f9192be4b5b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *pick_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf)
{
if (!sched_stop_runnable(rq))
return NULL;
@@ -75,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
static void
-prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG(); /* how!?, what priority? */
}
@@ -95,6 +98,8 @@ static void update_curr_stop(struct rq *rq)
*/
DEFINE_SCHED_CLASS(stop) = {
+ .queue_mask = 16,
+
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
@@ -112,6 +117,6 @@ DEFINE_SCHED_CLASS(stop) = {
.task_tick = task_tick_stop,
.prio_changed = prio_changed_stop,
- .switched_to = switched_to_stop,
+ .switching_to = switching_to_stop,
.update_curr = update_curr_stop,
};
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 77ae87f36e84..8f0f603b530b 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p)
void set_user_nice(struct task_struct *p, long nice)
{
- bool queued, running;
- struct rq *rq;
int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- CLASS(task_rq_lock, rq_guard)(p);
- rq = rq_guard.rq;
-
- update_rq_clock(rq);
+ guard(task_rq_lock)(p);
/*
* The RT priorities are set via sched_setscheduler(), but we still
@@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice)
return;
}
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- if (running)
- put_prev_task(rq, p);
-
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p, true);
- old_prio = p->prio;
- p->prio = effective_prio(p);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ }
}
EXPORT_SYMBOL(set_user_nice);
@@ -515,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p,
bool user, bool pi)
{
int oldpolicy = -1, policy = attr->sched_policy;
- int retval, oldprio, newprio, queued, running;
+ int retval, oldprio, newprio;
const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;
@@ -695,38 +674,27 @@ change:
prev_class = p->sched_class;
next_class = __setscheduler_class(policy, newprio);
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
+ if (prev_class != next_class)
+ queue_flags |= DEQUEUE_CLASS;
- if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
- __setscheduler_params(p, attr);
- p->sched_class = next_class;
- p->prio = newprio;
- }
- __setscheduler_uclamp(p, attr);
- check_class_changing(rq, p, prev_class);
+ scoped_guard (sched_change, p, queue_flags) {
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ p->sched_class = next_class;
+ p->prio = newprio;
+ }
+ __setscheduler_uclamp(p, attr);
- enqueue_task(rq, p, queue_flags);
+ if (scope->queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ scope->flags |= ENQUEUE_HEAD;
+ }
}
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
/* Avoid rq from going away on us: */
preempt_disable();
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 444bdfdab731..711076aa4980 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;
+/*
+ * sched_domains_numa_distance is derived from sched_numa_node_distance
+ * and provides a simplified view of NUMA distances used specifically
+ * for building NUMA scheduling domains.
+ */
static int sched_domains_numa_levels;
+static int sched_numa_node_levels;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
+static int *sched_numa_node_distance;
static struct cpumask ***sched_domains_numa_masks;
#endif /* CONFIG_NUMA */
@@ -1845,10 +1852,10 @@ bool find_numa_distance(int distance)
return true;
rcu_read_lock();
- distances = rcu_dereference(sched_domains_numa_distance);
+ distances = rcu_dereference(sched_numa_node_distance);
if (!distances)
goto unlock;
- for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (i = 0; i < sched_numa_node_levels; i++) {
if (distances[i] == distance) {
found = true;
break;
@@ -1924,14 +1931,34 @@ static void init_numa_topology_type(int offline_node)
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
-void sched_init_numa(int offline_node)
+/*
+ * An architecture can modify its NUMA distances to change the grouping
+ * of NUMA nodes and the number of NUMA levels used when building
+ * NUMA-level sched domains.
+ *
+ * A NUMA level is created for each unique arch_sched_node_distance()
+ * value.
+ */
+static int numa_node_dist(int i, int j)
{
- struct sched_domain_topology_level *tl;
- unsigned long *distance_map;
+ return node_distance(i, j);
+}
+
+int arch_sched_node_distance(int from, int to)
+ __weak __alias(numa_node_dist);
+
+static bool modified_sched_node_distance(void)
+{
+ return numa_node_dist != arch_sched_node_distance;
+}
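
A hedged sketch of an architecture override: a strong definition of arch_sched_node_distance() replaces the __weak alias above, so sched-domain construction sees coarsened distances while find_numa_distance() keeps using the raw values recorded in sched_numa_node_distance. The clamping policy below is invented purely for illustration:

	/* Hypothetical arch code: coarsen remote distances for sched domains only. */
	int arch_sched_node_distance(int from, int to)
	{
		int d = node_distance(from, to);

		/* example policy: collapse all remote distances into one level */
		if (d >= REMOTE_DISTANCE)
			d = REMOTE_DISTANCE;

		return d;
	}
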
+
+static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
+ int **dist, int *levels)
+{
+ unsigned long *distance_map __free(bitmap) = NULL;
int nr_levels = 0;
int i, j;
int *distances;
- struct cpumask ***masks;
/*
* O(nr_nodes^2) de-duplicating selection sort -- in order to find the
@@ -1939,17 +1966,16 @@ void sched_init_numa(int offline_node)
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
if (!distance_map)
- return;
+ return -ENOMEM;
bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for_each_cpu_node_but(i, offline_node) {
for_each_cpu_node_but(j, offline_node) {
- int distance = node_distance(i, j);
+ int distance = n_dist(i, j);
if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
- bitmap_free(distance_map);
- return;
+ return -EINVAL;
}
bitmap_set(distance_map, distance, 1);
@@ -1962,18 +1988,46 @@ void sched_init_numa(int offline_node)
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
- if (!distances) {
- bitmap_free(distance_map);
- return;
- }
+ if (!distances)
+ return -ENOMEM;
for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
distances[i] = j;
}
- rcu_assign_pointer(sched_domains_numa_distance, distances);
+ *dist = distances;
+ *levels = nr_levels;
+
+ return 0;
+}
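
A worked example of the bitmap de-duplication that sched_record_numa_dist() performs, written as standalone userspace code with a hypothetical 4-node SLIT (illustrative only):

	#include <stdio.h>

	#define NR_VALUES 256

	int main(void)
	{
		/* hypothetical SLIT for a 4-node, 2-socket system with SNC */
		int node_dist[4][4] = {
			{ 10, 12, 21, 23 },
			{ 12, 10, 23, 21 },
			{ 21, 23, 10, 12 },
			{ 23, 21, 12, 10 },
		};
		unsigned char map[NR_VALUES] = { 0 };
		int levels[NR_VALUES], nr_levels = 0;
		int i, j;

		for (i = 0; i < 4; i++)
			for (j = 0; j < 4; j++)
				map[node_dist[i][j]] = 1;

		/* walking the bitmap yields the unique distances in sorted order */
		for (i = 0; i < NR_VALUES; i++)
			if (map[i])
				levels[nr_levels++] = i;

		for (i = 0; i < nr_levels; i++)
			printf("level %d: distance %d\n", i, levels[i]);

		return 0;	/* prints 4 levels: 10, 12, 21, 23 */
	}

Here nr_levels ends up as 4 and levels[] holds {10, 12, 21, 23}, matching what the kernel function would hand back through *dist and *levels.
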
+
+void sched_init_numa(int offline_node)
+{
+ struct sched_domain_topology_level *tl;
+ int nr_levels, nr_node_levels;
+ int i, j;
+ int *distances, *domain_distances;
+ struct cpumask ***masks;
- bitmap_free(distance_map);
+ /* Record the NUMA distances from SLIT table */
+ if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
+ &nr_node_levels))
+ return;
+
+ /* Record modified NUMA distances for building sched domains */
+ if (modified_sched_node_distance()) {
+ if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
+ &domain_distances, &nr_levels)) {
+ kfree(distances);
+ return;
+ }
+ } else {
+ domain_distances = distances;
+ nr_levels = nr_node_levels;
+ }
+ rcu_assign_pointer(sched_numa_node_distance, distances);
+ WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
+ WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
/*
* 'nr_levels' contains the number of unique distances
@@ -1991,6 +2045,8 @@ void sched_init_numa(int offline_node)
*
* We reset it to 'nr_levels' at the end of this function.
*/
+ rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
+
sched_domains_numa_levels = 0;
masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
@@ -2016,10 +2072,13 @@ void sched_init_numa(int offline_node)
masks[i][j] = mask;
for_each_cpu_node_but(k, offline_node) {
- if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ if (sched_debug() &&
+ (arch_sched_node_distance(j, k) !=
+ arch_sched_node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
- if (node_distance(j, k) > sched_domains_numa_distance[i])
+ if (arch_sched_node_distance(j, k) >
+ sched_domains_numa_distance[i])
continue;
cpumask_or(mask, mask, cpumask_of_node(k));
@@ -2059,7 +2118,6 @@ void sched_init_numa(int offline_node)
sched_domain_topology = tl;
sched_domains_numa_levels = nr_levels;
- WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
init_numa_topology_type(offline_node);
}
@@ -2067,14 +2125,18 @@ void sched_init_numa(int offline_node)
static void sched_reset_numa(void)
{
- int nr_levels, *distances;
+ int nr_levels, *distances, *dom_distances = NULL;
struct cpumask ***masks;
nr_levels = sched_domains_numa_levels;
+ sched_numa_node_levels = 0;
sched_domains_numa_levels = 0;
sched_max_numa_distance = 0;
sched_numa_topology_type = NUMA_DIRECT;
- distances = sched_domains_numa_distance;
+ distances = sched_numa_node_distance;
+ if (sched_numa_node_distance != sched_domains_numa_distance)
+ dom_distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_numa_node_distance, NULL);
rcu_assign_pointer(sched_domains_numa_distance, NULL);
masks = sched_domains_numa_masks;
rcu_assign_pointer(sched_domains_numa_masks, NULL);
@@ -2083,6 +2145,7 @@ static void sched_reset_numa(void)
synchronize_rcu();
kfree(distances);
+ kfree(dom_distances);
for (i = 0; i < nr_levels && masks; i++) {
if (!masks[i])
continue;
@@ -2129,7 +2192,8 @@ void sched_domains_numa_masks_set(unsigned int cpu)
continue;
/* Set ourselves in the remote node's masks */
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ if (arch_sched_node_distance(j, node) <=
+ sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}