diff options
| -rw-r--r-- | arch/x86/include/asm/topology.h | 2 | ||||
| -rw-r--r-- | arch/x86/kernel/smpboot.c | 70 | ||||
| -rw-r--r-- | include/linux/cleanup.h | 5 | ||||
| -rw-r--r-- | include/linux/sched.h | 11 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 2 | ||||
| -rw-r--r-- | kernel/kthread.c | 15 | ||||
| -rw-r--r-- | kernel/sched/core.c | 376 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.c | 34 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.h | 4 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 48 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 131 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 64 | ||||
| -rw-r--r-- | kernel/sched/idle.c | 13 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 11 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 252 | ||||
| -rw-r--r-- | kernel/sched/stats.h | 2 | ||||
| -rw-r--r-- | kernel/sched/stop_task.c | 13 | ||||
| -rw-r--r-- | kernel/sched/syscalls.c | 82 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 108 |
19 files changed, 683 insertions, 560 deletions
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 21041898157a..df943881f15b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick +extern int arch_sched_node_distance(int from, int to); + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index eb289abece23..5709c9cab195 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -515,6 +515,76 @@ static void __init build_sched_topology(void) set_sched_topology(topology); } +#ifdef CONFIG_NUMA +static int sched_avg_remote_distance; +static int avg_remote_numa_distance(void) +{ + int i, j; + int distance, nr_remote, total_distance; + + if (sched_avg_remote_distance > 0) + return sched_avg_remote_distance; + + nr_remote = 0; + total_distance = 0; + for_each_node_state(i, N_CPU) { + for_each_node_state(j, N_CPU) { + distance = node_distance(i, j); + + if (distance >= REMOTE_DISTANCE) { + nr_remote++; + total_distance += distance; + } + } + } + if (nr_remote) + sched_avg_remote_distance = total_distance / nr_remote; + else + sched_avg_remote_distance = REMOTE_DISTANCE; + + return sched_avg_remote_distance; +} + +int arch_sched_node_distance(int from, int to) +{ + int d = node_distance(from, to); + + switch (boot_cpu_data.x86_vfm) { + case INTEL_GRANITERAPIDS_X: + case INTEL_ATOM_DARKMONT_X: + + if (!x86_has_numa_in_package || topology_max_packages() == 1 || + d < REMOTE_DISTANCE) + return d; + + /* + * With SNC enabled, there could be too many levels of remote + * NUMA node distances, creating NUMA domain levels + * including local nodes and partial remote nodes. + * + * Trim finer distance tuning for NUMA nodes in remote package + * for the purpose of building sched domains. Group NUMA nodes + * in the remote package in the same sched group. + * Simplify NUMA domains and avoid extra NUMA levels including + * different remote NUMA nodes and local nodes. + * + * GNR and CWF don't expect systems with more than 2 packages + * and more than 2 hops between packages. Single average remote + * distance won't be appropriate if there are more than 2 + * packages as average distance to different remote packages + * could be different. + */ + WARN_ONCE(topology_max_packages() > 2, + "sched: Expect only up to 2 packages for GNR or CWF, " + "but saw %d packages when building sched domains.", + topology_max_packages()); + + d = avg_remote_numa_distance(); + } + return d; +} +#endif /* CONFIG_NUMA */ + void set_cpu_sibling_map(int cpu) { bool has_smt = __max_threads_per_core > 1; diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..ae381675455d 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -340,6 +340,11 @@ _label: \ #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond +#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return (void *)1; } + #define __GUARD_IS_ERR(_ptr) \ ({ \ unsigned long _rc = (__force unsigned long)(_ptr); \ diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..07576479c0ed 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -637,8 +637,8 @@ struct sched_rt_entity { #endif } __randomize_layout; -typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); -typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); +struct rq_flags; +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; @@ -730,9 +730,6 @@ struct sched_dl_entity { * dl_server_update(). * * @rq the runqueue this server is for - * - * @server_has_tasks() returns true if @server_pick return a - * runnable task. */ struct rq *rq; dl_server_pick_f server_pick_task; @@ -1861,8 +1858,8 @@ extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ -extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); +/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ +extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 52468d2c178a..185e820cd1df 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { - do_set_cpus_allowed(tsk, cs_mask); + set_cpus_allowed_force(tsk, cs_mask); changed = true; } rcu_read_unlock(); diff --git a/kernel/kthread.c b/kernel/kthread.c index 31b072e8d427..99a3808d086f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { - unsigned long flags; - if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, mask); + /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) @@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; - unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { @@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); - /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, affinity); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 198d2dd45f59..096e8d03d85e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -583,8 +583,8 @@ EXPORT_SYMBOL(__trace_set_current_state); * * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: * - * is set by activate_task() and cleared by deactivate_task(), under - * rq->lock. Non-zero indicates the task is runnable, the special + * is set by activate_task() and cleared by deactivate_task()/block_task(), + * under rq->lock. Non-zero indicates the task is runnable, the special * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * @@ -2089,6 +2089,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); + rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2121,6 +2122,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); + rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2169,37 +2171,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* - * ->switching_to() is called with the pi_lock and rq_lock held and must not - * mess with locking. - */ -void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class) -{ - if (prev_class != p->sched_class && p->sched_class->switching_to) - p->sched_class->switching_to(rq, p); -} - -/* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. - * - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ -void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p); - - p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio || dl_task(p)) - p->sched_class->prio_changed(rq, p, oldprio); -} - void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; @@ -2362,7 +2333,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { @@ -2377,10 +2348,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) if (p->cpus_ptr != &p->cpus_mask) return; - /* - * Violates locking rules! See comment in __do_set_cpus_allowed(). - */ - __do_set_cpus_allowed(p, &ac); + scoped_guard (task_rq_lock, p) + do_set_cpus_allowed(p, &ac); } void ___migrate_enable(void) @@ -2613,7 +2582,8 @@ static int migration_cpu_stop(void *data) */ WARN_ON_ONCE(!pending->stop_pending); preempt_disable(); - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); preempt_enable(); @@ -2622,7 +2592,8 @@ static int migration_cpu_stop(void *data) out: if (pending) pending->stop_pending = false; - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); if (complete) complete_all(&pending->done); @@ -2693,56 +2664,19 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { - struct rq *rq = task_rq(p); - bool queued, running; - - /* - * This here violates the locking rules for affinity, since we're only - * supposed to change these variables while holding both rq->lock and - * p->pi_lock. - * - * HOWEVER, it magically works, because ttwu() is the only code that - * accesses these variables under p->pi_lock and only does so after - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() - * before finish_task(). - * - * XXX do further audits, this smells like something putrid. - */ - if (ctx->flags & SCA_MIGRATE_DISABLE) - WARN_ON_ONCE(!p->on_cpu); - else - lockdep_assert_held(&p->pi_lock); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + scoped_guard (sched_change, p, DEQUEUE_SAVE) { + p->sched_class->set_cpus_allowed(p, ctx); + mm_set_cpus_allowed(p->mm, ctx->new_mask); } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); } /* * Used for kthread_bind() and select_fallback_rq(), in both cases the user * affinity (if any) should be destroyed too. */ -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) { struct affinity_context ac = { .new_mask = new_mask, @@ -2754,7 +2688,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) struct rcu_head rcu; }; - __do_set_cpus_allowed(p, &ac); + scoped_guard (__task_rq_lock, p) + do_set_cpus_allowed(p, &ac); /* * Because this is called with p->pi_lock held, it is not possible @@ -2792,7 +2727,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, * Use pi_lock to protect content of user_cpus_ptr * * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent - * do_set_cpus_allowed(). + * set_cpus_allowed_force(). */ raw_spin_lock_irqsave(&src->pi_lock, flags); if (src->user_cpus_ptr) { @@ -3120,7 +3055,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, goto out; } - __do_set_cpus_allowed(p, ctx); + do_set_cpus_allowed(p, ctx); return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); @@ -3529,13 +3464,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: - /* - * XXX When called from select_task_rq() we only - * hold p->pi_lock and again violate locking order. - * - * More yuck to audit. - */ - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -3777,7 +3706,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) ttwu_do_wakeup(p); ret = 1; } - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); return ret; } @@ -4231,7 +4160,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * __schedule(). See the comment for smp_mb__after_spinlock(). * * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer + * schedule()'s block_task() has 'happened' and p will no longer * care about it's own p->state. See the comment in __schedule(). */ smp_acquire__after_ctrl_dep(); @@ -4370,7 +4299,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) ret = func(p, arg); if (rq) - rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; @@ -5916,19 +5845,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; -#ifdef CONFIG_SCHED_CLASS_EXT - /* - * SCX requires a balance() call before every pick_task() including when - * waking up from SCHED_IDLE. If @start_class is below SCX, start from - * SCX instead. Also, set a flag to detect missing balance() call. - */ - if (scx_enabled()) { - rq->scx.flags |= SCX_RQ_BAL_PENDING; - if (sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; - } -#endif - /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -5972,7 +5888,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - p = pick_task_idle(rq); + p = pick_task_idle(rq, rf); put_prev_set_next_task(rq, prev, p); } @@ -5984,11 +5900,15 @@ restart: for_each_active_class(class) { if (class->pick_next_task) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) return p; } else { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) { put_prev_set_next_task(rq, prev, p); return p; @@ -6018,7 +5938,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; } -static inline struct task_struct *pick_task(struct rq *rq) +/* + * Careful; this can return RETRY_TASK, it does not include the retry-loop + * itself due to the whole SMT pick retry thing below. + */ +static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -6026,7 +5950,7 @@ static inline struct task_struct *pick_task(struct rq *rq) rq->dl_server = NULL; for_each_active_class(class) { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); if (p) return p; } @@ -6041,7 +5965,7 @@ static void queue_core_balance(struct rq *rq); static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *p, *max = NULL; + struct task_struct *next, *p, *max; const struct cpumask *smt_mask; bool fi_before = false; bool core_clock_updated = (rq == rq->core); @@ -6126,7 +6050,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. */ if (!need_sync) { - next = pick_task(rq); +restart_single: + next = pick_task(rq, rf); + if (unlikely(next == RETRY_TASK)) + goto restart_single; if (!next->core_cookie) { rq->core_pick = NULL; rq->core_dl_server = NULL; @@ -6146,6 +6073,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * Tie-break prio towards the current CPU */ +restart_multi: + max = NULL; for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); @@ -6157,7 +6086,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - rq_i->core_pick = p = pick_task(rq_i); + p = pick_task(rq_i, rf); + if (unlikely(p == RETRY_TASK)) + goto restart_multi; + + rq_i->core_pick = p; rq_i->core_dl_server = rq_i->dl_server; if (!max || prio_less(max, p, fi_before)) @@ -6179,7 +6112,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (cookie) p = sched_core_find(rq_i, cookie); if (!p) - p = idle_sched_class.pick_task(rq_i); + p = idle_sched_class.pick_task(rq_i, rf); } rq_i->core_pick = p; @@ -6812,6 +6745,7 @@ static void __sched notrace __schedule(int sched_mode) local_irq_disable(); rcu_note_context_switch(preempt); + migrate_disable_switch(rq, prev); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6918,7 +6852,6 @@ keep_resched: */ ++*switch_count; - migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev) || prev->se.sched_delayed); @@ -7326,7 +7259,7 @@ void rt_mutex_post_schedule(void) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class, *next_class; struct rq_flags rf; @@ -7388,64 +7321,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) prev_class = p->sched_class; next_class = __setscheduler_class(p->policy, prio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flag |= DEQUEUE_CLASS; - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + scoped_guard (sched_change, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + scope->flags |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - - p->sched_class = next_class; - p->prio = prio; - - check_class_changing(rq, p, prev_class); - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); + p->sched_class = next_class; + p->prio = prio; + } out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_rq_unlock(rq); + rq_repin_lock(rq, &rf); + __task_rq_unlock(rq, p, &rf); preempt_enable(); } @@ -8084,26 +8004,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - task_rq_unlock(rq, p, &rf); + guard(task_rq_lock)(p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->numa_preferred_nid = nid; } #endif /* CONFIG_NUMA_BALANCING */ @@ -9205,38 +9108,23 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { - int queued, running, queue_flags = - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + bool resched = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - update_rq_clock(rq); - - running = task_current_donor(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); - if (!for_autogroup) - scx_cgroup_move_task(tsk); + scoped_guard (sched_change, tsk, queue_flags) { + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); + if (scope->running) + resched = true; + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. - */ + if (resched) resched_curr(rq); - } } static struct cgroup_subsys_state * @@ -10892,37 +10780,75 @@ void sched_mm_cid_fork(struct task_struct *t) } #endif /* CONFIG_SCHED_MM_CID */ -#ifdef CONFIG_SCHED_CLASS_EXT -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx) +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); + +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) { + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); struct rq *rq = task_rq(p); + /* + * Must exclusively use matched flags since this is both dequeue and + * enqueue. + */ + WARN_ON_ONCE(flags & 0xFFFF0000); + lockdep_assert_rq_held(rq); - *ctx = (struct sched_enq_and_set_ctx){ + if (!(flags & DEQUEUE_NOCLOCK)) { + update_rq_clock(rq); + flags |= DEQUEUE_NOCLOCK; + } + + if (flags & DEQUEUE_CLASS) { + if (p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); + } + + *ctx = (struct sched_change_ctx){ .p = p, - .queue_flags = queue_flags, + .flags = flags, .queued = task_on_rq_queued(p), - .running = task_current(rq, p), + .running = task_current_donor(rq, p), }; - update_rq_clock(rq); + if (!(flags & DEQUEUE_CLASS)) { + if (p->sched_class->get_prio) + ctx->prio = p->sched_class->get_prio(rq, p); + else + ctx->prio = p->prio; + } + if (ctx->queued) - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + dequeue_task(rq, p, flags); if (ctx->running) put_prev_task(rq, p); + + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) + p->sched_class->switched_from(rq, p); + + return ctx; } -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +void sched_change_end(struct sched_change_ctx *ctx) { - struct rq *rq = task_rq(ctx->p); + struct task_struct *p = ctx->p; + struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); + if (ctx->queued) - enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + enqueue_task(rq, p, ctx->flags); if (ctx->running) - set_next_task(rq, ctx->p); + set_next_task(rq, p); + + if (ctx->flags & ENQUEUE_CLASS) { + if (p->sched_class->switched_to) + p->sched_class->switched_to(rq, p); + } else { + p->sched_class->prio_changed(rq, p, ctx->prio); + } } -#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index cdd740b3f774..37b572cc8aca 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target CPU + * @online: the online state of the deadline runqueue * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_clear(struct cpudl *cp, int cpu) +void cpudl_clear(struct cpudl *cp, int cpu, bool online) { int old_idx, new_cpu; unsigned long flags; @@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu) if (old_idx == IDX_INVALID) { /* * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is + * This could happen if rq_online_dl or rq_offline_dl is * called for a CPU without -dl tasks running. */ } else { @@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu) cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; cpudl_heapify(cp, old_idx); - - cpumask_set_cpu(cpu, cp->free_cpus); } + if (likely(online)) + __cpumask_set_cpu(cpu, cp->free_cpus); + else + __cpumask_clear_cpu(cpu, cp->free_cpus); + raw_spin_unlock_irqrestore(&cp->lock, flags); } @@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; cpudl_heapify_up(cp, new_idx); - cpumask_clear_cpu(cpu, cp->free_cpus); + __cpumask_clear_cpu(cpu, cp->free_cpus); } else { cp->elements[old_idx].dl = dl; cpudl_heapify(cp, old_idx); @@ -238,26 +242,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) } /* - * cpudl_set_freecpu - Set the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_set_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_set_cpu(cpu, cp->free_cpus); -} - -/* - * cpudl_clear_freecpu - Clear the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_clear_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_clear_cpu(cpu, cp->free_cpus); -} - -/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 11c0f1faa7e1..d7699468eedd 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -19,8 +19,6 @@ struct cpudl { int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); -void cpudl_clear(struct cpudl *cp, int cpu); +void cpudl_clear(struct cpudl *cp, int cpu, bool online); int cpudl_init(struct cpudl *cp); -void cpudl_set_freecpu(struct cpudl *cp, int cpu); -void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 615411a0a881..48357d4609bf 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -405,7 +405,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct sched_dl_entity *dl_se) +static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task) { struct hrtimer *timer = &dl_se->inactive_timer; struct rq *rq = rq_of_dl_se(dl_se); @@ -444,10 +444,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se) } else { struct task_struct *p = dl_task_of(dl_se); - if (dl_task(p)) + if (dl_task) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (READ_ONCE(p->__state) == TASK_DEAD) @@ -1808,7 +1808,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = rb_first_cached(&dl_rq->root); @@ -2045,7 +2045,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - task_non_contending(dl_se); + task_non_contending(dl_se, true); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -2352,7 +2352,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) * __pick_next_task_dl - Helper to pick the next -deadline task to run. * @rq: The runqueue to pick the next task from. */ -static struct task_struct *__pick_task_dl(struct rq *rq) +static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2366,7 +2366,7 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick_task(dl_se); + p = dl_se->server_pick_task(dl_se, rf); if (!p) { dl_server_stop(dl_se); goto again; @@ -2379,9 +2379,9 @@ again: return p; } -static struct task_struct *pick_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) { - return __pick_task_dl(rq); + return __pick_task_dl(rq, rf); } static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) @@ -2880,9 +2880,10 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); - cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); + else + cpudl_clear(&rq->rd->cpudl, rq->cpu, true); } /* Assumes rq->lock is held */ @@ -2891,8 +2892,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_clear(&rq->rd->cpudl, rq->cpu); - cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, false); } void __init init_sched_dl_class(void) @@ -2970,7 +2970,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(&p->dl); + task_non_contending(&p->dl, false); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to @@ -3042,23 +3042,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } } +static u64 get_prio_dl(struct rq *rq, struct task_struct *p) +{ + return p->dl.deadline; +} + /* * If the scheduling parameters of a -deadline task changed, * a push or pull operation might be needed. */ -static void prio_changed_dl(struct rq *rq, struct task_struct *p, - int oldprio) +static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline) { if (!task_on_rq_queued(p)) return; - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) + if (p->dl.deadline == old_deadline) + return; + + if (dl_time_before(old_deadline, p->dl.deadline)) deadline_queue_pull_task(rq); if (task_current_donor(rq, p)) { @@ -3091,6 +3092,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(dl) = { + .queue_mask = 8, + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -3113,6 +3116,7 @@ DEFINE_SCHED_CLASS(dl) = { .task_tick = task_tick_dl, .task_fork = task_fork_dl, + .get_prio = get_prio_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5d6fdb31f18a..adff739b396c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2066,7 +2066,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && unlikely(rq->scx.cpu_released)) { @@ -2172,42 +2172,6 @@ has_tasks: return true; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) -{ - int ret; - - rq_unpin_lock(rq, rf); - - ret = balance_one(rq, prev); - -#ifdef CONFIG_SCHED_SMT - /* - * When core-sched is enabled, this ops.balance() call will be followed - * by pick_task_scx() on this CPU and the SMT siblings. Balance the - * siblings too. - */ - if (sched_core_enabled(rq)) { - const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); - int scpu; - - for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { - struct rq *srq = cpu_rq(scpu); - struct task_struct *sprev = srq->curr; - - WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); - update_rq_clock(srq); - balance_one(srq, sprev); - } - } -#endif - rq_repin_lock(rq, rf); - - maybe_queue_balance_callback(rq); - - return ret; -} - static void process_ddsp_deferred_locals(struct rq *rq) { struct task_struct *p; @@ -2387,41 +2351,26 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq) +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) { struct task_struct *prev = rq->curr; + bool keep_prev, kick_idle = false; struct task_struct *p; - bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; - bool kick_idle = false; - /* - * WORKAROUND: - * - * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just - * have gone through balance_scx(). Unfortunately, there currently is a - * bug where fair could say yes on balance() but no on pick_task(), - * which then ends up calling pick_task_scx() without preceding - * balance_scx(). - * - * Keep running @prev if possible and avoid stalling from entering idle - * without balancing. - * - * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() - * if pick_task_scx() is called without preceding balance_scx(). - */ - if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev->scx.flags & SCX_TASK_QUEUED) { - keep_prev = true; - } else { - keep_prev = false; - kick_idle = true; - } - } else if (unlikely(keep_prev && - prev->sched_class != &ext_sched_class)) { - /* - * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is - * conditional on scx_enabled() and may have been skipped. - */ + rq_modified_clear(rq); + + rq_unpin_lock(rq, rf); + balance_one(rq, prev); + rq_repin_lock(rq, rf); + + maybe_queue_balance_callback(rq); + + if (rq_modified_above(rq, &ext_sched_class)) + return RETRY_TASK; + + keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); keep_prev = false; } @@ -3016,7 +2965,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p, p->scx.weight); } -static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) { } @@ -3301,6 +3250,8 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { + .queue_mask = 1, + .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, @@ -3308,7 +3259,6 @@ DEFINE_SCHED_CLASS(ext) = { .wakeup_preempt = wakeup_preempt_scx, - .balance = balance_scx, .pick_task = pick_task_scx, .put_prev_task = put_prev_task_scx, @@ -3849,11 +3799,10 @@ static void scx_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { - struct sched_enq_and_set_ctx ctx; - /* cycling deq/enq is enough, see the function comment */ - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* nothing */ ; + } } /* resched to restore ticks and idle state */ @@ -4003,22 +3952,20 @@ static void scx_disable_workfn(struct kthread_work *work) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + update_rq_clock(task_rq(p)); - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); - - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); scx_exit_task(p); } scx_task_iter_stop(&sti); @@ -4786,26 +4733,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; if (!tryget_task_struct(p)) continue; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - p->scx.slice = SCX_SLICE_DFL; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); - - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); put_task_struct(p); } scx_task_iter_stop(&sti); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc0b7ce8a65d..2554055c1ba1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8705,15 +8705,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context set_task_max_allowed_capacity(p); } -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (sched_fair_runnable(rq)) - return 1; - - return sched_balance_newidle(rq, rf) != 0; -} - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -8822,7 +8813,7 @@ preempt: resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq) +static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) { struct sched_entity *se; struct cfs_rq *cfs_rq; @@ -8866,7 +8857,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - p = pick_task_fair(rq); + p = pick_task_fair(rq, rf); if (!p) goto idle; se = &p->se; @@ -8945,14 +8936,10 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) +static struct task_struct * +fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) { - return pick_next_task_fair(rq, prev, NULL); -} - -static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) -{ - return pick_task_fair(dl_se->rq); + return pick_task_fair(dl_se->rq, rf); } void fair_server_init(struct rq *rq) @@ -9007,7 +8994,19 @@ static void yield_task_fair(struct rq *rq) */ rq_clock_skip_update(rq); - se->deadline += calc_delta_fair(se->slice, se); + /* + * Forfeit the remaining vruntime, only if the entity is eligible. This + * condition is necessary because in core scheduling we prefer to run + * ineligible tasks rather than force idling. If this happens we may + * end up in a loop where the core scheduler picks the yielding task, + * which yields immediately again; without the condition the vruntime + * ends up quickly running away. + */ + if (entity_eligible(cfs_rq, se)) { + se->vruntime = se->deadline; + se->deadline += calc_delta_fair(se->slice, se); + update_min_vruntime(cfs_rq); + } } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -10671,7 +10670,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (sd->flags & SD_ASYM_CPUCAPACITY) sgs->group_misfit_task_load = 1; - for_each_cpu(i, sched_group_span(group)) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -12829,6 +12828,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) } rcu_read_unlock(); + rq_modified_clear(this_rq); raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); @@ -12886,8 +12886,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_queued) + /* If a higher prio class was modified, restart the pick */ + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -13138,11 +13138,14 @@ static void task_fork_fair(struct task_struct *p) * the current task. */ static void -prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (rq->cfs.nr_queued == 1) return; @@ -13154,8 +13157,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); - } else + } else { wakeup_preempt(rq, p, 0); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13237,6 +13241,12 @@ static void attach_task_cfs_rq(struct task_struct *p) attach_entity_cfs_rq(se); } +static void switching_from_fair(struct rq *rq, struct task_struct *p) +{ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); +} + static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); @@ -13611,6 +13621,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task */ DEFINE_SCHED_CLASS(fair) = { + .queue_mask = 2, + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -13619,11 +13631,10 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, .pick_task = pick_task_fair, - .pick_next_task = __pick_next_task_fair, + .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, - .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13638,6 +13649,7 @@ DEFINE_SCHED_CLASS(fair) = { .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, + .switching_from = switching_from_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c39b089d4f09..7fa0b593bcff 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -466,7 +466,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir next->se.exec_start = rq_clock_task(rq); } -struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) { scx_update_idle(rq, true, false); return rq->idle; @@ -498,14 +498,17 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } -static void switched_to_idle(struct rq *rq, struct task_struct *p) +static void switching_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); } @@ -518,6 +521,8 @@ static void update_curr_idle(struct rq *rq) */ DEFINE_SCHED_CLASS(idle) = { + .queue_mask = 0, + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -536,6 +541,6 @@ DEFINE_SCHED_CLASS(idle) = { .task_tick = task_tick_idle, .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, + .switching_to = switching_to_idle, .update_curr = update_curr_idle, }; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7936d4333731..1fd97f2d7ec6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) { struct task_struct *p; @@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * us to initiate a push or pull. */ static void -prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (task_current_donor(rq, p)) { /* * If our priority decreases while running, we @@ -2566,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(rt) = { + .queue_mask = 4, + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -2589,8 +2594,8 @@ DEFINE_SCHED_CLASS(rt) = { .get_rr_interval = get_rr_interval_rt, - .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + .prio_changed = prio_changed_rt, .update_curr = update_curr_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3f7fab3d7960..63ffb3eafd05 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -20,7 +20,6 @@ #include <linux/sched/task_flags.h> #include <linux/sched/task.h> #include <linux/sched/topology.h> - #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bug.h> @@ -780,7 +779,6 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ @@ -1120,6 +1118,8 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; + /* Per class runqueue modification mask; bits in class order. */ + unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1827,7 +1827,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void +__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1839,8 +1840,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - rq_unpin_lock(rq, rf); - raw_spin_rq_unlock(rq); + __task_rq_unlock(rq, p, rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } @@ -1849,6 +1849,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, + _T->rq = __task_rq_lock(_T->lock, &_T->rf), + __task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { @@ -2342,8 +2347,7 @@ extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: * - * DEQUEUE_SLEEP - task is no longer runnable - * ENQUEUE_WAKEUP - task just became runnable + * SLEEP/WAKEUP - task is no-longer/just-became runnable * * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks * are in a known state which allows modification. Such pairs @@ -2356,34 +2360,46 @@ extern const u32 sched_prio_to_wmult[40]; * * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) * + * DELAYED - de/re-queue a sched_delayed task + * + * CLASS - going to update p->sched_class; makes sched_change call the + * various switch methods. + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * + * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but + * SCHED_DEADLINE seems to rely on this for now. */ -#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ -#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ -#define DEQUEUE_SPECIAL 0x10 -#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ -#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ -#define DEQUEUE_THROTTLE 0x800 - -#define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_RESTORE 0x02 -#define ENQUEUE_MOVE 0x04 -#define ENQUEUE_NOCLOCK 0x08 - -#define ENQUEUE_HEAD 0x10 -#define ENQUEUE_REPLENISH 0x20 -#define ENQUEUE_MIGRATED 0x40 -#define ENQUEUE_INITIAL 0x80 -#define ENQUEUE_MIGRATING 0x100 -#define ENQUEUE_DELAYED 0x200 -#define ENQUEUE_RQ_SELECTED 0x400 +#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */ +#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */ + +#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */ + +#define DEQUEUE_SPECIAL 0x00010000 +#define DEQUEUE_THROTTLE 0x00020000 + +#define ENQUEUE_WAKEUP 0x0001 +#define ENQUEUE_RESTORE 0x0002 +#define ENQUEUE_MOVE 0x0004 +#define ENQUEUE_NOCLOCK 0x0008 + +#define ENQUEUE_MIGRATING 0x0010 +#define ENQUEUE_DELAYED 0x0020 +#define ENQUEUE_CLASS 0x0040 + +#define ENQUEUE_HEAD 0x00010000 +#define ENQUEUE_REPLENISH 0x00020000 +#define ENQUEUE_MIGRATED 0x00040000 +#define ENQUEUE_INITIAL 0x00080000 +#define ENQUEUE_RQ_SELECTED 0x00100000 #define RETRY_TASK ((void *)-1UL) @@ -2400,16 +2416,61 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif + /* + * idle: 0 + * ext: 1 + * fair: 2 + * rt: 4 + * dl: 8 + * stop: 16 + */ + unsigned int queue_mask; + /* + * move_queued_task/activate_task/enqueue_task: rq->lock + * ttwu_do_activate/activate_task/enqueue_task: rq->lock + * wake_up_new_task/activate_task/enqueue_task: task_rq_lock + * ttwu_runnable/enqueue_task: task_rq_lock + * proxy_task_current: rq->lock + * sched_change_end + */ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + /* + * move_queued_task/deactivate_task/dequeue_task: rq->lock + * __schedule/block_task/dequeue_task: rq->lock + * proxy_task_current: rq->lock + * wait_task_inactive: task_rq_lock + * sched_change_begin + */ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + + /* + * do_sched_yield: rq->lock + */ void (*yield_task) (struct rq *rq); + /* + * yield_to: rq->lock (double) + */ bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + /* + * move_queued_task: rq->lock + * __migrate_swap_task: rq->lock + * ttwu_do_activate: rq->lock + * ttwu_runnable: task_rq_lock + * wake_up_new_task: task_rq_lock + */ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + /* + * schedule/pick_next_task/prev_balance: rq->lock + */ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - struct task_struct *(*pick_task)(struct rq *rq); + + /* + * schedule/pick_next_task: rq->lock + */ + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); /* * Optional! When implemented pick_next_task() should be equivalent to: * @@ -2419,55 +2480,123 @@ struct sched_class { * set_next_task_first(next); * } */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); + /* + * sched_change: + * __schedule: rq->lock + */ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + /* + * select_task_rq: p->pi_lock + * sched_exec: p->pi_lock + */ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + /* + * set_task_cpu: p->pi_lock || rq->lock (ttwu like) + */ void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + /* + * ttwu_do_activate: rq->lock + * wake_up_new_task: task_rq_lock + */ void (*task_woken)(struct rq *this_rq, struct task_struct *task); + /* + * do_set_cpus_allowed: task_rq_lock + sched_change + */ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); + /* + * sched_set_rq_{on,off}line: rq->lock + */ void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + /* + * push_cpu_stop: p->pi_lock && rq->lock + */ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); + /* + * hrtick: rq->lock + * sched_tick: rq->lock + * sched_tick_remote: rq->lock + */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + /* + * sched_cgroup_fork: p->pi_lock + */ void (*task_fork)(struct task_struct *p); + /* + * finish_task_switch: no locks + */ void (*task_dead)(struct task_struct *p); /* - * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. + * sched_change + */ + void (*switching_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switching_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + u64 (*get_prio) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + u64 oldprio); + + /* + * set_load_weight: task_rq_lock + sched_change + * __setscheduler_parms: task_rq_lock + sched_change */ - void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*reweight_task)(struct rq *this_rq, struct task_struct *task, const struct load_weight *lw); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + /* + * sched_rr_get_interval: task_rq_lock + */ unsigned int (*get_rr_interval)(struct rq *rq, struct task_struct *task); + /* + * task_sched_runtime: task_rq_lock + */ void (*update_curr)(struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED + /* + * sched_change_group: task_rq_lock + sched_change + */ void (*task_change_group)(struct task_struct *p); #endif #ifdef CONFIG_SCHED_CORE + /* + * pick_next_task: rq->lock + * try_steal_cookie: rq->lock (double) + */ int (*task_is_throttled)(struct task_struct *p, int cpu); #endif }; +/* + * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. + */ +static inline void rq_modified_clear(struct rq *rq) +{ + rq->queue_mask = 0; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) +{ + unsigned int mask = class->queue_mask; + return rq->queue_mask & ~((mask << 1) - 1); +} + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -2579,8 +2708,9 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_task_idle(struct rq *rq); +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); +extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2610,7 +2740,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) static inline cpumask_t *alloc_user_cpus_ptr(int node) { /* - * See do_set_cpus_allowed() above for the rcu_head usage. + * See set_cpus_allowed_force() above for the rcu_head usage. */ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); @@ -3877,32 +4007,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class); -extern void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio); - extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#ifdef CONFIG_SCHED_CLASS_EXT /* - * Used by SCX in the enable/disable paths to move tasks between sched_classes - * and establish invariants. + * The 'sched_change' pattern is the safe, easy and slow way of changing a + * task's scheduling properties. It dequeues a task, such that the scheduler + * is fully unaware of it; at which point its properties can be modified; + * after which it is enqueued again. + * + * Typically this must be called while holding task_rq_lock, since most/all + * properties are serialized under those locks. There is currently one + * exception to this rule in sched/ext which only holds rq->lock. + */ + +/* + * This structure is a temporary, used to preserve/convey the queueing state + * of the task between sched_change_begin() and sched_change_end(). Ensuring + * the task's queueing state is idempotent across the operation. */ -struct sched_enq_and_set_ctx { +struct sched_change_ctx { + u64 prio; struct task_struct *p; - int queue_flags; + int flags; bool queued; bool running; }; -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx); -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); +void sched_change_end(struct sched_change_ctx *ctx); -#endif /* CONFIG_SCHED_CLASS_EXT */ +DEFINE_CLASS(sched_change, struct sched_change_ctx *, + sched_change_end(_T), + sched_change_begin(p, flags), + struct task_struct *p, unsigned int flags) + +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) #include "ext.h" diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 26f3fd4d34ce..cbf7206b3f9d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) rq = __task_rq_lock(p, &rf); psi_task_change(p, p->psi_flags, 0); - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); } } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 2d4e279f05ee..4f9192be4b5b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) { if (!sched_stop_runnable(rq)) return NULL; @@ -75,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p) +static void switching_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } static void -prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); /* how!?, what priority? */ } @@ -95,6 +98,8 @@ static void update_curr_stop(struct rq *rq) */ DEFINE_SCHED_CLASS(stop) = { + .queue_mask = 16, + .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, @@ -112,6 +117,6 @@ DEFINE_SCHED_CLASS(stop) = { .task_tick = task_tick_stop, .prio_changed = prio_changed_stop, - .switched_to = switched_to_stop, + .switching_to = switching_to_stop, .update_curr = update_curr_stop, }; diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 77ae87f36e84..8f0f603b530b 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; - struct rq *rq; int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - CLASS(task_rq_lock, rq_guard)(p); - rq = rq_guard.rq; - - update_rq_clock(rq); + guard(task_rq_lock)(p); /* * The RT priorities are set via sched_setscheduler(), but we still @@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice) return; } - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); + scoped_guard (sched_change, p, DEQUEUE_SAVE) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } } EXPORT_SYMBOL(set_user_nice); @@ -515,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; @@ -695,38 +674,27 @@ change: prev_class = p->sched_class; next_class = __setscheduler_class(policy, newprio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flags |= DEQUEUE_CLASS; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - p->sched_class = next_class; - p->prio = newprio; - } - __setscheduler_uclamp(p, attr); - check_class_changing(rq, p, prev_class); + scoped_guard (sched_change, p, queue_flags) { - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); - enqueue_task(rq, p, queue_flags); + if (scope->queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + scope->flags |= ENQUEUE_HEAD; + } } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); /* Avoid rq from going away on us: */ preempt_disable(); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 444bdfdab731..711076aa4980 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA enum numa_topology_type sched_numa_topology_type; +/* + * sched_domains_numa_distance is derived from sched_numa_node_distance + * and provides a simplified view of NUMA distances used specifically + * for building NUMA scheduling domains. + */ static int sched_domains_numa_levels; +static int sched_numa_node_levels; int sched_max_numa_distance; static int *sched_domains_numa_distance; +static int *sched_numa_node_distance; static struct cpumask ***sched_domains_numa_masks; #endif /* CONFIG_NUMA */ @@ -1845,10 +1852,10 @@ bool find_numa_distance(int distance) return true; rcu_read_lock(); - distances = rcu_dereference(sched_domains_numa_distance); + distances = rcu_dereference(sched_numa_node_distance); if (!distances) goto unlock; - for (i = 0; i < sched_domains_numa_levels; i++) { + for (i = 0; i < sched_numa_node_levels; i++) { if (distances[i] == distance) { found = true; break; @@ -1924,14 +1931,34 @@ static void init_numa_topology_type(int offline_node) #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(int offline_node) +/* + * An architecture could modify its NUMA distance, to change + * grouping of NUMA nodes and number of NUMA levels when creating + * NUMA level sched domains. + * + * A NUMA level is created for each unique + * arch_sched_node_distance. + */ +static int numa_node_dist(int i, int j) { - struct sched_domain_topology_level *tl; - unsigned long *distance_map; + return node_distance(i, j); +} + +int arch_sched_node_distance(int from, int to) + __weak __alias(numa_node_dist); + +static bool modified_sched_node_distance(void) +{ + return numa_node_dist != arch_sched_node_distance; +} + +static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), + int **dist, int *levels) +{ + unsigned long *distance_map __free(bitmap) = NULL; int nr_levels = 0; int i, j; int *distances; - struct cpumask ***masks; /* * O(nr_nodes^2) de-duplicating selection sort -- in order to find the @@ -1939,17 +1966,16 @@ void sched_init_numa(int offline_node) */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); if (!distance_map) - return; + return -ENOMEM; bitmap_zero(distance_map, NR_DISTANCE_VALUES); for_each_cpu_node_but(i, offline_node) { for_each_cpu_node_but(j, offline_node) { - int distance = node_distance(i, j); + int distance = n_dist(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); - bitmap_free(distance_map); - return; + return -EINVAL; } bitmap_set(distance_map, distance, 1); @@ -1962,18 +1988,46 @@ void sched_init_numa(int offline_node) nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!distances) { - bitmap_free(distance_map); - return; - } + if (!distances) + return -ENOMEM; for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); distances[i] = j; } - rcu_assign_pointer(sched_domains_numa_distance, distances); + *dist = distances; + *levels = nr_levels; + + return 0; +} + +void sched_init_numa(int offline_node) +{ + struct sched_domain_topology_level *tl; + int nr_levels, nr_node_levels; + int i, j; + int *distances, *domain_distances; + struct cpumask ***masks; - bitmap_free(distance_map); + /* Record the NUMA distances from SLIT table */ + if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, + &nr_node_levels)) + return; + + /* Record modified NUMA distances for building sched domains */ + if (modified_sched_node_distance()) { + if (sched_record_numa_dist(offline_node, arch_sched_node_distance, + &domain_distances, &nr_levels)) { + kfree(distances); + return; + } + } else { + domain_distances = distances; + nr_levels = nr_node_levels; + } + rcu_assign_pointer(sched_numa_node_distance, distances); + WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); + WRITE_ONCE(sched_numa_node_levels, nr_node_levels); /* * 'nr_levels' contains the number of unique distances @@ -1991,6 +2045,8 @@ void sched_init_numa(int offline_node) * * We reset it to 'nr_levels' at the end of this function. */ + rcu_assign_pointer(sched_domains_numa_distance, domain_distances); + sched_domains_numa_levels = 0; masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); @@ -2016,10 +2072,13 @@ void sched_init_numa(int offline_node) masks[i][j] = mask; for_each_cpu_node_but(k, offline_node) { - if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + if (sched_debug() && + (arch_sched_node_distance(j, k) != + arch_sched_node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); - if (node_distance(j, k) > sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, k) > + sched_domains_numa_distance[i]) continue; cpumask_or(mask, mask, cpumask_of_node(k)); @@ -2059,7 +2118,6 @@ void sched_init_numa(int offline_node) sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); init_numa_topology_type(offline_node); } @@ -2067,14 +2125,18 @@ void sched_init_numa(int offline_node) static void sched_reset_numa(void) { - int nr_levels, *distances; + int nr_levels, *distances, *dom_distances = NULL; struct cpumask ***masks; nr_levels = sched_domains_numa_levels; + sched_numa_node_levels = 0; sched_domains_numa_levels = 0; sched_max_numa_distance = 0; sched_numa_topology_type = NUMA_DIRECT; - distances = sched_domains_numa_distance; + distances = sched_numa_node_distance; + if (sched_numa_node_distance != sched_domains_numa_distance) + dom_distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_numa_node_distance, NULL); rcu_assign_pointer(sched_domains_numa_distance, NULL); masks = sched_domains_numa_masks; rcu_assign_pointer(sched_domains_numa_masks, NULL); @@ -2083,6 +2145,7 @@ static void sched_reset_numa(void) synchronize_rcu(); kfree(distances); + kfree(dom_distances); for (i = 0; i < nr_levels && masks; i++) { if (!masks[i]) continue; @@ -2129,7 +2192,8 @@ void sched_domains_numa_masks_set(unsigned int cpu) continue; /* Set ourselves in the remote node's masks */ - if (node_distance(j, node) <= sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, node) <= + sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } } |
