Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  1226
1 file changed, 560 insertions(+), 666 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f754a60de848..fc358c1b6ca9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -583,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state);
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
- * is set by activate_task() and cleared by deactivate_task(), under
- * rq->lock. Non-zero indicates the task is runnable, the special
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
@@ -2089,6 +2090,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
+ rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2121,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
+ rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2128,8 +2131,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
- if (flags & ENQUEUE_MIGRATED)
- sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
@@ -2169,37 +2170,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class)
-{
- if (prev_class != p->sched_class && p->sched_class->switching_to)
- p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
-{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
-
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
@@ -2362,7 +2332,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
@@ -2377,10 +2347,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
if (p->cpus_ptr != &p->cpus_mask)
return;
- /*
- * Violates locking rules! See comment in __do_set_cpus_allowed().
- */
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
}
void ___migrate_enable(void)
@@ -2613,7 +2581,8 @@ static int migration_cpu_stop(void *data)
*/
WARN_ON_ONCE(!pending->stop_pending);
preempt_disable();
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
preempt_enable();
@@ -2622,7 +2591,8 @@ static int migration_cpu_stop(void *data)
out:
if (pending)
pending->stop_pending = false;
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
if (complete)
complete_all(&pending->done);
@@ -2671,6 +2641,8 @@ out_unlock:
return 0;
}
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask);
+
/*
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
@@ -2684,6 +2656,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
cpumask_copy(&p->cpus_mask, ctx->new_mask);
p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+ mm_update_cpus_allowed(p->mm, ctx->new_mask);
/*
* Swap in a new user_cpus_ptr if SCA_USER flag set
@@ -2693,56 +2666,17 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- struct rq *rq = task_rq(p);
- bool queued, running;
-
- /*
- * This here violates the locking rules for affinity, since we're only
- * supposed to change these variables while holding both rq->lock and
- * p->pi_lock.
- *
- * HOWEVER, it magically works, because ttwu() is the only code that
- * accesses these variables under p->pi_lock and only does so after
- * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
- * before finish_task().
- *
- * XXX do further audits, this smells like something putrid.
- */
- if (ctx->flags & SCA_MIGRATE_DISABLE)
- WARN_ON_ONCE(!p->on_cpu);
- else
- lockdep_assert_held(&p->pi_lock);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued) {
- /*
- * Because __kthread_bind() calls this on blocked tasks without
- * holding rq->lock.
- */
- lockdep_assert_rq_held(rq);
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- }
- if (running)
- put_prev_task(rq, p);
-
- p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->sched_class->set_cpus_allowed(p, ctx);
}
/*
* Used for kthread_bind() and select_fallback_rq(), in both cases the user
* affinity (if any) should be destroyed too.
*/
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
@@ -2754,7 +2688,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
struct rcu_head rcu;
};
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (__task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -2792,7 +2727,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
* Use pi_lock to protect content of user_cpus_ptr
*
* Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
- * do_set_cpus_allowed().
+ * set_cpus_allowed_force().
*/
raw_spin_lock_irqsave(&src->pi_lock, flags);
if (src->user_cpus_ptr) {
@@ -3064,8 +2999,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
unsigned int dest_cpu;
int ret = 0;
- update_rq_clock(rq);
-
if (kthread || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs,
@@ -3120,7 +3053,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
goto out;
}
- __do_set_cpus_allowed(p, ctx);
+ do_set_cpus_allowed(p, ctx);
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
@@ -3329,8 +3262,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- rseq_migrate(p);
- sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -3529,13 +3460,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
- /*
- * XXX When called from select_task_rq() we only
- * hold p->pi_lock and again violate locking order.
- *
- * More yuck to audit.
- */
- do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
+ set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
@@ -3777,7 +3702,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
ttwu_do_wakeup(p);
ret = 1;
}
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
return ret;
}
@@ -4231,7 +4156,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * schedule()'s block_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
@@ -4370,7 +4295,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
ret = func(p, arg);
if (rq)
- rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
@@ -4487,7 +4412,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
- init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4763,7 +4687,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
p->sched_task_group = tg;
}
#endif
- rseq_migrate(p);
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -4827,7 +4750,6 @@ void wake_up_new_task(struct task_struct *p)
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
- rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -5121,7 +5043,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
- rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5284,19 +5205,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
*
* kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
- *
- * switch_mm_cid() needs to be updated if the barriers provided
- * by context_switch() are modified.
*/
- if (!next->mm) { // to kernel
+ if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
- if (prev->mm) // from user
+ if (prev->mm) // from user
mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
- } else { // to user
+ } else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
@@ -5309,15 +5227,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
- if (!prev->mm) { // from kernel
+ if (!prev->mm) { // from kernel
/* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
- /* switch_mm_cid() requires the memory barriers above. */
- switch_mm_cid(rq, prev, next);
+ mm_cid_switch_to(prev, next);
+
+ /*
+ * Tell rseq that the task was scheduled in. Must be after
+ * mm_cid_switch_to() to get the TIF flag set.
+ */
+ rseq_sched_switch_event(next);
prepare_lock_switch(rq, next, rf);
@@ -5602,7 +5525,6 @@ void sched_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- task_tick_mm_cid(rq, donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -5692,7 +5614,7 @@ static void sched_tick_remote(struct work_struct *work)
* reasonable amount of time.
*/
u64 delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30);
}
curr->sched_class->task_tick(rq, curr, 0);
@@ -5916,19 +5838,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
-#ifdef CONFIG_SCHED_CLASS_EXT
- /*
- * SCX requires a balance() call before every pick_task() including when
- * waking up from SCHED_IDLE. If @start_class is below SCX, start from
- * SCX instead. Also, set a flag to detect missing balance() call.
- */
- if (scx_enabled()) {
- rq->scx.flags |= SCX_RQ_BAL_PENDING;
- if (sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
- }
-#endif
-
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
@@ -5972,7 +5881,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* Assume the next prioritized class is idle_sched_class */
if (!p) {
- p = pick_task_idle(rq);
+ p = pick_task_idle(rq, rf);
put_prev_set_next_task(rq, prev, p);
}
@@ -5984,11 +5893,15 @@ restart:
for_each_active_class(class) {
if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev);
+ p = class->pick_next_task(rq, prev, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
if (p)
return p;
} else {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
if (p) {
put_prev_set_next_task(rq, prev, p);
return p;
@@ -6018,7 +5931,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}
-static inline struct task_struct *pick_task(struct rq *rq)
+/*
+ * Careful; this can return RETRY_TASK. It does not include the retry-loop
+ * itself due to the whole SMT pick retry thing below.
+ */
+static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -6026,7 +5943,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
rq->dl_server = NULL;
for_each_active_class(class) {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
if (p)
return p;
}
@@ -6041,7 +5958,7 @@ static void queue_core_balance(struct rq *rq);
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct task_struct *next, *p, *max = NULL;
+ struct task_struct *next, *p, *max;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
@@ -6126,7 +6043,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* and there are no cookied tasks running on siblings.
*/
if (!need_sync) {
- next = pick_task(rq);
+restart_single:
+ next = pick_task(rq, rf);
+ if (unlikely(next == RETRY_TASK))
+ goto restart_single;
if (!next->core_cookie) {
rq->core_pick = NULL;
rq->core_dl_server = NULL;
@@ -6146,6 +6066,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* Tie-break prio towards the current CPU
*/
+restart_multi:
+ max = NULL;
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
@@ -6157,7 +6079,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
- rq_i->core_pick = p = pick_task(rq_i);
+ p = pick_task(rq_i, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart_multi;
+
+ rq_i->core_pick = p;
rq_i->core_dl_server = rq_i->dl_server;
if (!max || prio_less(max, p, fi_before))
@@ -6179,7 +6105,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (cookie)
p = sched_core_find(rq_i, cookie);
if (!p)
- p = idle_sched_class.pick_task(rq_i);
+ p = idle_sched_class.pick_task(rq_i, rf);
}
rq_i->core_pick = p;
@@ -6812,6 +6738,7 @@ static void __sched notrace __schedule(int sched_mode)
local_irq_disable();
rcu_note_context_switch(preempt);
+ migrate_disable_switch(rq, prev);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -6918,7 +6845,6 @@ keep_resched:
*/
++*switch_count;
- migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
prev->se.sched_delayed);
@@ -7326,7 +7252,7 @@ void rt_mutex_post_schedule(void)
*/
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int prio, oldprio, queued, running, queue_flag =
+ int prio, oldprio, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
@@ -7388,64 +7314,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
prev_class = p->sched_class;
next_class = __setscheduler_class(p->policy, prio);
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
+ if (prev_class != next_class)
+ queue_flag |= DEQUEUE_CLASS;
- /*
- * Boosting condition are:
- * 1. -rt task is running and holds mutex A
- * --> -dl task blocks on mutex A
- *
- * 2. -dl task is running and holds mutex A
- * --> -dl task blocks on mutex A and could preempt the
- * running task
- */
- if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_prio(pi_task->prio) &&
- dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.pi_se = pi_task->dl.pi_se;
- queue_flag |= ENQUEUE_REPLENISH;
+ scoped_guard (sched_change, p, queue_flag) {
+ /*
+ * Boosting conditions are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ scope->flags |= ENQUEUE_HEAD;
} else {
- p->dl.pi_se = &p->dl;
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
}
- } else if (rt_prio(prio)) {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (oldprio < prio)
- queue_flag |= ENQUEUE_HEAD;
- } else {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (rt_prio(oldprio))
- p->rt.timeout = 0;
- }
-
- p->sched_class = next_class;
- p->prio = prio;
-
- check_class_changing(rq, p, prev_class);
- if (queued)
- enqueue_task(rq, p, queue_flag);
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
+ p->sched_class = next_class;
+ p->prio = prio;
+ }
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_rq_unlock(rq);
+ rq_repin_lock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
preempt_enable();
}
@@ -8084,26 +7997,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- bool queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
- task_rq_unlock(rq, p, &rf);
+ guard(task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->numa_preferred_nid = nid;
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -8141,18 +8037,15 @@ static int __balance_push_cpu_stop(void *arg)
struct rq_flags rf;
int cpu;
- raw_spin_lock_irq(&p->pi_lock);
- rq_lock(rq, &rf);
-
- update_rq_clock(rq);
-
- if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ scoped_guard (raw_spinlock_irq, &p->pi_lock) {
cpu = select_fallback_rq(rq->cpu, p);
- rq = __migrate_task(rq, &rf, p, cpu);
- }
- rq_unlock(rq, &rf);
- raw_spin_unlock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ if (task_rq(p) == rq && task_on_rq_queued(p))
+ rq = __migrate_task(rq, &rf, p, cpu);
+ rq_unlock(rq, &rf);
+ }
put_task_struct(p);
@@ -8591,6 +8484,8 @@ void __init sched_init_smp(void)
{
sched_init_numa(NUMA_NO_NODE);
+ prandom_init_once(&sched_rnd_state);
+
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
@@ -9207,38 +9102,23 @@ static void sched_change_group(struct task_struct *tsk)
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- int queued, running, queue_flags =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- update_rq_clock(rq);
-
- running = task_current_donor(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, queue_flags);
- if (running)
- put_prev_task(rq, tsk);
-
- sched_change_group(tsk);
- if (!for_autogroup)
- scx_cgroup_move_task(tsk);
+ scoped_guard (sched_change, tsk, queue_flags) {
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+ if (scope->running)
+ resched = true;
+ }
- if (queued)
- enqueue_task(rq, tsk, queue_flags);
- if (running) {
- set_next_task(rq, tsk);
- /*
- * After changing group, the running task may have joined a
- * throttled one but it's still the running task. Trigger a
- * resched to make sure that task can still run.
- */
+ if (resched)
resched_curr(rq);
- }
}
static struct cgroup_subsys_state *
@@ -10374,557 +10254,571 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
}
#ifdef CONFIG_SCHED_MM_CID
-
-/*
- * @cid_lock: Guarantee forward-progress of cid allocation.
- *
- * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
- * is only used when contention is detected by the lock-free allocation so
- * forward progress can be guaranteed.
- */
-DEFINE_RAW_SPINLOCK(cid_lock);
-
-/*
- * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
- *
- * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
- * detected, it is set to 1 to ensure that all newly coming allocations are
- * serialized by @cid_lock until the allocation which detected contention
- * completes and sets @use_cid_lock back to 0. This guarantees forward progress
- * of a cid allocation.
- */
-int use_cid_lock;
-
/*
- * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
- * concurrently with respect to the execution of the source runqueue context
- * switch.
- *
- * There is one basic properties we want to guarantee here:
- *
- * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
- * used by a task. That would lead to concurrent allocation of the cid and
- * userspace corruption.
+ * Concurrency IDentifier management
*
- * Provide this guarantee by introducing a Dekker memory ordering to guarantee
- * that a pair of loads observe at least one of a pair of stores, which can be
- * shown as:
+ * Serialization rules:
*
- * X = Y = 0
+ * mm::mm_cid::mutex: Serializes fork() and exit() and therefore
+ * protects mm::mm_cid::users.
*
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
+ * mm::mm_cid::lock: Serializes mm_update_max_cids() and
+ * mm_update_cpus_allowed(). Nests in mm_cid::mutex
+ * and runqueue lock.
*
- * Which guarantees that x==0 && y==0 is impossible. But rather than using
- * values 0 and 1, this algorithm cares about specific state transitions of the
- * runqueue current task (as updated by the scheduler context switch), and the
- * per-mm/cpu cid value.
+ * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks
+ * and can only be modified with atomic operations.
*
- * Let's introduce task (Y) which has task->mm == mm and task (N) which has
- * task->mm != mm for the rest of the discussion. There are two scheduler state
- * transitions on context switch we care about:
+ * The mm::mm_cid::pcpu per CPU storage is protected by the CPU's runqueue
+ * lock.
*
- * (TSA) Store to rq->curr with transition from (N) to (Y)
+ * CID ownership:
*
- * (TSB) Store to rq->curr with transition from (Y) to (N)
+ * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
+ * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
+ * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
+ * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
+ * task needs to drop the CID into the pool when scheduling out. Both bits
+ * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
+ * actually handed over to user space in the RSEQ memory.
*
- * On the remote-clear side, there is one transition we care about:
+ * Mode switching:
*
- * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ * Switching to per CPU mode happens when the user count becomes greater
+ * than the maximum number of CIDs, which is calculated by:
*
- * There is also a transition to UNSET state which can be performed from all
- * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
- * guarantees that only a single thread will succeed:
+ * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
+ * max_cids = min(1.25 * opt_cids, num_possible_cpus());
*
- * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ * The +25% allowance avoids frequent mode switches for tight CPU masks in
+ * scenarios where only a few threads are created and destroyed. The
+ * allowance shrinks, though, the closer opt_cids comes to
+ * num_possible_cpus(), which is the (unfortunate) hard ABI limit.
*
- * Just to be clear, what we do _not_ want to happen is a transition to UNSET
- * when a thread is actively using the cid (property (1)).
+ * At the point of switching to per CPU mode the new user is not yet
+ * visible in the system, so the task which initiated the fork() runs the
+ * fixup function: mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either transfers each task's owned CID to the CPU the task runs on or
+ * drops it into the CID pool if a task is not on a CPU at that point in
+ * time. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes,
+ * it's guaranteed that no task related to that MM owns a CID anymore.
*
- * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ * Switching back to task mode happens when the user count goes below the
+ * threshold which was recorded on the per CPU mode switch:
*
- * Scenario A) (TSA)+(TMA) (from next task perspective)
+ * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2);
*
- * CPU0 CPU1
+ * This threshold is updated when an affinity change increases the number of
+ * allowed CPUs for the MM, which might cause a switch back to per task
+ * mode.
*
- * Context switch CS-1 Remote-clear
- * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
- * (implied barrier after cmpxchg)
- * - switch_mm_cid()
- * - memory barrier (see switch_mm_cid()
- * comment explaining how this barrier
- * is combined with other scheduler
- * barriers)
- * - mm_cid_get (next)
- * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ * If the switch back was initiated by an exiting task, then that task runs
+ * the fixup function. If it was initiated by an affinity change, then it
+ * is run either in the deferred update function in the context of a
+ * workqueue, or by a task which forks a new one, or by a task which
+ * exits, whichever happens first. mm_cid_fixup_cpus_to_tasks() walks the
+ * possible CPUs and either transfers the CPU owned CIDs to a related task
+ * which runs on the CPU or drops them into the pool. Tasks which schedule
+ * in on a CPU which the walk did not cover yet do the handover themselves.
*
- * This Dekker ensures that either task (Y) is observed by the
- * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
- * observed.
+ * This transition from CPU to per task ownership happens in two phases:
*
- * If task (Y) store is observed by rcu_dereference(), it means that there is
- * still an active task on the cpu. Remote-clear will therefore not transition
- * to UNSET, which fulfills property (1).
+ * 1) mm::mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed into the
+ * task CID and denotes that the CID is only temporarily owned by the
+ * task. When it schedules out, the task drops the CID back into the
+ * pool if this bit is set.
*
- * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
- * it will move its state to UNSET, which clears the percpu cid perhaps
- * uselessly (which is not an issue for correctness). Because task (Y) is not
- * observed, CPU1 can move ahead to set the state to UNSET. Because moving
- * state to UNSET is done with a cmpxchg expecting that the old state has the
- * LAZY flag set, only one thread will successfully UNSET.
+ * 2) The initiating context walks the per CPU space and after completion
+ * clears mm::mm_cid.transit. So after that point the CIDs are strictly
+ * task owned again.
*
- * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
- * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
- * CPU1 will observe task (Y) and do nothing more, which is fine.
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail if
+ * two tasks are scheduled in on the same CPU before the fixup freed per
+ * CPU CIDs.
*
- * What we are effectively preventing with this Dekker is a scenario where
- * neither LAZY flag nor store (Y) are observed, which would fail property (1)
- * because this would UNSET a cid which is actively used.
+ * When mm_cid_fixup_cpus_to_tasks() completes, it's guaranteed that no CID
+ * related to that MM is owned by a CPU anymore.
*/
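To make the threshold arithmetic above concrete, here is a small stand-alone user-space sketch (an editorial illustration, not part of this patch) that plugs numbers into the opt_cids/max_cids/pcpu_thrs formulas, assuming 8 possible CPUs and a 4 CPU affinity mask; all helper names are made up:

/*
 * Editorial illustration only -- not part of the patch. Evaluates the
 * opt_cids/max_cids/pcpu_thrs formulas from the comment above for a
 * growing user count, assuming 8 possible CPUs and 4 allowed CPUs.
 */
#include <stdio.h>

#define NR_POSSIBLE_CPUS	8U

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

int main(void)
{
	unsigned int nr_cpus_allowed = 4;

	for (unsigned int users = 1; users <= 8; users++) {
		/* opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users) */
		unsigned int opt_cids = min_u(nr_cpus_allowed, users);
		/* max_cids = min(opt_cids + 25%, num_possible_cpus()) */
		unsigned int max_cids = min_u(opt_cids + opt_cids / 4, NR_POSSIBLE_CPUS);
		/* pcpu_thrs, clamped to >= 1 as in mm_cid_calc_pcpu_thrs() */
		unsigned int pcpu_thrs = max_u(min_u(opt_cids - opt_cids / 4,
						     NR_POSSIBLE_CPUS / 2), 1);

		printf("users=%u opt_cids=%u max_cids=%u pcpu_thrs=%u switch_to_percpu=%s\n",
		       users, opt_cids, max_cids, pcpu_thrs,
		       users > max_cids ? "yes" : "no");
	}
	return 0;
}

With a 4 CPU affinity mask the switch to per CPU mode triggers at users=6 (max_cids stays at 5), while pcpu_thrs settles at 3, i.e. the mm switches back to per task mode only once the user count drops below 3, which illustrates the hysteresis between the two modes.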
-void sched_mm_cid_migrate_from(struct task_struct *t)
-{
- t->migrate_from_cpu = task_cpu(t);
-}
-
-static
-int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid)
+/*
+ * Update the CID range properties when the constraints change. Invoked via
+ * fork(), exit() and affinity changes
+ */
+static void __mm_update_max_cids(struct mm_mm_cid *mc)
{
- struct mm_struct *mm = t->mm;
- struct task_struct *src_task;
- int src_cid, last_mm_cid;
+ unsigned int opt_cids, max_cids;
- if (!mm)
- return -1;
+ /* Calculate the new optimal constraint */
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
- last_mm_cid = t->last_mm_cid;
- /*
- * If the migrated task has no last cid, or if the current
- * task on src rq uses the cid, it means the source cid does not need
- * to be moved to the destination cpu.
- */
- if (last_mm_cid == -1)
- return -1;
- src_cid = READ_ONCE(src_pcpu_cid->cid);
- if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
- return -1;
+ /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */
+ max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus());
+ WRITE_ONCE(mc->max_cids, max_cids);
+}
- /*
- * If we observe an active task using the mm on this rq, it means we
- * are not the last task to be migrated from this cpu for this mm, so
- * there is no need to move src_cid to the destination cpu.
- */
- guard(rcu)();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- t->last_mm_cid = -1;
- return -1;
- }
+static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+{
+ unsigned int opt_cids;
- return src_cid;
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
+ /* Has to be at least 1 because 0 indicates PCPU mode off */
+ return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
}
-static
-int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid,
- int src_cid)
+static bool mm_update_max_cids(struct mm_struct *mm)
{
- struct task_struct *src_task;
- struct mm_struct *mm = t->mm;
- int lazy_cid;
+ struct mm_mm_cid *mc = &mm->mm_cid;
- if (src_cid == -1)
- return -1;
+ lockdep_assert_held(&mm->mm_cid.lock);
- /*
- * Attempt to clear the source cpu cid to move it to the destination
- * cpu.
- */
- lazy_cid = mm_cid_set_lazy_put(src_cid);
- if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
- return -1;
-
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
+ /* Clear deferred mode switch flag. A change is handled by the caller */
+ mc->update_deferred = false;
+ __mm_update_max_cids(mc);
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, this task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
- }
+ /* Check whether owner mode must be changed */
+ if (!mc->percpu) {
+ /* Enable per CPU mode when the number of users is above max_cids */
+ if (mc->users > mc->max_cids)
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ } else {
+ /* Switch back to per task if user count under threshold */
+ if (mc->users < mc->pcpu_thrs)
+ mc->pcpu_thrs = 0;
}
- /*
- * The src_cid is unused, so it can be unset.
- */
- if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- return -1;
- WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
- return src_cid;
+ /* Mode change required? */
+ if (!!mc->percpu == !!mc->pcpu_thrs)
+ return false;
+ /* When switching back to per TASK mode, set the transition flag */
+ if (!mc->pcpu_thrs)
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+ WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+ return true;
}
-/*
- * Migration to dst cpu. Called with dst_rq lock held.
- * Interrupts are disabled, which keeps the window of cid ownership without the
- * source rq lock held small.
- */
-void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
- struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
- struct mm_struct *mm = t->mm;
- int src_cid, src_cpu;
- bool dst_cid_is_set;
- struct rq *src_rq;
-
- lockdep_assert_rq_held(dst_rq);
+ struct cpumask *mm_allowed;
+ struct mm_mm_cid *mc;
+ unsigned int weight;
- if (!mm)
+ if (!mm || !READ_ONCE(mm->mm_cid.users))
return;
- src_cpu = t->migrate_from_cpu;
- if (src_cpu == -1) {
- t->last_mm_cid = -1;
- return;
- }
/*
- * Move the src cid if the dst cid is unset. This keeps id
- * allocation closest to 0 in cases where few threads migrate around
- * many CPUs.
- *
- * If destination cid or recent cid is already set, we may have
- * to just clear the src cid to ensure compactness in frequent
- * migrations scenarios.
- *
- * It is not useful to clear the src cid when the number of threads is
- * greater or equal to the number of allowed CPUs, because user-space
- * can expect that the number of allowed cids can reach the number of
- * allowed CPUs.
- */
- dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
- dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
- !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
- if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
+ * mm::mm_cid::mm_cpus_allowed is the superset of each thread's
+ * allowed CPUs mask, which means it can only grow.
+ */
+ mc = &mm->mm_cid;
+ guard(raw_spinlock)(&mc->lock);
+ mm_allowed = mm_cpus_allowed(mm);
+ weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
+ if (weight == mc->nr_cpus_allowed)
return;
- src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
- src_rq = cpu_rq(src_cpu);
- src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
- if (src_cid == -1)
+
+ WRITE_ONCE(mc->nr_cpus_allowed, weight);
+ __mm_update_max_cids(mc);
+ if (!mc->percpu)
return;
- src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
- src_cid);
- if (src_cid == -1)
+
+ /* Adjust the threshold to the wider set */
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ /* Switch back to per task mode? */
+ if (mc->users >= mc->pcpu_thrs)
return;
- if (dst_cid_is_set) {
- __mm_cid_put(mm, src_cid);
+
+ /* Don't queue twice */
+ if (mc->update_deferred)
return;
- }
- /* Move src_cid to dst cpu. */
- mm_cid_snapshot_time(dst_rq, mm);
- WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
- WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
+
+ /* Queue the irq work, which schedules the real work */
+ mc->update_deferred = true;
+ irq_work_queue(&mc->irq_work);
}
-static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
- int cpu)
+static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
- struct rq *rq = cpu_rq(cpu);
- struct task_struct *t;
- int cid, lazy_cid;
+ if (cid_on_cpu(t->mm_cid.cid)) {
+ unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid))
- return;
+ t->mm_cid.cid = cid_to_transit_cid(cid);
+ pcp->cid = t->mm_cid.cid;
+ }
+}
- /*
- * Clear the cpu cid if it is set to keep cid allocation compact. If
- * there happens to be other tasks left on the source cpu using this
- * mm, the next task using this mm will reallocate its cid on context
- * switch.
- */
- lazy_cid = mm_cid_set_lazy_put(cid);
- if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
- return;
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+{
+ unsigned int cpu;
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
+ /* Walk the CPUs and fixup all stale CIDs */
+ for_each_possible_cpu(cpu) {
+ struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+ struct rq *rq = cpu_rq(cpu);
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, that task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
- return;
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(rq_lock_irq)(rq);
+ /* Is the CID still owned by the CPU? */
+ if (cid_on_cpu(pcp->cid)) {
+ /*
+ * If rq->curr has @mm, transfer it with the
+ * transition bit set. Otherwise drop it.
+ */
+ if (rq->curr->mm == mm && rq->curr->mm_cid.active)
+ mm_cid_transit_to_task(rq->curr, pcp);
+ else
+ mm_drop_cid_on_cpu(mm, pcp);
+
+ } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) {
+ unsigned int cid = rq->curr->mm_cid.cid;
+
+ /* Ensure it has the transition bit set */
+ if (!cid_in_transit(cid)) {
+ cid = cid_to_transit_cid(cid);
+ rq->curr->mm_cid.cid = cid;
+ pcp->cid = cid;
+ }
+ }
}
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
- /*
- * The cid is unused, so it can be unset.
- * Disable interrupts to keep the window of cid ownership without rq
- * lock small.
- */
- scoped_guard (irqsave) {
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
+static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+ if (cid_on_task(t->mm_cid.cid)) {
+ t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ pcp->cid = t->mm_cid.cid;
}
}
-static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
{
- struct rq *rq = cpu_rq(cpu);
- struct mm_cid *pcpu_cid;
- struct task_struct *curr;
- u64 rq_clock;
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(task_rq_lock)(t);
+ /* If the task is not active it is not in the users count */
+ if (!t->mm_cid.active)
+ return false;
+ if (cid_on_task(t->mm_cid.cid)) {
+ /* If running on the CPU, transfer the CID, otherwise drop it */
+ if (task_rq(t)->curr == t)
+ mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ else
+ mm_unset_cid_on_task(t);
+ }
+ return true;
+}
- /*
- * rq->clock load is racy on 32-bit but one spurious clear once in a
- * while is irrelevant.
- */
- rq_clock = READ_ONCE(rq->clock);
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct task_struct *p, *t;
+ unsigned int users;
/*
- * In order to take care of infrequently scheduled tasks, bump the time
- * snapshot associated with this cid if an active task using the mm is
- * observed on this rq.
+ * This can obviously race with a concurrent affinity change, which
+ * increases the number of allowed CPUs for this mm, but that does
+ * not affect the mode and only changes the CID constraints. A
+ * possible switch back to per task mode happens either in the
+ * deferred handler function or in the next fork()/exit().
+ *
+ * The caller has already transferred its own CID. The newly incoming
+ * task is already accounted for, but not yet visible.
*/
- scoped_guard (rcu) {
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- return;
- }
+ users = mm->mm_cid.users - 2;
+ if (!users)
+ return;
+
+ guard(rcu)();
+ for_other_threads(current, t) {
+ if (mm_cid_fixup_task_to_cpu(t, mm))
+ users--;
}
- if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+ if (!users)
return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+
+ /* Happens only for CLONE_VM processes. */
+ for_each_process_thread(p, t) {
+ if (t == current || t->mm != mm)
+ continue;
+ if (mm_cid_fixup_task_to_cpu(t, mm)) {
+ if (--users == 0)
+ return;
+ }
+ }
}
-static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
- int weight)
+static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
- struct mm_cid *pcpu_cid;
- int cid;
-
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid) || cid < weight)
- return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+ t->mm_cid.active = 1;
+ mm->mm_cid.users++;
+ return mm_update_max_cids(mm);
}
-static void task_mm_cid_work(struct callback_head *work)
+void sched_mm_cid_fork(struct task_struct *t)
{
- unsigned long now = jiffies, old_scan, next_scan;
- struct task_struct *t = current;
- struct cpumask *cidmask;
- struct mm_struct *mm;
- int weight, cpu;
+ struct mm_struct *mm = t->mm;
+ bool percpu;
- WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
+ WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
- work->next = work; /* Prevent double-add */
- if (t->flags & PF_EXITING)
- return;
- mm = t->mm;
- if (!mm)
- return;
- old_scan = READ_ONCE(mm->mm_cid_next_scan);
- next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
- if (!old_scan) {
- unsigned long res;
-
- res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
- if (res != old_scan)
- old_scan = res;
+ guard(mutex)(&mm->mm_cid.mutex);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+ /* First user ? */
+ if (!mm->mm_cid.users) {
+ sched_mm_cid_add_user(t, mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ /* Required for execve() */
+ pcp->cid = t->mm_cid.cid;
+ return;
+ }
+
+ if (!sched_mm_cid_add_user(t, mm)) {
+ if (!mm->mm_cid.percpu)
+ t->mm_cid.cid = mm_get_cid(mm);
+ return;
+ }
+
+ /* Handle the mode change and transfer current's CID */
+ percpu = !!mm->mm_cid.percpu;
+ if (!percpu)
+ mm_cid_transit_to_task(current, pcp);
else
- old_scan = next_scan;
+ mm_cid_transfer_to_cpu(current, pcp);
}
- if (time_before(now, old_scan))
- return;
- if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
- return;
- cidmask = mm_cidmask(mm);
- /* Clear cids that were not recently used. */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_old(mm, cpu);
- weight = cpumask_weight(cidmask);
- /*
- * Clear cids that are greater or equal to the cidmask weight to
- * recompact it.
- */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_weight(mm, cpu, weight);
-}
-void init_sched_mm_cid(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- int mm_users = 0;
-
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1)
- mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ if (percpu) {
+ mm_cid_fixup_tasks_to_cpus();
+ } else {
+ mm_cid_fixup_cpus_to_tasks(mm);
+ t->mm_cid.cid = mm_get_cid(mm);
}
- t->cid_work.next = &t->cid_work; /* Protect against double add */
- init_task_work(&t->cid_work, task_mm_cid_work);
}
-void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+static bool sched_mm_cid_remove_user(struct task_struct *t)
{
- struct callback_head *work = &curr->cid_work;
- unsigned long now = jiffies;
-
- if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
- work->next != work)
- return;
- if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
- return;
-
- /* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME);
+ t->mm_cid.active = 0;
+ scoped_guard(preempt) {
+ /* Clear the transition bit */
+ t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+ mm_unset_cid_on_task(t);
+ }
+ t->mm->mm_cid.users--;
+ return mm_update_max_cids(t->mm);
}
-void sched_mm_cid_exit_signals(struct task_struct *t)
+static bool __sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
- return;
-
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
+ if (!sched_mm_cid_remove_user(t))
+ return false;
/*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
+ * Unlike fork(), this only deals with a switch back to per task
+ * mode, either because the above decreased users or because an
+ * affinity change increased the number of allowed CPUs and the
+ * deferred fixup did not run yet.
*/
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return false;
+ /*
+ * A failed fork(2) cleanup never gets here, so @current must have
+ * the same MM as @t. That's true for exit() and the failed
+ * pthread_create() cleanup case.
+ */
+ if (WARN_ON_ONCE(current->mm != mm))
+ return false;
+ return true;
}
-void sched_mm_cid_before_execve(struct task_struct *t)
+/*
+ * When a task exits, the MM CID held by the task is no longer required as
+ * the task cannot return to user space.
+ */
+void sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
+ if (!mm || !t->mm_cid.active)
return;
+ /*
+ * Ensure that only one instance is doing MM CID operations within
+ * an MM. The common case is uncontended. The rare fixup case adds
+ * some overhead.
+ */
+ scoped_guard(mutex, &mm->mm_cid.mutex) {
+ /* mm_cid::mutex is sufficient to protect mm_cid::users */
+ if (likely(mm->mm_cid.users > 1)) {
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (!__sched_mm_cid_exit(t))
+ return;
+ /* Mode change required. Transfer current's CID */
+ mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+ return;
+ }
+ /* Last user */
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Required across execve() */
+ if (t == current)
+ mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ /* Ignore mode change. There is nothing to do. */
+ sched_mm_cid_remove_user(t);
+ }
+ }
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
/*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
+ * As this is the last user (execve(), process exit or failed
+ * fork(2)) there is no concurrency anymore.
+ *
+ * Synchronize any possibly pending work to ensure that there are no
+ * dangling references left. mm->mm_cid.users is zero so nothing
+ * can queue this work anymore.
*/
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ irq_work_sync(&mm->mm_cid.irq_work);
+ cancel_work_sync(&mm->mm_cid.work);
+}
+
+/* Deactivate MM CID allocation across execve() */
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ sched_mm_cid_exit(t);
}
+/* Reactivate MM CID after successful execve() */
void sched_mm_cid_after_execve(struct task_struct *t)
{
- struct mm_struct *mm = t->mm;
- struct rq *rq;
+ sched_mm_cid_fork(t);
+}
- if (!mm)
+static void mm_cid_work_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
+
+ guard(mutex)(&mm->mm_cid.mutex);
+ /* Did the last user task exit already? */
+ if (!mm->mm_cid.users)
return;
- preempt_disable();
- rq = this_rq();
- scoped_guard (rq_lock_irqsave, rq) {
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Has fork() or exit() handled it already? */
+ if (!mm->mm_cid.update_deferred)
+ return;
+ /* This clears mm_cid::update_deferred */
+ if (!mm_update_max_cids(mm))
+ return;
+ /* Affinity changes can only switch back to task mode */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return;
}
+ mm_cid_fixup_cpus_to_tasks(mm);
}
-void sched_mm_cid_fork(struct task_struct *t)
+static void mm_cid_irq_work(struct irq_work *work)
{
- WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
- t->mm_cid_active = 1;
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
+
+ /*
+ * Needs to be unconditional because mm_cid::lock cannot be held
+ * when scheduling work as mm_update_cpus_allowed() nests inside
+ * rq::lock and schedule_work() might end up in wakeup...
+ */
+ schedule_work(&mm->mm_cid.work);
}
-#endif /* CONFIG_SCHED_MM_CID */
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx)
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
+ mm->mm_cid.max_cids = 0;
+ mm->mm_cid.percpu = 0;
+ mm->mm_cid.transit = 0;
+ mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ mm->mm_cid.users = 0;
+ mm->mm_cid.pcpu_thrs = 0;
+ mm->mm_cid.update_deferred = 0;
+ raw_spin_lock_init(&mm->mm_cid.lock);
+ mutex_init(&mm->mm_cid.mutex);
+ mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+ INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+ bitmap_zero(mm_cidmask(mm), num_possible_cpus());
+}
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+#endif /* !CONFIG_SCHED_MM_CID */
+
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
+ struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
+ /*
+ * Only flags with matching DEQUEUE/ENQUEUE semantics may be used,
+ * since the same flags are passed to both dequeue and enqueue.
+ */
+ WARN_ON_ONCE(flags & 0xFFFF0000);
+
lockdep_assert_rq_held(rq);
- *ctx = (struct sched_enq_and_set_ctx){
+ if (!(flags & DEQUEUE_NOCLOCK)) {
+ update_rq_clock(rq);
+ flags |= DEQUEUE_NOCLOCK;
+ }
+
+ if (flags & DEQUEUE_CLASS) {
+ if (p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
+ }
+
+ *ctx = (struct sched_change_ctx){
.p = p,
- .queue_flags = queue_flags,
+ .flags = flags,
.queued = task_on_rq_queued(p),
- .running = task_current(rq, p),
+ .running = task_current_donor(rq, p),
};
- update_rq_clock(rq);
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
if (ctx->queued)
- dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ dequeue_task(rq, p, flags);
if (ctx->running)
put_prev_task(rq, p);
+
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ p->sched_class->switched_from(rq, p);
+
+ return ctx;
}
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
{
- struct rq *rq = task_rq(ctx->p);
+ struct task_struct *p = ctx->p;
+ struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+
if (ctx->queued)
- enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags);
if (ctx->running)
- set_next_task(rq, ctx->p);
+ set_next_task(rq, p);
+
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
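
For reference, the following sketch (an editorial illustration, not part of this patch) shows how callers are expected to use the sched_change guard pair defined above, modelled on the rt_mutex_setprio() and sched_setnuma() conversions earlier in the diff; the attribute setter and its field are hypothetical:

/*
 * Editorial illustration only -- a hypothetical attribute setter showing
 * the sched_change_begin()/sched_change_end() calling convention via
 * scoped_guard(). "example_attr" is a made-up field.
 */
static void set_task_example_attr(struct task_struct *p, int attr)
{
	/* Acquires p->pi_lock and rq->lock, drops both at end of scope */
	guard(task_rq_lock)(p);

	/*
	 * sched_change_begin() dequeues and put_prev_task()s @p as needed;
	 * sched_change_end() re-enqueues, set_next_task()s and invokes
	 * prio_changed() or switched_to(). DEQUEUE_CLASS must be added when
	 * p->sched_class changes inside the scope.
	 */
	scoped_guard (sched_change, p, DEQUEUE_SAVE) {
		p->example_attr = attr;

		/* Additional enqueue-side flags can be set via scope->flags */
		if (attr > 0)
			scope->flags |= ENQUEUE_HEAD;
	}
}

The guard centralizes the dequeue/put_prev_task() and enqueue/set_next_task() bracketing that callers of the removed check_class_changing()/check_class_changed() helpers used to open-code.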