summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c615
1 files changed, 437 insertions, 178 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25970dbbb279..769d7b7990df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a,
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->min_vruntime);
+ return (s64)(se->vruntime - cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* Which we track using:
*
- * v0 := cfs_rq->min_vruntime
+ * v0 := cfs_rq->zero_vruntime
* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
* \Sum w_i := cfs_rq->avg_load
*
- * Since min_vruntime is a monotonic increasing variable that closely tracks
- * the per-task service, these deltas: (v_i - v), will be in the order of the
- * maximal (virtual) lag induced in the system due to quantisation.
+ * Since zero_vruntime closely tracks the per-task service, these
+ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
*
@@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
avg = div_s64(avg, load);
}
- return cfs_rq->min_vruntime + avg;
+ return cfs_rq->zero_vruntime + avg;
}
/*
@@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+ return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
- u64 min_vruntime = cfs_rq->min_vruntime;
- /*
- * open coded max_vruntime() to allow updating avg_vruntime
- */
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta > 0) {
- avg_vruntime_update(cfs_rq, delta);
- min_vruntime = vruntime;
- }
- return min_vruntime;
-}
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
- struct sched_entity *se = __pick_root_entity(cfs_rq);
- struct sched_entity *curr = cfs_rq->curr;
- u64 vruntime = cfs_rq->min_vruntime;
-
- if (curr) {
- if (curr->on_rq)
- vruntime = curr->vruntime;
- else
- curr = NULL;
- }
-
- if (se) {
- if (!curr)
- vruntime = se->min_vruntime;
- else
- vruntime = min_vruntime(vruntime, se->min_vruntime);
- }
+ avg_vruntime_update(cfs_rq, delta);
- /* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+ cfs_rq->zero_vruntime = vruntime;
}
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
@@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
avg_vruntime_sub(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -955,6 +929,16 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
if (cfs_rq->nr_queued == 1)
return curr && curr->on_rq ? curr : se;
+ /*
+ * Picking the ->next buddy will affect latency but not fairness.
+ */
+ if (sched_feat(PICK_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ WARN_ON_ONCE(cfs_rq->next->sched_delayed);
+ return cfs_rq->next;
+ }
+
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
@@ -1193,6 +1177,8 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
return delta_exec;
}
+static void set_next_buddy(struct sched_entity *se);
+
/*
* Used by other classes to account runtime.
*/
@@ -1226,7 +1212,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
- update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
/*
@@ -1239,8 +1224,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
* against fair_server such that it can account for this time
* and possibly avoid running this period.
*/
- if (dl_server_active(&rq->fair_server))
- dl_server_update(&rq->fair_server, delta_exec);
+ dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -3808,15 +3792,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (!curr)
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_queued++;
-
- /*
- * The entity's vruntime has been adjusted, so let's check
- * whether the rq-wide min_vruntime needs updated too. Since
- * the calculations above require stable min_vruntime rather
- * than up-to-date one, we do the update at the end of the
- * reweight process.
- */
- update_min_vruntime(cfs_rq);
}
}
@@ -5429,15 +5404,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_group(se);
- /*
- * Now advance min_vruntime if @se was the entity holding it back,
- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- * put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- i.e. we'll be penalized.
- */
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
- update_min_vruntime(cfs_rq);
-
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
@@ -5512,16 +5478,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
struct sched_entity *se;
- /*
- * Picking the ->next buddy will affect latency but not fairness.
- */
- if (sched_feat(PICK_BUDDY) &&
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
- /* ->next will never be delayed */
- WARN_ON_ONCE(cfs_rq->next->sched_delayed);
- return cfs_rq->next;
- }
-
se = pick_eevdf(cfs_rq);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
@@ -6024,20 +5980,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
/*
- * It's possible we are called with !runtime_remaining due to things
- * like user changed quota setting(see tg_set_cfs_bandwidth()) or async
- * unthrottled us with a positive runtime_remaining but other still
- * running entities consumed those runtime before we reached here.
+ * It's possible we are called with runtime_remaining < 0 due to things
+ * like async unthrottled us with a positive runtime_remaining but other
+ * still running entities consumed those runtime before we reached here.
*
- * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
- * because any enqueue in tg_unthrottle_up() will immediately trigger a
- * throttle, which is not supposed to happen on unthrottle path.
+ * We can't unthrottle this cfs_rq without any runtime remaining because
+ * any enqueue in tg_unthrottle_up() will immediately trigger a throttle,
+ * which is not supposed to happen on unthrottle path.
*/
if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
return;
- se = cfs_rq->tg->se[cpu_of(rq)];
-
cfs_rq->throttled = 0;
update_rq_clock(rq);
@@ -7006,12 +6959,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
h_nr_idle = 1;
}
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
- /* Account for idle runtime */
- if (!rq->nr_running)
- dl_server_update_idle_time(rq, rq->curr);
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
- }
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
@@ -7038,8 +6987,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
hrtick_update(rq);
}
-static void set_next_buddy(struct sched_entity *se);
-
/*
* Basically dequeue_task_fair(), except it can deal with dequeue_entity()
* failing half-way through and resume the dequeue later.
@@ -8715,15 +8662,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context
set_task_max_allowed_capacity(p);
}
-static int
-balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-{
- if (sched_fair_runnable(rq))
- return 1;
-
- return sched_balance_newidle(rq, rf) != 0;
-}
-
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
@@ -8735,16 +8673,81 @@ static void set_next_buddy(struct sched_entity *se)
}
}
+enum preempt_wakeup_action {
+ PREEMPT_WAKEUP_NONE, /* No preemption. */
+ PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */
+ PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */
+ PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */
+};
+
+static inline bool
+set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ /*
+ * Keep existing buddy if the deadline is sooner than pse.
+ * The older buddy may be cache cold and completely unrelated
+ * to the current wakeup but that is unpredictable where as
+ * obeying the deadline is more in line with EEVDF objectives.
+ */
+ if (cfs_rq->next && entity_before(cfs_rq->next, pse))
+ return false;
+
+ set_next_buddy(pse);
+ return true;
+}
+
+/*
+ * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
+ * strictly enforced because the hint is either misunderstood or
+ * multiple tasks must be woken up.
+ */
+static inline enum preempt_wakeup_action
+preempt_sync(struct rq *rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ u64 threshold, delta;
+
+ /*
+ * WF_SYNC without WF_TTWU is not expected so warn if it happens even
+ * though it is likely harmless.
+ */
+ WARN_ON_ONCE(!(wake_flags & WF_TTWU));
+
+ threshold = sysctl_sched_migration_cost;
+ delta = rq_clock_task(rq) - se->exec_start;
+ if ((s64)delta < 0)
+ delta = 0;
+
+ /*
+ * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they
+ * could run on other CPUs. Reduce the threshold before preemption is
+ * allowed to an arbitrary lower value as it is more likely (but not
+ * guaranteed) the waker requires the wakee to finish.
+ */
+ if (wake_flags & WF_RQ_SELECTED)
+ threshold >>= 2;
+
+ /*
+ * As WF_SYNC is not strictly obeyed, allow some runtime for batch
+ * wakeups to be issued.
+ */
+ if (entity_before(pse, se) && delta >= threshold)
+ return PREEMPT_WAKEUP_RESCHED;
+
+ return PREEMPT_WAKEUP_NONE;
+}
+
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
+ enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
struct sched_entity *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
- bool do_preempt_short = false;
if (unlikely(se == pse))
return;
@@ -8758,10 +8761,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (task_is_throttled(p))
return;
- if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
- set_next_buddy(pse);
- }
-
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
@@ -8793,7 +8792,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* When non-idle entity preempt an idle entity,
* don't give idle entity slice protection.
*/
- do_preempt_short = true;
+ preempt_action = PREEMPT_WAKEUP_SHORT;
goto preempt;
}
@@ -8812,27 +8811,74 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* If @p has a shorter slice than current and @p is eligible, override
* current's slice protection in order to allow preemption.
*/
- do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
+ if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
+ preempt_action = PREEMPT_WAKEUP_SHORT;
+ goto pick;
+ }
/*
+ * Ignore wakee preemption on WF_FORK as it is less likely that
+ * there is shared data as exec often follow fork. Do not
+ * preempt for tasks that are sched_delayed as it would violate
+ * EEVDF to forcibly queue an ineligible task.
+ */
+ if ((wake_flags & WF_FORK) || pse->sched_delayed)
+ return;
+
+ /*
+ * If @p potentially is completing work required by current then
+ * consider preemption.
+ *
+ * Reschedule if waker is no longer eligible. */
+ if (in_task() && !entity_eligible(cfs_rq, se)) {
+ preempt_action = PREEMPT_WAKEUP_RESCHED;
+ goto preempt;
+ }
+
+ /* Prefer picking wakee soon if appropriate. */
+ if (sched_feat(NEXT_BUDDY) &&
+ set_preempt_buddy(cfs_rq, wake_flags, pse, se)) {
+
+ /*
+ * Decide whether to obey WF_SYNC hint for a new buddy. Old
+ * buddies are ignored as they may not be relevant to the
+ * waker and less likely to be cache hot.
+ */
+ if (wake_flags & WF_SYNC)
+ preempt_action = preempt_sync(rq, wake_flags, pse, se);
+ }
+
+ switch (preempt_action) {
+ case PREEMPT_WAKEUP_NONE:
+ return;
+ case PREEMPT_WAKEUP_RESCHED:
+ goto preempt;
+ case PREEMPT_WAKEUP_SHORT:
+ fallthrough;
+ case PREEMPT_WAKEUP_PICK:
+ break;
+ }
+
+pick:
+ /*
* If @p has become the most eligible task, force preemption.
*/
- if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
+ if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
goto preempt;
- if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
+ if (sched_feat(RUN_TO_PARITY))
update_protect_slice(cfs_rq, se);
return;
preempt:
- if (do_preempt_short)
+ if (preempt_action == PREEMPT_WAKEUP_SHORT)
cancel_protect_slice(se);
resched_curr_lazy(rq);
}
-static struct task_struct *pick_task_fair(struct rq *rq)
+static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
@@ -8876,7 +8922,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
- p = pick_task_fair(rq);
+ p = pick_task_fair(rq, rf);
if (!p)
goto idle;
se = &p->se;
@@ -8955,14 +9001,10 @@ idle:
return NULL;
}
-static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+static struct task_struct *
+fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
{
- return pick_next_task_fair(rq, prev, NULL);
-}
-
-static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
-{
- return pick_task_fair(dl_se->rq);
+ return pick_task_fair(dl_se->rq, rf);
}
void fair_server_init(struct rq *rq)
@@ -8993,7 +9035,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
*/
static void yield_task_fair(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr = rq->donor;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se;
@@ -9017,7 +9059,18 @@ static void yield_task_fair(struct rq *rq)
*/
rq_clock_skip_update(rq);
- se->deadline += calc_delta_fair(se->slice, se);
+ /*
+ * Forfeit the remaining vruntime, only if the entity is eligible. This
+ * condition is necessary because in core scheduling we prefer to run
+ * ineligible tasks rather than force idling. If this happens we may
+ * end up in a loop where the core scheduler picks the yielding task,
+ * which yields immediately again; without the condition the vruntime
+ * ends up quickly running away.
+ */
+ if (entity_eligible(cfs_rq, se)) {
+ se->vruntime = se->deadline;
+ se->deadline += calc_delta_fair(se->slice, se);
+ }
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -10681,7 +10734,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (sd->flags & SD_ASYM_CPUCAPACITY)
sgs->group_misfit_task_load = 1;
- for_each_cpu(i, sched_group_span(group)) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -11733,6 +11786,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
}
/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize rebalance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
+
+/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
@@ -11757,6 +11825,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
+ bool need_unlock = false;
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
@@ -11768,6 +11837,14 @@ redo:
goto out_balanced;
}
+ if (!need_unlock && (sd->flags & SD_SERIALIZE)) {
+ int zero = 0;
+ if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1))
+ goto out_balanced;
+
+ need_unlock = true;
+ }
+
group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
@@ -12008,6 +12085,9 @@ out_one_pinned:
sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
out:
+ if (need_unlock)
+ atomic_set_release(&sched_balance_running, 0);
+
return ld_moved;
}
@@ -12133,21 +12213,6 @@ out_unlock:
}
/*
- * This flag serializes load-balancing passes over large domains
- * (above the NODE topology level) - only one load-balancing instance
- * may run at a time, to reduce overhead on very large systems with
- * lots of CPUs and large NUMA distances.
- *
- * - Note that load-balancing passes triggered while another one
- * is executing are skipped and not re-tried.
- *
- * - Also note that this does not serialize rebalance_domains()
- * execution, as non-SD_SERIALIZE domains will still be
- * load-balanced in parallel.
- */
-static atomic_t sched_balance_running = ATOMIC_INIT(0);
-
-/*
* Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
@@ -12156,30 +12221,43 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+ sd->newidle_call++;
+ sd->newidle_success += success;
+
+ if (sd->newidle_call >= 1024) {
+ sd->newidle_ratio = sd->newidle_success;
+ sd->newidle_call /= 2;
+ sd->newidle_success /= 2;
+ }
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
{
+ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
+ unsigned long now = jiffies;
+
+ if (cost)
+ update_newidle_stats(sd, success);
+
if (cost > sd->max_newidle_lb_cost) {
/*
* Track max cost of a domain to make sure to not delay the
* next wakeup on the CPU.
- *
- * sched_balance_newidle() bumps the cost whenever newidle
- * balance fails, and we don't want things to grow out of
- * control. Use the sysctl_sched_migration_cost as the upper
- * limit, plus a litle extra to avoid off by ones.
*/
- sd->max_newidle_lb_cost =
- min(cost, sysctl_sched_migration_cost + 200);
- sd->last_decay_max_lb_cost = jiffies;
- } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
+ sd->max_newidle_lb_cost = cost;
+ sd->last_decay_max_lb_cost = now;
+
+ } else if (time_after(now, next_decay)) {
/*
* Decay the newidle max times by ~1% per second to ensure that
* it is not outdated and the current max cost is actually
* shorter.
*/
sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
- sd->last_decay_max_lb_cost = jiffies;
-
+ sd->last_decay_max_lb_cost = now;
return true;
}
@@ -12202,7 +12280,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize, need_decay = 0;
+ int need_decay = 0;
u64 max_cost = 0;
rcu_read_lock();
@@ -12211,7 +12289,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* Decay the newidle max times here because this is a regular
* visit to all the domains.
*/
- need_decay = update_newidle_cost(sd, 0);
+ need_decay = update_newidle_cost(sd, 0, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -12226,13 +12304,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
}
interval = get_sd_balance_interval(sd, busy);
-
- need_serialize = sd->flags & SD_SERIALIZE;
- if (need_serialize) {
- if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
- goto out;
- }
-
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
@@ -12246,9 +12317,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
- if (need_serialize)
- atomic_set_release(&sched_balance_running, 0);
-out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
@@ -12827,18 +12895,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ goto out;
+ }
if (!get_rd_overloaded(this_rq->rd) ||
- (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
+ this_rq->avg_idle < sd->max_newidle_lb_cost) {
- if (sd)
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
-
goto out;
}
rcu_read_unlock();
+ rq_modified_clear(this_rq);
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
@@ -12854,6 +12925,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
break;
if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned int weight = 1;
+
+ if (sched_feat(NI_RANDOM)) {
+ /*
+ * Throw a 1k sided dice; and only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
+ }
+ weight = (1024 + weight/2) / weight;
+ }
pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
@@ -12865,13 +12952,10 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t0 = t1;
/*
- * Failing newidle means it is not effective;
- * bump the cost so we end up doing less of it.
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
*/
- if (!pulled_task)
- domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
-
- update_newidle_cost(sd, domain_cost);
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
}
/*
@@ -12896,8 +12980,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
+ /* If a higher prio class was modified, restart the pick */
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -13015,7 +13099,170 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
}
/*
- * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ * Consider any infeasible weight scenario. Take for instance two tasks,
+ * each bound to their respective sibling, one with weight 1 and one with
+ * weight 2. Then the lower weight task will run ahead of the higher weight
+ * task without bound.
+ *
+ * This utterly destroys the concept of a shared time base.
+ *
+ * Remember; all this is about a proportionally fair scheduling, where each
+ * tasks receives:
+ *
+ * w_i
+ * dt_i = ---------- dt (1)
+ * \Sum_j w_j
+ *
+ * which we do by tracking a virtual time, s_i:
+ *
+ * 1
+ * s_i = --- d[t]_i (2)
+ * w_i
+ *
+ * Where d[t] is a delta of discrete time, while dt is an infinitesimal.
+ * The immediate corollary is that the ideal schedule S, where (2) to use
+ * an infinitesimal delta, is:
+ *
+ * 1
+ * S = ---------- dt (3)
+ * \Sum_i w_i
+ *
+ * From which we can define the lag, or deviation from the ideal, as:
+ *
+ * lag(i) = S - s_i (4)
+ *
+ * And since the one and only purpose is to approximate S, we get that:
+ *
+ * \Sum_i w_i lag(i) := 0 (5)
+ *
+ * If this were not so, we no longer converge to S, and we can no longer
+ * claim our scheduler has any of the properties we derive from S. This is
+ * exactly what you did above, you broke it!
+ *
+ *
+ * Let's continue for a while though; to see if there is anything useful to
+ * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i:
+ *
+ * \Sum_i w_i s_i
+ * S = -------------- (6)
+ * \Sum_i w_i
+ *
+ * Which gives us a way to compute S, given our s_i. Now, if you've read
+ * our code, you know that we do not in fact do this, the reason for this
+ * is two-fold. Firstly, computing S in that way requires a 64bit division
+ * for every time we'd use it (see 12), and secondly, this only describes
+ * the steady-state, it doesn't handle dynamics.
+ *
+ * Anyway, in (6): s_i -> x + (s_i - x), to get:
+ *
+ * \Sum_i w_i (s_i - x)
+ * S - x = -------------------- (7)
+ * \Sum_i w_i
+ *
+ * Which shows that S and s_i transform alike (which makes perfect sense
+ * given that S is basically the (weighted) average of s_i).
+ *
+ * So the thing to remember is that the above is strictly UP. It is
+ * possible to generalize to multiple runqueues -- however it gets really
+ * yuck when you have to add affinity support, as illustrated by our very
+ * first counter-example.
+ *
+ * Luckily I think we can avoid needing a full multi-queue variant for
+ * core-scheduling (or load-balancing). The crucial observation is that we
+ * only actually need this comparison in the presence of forced-idle; only
+ * then do we need to tell if the stalled rq has higher priority over the
+ * other.
+ *
+ * [XXX assumes SMT2; better consider the more general case, I suspect
+ * it'll work out because our comparison is always between 2 rqs and the
+ * answer is only interesting if one of them is forced-idle]
+ *
+ * And (under assumption of SMT2) when there is forced-idle, there is only
+ * a single queue, so everything works like normal.
+ *
+ * Let, for our runqueue 'k':
+ *
+ * T_k = \Sum_i w_i s_i
+ * W_k = \Sum_i w_i ; for all i of k (8)
+ *
+ * Then we can write (6) like:
+ *
+ * T_k
+ * S_k = --- (9)
+ * W_k
+ *
+ * From which immediately follows that:
+ *
+ * T_k + T_l
+ * S_k+l = --------- (10)
+ * W_k + W_l
+ *
+ * On which we can define a combined lag:
+ *
+ * lag_k+l(i) := S_k+l - s_i (11)
+ *
+ * And that gives us the tools to compare tasks across a combined runqueue.
+ *
+ *
+ * Combined this gives the following:
+ *
+ * a) when a runqueue enters force-idle, sync it against it's sibling rq(s)
+ * using (7); this only requires storing single 'time'-stamps.
+ *
+ * b) when comparing tasks between 2 runqueues of which one is forced-idle,
+ * compare the combined lag, per (11).
+ *
+ * Now, of course cgroups (I so hate them) make this more interesting in
+ * that a) seems to suggest we need to iterate all cgroup on a CPU at such
+ * boundaries, but I think we can avoid that. The force-idle is for the
+ * whole CPU, all it's rqs. So we can mark it in the root and lazily
+ * propagate downward on demand.
+ */
+
+/*
+ * So this sync is basically a relative reset of S to 0.
+ *
+ * So with 2 queues, when one goes idle, we drop them both to 0 and one
+ * then increases due to not being idle, and the idle one builds up lag to
+ * get re-elected. So far so simple, right?
+ *
+ * When there's 3, we can have the situation where 2 run and one is idle,
+ * we sync to 0 and let the idle one build up lag to get re-election. Now
+ * suppose another one also drops idle. At this point dropping all to 0
+ * again would destroy the built-up lag from the queue that was already
+ * idle, not good.
+ *
+ * So instead of syncing everything, we can:
+ *
+ * less := !((s64)(s_a - s_b) <= 0)
+ *
+ * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b
+ * == v_a - (v_b - S_a + S_b)
+ *
+ * IOW, we can recast the (lag) comparison to a one-sided difference.
+ * So if then, instead of syncing the whole queue, sync the idle queue
+ * against the active queue with S_a + S_b at the point where we sync.
+ *
+ * (XXX consider the implication of living in a cyclic group: N / 2^n N)
+ *
+ * This gives us means of syncing single queues against the active queue,
+ * and for already idle queues to preserve their build-up lag.
+ *
+ * Of course, then we get the situation where there's 2 active and one
+ * going idle, who do we pick to sync against? Theory would have us sync
+ * against the combined S, but as we've already demonstrated, there is no
+ * such thing in infeasible weight scenarios.
+ *
+ * One thing I've considered; and this is where that core_active rudiment
+ * came from, is having active queues sync up between themselves after
+ * every tick. This limits the observed divergence due to the work
+ * conservancy.
+ *
+ * On top of that, we can improve upon things by employing (10) here.
+ */
+
+/*
+ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
*/
static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
bool forceidle)
@@ -13029,7 +13276,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
cfs_rq->forceidle_seq = fi_seq;
}
- cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
}
}
@@ -13082,11 +13329,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
- * min_vruntime_fi, which would have been updated in prior calls
+ * zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+ (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13148,11 +13395,14 @@ static void task_fork_fair(struct task_struct *p)
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (rq->cfs.nr_queued == 1)
return;
@@ -13164,8 +13414,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
+ } else {
wakeup_preempt(rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13249,6 +13500,12 @@ static void attach_task_cfs_rq(struct task_struct *p)
attach_entity_cfs_rq(se);
}
+static void switching_from_fair(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+}
+
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -13322,7 +13579,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+ cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
raw_spin_lock_init(&cfs_rq->removed.lock);
}
@@ -13623,6 +13880,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
*/
DEFINE_SCHED_CLASS(fair) = {
+ .queue_mask = 2,
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
@@ -13631,11 +13890,10 @@ DEFINE_SCHED_CLASS(fair) = {
.wakeup_preempt = check_preempt_wakeup_fair,
.pick_task = pick_task_fair,
- .pick_next_task = __pick_next_task_fair,
+ .pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
- .balance = balance_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -13650,6 +13908,7 @@ DEFINE_SCHED_CLASS(fair) = {
.reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
+ .switching_from = switching_from_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,