summaryrefslogtreecommitdiff
path: root/kernel/sched/ext_idle.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/ext_idle.c')
-rw-r--r--kernel/sched/ext_idle.c307
1 files changed, 226 insertions, 81 deletions
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index cb343ca889e0..66da03cc0b33 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -47,6 +47,13 @@ static struct scx_idle_cpus scx_idle_global_masks;
static struct scx_idle_cpus **scx_idle_node_masks;
/*
+ * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
+ */
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
+
+/*
* Return the idle masks associated to a target @node.
*
* NUMA_NO_NODE identifies the global idle cpumask.
@@ -392,6 +399,14 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
}
/*
+ * Return true if @p can run on all possible CPUs, false otherwise.
+ */
+static inline bool task_affinity_all(const struct task_struct *p)
+{
+ return p->nr_cpus_allowed >= num_possible_cpus();
+}
+
+/*
* Built-in CPU idle selection policy:
*
* 1. Prioritize full-idle cores:
@@ -403,13 +418,15 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* branch prediction optimizations.
*
* 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
+ * - if the above conditions aren't met, pick a CPU that shares the same
+ * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
+ * cache locality.
*
* 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
+ * - choose a CPU from the same NUMA node, if the node cpumask is a
+ * subset of @cpus_allowed, to reduce memory access latency.
*
- * 5. Pick any idle CPU usable by the task.
+ * 5. Pick any idle CPU within the @cpus_allowed domain.
*
* Step 3 and 4 are performed only if the system has, respectively,
* multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
@@ -424,35 +441,77 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
*/
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
+ const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
+ const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
int node = scx_cpu_node_if_enabled(prev_cpu);
s32 cpu;
+ preempt_disable();
+
+ /*
+ * Determine the subset of CPUs usable by @p within @cpus_allowed.
+ */
+ if (allowed != p->cpus_ptr) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
+
+ if (task_affinity_all(p)) {
+ allowed = cpus_allowed;
+ } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
+ allowed = local_cpus;
+ } else {
+ cpu = -EBUSY;
+ goto out_enable;
+ }
+
+ /*
+ * If @prev_cpu is not in the allowed CPUs, skip topology
+ * optimizations and try to pick any idle CPU usable by the
+ * task.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize
+ * the current node, as it may optimize some waker->wakee
+ * workloads.
+ */
+ if (!cpumask_test_cpu(prev_cpu, allowed)) {
+ node = scx_cpu_node_if_enabled(smp_processor_id());
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
+ goto out_enable;
+ }
+ }
+
/*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
/*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
+ * Determine the subset of CPUs that the task can use in its
+ * current LLC and node.
*
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
+ * If the task can run on all CPUs, use the node and LLC cpumasks
+ * directly.
*/
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
+ const struct cpumask *cpus = numa_span(prev_cpu);
+
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ numa_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ numa_cpus = local_cpus;
+ }
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
+ const struct cpumask *cpus = llc_span(prev_cpu);
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ llc_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ llc_cpus = local_cpus;
}
/*
@@ -490,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
!cpumask_empty(idle_cpumask(waker_node)->cpu)) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (cpumask_test_cpu(cpu, allowed))
goto out_unlock;
}
}
@@ -535,7 +594,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* begin in prev_cpu's node and proceed to other nodes in
* order of increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto out_unlock;
@@ -583,10 +642,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* in prev_cpu's node and proceed to other nodes in order of
* increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
out_unlock:
rcu_read_unlock();
+out_enable:
+ preempt_enable();
return cpu;
}
@@ -596,7 +657,7 @@ out_unlock:
*/
void scx_idle_init_masks(void)
{
- int node;
+ int i;
/* Allocate global idle cpumasks */
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
@@ -607,13 +668,23 @@ void scx_idle_init_masks(void)
sizeof(*scx_idle_node_masks), GFP_KERNEL);
BUG_ON(!scx_idle_node_masks);
- for_each_node(node) {
- scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
- GFP_KERNEL, node);
- BUG_ON(!scx_idle_node_masks[node]);
+ for_each_node(i) {
+ scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, i);
+ BUG_ON(!scx_idle_node_masks[i]);
+
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
+ }
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ /* Allocate local per-cpu idle cpumasks */
+ for_each_possible_cpu(i) {
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
}
}
@@ -662,21 +733,12 @@ static void update_builtin_idle(int cpu, bool idle)
*/
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
+ struct scx_sched *sch = scx_root;
int cpu = cpu_of(rq);
lockdep_assert_rq_held(rq);
/*
- * Trigger ops.update_idle() only when transitioning from a task to
- * the idle thread and vice versa.
- *
- * Idle transitions are indicated by do_notify being set to true,
- * managed by put_prev_task_idle()/set_next_task_idle().
- */
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-
- /*
* Update the idle masks:
* - for real idle transitions (do_notify == true)
* - for idle-to-idle transitions (indicated by the previous task
@@ -693,6 +755,21 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
if (static_branch_likely(&scx_builtin_idle_enabled))
if (do_notify || is_idle_task(rq->curr))
update_builtin_idle(cpu, idle);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ *
+ * This must come after builtin idle update so that BPF schedulers can
+ * create interlocking between ops.update_idle() and ops.enqueue() -
+ * either enqueue() sees the idle bit or update_idle() sees the task
+ * that enqueue() queued.
+ */
+ if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
}
static void reset_idle_masks(struct sched_ext_ops *ops)
@@ -748,7 +825,7 @@ void scx_idle_disable(void)
static int validate_node(int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is disabled");
+ scx_kf_error("per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -758,13 +835,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_ops_error("invalid node %d", node);
+ scx_kf_error("invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_ops_error("unavailable node %d", node);
+ scx_kf_error("unavailable node %d", node);
return -EINVAL;
}
@@ -778,10 +855,72 @@ static bool check_builtin_idle_enabled(void)
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_ops_error("built-in idle tracking is disabled");
+ scx_kf_error("built-in idle tracking is disabled");
return false;
}
+s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *allowed, u64 flags)
+{
+ struct rq *rq;
+ struct rq_flags rf;
+ s32 cpu;
+
+ if (!kf_cpu_valid(prev_cpu, NULL))
+ return -EINVAL;
+
+ if (!check_builtin_idle_enabled())
+ return -EBUSY;
+
+ /*
+ * If called from an unlocked context, acquire the task's rq lock,
+ * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed.
+ *
+ * Otherwise, allow to use this kfunc only from ops.select_cpu()
+ * and ops.select_enqueue().
+ */
+ if (scx_kf_allowed_if_unlocked()) {
+ rq = task_rq_lock(p, &rf);
+ } else {
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ return -EPERM;
+ rq = scx_locked_rq();
+ }
+
+ /*
+ * Validate locking correctness to access p->cpus_ptr and
+ * p->nr_cpus_allowed: if we're holding an rq lock, we're safe;
+ * otherwise, assert that p->pi_lock is held.
+ */
+ if (!rq)
+ lockdep_assert_held(&p->pi_lock);
+
+#ifdef CONFIG_SMP
+ /*
+ * This may also be called from ops.enqueue(), so we need to handle
+ * per-CPU tasks as well. For these tasks, we can skip all idle CPU
+ * selection optimizations and simply check whether the previously
+ * used CPU is idle and within the allowed cpumask.
+ */
+ if (p->nr_cpus_allowed == 1) {
+ if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
+ scx_idle_test_and_clear_cpu(prev_cpu))
+ cpu = prev_cpu;
+ else
+ cpu = -EBUSY;
+ } else {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
+ allowed ?: p->cpus_ptr, flags);
+ }
+#else
+ cpu = -EBUSY;
+#endif
+ if (scx_kf_allowed_if_unlocked())
+ task_rq_unlock(rq, p, &rf);
+
+ return cpu;
+}
+
/**
* scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
* trigger an error if @cpu is invalid
@@ -790,7 +929,7 @@ static bool check_builtin_idle_enabled(void)
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
#ifdef CONFIG_NUMA
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
@@ -806,9 +945,10 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
* @wake_flags: %SCX_WAKE_* flags
* @is_idle: out parameter indicating whether the returned CPU is idle
*
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
*
* Returns the picked CPU with *@is_idle indicating whether the picked CPU is
* currently idle and thus a good candidate for direct dispatching.
@@ -816,39 +956,52 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
-#ifdef CONFIG_SMP
s32 cpu;
-#endif
- if (!ops_cpu_valid(prev_cpu, NULL))
- goto prev_cpu;
- if (!check_builtin_idle_enabled())
- goto prev_cpu;
-
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
- goto prev_cpu;
-
-#ifdef CONFIG_SMP
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
-#endif
-
-prev_cpu:
*is_idle = false;
+
return prev_cpu;
}
/**
+ * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
+ * prioritizing those in @cpus_allowed
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @flags: %SCX_PICK_IDLE* flags
+ *
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
+ *
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the selected idle CPU, which will be automatically awakened upon
+ * returning from ops.select_cpu() and can be used for direct dispatch, or
+ * a negative value if no idle CPU is available.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+}
+
+/**
* scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
* idle-tracking per-CPU cpumask of a target NUMA node.
* @node: target NUMA node
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
@@ -873,7 +1026,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
@@ -895,7 +1048,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
@@ -924,7 +1077,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
@@ -971,7 +1124,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
if (!check_builtin_idle_enabled())
return false;
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return scx_idle_test_and_clear_cpu(cpu);
else
return false;
@@ -1032,7 +1185,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_kf_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1109,7 +1262,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
s32 cpu;
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_kf_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1140,6 +1293,8 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle)
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
@@ -1147,21 +1302,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
.set = &scx_kfunc_ids_idle,
};
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
- .owner = THIS_MODULE,
- .set = &scx_kfunc_ids_select_cpu,
-};
-
int scx_idle_init(void)
{
int ret;
- ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);