Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c      |  9
-rw-r--r--  kernel/sched/debug.c     |  4
-rw-r--r--  kernel/sched/ext_idle.c  | 37
-rw-r--r--  kernel/sched/fair.c      | 12
4 files changed, 41 insertions, 21 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62b3416f5e43..dce50fa57471 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3362,6 +3362,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
+ __schedstat_inc(p->stats.numa_task_swapped);
+ count_vm_numa_event(NUMA_TASK_SWAP);
+ count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
+
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
@@ -7930,8 +7934,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
- /* TODO: This is not properly updating schedstats */
-
+ __schedstat_inc(p->stats.numa_task_migrated);
+ count_vm_numa_event(NUMA_TASK_MIGRATE);
+ count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE);
trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
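
The two hunks above bump counters whose declarations live outside kernel/sched, so they do not appear in this diffstat. A minimal sketch of the companion declarations they assume, inferred only from the identifiers used here (file locations and types are assumptions, not part of this diff):

/* Sketch only: assumed companion declarations, not shown in this diff. */

/* presumably enum vm_event_item (include/linux/vm_event_item.h) */
#ifdef CONFIG_NUMA_BALANCING
    NUMA_TASK_MIGRATE,    /* counted via count_vm_numa_event() in migrate_task_to() */
    NUMA_TASK_SWAP,       /* counted via count_vm_numa_event() in __migrate_swap_task() */
#endif

/* presumably struct sched_statistics (include/linux/sched.h) */
#ifdef CONFIG_NUMA_BALANCING
    u64    numa_task_migrated;    /* __schedstat_inc() above */
    u64    numa_task_swapped;
#endif
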
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 557246880a7e..9d71baf08075 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1210,6 +1210,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P_SCHEDSTAT(nr_failed_migrations_running);
P_SCHEDSTAT(nr_failed_migrations_hot);
P_SCHEDSTAT(nr_forced_migrations);
+#ifdef CONFIG_NUMA_BALANCING
+ P_SCHEDSTAT(numa_task_migrated);
+ P_SCHEDSTAT(numa_task_swapped);
+#endif
P_SCHEDSTAT(nr_wakeups);
P_SCHEDSTAT(nr_wakeups_sync);
P_SCHEDSTAT(nr_wakeups_migrate);
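
With the hunk above, the two new counters become readable from /proc/<pid>/sched (schedstat lines are printed only when schedstats are enabled, e.g. kernel.sched_schedstats=1). A small userspace sketch that dumps them; since P_SCHEDSTAT stringifies the field name, filtering on the field names is assumed to be sufficient:

/* Userspace sketch: print the per-task NUMA-balancing schedstats added above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv)
{
    const char *pid = (argc > 1) ? argv[1] : "self";
    char path[64], line[256];
    FILE *f;

    snprintf(path, sizeof(path), "/proc/%s/sched", pid);
    f = fopen(path, "r");
    if (!f) {
        perror(path);
        return EXIT_FAILURE;
    }
    while (fgets(line, sizeof(line), f)) {
        if (strstr(line, "numa_task_migrated") ||
            strstr(line, "numa_task_swapped"))
            fputs(line, stdout);
    }
    fclose(f);
    return EXIT_SUCCESS;
}
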
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 66da03cc0b33..6d29d3cbc670 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -138,6 +138,7 @@ found:
goto retry;
}
+#ifdef CONFIG_NUMA
/*
* Tracks nodes that have not yet been visited when searching for an idle
* CPU across all available nodes.
@@ -186,6 +187,13 @@ static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, i
return cpu;
}
+#else
+static inline s32
+pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ return -EBUSY;
+}
+#endif
/*
* Find an idle CPU in the system, starting from @node.
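
The new !CONFIG_NUMA stub keeps the caller free of #ifdefs by always reporting that no idle CPU was found. The same pattern in a standalone, compile-and-run form; HAVE_NUMA_SEARCH is only an illustrative stand-in for CONFIG_NUMA and the function body is not the kernel's:

/* Standalone sketch of the compile-time stub pattern used above. */
#include <stdio.h>
#include <errno.h>

#ifdef HAVE_NUMA_SEARCH
static int pick_idle_cpu_from_online_nodes(unsigned long cpus_allowed)
{
    /* A real cross-node search would go here; pick the first set bit. */
    return cpus_allowed ? __builtin_ctzl(cpus_allowed) : -EBUSY;
}
#else
static inline int pick_idle_cpu_from_online_nodes(unsigned long cpus_allowed)
{
    (void)cpus_allowed;
    return -EBUSY;    /* same "no idle CPU" result as the !CONFIG_NUMA stub */
}
#endif

int main(void)
{
    /* Without -DHAVE_NUMA_SEARCH this prints -16 (-EBUSY). */
    printf("%d\n", pick_idle_cpu_from_online_nodes(0xf0UL));
    return 0;
}
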
@@ -447,11 +455,18 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
int node = scx_cpu_node_if_enabled(prev_cpu);
+ bool is_prev_allowed;
s32 cpu;
preempt_disable();
/*
+ * Check whether @prev_cpu is still within the allowed set. If not,
+ * we can still try selecting a nearby CPU.
+ */
+ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+
+ /*
* Determine the subset of CPUs usable by @p within @cpus_allowed.
*/
if (allowed != p->cpus_ptr) {
@@ -465,21 +480,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
cpu = -EBUSY;
goto out_enable;
}
-
- /*
- * If @prev_cpu is not in the allowed CPUs, skip topology
- * optimizations and try to pick any idle CPU usable by the
- * task.
- *
- * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize
- * the current node, as it may optimize some waker->wakee
- * workloads.
- */
- if (!cpumask_test_cpu(prev_cpu, allowed)) {
- node = scx_cpu_node_if_enabled(smp_processor_id());
- cpu = scx_pick_idle_cpu(allowed, node, flags);
- goto out_enable;
- }
}
/*
@@ -525,7 +525,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
* then avoid a migration.
*/
cpu = smp_processor_id();
- if (cpus_share_cache(cpu, prev_cpu) &&
+ if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) &&
scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
@@ -562,7 +562,8 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
/*
* Keep using @prev_cpu if it's part of a fully idle core.
*/
- if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+ if (is_prev_allowed &&
+ cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
@@ -611,7 +612,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
/*
* Use @prev_cpu if it's idle.
*/
- if (scx_idle_test_and_clear_cpu(prev_cpu)) {
+ if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
}
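
All three prev_cpu fast paths above are now gated on is_prev_allowed, so a CPU outside the allowed mask is never handed back even when it is idle. A standalone sketch of that gating, with plain uint64_t bitmasks standing in for cpumask_t and the idle-state tracking (illustrative only, not the kernel data structures):

/* Standalone sketch: reuse prev_cpu only if it is both allowed and idle,
 * otherwise fall back to any idle CPU in the allowed mask. */
#include <stdint.h>
#include <stdio.h>

static int pick_cpu(int prev_cpu, uint64_t allowed, uint64_t idle)
{
    uint64_t usable = allowed & idle;
    int is_prev_allowed = !!(allowed & (1ULL << prev_cpu));

    if (is_prev_allowed && (idle & (1ULL << prev_cpu)))
        return prev_cpu;                  /* keep the previous CPU */
    if (usable)
        return __builtin_ctzll(usable);   /* any idle CPU in the allowed set */
    return -1;                            /* nothing idle (-EBUSY in the kernel) */
}

int main(void)
{
    /* prev_cpu 2 is idle but no longer allowed: expect CPU 4, not 2. */
    printf("%d\n", pick_cpu(2, 0xf0, 0x14));
    return 0;
}
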
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 125912c0e9dd..7a14da5396fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2273,7 +2273,8 @@ static bool task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr);
- if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
+ if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
+ !cur->mm))
cur = NULL;
/*
@@ -3329,6 +3330,15 @@ static void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
+ /*
+ * Memory is pinned to only one NUMA node via cpuset.mems, naturally
+ * no page can be migrated.
+ */
+ if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
+ trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
+ return;
+ }
+
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
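
The new early return triggers when the task's cpuset allows exactly one memory node (and cpusets are enabled). A userspace sketch that checks the same condition for the current task by parsing Mems_allowed_list from /proc/self/status; treating that field as equivalent to cpuset_current_mems_allowed is an assumption here:

/* Userspace sketch: would the cpuset check above skip NUMA scanning for us?
 * Parses "Mems_allowed_list:" (e.g. "0" or "0-1,3") and counts the nodes,
 * roughly mirroring nodes_weight() on the task's mems_allowed. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    char line[512];
    FILE *f = fopen("/proc/self/status", "r");
    int nodes = 0;

    if (!f) {
        perror("/proc/self/status");
        return EXIT_FAILURE;
    }
    while (fgets(line, sizeof(line), f)) {
        if (strncmp(line, "Mems_allowed_list:", 18))
            continue;
        for (char *tok = strtok(line + 18, ", \t\n"); tok;
             tok = strtok(NULL, ", \t\n")) {
            char *dash = strchr(tok, '-');
            long lo = strtol(tok, NULL, 10);
            long hi = dash ? strtol(dash + 1, NULL, 10) : lo;

            nodes += hi - lo + 1;
        }
        break;
    }
    fclose(f);
    /* The kernel additionally requires cpusets to be enabled. */
    printf("mems_allowed spans %d node(s): NUMA scan %s be skipped\n",
           nodes, nodes == 1 ? "would" : "would not");
    return EXIT_SUCCESS;
}
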