Merge tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext updates from Tejun Heo: - Add mechanism to count and report internal events. This significantly improves visibility on subtle corner conditions. - The default idle CPU selection logic is revamped and improved in multiple ways including being made topology aware. - sched_ext was disabling ttwu_queue for simplicity, which can be costly when hardware topology is more complex. Implement SCX_OPS_ALLOWED_QUEUED_WAKEUP so that BPF schedulers can selectively enable ttwu_queue. - tools/sched_ext updates to improve compatibility among others. - Other misc updates and fixes. - sched_ext/for-6.14-fixes were pulled a few times to receive prerequisite fixes and resolve conflicts. * tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (42 commits) sched_ext: idle: Refactor scx_select_cpu_dfl() sched_ext: idle: Honor idle flags in the built-in idle selection policy sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local() sched_ext: Add trace point to track sched_ext core events sched_ext: Change the event type from u64 to s64 sched_ext: Documentation: add task lifecycle summary tools/sched_ext: Provide a compatible helper for scx_bpf_events() selftests/sched_ext: Add NUMA-aware scheduler test tools/sched_ext: Provide consistent access to scx flags sched_ext: idle: Fix scx_bpf_pick_any_cpu_node() behavior sched_ext: idle: Introduce scx_bpf_nr_node_ids() sched_ext: idle: Introduce node-aware idle cpu kfunc helpers sched_ext: idle: Per-node idle cpumasks sched_ext: idle: Introduce SCX_OPS_BUILTIN_IDLE_PER_NODE sched_ext: idle: Make idle static keys private sched/topology: Introduce for_each_node_numadist() iterator mm/numa: Introduce nearest_node_nodemask() nodemask: numa: reorganize inclusion path nodemask: add nodes_copy() tools/sched_ext: Sync with scx repo ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2025-03-24 17:23:48 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2025-03-24 17:23:48 -0700
commit: bcb044256d3f5d9f5bb61d1eac6492f77883bd60 (patch)
tree: 0c0c48beef0742ac47a5e136fe0ecbcea66b6871
parent: 94dc216ad848ebee06ce7692fcfcbb2e9b3e643c (diff)
parent: e4855fc90e52efef7e3926205c8dc53ce39b6138 (diff)
25 files changed, 2148 insertions, 783 deletions
diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index c4672d7df2f7..0993e41353db 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -294,6 +294,42 @@ dispatching, and must be dispatched to with ``scx_bpf_dsq_insert()``. See
 the function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c``
 for more information.
 
+Task Lifecycle
+--------------
+
+The following pseudo-code summarizes the entire lifecycle of a task managed
+by a sched_ext scheduler:
+
+.. code-block:: c
+
+    ops.init_task();            /* A new task is created */
+    ops.enable();               /* Enable BPF scheduling for the task */
+
+    while (task in SCHED_EXT) {
+        if (task can migrate)
+            ops.select_cpu();   /* Called on wakeup (optimization) */
+
+        ops.runnable();         /* Task becomes ready to run */
+
+        while (task is runnable) {
+            if (task is not in a DSQ) {
+                ops.enqueue();  /* Task can be added to a DSQ */
+
+                /* A CPU becomes available */
+
+                ops.dispatch(); /* Task is moved to a local DSQ */
+            }
+            ops.running();      /* Task starts running on its assigned CPU */
+            ops.tick();         /* Called every 1/HZ seconds */
+            ops.stopping();     /* Task stops running (time slice expires or wait) */
+        }
+
+        ops.quiescent();        /* Task releases its assigned CPU (wait) */
+    }
+
+    ops.disable();              /* Disable BPF scheduling for the task */
+    ops.exit_task();            /* Task is destroyed */
+
 Where to Look
 =============
 
diff --git a/MAINTAINERS b/MAINTAINERS
index b04862031c91..7caaa76b9954 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21196,8 +21196,7 @@ S:	Maintained
 W:	https://github.com/sched-ext/scx
 T:	git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
 F:	include/linux/sched/ext.h
-F:	kernel/sched/ext.h
-F:	kernel/sched/ext.c
+F:	kernel/sched/ext*
 F:	tools/sched_ext/
 F:	tools/testing/selftests/sched_ext
 
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 9fd7a0ce9c1a..f0ac0633366b 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -94,7 +94,6 @@
 #include <linux/bitmap.h>
 #include <linux/minmax.h>
 #include <linux/nodemask_types.h>
-#include <linux/numa.h>
 #include <linux/random.h>
 
 extern nodemask_t _unused_nodemask_arg_;
@@ -191,6 +190,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s
 	bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
 }
 
+#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES)
+static __always_inline void __nodes_copy(nodemask_t *dstp,
+					const nodemask_t *srcp, unsigned int nbits)
+{
+	bitmap_copy(dstp->bits, srcp->bits, nbits);
+}
+
 #define nodes_complement(dst, src) \
 			__nodes_complement(&(dst), &(src), MAX_NUMNODES)
 static __always_inline void __nodes_complement(nodemask_t *dstp,
diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h
index 6b28d97ea6ed..f850a48742f1 100644
--- a/include/linux/nodemask_types.h
+++ b/include/linux/nodemask_types.h
@@ -3,7 +3,16 @@
 #define __LINUX_NODEMASK_TYPES_H
 
 #include <linux/bitops.h>
-#include <linux/numa.h>
+
+#ifdef CONFIG_NODES_SHIFT
+#define NODES_SHIFT     CONFIG_NODES_SHIFT
+#else
+#define NODES_SHIFT     0
+#endif
+
+#define MAX_NUMNODES    (1 << NODES_SHIFT)
+
+#define	NUMA_NO_NODE	(-1)
 
 typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
 
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 3567e40329eb..e6baaf6051bc 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -3,16 +3,8 @@
 #define _LINUX_NUMA_H
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/nodemask.h>
 
-#ifdef CONFIG_NODES_SHIFT
-#define NODES_SHIFT     CONFIG_NODES_SHIFT
-#else
-#define NODES_SHIFT     0
-#endif
-
-#define MAX_NUMNODES    (1 << NODES_SHIFT)
-
-#define	NUMA_NO_NODE	(-1)
 #define	NUMA_NO_MEMBLK	(-1)
 
 static inline bool numa_valid_node(int nid)
@@ -39,6 +31,8 @@ void __init alloc_offline_node_data(int nid);
 /* Generic implementation available */
 int numa_nearest_node(int node, unsigned int state);
 
+int nearest_node_nodemask(int node, nodemask_t *mask);
+
 #ifndef memory_add_physaddr_to_nid
 int memory_add_physaddr_to_nid(u64 start);
 #endif
@@ -55,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state)
 	return NUMA_NO_NODE;
 }
 
+static inline int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+	return NUMA_NO_NODE;
+}
+
 static inline int memory_add_physaddr_to_nid(u64 start)
 {
 	return 0;
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 1d70a9867fb1..f7545430a548 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -146,6 +146,7 @@ struct sched_ext_entity {
 	u32			weight;
 	s32			sticky_cpu;
 	s32			holding_cpu;
+	s32			selected_cpu;
 	u32			kf_mask;	/* see scx_kf_mask above */
 	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
 	atomic_long_t		ops_state;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..a1815f4395ab 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -262,6 +262,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
 #endif	/* CONFIG_NUMA */
 
 /**
+ * for_each_node_numadist() - iterate over nodes in increasing distance
+ *			      order, starting from a given node
+ * @node: the iteration variable and the starting node.
+ * @unvisited: a nodemask to keep track of the unvisited nodes.
+ *
+ * This macro iterates over NUMA node IDs in increasing distance from the
+ * starting @node and yields MAX_NUMNODES when all the nodes have been
+ * visited.
+ *
+ * Note that by the time the loop completes, the @unvisited nodemask will
+ * be fully cleared, unless the loop exits early.
+ *
+ * The difference between for_each_node() and for_each_node_numadist() is
+ * that the former allows to iterate over nodes in numerical order, whereas
+ * the latter iterates over nodes in increasing order of distance.
+ *
+ * This complexity of this iterator is O(N^2), where N represents the
+ * number of nodes, as each iteration involves scanning all nodes to
+ * find the one with the shortest distance.
+ *
+ * Requires rcu_lock to be held.
+ */
+#define for_each_node_numadist(node, unvisited)					\
+	for (int __start = (node),						\
+	     (node) = nearest_node_nodemask((__start), &(unvisited));		\
+	     (node) < MAX_NUMNODES;						\
+	     node_clear((node), (unvisited)),					\
+	     (node) = nearest_node_nodemask((__start), &(unvisited)))
+
+/**
  * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
  *                          from a given node.
  * @mask: the iteration variable.
diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h
index fe19da7315a9..50e4b712735a 100644
--- a/include/trace/events/sched_ext.h
+++ b/include/trace/events/sched_ext.h
@@ -26,6 +26,25 @@ TRACE_EVENT(sched_ext_dump,
 	)
 );
 
+TRACE_EVENT(sched_ext_event,
+	    TP_PROTO(const char *name, __s64 delta),
+	    TP_ARGS(name, delta),
+
+	TP_STRUCT__entry(
+		__string(name, name)
+		__field(	__s64,		delta		)
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		__entry->delta		= delta;
+	),
+
+	TP_printk("name %s delta %lld",
+		  __get_str(name), __entry->delta
+	)
+);
+
 #endif /* _TRACE_SCHED_EXT_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index fae1f5c921eb..72d97aa8b726 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -61,6 +61,7 @@
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 # include "ext.c"
+# include "ext_idle.c"
 #endif
 
 #include "syscalls.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 042351c7afce..f6840c33a296 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3922,13 +3922,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
 
 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 {
-	/*
-	 * The BPF scheduler may depend on select_task_rq() being invoked during
-	 * wakeups. In addition, @p may end up executing on a different CPU
-	 * regardless of what happens in the wakeup path making the ttwu_queue
-	 * optimization less meaningful. Skip if on SCX.
-	 */
-	if (task_on_scx(p))
+	/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
+	if (!scx_allow_ttwu_queue(p))
 		return false;
 
 	/*
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7b9dfee858e7..06561d6717c9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6,6 +6,9 @@
  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
  */
+#include <linux/btf_ids.h>
+#include "ext_idle.h"
+
 #define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
 
 enum scx_consts {
@@ -93,7 +96,7 @@ enum scx_ops_flags {
 	/*
 	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
 	 */
-	SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+	SCX_OPS_KEEP_BUILTIN_IDLE	= 1LLU << 0,
 
 	/*
 	 * By default, if there are no other task to run on the CPU, ext core
@@ -101,7 +104,7 @@ enum scx_ops_flags {
 	 * flag is specified, such tasks are passed to ops.enqueue() with
 	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
 	 */
-	SCX_OPS_ENQ_LAST	= 1LLU << 1,
+	SCX_OPS_ENQ_LAST		= 1LLU << 1,
 
 	/*
 	 * An exiting task may schedule after PF_EXITING is set. In such cases,
@@ -114,13 +117,13 @@ enum scx_ops_flags {
 	 * depend on pid lookups and wants to handle these tasks directly, the
 	 * following flag can be used.
 	 */
-	SCX_OPS_ENQ_EXITING	= 1LLU << 2,
+	SCX_OPS_ENQ_EXITING		= 1LLU << 2,
 
 	/*
 	 * If set, only tasks with policy set to SCHED_EXT are attached to
 	 * sched_ext. If clear, SCHED_NORMAL tasks are also included.
 	 */
-	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,
+	SCX_OPS_SWITCH_PARTIAL		= 1LLU << 3,
 
 	/*
 	 * A migration disabled task can only execute on its current CPU. By
@@ -133,7 +136,29 @@ enum scx_ops_flags {
 	 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
 	 * and thus may disagree with cpumask_weight(p->cpus_ptr).
 	 */
-	SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+	SCX_OPS_ENQ_MIGRATION_DISABLED	= 1LLU << 4,
+
+	/*
+	 * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+	 * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+	 * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+	 * transfers. When this optimization is enabled, ops.select_cpu() is
+	 * skipped in some cases (when racing against the wakee switching out).
+	 * As the BPF scheduler may depend on ops.select_cpu() being invoked
+	 * during wakeups, queued wakeup is disabled by default.
+	 *
+	 * If this ops flag is set, queued wakeup optimization is enabled and
+	 * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+	 * wakee's CPU without preceding ops.select_cpu() even for tasks which
+	 * may be executed on multiple CPUs.
+	 */
+	SCX_OPS_ALLOW_QUEUED_WAKEUP	= 1LLU << 5,
+
+	/*
+	 * If set, enable per-node idle cpumasks. If clear, use a single global
+	 * flat idle cpumask.
+	 */
+	SCX_OPS_BUILTIN_IDLE_PER_NODE	= 1LLU << 6,
 
 	/*
 	 * CPU cgroup support flags
@@ -144,7 +169,9 @@ enum scx_ops_flags {
 				  SCX_OPS_ENQ_LAST |
 				  SCX_OPS_ENQ_EXITING |
 				  SCX_OPS_ENQ_MIGRATION_DISABLED |
+				  SCX_OPS_ALLOW_QUEUED_WAKEUP |
 				  SCX_OPS_SWITCH_PARTIAL |
+				  SCX_OPS_BUILTIN_IDLE_PER_NODE |
 				  SCX_OPS_HAS_CGROUP_WEIGHT,
 };
 
@@ -779,6 +806,7 @@ enum scx_deq_flags {
 
 enum scx_pick_idle_cpu_flags {
 	SCX_PICK_IDLE_CORE	= 1LLU << 0,	/* pick a CPU whose SMT siblings are also idle */
+	SCX_PICK_IDLE_IN_NODE	= 1LLU << 1,	/* pick a CPU in the same target NUMA node */
 };
 
 enum scx_kick_flags {
@@ -894,16 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
 static struct sched_ext_ops scx_ops;
 static bool scx_warned_zero_slice;
 
+DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
-
-#ifdef CONFIG_SMP
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
-#endif
 
 static struct static_key_false scx_has_op[SCX_OPI_END] =
 	{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
@@ -938,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
 
 static struct delayed_work scx_watchdog_work;
 
-/* idle tracking */
-#ifdef CONFIG_SMP
-#ifdef CONFIG_CPUMASK_OFFSTACK
-#define CL_ALIGNED_IF_ONSTACK
-#else
-#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
-#endif
-
-static struct {
-	cpumask_var_t cpu;
-	cpumask_var_t smt;
-} idle_masks CL_ALIGNED_IF_ONSTACK;
-
-#endif	/* CONFIG_SMP */
-
 /* for %SCX_KICK_WAIT */
 static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
 
@@ -1473,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
 	return p;
 }
 
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+	/*
+	 * If ops.select_cpu() returns a CPU which can't be used by the task,
+	 * the core scheduler code silently picks a fallback CPU.
+	 */
+	s64		SCX_EV_SELECT_CPU_FALLBACK;
+
+	/*
+	 * When dispatching to a local DSQ, the CPU may have gone offline in
+	 * the meantime. In this case, the task is bounced to the global DSQ.
+	 */
+	s64		SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+	/*
+	 * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+	 * continued to run because there were no other tasks on the CPU.
+	 */
+	s64		SCX_EV_DISPATCH_KEEP_LAST;
+
+	/*
+	 * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+	 * is dispatched to a local DSQ when exiting.
+	 */
+	s64		SCX_EV_ENQ_SKIP_EXITING;
+
+	/*
+	 * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+	 * migration disabled task skips ops.enqueue() and is dispatched to its
+	 * local DSQ.
+	 */
+	s64		SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+	/*
+	 * The total number of tasks enqueued (or pick_task-ed) with a
+	 * default time slice (SCX_SLICE_DFL).
+	 */
+	s64		SCX_EV_ENQ_SLICE_DFL;
+
+	/*
+	 * The total duration of bypass modes in nanoseconds.
+	 */
+	s64		SCX_EV_BYPASS_DURATION;
+
+	/*
+	 * The number of tasks dispatched in the bypassing mode.
+	 */
+	s64		SCX_EV_BYPASS_DISPATCH;
+
+	/*
+	 * The number of times the bypassing mode has been activated.
+	 */
+	s64		SCX_EV_BYPASS_ACTIVATE;
+};
+
+/*
+ * The event counter is organized by a per-CPU variable to minimize the
+ * accounting overhead without synchronization. A system-wide view on the
+ * event counter is constructed when requested by scx_bpf_get_event_stat().
+ */
+static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occured
+ *
+ * This can be used when preemption is not disabled.
+ */
+#define scx_add_event(name, cnt) do {						\
+	this_cpu_add(event_stats_cpu.name, cnt);				\
+	trace_sched_ext_event(#name, cnt);					\
+} while(0)
+
+/**
+ * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occured
+ *
+ * This should be used only when preemption is disabled.
+ */
+#define __scx_add_event(name, cnt) do {						\
+	__this_cpu_add(event_stats_cpu.name, cnt);				\
+	trace_sched_ext_event(#name, cnt);					\
+} while(0)
+
+/**
+ * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
+ * @dst_e: destination event stats
+ * @src_e: source event stats
+ * @kind: a kind of event to be aggregated
+ */
+#define scx_agg_event(dst_e, src_e, kind) do {					\
+	(dst_e)->kind += READ_ONCE((src_e)->kind);				\
+} while(0)
+
+/**
+ * scx_dump_event - Dump an event 'kind' in 'events' to 's'
+ * @s: output seq_buf
+ * @events: event stats
+ * @kind: a kind of event to dump
+ */
+#define scx_dump_event(s, events, kind) do {					\
+	dump_line(&(s), "%40s: %16lld", #kind, (events)->kind);			\
+} while (0)
+
+
+static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+
 static enum scx_ops_enable_state scx_ops_enable_state(void)
 {
 	return atomic_read(&scx_ops_enable_state_var);
@@ -2018,21 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	if (!scx_rq_online(rq))
 		goto local;
 
-	if (scx_rq_bypassing(rq))
+	if (scx_rq_bypassing(rq)) {
+		__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
 		goto global;
+	}
 
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
 		goto direct;
 
 	/* see %SCX_OPS_ENQ_EXITING */
 	if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
-	    unlikely(p->flags & PF_EXITING))
+	    unlikely(p->flags & PF_EXITING)) {
+		__scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
 		goto local;
+	}
 
 	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
 	if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
-	    is_migration_disabled(p))
+	    is_migration_disabled(p)) {
+		__scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
 		goto local;
+	}
 
 	if (!SCX_HAS_OP(enqueue))
 		goto global;
@@ -2072,6 +2197,7 @@ local:
 	 */
 	touch_core_sched(rq, p);
 	p->scx.slice = SCX_SLICE_DFL;
+	__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 local_norefill:
 	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
 	return;
@@ -2079,6 +2205,7 @@ local_norefill:
 global:
 	touch_core_sched(rq, p);	/* see the comment in local: */
 	p->scx.slice = SCX_SLICE_DFL;
+	__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 	dispatch_enqueue(find_global_dsq(p), p, enq_flags);
 }
 
@@ -2150,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
 out:
 	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+
+	if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
+	    unlikely(cpu_of(rq) != p->scx.selected_cpu))
+		__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
 }
 
 static void ops_dequeue(struct task_struct *p, u64 deq_flags)
@@ -2337,7 +2468,7 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
  * The caller must ensure that @p and @rq are on different CPUs.
  */
 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
-				      bool trigger_error)
+				      bool enforce)
 {
 	int cpu = cpu_of(rq);
 
@@ -2356,7 +2487,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 	 * easily be masked if task_allowed_on_cpu() is done first.
 	 */
 	if (unlikely(is_migration_disabled(p))) {
-		if (trigger_error)
+		if (enforce)
 			scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
 				      p->comm, p->pid, task_cpu(p), cpu);
 		return false;
@@ -2369,14 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 	 * picked CPU is outside the allowed mask.
 	 */
 	if (!task_allowed_on_cpu(p, cpu)) {
-		if (trigger_error)
+		if (enforce)
 			scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
 				      cpu, p->comm, p->pid);
 		return false;
 	}
 
-	if (!scx_rq_online(rq))
+	if (!scx_rq_online(rq)) {
+		if (enforce)
+			__scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
 		return false;
+	}
 
 	return true;
 }
@@ -2446,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
 }
 #else	/* CONFIG_SMP */
 static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
+static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
 static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
 #endif	/* CONFIG_SMP */
 
@@ -2893,6 +3027,7 @@ no_tasks:
 	if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
 	     scx_rq_bypassing(rq))) {
 		rq->scx.flags |= SCX_RQ_BAL_KEEP;
+		__scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
 		goto has_tasks;
 	}
 	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3159,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 	 */
 	if (keep_prev) {
 		p = prev;
-		if (!p->scx.slice)
+		if (!p->scx.slice) {
 			p->scx.slice = SCX_SLICE_DFL;
+			__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+		}
 	} else {
 		p = first_local_task(rq);
 		if (!p) {
@@ -3176,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 				scx_warned_zero_slice = true;
 			}
 			p->scx.slice = SCX_SLICE_DFL;
+			__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 		}
 	}
 
@@ -3220,418 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 
 #ifdef CONFIG_SMP
 
-static bool test_and_clear_cpu_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
-	/*
-	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
-	 * cluster is not wholly idle either way. This also prevents
-	 * scx_pick_idle_cpu() from getting caught in an infinite loop.
-	 */
-	if (sched_smt_active()) {
-		const struct cpumask *smt = cpu_smt_mask(cpu);
-
-		/*
-		 * If offline, @cpu is not its own sibling and
-		 * scx_pick_idle_cpu() can get caught in an infinite loop as
-		 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
-		 * is eventually cleared.
-		 *
-		 * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
-		 * reduce memory writes, which may help alleviate cache
-		 * coherence pressure.
-		 */
-		if (cpumask_intersects(smt, idle_masks.smt))
-			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
-		else if (cpumask_test_cpu(cpu, idle_masks.smt))
-			__cpumask_clear_cpu(cpu, idle_masks.smt);
-	}
-#endif
-	return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
-}
-
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
-{
-	int cpu;
-
-retry:
-	if (sched_smt_active()) {
-		cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
-		if (cpu < nr_cpu_ids)
-			goto found;
-
-		if (flags & SCX_PICK_IDLE_CORE)
-			return -EBUSY;
-	}
-
-	cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
-	if (cpu >= nr_cpu_ids)
-		return -EBUSY;
-
-found:
-	if (test_and_clear_cpu_idle(cpu))
-		return cpu;
-	else
-		goto retry;
-}
-
-/*
- * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
- * domain is not defined).
- */
-static unsigned int llc_weight(s32 cpu)
-{
-	struct sched_domain *sd;
-
-	sd = rcu_dereference(per_cpu(sd_llc, cpu));
-	if (!sd)
-		return 0;
-
-	return sd->span_weight;
-}
-
-/*
- * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
- * domain is not defined).
- */
-static struct cpumask *llc_span(s32 cpu)
-{
-	struct sched_domain *sd;
-
-	sd = rcu_dereference(per_cpu(sd_llc, cpu));
-	if (!sd)
-		return 0;
-
-	return sched_domain_span(sd);
-}
-
-/*
- * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
- * NUMA domain is not defined).
- */
-static unsigned int numa_weight(s32 cpu)
-{
-	struct sched_domain *sd;
-	struct sched_group *sg;
-
-	sd = rcu_dereference(per_cpu(sd_numa, cpu));
-	if (!sd)
-		return 0;
-	sg = sd->groups;
-	if (!sg)
-		return 0;
-
-	return sg->group_weight;
-}
-
-/*
- * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
- * domain is not defined).
- */
-static struct cpumask *numa_span(s32 cpu)
-{
-	struct sched_domain *sd;
-	struct sched_group *sg;
-
-	sd = rcu_dereference(per_cpu(sd_numa, cpu));
-	if (!sd)
-		return NULL;
-	sg = sd->groups;
-	if (!sg)
-		return NULL;
-
-	return sched_group_span(sg);
-}
-
-/*
- * Return true if the LLC domains do not perfectly overlap with the NUMA
- * domains, false otherwise.
- */
-static bool llc_numa_mismatch(void)
-{
-	int cpu;
-
-	/*
-	 * We need to scan all online CPUs to verify whether their scheduling
-	 * domains overlap.
-	 *
-	 * While it is rare to encounter architectures with asymmetric NUMA
-	 * topologies, CPU hotplugging or virtualized environments can result
-	 * in asymmetric configurations.
-	 *
-	 * For example:
-	 *
-	 *  NUMA 0:
-	 *    - LLC 0: cpu0..cpu7
-	 *    - LLC 1: cpu8..cpu15 [offline]
-	 *
-	 *  NUMA 1:
-	 *    - LLC 0: cpu16..cpu23
-	 *    - LLC 1: cpu24..cpu31
-	 *
-	 * In this case, if we only check the first online CPU (cpu0), we might
-	 * incorrectly assume that the LLC and NUMA domains are fully
-	 * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
-	 * domains).
-	 */
-	for_each_online_cpu(cpu)
-		if (llc_weight(cpu) != numa_weight(cpu))
-			return true;
-
-	return false;
-}
-
-/*
- * Initialize topology-aware scheduling.
- *
- * Detect if the system has multiple LLC or multiple NUMA domains and enable
- * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
- * selection policy.
- *
- * Assumption: the kernel's internal topology representation assumes that each
- * CPU belongs to a single LLC domain, and that each LLC domain is entirely
- * contained within a single NUMA node.
- */
-static void update_selcpu_topology(void)
-{
-	bool enable_llc = false, enable_numa = false;
-	unsigned int nr_cpus;
-	s32 cpu = cpumask_first(cpu_online_mask);
-
-	/*
-	 * Enable LLC domain optimization only when there are multiple LLC
-	 * domains among the online CPUs. If all online CPUs are part of a
-	 * single LLC domain, the idle CPU selection logic can choose any
-	 * online CPU without bias.
-	 *
-	 * Note that it is sufficient to check the LLC domain of the first
-	 * online CPU to determine whether a single LLC domain includes all
-	 * CPUs.
-	 */
-	rcu_read_lock();
-	nr_cpus = llc_weight(cpu);
-	if (nr_cpus > 0) {
-		if (nr_cpus < num_online_cpus())
-			enable_llc = true;
-		pr_debug("sched_ext: LLC=%*pb weight=%u\n",
-			 cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
-	}
-
-	/*
-	 * Enable NUMA optimization only when there are multiple NUMA domains
-	 * among the online CPUs and the NUMA domains don't perfectly overlaps
-	 * with the LLC domains.
-	 *
-	 * If all CPUs belong to the same NUMA node and the same LLC domain,
-	 * enabling both NUMA and LLC optimizations is unnecessary, as checking
-	 * for an idle CPU in the same domain twice is redundant.
-	 */
-	nr_cpus = numa_weight(cpu);
-	if (nr_cpus > 0) {
-		if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
-			enable_numa = true;
-		pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
-			 cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
-	}
-	rcu_read_unlock();
-
-	pr_debug("sched_ext: LLC idle selection %s\n",
-		 str_enabled_disabled(enable_llc));
-	pr_debug("sched_ext: NUMA idle selection %s\n",
-		 str_enabled_disabled(enable_numa));
-
-	if (enable_llc)
-		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
-	else
-		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
-	if (enable_numa)
-		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
-	else
-		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
-}
-
-/*
- * Built-in CPU idle selection policy:
- *
- * 1. Prioritize full-idle cores:
- *   - always prioritize CPUs from fully idle cores (both logical CPUs are
- *     idle) to avoid interference caused by SMT.
- *
- * 2. Reuse the same CPU:
- *   - prefer the last used CPU to take advantage of cached data (L1, L2) and
- *     branch prediction optimizations.
- *
- * 3. Pick a CPU within the same LLC (Last-Level Cache):
- *   - if the above conditions aren't met, pick a CPU that shares the same LLC
- *     to maintain cache locality.
- *
- * 4. Pick a CPU within the same NUMA node, if enabled:
- *   - choose a CPU from the same NUMA node to reduce memory access latency.
- *
- * 5. Pick any idle CPU usable by the task.
- *
- * Step 3 and 4 are performed only if the system has, respectively, multiple
- * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
- * scx_selcpu_topo_numa).
- *
- * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
- * we never call ops.select_cpu() for them, see select_task_rq().
- */
-static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
-			      u64 wake_flags, bool *found)
-{
-	const struct cpumask *llc_cpus = NULL;
-	const struct cpumask *numa_cpus = NULL;
-	s32 cpu;
-
-	*found = false;
-
-	/*
-	 * This is necessary to protect llc_cpus.
-	 */
-	rcu_read_lock();
-
-	/*
-	 * Determine the scheduling domain only if the task is allowed to run
-	 * on all CPUs.
-	 *
-	 * This is done primarily for efficiency, as it avoids the overhead of
-	 * updating a cpumask every time we need to select an idle CPU (which
-	 * can be costly in large SMP systems), but it also aligns logically:
-	 * if a task's scheduling domain is restricted by user-space (through
-	 * CPU affinity), the task will simply use the flat scheduling domain
-	 * defined by user-space.
-	 */
-	if (p->nr_cpus_allowed >= num_possible_cpus()) {
-		if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
-			numa_cpus = numa_span(prev_cpu);
-
-		if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
-			llc_cpus = llc_span(prev_cpu);
-	}
-
-	/*
-	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
-	 */
-	if (wake_flags & SCX_WAKE_SYNC) {
-		cpu = smp_processor_id();
-
-		/*
-		 * If the waker's CPU is cache affine and prev_cpu is idle,
-		 * then avoid a migration.
-		 */
-		if (cpus_share_cache(cpu, prev_cpu) &&
-		    test_and_clear_cpu_idle(prev_cpu)) {
-			cpu = prev_cpu;
-			goto cpu_found;
-		}
-
-		/*
-		 * If the waker's local DSQ is empty, and the system is under
-		 * utilized, try to wake up @p to the local DSQ of the waker.
-		 *
-		 * Checking only for an empty local DSQ is insufficient as it
-		 * could give the wakee an unfair advantage when the system is
-		 * oversaturated.
-		 *
-		 * Checking only for the presence of idle CPUs is also
-		 * insufficient as the local DSQ of the waker could have tasks
-		 * piled up on it even if there is an idle core elsewhere on
-		 * the system.
-		 */
-		if (!cpumask_empty(idle_masks.cpu) &&
-		    !(current->flags & PF_EXITING) &&
-		    cpu_rq(cpu)->scx.local_dsq.nr == 0) {
-			if (cpumask_test_cpu(cpu, p->cpus_ptr))
-				goto cpu_found;
-		}
-	}
-
-	/*
-	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
-	 * partially idle @prev_cpu.
-	 */
-	if (sched_smt_active()) {
-		/*
-		 * Keep using @prev_cpu if it's part of a fully idle core.
-		 */
-		if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
-		    test_and_clear_cpu_idle(prev_cpu)) {
-			cpu = prev_cpu;
-			goto cpu_found;
-		}
-
-		/*
-		 * Search for any fully idle core in the same LLC domain.
-		 */
-		if (llc_cpus) {
-			cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
-			if (cpu >= 0)
-				goto cpu_found;
-		}
-
-		/*
-		 * Search for any fully idle core in the same NUMA node.
-		 */
-		if (numa_cpus) {
-			cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
-			if (cpu >= 0)
-				goto cpu_found;
-		}
-
-		/*
-		 * Search for any full idle core usable by the task.
-		 */
-		cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
-		if (cpu >= 0)
-			goto cpu_found;
-	}
-
-	/*
-	 * Use @prev_cpu if it's idle.
-	 */
-	if (test_and_clear_cpu_idle(prev_cpu)) {
-		cpu = prev_cpu;
-		goto cpu_found;
-	}
-
-	/*
-	 * Search for any idle CPU in the same LLC domain.
-	 */
-	if (llc_cpus) {
-		cpu = scx_pick_idle_cpu(llc_cpus, 0);
-		if (cpu >= 0)
-			goto cpu_found;
-	}
-
-	/*
-	 * Search for any idle CPU in the same NUMA node.
-	 */
-	if (numa_cpus) {
-		cpu = scx_pick_idle_cpu(numa_cpus, 0);
-		if (cpu >= 0)
-			goto cpu_found;
-	}
-
-	/*
-	 * Search for any idle CPU usable by the task.
-	 */
-	cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
-	if (cpu >= 0)
-		goto cpu_found;
-
-	rcu_read_unlock();
-	return prev_cpu;
-
-cpu_found:
-	rcu_read_unlock();
-
-	*found = true;
-	return cpu;
-}
-
 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
 {
+	bool rq_bypass;
+
 	/*
 	 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
 	 * can be a good migration opportunity with low cache and memory
@@ -3645,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 	if (unlikely(wake_flags & WF_EXEC))
 		return prev_cpu;
 
-	if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
+	rq_bypass = scx_rq_bypassing(task_rq(p));
+	if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
 		s32 cpu;
 		struct task_struct **ddsp_taskp;
 
@@ -3655,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 
 		cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
 					   select_cpu, p, prev_cpu, wake_flags);
+		p->scx.selected_cpu = cpu;
 		*ddsp_taskp = NULL;
 		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
 			return cpu;
 		else
 			return prev_cpu;
 	} else {
-		bool found;
 		s32 cpu;
 
-		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
-		if (found) {
+		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+		if (cpu >= 0) {
 			p->scx.slice = SCX_SLICE_DFL;
 			p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
+			__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+		} else {
+			cpu = prev_cpu;
 		}
+		p->scx.selected_cpu = cpu;
+
+		if (rq_bypass)
+			__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
 		return cpu;
 	}
 }
@@ -3696,90 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 				 (struct cpumask *)p->cpus_ptr);
 }
 
-static void reset_idle_masks(void)
-{
-	/*
-	 * Consider all online cpus idle. Should converge to the actual state
-	 * quickly.
-	 */
-	cpumask_copy(idle_masks.cpu, cpu_online_mask);
-	cpumask_copy(idle_masks.smt, cpu_online_mask);
-}
-
-static void update_builtin_idle(int cpu, bool idle)
-{
-	assign_cpu(cpu, idle_masks.cpu, idle);
-
-#ifdef CONFIG_SCHED_SMT
-	if (sched_smt_active()) {
-		const struct cpumask *smt = cpu_smt_mask(cpu);
-
-		if (idle) {
-			/*
-			 * idle_masks.smt handling is racy but that's fine as
-			 * it's only for optimization and self-correcting.
-			 */
-			if (!cpumask_subset(smt, idle_masks.cpu))
-				return;
-			cpumask_or(idle_masks.smt, idle_masks.smt, smt);
-		} else {
-			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
-		}
-	}
-#endif
-}
-
-/*
- * Update the idle state of a CPU to @idle.
- *
- * If @do_notify is true, ops.update_idle() is invoked to notify the scx
- * scheduler of an actual idle state transition (idle to busy or vice
- * versa). If @do_notify is false, only the idle state in the idle masks is
- * refreshed without invoking ops.update_idle().
- *
- * This distinction is necessary, because an idle CPU can be "reserved" and
- * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
- * busy even if no tasks are dispatched. In this case, the CPU may return
- * to idle without a true state transition. Refreshing the idle masks
- * without invoking ops.update_idle() ensures accurate idle state tracking
- * while avoiding unnecessary updates and maintaining balanced state
- * transitions.
- */
-void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
-{
-	int cpu = cpu_of(rq);
-
-	lockdep_assert_rq_held(rq);
-
-	/*
-	 * Trigger ops.update_idle() only when transitioning from a task to
-	 * the idle thread and vice versa.
-	 *
-	 * Idle transitions are indicated by do_notify being set to true,
-	 * managed by put_prev_task_idle()/set_next_task_idle().
-	 */
-	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
-		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-
-	/*
-	 * Update the idle masks:
-	 * - for real idle transitions (do_notify == true)
-	 * - for idle-to-idle transitions (indicated by the previous task
-	 *   being the idle thread, managed by pick_task_idle())
-	 *
-	 * Skip updating idle masks if the previous task is not the idle
-	 * thread, since set_next_task_idle() has already handled it when
-	 * transitioning from a task to the idle thread (calling this
-	 * function with do_notify == true).
-	 *
-	 * In this way we can avoid updating the idle masks twice,
-	 * unnecessarily.
-	 */
-	if (static_branch_likely(&scx_builtin_idle_enabled))
-		if (do_notify || is_idle_task(rq->curr))
-			update_builtin_idle(cpu, idle);
-}
-
 static void handle_hotplug(struct rq *rq, bool online)
 {
 	int cpu = cpu_of(rq);
@@ -3787,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online)
 	atomic_long_inc(&scx_hotplug_seq);
 
 	if (scx_enabled())
-		update_selcpu_topology();
+		scx_idle_update_selcpu_topology(&scx_ops);
 
 	if (online && SCX_HAS_OP(cpu_online))
 		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
@@ -3819,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq)
 	rq->scx.flags &= ~SCX_RQ_ONLINE;
 }
 
-#else	/* CONFIG_SMP */
-
-static bool test_and_clear_cpu_idle(int cpu) { return false; }
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
-static void reset_idle_masks(void) {}
-
 #endif	/* CONFIG_SMP */
 
 static bool check_rq_for_timeouts(struct rq *rq)
@@ -4749,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj,
 }
 SCX_ATTR(ops);
 
+#define scx_attr_event_show(buf, at, events, kind) ({				\
+	sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind);		\
+})
+
+static ssize_t scx_attr_events_show(struct kobject *kobj,
+				    struct kobj_attribute *ka, char *buf)
+{
+	struct scx_event_stats events;
+	int at = 0;
+
+	scx_bpf_events(&events, sizeof(events));
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
+	return at;
+}
+SCX_ATTR(events);
+
 static struct attribute *scx_sched_attrs[] = {
 	&scx_attr_ops.attr,
+	&scx_attr_events.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(scx_sched);
@@ -4862,6 +4535,8 @@ static void scx_clear_softlockup(void)
 static void scx_ops_bypass(bool bypass)
 {
 	static DEFINE_RAW_SPINLOCK(bypass_lock);
+	static unsigned long bypass_timestamp;
+
 	int cpu;
 	unsigned long flags;
 
@@ -4871,11 +4546,15 @@ static void scx_ops_bypass(bool bypass)
 		WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
 		if (scx_ops_bypass_depth != 1)
 			goto unlock;
+		bypass_timestamp = ktime_get_ns();
+		scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
 	} else {
 		scx_ops_bypass_depth--;
 		WARN_ON_ONCE(scx_ops_bypass_depth < 0);
 		if (scx_ops_bypass_depth != 0)
 			goto unlock;
+		scx_add_event(SCX_EV_BYPASS_DURATION,
+			      ktime_get_ns() - bypass_timestamp);
 	}
 
 	atomic_inc(&scx_ops_breather_depth);
@@ -5095,11 +4774,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	static_branch_disable(&__scx_ops_enabled);
 	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
 		static_branch_disable(&scx_has_op[i]);
+	static_branch_disable(&scx_ops_allow_queued_wakeup);
 	static_branch_disable(&scx_ops_enq_last);
 	static_branch_disable(&scx_ops_enq_exiting);
 	static_branch_disable(&scx_ops_enq_migration_disabled);
 	static_branch_disable(&scx_ops_cpu_preempt);
-	static_branch_disable(&scx_builtin_idle_enabled);
+	scx_idle_disable();
 	synchronize_rcu();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
@@ -5349,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		.at_jiffies = jiffies,
 	};
 	struct seq_buf s;
+	struct scx_event_stats events;
 	unsigned long flags;
 	char *buf;
 	int cpu;
@@ -5457,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		rq_unlock(rq, &rf);
 	}
 
+	dump_newline(&s);
+	dump_line(&s, "Event counters");
+	dump_line(&s, "--------------");
+
+	scx_bpf_events(&events, sizeof(events));
+	scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
+	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
+	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
+	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+	scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+	scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
+	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
+	scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
+
 	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
 		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
 		       trunc_marker, sizeof(trunc_marker));
@@ -5546,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops)
 		return -EINVAL;
 	}
 
+	/*
+	 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
+	 * selection policy to be enabled.
+	 */
+	if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
+	    (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
+		scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -5564,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	mutex_lock(&scx_ops_enable_mutex);
 
+	/*
+	 * Clear event counters so a new scx scheduler gets
+	 * fresh event counter values.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
+		memset(e, 0, sizeof(*e));
+	}
+
 	if (!scx_ops_helper) {
 		WRITE_ONCE(scx_ops_helper,
 			   scx_create_rt_helper("sched_ext_ops_helper"));
@@ -5661,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
 
 	check_hotplug_seq(ops);
-#ifdef CONFIG_SMP
-	update_selcpu_topology();
-#endif
+	scx_idle_update_selcpu_topology(ops);
+
 	cpus_read_unlock();
 
 	ret = validate_ops(ops);
@@ -5702,9 +5416,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable(&scx_has_op[i]);
 
+	if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+		static_branch_enable(&scx_ops_allow_queued_wakeup);
 	if (ops->flags & SCX_OPS_ENQ_LAST)
 		static_branch_enable(&scx_ops_enq_last);
-
 	if (ops->flags & SCX_OPS_ENQ_EXITING)
 		static_branch_enable(&scx_ops_enq_exiting);
 	if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
@@ -5712,12 +5427,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
 		static_branch_enable(&scx_ops_cpu_preempt);
 
-	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
-		reset_idle_masks();
-		static_branch_enable(&scx_builtin_idle_enabled);
-	} else {
-		static_branch_disable(&scx_builtin_idle_enabled);
-	}
+	scx_idle_enable(ops);
 
 	/*
 	 * Lock out forks, cgroup on/offlining and moves before opening the
@@ -6356,10 +6066,8 @@ void __init init_sched_ext_class(void)
 		   SCX_TG_ONLINE);
 
 	BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
-#ifdef CONFIG_SMP
-	BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
-	BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
-#endif
+	scx_idle_init_masks();
+
 	scx_kick_cpus_pnt_seqs =
 		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
 			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
@@ -6367,15 +6075,16 @@ void __init init_sched_ext_class(void)
 
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
+		int  n = cpu_to_node(cpu);
 
 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
 
-		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
-		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
-		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
-		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
+		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
+		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
+		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
 		init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
 		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
 
@@ -6392,65 +6101,6 @@ void __init init_sched_ext_class(void)
 /********************************************************************************
  * Helpers that can be called from the BPF scheduler.
  */
-#include <linux/btf_ids.h>
-
-__bpf_kfunc_start_defs();
-
-static bool check_builtin_idle_enabled(void)
-{
-	if (static_branch_likely(&scx_builtin_idle_enabled))
-		return true;
-
-	scx_ops_error("built-in idle tracking is disabled");
-	return false;
-}
-
-/**
- * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
- * @p: task_struct to select a CPU for
- * @prev_cpu: CPU @p was on previously
- * @wake_flags: %SCX_WAKE_* flags
- * @is_idle: out parameter indicating whether the returned CPU is idle
- *
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
- *
- * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
- * currently idle and thus a good candidate for direct dispatching.
- */
-__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
-				       u64 wake_flags, bool *is_idle)
-{
-	if (!ops_cpu_valid(prev_cpu, NULL))
-		goto prev_cpu;
-
-	if (!check_builtin_idle_enabled())
-		goto prev_cpu;
-
-	if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
-		goto prev_cpu;
-
-#ifdef CONFIG_SMP
-	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
-#endif
-
-prev_cpu:
-	*is_idle = false;
-	return prev_cpu;
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
-	.owner			= THIS_MODULE,
-	.set			= &scx_kfunc_ids_select_cpu,
-};
-
 static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
 {
 	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
@@ -6972,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
 		 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
 		 * the current local DSQ for running tasks and thus are not
 		 * visible to the BPF scheduler.
+		 *
+		 * Also skip re-enqueueing tasks that can only run on this
+		 * CPU, as they would just be re-added to the same local
+		 * DSQ without any benefit.
 		 */
-		if (p->migration_pending)
+		if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
 			continue;
 
 		dispatch_dequeue(rq, p);
@@ -7470,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
 }
 
 /**
+ * scx_bpf_nr_node_ids - Return the number of possible node IDs
+ *
+ * All valid node IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_node_ids(void)
+{
+	return nr_node_ids;
+}
+
+/**
  * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
  *
  * All valid CPU IDs in the system are smaller than the returned value.
@@ -7510,142 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
 }
 
 /**
- * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
- * per-CPU cpumask.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
-{
-	if (!check_builtin_idle_enabled())
-		return cpu_none_mask;
-
-#ifdef CONFIG_SMP
-	return idle_masks.cpu;
-#else
-	return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
- * per-physical-core cpumask. Can be used to determine if an entire physical
- * core is free.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
-{
-	if (!check_builtin_idle_enabled())
-		return cpu_none_mask;
-
-#ifdef CONFIG_SMP
-	if (sched_smt_active())
-		return idle_masks.smt;
-	else
-		return idle_masks.cpu;
-#else
-	return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
- * either the percpu, or SMT idle-tracking cpumask.
- * @idle_mask: &cpumask to use
- */
-__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
-{
-	/*
-	 * Empty function body because we aren't actually acquiring or releasing
-	 * a reference to a global idle cpumask, which is read-only in the
-	 * caller and is never released. The acquire / release semantics here
-	 * are just used to make the cpumask a trusted pointer in the caller.
-	 */
-}
-
-/**
- * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
- * @cpu: cpu to test and clear idle for
- *
- * Returns %true if @cpu was idle and its idle state was successfully cleared.
- * %false otherwise.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
-{
-	if (!check_builtin_idle_enabled())
-		return false;
-
-	if (ops_cpu_valid(cpu, NULL))
-		return test_and_clear_cpu_idle(cpu);
-	else
-		return false;
-}
-
-/**
- * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
- * number on success. -%EBUSY if no matching cpu was found.
- *
- * Idle CPU tracking may race against CPU scheduling state transitions. For
- * example, this function may return -%EBUSY as CPUs are transitioning into the
- * idle state. If the caller then assumes that there will be dispatch events on
- * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
- * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
- * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
- * event in the near future.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
-				      u64 flags)
-{
-	if (!check_builtin_idle_enabled())
-		return -EBUSY;
-
-	return scx_pick_idle_cpu(cpus_allowed, flags);
-}
-
-/**
- * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
- * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
- * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
- * empty.
- *
- * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
- * set, this function can't tell which CPUs are idle and will always pick any
- * CPU.
- */
-__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
-				     u64 flags)
-{
-	s32 cpu;
-
-	if (static_branch_likely(&scx_builtin_idle_enabled)) {
-		cpu = scx_pick_idle_cpu(cpus_allowed, flags);
-		if (cpu >= 0)
-			return cpu;
-	}
-
-	cpu = cpumask_any_distribute(cpus_allowed);
-	if (cpu < nr_cpu_ids)
-		return cpu;
-	else
-		return -EBUSY;
-}
-
-/**
  * scx_bpf_task_running - Is task currently running?
  * @p: task of interest
  */
@@ -7765,6 +7293,43 @@ __bpf_kfunc u64 scx_bpf_now(void)
 	return clock;
 }
 
+/*
+ * scx_bpf_events - Get a system-wide event counter to
+ * @events: output buffer from a BPF program
+ * @events__sz: @events len, must end in '__sz'' for the verifier
+ */
+__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
+				size_t events__sz)
+{
+	struct scx_event_stats e_sys, *e_cpu;
+	int cpu;
+
+	/* Aggregate per-CPU event counters into the system-wide counters. */
+	memset(&e_sys, 0, sizeof(e_sys));
+	for_each_possible_cpu(cpu) {
+		e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
+		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+	}
+
+	/*
+	 * We cannot entirely trust a BPF-provided size since a BPF program
+	 * might be compiled against a different vmlinux.h, of which
+	 * scx_event_stats would be larger (a newer vmlinux.h) or smaller
+	 * (an older vmlinux.h). Hence, we use the smaller size to avoid
+	 * memory corruption.
+	 */
+	events__sz = min(events__sz, sizeof(*events));
+	memcpy(events, &e_sys, events__sz);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7780,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
@@ -7797,6 +7363,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
 #endif
 BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(scx_kfunc_ids_any)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
@@ -7820,8 +7387,6 @@ static int __init scx_init(void)
 	 * check using scx_kf_allowed().
 	 */
 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
-					     &scx_kfunc_set_select_cpu)) ||
-	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_enqueue_dispatch)) ||
 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_dispatch)) ||
@@ -7841,6 +7406,12 @@ static int __init scx_init(void)
 		return ret;
 	}
 
+	ret = scx_idle_init();
+	if (ret) {
+		pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
+		return ret;
+	}
+
 	ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
 	if (ret) {
 		pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1079b56b0f7a..1bda96b19a1b 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,6 +8,8 @@
  */
 #ifdef CONFIG_SCHED_CLASS_EXT
 
+DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+
 void scx_tick(struct rq *rq);
 void init_scx_entity(struct sched_ext_entity *scx);
 void scx_pre_fork(struct task_struct *p);
@@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p)
 	return scx_enabled() && p->sched_class == &ext_sched_class;
 }
 
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+	return !scx_enabled() ||
+		static_branch_likely(&scx_ops_allow_queued_wakeup) ||
+		p->sched_class != &ext_sched_class;
+}
+
 #ifdef CONFIG_SCHED_CORE
 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 		   bool in_fi);
@@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {}
 static inline void scx_rq_deactivate(struct rq *rq) {}
 static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
 static inline bool task_on_scx(const struct task_struct *p) { return false; }
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
 static inline void init_sched_ext_class(void) {}
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
new file mode 100644
index 000000000000..52c36a70a3d0
--- /dev/null
+++ b/kernel/sched/ext_idle.c
@@ -0,0 +1,1171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Built-in idle CPU tracking policy.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#include "ext_idle.h"
+
+/* Enable/disable built-in idle CPU selection policy */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
+/* Enable/disable per-node idle cpumasks */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
+
+#ifdef CONFIG_SMP
+/* Enable/disable LLC aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+
+/* Enable/disable NUMA aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+
+/*
+ * cpumasks to track idle CPUs within each NUMA node.
+ *
+ * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask
+ * from is used to track all the idle CPUs in the system.
+ */
+struct scx_idle_cpus {
+	cpumask_var_t cpu;
+	cpumask_var_t smt;
+};
+
+/*
+ * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
+ * is not enabled).
+ */
+static struct scx_idle_cpus scx_idle_global_masks;
+
+/*
+ * Per-node idle cpumasks.
+ */
+static struct scx_idle_cpus **scx_idle_node_masks;
+
+/*
+ * Return the idle masks associated to a target @node.
+ *
+ * NUMA_NO_NODE identifies the global idle cpumask.
+ */
+static struct scx_idle_cpus *idle_cpumask(int node)
+{
+	return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node];
+}
+
+/*
+ * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if
+ * per-node idle cpumasks are disabled.
+ */
+static int scx_cpu_node_if_enabled(int cpu)
+{
+	if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
+		return NUMA_NO_NODE;
+
+	return cpu_to_node(cpu);
+}
+
+bool scx_idle_test_and_clear_cpu(int cpu)
+{
+	int node = scx_cpu_node_if_enabled(cpu);
+	struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+#ifdef CONFIG_SCHED_SMT
+	/*
+	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
+	 * cluster is not wholly idle either way. This also prevents
+	 * scx_pick_idle_cpu() from getting caught in an infinite loop.
+	 */
+	if (sched_smt_active()) {
+		const struct cpumask *smt = cpu_smt_mask(cpu);
+		struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+		/*
+		 * If offline, @cpu is not its own sibling and
+		 * scx_pick_idle_cpu() can get caught in an infinite loop as
+		 * @cpu is never cleared from the idle SMT mask. Ensure that
+		 * @cpu is eventually cleared.
+		 *
+		 * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+		 * reduce memory writes, which may help alleviate cache
+		 * coherence pressure.
+		 */
+		if (cpumask_intersects(smt, idle_smts))
+			cpumask_andnot(idle_smts, idle_smts, smt);
+		else if (cpumask_test_cpu(cpu, idle_smts))
+			__cpumask_clear_cpu(cpu, idle_smts);
+	}
+#endif
+
+	return cpumask_test_and_clear_cpu(cpu, idle_cpus);
+}
+
+/*
+ * Pick an idle CPU in a specific NUMA node.
+ */
+static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+	int cpu;
+
+retry:
+	if (sched_smt_active()) {
+		cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed);
+		if (cpu < nr_cpu_ids)
+			goto found;
+
+		if (flags & SCX_PICK_IDLE_CORE)
+			return -EBUSY;
+	}
+
+	cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
+	if (cpu >= nr_cpu_ids)
+		return -EBUSY;
+
+found:
+	if (scx_idle_test_and_clear_cpu(cpu))
+		return cpu;
+	else
+		goto retry;
+}
+
+/*
+ * Tracks nodes that have not yet been visited when searching for an idle
+ * CPU across all available nodes.
+ */
+static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited);
+
+/*
+ * Search for an idle CPU across all nodes, excluding @node.
+ */
+static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+	nodemask_t *unvisited;
+	s32 cpu = -EBUSY;
+
+	preempt_disable();
+	unvisited = this_cpu_ptr(&per_cpu_unvisited);
+
+	/*
+	 * Restrict the search to the online nodes (excluding the current
+	 * node that has been visited already).
+	 */
+	nodes_copy(*unvisited, node_states[N_ONLINE]);
+	node_clear(node, *unvisited);
+
+	/*
+	 * Traverse all nodes in order of increasing distance, starting
+	 * from @node.
+	 *
+	 * This loop is O(N^2), with N being the amount of NUMA nodes,
+	 * which might be quite expensive in large NUMA systems. However,
+	 * this complexity comes into play only when a scheduler enables
+	 * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU
+	 * without specifying a target NUMA node, so it shouldn't be a
+	 * bottleneck is most cases.
+	 *
+	 * As a future optimization we may want to cache the list of nodes
+	 * in a per-node array, instead of actually traversing them every
+	 * time.
+	 */
+	for_each_node_numadist(node, *unvisited) {
+		cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+		if (cpu >= 0)
+			break;
+	}
+	preempt_enable();
+
+	return cpu;
+}
+
+/*
+ * Find an idle CPU in the system, starting from @node.
+ */
+s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+	s32 cpu;
+
+	/*
+	 * Always search in the starting node first (this is an
+	 * optimization that can save some cycles even when the search is
+	 * not limited to a single node).
+	 */
+	cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+	if (cpu >= 0)
+		return cpu;
+
+	/*
+	 * Stop the search if we are using only a single global cpumask
+	 * (NUMA_NO_NODE) or if the search is restricted to the first node
+	 * only.
+	 */
+	if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE)
+		return -EBUSY;
+
+	/*
+	 * Extend the search to the other online nodes.
+	 */
+	return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags);
+}
+
+/*
+ * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
+ * domain is not defined).
+ */
+static unsigned int llc_weight(s32 cpu)
+{
+	struct sched_domain *sd;
+
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	if (!sd)
+		return 0;
+
+	return sd->span_weight;
+}
+
+/*
+ * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
+ * domain is not defined).
+ */
+static struct cpumask *llc_span(s32 cpu)
+{
+	struct sched_domain *sd;
+
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	if (!sd)
+		return 0;
+
+	return sched_domain_span(sd);
+}
+
+/*
+ * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
+ * NUMA domain is not defined).
+ */
+static unsigned int numa_weight(s32 cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg;
+
+	sd = rcu_dereference(per_cpu(sd_numa, cpu));
+	if (!sd)
+		return 0;
+	sg = sd->groups;
+	if (!sg)
+		return 0;
+
+	return sg->group_weight;
+}
+
+/*
+ * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
+ * domain is not defined).
+ */
+static struct cpumask *numa_span(s32 cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg;
+
+	sd = rcu_dereference(per_cpu(sd_numa, cpu));
+	if (!sd)
+		return NULL;
+	sg = sd->groups;
+	if (!sg)
+		return NULL;
+
+	return sched_group_span(sg);
+}
+
+/*
+ * Return true if the LLC domains do not perfectly overlap with the NUMA
+ * domains, false otherwise.
+ */
+static bool llc_numa_mismatch(void)
+{
+	int cpu;
+
+	/*
+	 * We need to scan all online CPUs to verify whether their scheduling
+	 * domains overlap.
+	 *
+	 * While it is rare to encounter architectures with asymmetric NUMA
+	 * topologies, CPU hotplugging or virtualized environments can result
+	 * in asymmetric configurations.
+	 *
+	 * For example:
+	 *
+	 *  NUMA 0:
+	 *    - LLC 0: cpu0..cpu7
+	 *    - LLC 1: cpu8..cpu15 [offline]
+	 *
+	 *  NUMA 1:
+	 *    - LLC 0: cpu16..cpu23
+	 *    - LLC 1: cpu24..cpu31
+	 *
+	 * In this case, if we only check the first online CPU (cpu0), we might
+	 * incorrectly assume that the LLC and NUMA domains are fully
+	 * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
+	 * domains).
+	 */
+	for_each_online_cpu(cpu)
+		if (llc_weight(cpu) != numa_weight(cpu))
+			return true;
+
+	return false;
+}
+
+/*
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
+ *
+ * Assumption: the kernel's internal topology representation assumes that each
+ * CPU belongs to a single LLC domain, and that each LLC domain is entirely
+ * contained within a single NUMA node.
+ */
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
+{
+	bool enable_llc = false, enable_numa = false;
+	unsigned int nr_cpus;
+	s32 cpu = cpumask_first(cpu_online_mask);
+
+	/*
+	 * Enable LLC domain optimization only when there are multiple LLC
+	 * domains among the online CPUs. If all online CPUs are part of a
+	 * single LLC domain, the idle CPU selection logic can choose any
+	 * online CPU without bias.
+	 *
+	 * Note that it is sufficient to check the LLC domain of the first
+	 * online CPU to determine whether a single LLC domain includes all
+	 * CPUs.
+	 */
+	rcu_read_lock();
+	nr_cpus = llc_weight(cpu);
+	if (nr_cpus > 0) {
+		if (nr_cpus < num_online_cpus())
+			enable_llc = true;
+		pr_debug("sched_ext: LLC=%*pb weight=%u\n",
+			 cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
+	}
+
+	/*
+	 * Enable NUMA optimization only when there are multiple NUMA domains
+	 * among the online CPUs and the NUMA domains don't perfectly overlaps
+	 * with the LLC domains.
+	 *
+	 * If all CPUs belong to the same NUMA node and the same LLC domain,
+	 * enabling both NUMA and LLC optimizations is unnecessary, as checking
+	 * for an idle CPU in the same domain twice is redundant.
+	 *
+	 * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA
+	 * optimization, as we would naturally select idle CPUs within
+	 * specific NUMA nodes querying the corresponding per-node cpumask.
+	 */
+	if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+		nr_cpus = numa_weight(cpu);
+		if (nr_cpus > 0) {
+			if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
+				enable_numa = true;
+			pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
+				 cpumask_pr_args(numa_span(cpu)), nr_cpus);
+		}
+	}
+	rcu_read_unlock();
+
+	pr_debug("sched_ext: LLC idle selection %s\n",
+		 str_enabled_disabled(enable_llc));
+	pr_debug("sched_ext: NUMA idle selection %s\n",
+		 str_enabled_disabled(enable_numa));
+
+	if (enable_llc)
+		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+	else
+		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+	if (enable_numa)
+		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+	else
+		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+}
+
+/*
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ *   - always prioritize CPUs from fully idle cores (both logical CPUs are
+ *     idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ *   - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ *     branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ *   - if the above conditions aren't met, pick a CPU that shares the same LLC
+ *     to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ *   - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * 5. Pick any idle CPU usable by the task.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively,
+ * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always
+ * begin in @prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ *
+ * Return the picked CPU if idle, or a negative value otherwise.
+ *
+ * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
+ * we never call ops.select_cpu() for them, see select_task_rq().
+ */
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+{
+	const struct cpumask *llc_cpus = NULL;
+	const struct cpumask *numa_cpus = NULL;
+	int node = scx_cpu_node_if_enabled(prev_cpu);
+	s32 cpu;
+
+	/*
+	 * This is necessary to protect llc_cpus.
+	 */
+	rcu_read_lock();
+
+	/*
+	 * Determine the scheduling domain only if the task is allowed to run
+	 * on all CPUs.
+	 *
+	 * This is done primarily for efficiency, as it avoids the overhead of
+	 * updating a cpumask every time we need to select an idle CPU (which
+	 * can be costly in large SMP systems), but it also aligns logically:
+	 * if a task's scheduling domain is restricted by user-space (through
+	 * CPU affinity), the task will simply use the flat scheduling domain
+	 * defined by user-space.
+	 */
+	if (p->nr_cpus_allowed >= num_possible_cpus()) {
+		if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+			numa_cpus = numa_span(prev_cpu);
+
+		if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
+			llc_cpus = llc_span(prev_cpu);
+	}
+
+	/*
+	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
+	 */
+	if (wake_flags & SCX_WAKE_SYNC) {
+		int waker_node;
+
+		/*
+		 * If the waker's CPU is cache affine and prev_cpu is idle,
+		 * then avoid a migration.
+		 */
+		cpu = smp_processor_id();
+		if (cpus_share_cache(cpu, prev_cpu) &&
+		    scx_idle_test_and_clear_cpu(prev_cpu)) {
+			cpu = prev_cpu;
+			goto out_unlock;
+		}
+
+		/*
+		 * If the waker's local DSQ is empty, and the system is under
+		 * utilized, try to wake up @p to the local DSQ of the waker.
+		 *
+		 * Checking only for an empty local DSQ is insufficient as it
+		 * could give the wakee an unfair advantage when the system is
+		 * oversaturated.
+		 *
+		 * Checking only for the presence of idle CPUs is also
+		 * insufficient as the local DSQ of the waker could have tasks
+		 * piled up on it even if there is an idle core elsewhere on
+		 * the system.
+		 */
+		waker_node = cpu_to_node(cpu);
+		if (!(current->flags & PF_EXITING) &&
+		    cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
+		    (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
+		    !cpumask_empty(idle_cpumask(waker_node)->cpu)) {
+			if (cpumask_test_cpu(cpu, p->cpus_ptr))
+				goto out_unlock;
+		}
+	}
+
+	/*
+	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
+	 * partially idle @prev_cpu.
+	 */
+	if (sched_smt_active()) {
+		/*
+		 * Keep using @prev_cpu if it's part of a fully idle core.
+		 */
+		if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+		    scx_idle_test_and_clear_cpu(prev_cpu)) {
+			cpu = prev_cpu;
+			goto out_unlock;
+		}
+
+		/*
+		 * Search for any fully idle core in the same LLC domain.
+		 */
+		if (llc_cpus) {
+			cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE);
+			if (cpu >= 0)
+				goto out_unlock;
+		}
+
+		/*
+		 * Search for any fully idle core in the same NUMA node.
+		 */
+		if (numa_cpus) {
+			cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE);
+			if (cpu >= 0)
+				goto out_unlock;
+		}
+
+		/*
+		 * Search for any full-idle core usable by the task.
+		 *
+		 * If the node-aware idle CPU selection policy is enabled
+		 * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always
+		 * begin in prev_cpu's node and proceed to other nodes in
+		 * order of increasing distance.
+		 */
+		cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+		if (cpu >= 0)
+			goto out_unlock;
+
+		/*
+		 * Give up if we're strictly looking for a full-idle SMT
+		 * core.
+		 */
+		if (flags & SCX_PICK_IDLE_CORE) {
+			cpu = prev_cpu;
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * Use @prev_cpu if it's idle.
+	 */
+	if (scx_idle_test_and_clear_cpu(prev_cpu)) {
+		cpu = prev_cpu;
+		goto out_unlock;
+	}
+
+	/*
+	 * Search for any idle CPU in the same LLC domain.
+	 */
+	if (llc_cpus) {
+		cpu = pick_idle_cpu_in_node(llc_cpus, node, 0);
+		if (cpu >= 0)
+			goto out_unlock;
+	}
+
+	/*
+	 * Search for any idle CPU in the same NUMA node.
+	 */
+	if (numa_cpus) {
+		cpu = pick_idle_cpu_in_node(numa_cpus, node, 0);
+		if (cpu >= 0)
+			goto out_unlock;
+	}
+
+	/*
+	 * Search for any idle CPU usable by the task.
+	 *
+	 * If the node-aware idle CPU selection policy is enabled
+	 * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin
+	 * in prev_cpu's node and proceed to other nodes in order of
+	 * increasing distance.
+	 */
+	cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+	if (cpu >= 0)
+		goto out_unlock;
+
+out_unlock:
+	rcu_read_unlock();
+
+	return cpu;
+}
+
+/*
+ * Initialize global and per-node idle cpumasks.
+ */
+void scx_idle_init_masks(void)
+{
+	int node;
+
+	/* Allocate global idle cpumasks */
+	BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
+
+	/* Allocate per-node idle cpumasks */
+	scx_idle_node_masks = kcalloc(num_possible_nodes(),
+				      sizeof(*scx_idle_node_masks), GFP_KERNEL);
+	BUG_ON(!scx_idle_node_masks);
+
+	for_each_node(node) {
+		scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
+							 GFP_KERNEL, node);
+		BUG_ON(!scx_idle_node_masks[node]);
+
+		BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
+		BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+	}
+}
+
+static void update_builtin_idle(int cpu, bool idle)
+{
+	int node = scx_cpu_node_if_enabled(cpu);
+	struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+	assign_cpu(cpu, idle_cpus, idle);
+
+#ifdef CONFIG_SCHED_SMT
+	if (sched_smt_active()) {
+		const struct cpumask *smt = cpu_smt_mask(cpu);
+		struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+		if (idle) {
+			/*
+			 * idle_smt handling is racy but that's fine as it's
+			 * only for optimization and self-correcting.
+			 */
+			if (!cpumask_subset(smt, idle_cpus))
+				return;
+			cpumask_or(idle_smts, idle_smts, smt);
+		} else {
+			cpumask_andnot(idle_smts, idle_smts, smt);
+		}
+	}
+#endif
+}
+
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+	int cpu = cpu_of(rq);
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * Trigger ops.update_idle() only when transitioning from a task to
+	 * the idle thread and vice versa.
+	 *
+	 * Idle transitions are indicated by do_notify being set to true,
+	 * managed by put_prev_task_idle()/set_next_task_idle().
+	 */
+	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+	/*
+	 * Update the idle masks:
+	 * - for real idle transitions (do_notify == true)
+	 * - for idle-to-idle transitions (indicated by the previous task
+	 *   being the idle thread, managed by pick_task_idle())
+	 *
+	 * Skip updating idle masks if the previous task is not the idle
+	 * thread, since set_next_task_idle() has already handled it when
+	 * transitioning from a task to the idle thread (calling this
+	 * function with do_notify == true).
+	 *
+	 * In this way we can avoid updating the idle masks twice,
+	 * unnecessarily.
+	 */
+	if (static_branch_likely(&scx_builtin_idle_enabled))
+		if (do_notify || is_idle_task(rq->curr))
+			update_builtin_idle(cpu, idle);
+}
+
+static void reset_idle_masks(struct sched_ext_ops *ops)
+{
+	int node;
+
+	/*
+	 * Consider all online cpus idle. Should converge to the actual state
+	 * quickly.
+	 */
+	if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+		cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask);
+		cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask);
+		return;
+	}
+
+	for_each_node(node) {
+		const struct cpumask *node_mask = cpumask_of_node(node);
+
+		cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask);
+		cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
+	}
+}
+#endif	/* CONFIG_SMP */
+
+void scx_idle_enable(struct sched_ext_ops *ops)
+{
+	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
+		static_branch_enable(&scx_builtin_idle_enabled);
+	else
+		static_branch_disable(&scx_builtin_idle_enabled);
+
+	if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
+		static_branch_enable(&scx_builtin_idle_per_node);
+	else
+		static_branch_disable(&scx_builtin_idle_per_node);
+
+#ifdef CONFIG_SMP
+	reset_idle_masks(ops);
+#endif
+}
+
+void scx_idle_disable(void)
+{
+	static_branch_disable(&scx_builtin_idle_enabled);
+	static_branch_disable(&scx_builtin_idle_per_node);
+}
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+
+static int validate_node(int node)
+{
+	if (!static_branch_likely(&scx_builtin_idle_per_node)) {
+		scx_ops_error("per-node idle tracking is disabled");
+		return -EOPNOTSUPP;
+	}
+
+	/* Return no entry for NUMA_NO_NODE (not a critical scx error) */
+	if (node == NUMA_NO_NODE)
+		return -ENOENT;
+
+	/* Make sure node is in a valid range */
+	if (node < 0 || node >= nr_node_ids) {
+		scx_ops_error("invalid node %d", node);
+		return -EINVAL;
+	}
+
+	/* Make sure the node is part of the set of possible nodes */
+	if (!node_possible(node)) {
+		scx_ops_error("unavailable node %d", node);
+		return -EINVAL;
+	}
+
+	return node;
+}
+
+__bpf_kfunc_start_defs();
+
+static bool check_builtin_idle_enabled(void)
+{
+	if (static_branch_likely(&scx_builtin_idle_enabled))
+		return true;
+
+	scx_ops_error("built-in idle tracking is disabled");
+	return false;
+}
+
+/**
+ * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
+ *		      trigger an error if @cpu is invalid
+ * @cpu: target CPU
+ */
+__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
+{
+#ifdef CONFIG_NUMA
+	if (!ops_cpu_valid(cpu, NULL))
+		return NUMA_NO_NODE;
+
+	return cpu_to_node(cpu);
+#else
+	return 0;
+#endif
+}
+
+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @is_idle: out parameter indicating whether the returned CPU is idle
+ *
+ * Can only be called from ops.select_cpu() if the built-in CPU selection is
+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
+ * currently idle and thus a good candidate for direct dispatching.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+				       u64 wake_flags, bool *is_idle)
+{
+#ifdef CONFIG_SMP
+	s32 cpu;
+#endif
+	if (!ops_cpu_valid(prev_cpu, NULL))
+		goto prev_cpu;
+
+	if (!check_builtin_idle_enabled())
+		goto prev_cpu;
+
+	if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
+		goto prev_cpu;
+
+#ifdef CONFIG_SMP
+	cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+	if (cpu >= 0) {
+		*is_idle = true;
+		return cpu;
+	}
+#endif
+
+prev_cpu:
+	*is_idle = false;
+	return prev_cpu;
+}
+
+/**
+ * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
+ * idle-tracking per-CPU cpumask of a target NUMA node.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or running on a UP kernel. In this case the actual error will
+ * be reported to the BPF scheduler via scx_ops_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
+{
+	node = validate_node(node);
+	if (node < 0)
+		return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+	return idle_cpumask(node)->cpu;
+#else
+	return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
+ * per-CPU cpumask.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a
+ * UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+{
+	if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+		scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+		return cpu_none_mask;
+	}
+
+	if (!check_builtin_idle_enabled())
+		return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+	return idle_cpumask(NUMA_NO_NODE)->cpu;
+#else
+	return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
+ * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
+ * used to determine if an entire physical core is free.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or running on a UP kernel. In this case the actual error will
+ * be reported to the BPF scheduler via scx_ops_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
+{
+	node = validate_node(node);
+	if (node < 0)
+		return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+	if (sched_smt_active())
+		return idle_cpumask(node)->smt;
+	else
+		return idle_cpumask(node)->cpu;
+#else
+	return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
+ * per-physical-core cpumask. Can be used to determine if an entire physical
+ * core is free.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a
+ * UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+{
+	if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+		scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+		return cpu_none_mask;
+	}
+
+	if (!check_builtin_idle_enabled())
+		return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+	if (sched_smt_active())
+		return idle_cpumask(NUMA_NO_NODE)->smt;
+	else
+		return idle_cpumask(NUMA_NO_NODE)->cpu;
+#else
+	return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
+ * either the percpu, or SMT idle-tracking cpumask.
+ * @idle_mask: &cpumask to use
+ */
+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
+{
+	/*
+	 * Empty function body because we aren't actually acquiring or releasing
+	 * a reference to a global idle cpumask, which is read-only in the
+	 * caller and is never released. The acquire / release semantics here
+	 * are just used to make the cpumask a trusted pointer in the caller.
+	 */
+}
+
+/**
+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
+ * @cpu: cpu to test and clear idle for
+ *
+ * Returns %true if @cpu was idle and its idle state was successfully cleared.
+ * %false otherwise.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+{
+	if (!check_builtin_idle_enabled())
+		return false;
+
+	if (ops_cpu_valid(cpu, NULL))
+		return scx_idle_test_and_clear_cpu(cpu);
+	else
+		return false;
+}
+
+/**
+ * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
+ *
+ * Returns the picked idle cpu number on success, or -%EBUSY if no matching
+ * cpu was found.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node).
+ *
+ * Always returns an error if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if
+ * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
+					   int node, u64 flags)
+{
+	node = validate_node(node);
+	if (node < 0)
+		return node;
+
+	return scx_pick_idle_cpu(cpus_allowed, node, flags);
+}
+
+/**
+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
+ * number on success. -%EBUSY if no matching cpu was found.
+ *
+ * Idle CPU tracking may race against CPU scheduling state transitions. For
+ * example, this function may return -%EBUSY as CPUs are transitioning into the
+ * idle state. If the caller then assumes that there will be dispatch events on
+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
+ * event in the near future.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_idle_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
+				      u64 flags)
+{
+	if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+		scx_ops_error("per-node idle tracking is enabled");
+		return -EBUSY;
+	}
+
+	if (!check_builtin_idle_enabled())
+		return -EBUSY;
+
+	return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+}
+
+/**
+ * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
+ *			       or pick any CPU from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node, regardless of
+ * the CPU idle state).
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
+					  int node, u64 flags)
+{
+	s32 cpu;
+
+	node = validate_node(node);
+	if (node < 0)
+		return node;
+
+	cpu = scx_pick_idle_cpu(cpus_allowed, node, flags);
+	if (cpu >= 0)
+		return cpu;
+
+	if (flags & SCX_PICK_IDLE_IN_NODE)
+		cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed);
+	else
+		cpu = cpumask_any_distribute(cpus_allowed);
+	if (cpu < nr_cpu_ids)
+		return cpu;
+	else
+		return -EBUSY;
+}
+
+/**
+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_any_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
+				     u64 flags)
+{
+	s32 cpu;
+
+	if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+		scx_ops_error("per-node idle tracking is enabled");
+		return -EBUSY;
+	}
+
+	if (static_branch_likely(&scx_builtin_idle_enabled)) {
+		cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+		if (cpu >= 0)
+			return cpu;
+	}
+
+	cpu = cpumask_any_distribute(cpus_allowed);
+	if (cpu < nr_cpu_ids)
+		return cpu;
+	else
+		return -EBUSY;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_idle)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_idle)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
+	.owner			= THIS_MODULE,
+	.set			= &scx_kfunc_ids_idle,
+};
+
+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
+	.owner			= THIS_MODULE,
+	.set			= &scx_kfunc_ids_select_cpu,
+};
+
+int scx_idle_init(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
+	      register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+	      register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
+	      register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
+
+	return ret;
+}
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
new file mode 100644
index 000000000000..511cc2221f7a
--- /dev/null
+++ b/kernel/sched/ext_idle.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#ifndef _KERNEL_SCHED_EXT_IDLE_H
+#define _KERNEL_SCHED_EXT_IDLE_H
+
+struct sched_ext_ops;
+
+#ifdef CONFIG_SMP
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
+void scx_idle_init_masks(void);
+bool scx_idle_test_and_clear_cpu(int cpu);
+s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
+#else /* !CONFIG_SMP */
+static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
+static inline void scx_idle_init_masks(void) {}
+static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
+static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+	return -EBUSY;
+}
+#endif /* CONFIG_SMP */
+
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
+void scx_idle_enable(struct sched_ext_ops *ops);
+void scx_idle_disable(void);
+int scx_idle_init(void);
+
+#endif /* _KERNEL_SCHED_EXT_IDLE_H */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bbaadbeeb291..a9eea051b0d6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state)
 }
 EXPORT_SYMBOL_GPL(numa_nearest_node);
 
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ *			   from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance(),
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+	for_each_node_mask(n, *mask) {
+		dist = node_distance(node, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
+
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
 	struct mempolicy *pol = p->mempolicy;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 7849405614b1..dc4333d23189 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -7,6 +7,13 @@
 #ifndef __SCX_COMMON_BPF_H
 #define __SCX_COMMON_BPF_H
 
+/*
+ * The generated kfunc prototypes in vmlinux.h are missing address space
+ * attributes which cause build failures. For now, suppress the generated
+ * prototypes. See https://github.com/sched-ext/scx/issues/1111.
+ */
+#define BPF_NO_KFUNC_PROTOTYPES
+
 #ifdef LSP
 #define __bpf__
 #include "../vmlinux.h"
@@ -18,6 +25,7 @@
 #include <bpf/bpf_tracing.h>
 #include <asm-generic/errno.h>
 #include "user_exit_info.h"
+#include "enum_defs.autogen.h"
 
 #define PF_WQ_WORKER			0x00000020	/* I'm a workqueue worker */
 #define PF_KTHREAD			0x00200000	/* I am a kernel thread */
@@ -62,21 +70,28 @@ void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym
 u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
 u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
 void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
+u32 scx_bpf_nr_node_ids(void) __ksym __weak;
 u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
+int scx_bpf_cpu_node(s32 cpu) __ksym __weak;
 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
 void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
+const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak;
 const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
+const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak;
 const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
 void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
 bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
+s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
 s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
+s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
 s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
 bool scx_bpf_task_running(const struct task_struct *p) __ksym;
 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
 struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
 u64 scx_bpf_now(void) __ksym __weak;
+void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
 
 /*
  * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
@@ -84,6 +99,9 @@ u64 scx_bpf_now(void) __ksym __weak;
  */
 #define BPF_FOR_EACH_ITER	(&___it)
 
+#define scx_read_event(e, name)							\
+	(bpf_core_field_exists((e)->name) ? (e)->name : 0)
+
 static inline __attribute__((format(printf, 1, 2)))
 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
 
@@ -584,6 +602,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	__u.__val;					\
 })
 
+#define READ_ONCE_ARENA(type, x)				\
+({								\
+	union { type __val; char __c[1]; } __u =		\
+		{ .__c = { 0 } };				\
+	__read_once_size((void *)&(x), __u.__c, sizeof(x));	\
+	__u.__val;						\
+})
+
+#define WRITE_ONCE_ARENA(type, x, val)				\
+({								\
+	union { type __val; char __c[1]; } __u =		\
+		{ .__val = (val) }; 				\
+	__write_once_size((void *)&(x), __u.__c, sizeof(x));	\
+	__u.__val;						\
+})
+
 /*
  * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
  * @v: The value for which we're computing the base 2 logarithm.
diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
index dc18b99e55cd..1dc76bd84296 100644
--- a/tools/sched_ext/include/scx/common.h
+++ b/tools/sched_ext/include/scx/common.h
@@ -16,6 +16,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <errno.h>
+#include "enum_defs.autogen.h"
 
 typedef uint8_t u8;
 typedef uint16_t u16;
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 50e1499ae093..9252e1a00556 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -125,12 +125,107 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
 	false;									\
 })
 
+/**
+ * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
+ * in a compatible way. We will preserve this __COMPAT helper until v6.16.
+ *
+ * @enq_flags: enqueue flags from ops.enqueue()
+ *
+ * Return: True if SCX_ENQ_CPU_SELECTED is turned on in @enq_flags
+ */
+static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags)
+{
+#ifdef HAVE_SCX_ENQ_CPU_SELECTED
+	/*
+	 * This is the case that a BPF code compiled against vmlinux.h
+	 * where the enum SCX_ENQ_CPU_SELECTED exists.
+	 */
+
+	/*
+	 * We should temporarily suspend the macro expansion of
+	 * 'SCX_ENQ_CPU_SELECTED'. This avoids 'SCX_ENQ_CPU_SELECTED' being
+	 * rewritten to '__SCX_ENQ_CPU_SELECTED' when 'SCX_ENQ_CPU_SELECTED'
+	 * is defined in 'scripts/gen_enums.py'.
+	 */
+#pragma push_macro("SCX_ENQ_CPU_SELECTED")
+#undef SCX_ENQ_CPU_SELECTED
+	u64 flag;
+
+	/*
+	 * When the kernel did not have SCX_ENQ_CPU_SELECTED,
+	 * select_task_rq_scx() has never been skipped. Thus, this case
+	 * should be considered that the CPU has already been selected.
+	 */
+	if (!bpf_core_enum_value_exists(enum scx_enq_flags,
+					SCX_ENQ_CPU_SELECTED))
+		return true;
+
+	flag = bpf_core_enum_value(enum scx_enq_flags, SCX_ENQ_CPU_SELECTED);
+	return enq_flags & flag;
+
+	/*
+	 * Once done, resume the macro expansion of 'SCX_ENQ_CPU_SELECTED'.
+	 */
+#pragma pop_macro("SCX_ENQ_CPU_SELECTED")
+#else
+	/*
+	 * This is the case that a BPF code compiled against vmlinux.h
+	 * where the enum SCX_ENQ_CPU_SELECTED does NOT exist.
+	 */
+	return true;
+#endif /* HAVE_SCX_ENQ_CPU_SELECTED */
+}
+
+
 #define scx_bpf_now()								\
 	(bpf_ksym_exists(scx_bpf_now) ?						\
 	 scx_bpf_now() :							\
 	 bpf_ktime_get_ns())
 
 /*
+ * v6.15: Introduce event counters.
+ *
+ * Preserve the following macro until v6.17.
+ */
+#define __COMPAT_scx_bpf_events(events, size)					\
+	(bpf_ksym_exists(scx_bpf_events) ?					\
+	 scx_bpf_events(events, size) : ({}))
+
+/*
+ * v6.15: Introduce NUMA-aware kfuncs to operate with per-node idle
+ * cpumasks.
+ *
+ * Preserve the following __COMPAT_scx_*_node macros until v6.17.
+ */
+#define __COMPAT_scx_bpf_nr_node_ids()						\
+	(bpf_ksym_exists(scx_bpf_nr_node_ids) ?					\
+	 scx_bpf_nr_node_ids() : 1U)
+
+#define __COMPAT_scx_bpf_cpu_node(cpu)						\
+	(bpf_ksym_exists(scx_bpf_cpu_node) ?					\
+	 scx_bpf_cpu_node(cpu) : 0)
+
+#define __COMPAT_scx_bpf_get_idle_cpumask_node(node)				\
+	(bpf_ksym_exists(scx_bpf_get_idle_cpumask_node) ?			\
+	 scx_bpf_get_idle_cpumask_node(node) :					\
+	 scx_bpf_get_idle_cpumask())						\
+
+#define __COMPAT_scx_bpf_get_idle_smtmask_node(node)				\
+	(bpf_ksym_exists(scx_bpf_get_idle_smtmask_node) ?			\
+	 scx_bpf_get_idle_smtmask_node(node) :					\
+	 scx_bpf_get_idle_smtmask())
+
+#define __COMPAT_scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags)		\
+	(bpf_ksym_exists(scx_bpf_pick_idle_cpu_node) ?				\
+	 scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) :		\
+	 scx_bpf_pick_idle_cpu(cpus_allowed, flags))
+
+#define __COMPAT_scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags)		\
+	(bpf_ksym_exists(scx_bpf_pick_any_cpu_node) ?				\
+	 scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) :			\
+	 scx_bpf_pick_any_cpu(cpus_allowed, flags))
+
+/*
  * Define sched_ext_ops. This may be expanded to define multiple variants for
  * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
  */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index b50280e2ba2b..35c67c5174ac 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -106,8 +106,20 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
 	return false;
 }
 
-#define SCX_OPS_SWITCH_PARTIAL							\
-	__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
+#define SCX_OPS_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_ops_flags", #name)
+
+#define SCX_OPS_KEEP_BUILTIN_IDLE SCX_OPS_FLAG(SCX_OPS_KEEP_BUILTIN_IDLE)
+#define SCX_OPS_ENQ_LAST SCX_OPS_FLAG(SCX_OPS_ENQ_LAST)
+#define SCX_OPS_ENQ_EXITING  SCX_OPS_FLAG(SCX_OPS_ENQ_EXITING)
+#define SCX_OPS_SWITCH_PARTIAL SCX_OPS_FLAG(SCX_OPS_SWITCH_PARTIAL)
+#define SCX_OPS_ENQ_MIGRATION_DISABLED SCX_OPS_FLAG(SCX_OPS_ENQ_MIGRATION_DISABLED)
+#define SCX_OPS_ALLOW_QUEUED_WAKEUP SCX_OPS_FLAG(SCX_OPS_ALLOW_QUEUED_WAKEUP)
+#define SCX_OPS_BUILTIN_IDLE_PER_NODE SCX_OPS_FLAG(SCX_OPS_BUILTIN_IDLE_PER_NODE)
+
+#define SCX_PICK_IDLE_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", #name)
+
+#define SCX_PICK_IDLE_CORE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_CORE)
+#define SCX_PICK_IDLE_IN_NODE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_NODE)
 
 static inline long scx_hotplug_seq(void)
 {
diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
new file mode 100644
index 000000000000..6e6c45f14fe1
--- /dev/null
+++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
@@ -0,0 +1,120 @@
+/*
+ * WARNING: This file is autogenerated from gen_enum_defs.py [1].
+ *
+ * [1] https://github.com/sched-ext/scx/blob/main/scripts/gen_enum_defs.py
+ */
+
+#ifndef __ENUM_DEFS_AUTOGEN_H__
+#define __ENUM_DEFS_AUTOGEN_H__
+
+#define HAVE_SCX_DSP_DFL_MAX_BATCH
+#define HAVE_SCX_DSP_MAX_LOOPS
+#define HAVE_SCX_WATCHDOG_MAX_TIMEOUT
+#define HAVE_SCX_EXIT_BT_LEN
+#define HAVE_SCX_EXIT_MSG_LEN
+#define HAVE_SCX_EXIT_DUMP_DFL_LEN
+#define HAVE_SCX_CPUPERF_ONE
+#define HAVE_SCX_OPS_TASK_ITER_BATCH
+#define HAVE_SCX_CPU_PREEMPT_RT
+#define HAVE_SCX_CPU_PREEMPT_DL
+#define HAVE_SCX_CPU_PREEMPT_STOP
+#define HAVE_SCX_CPU_PREEMPT_UNKNOWN
+#define HAVE_SCX_DEQ_SLEEP
+#define HAVE_SCX_DEQ_CORE_SCHED_EXEC
+#define HAVE_SCX_DSQ_FLAG_BUILTIN
+#define HAVE_SCX_DSQ_FLAG_LOCAL_ON
+#define HAVE_SCX_DSQ_INVALID
+#define HAVE_SCX_DSQ_GLOBAL
+#define HAVE_SCX_DSQ_LOCAL
+#define HAVE_SCX_DSQ_LOCAL_ON
+#define HAVE_SCX_DSQ_LOCAL_CPU_MASK
+#define HAVE_SCX_DSQ_ITER_REV
+#define HAVE___SCX_DSQ_ITER_HAS_SLICE
+#define HAVE___SCX_DSQ_ITER_HAS_VTIME
+#define HAVE___SCX_DSQ_ITER_USER_FLAGS
+#define HAVE___SCX_DSQ_ITER_ALL_FLAGS
+#define HAVE_SCX_DSQ_LNODE_ITER_CURSOR
+#define HAVE___SCX_DSQ_LNODE_PRIV_SHIFT
+#define HAVE_SCX_ENQ_WAKEUP
+#define HAVE_SCX_ENQ_HEAD
+#define HAVE_SCX_ENQ_CPU_SELECTED
+#define HAVE_SCX_ENQ_PREEMPT
+#define HAVE_SCX_ENQ_REENQ
+#define HAVE_SCX_ENQ_LAST
+#define HAVE___SCX_ENQ_INTERNAL_MASK
+#define HAVE_SCX_ENQ_CLEAR_OPSS
+#define HAVE_SCX_ENQ_DSQ_PRIQ
+#define HAVE_SCX_TASK_DSQ_ON_PRIQ
+#define HAVE_SCX_TASK_QUEUED
+#define HAVE_SCX_TASK_RESET_RUNNABLE_AT
+#define HAVE_SCX_TASK_DEQD_FOR_SLEEP
+#define HAVE_SCX_TASK_STATE_SHIFT
+#define HAVE_SCX_TASK_STATE_BITS
+#define HAVE_SCX_TASK_STATE_MASK
+#define HAVE_SCX_TASK_CURSOR
+#define HAVE_SCX_ECODE_RSN_HOTPLUG
+#define HAVE_SCX_ECODE_ACT_RESTART
+#define HAVE_SCX_EXIT_NONE
+#define HAVE_SCX_EXIT_DONE
+#define HAVE_SCX_EXIT_UNREG
+#define HAVE_SCX_EXIT_UNREG_BPF
+#define HAVE_SCX_EXIT_UNREG_KERN
+#define HAVE_SCX_EXIT_SYSRQ
+#define HAVE_SCX_EXIT_ERROR
+#define HAVE_SCX_EXIT_ERROR_BPF
+#define HAVE_SCX_EXIT_ERROR_STALL
+#define HAVE_SCX_KF_UNLOCKED
+#define HAVE_SCX_KF_CPU_RELEASE
+#define HAVE_SCX_KF_DISPATCH
+#define HAVE_SCX_KF_ENQUEUE
+#define HAVE_SCX_KF_SELECT_CPU
+#define HAVE_SCX_KF_REST
+#define HAVE___SCX_KF_RQ_LOCKED
+#define HAVE___SCX_KF_TERMINAL
+#define HAVE_SCX_KICK_IDLE
+#define HAVE_SCX_KICK_PREEMPT
+#define HAVE_SCX_KICK_WAIT
+#define HAVE_SCX_OPI_BEGIN
+#define HAVE_SCX_OPI_NORMAL_BEGIN
+#define HAVE_SCX_OPI_NORMAL_END
+#define HAVE_SCX_OPI_CPU_HOTPLUG_BEGIN
+#define HAVE_SCX_OPI_CPU_HOTPLUG_END
+#define HAVE_SCX_OPI_END
+#define HAVE_SCX_OPS_ENABLING
+#define HAVE_SCX_OPS_ENABLED
+#define HAVE_SCX_OPS_DISABLING
+#define HAVE_SCX_OPS_DISABLED
+#define HAVE_SCX_OPS_KEEP_BUILTIN_IDLE
+#define HAVE_SCX_OPS_ENQ_LAST
+#define HAVE_SCX_OPS_ENQ_EXITING
+#define HAVE_SCX_OPS_SWITCH_PARTIAL
+#define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT
+#define HAVE_SCX_OPS_ALL_FLAGS
+#define HAVE_SCX_OPSS_NONE
+#define HAVE_SCX_OPSS_QUEUEING
+#define HAVE_SCX_OPSS_QUEUED
+#define HAVE_SCX_OPSS_DISPATCHING
+#define HAVE_SCX_OPSS_QSEQ_SHIFT
+#define HAVE_SCX_PICK_IDLE_CORE
+#define HAVE_SCX_OPS_NAME_LEN
+#define HAVE_SCX_SLICE_DFL
+#define HAVE_SCX_SLICE_INF
+#define HAVE_SCX_RQ_ONLINE
+#define HAVE_SCX_RQ_CAN_STOP_TICK
+#define HAVE_SCX_RQ_BAL_PENDING
+#define HAVE_SCX_RQ_BAL_KEEP
+#define HAVE_SCX_RQ_BYPASSING
+#define HAVE_SCX_RQ_IN_WAKEUP
+#define HAVE_SCX_RQ_IN_BALANCE
+#define HAVE_SCX_TASK_NONE
+#define HAVE_SCX_TASK_INIT
+#define HAVE_SCX_TASK_READY
+#define HAVE_SCX_TASK_ENABLED
+#define HAVE_SCX_TASK_NR_STATES
+#define HAVE_SCX_TG_ONLINE
+#define HAVE_SCX_TG_INITED
+#define HAVE_SCX_WAKE_FORK
+#define HAVE_SCX_WAKE_TTWU
+#define HAVE_SCX_WAKE_SYNC
+
+#endif /* __ENUM_DEFS_AUTOGEN_H__ */
diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 1e9f74525d8f..6ba6e610eeaa 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -10,6 +10,7 @@
 #include <unistd.h>
 #include <inttypes.h>
 #include <signal.h>
+#include <assert.h>
 #include <libgen.h>
 #include <bpf/bpf.h>
 #include <scx/common.h>
@@ -60,14 +61,22 @@ restart:
 	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
 	skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
 
+	assert(skel->rodata->nr_cpu_ids <= INT32_MAX);
+
 	while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
 			break;
-		case 'c':
-			skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
+		case 'c': {
+			u32 central_cpu = strtoul(optarg, NULL, 0);
+			if (central_cpu >= skel->rodata->nr_cpu_ids) {
+				fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids);
+				return -1;
+			}
+			skel->rodata->central_cpu = (s32)central_cpu;
 			break;
+		}
 		case 'v':
 			verbose = true;
 			break;
@@ -96,7 +105,7 @@ restart:
 	 */
 	cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
 	SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
-	CPU_ZERO(cpuset);
+	CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset);
 	CPU_SET(skel->rodata->central_cpu, cpuset);
 	SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset),
 		   "Failed to affinitize to central CPU %d (max %d)",
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 3a20bb0c014a..26c40ca4f36c 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -231,7 +231,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	}
 
 	/* if select_cpu() wasn't called, try direct dispatch */
-	if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
+	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
 		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
@@ -763,6 +763,8 @@ static void dump_shared_dsq(void)
 
 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 {
+	struct scx_event_stats events;
+
 	bpf_rcu_read_lock();
 	dispatch_highpri(true);
 	bpf_rcu_read_unlock();
@@ -772,6 +774,25 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 	if (print_shared_dsq)
 		dump_shared_dsq();
 
+	__COMPAT_scx_bpf_events(&events, sizeof(events));
+
+	bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
+		   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
+	bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
+		   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
+	bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
+		   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
+	bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
+		   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
+	bpf_printk("%35s: %lld", "SCX_EV_ENQ_SLICE_DFL",
+		   scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL));
+	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
+		   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
+	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
+		   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
+	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
+		   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
+
 	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
 	return 0;
 }
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 011762224600..f4531327b8e7 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -172,6 +172,7 @@ auto-test-targets :=			\
 	maximal				\
 	maybe_null			\
 	minimal				\
+	numa				\
 	prog_run			\
 	reload_loop			\
 	select_cpu_dfl			\
diff --git a/tools/testing/selftests/sched_ext/numa.bpf.c b/tools/testing/selftests/sched_ext/numa.bpf.c
new file mode 100644
index 000000000000..a79d86ed54a1
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/numa.bpf.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates the behavior of the NUMA-aware
+ * functionalities.
+ *
+ * The scheduler creates a separate DSQ for each NUMA node, ensuring tasks
+ * are exclusively processed by CPUs within their respective nodes. Idle
+ * CPUs are selected only within the same node, so task migration can only
+ * occurs between CPUs belonging to the same node.
+ *
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+const volatile unsigned int __COMPAT_SCX_PICK_IDLE_IN_NODE;
+
+static bool is_cpu_idle(s32 cpu, int node)
+{
+	const struct cpumask *idle_cpumask;
+	bool idle;
+
+	idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node);
+	idle = bpf_cpumask_test_cpu(cpu, idle_cpumask);
+	scx_bpf_put_cpumask(idle_cpumask);
+
+	return idle;
+}
+
+s32 BPF_STRUCT_OPS(numa_select_cpu,
+		   struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+	int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
+	s32 cpu;
+
+	/*
+	 * We could just use __COMPAT_scx_bpf_pick_any_cpu_node() here,
+	 * since it already tries to pick an idle CPU within the node
+	 * first, but let's use both functions for better testing coverage.
+	 */
+	cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
+					__COMPAT_SCX_PICK_IDLE_IN_NODE);
+	if (cpu < 0)
+		cpu = __COMPAT_scx_bpf_pick_any_cpu_node(p->cpus_ptr, node,
+						__COMPAT_SCX_PICK_IDLE_IN_NODE);
+
+	if (is_cpu_idle(cpu, node))
+		scx_bpf_error("CPU %d should be marked as busy", cpu);
+
+	if (__COMPAT_scx_bpf_cpu_node(cpu) != node)
+		scx_bpf_error("CPU %d should be in node %d", cpu, node);
+
+	return cpu;
+}
+
+void BPF_STRUCT_OPS(numa_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
+
+	scx_bpf_dsq_insert(p, node, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev)
+{
+	int node = __COMPAT_scx_bpf_cpu_node(cpu);
+
+	scx_bpf_dsq_move_to_local(node);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init)
+{
+	int node, err;
+
+	bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
+		err = scx_bpf_create_dsq(node, node);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(numa_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops numa_ops = {
+	.select_cpu		= (void *)numa_select_cpu,
+	.enqueue		= (void *)numa_enqueue,
+	.dispatch		= (void *)numa_dispatch,
+	.init			= (void *)numa_init,
+	.exit			= (void *)numa_exit,
+	.name			= "numa",
+};
diff --git a/tools/testing/selftests/sched_ext/numa.c b/tools/testing/selftests/sched_ext/numa.c
new file mode 100644
index 000000000000..b060c3b65c82
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/numa.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "numa.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct numa *skel;
+
+	skel = numa__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+	SCX_ENUM_INIT(skel);
+	skel->rodata->__COMPAT_SCX_PICK_IDLE_IN_NODE = SCX_PICK_IDLE_IN_NODE;
+	skel->struct_ops.numa_ops->flags = SCX_OPS_BUILTIN_IDLE_PER_NODE;
+	SCX_FAIL_IF(numa__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct numa *skel = ctx;
+	struct bpf_link *link;
+
+	link = bpf_map__attach_struct_ops(skel->maps.numa_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	/* Just sleeping is fine, plenty of scheduling events happening */
+	sleep(1);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct numa *skel = ctx;
+
+	numa__destroy(skel);
+}
+
+struct scx_test numa = {
+	.name = "numa",
+	.description = "Verify NUMA-aware functionalities",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&numa)
author	Linus Torvalds <torvalds@linux-foundation.org>	2025-03-24 17:23:48 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2025-03-24 17:23:48 -0700
commit	bcb044256d3f5d9f5bb61d1eac6492f77883bd60 (patch)
tree	0c0c48beef0742ac47a5e136fe0ecbcea66b6871
parent	94dc216ad848ebee06ce7692fcfcbb2e9b3e643c (diff)
parent	e4855fc90e52efef7e3926205c8dc53ce39b6138 (diff)