summaryrefslogtreecommitdiff
path: root/tools/sched_ext
diff options
context:
space:
mode:
Diffstat (limited to 'tools/sched_ext')
-rw-r--r--tools/sched_ext/Makefile4
-rw-r--r--tools/sched_ext/include/scx/common.bpf.h15
-rw-r--r--tools/sched_ext/include/scx/compat.bpf.h314
-rw-r--r--tools/sched_ext/include/scx/compat.h14
-rw-r--r--tools/sched_ext/scx_cpu0.bpf.c88
-rw-r--r--tools/sched_ext/scx_cpu0.c106
-rw-r--r--tools/sched_ext/scx_flatcg.bpf.c10
-rw-r--r--tools/sched_ext/scx_qmap.bpf.c52
8 files changed, 474 insertions, 129 deletions
diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
index d68780e2e03d..e4bda2474060 100644
--- a/tools/sched_ext/Makefile
+++ b/tools/sched_ext/Makefile
@@ -133,6 +133,7 @@ $(MAKE_DIRS):
$(call msg,MKDIR,,$@)
$(Q)mkdir -p $@
+ifneq ($(CROSS_COMPILE),)
$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \
| $(OBJ_DIR)/libbpf
@@ -141,6 +142,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
EXTRA_CFLAGS='-g -O0 -fPIC' \
LDFLAGS="$(LDFLAGS)" \
DESTDIR=$(OUTPUT_DIR) prefix= all install_headers
+endif
$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \
@@ -187,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
-c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
$(addprefix $(BINDIR)/,$(c-sched-targets)): \
$(BINDIR)/%: \
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 06e2551033cb..821d5791bd42 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -60,21 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void)
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
-s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
- const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
-void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
-void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
+ struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
+bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym;
-bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak;
-void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
-void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
-bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
@@ -105,7 +99,6 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct rq *scx_bpf_locked_rq(void) __ksym;
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
-struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index dd9144624dc9..f2969c3061a7 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -16,119 +16,92 @@
})
/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
-#define __COMPAT_scx_bpf_task_cgroup(p) \
- (bpf_ksym_exists(scx_bpf_task_cgroup) ? \
- scx_bpf_task_cgroup((p)) : NULL)
+struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak;
+
+#define scx_bpf_task_cgroup(p) \
+ (bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \
+ scx_bpf_task_cgroup___new((p)) : NULL)
/*
* v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are
* renamed to unload the verb.
*
- * Build error is triggered if old names are used. New binaries work with both
- * new and old names. The compat macros will be removed on v6.15 release.
- *
* scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by
* 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()").
- * Preserve __COMPAT macros until v6.15.
*/
-void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
-void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
-bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak;
-void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
-void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
-bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
-
-#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \
- (bpf_ksym_exists(scx_bpf_dsq_insert) ? \
- scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \
- scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags)))
-
-#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \
- (bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \
- scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \
- scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags)))
+bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak;
+void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
+void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
+bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+
+bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak;
+void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
+void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
+bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
#define scx_bpf_dsq_move_to_local(dsq_id) \
- (bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \
- scx_bpf_dsq_move_to_local((dsq_id)) : \
- scx_bpf_consume___compat((dsq_id)))
-
-#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \
- (bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \
- scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \
- (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \
- scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \
+ (bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \
+ scx_bpf_dsq_move_to_local___new((dsq_id)) : \
+ scx_bpf_consume___old((dsq_id)))
+
+#define scx_bpf_dsq_move_set_slice(it__iter, slice) \
+ (bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \
+ scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \
+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \
+ scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \
+ (void)0))
+
+#define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
+ (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \
+ scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \
+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \
+ scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \
(void)0))
-#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
- (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \
- scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \
- (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \
- scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \
- (void) 0))
-
-#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
- (bpf_ksym_exists(scx_bpf_dsq_move) ? \
- scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \
- (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \
- scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
+#define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
+ (bpf_ksym_exists(scx_bpf_dsq_move___new) ? \
+ scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \
+ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \
+ scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
false))
-#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
- (bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \
- scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \
- (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \
- scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
+#define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
+ (bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \
+ scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \
+ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \
+ scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
false))
+/*
+ * v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits")
+ *
+ * Compat macro will be dropped on v6.19 release.
+ */
+int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
+
#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
(bpf_ksym_exists(bpf_cpumask_populate) ? \
(bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
-#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \
- _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
-
-#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \
- _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()")
-
-#define scx_bpf_consume(dsq_id) ({ \
- _Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \
- false; \
-})
-
-#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
- _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()")
-
-#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
- _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()")
-
-#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
- _Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \
- false; \
-})
-
-#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
- _Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \
- false; \
-})
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
- _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()")
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
- _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()")
-
-#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
- _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \
- false; \
-})
-
-#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
- _Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \
- false; \
-})
+/*
+ * v6.19: Introduce lockless peek API for user DSQs.
+ *
+ * Preserve the following macro until v6.21.
+ */
+static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
+{
+ struct task_struct *p = NULL;
+ struct bpf_iter_scx_dsq it;
+
+ if (bpf_ksym_exists(scx_bpf_dsq_peek))
+ return scx_bpf_dsq_peek(dsq_id);
+ if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
+ p = bpf_iter_scx_dsq_next(&it);
+ bpf_iter_scx_dsq_destroy(&it);
+ return p;
+}
/**
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
@@ -248,6 +221,161 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
}
/*
+ * v6.19: To work around BPF maximum parameter limit, the following kfuncs are
+ * replaced with variants that pack scalar arguments in a struct. Wrappers are
+ * provided to maintain source compatibility.
+ *
+ * v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the
+ * block on dispatch renaming above for more details.
+ *
+ * The kernel will carry the compat variants until v6.23 to maintain binary
+ * compatibility. After v6.23 release, remove the compat handling and move the
+ * wrappers to common.bpf.h.
+ */
+s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
+void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
+
+/**
+ * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @flags: %SCX_PICK_IDLE* flags
+ *
+ * Inline wrapper that packs scalar arguments into a struct and calls
+ * __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details.
+ */
+static inline s32
+scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) {
+ struct scx_bpf_select_cpu_and_args args = {
+ .prev_cpu = prev_cpu,
+ .wake_flags = wake_flags,
+ .flags = flags,
+ };
+
+ return __scx_bpf_select_cpu_and(p, cpus_allowed, &args);
+ } else {
+ return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags,
+ cpus_allowed, flags);
+ }
+}
+
+/**
+ * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Inline wrapper that packs scalar arguments into a struct and calls
+ * __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details.
+ */
+static inline bool
+scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime,
+ u64 enq_flags)
+{
+ if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) {
+ struct scx_bpf_dsq_insert_vtime_args args = {
+ .dsq_id = dsq_id,
+ .slice = slice,
+ .vtime = vtime,
+ .enq_flags = enq_flags,
+ };
+
+ return __scx_bpf_dsq_insert_vtime(p, &args);
+ } else if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat)) {
+ scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime,
+ enq_flags);
+ return true;
+ } else {
+ scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime,
+ enq_flags);
+ return true;
+ }
+}
+
+/*
+ * v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move
+ * scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22.
+ * The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on
+ * kernel side. The entire suffix can be dropped later.
+ *
+ * v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on
+ * dispatch renaming above for more details.
+ */
+bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
+
+static inline bool
+scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags)
+{
+ if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat)) {
+ return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice, enq_flags);
+ } else if (bpf_ksym_exists(scx_bpf_dsq_insert___v1)) {
+ scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags);
+ return true;
+ } else {
+ scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags);
+ return true;
+ }
+}
+
+/*
+ * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added to for
+ * sub-sched authority checks. Drop the wrappers and move the decls to
+ * common.bpf.h after v6.22.
+ */
+bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak;
+bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak;
+
+static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
+{
+ if (bpf_ksym_exists(scx_bpf_task_set_slice___new))
+ scx_bpf_task_set_slice___new(p, slice);
+ else
+ p->scx.slice = slice;
+}
+
+static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
+{
+ if (bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new))
+ scx_bpf_task_set_dsq_vtime___new(p, vtime);
+ else
+ p->scx.dsq_vtime = vtime;
+}
+
+/*
+ * v6.19: The new void variant can be called from anywhere while the older v1
+ * variant can only be called from ops.cpu_release(). The double ___ prefixes on
+ * the v2 variant need to be removed once libbpf is updated to ignore ___ prefix
+ * on kernel side. Drop the wrapper and move the decl to common.bpf.h after
+ * v6.22.
+ */
+u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak;
+void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak;
+
+static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void)
+{
+ return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat);
+}
+
+static inline void scx_bpf_reenqueue_local(void)
+{
+ if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+ scx_bpf_reenqueue_local___v2___compat();
+ else
+ scx_bpf_reenqueue_local___v1();
+}
+
+/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
*/
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 35c67c5174ac..8b4897fc8b99 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void)
*
* ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
* the current minimum required kernel version.
+ *
+ * COMPAT:
+ * - v6.17: ops.cgroup_set_bandwidth()
+ * - v6.19: ops.cgroup_set_idle()
*/
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \
@@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void)
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
SCX_ENUM_INIT(__skel); \
+ if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \
+ !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \
+ fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \
+ __skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \
+ } \
+ if (__skel->struct_ops.__ops_name->cgroup_set_idle && \
+ !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \
+ fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \
+ __skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \
+ } \
__skel; \
})
diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c
new file mode 100644
index 000000000000..6326ce598c8e
--- /dev/null
+++ b/tools/sched_ext/scx_cpu0.bpf.c
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A CPU0 scheduler.
+ *
+ * This scheduler queues all tasks to a shared DSQ and only dispatches them on
+ * CPU0 in FIFO order. This is useful for testing bypass behavior when many
+ * tasks are concentrated on a single CPU. If the load balancer doesn't work,
+ * bypass mode can trigger task hangs or RCU stalls as the queue is long and
+ * there's only one CPU working on it.
+ *
+ * - Statistics tracking how many tasks are queued to local and CPU0 DSQs.
+ * - Termination notification for userspace.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
+
+UEI_DEFINE(uei);
+
+/*
+ * We create a custom DSQ with ID 0 that we dispatch to and consume from on
+ * CPU0.
+ */
+#define DSQ_CPU0 0
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u64));
+ __uint(max_entries, 2); /* [local, cpu0] */
+} stats SEC(".maps");
+
+static void stat_inc(u32 idx)
+{
+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
+ if (cnt_p)
+ (*cnt_p)++;
+}
+
+s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+ return 0;
+}
+
+void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ /*
+ * select_cpu() always picks CPU0. If @p is not on CPU0, it can't run on
+ * CPU 0. Queue on whichever CPU it's currently only.
+ */
+ if (scx_bpf_task_cpu(p) != 0) {
+ stat_inc(0); /* count local queueing */
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+ return;
+ }
+
+ stat_inc(1); /* count cpu0 queueing */
+ scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
+{
+ if (cpu == 0)
+ scx_bpf_dsq_move_to_local(DSQ_CPU0);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
+{
+ return scx_bpf_create_dsq(DSQ_CPU0, -1);
+}
+
+void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(cpu0_ops,
+ .select_cpu = (void *)cpu0_select_cpu,
+ .enqueue = (void *)cpu0_enqueue,
+ .dispatch = (void *)cpu0_dispatch,
+ .init = (void *)cpu0_init,
+ .exit = (void *)cpu0_exit,
+ .name = "cpu0");
diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c
new file mode 100644
index 000000000000..1e4fa4ab8da9
--- /dev/null
+++ b/tools/sched_ext/scx_cpu0.c
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <assert.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_cpu0.bpf.skel.h"
+
+const char help_fmt[] =
+"A cpu0 sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-v]\n"
+"\n"
+" -v Print libbpf debug messages\n"
+" -h Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+ if (level == LIBBPF_DEBUG && !verbose)
+ return 0;
+ return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int sig)
+{
+ exit_req = 1;
+}
+
+static void read_stats(struct scx_cpu0 *skel, __u64 *stats)
+{
+ int nr_cpus = libbpf_num_possible_cpus();
+ assert(nr_cpus > 0);
+ __u64 cnts[2][nr_cpus];
+ __u32 idx;
+
+ memset(stats, 0, sizeof(stats[0]) * 2);
+
+ for (idx = 0; idx < 2; idx++) {
+ int ret, cpu;
+
+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+ &idx, cnts[idx]);
+ if (ret < 0)
+ continue;
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ stats[idx] += cnts[idx][cpu];
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct scx_cpu0 *skel;
+ struct bpf_link *link;
+ __u32 opt;
+ __u64 ecode;
+
+ libbpf_set_print(libbpf_print_fn);
+ signal(SIGINT, sigint_handler);
+ signal(SIGTERM, sigint_handler);
+restart:
+ skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
+
+ skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+
+ while ((opt = getopt(argc, argv, "vh")) != -1) {
+ switch (opt) {
+ case 'v':
+ verbose = true;
+ break;
+ default:
+ fprintf(stderr, help_fmt, basename(argv[0]));
+ return opt != 'h';
+ }
+ }
+
+ SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei);
+ link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0);
+
+ while (!exit_req && !UEI_EXITED(skel, uei)) {
+ __u64 stats[2];
+
+ read_stats(skel, stats);
+ printf("local=%llu cpu0=%llu\n", stats[0], stats[1]);
+ fflush(stdout);
+ sleep(1);
+ }
+
+ bpf_link__destroy(link);
+ ecode = UEI_REPORT(skel, uei);
+ scx_cpu0__destroy(skel);
+
+ if (UEI_ECODE_RESTART(ecode))
+ goto restart;
+ return 0;
+}
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 2c720e3ecad5..43126858b8e4 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (!cgc)
goto out_release;
@@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
{
struct cgroup *cgrp;
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
update_active_weight_sums(cgrp, true);
bpf_cgroup_release(cgrp);
}
@@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
if (fifo_sched)
return;
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (cgc) {
/*
@@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
if (!taskc->bypassed_at)
return;
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp);
if (cgc) {
__sync_fetch_and_add(&cgc->cvtime_delta,
@@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
{
struct cgroup *cgrp;
- cgrp = __COMPAT_scx_bpf_task_cgroup(p);
+ cgrp = scx_bpf_task_cgroup(p);
update_active_weight_sums(cgrp, false);
bpf_cgroup_release(cgrp);
}
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 3072b593f898..df21fad0c438 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
void *ring;
s32 cpu;
+ if (enq_flags & SCX_ENQ_REENQ)
+ __sync_fetch_and_add(&nr_reenqueued, 1);
+
if (p->flags & PF_KTHREAD) {
if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
return;
@@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer)
if (tctx->highpri) {
/* exercise the set_*() and vtime interface too */
- __COMPAT_scx_bpf_dsq_move_set_slice(
- BPF_FOR_EACH_ITER, slice_ns * 2);
- __COMPAT_scx_bpf_dsq_move_set_vtime(
- BPF_FOR_EACH_ITER, highpri_seq++);
- __COMPAT_scx_bpf_dsq_move_vtime(
- BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
+ scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
+ scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
+ scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
}
}
@@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer)
else
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
- if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
- SCX_DSQ_LOCAL_ON | cpu,
- SCX_ENQ_PREEMPT)) {
+ if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
+ SCX_ENQ_PREEMPT)) {
if (cpu == this_cpu) {
dispatched = true;
__sync_fetch_and_add(&nr_expedited_local, 1);
@@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
return task_qdist(a) > task_qdist(b);
}
-void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+SEC("tp_btf/sched_switch")
+int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
+ struct task_struct *next, unsigned long prev_state)
{
- u32 cnt;
+ if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+ return 0;
/*
- * Called when @cpu is taken by a higher priority scheduling class. This
- * makes @cpu no longer available for executing sched_ext tasks. As we
- * don't want the tasks in @cpu's local dsq to sit there until @cpu
- * becomes available again, re-enqueue them into the global dsq. See
- * %SCX_ENQ_REENQ handling in qmap_enqueue().
+ * If @cpu is taken by a higher priority scheduling class, it is no
+ * longer available for executing sched_ext tasks. As we don't want the
+ * tasks in @cpu's local dsq to sit there until @cpu becomes available
+ * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
+ * handling in qmap_enqueue().
*/
- cnt = scx_bpf_reenqueue_local();
- if (cnt)
- __sync_fetch_and_add(&nr_reenqueued, cnt);
+ switch (next->policy) {
+ case 1: /* SCHED_FIFO */
+ case 2: /* SCHED_RR */
+ case 6: /* SCHED_DEADLINE */
+ scx_bpf_reenqueue_local();
+ }
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+ /* see qmap_sched_switch() to learn how to do this on newer kernels */
+ if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+ scx_bpf_reenqueue_local();
}
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,