diff options
Diffstat (limited to 'tools/sched_ext')
| -rw-r--r-- | tools/sched_ext/Makefile | 4 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/common.bpf.h | 15 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/compat.bpf.h | 314 | ||||
| -rw-r--r-- | tools/sched_ext/include/scx/compat.h | 14 | ||||
| -rw-r--r-- | tools/sched_ext/scx_cpu0.bpf.c | 88 | ||||
| -rw-r--r-- | tools/sched_ext/scx_cpu0.c | 106 | ||||
| -rw-r--r-- | tools/sched_ext/scx_flatcg.bpf.c | 10 | ||||
| -rw-r--r-- | tools/sched_ext/scx_qmap.bpf.c | 52 |
8 files changed, 474 insertions, 129 deletions
diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index d68780e2e03d..e4bda2474060 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -133,6 +133,7 @@ $(MAKE_DIRS): $(call msg,MKDIR,,$@) $(Q)mkdir -p $@ +ifneq ($(CROSS_COMPILE),) $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(APIDIR)/linux/bpf.h \ | $(OBJ_DIR)/libbpf @@ -141,6 +142,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ EXTRA_CFLAGS='-g -O0 -fPIC' \ LDFLAGS="$(LDFLAGS)" \ DESTDIR=$(OUTPUT_DIR) prefix= all install_headers +endif $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(APIDIR)/linux/bpf.h \ @@ -187,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 06e2551033cb..821d5791bd42 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -60,21 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void) s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; -s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, - const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; -void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; -void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; +s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, + struct scx_bpf_select_cpu_and_args *args) __ksym __weak; +bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak; -void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; -void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; -bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak; int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; @@ -105,7 +99,6 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct rq *scx_bpf_locked_rq(void) __ksym; struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; -struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index dd9144624dc9..f2969c3061a7 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -16,119 +16,92 @@ }) /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ -#define __COMPAT_scx_bpf_task_cgroup(p) \ - (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ - scx_bpf_task_cgroup((p)) : NULL) +struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak; + +#define scx_bpf_task_cgroup(p) \ + (bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \ + scx_bpf_task_cgroup___new((p)) : NULL) /* * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are * renamed to unload the verb. * - * Build error is triggered if old names are used. New binaries work with both - * new and old names. The compat macros will be removed on v6.15 release. - * * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()"). - * Preserve __COMPAT macros until v6.15. */ -void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; -void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; -bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; -void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; -void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; -bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; - -#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ - scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \ - scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags))) - -#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \ - scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \ - scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags))) +bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak; +void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; +bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; + +bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; +bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; #define scx_bpf_dsq_move_to_local(dsq_id) \ - (bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \ - scx_bpf_dsq_move_to_local((dsq_id)) : \ - scx_bpf_consume___compat((dsq_id))) - -#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \ - (bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \ - scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \ - scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \ + (bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \ + scx_bpf_dsq_move_to_local___new((dsq_id)) : \ + scx_bpf_consume___old((dsq_id))) + +#define scx_bpf_dsq_move_set_slice(it__iter, slice) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \ + scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \ + scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \ + (void)0)) + +#define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \ + scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \ + scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \ (void)0)) -#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ - (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \ - scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \ - scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \ - (void) 0)) - -#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_move) ? \ - scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \ - scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ +#define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move___new) ? \ + scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \ + scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) -#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \ - scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \ - scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ +#define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \ + scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \ + scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) +/* + * v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits") + * + * Compat macro will be dropped on v6.19 release. + */ +int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; + #define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ (bpf_ksym_exists(bpf_cpumask_populate) ? \ (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) -#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ - _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") - -#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ - _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") - -#define scx_bpf_consume(dsq_id) ({ \ - _Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \ - false; \ -}) - -#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()") - -#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()") - -#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \ - false; \ -}) - -#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \ - false; \ -}) - -#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()") - -#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()") - -#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \ - false; \ -}) - -#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \ - false; \ -}) +/* + * v6.19: Introduce lockless peek API for user DSQs. + * + * Preserve the following macro until v6.21. + */ +static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id) +{ + struct task_struct *p = NULL; + struct bpf_iter_scx_dsq it; + + if (bpf_ksym_exists(scx_bpf_dsq_peek)) + return scx_bpf_dsq_peek(dsq_id); + if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0)) + p = bpf_iter_scx_dsq_next(&it); + bpf_iter_scx_dsq_destroy(&it); + return p; +} /** * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on @@ -248,6 +221,161 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu) } /* + * v6.19: To work around BPF maximum parameter limit, the following kfuncs are + * replaced with variants that pack scalar arguments in a struct. Wrappers are + * provided to maintain source compatibility. + * + * v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the + * block on dispatch renaming above for more details. + * + * The kernel will carry the compat variants until v6.23 to maintain binary + * compatibility. After v6.23 release, remove the compat handling and move the + * wrappers to common.bpf.h. + */ +s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; +void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; + +/** + * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @cpus_allowed: cpumask of allowed CPUs + * @flags: %SCX_PICK_IDLE* flags + * + * Inline wrapper that packs scalar arguments into a struct and calls + * __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details. + */ +static inline s32 +scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) { + struct scx_bpf_select_cpu_and_args args = { + .prev_cpu = prev_cpu, + .wake_flags = wake_flags, + .flags = flags, + }; + + return __scx_bpf_select_cpu_and(p, cpus_allowed, &args); + } else { + return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags, + cpus_allowed, flags); + } +} + +/** + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into + * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Inline wrapper that packs scalar arguments into a struct and calls + * __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details. + */ +static inline bool +scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, + u64 enq_flags) +{ + if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) { + struct scx_bpf_dsq_insert_vtime_args args = { + .dsq_id = dsq_id, + .slice = slice, + .vtime = vtime, + .enq_flags = enq_flags, + }; + + return __scx_bpf_dsq_insert_vtime(p, &args); + } else if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat)) { + scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime, + enq_flags); + return true; + } else { + scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime, + enq_flags); + return true; + } +} + +/* + * v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move + * scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22. + * The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on + * kernel side. The entire suffix can be dropped later. + * + * v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on + * dispatch renaming above for more details. + */ +bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; + +static inline bool +scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) +{ + if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat)) { + return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice, enq_flags); + } else if (bpf_ksym_exists(scx_bpf_dsq_insert___v1)) { + scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags); + return true; + } else { + scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags); + return true; + } +} + +/* + * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added to for + * sub-sched authority checks. Drop the wrappers and move the decls to + * common.bpf.h after v6.22. + */ +bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak; +bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak; + +static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +{ + if (bpf_ksym_exists(scx_bpf_task_set_slice___new)) + scx_bpf_task_set_slice___new(p, slice); + else + p->scx.slice = slice; +} + +static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +{ + if (bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new)) + scx_bpf_task_set_dsq_vtime___new(p, vtime); + else + p->scx.dsq_vtime = vtime; +} + +/* + * v6.19: The new void variant can be called from anywhere while the older v1 + * variant can only be called from ops.cpu_release(). The double ___ prefixes on + * the v2 variant need to be removed once libbpf is updated to ignore ___ prefix + * on kernel side. Drop the wrapper and move the decl to common.bpf.h after + * v6.22. + */ +u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak; +void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak; + +static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void) +{ + return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat); +} + +static inline void scx_bpf_reenqueue_local(void) +{ + if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) + scx_bpf_reenqueue_local___v2___compat(); + else + scx_bpf_reenqueue_local___v1(); +} + +/* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index 35c67c5174ac..8b4897fc8b99 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void) * * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is * the current minimum required kernel version. + * + * COMPAT: + * - v6.17: ops.cgroup_set_bandwidth() + * - v6.19: ops.cgroup_set_idle() */ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ struct __scx_name *__skel; \ @@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void) SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ SCX_ENUM_INIT(__skel); \ + if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \ + !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \ + fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \ + __skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \ + } \ + if (__skel->struct_ops.__ops_name->cgroup_set_idle && \ + !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \ + fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \ + __skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \ + } \ __skel; \ }) diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c new file mode 100644 index 000000000000..6326ce598c8e --- /dev/null +++ b/tools/sched_ext/scx_cpu0.bpf.c @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A CPU0 scheduler. + * + * This scheduler queues all tasks to a shared DSQ and only dispatches them on + * CPU0 in FIFO order. This is useful for testing bypass behavior when many + * tasks are concentrated on a single CPU. If the load balancer doesn't work, + * bypass mode can trigger task hangs or RCU stalls as the queue is long and + * there's only one CPU working on it. + * + * - Statistics tracking how many tasks are queued to local and CPU0 DSQs. + * - Termination notification for userspace. + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ + +UEI_DEFINE(uei); + +/* + * We create a custom DSQ with ID 0 that we dispatch to and consume from on + * CPU0. + */ +#define DSQ_CPU0 0 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, cpu0] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + return 0; +} + +void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags) +{ + /* + * select_cpu() always picks CPU0. If @p is not on CPU0, it can't run on + * CPU 0. Queue on whichever CPU it's currently only. + */ + if (scx_bpf_task_cpu(p) != 0) { + stat_inc(0); /* count local queueing */ + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + return; + } + + stat_inc(1); /* count cpu0 queueing */ + scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev) +{ + if (cpu == 0) + scx_bpf_dsq_move_to_local(DSQ_CPU0); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init) +{ + return scx_bpf_create_dsq(DSQ_CPU0, -1); +} + +void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(cpu0_ops, + .select_cpu = (void *)cpu0_select_cpu, + .enqueue = (void *)cpu0_enqueue, + .dispatch = (void *)cpu0_dispatch, + .init = (void *)cpu0_init, + .exit = (void *)cpu0_exit, + .name = "cpu0"); diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c new file mode 100644 index 000000000000..1e4fa4ab8da9 --- /dev/null +++ b/tools/sched_ext/scx_cpu0.c @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <assert.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_cpu0.bpf.skel.h" + +const char help_fmt[] = +"A cpu0 sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-v]\n" +"\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int sig) +{ + exit_req = 1; +} + +static void read_stats(struct scx_cpu0 *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + assert(nr_cpus > 0); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_cpu0 *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0); + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "vh")) != -1) { + switch (opt) { + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei); + link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu cpu0=%llu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_cpu0__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 2c720e3ecad5..43126858b8e4 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) return; } - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (!cgc) goto out_release; @@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) { struct cgroup *cgrp; - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); update_active_weight_sums(cgrp, true); bpf_cgroup_release(cgrp); } @@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) if (fifo_sched) return; - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (cgc) { /* @@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) if (!taskc->bypassed_at) return; - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (cgc) { __sync_fetch_and_add(&cgc->cvtime_delta, @@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) { struct cgroup *cgrp; - cgrp = __COMPAT_scx_bpf_task_cgroup(p); + cgrp = scx_bpf_task_cgroup(p); update_active_weight_sums(cgrp, false); bpf_cgroup_release(cgrp); } diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 3072b593f898..df21fad0c438 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) void *ring; s32 cpu; + if (enq_flags & SCX_ENQ_REENQ) + __sync_fetch_and_add(&nr_reenqueued, 1); + if (p->flags & PF_KTHREAD) { if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) return; @@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer) if (tctx->highpri) { /* exercise the set_*() and vtime interface too */ - __COMPAT_scx_bpf_dsq_move_set_slice( - BPF_FOR_EACH_ITER, slice_ns * 2); - __COMPAT_scx_bpf_dsq_move_set_vtime( - BPF_FOR_EACH_ITER, highpri_seq++); - __COMPAT_scx_bpf_dsq_move_vtime( - BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); + scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2); + scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++); + scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); } } @@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer) else cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); - if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, - SCX_DSQ_LOCAL_ON | cpu, - SCX_ENQ_PREEMPT)) { + if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu, + SCX_ENQ_PREEMPT)) { if (cpu == this_cpu) { dispatched = true; __sync_fetch_and_add(&nr_expedited_local, 1); @@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before, return task_qdist(a) > task_qdist(b); } -void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +SEC("tp_btf/sched_switch") +int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev, + struct task_struct *next, unsigned long prev_state) { - u32 cnt; + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) + return 0; /* - * Called when @cpu is taken by a higher priority scheduling class. This - * makes @cpu no longer available for executing sched_ext tasks. As we - * don't want the tasks in @cpu's local dsq to sit there until @cpu - * becomes available again, re-enqueue them into the global dsq. See - * %SCX_ENQ_REENQ handling in qmap_enqueue(). + * If @cpu is taken by a higher priority scheduling class, it is no + * longer available for executing sched_ext tasks. As we don't want the + * tasks in @cpu's local dsq to sit there until @cpu becomes available + * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ + * handling in qmap_enqueue(). */ - cnt = scx_bpf_reenqueue_local(); - if (cnt) - __sync_fetch_and_add(&nr_reenqueued, cnt); + switch (next->policy) { + case 1: /* SCHED_FIFO */ + case 2: /* SCHED_RR */ + case 6: /* SCHED_DEADLINE */ + scx_bpf_reenqueue_local(); + } + + return 0; +} + +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + /* see qmap_sched_switch() to learn how to do this on newer kernels */ + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) + scx_bpf_reenqueue_local(); } s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, |
