summaryrefslogtreecommitdiff
path: root/kernel/sched/ext.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--kernel/sched/ext.c1875
1 files changed, 1078 insertions, 797 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fdbf249d1c68..793e288f63cf 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -26,7 +26,7 @@ enum scx_consts {
* Iterating all tasks may take a while. Periodically drop
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/
- SCX_OPS_TASK_ITER_BATCH = 32,
+ SCX_TASK_ITER_BATCH = 32,
};
enum scx_exit_kind {
@@ -44,9 +44,9 @@ enum scx_exit_kind {
};
/*
- * An exit code can be specified when exiting with scx_bpf_exit() or
- * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
- * respectively. The codes are 64bit of the format:
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
*
* Bits: [63 .. 48 47 .. 32 31 .. 0]
* [ SYS ACT ] [ SYS RSN ] [ USR ]
@@ -163,16 +163,21 @@ enum scx_ops_flags {
/*
* CPU cgroup support flags
*/
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
};
/* argument container for ops.init_task() */
@@ -368,6 +373,15 @@ struct sched_ext_ops {
* @running: A task is starting to run on its associated CPU
* @p: task starting to run
*
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+ * property is changed (i.e., affinity), since scx_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
* See ->runnable() for explanation on the task state notifiers.
*/
void (*running)(struct task_struct *p);
@@ -377,6 +391,15 @@ struct sched_ext_ops {
* @p: task stopping to run
* @runnable: is task @p still runnable?
*
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+ * property is changed (i.e., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
* See ->runnable() for explanation on the task state notifiers. If
* !@runnable, ->quiescent() will be invoked after this operation
* returns.
@@ -465,6 +488,7 @@ struct sched_ext_ops {
* idle CPU tracking and the following helpers become unavailable:
*
* - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
* - scx_bpf_test_and_clear_cpu_idle()
* - scx_bpf_pick_idle_cpu()
*
@@ -728,6 +752,9 @@ struct sched_ext_ops {
* BPF scheduler is enabled.
*/
char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
};
enum scx_opi {
@@ -739,6 +766,98 @@ enum scx_opi {
SCX_OPI_END = SCX_OP_IDX(init),
};
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats __percpu *event_stats_cpu;
+
+ bool warned_zero_slice;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
+
enum scx_wake_flags {
/* expose select WF_* flags as enums */
SCX_WAKE_FORK = WF_FORK,
@@ -838,18 +957,18 @@ enum scx_tg_flags {
SCX_TG_INITED = 1U << 1,
};
-enum scx_ops_enable_state {
- SCX_OPS_ENABLING,
- SCX_OPS_ENABLED,
- SCX_OPS_DISABLING,
- SCX_OPS_DISABLED,
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
};
-static const char *scx_ops_enable_state_str[] = {
- [SCX_OPS_ENABLING] = "enabling",
- [SCX_OPS_ENABLED] = "enabled",
- [SCX_OPS_DISABLING] = "disabling",
- [SCX_OPS_DISABLED] = "disabled",
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
};
/*
@@ -898,6 +1017,16 @@ enum scx_ops_state {
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
/*
+ * NOTE: sched_ext is in the process of growing multiple scheduler support and
+ * scx_root usage is in a transitional state. Naked dereferences are safe if the
+ * caller is one of the tasks attached to SCX and explicit RCU dereference is
+ * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
+ * are used as temporary markers to indicate that the dereferences need to be
+ * updated to point to the associated scheduler instances rather than scx_root.
+ */
+static struct scx_sched __rcu *scx_root;
+
+/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
* guarantee system safety. Maintain a dedicated task list which contains every
@@ -907,33 +1036,17 @@ static DEFINE_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
-static struct kthread_worker *scx_ops_helper;
-static DEFINE_MUTEX(scx_ops_enable_mutex);
-DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
+static DEFINE_MUTEX(scx_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
-static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static unsigned long scx_in_softlockup;
-static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
-static int scx_ops_bypass_depth;
-static bool scx_ops_init_task_enabled;
+static atomic_t scx_breather_depth = ATOMIC_INIT(0);
+static int scx_bypass_depth;
+static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
-static struct sched_ext_ops scx_ops;
-static bool scx_warned_zero_slice;
-
-DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-
-static struct static_key_false scx_has_op[SCX_OPI_END] =
- { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
-
-static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
-static struct scx_exit_info *scx_exit_info;
-
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -947,7 +1060,7 @@ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
* The maximum amount of time in jiffies that a task may be runnable without
* being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_ops_error().
+ * scx_error().
*/
static unsigned long scx_watchdog_timeout;
@@ -973,23 +1086,12 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
*/
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
-/*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
- * to avoid live-locking in bypass mode where all tasks are dispatched to
- * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
- * sufficient, it can be further split.
- */
-static struct scx_dispatch_q **global_dsqs;
-
static const struct rhashtable_params dsq_hash_params = {
.key_len = sizeof_field(struct scx_dispatch_q, id),
.key_offset = offsetof(struct scx_dispatch_q, id),
.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};
-static struct rhashtable dsq_hash;
static LLIST_HEAD(dsqs_to_free);
/* dispatch buf */
@@ -1036,27 +1138,46 @@ static struct scx_dump_data scx_dump_data = {
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
-static struct kobject *scx_root_kobj;
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...);
+static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, const char *fmt, va_list args);
+
+static __printf(4, 5) void scx_exit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ va_list args;
-#define scx_ops_error_kind(err, fmt, args...) \
- scx_ops_exit_kind((err), 0, fmt, ##args)
+ va_start(args, fmt);
+ scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+}
+
+static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ struct scx_sched *sch;
+ va_list args;
-#define scx_ops_exit(code, fmt, args...) \
- scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ va_start(args, fmt);
+ scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+ }
+ rcu_read_unlock();
+}
-#define scx_ops_error(fmt, args...) \
- scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
+#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
-#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
+#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
{
@@ -1086,12 +1207,14 @@ static bool u32_before(u32 a, u32 b)
static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
{
- return global_dsqs[cpu_to_node(task_cpu(p))];
+ struct scx_sched *sch = scx_root;
+
+ return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
-static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
- return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+ return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}
/*
@@ -1118,27 +1241,61 @@ static void scx_kf_disallow(u32 mask)
current->scx.kf_mask &= ~mask;
}
-#define SCX_CALL_OP(mask, op, args...) \
+/*
+ * Track the rq currently locked.
+ *
+ * This allows kfuncs to safely operate on rq from any scx ops callback,
+ * knowing which rq is already locked.
+ */
+static DEFINE_PER_CPU(struct rq *, locked_rq);
+
+static inline void update_locked_rq(struct rq *rq)
+{
+ /*
+ * Check whether @rq is actually locked. This can help expose bugs
+ * or incorrect assumptions about the context in which a kfunc or
+ * callback is executed.
+ */
+ if (rq)
+ lockdep_assert_rq_held(rq);
+ __this_cpu_write(locked_rq, rq);
+}
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(locked_rq);
+}
+
+#define SCX_CALL_OP(sch, mask, op, rq, args...) \
do { \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- scx_ops.op(args); \
+ (sch)->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- scx_ops.op(args); \
+ (sch)->ops.op(args); \
} \
+ update_locked_rq(NULL); \
} while (0)
-#define SCX_CALL_OP_RET(mask, op, args...) \
+#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
({ \
- __typeof__(scx_ops.op(args)) __ret; \
+ __typeof__((sch)->ops.op(args)) __ret; \
+ \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- __ret = scx_ops.op(args); \
+ __ret = (sch)->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- __ret = scx_ops.op(args); \
+ __ret = (sch)->ops.op(args); \
} \
+ update_locked_rq(NULL); \
__ret; \
})
@@ -1153,31 +1310,31 @@ do { \
* scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
* the specific task.
*/
-#define SCX_CALL_OP_TASK(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \
do { \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP(mask, op, task, ##args); \
+ SCX_CALL_OP((sch), mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \
({ \
- __typeof__(scx_ops.op(task, ##args)) __ret; \
+ __typeof__((sch)->ops.op(task, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \
({ \
- __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
+ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -1187,8 +1344,8 @@ do { \
static __always_inline bool scx_kf_allowed(u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1201,13 +1358,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_ops_error("cpu_release kfunc called from a nested operation");
+ scx_kf_error("cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_ops_error("dispatch kfunc called from a nested operation");
+ scx_kf_error("dispatch kfunc called from a nested operation");
return false;
}
@@ -1223,18 +1380,13 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_ops_error("called on a task not being operated on");
+ scx_kf_error("called on a task not being operated on");
return false;
}
return true;
}
-static bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
* @dsq: user dsq being iterated
@@ -1402,15 +1554,15 @@ static void scx_task_iter_stop(struct scx_task_iter *iter)
* @iter: iterator to walk
*
* Visit the next task. See scx_task_iter_start() for details. Locks are dropped
- * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
- * stalls by holding scx_tasks_lock for too long.
+ * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
+ * by holding scx_tasks_lock for too long.
*/
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+ if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
scx_task_iter_relock(iter);
@@ -1481,91 +1633,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * The total number of tasks enqueued (or pick_task-ed) with a
- * default time slice (SCX_SLICE_DFL).
- */
- s64 SCX_EV_ENQ_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-/*
- * The event counter is organized by a per-CPU variable to minimize the
- * accounting overhead without synchronization. A system-wide view on the
- * event counter is constructed when requested by scx_bpf_get_event_stat().
- */
-static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
-
/**
* scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
* @cnt: the number of the event occured
*
* This can be used when preemption is not disabled.
*/
-#define scx_add_event(name, cnt) do { \
- this_cpu_add(event_stats_cpu.name, cnt); \
- trace_sched_ext_event(#name, cnt); \
+#define scx_add_event(sch, name, cnt) do { \
+ this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ trace_sched_ext_event(#name, (cnt)); \
} while(0)
/**
* __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
* @cnt: the number of the event occured
*
* This should be used only when preemption is disabled.
*/
-#define __scx_add_event(name, cnt) do { \
- __this_cpu_add(event_stats_cpu.name, cnt); \
+#define __scx_add_event(sch, name, cnt) do { \
+ __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
@@ -1590,25 +1680,25 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
} while (0)
-static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+static void scx_read_events(struct scx_sched *sch,
+ struct scx_event_stats *events);
-static enum scx_ops_enable_state scx_ops_enable_state(void)
+static enum scx_enable_state scx_enable_state(void)
{
- return atomic_read(&scx_ops_enable_state_var);
+ return atomic_read(&scx_enable_state_var);
}
-static enum scx_ops_enable_state
-scx_ops_set_enable_state(enum scx_ops_enable_state to)
+static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
{
- return atomic_xchg(&scx_ops_enable_state_var, to);
+ return atomic_xchg(&scx_enable_state_var, to);
}
-static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
- enum scx_ops_enable_state from)
+static bool scx_tryset_enable_state(enum scx_enable_state to,
+ enum scx_enable_state from)
{
int from_v = from;
- return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
+ return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}
static bool scx_rq_bypassing(struct rq *rq)
@@ -1633,8 +1723,14 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss)
} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
}
+static inline bool __cpu_valid(s32 cpu)
+{
+ return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
+}
+
/**
- * ops_cpu_valid - Verify a cpu number
+ * ops_cpu_valid - Verify a cpu number, to be used on ops input args
+ * @sch: scx_sched to abort on error
* @cpu: cpu number which came from a BPF ops
* @where: extra information reported on error
*
@@ -1642,35 +1738,52 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss)
* Verify that it is in range and one of the possible cpus. If invalid, trigger
* an ops error.
*/
-static bool ops_cpu_valid(s32 cpu, const char *where)
+static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
{
- if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
+ if (__cpu_valid(cpu)) {
return true;
} else {
- scx_ops_error("invalid CPU %d%s%s", cpu,
- where ? " " : "", where ?: "");
+ scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
+ return false;
+ }
+}
+
+/**
+ * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
+ * @cpu: cpu number which came from a BPF ops
+ * @where: extra information reported on error
+ *
+ * The same as ops_cpu_valid() but @sch is implicit.
+ */
+static bool kf_cpu_valid(u32 cpu, const char *where)
+{
+ if (__cpu_valid(cpu)) {
+ return true;
+ } else {
+ scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
return false;
}
}
/**
* ops_sanitize_err - Sanitize a -errno value
+ * @sch: scx_sched to error out on error
* @ops_name: operation to blame on failure
* @err: -errno value to sanitize
*
- * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
+ * Verify @err is a valid -errno. If not, trigger scx_error() and return
* -%EPROTO. This is necessary because returning a rogue -errno up the chain can
* cause misbehaviors. For an example, a large negative return from
* ops.init_task() triggers an oops when passed up the call chain because the
* value fails IS_ERR() test after being encoded with ERR_PTR() and then is
* handled as a pointer.
*/
-static int ops_sanitize_err(const char *ops_name, s32 err)
+static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
{
if (err < 0 && err >= -MAX_ERRNO)
return err;
- scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
+ scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
return -EPROTO;
}
@@ -1777,7 +1890,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
lockdep_assert_rq_held(rq);
#ifdef CONFIG_SCHED_CORE
- if (SCX_HAS_OP(core_sched_before))
+ if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
touch_core_sched(rq, p);
#endif
}
@@ -1815,8 +1928,14 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
- u64 enq_flags)
+static void refill_task_slice_dfl(struct task_struct *p)
+{
+ p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+}
+
+static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ struct task_struct *p, u64 enq_flags)
{
bool is_local = dsq->id == SCX_DSQ_LOCAL;
@@ -1827,7 +1946,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
if (!is_local) {
raw_spin_lock(&dsq->lock);
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
- scx_ops_error("attempting to dispatch to a destroyed dsq");
+ scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
dsq = find_global_dsq(p);
@@ -1844,7 +1963,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
* disallow any internal DSQ from doing vtime ordering of
* tasks.
*/
- scx_ops_error("cannot use vtime ordering for built-in DSQs");
+ scx_error(sch, "cannot use vtime ordering for built-in DSQs");
enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
}
@@ -1858,8 +1977,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
*/
if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
nldsq_next_task(dsq, NULL, false)))
- scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
- dsq->id);
+ scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
+ dsq->id);
p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
@@ -1880,8 +1999,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
} else {
/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
- scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
- dsq->id);
+ scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
+ dsq->id);
if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
list_add(&p->scx.dsq_list.node, &dsq->list);
@@ -1996,7 +2115,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
-static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
+ struct rq *rq, u64 dsq_id,
struct task_struct *p)
{
struct scx_dispatch_q *dsq;
@@ -2007,7 +2127,7 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
- if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+ if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
return find_global_dsq(p);
return &cpu_rq(cpu)->scx.local_dsq;
@@ -2016,11 +2136,11 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if (dsq_id == SCX_DSQ_GLOBAL)
dsq = find_global_dsq(p);
else
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
- dsq_id, p->comm, p->pid);
+ scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
return find_global_dsq(p);
}
@@ -2041,12 +2161,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_ops_error("%s[%d] already direct-dispatched",
- p->comm, p->pid);
+ scx_kf_error("%s[%d] already direct-dispatched",
+ p->comm, p->pid);
else
- scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
- ddsp_task->comm, ddsp_task->pid,
- p->comm, p->pid);
+ scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ ddsp_task->comm, ddsp_task->pid,
+ p->comm, p->pid);
return;
}
@@ -2057,11 +2177,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
p->scx.ddsp_enq_flags = enq_flags;
}
-static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
struct rq *rq = task_rq(p);
struct scx_dispatch_q *dsq =
- find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
touch_core_sched_dispatch(rq, p);
@@ -2102,7 +2223,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
return;
}
- dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dsq, p,
+ p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}
static bool scx_rq_online(struct rq *rq)
@@ -2120,6 +2242,7 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
int sticky_cpu)
{
+ struct scx_sched *sch = scx_root;
struct task_struct **ddsp_taskp;
unsigned long qseq;
@@ -2138,7 +2261,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto local;
if (scx_rq_bypassing(rq)) {
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
goto global;
}
@@ -2146,20 +2269,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
- if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
+ if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
unlikely(p->flags & PF_EXITING)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
}
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
- if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
is_migration_disabled(p)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
}
- if (!SCX_HAS_OP(enqueue))
+ if (unlikely(!SCX_HAS_OP(sch, enqueue)))
goto global;
/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
@@ -2172,7 +2295,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+ SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
*ddsp_taskp = NULL;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2186,7 +2309,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
return;
direct:
- direct_dispatch(p, enq_flags);
+ direct_dispatch(sch, p, enq_flags);
return;
local:
@@ -2196,17 +2319,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ refill_task_slice_dfl(p);
local_norefill:
- dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
- dispatch_enqueue(find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(p);
+ dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2224,7 +2345,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
}
/*
- * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
+ * list_add_tail() must be used. scx_bypass() depends on tasks being
* appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
@@ -2239,6 +2360,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
{
+ struct scx_sched *sch = scx_root;
int sticky_cpu = p->scx.sticky_cpu;
if (enq_flags & ENQUEUE_WAKEUP)
@@ -2268,8 +2390,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
rq->scx.nr_running++;
add_nr_running(rq, 1);
- if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+ if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
@@ -2280,11 +2402,12 @@ out:
if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
unlikely(cpu_of(rq) != p->scx.selected_cpu))
- __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
+ __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
}
-static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
+ struct scx_sched *sch = scx_root;
unsigned long opss;
/* dequeue is always temporary, don't reset runnable_at */
@@ -2303,8 +2426,9 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- if (SCX_HAS_OP(dequeue))
- SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+ if (SCX_HAS_OP(sch, dequeue))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
+ p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
@@ -2332,12 +2456,14 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
{
+ struct scx_sched *sch = scx_root;
+
if (!(p->scx.flags & SCX_TASK_QUEUED)) {
WARN_ON_ONCE(task_runnable(p));
return true;
}
- ops_dequeue(p, deq_flags);
+ ops_dequeue(rq, p, deq_flags);
/*
* A currently running task which is going off @rq first gets dequeued
@@ -2351,13 +2477,13 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
* information meaningful to the BPF scheduler and can be suppressed by
* skipping the callbacks if the task is !QUEUED.
*/
- if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
+ if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
update_curr_scx(rq);
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
}
- if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+ if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2374,20 +2500,23 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
static void yield_task_scx(struct rq *rq)
{
+ struct scx_sched *sch = scx_root;
struct task_struct *p = rq->curr;
- if (SCX_HAS_OP(yield))
- SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
+ if (SCX_HAS_OP(sch, yield))
+ SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
else
p->scx.slice = 0;
}
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
+ struct scx_sched *sch = scx_root;
struct task_struct *from = rq->curr;
- if (SCX_HAS_OP(yield))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
+ if (SCX_HAS_OP(sch, yield))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
+ from, to);
else
return false;
}
@@ -2467,7 +2596,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
*
* The caller must ensure that @p and @rq are on different CPUs.
*/
-static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
+static bool task_can_run_on_remote_rq(struct scx_sched *sch,
+ struct task_struct *p, struct rq *rq,
bool enforce)
{
int cpu = cpu_of(rq);
@@ -2488,8 +2618,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (unlikely(is_migration_disabled(p))) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
- p->comm, p->pid, task_cpu(p), cpu);
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
return false;
}
@@ -2501,14 +2631,15 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (!task_allowed_on_cpu(p, cpu)) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
- cpu, p->comm, p->pid);
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(scx_root,
+ SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2580,12 +2711,13 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
}
#else /* CONFIG_SMP */
static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
+static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; }
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
/**
* move_task_between_dsqs() - Move a task from one DSQ to another
+ * @sch: scx_sched being operated on
* @p: target task
* @enq_flags: %SCX_ENQ_*
* @src_dsq: DSQ @p is currently on, must not be a local DSQ
@@ -2599,7 +2731,8 @@ static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p
* On return, @src_dsq is unlocked and only @p's new task_rq, which is the
* return value, is locked.
*/
-static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
+static struct rq *move_task_between_dsqs(struct scx_sched *sch,
+ struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *src_dsq,
struct scx_dispatch_q *dst_dsq)
{
@@ -2612,7 +2745,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
- unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
@@ -2646,7 +2779,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
p->scx.dsq = NULL;
raw_spin_unlock(&src_dsq->lock);
- dispatch_enqueue(dst_dsq, p, enq_flags);
+ dispatch_enqueue(sch, dst_dsq, p, enq_flags);
}
return dst_rq;
@@ -2658,13 +2791,13 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
* to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
-static void scx_ops_breather(struct rq *rq)
+static void scx_breather(struct rq *rq)
{
u64 until;
lockdep_assert_rq_held(rq);
- if (likely(!atomic_read(&scx_ops_breather_depth)))
+ if (likely(!atomic_read(&scx_breather_depth)))
return;
raw_spin_rq_unlock(rq);
@@ -2673,25 +2806,26 @@ static void scx_ops_breather(struct rq *rq)
do {
int cnt = 1024;
- while (atomic_read(&scx_ops_breather_depth) && --cnt)
+ while (atomic_read(&scx_breather_depth) && --cnt)
cpu_relax();
- } while (atomic_read(&scx_ops_breather_depth) &&
+ } while (atomic_read(&scx_breather_depth) &&
time_before64(ktime_get_ns(), until));
raw_spin_rq_lock(rq);
}
-static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
+static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_ops_bypass()
- * dequeueing tasks from @dsq trying to put the system into the bypass
- * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
- * live-lock the machine into soft lockups. Give a breather.
+ * This retry loop can repeatedly race against scx_bypass() dequeueing
+ * tasks from @dsq trying to put the system into the bypass mode. On
+ * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
+ * the machine into soft lockups. Give a breather.
*/
- scx_ops_breather(rq);
+ scx_breather(rq);
/*
* The caller can't expect to successfully consume a task if the task's
@@ -2713,7 +2847,7 @@ retry:
return true;
}
- if (task_can_run_on_remote_rq(p, rq, false)) {
+ if (task_can_run_on_remote_rq(sch, p, rq, false)) {
if (likely(consume_remote_task(rq, p, dsq, task_rq)))
return true;
goto retry;
@@ -2724,15 +2858,16 @@ retry:
return false;
}
-static bool consume_global_dsq(struct rq *rq)
+static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
int node = cpu_to_node(cpu_of(rq));
- return consume_dispatch_q(rq, global_dsqs[node]);
+ return consume_dispatch_q(sch, rq, sch->global_dsqs[node]);
}
/**
* dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @sch: scx_sched being operated on
* @rq: current rq which is locked
* @dst_dsq: destination DSQ
* @p: task to dispatch
@@ -2745,7 +2880,8 @@ static bool consume_global_dsq(struct rq *rq)
* The caller must have exclusive ownership of @p (e.g. through
* %SCX_OPSS_DISPATCHING).
*/
-static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
+static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dst_dsq,
struct task_struct *p, u64 enq_flags)
{
struct rq *src_rq = task_rq(p);
@@ -2761,14 +2897,15 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* If dispatching to @rq that @p is already on, no lock dancing needed.
*/
if (rq == src_rq && rq == dst_rq) {
- dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dst_dsq, p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
#ifdef CONFIG_SMP
if (src_rq != dst_rq &&
- unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
- dispatch_enqueue(find_global_dsq(p), p,
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
+ dispatch_enqueue(sch, find_global_dsq(p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -2806,7 +2943,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
*/
if (src_rq == dst_rq) {
p->scx.holding_cpu = -1;
- dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
+ enq_flags);
} else {
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
@@ -2848,7 +2986,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* was valid in the first place. Make sure that the task is still owned by the
* BPF scheduler and claim the ownership before dispatching.
*/
-static void finish_dispatch(struct rq *rq, struct task_struct *p,
+static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *p,
unsigned long qseq_at_dispatch,
u64 dsq_id, u64 enq_flags)
{
@@ -2901,15 +3040,15 @@ retry:
BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
- dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
if (dsq->id == SCX_DSQ_LOCAL)
- dispatch_to_local_dsq(rq, dsq, p, enq_flags);
+ dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
else
- dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}
-static void flush_dispatch_buf(struct rq *rq)
+static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
u32 u;
@@ -2917,7 +3056,7 @@ static void flush_dispatch_buf(struct rq *rq)
for (u = 0; u < dspc->cursor; u++) {
struct scx_dsp_buf_ent *ent = &dspc->buf[u];
- finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
+ finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
ent->enq_flags);
}
@@ -2927,6 +3066,7 @@ static void flush_dispatch_buf(struct rq *rq)
static int balance_one(struct rq *rq, struct task_struct *prev)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
@@ -2936,7 +3076,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
rq->scx.flags |= SCX_RQ_IN_BALANCE;
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
- if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
+ if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
/*
* If the previous sched_class for the current CPU was not SCX,
@@ -2944,8 +3084,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* core. This callback complements ->cpu_release(), which is
* emitted in switch_class().
*/
- if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
+ if (SCX_HAS_OP(sch, cpu_acquire))
+ SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
+ cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2959,8 +3100,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* scheduler wants to handle this explicitly, it should
* implement ->cpu_release().
*
- * See scx_ops_disable_workfn() for the explanation on the
- * bypassing test.
+ * See scx_disable_workfn() for the explanation on the bypassing
+ * test.
*/
if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -2972,10 +3113,11 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
- if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
+ scx_rq_bypassing(rq) || !scx_rq_online(rq))
goto no_tasks;
dspc->rq = rq;
@@ -2990,10 +3132,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
- prev_on_scx ? prev : NULL);
+ SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
+ cpu_of(rq), prev_on_scx ? prev : NULL);
- flush_dispatch_buf(rq);
+ flush_dispatch_buf(sch, rq);
if (prev_on_rq && prev->scx.slice) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3001,7 +3143,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
}
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
/*
@@ -3024,10 +3166,10 @@ no_tasks:
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
- scx_rq_bypassing(rq))) {
+ if (prev_on_rq &&
+ (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
- __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3087,32 +3229,36 @@ static void process_ddsp_deferred_locals(struct rq *rq)
*/
while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
struct task_struct, scx.dsq_list.node))) {
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *dsq;
list_del_init(&p->scx.dsq_list.node);
- dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
- dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
+ dispatch_to_local_dsq(sch, rq, dsq, p,
+ p->scx.ddsp_enq_flags);
}
}
static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
+ struct scx_sched *sch = scx_root;
+
if (p->scx.flags & SCX_TASK_QUEUED) {
/*
* Core-sched might decide to execute @p before it is
* dispatched. Call ops_dequeue() to notify the BPF scheduler.
*/
- ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
+ ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
dispatch_dequeue(rq, p);
}
p->se.exec_start = rq_clock_task(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+ if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
clr_task_runnable(p, true);
@@ -3155,6 +3301,7 @@ preempt_reason_from_class(const struct sched_class *class)
static void switch_class(struct rq *rq, struct task_struct *next)
{
+ struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
#ifdef CONFIG_SMP
@@ -3165,7 +3312,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
#endif
- if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+ if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
/*
@@ -3187,14 +3334,14 @@ static void switch_class(struct rq *rq, struct task_struct *next)
* next time that balance_scx() is invoked.
*/
if (!rq->scx.cpu_released) {
- if (SCX_HAS_OP(cpu_release)) {
+ if (SCX_HAS_OP(sch, cpu_release)) {
struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(next_class),
.task = next,
};
- SCX_CALL_OP(SCX_KF_CPU_RELEASE,
- cpu_release, cpu_of(rq), &args);
+ SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
+ cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
@@ -3203,11 +3350,12 @@ static void switch_class(struct rq *rq, struct task_struct *next)
static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
struct task_struct *next)
{
+ struct scx_sched *sch = scx_root;
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+ if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
@@ -3219,7 +3367,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* DSQ.
*/
if (p->scx.slice && !scx_rq_bypassing(rq)) {
- dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p,
+ SCX_ENQ_HEAD);
goto switch_class;
}
@@ -3230,7 +3379,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* which should trigger an explicit follow-up scheduling event.
*/
if (sched_class_above(&ext_sched_class, next->sched_class)) {
- WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
+ WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
} else {
do_enqueue_task(rq, p, 0, -1);
@@ -3283,7 +3432,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
* conditional on scx_enabled() and may have been skipped.
*/
- WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -3294,10 +3443,8 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*/
if (keep_prev) {
p = prev;
- if (!p->scx.slice) {
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
- }
+ if (!p->scx.slice)
+ refill_task_slice_dfl(p);
} else {
p = first_local_task(rq);
if (!p) {
@@ -3307,13 +3454,14 @@ static struct task_struct *pick_task_scx(struct rq *rq)
}
if (unlikely(!p->scx.slice)) {
- if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ struct scx_sched *sch = scx_root;
+
+ if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
- scx_warned_zero_slice = true;
+ sch->warned_zero_slice = true;
}
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ refill_task_slice_dfl(p);
}
}
@@ -3342,13 +3490,17 @@ static struct task_struct *pick_task_scx(struct rq *rq)
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi)
{
+ struct scx_sched *sch = scx_root;
+
/*
* The const qualifiers are dropped from task_struct pointers when
* calling ops.core_sched_before(). Accesses are controlled by the
* verifier.
*/
- if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ if (SCX_HAS_OP(sch, core_sched_before) &&
+ !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
+ NULL,
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3360,6 +3512,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ struct scx_sched *sch = scx_root;
bool rq_bypass;
/*
@@ -3376,7 +3529,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
return prev_cpu;
rq_bypass = scx_rq_bypassing(task_rq(p));
- if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
+ if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3384,29 +3537,30 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
- select_cpu, p, prev_cpu, wake_flags);
+ cpu = SCX_CALL_OP_TASK_RET(sch,
+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
+ select_cpu, NULL, p, prev_cpu,
+ wake_flags);
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
- if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
+ if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- p->scx.slice = SCX_SLICE_DFL;
+ refill_task_slice_dfl(p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
} else {
cpu = prev_cpu;
}
p->scx.selected_cpu = cpu;
if (rq_bypass)
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -3419,6 +3573,8 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_scx(struct task_struct *p,
struct affinity_context *ac)
{
+ struct scx_sched *sch = scx_root;
+
set_cpus_allowed_common(p, ac);
/*
@@ -3429,28 +3585,38 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* Fine-grained memory write control is enforced by BPF making the const
* designation pointless. Cast it away when calling the operation.
*/
- if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
- (struct cpumask *)p->cpus_ptr);
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
+ p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
{
+ struct scx_sched *sch = scx_root;
int cpu = cpu_of(rq);
atomic_long_inc(&scx_hotplug_seq);
+ /*
+ * scx_root updates are protected by cpus_read_lock() and will stay
+ * stable here. Note that we can't depend on scx_enabled() test as the
+ * hotplug ops need to be enabled before __scx_enabled is set.
+ */
+ if (unlikely(!sch))
+ return;
+
if (scx_enabled())
- scx_idle_update_selcpu_topology(&scx_ops);
+ scx_idle_update_selcpu_topology(&sch->ops);
- if (online && SCX_HAS_OP(cpu_online))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
- else if (!online && SCX_HAS_OP(cpu_offline))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+ if (online && SCX_HAS_OP(sch, cpu_online))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
+ else if (!online && SCX_HAS_OP(sch, cpu_offline))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
else
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "cpu %d going %s, exiting scheduler", cpu,
- online ? "online" : "offline");
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
}
void scx_rq_activate(struct rq *rq)
@@ -3477,11 +3643,16 @@ static void rq_offline_scx(struct rq *rq)
static bool check_rq_for_timeouts(struct rq *rq)
{
+ struct scx_sched *sch;
struct task_struct *p;
struct rq_flags rf;
bool timed_out = false;
rq_lock_irqsave(rq, &rf);
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
unsigned long last_runnable = p->scx.runnable_at;
@@ -3489,16 +3660,15 @@ static bool check_rq_for_timeouts(struct rq *rq)
last_runnable + scx_watchdog_timeout))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "%s[%d] failed to run for %u.%03us",
- p->comm, p->pid,
- dur_ms / 1000, dur_ms % 1000);
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
timed_out = true;
break;
}
}
+out_unlock:
rq_unlock_irqrestore(rq, &rf);
-
return timed_out;
}
@@ -3520,19 +3690,24 @@ static void scx_watchdog_workfn(struct work_struct *work)
void scx_tick(struct rq *rq)
{
+ struct scx_sched *sch;
unsigned long last_check;
if (!scx_enabled())
return;
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ return;
+
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
last_check + READ_ONCE(scx_watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "watchdog failed to check in for %u.%03us",
- dur_ms / 1000, dur_ms % 1000);
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
}
update_other_load_avgs(rq);
@@ -3540,6 +3715,8 @@ void scx_tick(struct rq *rq)
static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
{
+ struct scx_sched *sch = scx_root;
+
update_curr_scx(rq);
/*
@@ -3549,8 +3726,8 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
if (scx_rq_bypassing(rq)) {
curr->scx.slice = 0;
touch_core_sched(rq, curr);
- } else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
+ } else if (SCX_HAS_OP(sch, tick)) {
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
}
if (!curr->scx.slice)
@@ -3615,21 +3792,23 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
}
-static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
{
+ struct scx_sched *sch = scx_root;
int ret;
p->scx.disallow = false;
- if (SCX_HAS_OP(init_task)) {
+ if (SCX_HAS_OP(sch, init_task)) {
struct scx_init_task_args args = {
SCX_INIT_TASK_ARGS_CGROUP(tg)
.fork = fork,
};
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
+ p, &args);
if (unlikely(ret)) {
- ret = ops_sanitize_err("init_task", ret);
+ ret = ops_sanitize_err(sch, "init_task", ret);
return ret;
}
}
@@ -3657,8 +3836,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
task_rq_unlock(rq, p, &rf);
} else if (p->policy == SCHED_EXT) {
- scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
- p->comm, p->pid);
+ scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
}
}
@@ -3666,11 +3845,13 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
return 0;
}
-static void scx_ops_enable_task(struct task_struct *p)
+static void scx_enable_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
+ struct rq *rq = task_rq(p);
u32 weight;
- lockdep_assert_rq_held(task_rq(p));
+ lockdep_assert_rq_held(rq);
/*
* Set the weight before calling ops.enable() so that the scheduler
@@ -3683,26 +3864,31 @@ static void scx_ops_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
- if (SCX_HAS_OP(enable))
- SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+ if (SCX_HAS_OP(sch, enable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
- if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
}
-static void scx_ops_disable_task(struct task_struct *p)
+static void scx_disable_task(struct task_struct *p)
{
- lockdep_assert_rq_held(task_rq(p));
+ struct scx_sched *sch = scx_root;
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
- if (SCX_HAS_OP(disable))
- SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
+ if (SCX_HAS_OP(sch, disable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
scx_set_task_state(p, SCX_TASK_READY);
}
-static void scx_ops_exit_task(struct task_struct *p)
+static void scx_exit_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
struct scx_exit_task_args args = {
.cancelled = false,
};
@@ -3718,15 +3904,16 @@ static void scx_ops_exit_task(struct task_struct *p)
case SCX_TASK_READY:
break;
case SCX_TASK_ENABLED:
- scx_ops_disable_task(p);
+ scx_disable_task(p);
break;
default:
WARN_ON_ONCE(true);
return;
}
- if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
+ if (SCX_HAS_OP(sch, exit_task))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
+ p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3758,15 +3945,15 @@ int scx_fork(struct task_struct *p)
{
percpu_rwsem_assert_held(&scx_fork_rwsem);
- if (scx_ops_init_task_enabled)
- return scx_ops_init_task(p, task_group(p), true);
+ if (scx_init_task_enabled)
+ return scx_init_task(p, task_group(p), true);
else
return 0;
}
void scx_post_fork(struct task_struct *p)
{
- if (scx_ops_init_task_enabled) {
+ if (scx_init_task_enabled) {
scx_set_task_state(p, SCX_TASK_READY);
/*
@@ -3779,7 +3966,7 @@ void scx_post_fork(struct task_struct *p)
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_enable_task(p);
+ scx_enable_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3799,7 +3986,7 @@ void scx_cancel_fork(struct task_struct *p)
rq = task_rq_lock(p, &rf);
WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
@@ -3815,15 +4002,15 @@ void sched_ext_free(struct task_struct *p)
spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
- * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
- * ENABLED transitions can't race us. Disable ops for @p.
+ * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
+ * transitions can't race us. Disable ops for @p.
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3831,11 +4018,14 @@ void sched_ext_free(struct task_struct *p)
static void reweight_task_scx(struct rq *rq, struct task_struct *p,
const struct load_weight *lw)
{
+ struct scx_sched *sch = scx_root;
+
lockdep_assert_rq_held(task_rq(p));
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
- if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
}
static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3844,20 +4034,22 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_enable_task(p);
+ struct scx_sched *sch = scx_root;
+
+ scx_enable_task(p);
/*
* set_cpus_allowed_scx() is not called while @p is associated with a
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
- if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
- (struct cpumask *)p->cpus_ptr);
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
+ p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_disable_task(p);
+ scx_disable_task(p);
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
@@ -3902,6 +4094,7 @@ static bool scx_cgroup_enabled;
int scx_tg_online(struct task_group *tg)
{
+ struct scx_sched *sch = scx_root;
int ret = 0;
WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
@@ -3909,14 +4102,14 @@ int scx_tg_online(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled) {
- if (SCX_HAS_OP(cgroup_init)) {
+ if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
{ .weight = tg->scx_weight };
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
- tg->css.cgroup, &args);
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
+ NULL, tg->css.cgroup, &args);
if (ret)
- ret = ops_sanitize_err("cgroup_init", ret);
+ ret = ops_sanitize_err(sch, "cgroup_init", ret);
}
if (ret == 0)
tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
@@ -3930,12 +4123,16 @@ int scx_tg_online(struct task_group *tg)
void scx_tg_offline(struct task_group *tg)
{
+ struct scx_sched *sch = scx_root;
+
WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
percpu_down_read(&scx_cgroup_rwsem);
- if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
+ (tg->scx_flags & SCX_TG_INITED))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ tg->css.cgroup);
tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
@@ -3943,6 +4140,7 @@ void scx_tg_offline(struct task_group *tg)
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
{
+ struct scx_sched *sch = scx_root;
struct cgroup_subsys_state *css;
struct task_struct *p;
int ret;
@@ -3967,8 +4165,9 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
if (from == to)
continue;
- if (SCX_HAS_OP(cgroup_prep_move)) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
+ if (SCX_HAS_OP(sch, cgroup_prep_move)) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
+ cgroup_prep_move, NULL,
p, from, css->cgroup);
if (ret)
goto err;
@@ -3981,18 +4180,21 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
err:
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
- p->scx.cgrp_moving_from, css->cgroup);
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
percpu_up_read(&scx_cgroup_rwsem);
- return ops_sanitize_err("cgroup_prep_move", ret);
+ return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
void scx_cgroup_move_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
+
if (!scx_cgroup_enabled)
return;
@@ -4000,9 +4202,11 @@ void scx_cgroup_move_task(struct task_struct *p)
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
- if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
- SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
- p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ if (SCX_HAS_OP(sch, cgroup_move) &&
+ !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
+ p, p->scx.cgrp_moving_from,
+ tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
}
@@ -4013,6 +4217,7 @@ void scx_cgroup_finish_attach(void)
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
+ struct scx_sched *sch = scx_root;
struct cgroup_subsys_state *css;
struct task_struct *p;
@@ -4020,9 +4225,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
goto out_unlock;
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
- p->scx.cgrp_moving_from, css->cgroup);
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
out_unlock:
@@ -4031,11 +4237,13 @@ out_unlock:
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
+ struct scx_sched *sch = scx_root;
+
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && tg->scx_weight != weight) {
- if (SCX_HAS_OP(cgroup_set_weight))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
+ if (SCX_HAS_OP(sch, cgroup_set_weight))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
tg_cgrp(tg), weight);
tg->scx_weight = weight;
}
@@ -4124,29 +4332,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
dsq->id = dsq_id;
}
-static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
-{
- struct scx_dispatch_q *dsq;
- int ret;
-
- if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
- return ERR_PTR(-EINVAL);
-
- dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq)
- return ERR_PTR(-ENOMEM);
-
- init_dsq(dsq, dsq_id);
-
- ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
- dsq_hash_params);
- if (ret) {
- kfree(dsq);
- return ERR_PTR(ret);
- }
- return dsq;
-}
-
static void free_dsq_irq_workfn(struct irq_work *irq_work)
{
struct llist_node *to_free = llist_del_all(&dsqs_to_free);
@@ -4158,26 +4343,27 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
-static void destroy_dsq(u64 dsq_id)
+static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
{
struct scx_dispatch_q *dsq;
unsigned long flags;
rcu_read_lock();
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (!dsq)
goto out_unlock_rcu;
raw_spin_lock_irqsave(&dsq->lock, flags);
if (dsq->nr) {
- scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
- dsq->id, dsq->nr);
+ scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
+ dsq->id, dsq->nr);
goto out_unlock_dsq;
}
- if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
+ if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params))
goto out_unlock_dsq;
/*
@@ -4197,7 +4383,7 @@ out_unlock_rcu:
}
#ifdef CONFIG_EXT_GROUP_SCHED
-static void scx_cgroup_exit(void)
+static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
@@ -4217,14 +4403,15 @@ static void scx_cgroup_exit(void)
continue;
tg->scx_flags &= ~SCX_TG_INITED;
- if (!scx_ops.cgroup_exit)
+ if (!sch->ops.cgroup_exit)
continue;
if (WARN_ON_ONCE(!css_tryget(css)))
continue;
rcu_read_unlock();
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ css->cgroup);
rcu_read_lock();
css_put(css);
@@ -4232,7 +4419,7 @@ static void scx_cgroup_exit(void)
rcu_read_unlock();
}
-static int scx_cgroup_init(void)
+static int scx_cgroup_init(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
int ret;
@@ -4252,7 +4439,7 @@ static int scx_cgroup_init(void)
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
- if (!scx_ops.cgroup_init) {
+ if (!sch->ops.cgroup_init) {
tg->scx_flags |= SCX_TG_INITED;
continue;
}
@@ -4261,11 +4448,11 @@ static int scx_cgroup_init(void)
continue;
rcu_read_unlock();
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
css_put(css);
- scx_ops_error("ops.cgroup_init() failed (%d)", ret);
+ scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
tg->scx_flags |= SCX_TG_INITED;
@@ -4282,8 +4469,8 @@ static int scx_cgroup_init(void)
}
#else
-static void scx_cgroup_exit(void) {}
-static int scx_cgroup_init(void) { return 0; }
+static void scx_cgroup_exit(struct scx_sched *sch) {}
+static int scx_cgroup_init(struct scx_sched *sch) { return 0; }
#endif
@@ -4300,8 +4487,7 @@ static int scx_cgroup_init(void) { return 0; }
static ssize_t scx_attr_state_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- scx_ops_enable_state_str[scx_ops_enable_state()]);
+ return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
}
SCX_ATTR(state);
@@ -4346,15 +4532,51 @@ static const struct attribute_group scx_global_attr_group = {
.attrs = scx_global_attrs,
};
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+ struct rcu_work *rcu_work = to_rcu_work(work);
+ struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+ struct rhashtable_iter rht_iter;
+ struct scx_dispatch_q *dsq;
+ int node;
+
+ kthread_stop(sch->helper->task);
+ free_percpu(sch->event_stats_cpu);
+
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+
+ rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
+ do {
+ rhashtable_walk_start(&rht_iter);
+
+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ destroy_dsq(sch, dsq->id);
+
+ rhashtable_walk_stop(&rht_iter);
+ } while (dsq == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&rht_iter);
+
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+ free_exit_info(sch->exit_info);
+ kfree(sch);
+}
+
static void scx_kobj_release(struct kobject *kobj)
{
- kfree(kobj);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+ queue_rcu_work(system_unbound_wq, &sch->rcu_work);
}
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_ops.name);
+ return sysfs_emit(buf, "%s\n", scx_root->ops.name);
}
SCX_ATTR(ops);
@@ -4365,16 +4587,17 @@ SCX_ATTR(ops);
static ssize_t scx_attr_events_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
struct scx_event_stats events;
int at = 0;
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(sch, &events);
at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -4397,7 +4620,7 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
+ return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -4410,14 +4633,20 @@ static const struct kset_uevent_ops scx_uevent_ops = {
*/
bool task_should_scx(int policy)
{
- if (!scx_enabled() ||
- unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
+ if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
return false;
if (READ_ONCE(scx_switching_all))
return true;
return policy == SCHED_EXT;
}
+bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ return !scx_enabled() ||
+ (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
+ p->sched_class != &ext_sched_class;
+}
+
/**
* scx_softlockup - sched_ext softlockup handler
* @dur_s: number of seconds of CPU stuck due to soft lockup
@@ -4430,40 +4659,48 @@ bool task_should_scx(int policy)
*/
void scx_softlockup(u32 dur_s)
{
- switch (scx_ops_enable_state()) {
- case SCX_OPS_ENABLING:
- case SCX_OPS_ENABLED:
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
break;
default:
- return;
+ goto out_unlock;
}
- /* allow only one instance, cleared at the end of scx_ops_bypass() */
+ /* allow only one instance, cleared at the end of scx_bypass() */
if (test_and_set_bit(0, &scx_in_softlockup))
- return;
+ goto out_unlock;
printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
- smp_processor_id(), dur_s, scx_ops.name);
+ smp_processor_id(), dur_s, scx_root->ops.name);
/*
* Some CPUs may be trapped in the dispatch paths. Enable breather
- * immediately; otherwise, we might even be able to get to
- * scx_ops_bypass().
+ * immediately; otherwise, we might even be able to get to scx_bypass().
*/
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
- scx_ops_error("soft lockup - CPU#%d stuck for %us",
- smp_processor_id(), dur_s);
+ scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
+out_unlock:
+ rcu_read_unlock();
}
static void scx_clear_softlockup(void)
{
if (test_and_clear_bit(0, &scx_in_softlockup))
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
}
/**
- * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
@@ -4493,32 +4730,36 @@ static void scx_clear_softlockup(void)
*
* - scx_prio_less() reverts to the default core_sched_at order.
*/
-static void scx_ops_bypass(bool bypass)
+static void scx_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
static unsigned long bypass_timestamp;
-
- int cpu;
+ struct scx_sched *sch;
unsigned long flags;
+ int cpu;
raw_spin_lock_irqsave(&bypass_lock, flags);
+ sch = rcu_dereference_bh(scx_root);
+
if (bypass) {
- scx_ops_bypass_depth++;
- WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
- if (scx_ops_bypass_depth != 1)
+ scx_bypass_depth++;
+ WARN_ON_ONCE(scx_bypass_depth <= 0);
+ if (scx_bypass_depth != 1)
goto unlock;
bypass_timestamp = ktime_get_ns();
- scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
} else {
- scx_ops_bypass_depth--;
- WARN_ON_ONCE(scx_ops_bypass_depth < 0);
- if (scx_ops_bypass_depth != 0)
+ scx_bypass_depth--;
+ WARN_ON_ONCE(scx_bypass_depth < 0);
+ if (scx_bypass_depth != 0)
goto unlock;
- scx_add_event(SCX_EV_BYPASS_DURATION,
- ktime_get_ns() - bypass_timestamp);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
/*
* No task property is changing. We just need to make sure all currently
@@ -4576,7 +4817,7 @@ static void scx_ops_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
scx_clear_softlockup();
@@ -4632,42 +4873,36 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
-static void scx_ops_disable_workfn(struct kthread_work *work)
+static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
+ struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
- struct rhashtable_iter rht_iter;
- struct scx_dispatch_q *dsq;
- int i, kind, cpu;
+ int kind, cpu;
- kind = atomic_read(&scx_exit_kind);
+ kind = atomic_read(&sch->exit_kind);
while (true) {
- /*
- * NONE indicates that a new scx_ops has been registered since
- * disable was scheduled - don't kill the new ops. DONE
- * indicates that the ops has already been disabled.
- */
- if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
+ if (kind == SCX_EXIT_DONE) /* already disabled? */
return;
- if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
+ WARN_ON_ONCE(kind == SCX_EXIT_NONE);
+ if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
break;
}
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
/* guarantee forward progress by bypassing scx_ops */
- scx_ops_bypass(true);
+ scx_bypass(true);
- switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
- case SCX_OPS_DISABLING:
+ switch (scx_set_enable_state(SCX_DISABLING)) {
+ case SCX_DISABLING:
WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
break;
- case SCX_OPS_DISABLED:
+ case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
- scx_exit_info->msg);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ sch->exit_info->msg);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done;
default:
break;
@@ -4678,17 +4913,17 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
* we can safely use blocking synchronization constructs. Actually
* disable ops.
*/
- mutex_lock(&scx_ops_enable_mutex);
+ mutex_lock(&scx_enable_mutex);
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);
/*
* Shut down cgroup support before tasks so that the cgroup attach path
- * doesn't race against scx_ops_exit_task().
+ * doesn't race against scx_exit_task().
*/
scx_cgroup_lock();
- scx_cgroup_exit();
+ scx_cgroup_exit(sch);
scx_cgroup_unlock();
/*
@@ -4697,7 +4932,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
*/
percpu_down_write(&scx_fork_rwsem);
- scx_ops_init_task_enabled = false;
+ scx_init_task_enabled = false;
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
@@ -4717,7 +4952,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -4732,98 +4967,71 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
- static_branch_disable(&__scx_ops_enabled);
- for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
- static_branch_disable(&scx_has_op[i]);
- static_branch_disable(&scx_ops_allow_queued_wakeup);
- static_branch_disable(&scx_ops_enq_last);
- static_branch_disable(&scx_ops_enq_exiting);
- static_branch_disable(&scx_ops_enq_migration_disabled);
- static_branch_disable(&scx_ops_cpu_preempt);
+ static_branch_disable(&__scx_enabled);
+ bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ sch->ops.name, ei->reason);
if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
} else {
pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ sch->ops.name, ei->reason);
}
- if (scx_ops.exit)
- SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
/*
- * Delete the kobject from the hierarchy eagerly in addition to just
- * dropping a reference. Otherwise, if the object is deleted
- * asynchronously, sysfs could observe an object of the same name still
- * in the hierarchy when another scheduler is loaded.
+ * scx_root clearing must be inside cpus_read_lock(). See
+ * handle_hotplug().
*/
- kobject_del(scx_root_kobj);
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
-
- memset(&scx_ops, 0, sizeof(scx_ops));
-
- rhashtable_walk_enter(&dsq_hash, &rht_iter);
- do {
- rhashtable_walk_start(&rht_iter);
-
- while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
- destroy_dsq(dsq->id);
+ cpus_read_lock();
+ RCU_INIT_POINTER(scx_root, NULL);
+ cpus_read_unlock();
- rhashtable_walk_stop(&rht_iter);
- } while (dsq == ERR_PTR(-EAGAIN));
- rhashtable_walk_exit(&rht_iter);
+ /*
+ * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+ * could observe an object of the same name still in the hierarchy when
+ * the next scheduler is loaded.
+ */
+ kobject_del(&sch->kobj);
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
-
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
- scx_ops_bypass(false);
+ scx_bypass(false);
}
-static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
-
-static void schedule_scx_ops_disable_work(void)
-{
- struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
-
- /*
- * We may be called spuriously before the first bpf_sched_ext_reg(). If
- * scx_ops_helper isn't set up yet, there's nothing to do.
- */
- if (helper)
- kthread_queue_work(helper, &scx_ops_disable_work);
-}
-
-static void scx_ops_disable(enum scx_exit_kind kind)
+static void scx_disable(enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
kind = SCX_EXIT_ERROR;
- atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
-
- schedule_scx_ops_disable_work();
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ kthread_queue_work(sch->helper, &sch->disable_work);
+ }
+ rcu_read_unlock();
}
static void dump_newline(struct seq_buf *s)
@@ -4941,6 +5149,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
struct task_struct *p, char marker)
{
static unsigned long bt[SCX_EXIT_BT_LEN];
+ struct scx_sched *sch = scx_root;
char dsq_id_buf[19] = "(n/a)";
unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
unsigned int bt_len = 0;
@@ -4963,9 +5172,9 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
- if (SCX_HAS_OP(dump_task)) {
+ if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p);
ops_dump_exit();
}
@@ -4982,6 +5191,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
{
static DEFINE_SPINLOCK(dump_lock);
static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ struct scx_sched *sch = scx_root;
struct scx_dump_ctx dctx = {
.kind = ei->kind,
.exit_code = ei->exit_code,
@@ -5010,9 +5220,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
}
- if (SCX_HAS_OP(dump)) {
+ if (SCX_HAS_OP(sch, dump)) {
ops_dump_init(&s, "");
- SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx);
ops_dump_exit();
}
@@ -5033,7 +5243,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
idle = list_empty(&rq->scx.runnable_list) &&
rq->curr->sched_class == &idle_sched_class;
- if (idle && !SCX_HAS_OP(dump_cpu))
+ if (idle && !SCX_HAS_OP(sch, dump_cpu))
goto next;
/*
@@ -5067,9 +5277,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
cpumask_pr_args(rq->scx.cpus_to_wait));
used = seq_buf_used(&ns);
- if (SCX_HAS_OP(dump_cpu)) {
+ if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
+ &dctx, cpu, idle);
ops_dump_exit();
}
@@ -5103,13 +5314,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_line(&s, "Event counters");
dump_line(&s, "--------------");
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(sch, &events);
scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -5121,27 +5332,25 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
spin_unlock_irqrestore(&dump_lock, flags);
}
-static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
+static void scx_error_irq_workfn(struct irq_work *irq_work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+ struct scx_exit_info *ei = sch->exit_info;
if (ei->kind >= SCX_EXIT_ERROR)
- scx_dump_state(ei, scx_ops.exit_dump_len);
+ scx_dump_state(ei, sch->ops.exit_dump_len);
- schedule_scx_ops_disable_work();
+ kthread_queue_work(sch->helper, &sch->disable_work);
}
-static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
-
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...)
+static void scx_vexit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, va_list args)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_exit_info *ei = sch->exit_info;
int none = SCX_EXIT_NONE;
- va_list args;
- if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
+ if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
return;
ei->exit_code = exit_code;
@@ -5149,31 +5358,98 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
if (kind >= SCX_EXIT_ERROR)
ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
#endif
- va_start(args, fmt);
vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
- va_end(args);
/*
* Set ei->kind and ->reason for scx_dump_state(). They'll be set again
- * in scx_ops_disable_workfn().
+ * in scx_disable_workfn().
*/
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- irq_work_queue(&scx_ops_error_irq_work);
+ irq_work_queue(&sch->error_irq_work);
}
-static struct kthread_worker *scx_create_rt_helper(const char *name)
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
{
- struct kthread_worker *helper;
+ struct scx_sched *sch;
+ int node, ret;
- helper = kthread_run_worker(0, name);
- if (helper)
- sched_set_fifo(helper->task);
- return helper;
-}
+ sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+ if (!sch)
+ return ERR_PTR(-ENOMEM);
+
+ sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!sch->exit_info) {
+ ret = -ENOMEM;
+ goto err_free_sch;
+ }
+
+ ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
+ if (ret < 0)
+ goto err_free_ei;
-static void check_hotplug_seq(const struct sched_ext_ops *ops)
+ sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]),
+ GFP_KERNEL);
+ if (!sch->global_dsqs) {
+ ret = -ENOMEM;
+ goto err_free_hash;
+ }
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct scx_dispatch_q *dsq;
+
+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq) {
+ ret = -ENOMEM;
+ goto err_free_gdsqs;
+ }
+
+ init_dsq(dsq, SCX_DSQ_GLOBAL);
+ sch->global_dsqs[node] = dsq;
+ }
+
+ sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+ if (!sch->event_stats_cpu)
+ goto err_free_gdsqs;
+
+ sch->helper = kthread_run_worker(0, "sched_ext_helper");
+ if (!sch->helper)
+ goto err_free_event_stats;
+ sched_set_fifo(sch->helper->task);
+
+ atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+ kthread_init_work(&sch->disable_work, scx_disable_workfn);
+ sch->ops = *ops;
+ ops->priv = sch;
+
+ sch->kobj.kset = scx_kset;
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err_stop_helper;
+
+ return sch;
+
+err_stop_helper:
+ kthread_stop(sch->helper->task);
+err_free_event_stats:
+ free_percpu(sch->event_stats_cpu);
+err_free_gdsqs:
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+err_free_hash:
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+err_free_ei:
+ free_exit_info(sch->exit_info);
+err_free_sch:
+ kfree(sch);
+ return ERR_PTR(ret);
+}
+
+static void check_hotplug_seq(struct scx_sched *sch,
+ const struct sched_ext_ops *ops)
{
unsigned long long global_hotplug_seq;
@@ -5185,21 +5461,22 @@ static void check_hotplug_seq(const struct sched_ext_ops *ops)
if (ops->hotplug_seq) {
global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
if (ops->hotplug_seq != global_hotplug_seq) {
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "expected hotplug seq %llu did not match actual %llu",
- ops->hotplug_seq, global_hotplug_seq);
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "expected hotplug seq %llu did not match actual %llu",
+ ops->hotplug_seq, global_hotplug_seq);
}
}
}
-static int validate_ops(const struct sched_ext_ops *ops)
+static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
{
/*
* It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
* ops.enqueue() callback isn't implemented.
*/
if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
- scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
+ scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
return -EINVAL;
}
@@ -5209,7 +5486,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
*/
if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
(ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
return -EINVAL;
}
@@ -5219,12 +5496,13 @@ static int validate_ops(const struct sched_ext_ops *ops)
return 0;
}
-static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
+ struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
- int i, cpu, node, ret;
+ int i, cpu, ret;
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
@@ -5232,87 +5510,25 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return -EINVAL;
}
- mutex_lock(&scx_ops_enable_mutex);
-
- /*
- * Clear event counters so a new scx scheduler gets
- * fresh event counter values.
- */
- for_each_possible_cpu(cpu) {
- struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
- memset(e, 0, sizeof(*e));
- }
-
- if (!scx_ops_helper) {
- WRITE_ONCE(scx_ops_helper,
- scx_create_rt_helper("sched_ext_ops_helper"));
- if (!scx_ops_helper) {
- ret = -ENOMEM;
- goto err_unlock;
- }
- }
-
- if (!global_dsqs) {
- struct scx_dispatch_q **dsqs;
-
- dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
- if (!dsqs) {
- ret = -ENOMEM;
- goto err_unlock;
- }
+ mutex_lock(&scx_enable_mutex);
- for_each_node_state(node, N_POSSIBLE) {
- struct scx_dispatch_q *dsq;
-
- dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq) {
- for_each_node_state(node, N_POSSIBLE)
- kfree(dsqs[node]);
- kfree(dsqs);
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- init_dsq(dsq, SCX_DSQ_GLOBAL);
- dsqs[node] = dsq;
- }
-
- global_dsqs = dsqs;
- }
-
- if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ if (scx_enable_state() != SCX_DISABLED) {
ret = -EBUSY;
goto err_unlock;
}
- scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
- if (!scx_root_kobj) {
- ret = -ENOMEM;
+ sch = scx_alloc_and_add_sched(ops);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
goto err_unlock;
}
- scx_root_kobj->kset = scx_kset;
- ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
- if (ret < 0)
- goto err;
-
- scx_exit_info = alloc_exit_info(ops->exit_dump_len);
- if (!scx_exit_info) {
- ret = -ENOMEM;
- goto err_del;
- }
-
/*
- * Set scx_ops, transition to ENABLING and clear exit info to arm the
- * disable path. Failure triggers full disabling from here on.
+ * Transition to ENABLING and clear exit info to arm the disable path.
+ * Failure triggers full disabling from here on.
*/
- scx_ops = *ops;
-
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
- SCX_OPS_DISABLED);
-
- atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
- scx_warned_zero_slice = false;
+ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
+ WARN_ON_ONCE(scx_root);
atomic_long_set(&scx_nr_rejected, 0);
@@ -5325,28 +5541,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
cpus_read_lock();
+ /*
+ * Make the scheduler instance visible. Must be inside cpus_read_lock().
+ * See handle_hotplug().
+ */
+ rcu_assign_pointer(scx_root, sch);
+
scx_idle_enable(ops);
- if (scx_ops.init) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ if (sch->ops.init) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
if (ret) {
- ret = ops_sanitize_err("init", ret);
+ ret = ops_sanitize_err(sch, "init", ret);
cpus_read_unlock();
- scx_ops_error("ops.init() failed (%d)", ret);
+ scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable_cpuslocked(&scx_has_op[i]);
+ set_bit(i, sch->has_op);
- check_hotplug_seq(ops);
+ check_hotplug_seq(sch, ops);
scx_idle_update_selcpu_topology(ops);
cpus_read_unlock();
- ret = validate_ops(ops);
+ ret = validate_ops(sch, ops);
if (ret)
goto err_disable;
@@ -5371,27 +5593,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_watchdog_timeout / 2);
/*
- * Once __scx_ops_enabled is set, %current can be switched to SCX
- * anytime. This can lead to stalls as some BPF schedulers (e.g.
- * userspace scheduling) may not function correctly before all tasks are
- * switched. Init in bypass mode to guarantee forward progress.
+ * Once __scx_enabled is set, %current can be switched to SCX anytime.
+ * This can lead to stalls as some BPF schedulers (e.g. userspace
+ * scheduling) may not function correctly before all tasks are switched.
+ * Init in bypass mode to guarantee forward progress.
*/
- scx_ops_bypass(true);
+ scx_bypass(true);
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable(&scx_has_op[i]);
-
- if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
- static_branch_enable(&scx_ops_allow_queued_wakeup);
- if (ops->flags & SCX_OPS_ENQ_LAST)
- static_branch_enable(&scx_ops_enq_last);
- if (ops->flags & SCX_OPS_ENQ_EXITING)
- static_branch_enable(&scx_ops_enq_exiting);
- if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
- static_branch_enable(&scx_ops_enq_migration_disabled);
- if (scx_ops.cpu_acquire || scx_ops.cpu_release)
- static_branch_enable(&scx_ops_cpu_preempt);
+ set_bit(i, sch->has_op);
+
+ if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+ sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -5399,8 +5613,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
percpu_down_write(&scx_fork_rwsem);
- WARN_ON_ONCE(scx_ops_init_task_enabled);
- scx_ops_init_task_enabled = true;
+ WARN_ON_ONCE(scx_init_task_enabled);
+ scx_init_task_enabled = true;
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
@@ -5409,14 +5623,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* tasks. Prep all tasks first and then enable them with preemption
* disabled.
*
- * All cgroups should be initialized before scx_ops_init_task() so that
- * the BPF scheduler can reliably track each task's cgroup membership
- * from scx_ops_init_task(). Lock out cgroup on/offlining and task
- * migrations while tasks are being initialized so that
- * scx_cgroup_can_attach() never sees uninitialized tasks.
+ * All cgroups should be initialized before scx_init_task() so that the
+ * BPF scheduler can reliably track each task's cgroup membership from
+ * scx_init_task(). Lock out cgroup on/offlining and task migrations
+ * while tasks are being initialized so that scx_cgroup_can_attach()
+ * never sees uninitialized tasks.
*/
scx_cgroup_lock();
- ret = scx_cgroup_init();
+ ret = scx_cgroup_init(sch);
if (ret)
goto err_disable_unlock_all;
@@ -5432,13 +5646,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_unlock(&sti);
- ret = scx_ops_init_task(p, task_group(p), false);
+ ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
- scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
- ret, p->comm, p->pid);
+ scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
+ ret, p->comm, p->pid);
goto err_disable_unlock_all;
}
@@ -5456,7 +5670,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* all eligible tasks.
*/
WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
- static_branch_enable(&__scx_ops_enabled);
+ static_branch_enable(&__scx_enabled);
/*
* We're fully committed and can't fail. The task READY -> ENABLED
@@ -5487,10 +5701,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
- if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
- WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
goto err_disable;
}
@@ -5498,44 +5712,35 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable(&__scx_switched_all);
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
- scx_ops.name, scx_switched_all() ? "" : " (partial)");
- kobject_uevent(scx_root_kobj, KOBJ_ADD);
- mutex_unlock(&scx_ops_enable_mutex);
+ sch->ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
+ mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq);
return 0;
-err_del:
- kobject_del(scx_root_kobj);
-err:
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
- if (scx_exit_info) {
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
- }
err_unlock:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
return ret;
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
err_disable:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
/*
* Returning an error code here would not pass all the error information
- * to userspace. Record errno using scx_ops_error() for cases
- * scx_ops_error() wasn't already invoked and exit indicating success so
- * that the error is notified through ops.exit() with all the details.
+ * to userspace. Record errno using scx_error() for cases scx_error()
+ * wasn't already invoked and exit indicating success so that the error
+ * is notified through ops.exit() with all the details.
*
- * Flush scx_ops_disable_work to ensure that error is reported before
- * init completion.
+ * Flush scx_disable_work to ensure that error is reported before init
+ * completion. sch's base reference will be put by bpf_scx_unreg().
*/
- scx_ops_error("scx_ops_enable() failed (%d)", ret);
- kthread_flush_work(&scx_ops_disable_work);
+ scx_error(sch, "scx_enable() failed (%d)", ret);
+ kthread_flush_work(&sch->disable_work);
return 0;
}
@@ -5679,13 +5884,17 @@ static int bpf_scx_check_member(const struct btf_type *t,
static int bpf_scx_reg(void *kdata, struct bpf_link *link)
{
- return scx_ops_enable(kdata, link);
+ return scx_enable(kdata, link);
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
- scx_ops_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&scx_ops_disable_work);
+ struct sched_ext_ops *ops = kdata;
+ struct scx_sched *sch = ops->priv;
+
+ scx_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&sch->disable_work);
+ kobject_put(&sch->kobj);
}
static int bpf_scx_init(struct btf *btf)
@@ -5807,10 +6016,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
static void sysrq_handle_sched_ext_reset(u8 key)
{
- if (scx_ops_helper)
- scx_ops_disable(SCX_EXIT_SYSRQ);
- else
- pr_info("sched_ext: BPF scheduler not yet used\n");
+ scx_disable(SCX_EXIT_SYSRQ);
}
static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
@@ -5958,13 +6164,14 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
*/
void print_scx_info(const char *log_lvl, struct task_struct *p)
{
- enum scx_ops_enable_state state = scx_ops_enable_state();
+ struct scx_sched *sch = scx_root;
+ enum scx_enable_state state = scx_enable_state();
const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
char runnable_at_buf[22] = "?";
struct sched_class *class;
unsigned long runnable_at;
- if (state == SCX_OPS_DISABLED)
+ if (state == SCX_DISABLED)
return;
/*
@@ -5973,8 +6180,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
*/
if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
class != &ext_sched_class) {
- printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
- scx_ops_enable_state_str[state], all);
+ printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
+ scx_enable_state_str[state], all);
return;
}
@@ -5985,7 +6192,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
/* print everything onto one line to conserve console space */
printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
- log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+ log_lvl, sch->ops.name, scx_enable_state_str[state], all,
runnable_at_buf);
}
@@ -6001,12 +6208,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
case PM_RESTORE_PREPARE:
- scx_ops_bypass(true);
+ scx_bypass(true);
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
case PM_POST_RESTORE:
- scx_ops_bypass(false);
+ scx_bypass(false);
break;
}
@@ -6029,7 +6236,6 @@ void __init init_sched_ext_class(void)
WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
SCX_TG_ONLINE);
- BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
scx_idle_init_masks();
scx_kick_cpus_pnt_seqs =
@@ -6073,12 +6279,12 @@ static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_ops_error("called with NULL task");
+ scx_kf_error("called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
return false;
}
@@ -6098,7 +6304,7 @@ static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_ops_error("dispatch buffer overflow");
+ scx_kf_error("dispatch buffer overflow");
return;
}
@@ -6230,6 +6436,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
@@ -6264,7 +6471,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
* cause similar live-lock conditions as consume_dispatch_q(). Insert a
* breather if necessary.
*/
- scx_ops_breather(src_rq);
+ scx_breather(src_rq);
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -6282,7 +6489,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
}
/* @p is still on $src_dsq and stable, determine the destination */
- dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+ dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
/*
* Apply vtime and slice updates before moving so that the new time is
@@ -6295,7 +6502,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
p->scx.slice = kit->slice;
/* execute move */
- locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq);
+ locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq);
dispatched = true;
out:
if (in_balance) {
@@ -6343,7 +6550,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_ops_error("dispatch buffer underflow");
+ scx_kf_error("dispatch buffer underflow");
}
/**
@@ -6362,21 +6569,22 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
if (!scx_kf_allowed(SCX_KF_DISPATCH))
return false;
- flush_dispatch_buf(dspc->rq);
+ flush_dispatch_buf(sch, dspc->rq);
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+ scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id);
return false;
}
- if (consume_dispatch_q(dspc->rq, dsq)) {
+ if (consume_dispatch_q(sch, dspc->rq, dsq)) {
/*
* A successfully consumed task can be dequeued before it starts
* running while the CPU is trying to migrate other dispatched
@@ -6630,10 +6838,36 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
{
+ struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
+ s32 ret;
+
if (unlikely(node >= (int)nr_node_ids ||
(node < 0 && node != NUMA_NO_NODE)))
return -EINVAL;
- return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN))
+ return -EINVAL;
+
+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq)
+ return -ENOMEM;
+
+ init_dsq(dsq, dsq_id);
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
+ else
+ ret = -ENODEV;
+
+ rcu_read_unlock();
+ if (ret)
+ kfree(dsq);
+ return ret;
}
__bpf_kfunc_end_defs();
@@ -6672,7 +6906,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *this_rq;
unsigned long irq_flags;
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6696,7 +6930,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6729,23 +6963,30 @@ out:
*/
__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
{
+ struct scx_sched *sch;
struct scx_dispatch_q *dsq;
s32 ret;
preempt_disable();
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
if (dsq_id == SCX_DSQ_LOCAL) {
ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
goto out;
} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
- if (ops_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
goto out;
}
} else {
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (dsq) {
ret = READ_ONCE(dsq->nr);
goto out;
@@ -6768,7 +7009,13 @@ out:
*/
__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
{
- destroy_dsq(dsq_id);
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ destroy_dsq(sch, dsq_id);
+ rcu_read_unlock();
}
/**
@@ -6785,16 +7032,27 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
u64 flags)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ struct scx_sched *sch;
BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
sizeof(struct bpf_iter_scx_dsq));
BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
__alignof__(struct bpf_iter_scx_dsq));
+ /*
+ * next() and destroy() will be called regardless of the return value.
+ * Always clear $kit->dsq.
+ */
+ kit->dsq = NULL;
+
+ sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held());
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
- kit->dsq = find_user_dsq(dsq_id);
+ kit->dsq = find_user_dsq(sch, dsq_id);
if (!kit->dsq)
return -ENOENT;
@@ -6884,21 +7142,20 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_ops_error("invalid data=%p and data__sz=%u",
- (void *)data, data__sz);
+ scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_ops_error("failed to read data fields (%d)", ret);
+ scx_kf_error("failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_ops_error("format preparation failed (%d)", ret);
+ scx_kf_error("format preparation failed (%d)", ret);
return ret;
}
@@ -6906,8 +7163,7 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_ops_error("(\"%s\", %p, %u) failed to format",
- fmt, data, data__sz);
+ scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
@@ -6940,8 +7196,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
- scx_exit_bstr_buf.line);
+ scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -6961,8 +7216,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
- scx_exit_bstr_buf.line);
+ scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -6986,7 +7240,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
s32 ret;
if (raw_smp_processor_id() != dd->cpu) {
- scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
@@ -7027,7 +7281,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7049,7 +7303,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7072,18 +7326,37 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (ops_cpu_valid(cpu, NULL)) {
- struct rq *rq = cpu_rq(cpu);
+ if (kf_cpu_valid(cpu, NULL)) {
+ struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
+ struct rq_flags rf;
+
+ /*
+ * When called with an rq lock held, restrict the operation
+ * to the corresponding CPU to prevent ABBA deadlocks.
+ */
+ if (locked_rq && rq != locked_rq) {
+ scx_kf_error("Invalid target CPU %d", cpu);
+ return;
+ }
+
+ /*
+ * If no rq lock is held, allow to operate on any CPU by
+ * acquiring the corresponding rq lock.
+ */
+ if (!locked_rq) {
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ }
rq->scx.cpuperf_target = perf;
+ cpufreq_update_util(rq, 0);
- rcu_read_lock_sched_notrace();
- cpufreq_update_util(cpu_rq(cpu), 0);
- rcu_read_unlock_sched_notrace();
+ if (!locked_rq)
+ rq_unlock_irqrestore(rq, &rf);
}
}
@@ -7161,7 +7434,7 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return NULL;
return cpu_rq(cpu);
@@ -7257,6 +7530,27 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
+{
+ struct scx_event_stats *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into @events. */
+ memset(events, 0, sizeof(*events));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+}
+
/*
* scx_bpf_events - Get a system-wide event counter to
* @events: output buffer from a BPF program
@@ -7265,23 +7559,16 @@ __bpf_kfunc u64 scx_bpf_now(void)
__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
size_t events__sz)
{
- struct scx_event_stats e_sys, *e_cpu;
- int cpu;
+ struct scx_sched *sch;
+ struct scx_event_stats e_sys;
- /* Aggregate per-CPU event counters into the system-wide counters. */
- memset(&e_sys, 0, sizeof(e_sys));
- for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
- }
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ scx_read_events(sch, &e_sys);
+ else
+ memset(&e_sys, 0, sizeof(e_sys));
+ rcu_read_unlock();
/*
* We cannot entirely trust a BPF-provided size since a BPF program
@@ -7314,12 +7601,6 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
-BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)