Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/autogroup.c    |    4
-rw-r--r--  kernel/sched/core.c         |    8
-rw-r--r--  kernel/sched/deadline.c     |   54
-rw-r--r--  kernel/sched/ext.c          | 1067
-rw-r--r--  kernel/sched/ext_idle.c     |   43
-rw-r--r--  kernel/sched/ext_internal.h |   29
-rw-r--r--  kernel/sched/sched.h        |    4
7 files changed, 922 insertions, 287 deletions
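
Note on the arg-struct kfuncs added below: ext.c and ext_idle.c gain __scx_bpf_dsq_insert_vtime() and __scx_bpf_select_cpu_and(), which take their extra parameters through a wrapper struct to stay within BPF's five-argument kfunc limit, while BPF schedulers keep calling the familiar scx_bpf_dsq_insert_vtime() / scx_bpf_select_cpu_and() signatures through inline wrappers in common.bpf.h. The following is a rough sketch of what such a BPF-side wrapper looks like; the actual contents of common.bpf.h are not part of this diff, so the declarations here are illustrative only (the args struct fields mirror the kernel-side definition in the patch).

/* Illustrative sketch, not taken from common.bpf.h in this diff. */
struct scx_bpf_dsq_insert_vtime_args {
	/* @p is passed separately because KF_RCU is not transitive */
	u64 dsq_id;
	u64 slice;
	u64 vtime;
	u64 enq_flags;
};

/* kernel kfunc as declared for BPF programs */
extern bool __scx_bpf_dsq_insert_vtime(struct task_struct *p,
				       struct scx_bpf_dsq_insert_vtime_args *args) __ksym;

/* inline wrapper preserving the original five-argument call signature */
static inline bool scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
					    u64 slice, u64 vtime, u64 enq_flags)
{
	struct scx_bpf_dsq_insert_vtime_args args = {
		.dsq_id		= dsq_id,
		.slice		= slice,
		.vtime		= vtime,
		.enq_flags	= enq_flags,
	};

	return __scx_bpf_dsq_insert_vtime(p, &args);
}

A scheduler's ops.enqueue() would keep calling scx_bpf_dsq_insert_vtime(p, dsq_id, 0, vtime, 0) exactly as before, and can now also check the bool return where insertion may legitimately fail.
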
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index cdea931aae30..954137775f38 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * this process can already run with task_group() == prev->tg or we can * race with cgroup code which can read autogroup = prev under rq->lock. * In the latter case for_each_thread() can not miss a migrating thread, - * cpu_cgroup_attach() must not be possible after cgroup_exit() and it - * can't be removed from thread list, we hold ->siglock. + * cpu_cgroup_attach() must not be possible after cgroup_task_exit() + * and it can't be removed from thread list, we hold ->siglock. * * If an exiting thread was already removed from thread list we rely on * sched_autogroup_exit_task(). diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc358c1b6ca9..b7801cd05d5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5143,6 +5143,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); + /* + * sched_ext_dead() must come before cgroup_task_dead() to + * prevent cgroups from being removed while its member tasks are + * visible to SCX schedulers. + */ + sched_ext_dead(prev); + cgroup_task_dead(prev); + /* Task is done with its stack. */ put_task_stack(prev); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 67f540c23717..319439fe1870 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2675,6 +2675,7 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; } +/* Access rule: must be called on local CPU with preemption disabled */ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) @@ -3117,11 +3118,43 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +/* + * This function always returns a non-empty bitmap in @cpus. This is because + * if a root domain has reserved bandwidth for DL tasks, the DL bandwidth + * check will prevent CPU hotplug from deactivating all CPUs in that domain. + */ +static void dl_get_task_effective_cpus(struct task_struct *p, struct cpumask *cpus) +{ + const struct cpumask *hk_msk; + + hk_msk = housekeeping_cpumask(HK_TYPE_DOMAIN); + if (housekeeping_enabled(HK_TYPE_DOMAIN)) { + if (!cpumask_intersects(p->cpus_ptr, hk_msk)) { + /* + * CPUs isolated by isolcpu="domain" always belong to + * def_root_domain. + */ + cpumask_andnot(cpus, cpu_active_mask, hk_msk); + return; + } + } + + /* + * If a root domain holds a DL task, it must have active CPUs. So + * active CPUs can always be found by walking up the task's cpuset + * hierarchy up to the partition root. + */ + cpuset_cpus_allowed_locked(p, cpus); +} + +/* The caller should hold cpuset_mutex */ void dl_add_task_root_domain(struct task_struct *p) { struct rq_flags rf; struct rq *rq; struct dl_bw *dl_b; + unsigned int cpu; + struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (!dl_task(p) || dl_entity_is_special(&p->dl)) { @@ -3129,16 +3162,25 @@ void dl_add_task_root_domain(struct task_struct *p) return; } - rq = __task_rq_lock(p, &rf); - + /* + * Get an active rq, whose rq->rd traces the correct root + * domain. + * Ideally this would be under cpuset reader lock until rq->rd is + * fetched. 
However, sleepable locks cannot nest inside pi_lock, so we + * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex' + * to guarantee the CPU stays in the cpuset. + */ + dl_get_task_effective_cpus(p, msk); + cpu = cpumask_first_and(cpu_active_mask, msk); + BUG_ON(cpu >= nr_cpu_ids); + rq = cpu_rq(cpu); dl_b = &rq->rd->dl_bw; - raw_spin_lock(&dl_b->lock); + /* End of fetching rd */ + raw_spin_lock(&dl_b->lock); __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); - raw_spin_unlock(&dl_b->lock); - - task_rq_unlock(rq, p, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); } void dl_clear_root_domain(struct root_domain *rd) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6827689a0966..05f5a49e9649 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -33,9 +33,10 @@ static DEFINE_MUTEX(scx_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); -static unsigned long scx_in_softlockup; -static atomic_t scx_breather_depth = ATOMIC_INIT(0); static int scx_bypass_depth; +static cpumask_var_t scx_bypass_lb_donee_cpumask; +static cpumask_var_t scx_bypass_lb_resched_cpumask; +static bool scx_aborting; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -68,18 +69,18 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; /* - * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated * lazily when enabling and freed when disabling to avoid waste when sched_ext * isn't active. */ -struct scx_kick_pseqs { +struct scx_kick_syncs { struct rcu_head rcu; - unsigned long seqs[]; + unsigned long syncs[]; }; -static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); +static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); /* * Direct dispatch marker. @@ -143,26 +144,70 @@ static struct scx_dump_data scx_dump_data = { /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; +/* + * Parameters that can be adjusted through /sys/module/sched_ext/parameters. + * There usually is no reason to modify these as normal scheduler operation + * shouldn't be affected by them. The knobs are primarily for debugging. + */ +static u64 scx_slice_dfl = SCX_SLICE_DFL; +static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; +static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; + +static int set_slice_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); +} + +static const struct kernel_param_ops slice_us_param_ops = { + .set = set_slice_us, + .get = param_get_uint, +}; + +static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); +} + +static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { + .set = set_bypass_lb_intv_us, + .get = param_get_uint, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "sched_ext." 
+ +module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); +MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); +module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); +MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); + +#undef MODULE_PARAM_PREFIX + #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); +static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); -static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, +static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); -static __printf(4, 5) void scx_exit(struct scx_sched *sch, +static __printf(4, 5) bool scx_exit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, ...) { va_list args; + bool ret; va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); + ret = scx_vexit(sch, kind, exit_code, fmt, args); va_end(args); + + return ret; } #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -200,7 +245,15 @@ static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); +} + +static const struct sched_class *scx_setscheduler_class(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) + return &stop_sched_class; + + return __setscheduler_class(p->policy, p->prio); } /* @@ -469,19 +522,16 @@ struct scx_task_iter { * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be - * visited as long as they still exist. + * visited as long as they are not dead. 
*/ static void scx_task_iter_start(struct scx_task_iter *iter) { - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & - ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + memset(iter, 0, sizeof(*iter)); raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked_task = NULL; - iter->cnt = 0; iter->list_locked = true; } @@ -547,14 +597,13 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - __scx_task_iter_maybe_relock(iter); - if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - __scx_task_iter_maybe_relock(iter); } + __scx_task_iter_maybe_relock(iter); + list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) return NULL; @@ -755,6 +804,11 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err static void run_deferred(struct rq *rq) { process_ddsp_deferred_locals(rq); + + if (local_read(&rq->scx.reenq_local_deferred)) { + local_set(&rq->scx.reenq_local_deferred, 0); + reenq_local(rq); + } } static void deferred_bal_cb_workfn(struct rq *rq) @@ -775,12 +829,28 @@ static void deferred_irq_workfn(struct irq_work *irq_work) * schedule_deferred - Schedule execution of deferred actions on an rq * @rq: target rq * - * Schedule execution of deferred actions on @rq. Must be called with @rq - * locked. Deferred actions are executed with @rq locked but unpinned, and thus - * can unlock @rq to e.g. migrate tasks to other rqs. + * Schedule execution of deferred actions on @rq. Deferred actions are executed + * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks + * to other rqs. */ static void schedule_deferred(struct rq *rq) { + /* + * Queue an irq work. They are executed on IRQ re-enable which may take + * a bit longer than the scheduler hook in schedule_deferred_locked(). + */ + irq_work_queue(&rq->scx.deferred_irq_work); +} + +/** + * schedule_deferred_locked - Schedule execution of deferred actions on an rq + * @rq: target rq + * + * Schedule execution of deferred actions on @rq. Equivalent to + * schedule_deferred() but requires @rq to be locked and can be more efficient. + */ +static void schedule_deferred_locked(struct rq *rq) +{ lockdep_assert_rq_held(rq); /* @@ -812,12 +882,11 @@ static void schedule_deferred(struct rq *rq) } /* - * No scheduler hooks available. Queue an irq work. They are executed on - * IRQ re-enable which may take a bit longer than the scheduler hooks. - * The above WAKEUP and BALANCE paths should cover most of the cases and - * the time to IRQ re-enable shouldn't be long. + * No scheduler hooks available. Use the generic irq_work path. The + * above WAKEUP and BALANCE paths should cover most of the cases and the + * time to IRQ re-enable shouldn't be long. 
*/ - irq_work_queue(&rq->scx.deferred_irq_work); + schedule_deferred(rq); } /** @@ -902,7 +971,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = READ_ONCE(scx_slice_dfl); __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } @@ -916,7 +985,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, !RB_EMPTY_NODE(&p->scx.dsq_priq)); if (!is_local) { - raw_spin_lock(&dsq->lock); + raw_spin_lock_nested(&dsq->lock, + (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ @@ -965,8 +1036,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, container_of(rbp, struct task_struct, scx.dsq_priq); list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); + /* first task unchanged - no update needed */ } else { list_add(&p->scx.dsq_list.node, &dsq->list); + /* not builtin and new task is at head - use fastpath */ + rcu_assign_pointer(dsq->first_task, p); } } else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ @@ -974,10 +1048,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", dsq->id); - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { list_add(&p->scx.dsq_list.node, &dsq->list); - else + /* new task inserted at head - use fastpath */ + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } else { + bool was_empty; + + was_empty = list_empty(&dsq->list); list_add_tail(&p->scx.dsq_list.node, &dsq->list); + if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } } /* seq records the order tasks are queued, used by BPF DSQ iterator */ @@ -1034,6 +1117,13 @@ static void task_unlink_from_dsq(struct task_struct *p, list_del_init(&p->scx.dsq_list.node); dsq_mod_nr(dsq, -1); + + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { + struct task_struct *first_task; + + first_task = nldsq_next_task(dsq, NULL, false); + rcu_assign_pointer(dsq->first_task, first_task); + } } static void dispatch_dequeue(struct rq *rq, struct task_struct *p) @@ -1041,6 +1131,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &rq->scx.local_dsq; + lockdep_assert_rq_held(rq); + if (!dsq) { /* * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. @@ -1087,6 +1179,20 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } +/* + * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq + * and dsq are locked. 
+ */ +static void dispatch_dequeue_locked(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_held(&dsq->lock); + + task_unlink_from_dsq(p, dsq); + p->scx.dsq = NULL; +} + static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, struct rq *rq, u64 dsq_id, struct task_struct *p) @@ -1192,7 +1298,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); list_add_tail(&p->scx.dsq_list.node, &rq->scx.ddsp_deferred_locals); - schedule_deferred(rq); + schedule_deferred_locked(rq); return; } @@ -1217,6 +1323,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, { struct scx_sched *sch = scx_root; struct task_struct **ddsp_taskp; + struct scx_dispatch_q *dsq; unsigned long qseq; WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); @@ -1235,7 +1342,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (scx_rq_bypassing(rq)) { __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); - goto global; + goto bypass; } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -1284,8 +1391,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, direct: direct_dispatch(sch, p, enq_flags); return; - +local_norefill: + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); + return; local: + dsq = &rq->scx.local_dsq; + goto enqueue; +global: + dsq = find_global_dsq(sch, p); + goto enqueue; +bypass: + dsq = &task_rq(p)->scx.bypass_dsq; + goto enqueue; + +enqueue: /* * For task-ordering, slice refill must be treated as implying the end * of the current slice. Otherwise, the longer @p stays on the CPU, the @@ -1293,14 +1412,7 @@ local: */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); -local_norefill: - dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); - return; - -global: - touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(sch, p); - dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); + dispatch_enqueue(sch, dsq, p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -1741,8 +1853,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, * @p is going from a non-local DSQ to a non-local DSQ. As * $src_dsq is already locked, do an abbreviated dequeue. */ - task_unlink_from_dsq(p, src_dsq); - p->scx.dsq = NULL; + dispatch_dequeue_locked(p, src_dsq); raw_spin_unlock(&src_dsq->lock); dispatch_enqueue(sch, dst_dsq, p, enq_flags); @@ -1751,49 +1862,12 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, return dst_rq; } -/* - * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly - * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artificial delays while the - * bypass mode is switching to guarantee timely completion. 
- */ -static void scx_breather(struct rq *rq) -{ - u64 until; - - lockdep_assert_rq_held(rq); - - if (likely(!atomic_read(&scx_breather_depth))) - return; - - raw_spin_rq_unlock(rq); - - until = ktime_get_ns() + NSEC_PER_MSEC; - - do { - int cnt = 1024; - while (atomic_read(&scx_breather_depth) && --cnt) - cpu_relax(); - } while (atomic_read(&scx_breather_depth) && - time_before64(ktime_get_ns(), until)); - - raw_spin_rq_lock(rq); -} - static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* - * This retry loop can repeatedly race against scx_bypass() dequeueing - * tasks from @dsq trying to put the system into the bypass mode. On - * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock - * the machine into soft lockups. Give a breather. - */ - scx_breather(rq); - - /* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test * @dsq->list without locking and skip if it seems empty. @@ -1806,6 +1880,17 @@ retry: nldsq_for_each_task(p, dsq) { struct rq *task_rq = task_rq(p); + /* + * This loop can lead to multiple lockup scenarios, e.g. the BPF + * scheduler can put an enormous number of affinitized tasks into + * a contended DSQ, or the outer retry loop can repeatedly race + * against scx_bypass() dequeueing tasks from @dsq trying to put + * the system into the bypass mode. This can easily live-lock the + * machine. If aborting, exit from all non-bypass DSQs. + */ + if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS) + break; + if (rq == task_rq) { task_unlink_from_dsq(p, dsq); move_local_task_to_local_dsq(p, 0, dsq, rq); @@ -2089,8 +2174,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (consume_global_dsq(sch, rq)) goto has_tasks; - if (unlikely(!SCX_HAS_OP(sch, dispatch)) || - scx_rq_bypassing(rq) || !scx_rq_online(rq)) + if (scx_rq_bypassing(rq)) { + if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) + goto has_tasks; + else + goto no_tasks; + } + + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) goto no_tasks; dspc->rq = rq; @@ -2241,12 +2332,6 @@ static void switch_class(struct rq *rq, struct task_struct *next) struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; - /* - * Pairs with the smp_load_acquire() issued by a CPU in - * kick_cpus_irq_workfn() who is waiting for this CPU to perform a - * resched. 
- */ - smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; @@ -2286,6 +2371,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct scx_sched *sch = scx_root; + + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ @@ -2332,18 +2421,32 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) +static struct task_struct * +do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) { struct task_struct *prev = rq->curr; bool keep_prev, kick_idle = false; struct task_struct *p; + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + rq_modified_clear(rq); + rq_unpin_lock(rq, rf); balance_one(rq, prev); rq_repin_lock(rq, rf); maybe_queue_balance_callback(rq); - if (rq_modified_above(rq, &ext_sched_class)) + + /* + * If any higher-priority sched class enqueued a runnable task on + * this rq during balance_one(), abort and return RETRY_TASK, so + * that the scheduler loop can restart. + * + * If @force_scx is true, always try to pick a SCHED_EXT task, + * regardless of any higher-priority sched classes activity. + */ + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; @@ -2386,6 +2489,11 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) return p; } +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) +{ + return do_pick_task_scx(rq, rf, false); +} + #ifdef CONFIG_SCHED_CORE /** * scx_prio_less - Task ordering for core-sched @@ -2842,7 +2950,7 @@ void init_scx_entity(struct sched_ext_entity *scx) INIT_LIST_HEAD(&scx->runnable_node); scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; - scx->slice = SCX_SLICE_DFL; + scx->slice = READ_ONCE(scx_slice_dfl); } void scx_pre_fork(struct task_struct *p) @@ -2908,7 +3016,7 @@ void scx_cancel_fork(struct task_struct *p) percpu_up_read(&scx_fork_rwsem); } -void sched_ext_free(struct task_struct *p) +void sched_ext_dead(struct task_struct *p) { unsigned long flags; @@ -3012,6 +3120,7 @@ void scx_tg_init(struct task_group *tg) tg->scx.weight = CGROUP_WEIGHT_DFL; tg->scx.bw_period_us = default_bw_period_us(); tg->scx.bw_quota_us = RUNTIME_INF; + tg->scx.idle = false; } int scx_tg_online(struct task_group *tg) @@ -3160,7 +3269,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - /* TODO: Implement ops->cgroup_set_idle() */ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_ops_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL, + tg_cgrp(tg), idle); + + /* Update the task group's idle state */ + tg->scx.idle = idle; + + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_bandwidth(struct task_group *tg, @@ -3575,38 +3695,55 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) } /** - * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * handle_lockup - sched_ext common lockup handler + * @fmt: format string * - * While there are various reasons why RCU CPU stalls can occur on a system - * that may not be caused by 
the current BPF scheduler, try kicking out the - * current scheduler in an attempt to recover the system to a good state before - * issuing panics. + * Called on system stall or lockup condition and initiates abort of sched_ext + * if enabled, which may resolve the reported lockup. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the lockup. %false if sched_ext is not enabled or abort was already + * initiated by someone else. */ -bool scx_rcu_cpu_stall(void) +static __printf(1, 2) bool handle_lockup(const char *fmt, ...) { struct scx_sched *sch; + va_list args; + bool ret; - rcu_read_lock(); + guard(rcu)(); sch = rcu_dereference(scx_root); - if (unlikely(!sch)) { - rcu_read_unlock(); + if (unlikely(!sch)) return false; - } switch (scx_enable_state()) { case SCX_ENABLING: case SCX_ENABLED: - break; + va_start(args, fmt); + ret = scx_verror(sch, fmt, args); + va_end(args); + return ret; default: - rcu_read_unlock(); return false; } +} - scx_error(sch, "RCU CPU stall detected!"); - rcu_read_unlock(); - - return true; +/** + * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * + * While there are various reasons why RCU CPU stalls can occur on a system + * that may not be caused by the current BPF scheduler, try kicking out the + * current scheduler in an attempt to recover the system to a good state before + * issuing panics. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported RCU stall. %false if sched_ext is not enabled or someone + * else already initiated abort. + */ +bool scx_rcu_cpu_stall(void) +{ + return handle_lockup("RCU CPU stall detected!"); } /** @@ -3617,50 +3754,240 @@ bool scx_rcu_cpu_stall(void) * live-lock the system by making many CPUs target the same DSQ to the point * where soft-lockup detection triggers. This function is called from * soft-lockup watchdog when the triggering point is close and tries to unjam - * the system by enabling the breather and aborting the BPF scheduler. + * the system and aborting the BPF scheduler. */ void scx_softlockup(u32 dur_s) { - struct scx_sched *sch; + if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) + return; - rcu_read_lock(); + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", + smp_processor_id(), dur_s); +} - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - goto out_unlock; +/** + * scx_hardlockup - sched_ext hardlockup handler + * + * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting + * numerous affinitized tasks in a single queue and directing all CPUs at it. + * Try kicking out the current scheduler in an attempt to recover the system to + * a good state before taking more drastic actions. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported hardlockdup. %false if sched_ext is not enabled or + * someone else already initiated abort. 
+ */ +bool scx_hardlockup(int cpu) +{ + if (!handle_lockup("hard lockup - CPU %d", cpu)) + return false; - switch (scx_enable_state()) { - case SCX_ENABLING: - case SCX_ENABLED: - break; - default: - goto out_unlock; + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); + return true; +} + +static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, + struct cpumask *donee_mask, struct cpumask *resched_mask, + u32 nr_donor_target, u32 nr_donee_target) +{ + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + struct task_struct *p, *n; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); + s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; + u32 nr_balanced = 0, min_delta_us; + + /* + * All we want to guarantee is reasonable forward progress. No reason to + * fine tune. Assuming every task on @donor_dsq runs their full slice, + * consider offloading iff the total queued duration is over the + * threshold. + */ + min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; + if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) + return 0; + + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + list_add(&cursor.node, &donor_dsq->list); +resume: + n = container_of(&cursor, struct task_struct, scx.dsq_list); + n = nldsq_next_task(donor_dsq, n, false); + + while ((p = n)) { + struct rq *donee_rq; + struct scx_dispatch_q *donee_dsq; + int donee; + + n = nldsq_next_task(donor_dsq, n, false); + + if (donor_dsq->nr <= nr_donor_target) + break; + + if (cpumask_empty(donee_mask)) + break; + + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); + if (donee >= nr_cpu_ids) + continue; + + donee_rq = cpu_rq(donee); + donee_dsq = &donee_rq->scx.bypass_dsq; + + /* + * $p's rq is not locked but $p's DSQ lock protects its + * scheduling properties making this test safe. + */ + if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) + continue; + + /* + * Moving $p from one non-local DSQ to another. The source rq + * and DSQ are already locked. Do an abbreviated dequeue and + * then perform enqueue without unlocking $donor_dsq. + * + * We don't want to drop and reacquire the lock on each + * iteration as @donor_dsq can be very long and potentially + * highly contended. Donee DSQs are less likely to be contended. + * The nested locking is safe as only this LB moves tasks + * between bypass DSQs. + */ + dispatch_dequeue_locked(p, donor_dsq); + dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); + + /* + * $donee might have been idle and need to be woken up. No need + * to be clever. Kick every CPU that receives tasks. 
+ */ + cpumask_set_cpu(donee, resched_mask); + + if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) + cpumask_clear_cpu(donee, donee_mask); + + nr_balanced++; + if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { + list_move_tail(&cursor.node, &n->scx.dsq_list.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); + cpu_relax(); + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + goto resume; + } } - /* allow only one instance, cleared at the end of scx_bypass() */ - if (test_and_set_bit(0, &scx_in_softlockup)) - goto out_unlock; + list_del_init(&cursor.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); - printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", - smp_processor_id(), dur_s, scx_root->ops.name); + return nr_balanced; +} + +static void bypass_lb_node(struct scx_sched *sch, int node) +{ + const struct cpumask *node_mask = cpumask_of_node(node); + struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; + struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; + u32 nr_target, nr_donor_target; + u32 before_min = U32_MAX, before_max = 0; + u32 after_min = U32_MAX, after_max = 0; + int cpu; + + /* count the target tasks and CPUs */ + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + nr_tasks += nr; + nr_cpus++; + + before_min = min(nr, before_min); + before_max = max(nr, before_max); + } + + if (!nr_cpus) + return; /* - * Some CPUs may be trapped in the dispatch paths. Enable breather - * immediately; otherwise, we might even be able to get to scx_bypass(). + * We don't want CPUs to have more than $nr_donor_target tasks and + * balancing to fill donee CPUs upto $nr_target. Once targets are + * calculated, find the donee CPUs. */ - atomic_inc(&scx_breather_depth); + nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); + nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); - scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s); -out_unlock: - rcu_read_unlock(); + cpumask_clear(donee_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) + cpumask_set_cpu(cpu, donee_mask); + } + + /* iterate !donee CPUs and see if they should be offloaded */ + cpumask_clear(resched_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + struct rq *rq = cpu_rq(cpu); + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + + if (cpumask_empty(donee_mask)) + break; + if (cpumask_test_cpu(cpu, donee_mask)) + continue; + if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) + continue; + + nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, + nr_donor_target, nr_target); + } + + for_each_cpu(cpu, resched_mask) { + struct rq *rq = cpu_rq(cpu); + + raw_spin_rq_lock_irq(rq); + resched_curr(rq); + raw_spin_rq_unlock_irq(rq); + } + + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + after_min = min(nr, after_min); + after_max = max(nr, after_max); + + } + + trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, + before_min, before_max, after_min, after_max); } -static void scx_clear_softlockup(void) +/* + * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine + * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some + * bypass DSQs can be overloaded. 
If there are enough tasks to saturate other + * lightly loaded CPUs, such imbalance can lead to very high execution latency + * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such + * outcomes, a simple load balancing mechanism is implemented by the following + * timer which runs periodically while bypass mode is in effect. + */ +static void scx_bypass_lb_timerfn(struct timer_list *timer) { - if (test_and_clear_bit(0, &scx_in_softlockup)) - atomic_dec(&scx_breather_depth); + struct scx_sched *sch; + int node; + u32 intv_us; + + sch = rcu_dereference_all(scx_root); + if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) + return; + + for_each_node_with_cpus(node) + bypass_lb_node(sch, node); + + intv_us = READ_ONCE(scx_bypass_lb_intv_us); + if (intv_us) + mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); } +static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); + /** * scx_bypass - [Un]bypass scx_ops and guarantee forward progress * @bypass: true for bypass, false for unbypass @@ -3704,25 +4031,34 @@ static void scx_bypass(bool bypass) sch = rcu_dereference_bh(scx_root); if (bypass) { - scx_bypass_depth++; + u32 intv_us; + + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); WARN_ON_ONCE(scx_bypass_depth <= 0); if (scx_bypass_depth != 1) goto unlock; + WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); bypass_timestamp = ktime_get_ns(); if (sch) scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); + + intv_us = READ_ONCE(scx_bypass_lb_intv_us); + if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { + scx_bypass_lb_timer.expires = + jiffies + usecs_to_jiffies(intv_us); + add_timer_global(&scx_bypass_lb_timer); + } } else { - scx_bypass_depth--; + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1); WARN_ON_ONCE(scx_bypass_depth < 0); if (scx_bypass_depth != 0) goto unlock; + WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL); if (sch) scx_add_event(sch, SCX_EV_BYPASS_DURATION, ktime_get_ns() - bypass_timestamp); } - atomic_inc(&scx_breather_depth); - /* * No task property is changing. 
We just need to make sure all currently * queued tasks are re-queued according to the new scx_rq_bypassing() @@ -3778,10 +4114,8 @@ static void scx_bypass(bool bypass) raw_spin_rq_unlock(rq); } - atomic_dec(&scx_breather_depth); unlock: raw_spin_unlock_irqrestore(&bypass_lock, flags); - scx_clear_softlockup(); } static void free_exit_info(struct scx_exit_info *ei) @@ -3834,24 +4168,17 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } -static void free_kick_pseqs_rcu(struct rcu_head *rcu) -{ - struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); - - kvfree(pseqs); -} - -static void free_kick_pseqs(void) +static void free_kick_syncs(void) { int cpu; for_each_possible_cpu(cpu) { - struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); - struct scx_kick_pseqs *to_free; + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); + struct scx_kick_syncs *to_free; - to_free = rcu_replace_pointer(*pseqs, NULL, true); + to_free = rcu_replace_pointer(*ksyncs, NULL, true); if (to_free) - call_rcu(&to_free->rcu, free_kick_pseqs_rcu); + kvfree_rcu(to_free, rcu); } } @@ -3876,6 +4203,7 @@ static void scx_disable_workfn(struct kthread_work *work) /* guarantee forward progress by bypassing scx_ops */ scx_bypass(true); + WRITE_ONCE(scx_aborting, false); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: @@ -3920,8 +4248,7 @@ static void scx_disable_workfn(struct kthread_work *work) while ((p = scx_task_iter_next_locked(&sti))) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; - const struct sched_class *new_class = - __setscheduler_class(p->policy, p->prio); + const struct sched_class *new_class = scx_setscheduler_class(p); update_rq_clock(task_rq(p)); @@ -3989,7 +4316,7 @@ static void scx_disable_workfn(struct kthread_work *work) free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; - free_kick_pseqs(); + free_kick_syncs(); mutex_unlock(&scx_enable_mutex); @@ -3998,9 +4325,24 @@ done: scx_bypass(false); } -static void scx_disable(enum scx_exit_kind kind) +static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; + + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) + return false; + + /* + * Some CPUs may be trapped in the dispatch paths. Set the aborting + * flag to break potential live-lock scenarios, ensuring we can + * successfully reach scx_bypass(). 
+ */ + WRITE_ONCE(scx_aborting, true); + return true; +} + +static void scx_disable(enum scx_exit_kind kind) +{ struct scx_sched *sch; if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) @@ -4009,7 +4351,7 @@ static void scx_disable(enum scx_exit_kind kind) rcu_read_lock(); sch = rcu_dereference(scx_root); if (sch) { - atomic_try_cmpxchg(&sch->exit_kind, &none, kind); + scx_claim_exit(sch, kind); kthread_queue_work(sch->helper, &sch->disable_work); } rcu_read_unlock(); @@ -4238,10 +4580,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_init(&ns, buf, avail); dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", cpu, rq->scx.nr_running, rq->scx.flags, rq->scx.cpu_released, rq->scx.ops_qseq, - rq->scx.pnt_seq); + rq->scx.kick_sync); dump_line(&ns, " curr=%s[%d] class=%ps", rq->curr->comm, rq->curr->pid, rq->curr->sched_class); @@ -4325,15 +4667,14 @@ static void scx_error_irq_workfn(struct irq_work *irq_work) kthread_queue_work(sch->helper, &sch->disable_work); } -static void scx_vexit(struct scx_sched *sch, +static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args) { struct scx_exit_info *ei = sch->exit_info; - int none = SCX_EXIT_NONE; - if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) - return; + if (!scx_claim_exit(sch, kind)) + return false; ei->exit_code = exit_code; #ifdef CONFIG_STACKTRACE @@ -4350,9 +4691,10 @@ static void scx_vexit(struct scx_sched *sch, ei->reason = scx_exit_reason(ei->kind); irq_work_queue(&sch->error_irq_work); + return true; } -static int alloc_kick_pseqs(void) +static int alloc_kick_syncs(void) { int cpu; @@ -4361,19 +4703,19 @@ static int alloc_kick_pseqs(void) * can exceed percpu allocator limits on large machines. 
*/ for_each_possible_cpu(cpu) { - struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); - struct scx_kick_pseqs *new_pseqs; + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); + struct scx_kick_syncs *new_ksyncs; - WARN_ON_ONCE(rcu_access_pointer(*pseqs)); + WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); - new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), - GFP_KERNEL, cpu_to_node(cpu)); - if (!new_pseqs) { - free_kick_pseqs(); + new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), + GFP_KERNEL, cpu_to_node(cpu)); + if (!new_ksyncs) { + free_kick_syncs(); return -ENOMEM; } - rcu_assign_pointer(*pseqs, new_pseqs); + rcu_assign_pointer(*ksyncs, new_ksyncs); } return 0; @@ -4460,7 +4802,7 @@ err_free_sch: return ERR_PTR(ret); } -static void check_hotplug_seq(struct scx_sched *sch, +static int check_hotplug_seq(struct scx_sched *sch, const struct sched_ext_ops *ops) { unsigned long long global_hotplug_seq; @@ -4477,8 +4819,11 @@ static void check_hotplug_seq(struct scx_sched *sch, SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, "expected hotplug seq %llu did not match actual %llu", ops->hotplug_seq, global_hotplug_seq); + return -EBUSY; } } + + return 0; } static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) @@ -4505,6 +4850,9 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + if (ops->cpu_acquire || ops->cpu_release) + pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); + return 0; } @@ -4529,14 +4877,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) goto err_unlock; } - ret = alloc_kick_pseqs(); + ret = alloc_kick_syncs(); if (ret) goto err_unlock; sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_free_pseqs; + goto err_free_ksyncs; } /* @@ -4545,6 +4893,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); WARN_ON_ONCE(scx_root); + if (WARN_ON_ONCE(READ_ONCE(scx_aborting))) + WRITE_ONCE(scx_aborting, false); atomic_long_set(&scx_nr_rejected, 0); @@ -4580,7 +4930,11 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) set_bit(i, sch->has_op); - check_hotplug_seq(sch, ops); + ret = check_hotplug_seq(sch, ops); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } scx_idle_update_selcpu_topology(ops); cpus_read_unlock(); @@ -4697,21 +5051,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) while ((p = scx_task_iter_next_locked(&sti))) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; - const struct sched_class *new_class = - __setscheduler_class(p->policy, p->prio); + const struct sched_class *new_class = scx_setscheduler_class(p); - if (!tryget_task_struct(p)) + if (scx_get_task_state(p) != SCX_TASK_READY) continue; if (old_class != new_class) queue_flags |= DEQUEUE_CLASS; scoped_guard (sched_change, p, queue_flags) { - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = READ_ONCE(scx_slice_dfl); p->sched_class = new_class; } - - put_task_struct(p); } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); @@ -4735,8 +5086,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) return 0; -err_free_pseqs: - 
free_kick_pseqs(); +err_free_ksyncs: + free_kick_syncs(); err_unlock: mutex_unlock(&scx_enable_mutex); return ret; @@ -4953,6 +5304,7 @@ static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *fro static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} +static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} #endif static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} @@ -4991,6 +5343,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, + .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, @@ -5064,29 +5417,38 @@ static bool can_skip_idle_kick(struct rq *rq) return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); } -static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) { struct rq *rq = cpu_rq(cpu); struct scx_rq *this_scx = &this_rq->scx; + const struct sched_class *cur_class; bool should_wait = false; unsigned long flags; raw_spin_rq_lock_irqsave(rq, flags); + cur_class = rq->curr->sched_class; /* * During CPU hotplug, a CPU may depend on kicking itself to make - * forward progress. Allow kicking self regardless of online state. + * forward progress. Allow kicking self regardless of online state. If + * @cpu is running a higher class task, we have no control over @cpu. + * Skip kicking. 
*/ - if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && + !sched_class_above(cur_class, &ext_sched_class)) { if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { - if (rq->curr->sched_class == &ext_sched_class) + if (cur_class == &ext_sched_class) rq->curr->scx.slice = 0; cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); } if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { - pseqs[cpu] = rq->scx.pnt_seq; - should_wait = true; + if (cur_class == &ext_sched_class) { + ksyncs[cpu] = rq->scx.kick_sync; + should_wait = true; + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } } resched_curr(rq); @@ -5118,20 +5480,20 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; - struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); + struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); bool should_wait = false; - unsigned long *pseqs; + unsigned long *ksyncs; s32 cpu; - if (unlikely(!pseqs_pcpu)) { - pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); + if (unlikely(!ksyncs_pcpu)) { + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs"); return; } - pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; + ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; for_each_cpu(cpu, this_scx->cpus_to_kick) { - should_wait |= kick_one_cpu(cpu, this_rq, pseqs); + should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } @@ -5145,20 +5507,21 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) return; for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; + unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - if (cpu != cpu_of(this_rq)) { - /* - * Pairs with smp_store_release() issued by this CPU in - * switch_class() on the resched path. - * - * We busy-wait here to guarantee that no other task can - * be scheduled on our core before the target CPU has - * entered the resched path. - */ - while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) - cpu_relax(); - } + /* + * Busy-wait until the task running at the time of kicking is no + * longer running. This can be used to implement e.g. core + * scheduling. + * + * smp_cond_load_acquire() pairs with store_releases in + * pick_task_scx() and put_prev_task_scx(). The former breaks + * the wait if SCX's scheduling path is entered even if the same + * task is picked subsequently. The latter is necessary to break + * the wait when $cpu is taken by a higher sched class. + */ + if (cpu != cpu_of(this_rq)) + smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } @@ -5257,6 +5620,7 @@ void __init init_sched_ext_class(void) int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -5362,19 +5726,23 @@ __bpf_kfunc_start_defs(); * exhaustion. If zero, the current residual slice is maintained. If * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with * scx_bpf_kick_cpu() to trigger scheduling. + * + * Returns %true on successful insertion, %false on failure. 
On the root + * scheduler, %false return triggers scheduler abort and the caller doesn't need + * to check the return value. */ -__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) +__bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, + u64 slice, u64 enq_flags) { struct scx_sched *sch; guard(rcu)(); sch = rcu_dereference(scx_root); if (unlikely(!sch)) - return; + return false; if (!scx_dsq_insert_preamble(sch, p, enq_flags)) - return; + return false; if (slice) p->scx.slice = slice; @@ -5382,56 +5750,114 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice p->scx.slice = p->scx.slice ?: 1; scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); + + return true; +} + +/* + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. + */ +__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, + u64 slice, u64 enq_flags) +{ + scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags); +} + +static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) +{ + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + return false; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + + return true; } +struct scx_bpf_dsq_insert_vtime_args { + /* @p can't be packed together as KF_RCU is not transitive */ + u64 dsq_id; + u64 slice; + u64 vtime; + u64 enq_flags; +}; + /** - * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion * @p: task_struct to insert - * @dsq_id: DSQ to insert into - * @slice: duration @p can run for in nsecs, 0 to keep the current value - * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ - * @enq_flags: SCX_ENQ_* + * @args: struct containing the rest of the arguments + * @args->dsq_id: DSQ to insert into + * @args->slice: duration @p can run for in nsecs, 0 to keep the current value + * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @args->enq_flags: SCX_ENQ_* + * + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument + * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided + * as an inline wrapper in common.bpf.h. * - * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. - * Tasks queued into the priority queue are ordered by @vtime. All other aspects - * are identical to scx_bpf_dsq_insert(). + * Insert @p into the vtime priority queue of the DSQ identified by + * @args->dsq_id. Tasks queued into the priority queue are ordered by + * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). * - * @vtime ordering is according to time_before64() which considers wrapping. A - * numerically larger vtime may indicate an earlier position in the ordering and - * vice-versa. + * @args->vtime ordering is according to time_before64() which considers + * wrapping. A numerically larger vtime may indicate an earlier position in the + * ordering and vice-versa. * * A DSQ can only be used as a FIFO or priority queue at any given time and this * function must not be called on a DSQ which already has one or more FIFO tasks * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and * SCX_DSQ_GLOBAL) cannot be used as priority queues. 
+ * + * Returns %true on successful insertion, %false on failure. On the root + * scheduler, %false return triggers scheduler abort and the caller doesn't need + * to check the return value. */ -__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) +__bpf_kfunc bool +__scx_bpf_dsq_insert_vtime(struct task_struct *p, + struct scx_bpf_dsq_insert_vtime_args *args) { struct scx_sched *sch; guard(rcu)(); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) - return; + return false; - if (!scx_dsq_insert_preamble(sch, p, enq_flags)) - return; + return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, + args->vtime, args->enq_flags); +} - if (slice) - p->scx.slice = slice; - else - p->scx.slice = p->scx.slice ?: 1; +/* + * COMPAT: Will be removed in v6.23. + */ +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + struct scx_sched *sch; - p->scx.dsq_vtime = vtime; + guard(rcu)(); - scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) @@ -5455,6 +5881,13 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, return false; /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). + */ + if (unlikely(READ_ONCE(scx_aborting))) + return false; + + /* * Can be called from either ops.dispatch() locking this_rq() or any * context where no rq lock is held. If latter, lock @p's task_rq which * we'll likely need anyway. @@ -5474,13 +5907,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, raw_spin_rq_lock(src_rq); } - /* - * If the BPF scheduler keeps calling this function repeatedly, it can - * cause similar live-lock conditions as consume_dispatch_q(). Insert a - * breather if necessary. - */ - scx_breather(src_rq); - locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); @@ -5685,8 +6111,9 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq * lock (e.g. BPF timers or SYSCALL programs). * - * Returns %true if @p has been consumed, %false if @p had already been consumed - * or dequeued. + * Returns %true if @p has been consumed, %false if @p had already been + * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local + * DSQ. */ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, @@ -5738,32 +6165,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { .set = &scx_kfunc_ids_dispatch, }; -__bpf_kfunc_start_defs(); - -/** - * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ - * - * Iterate over all of the tasks currently enqueued on the local DSQ of the - * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of - * processed tasks. Can only be called from ops.cpu_release(). 
- */ -__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +static u32 reenq_local(struct rq *rq) { - struct scx_sched *sch; LIST_HEAD(tasks); u32 nr_enqueued = 0; - struct rq *rq; struct task_struct *p, *n; - guard(rcu)(); - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - return 0; - - if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) - return 0; - - rq = cpu_rq(smp_processor_id()); lockdep_assert_rq_held(rq); /* @@ -5800,6 +6207,37 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) return nr_enqueued; } +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + * + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void + * returning variant that can be called from anywhere. + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + + return reenq_local(rq); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) @@ -5872,6 +6310,34 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); +/** + * scx_bpf_task_set_slice - Set task's time slice + * @p: task of interest + * @slice: time slice to set in nsecs + * + * Set @p's time slice to @slice. Returns %true on success, %false if the + * calling scheduler doesn't have authority over @p. + */ +__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +{ + p->scx.slice = slice; + return true; +} + +/** + * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering + * @p: task of interest + * @vtime: virtual time to set + * + * Set @p's virtual time to @vtime. Returns %true on success, %false if the + * calling scheduler doesn't have authority over @p. + */ +__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +{ + p->scx.dsq_vtime = vtime; + return true; +} + static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; @@ -6029,6 +6495,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, sizeof(struct bpf_iter_scx_dsq)); BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & + ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); /* * next() and destroy() will be called regardless of the return value. @@ -6047,9 +6515,8 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (!kit->dsq) return -ENOENT; - INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; - kit->cursor.priv = READ_ONCE(kit->dsq->seq); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, + READ_ONCE(kit->dsq->seq)); return 0; } @@ -6123,6 +6590,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) kit->dsq = NULL; } +/** + * scx_bpf_dsq_peek - Lockless peek at the first element. + * @dsq_id: DSQ to examine. + * + * Read the first element in the DSQ. This is semantically equivalent to using + * the DSQ iterator, but is lockfree. 
Of course, like any lockless operation, + * this provides only a point-in-time snapshot, and the contents may change + * by the time any subsequent locking operation reads the queue. + * + * Returns the pointer, or NULL indicates an empty queue OR internal error. + */ +__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) +{ + struct scx_sched *sch; + struct scx_dispatch_q *dsq; + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { + scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); + return NULL; + } + + dsq = find_user_dsq(sch, dsq_id); + if (unlikely(!dsq)) { + scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); + return NULL; + } + + return rcu_dereference(dsq->first_task); +} + __bpf_kfunc_end_defs(); static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, @@ -6277,6 +6778,24 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, } /** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from + * anywhere. + */ +__bpf_kfunc void scx_bpf_reenqueue_local___v2(void) +{ + struct rq *rq; + + guard(preempt)(); + + rq = this_rq(); + local_set(&rq->scx.reenq_local_deferred, 1); + schedule_deferred(rq); +} + +/** * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU * @cpu: CPU of interest * @@ -6677,15 +7196,19 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU); +BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) @@ -6776,6 +7299,12 @@ static int __init scx_init(void) return ret; } + if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || + !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { + pr_err("sched_ext: Failed to allocate cpumasks\n"); + return -ENOMEM; + } + return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index d2434c954848..3d9d404d5cd2 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, return prev_cpu; } +struct scx_bpf_select_cpu_and_args { + /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */ + s32 prev_cpu; + u64 wake_flags; + u64 flags; +}; + /** - * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, - * prioritizing those in @cpus_allowed + * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * 
@wake_flags: %SCX_WAKE_* flags * @cpus_allowed: cpumask of allowed CPUs - * @flags: %SCX_PICK_IDLE* flags + * @args: struct containing the rest of the arguments + * @args->prev_cpu: CPU @p was on previously + * @args->wake_flags: %SCX_WAKE_* flags + * @args->flags: %SCX_PICK_IDLE* flags + * + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument + * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided + * as an inline wrapper in common.bpf.h. * * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked * context such as a BPF test_run() call, as long as built-in CPU selection * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE * is set. * - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu(). * * Returns the selected idle CPU, which will be automatically awakened upon * returning from ops.select_cpu() and can be used for direct dispatch, or * a negative value if no idle CPU is available. */ +__bpf_kfunc s32 +__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, + struct scx_bpf_select_cpu_and_args *args) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags, + cpus_allowed, args->flags); +} + +/* + * COMPAT: Will be removed in v6.22. + */ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags) { @@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_idle) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index b3617abed510..386c677e4c9a 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -23,6 +23,11 @@ enum scx_consts { * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. */ SCX_TASK_ITER_BATCH = 32, + + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, + SCX_BYPASS_LB_DONOR_PCT = 125, + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, + SCX_BYPASS_LB_BATCH = 256, }; enum scx_exit_kind { @@ -697,12 +702,23 @@ struct sched_ext_ops { * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be * interpreted in the same fashion and specifies how much @cgrp can * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is upto to the BPF + * interpretation of @period_us and burstiness is up to the BPF * scheduler. */ void (*cgroup_set_bandwidth)(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us); + /** + * @cgroup_set_idle: A cgroup's idle state is being changed + * @cgrp: cgroup whose idle state is being updated + * @idle: whether the cgroup is entering or exiting idle state + * + * Update @cgrp's idle state to @idle. This callback is invoked when + * a cgroup transitions between idle and non-idle states, allowing the + * BPF scheduler to adjust its behavior accordingly. 
+ */ + void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle); + #endif /* CONFIG_EXT_GROUP_SCHED */ /* @@ -884,6 +900,10 @@ struct scx_sched { struct scx_dispatch_q **global_dsqs; struct scx_sched_pcpu __percpu *pcpu; + /* + * Updates to the following warned bitfields can race causing RMW issues + * but it doesn't really matter. + */ bool warned_zero_slice:1; bool warned_deprecated_rq:1; @@ -948,6 +968,7 @@ enum scx_enq_flags { SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_DSQ_PRIQ = 1LLU << 57, + SCX_ENQ_NESTED = 1LLU << 58, }; enum scx_deq_flags { @@ -986,8 +1007,10 @@ enum scx_kick_flags { SCX_KICK_PREEMPT = 1LLU << 1, /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. + * The scx_bpf_kick_cpu() call will return after the current SCX task of + * the target CPU switches out. This can be used to implement e.g. core + * scheduling. This has no effect if the current task on the target CPU + * is not on SCX. */ SCX_KICK_WAIT = 1LLU << 2, }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8590113e4a60..bbf513b3e76c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -803,10 +803,12 @@ struct scx_rq { cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; - unsigned long pnt_seq; + unsigned long kick_sync; + local_t reenq_local_deferred; struct balance_callback deferred_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; + struct scx_dispatch_q bypass_dsq; }; #endif /* CONFIG_SCHED_CLASS_EXT */ |
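
Editor's note (illustration, not part of the patch above): the new kfuncs are meant to be driven from the BPF scheduler side. Below is a minimal sketch of how scx_bpf_dsq_peek() and scx_bpf_task_set_slice() might be combined in an ops.dispatch() callback. It assumes the usual sched_ext BPF scaffolding -- vmlinux.h plus scx/common.bpf.h providing BPF_STRUCT_OPS and declarations for the kfuncs used here, including bpf_rcu_read_lock()/bpf_rcu_read_unlock() -- and that MY_DSQ was created earlier with scx_bpf_create_dsq(). MY_DSQ, MY_SLICE_NS and the slice-refresh policy are invented for the example.

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define MY_DSQ		0			/* arbitrary user DSQ id, created in ops.init() */
#define MY_SLICE_NS	(5 * 1000 * 1000ULL)	/* 5ms, illustrative only */

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *head;

	/*
	 * Lockless peek at the head of MY_DSQ. This is only a
	 * point-in-time snapshot; another CPU may consume the task before
	 * the move below, which is acceptable for a best-effort policy.
	 */
	bpf_rcu_read_lock();
	head = scx_bpf_dsq_peek(MY_DSQ);
	if (head) {
		/*
		 * Refresh the slice of the task expected to run next.
		 * Returns false if this scheduler has no authority over
		 * the task (e.g. when running as a sub-scheduler).
		 */
		scx_bpf_task_set_slice(head, MY_SLICE_NS);
	}
	bpf_rcu_read_unlock();

	/* Pull whatever is queued on MY_DSQ into the local DSQ. */
	scx_bpf_dsq_move_to_local(MY_DSQ);
}

Because the peek takes no lock, the snapshot can already be stale by the time scx_bpf_dsq_move_to_local() runs; anything that needs an exact view still has to go through the locked DSQ iterator.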

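Editor's note (illustration, not part of the patch above): __scx_bpf_select_cpu_and() and __scx_bpf_dsq_insert_vtime() move their trailing parameters into an args struct to stay within BPF's five-argument kfunc limit, with the flat calling convention expected to be restored by an inline wrapper in common.bpf.h. A hypothetical sketch of what such a wrapper could look like follows; it assumes the args struct and kfunc declarations are visible to the BPF program, and the real header additionally has to arbitrate between this wrapper and the deprecated direct kfunc when running on older kernels.

static inline s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu,
					 u64 wake_flags,
					 const struct cpumask *cpus_allowed,
					 u64 flags)
{
	/*
	 * @p and @cpus_allowed stay as direct arguments (KF_RCU is not
	 * transitive); everything else rides in the args struct.
	 */
	struct scx_bpf_select_cpu_and_args args = {
		.prev_cpu	= prev_cpu,
		.wake_flags	= wake_flags,
		.flags		= flags,
	};

	return __scx_bpf_select_cpu_and(p, cpus_allowed, &args);
}

__scx_bpf_dsq_insert_vtime() follows the same pattern with a { dsq_id, slice, vtime, enq_flags } struct and returns true on successful insertion; on the root scheduler a false return already triggers a scheduler abort, so the caller can ignore it there.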