summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup/cpuset.c6
-rw-r--r--kernel/sched/ext.c191
-rw-r--r--kernel/sched/ext_idle.c2
-rw-r--r--kernel/time/timekeeping.c50
-rw-r--r--kernel/time/vsyscall.c4
5 files changed, 172 insertions, 81 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 306b60430091..24b70ea3e6ce 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1116,9 +1116,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
if (top_cs) {
/*
- * Percpu kthreads in top_cpuset are ignored
+ * PF_NO_SETAFFINITY tasks are ignored.
+ * All per cpu kthreads should have PF_NO_SETAFFINITY
+ * flag set, see kthread_set_per_cpu().
*/
- if (kthread_is_per_cpu(task))
+ if (task->flags & PF_NO_SETAFFINITY)
continue;
cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fdbf249d1c68..f5133249fd4d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1118,8 +1118,38 @@ static void scx_kf_disallow(u32 mask)
current->scx.kf_mask &= ~mask;
}
-#define SCX_CALL_OP(mask, op, args...) \
+/*
+ * Track the rq currently locked.
+ *
+ * This allows kfuncs to safely operate on rq from any scx ops callback,
+ * knowing which rq is already locked.
+ */
+static DEFINE_PER_CPU(struct rq *, locked_rq);
+
+static inline void update_locked_rq(struct rq *rq)
+{
+ /*
+ * Check whether @rq is actually locked. This can help expose bugs
+ * or incorrect assumptions about the context in which a kfunc or
+ * callback is executed.
+ */
+ if (rq)
+ lockdep_assert_rq_held(rq);
+ __this_cpu_write(locked_rq, rq);
+}
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(locked_rq);
+}
+
+#define SCX_CALL_OP(mask, op, rq, args...) \
do { \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
scx_ops.op(args); \
@@ -1127,11 +1157,14 @@ do { \
} else { \
scx_ops.op(args); \
} \
+ update_locked_rq(NULL); \
} while (0)
-#define SCX_CALL_OP_RET(mask, op, args...) \
+#define SCX_CALL_OP_RET(mask, op, rq, args...) \
({ \
__typeof__(scx_ops.op(args)) __ret; \
+ \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
__ret = scx_ops.op(args); \
@@ -1139,6 +1172,7 @@ do { \
} else { \
__ret = scx_ops.op(args); \
} \
+ update_locked_rq(NULL); \
__ret; \
})
@@ -1153,31 +1187,31 @@ do { \
* scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
* the specific task.
*/
-#define SCX_CALL_OP_TASK(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK(mask, op, rq, task, args...) \
do { \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP(mask, op, task, ##args); \
+ SCX_CALL_OP(mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...) \
({ \
__typeof__(scx_ops.op(task, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \
+ __ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...) \
({ \
__typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET(mask, op, rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -2172,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
*ddsp_taskp = NULL;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2269,7 +2303,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
add_nr_running(rq, 1);
if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
@@ -2283,7 +2317,7 @@ out:
__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
}
-static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
unsigned long opss;
@@ -2304,7 +2338,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
BUG();
case SCX_OPSS_QUEUED:
if (SCX_HAS_OP(dequeue))
- SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
@@ -2337,7 +2371,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
return true;
}
- ops_dequeue(p, deq_flags);
+ ops_dequeue(rq, p, deq_flags);
/*
* A currently running task which is going off @rq first gets dequeued
@@ -2353,11 +2387,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
*/
if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
update_curr_scx(rq);
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false);
}
if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2377,7 +2411,7 @@ static void yield_task_scx(struct rq *rq)
struct task_struct *p = rq->curr;
if (SCX_HAS_OP(yield))
- SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
+ SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL);
else
p->scx.slice = 0;
}
@@ -2387,7 +2421,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
struct task_struct *from = rq->curr;
if (SCX_HAS_OP(yield))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to);
else
return false;
}
@@ -2945,7 +2979,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* emitted in switch_class().
*/
if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
+ SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2990,7 +3024,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq),
prev_on_scx ? prev : NULL);
flush_dispatch_buf(rq);
@@ -3104,7 +3138,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
* Core-sched might decide to execute @p before it is
* dispatched. Call ops_dequeue() to notify the BPF scheduler.
*/
- ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
+ ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
dispatch_dequeue(rq, p);
}
@@ -3112,7 +3146,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
/* see dequeue_task_scx() on why we skip when !QUEUED */
if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p);
clr_task_runnable(p, true);
@@ -3193,8 +3227,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
.task = next,
};
- SCX_CALL_OP(SCX_KF_CPU_RELEASE,
- cpu_release, cpu_of(rq), &args);
+ SCX_CALL_OP(SCX_KF_CPU_RELEASE, cpu_release, rq, cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
@@ -3207,7 +3240,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
/* see dequeue_task_scx() on why we skip when !QUEUED */
if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
@@ -3348,7 +3381,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
* verifier.
*/
if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL,
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3385,7 +3418,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
*ddsp_taskp = p;
cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
- select_cpu, p, prev_cpu, wake_flags);
+ select_cpu, NULL, p, prev_cpu, wake_flags);
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
@@ -3430,8 +3463,8 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* designation pointless. Cast it away when calling the operation.
*/
if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
- (struct cpumask *)p->cpus_ptr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL,
+ p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
@@ -3444,9 +3477,9 @@ static void handle_hotplug(struct rq *rq, bool online)
scx_idle_update_selcpu_topology(&scx_ops);
if (online && SCX_HAS_OP(cpu_online))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
else if (!online && SCX_HAS_OP(cpu_offline))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
else
scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
"cpu %d going %s, exiting scheduler", cpu,
@@ -3550,7 +3583,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr);
}
if (!curr->scx.slice)
@@ -3627,7 +3660,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
.fork = fork,
};
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, NULL, p, &args);
if (unlikely(ret)) {
ret = ops_sanitize_err("init_task", ret);
return ret;
@@ -3668,9 +3701,10 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
static void scx_ops_enable_task(struct task_struct *p)
{
+ struct rq *rq = task_rq(p);
u32 weight;
- lockdep_assert_rq_held(task_rq(p));
+ lockdep_assert_rq_held(rq);
/*
* Set the weight before calling ops.enable() so that the scheduler
@@ -3684,20 +3718,22 @@ static void scx_ops_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
if (SCX_HAS_OP(enable))
- SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
}
static void scx_ops_disable_task(struct task_struct *p)
{
- lockdep_assert_rq_held(task_rq(p));
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
if (SCX_HAS_OP(disable))
- SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p);
scx_set_task_state(p, SCX_TASK_READY);
}
@@ -3726,7 +3762,7 @@ static void scx_ops_exit_task(struct task_struct *p)
}
if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
+ SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3835,7 +3871,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
}
static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3851,8 +3887,8 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
- (struct cpumask *)p->cpus_ptr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq,
+ p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
@@ -3913,7 +3949,7 @@ int scx_tg_online(struct task_group *tg)
struct scx_cgroup_init_args args =
{ .weight = tg->scx_weight };
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
tg->css.cgroup, &args);
if (ret)
ret = ops_sanitize_err("cgroup_init", ret);
@@ -3935,7 +3971,7 @@ void scx_tg_offline(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
@@ -3968,7 +4004,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
continue;
if (SCX_HAS_OP(cgroup_prep_move)) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL,
p, from, css->cgroup);
if (ret)
goto err;
@@ -3982,8 +4018,8 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
err:
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
- p->scx.cgrp_moving_from, css->cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
@@ -4001,8 +4037,8 @@ void scx_cgroup_move_task(struct task_struct *p)
* cgrp_moving_from set.
*/
if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
- SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
- p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL,
+ p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
}
@@ -4021,8 +4057,8 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
- p->scx.cgrp_moving_from, css->cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
out_unlock:
@@ -4035,7 +4071,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
if (scx_cgroup_enabled && tg->scx_weight != weight) {
if (SCX_HAS_OP(cgroup_set_weight))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
tg_cgrp(tg), weight);
tg->scx_weight = weight;
}
@@ -4224,7 +4260,7 @@ static void scx_cgroup_exit(void)
continue;
rcu_read_unlock();
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup);
rcu_read_lock();
css_put(css);
@@ -4261,7 +4297,7 @@ static int scx_cgroup_init(void)
continue;
rcu_read_unlock();
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
css_put(css);
@@ -4758,7 +4794,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
}
if (scx_ops.exit)
- SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
@@ -4965,7 +5001,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
if (SCX_HAS_OP(dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+ SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p);
ops_dump_exit();
}
@@ -5012,7 +5048,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
if (SCX_HAS_OP(dump)) {
ops_dump_init(&s, "");
- SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx);
ops_dump_exit();
}
@@ -5069,7 +5105,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
used = seq_buf_used(&ns);
if (SCX_HAS_OP(dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+ SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle);
ops_dump_exit();
}
@@ -5328,7 +5364,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_idle_enable(ops);
if (scx_ops.init) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
if (ret) {
ret = ops_sanitize_err("init", ret);
cpus_read_unlock();
@@ -6791,6 +6827,12 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
__alignof__(struct bpf_iter_scx_dsq));
+ /*
+ * next() and destroy() will be called regardless of the return value.
+ * Always clear $kit->dsq.
+ */
+ kit->dsq = NULL;
+
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
@@ -7077,13 +7119,32 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
}
if (ops_cpu_valid(cpu, NULL)) {
- struct rq *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
+ struct rq_flags rf;
+
+ /*
+ * When called with an rq lock held, restrict the operation
+ * to the corresponding CPU to prevent ABBA deadlocks.
+ */
+ if (locked_rq && rq != locked_rq) {
+ scx_ops_error("Invalid target CPU %d", cpu);
+ return;
+ }
+
+ /*
+ * If no rq lock is held, allow to operate on any CPU by
+ * acquiring the corresponding rq lock.
+ */
+ if (!locked_rq) {
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ }
rq->scx.cpuperf_target = perf;
+ cpufreq_update_util(rq, 0);
- rcu_read_lock_sched_notrace();
- cpufreq_update_util(cpu_rq(cpu), 0);
- rcu_read_unlock_sched_notrace();
+ if (!locked_rq)
+ rq_unlock_irqrestore(rq, &rf);
}
}
@@ -7314,12 +7375,6 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
-BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index cb343ca889e0..e67a19a071c1 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -674,7 +674,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
* managed by put_prev_task_idle()/set_next_task_idle().
*/
if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+ SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
/*
* Update the idle masks:
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e67d076f195..a009c91f7b05 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -164,10 +164,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
return ts;
}
+static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = tk->coarse_nsec;
+ return ts;
+}
+
+/*
+ * Update the nanoseconds part for the coarse time keepers. They can't rely
+ * on xtime_nsec because xtime_nsec could be adjusted by a small negative
+ * amount when the multiplication factor of the clock is adjusted, which
+ * could cause the coarse clocks to go slightly backwards. See
+ * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
+ * clockids which only is updated when the clock has been set or we have
+ * accumulated time.
+ */
+static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
+{
+ tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+}
+
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
+ tk_update_coarse_nsecs(tk);
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
@@ -175,6 +199,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
tk->xtime_sec += ts->tv_sec;
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
+ tk_update_coarse_nsecs(tk);
}
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -708,6 +733,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk_normalize_xtime(tk);
delta -= incr;
}
+ tk_update_coarse_nsecs(tk);
}
/**
@@ -804,8 +830,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -813,7 +839,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsecs = tk->coarse_nsec;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2161,7 +2187,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
struct timekeeper *real_tk = &tk_core.timekeeper;
unsigned int clock_set = 0;
int shift = 0, maxshift;
- u64 offset;
+ u64 offset, orig_offset;
guard(raw_spinlock_irqsave)(&tk_core.lock);
@@ -2172,7 +2198,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
tk->tkr_mono.clock->max_raw_delta);
-
+ orig_offset = offset;
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
return false;
@@ -2205,6 +2231,14 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
*/
clock_set |= accumulate_nsecs_to_secs(tk);
+ /*
+ * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls
+ * making small negative adjustments to the base xtime_nsec
+ * value, only update the coarse clocks if we accumulated time
+ */
+ if (orig_offset != offset)
+ tk_update_coarse_nsecs(tk);
+
timekeeping_update_from_shadow(&tk_core, clock_set);
return !!clock_set;
@@ -2248,7 +2282,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2305,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
offset = tk_core.timekeeper.offs_real;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2350,12 +2384,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ now = tk_xtime_coarse(tk);
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
- now.tv_nsec + mono.tv_nsec);
+ now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 01c2ab1e8971..32ef27c71b57 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -98,12 +98,12 @@ void update_vsyscall(struct timekeeper *tk)
/* CLOCK_REALTIME_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ vdso_ts->nsec = tk->coarse_nsec;
/* CLOCK_MONOTONIC_COARSE */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = tk->coarse_nsec;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);