summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/helpers.c2
-rw-r--r--kernel/bpf/ringbuf.c2
-rw-r--r--kernel/power/hibernate.c4
-rw-r--r--kernel/power/main.c22
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/suspend.c1
-rw-r--r--kernel/sched/ext.c126
-rw-r--r--kernel/sched/sched.h1
8 files changed, 134 insertions, 25 deletions
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8eb117c52817..eb25e70e0bdc 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4345,6 +4345,7 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_local_irq_save)
BTF_ID_FLAGS(func, bpf_local_irq_restore)
+#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
@@ -4353,6 +4354,7 @@ BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+#endif
#ifdef CONFIG_DMA_SHARED_BUFFER
BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 719d73299397..d706c4b7f532 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -216,6 +216,8 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
+ irq_work_sync(&rb->work);
+
/* copy pages pointer and nr_pages to local variable, as we are going
* to unmap rb itself with vunmap() below
*/
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 14e85ff23551..53166ef86ba4 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -706,7 +706,6 @@ static void power_down(void)
#ifdef CONFIG_SUSPEND
if (hibernation_mode == HIBERNATION_SUSPEND) {
- pm_restore_gfp_mask();
error = suspend_devices_and_enter(mem_sleep_current);
if (!error)
goto exit;
@@ -746,9 +745,6 @@ static void power_down(void)
cpu_relax();
exit:
- /* Match the pm_restore_gfp_mask() call in hibernate(). */
- pm_restrict_gfp_mask();
-
/* Restore swap signature. */
error = swsusp_unmark();
if (error)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3cf2d7e72567..549f51ca3a1e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -31,23 +31,35 @@
* held, unless the suspend/hibernate code is guaranteed not to run in parallel
* with that modification).
*/
+static unsigned int saved_gfp_count;
static gfp_t saved_gfp_mask;
void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (saved_gfp_mask) {
- gfp_allowed_mask = saved_gfp_mask;
- saved_gfp_mask = 0;
- }
+
+ if (WARN_ON(!saved_gfp_count) || --saved_gfp_count)
+ return;
+
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+
+ pm_pr_dbg("GFP mask restored\n");
}
void pm_restrict_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
- WARN_ON(saved_gfp_mask);
+
+ if (saved_gfp_count++) {
+ WARN_ON((saved_gfp_mask & ~(__GFP_IO | __GFP_FS)) != gfp_allowed_mask);
+ return;
+ }
+
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+
+ pm_pr_dbg("GFP mask restricted\n");
}
unsigned int lock_system_sleep(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ff68ebaa1e0..dc0dfc349f22 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -132,6 +132,7 @@ int freeze_processes(void)
if (!pm_freezing)
static_branch_inc(&freezer_active);
+ pm_wakeup_clear(0);
pm_freezing = true;
error = try_to_freeze_tasks(true);
if (!error)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4bb4686c1c08..b4ca17c2fecf 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -595,7 +595,6 @@ static int enter_state(suspend_state_t state)
}
pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
- pm_wakeup_clear(0);
pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2b0e88206d07..ecb251e883ea 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
-/* for %SCX_KICK_WAIT */
-static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+/*
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
+ * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
+ * lazily when enabling and freed when disabling to avoid waste when sched_ext
+ * isn't active.
+ */
+struct scx_kick_pseqs {
+ struct rcu_head rcu;
+ unsigned long seqs[];
+};
+
+static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
/*
* Direct dispatch marker.
@@ -780,13 +791,23 @@ static void schedule_deferred(struct rq *rq)
if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
return;
+ /* Don't do anything if there already is a deferred operation. */
+ if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
+ return;
+
/*
* If in balance, the balance callbacks will be called before rq lock is
* released. Schedule one.
+ *
+ *
+ * We can't directly insert the callback into the
+ * rq's list: The call can drop its lock and make the pending balance
+ * callback visible to unrelated code paths that call rq_pin_lock().
+ *
+ * Just let balance_one() know that it must do it itself.
*/
if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
- queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
- deferred_bal_cb_workfn);
+ rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
return;
}
@@ -2003,6 +2024,19 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
dspc->cursor = 0;
}
+static inline void maybe_queue_balance_callback(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+ if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
+ return;
+
+ queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+ deferred_bal_cb_workfn);
+
+ rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
+}
+
static int balance_one(struct rq *rq, struct task_struct *prev)
{
struct scx_sched *sch = scx_root;
@@ -2150,6 +2184,8 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
#endif
rq_repin_lock(rq, rf);
+ maybe_queue_balance_callback(rq);
+
return ret;
}
@@ -3471,7 +3507,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
struct scx_dispatch_q *dsq;
int node;
+ irq_work_sync(&sch->error_irq_work);
kthread_stop(sch->helper->task);
+
free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
@@ -3850,6 +3888,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
+static void free_kick_pseqs_rcu(struct rcu_head *rcu)
+{
+ struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu);
+
+ kvfree(pseqs);
+}
+
+static void free_kick_pseqs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+ struct scx_kick_pseqs *to_free;
+
+ to_free = rcu_replace_pointer(*pseqs, NULL, true);
+ if (to_free)
+ call_rcu(&to_free->rcu, free_kick_pseqs_rcu);
+ }
+}
+
static void scx_disable_workfn(struct kthread_work *work)
{
struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
@@ -3986,6 +4045,7 @@ static void scx_disable_workfn(struct kthread_work *work)
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
+ free_kick_pseqs();
mutex_unlock(&scx_enable_mutex);
@@ -4348,6 +4408,33 @@ static void scx_vexit(struct scx_sched *sch,
irq_work_queue(&sch->error_irq_work);
}
+static int alloc_kick_pseqs(void)
+{
+ int cpu;
+
+ /*
+ * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
+ * can exceed percpu allocator limits on large machines.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu);
+ struct scx_kick_pseqs *new_pseqs;
+
+ WARN_ON_ONCE(rcu_access_pointer(*pseqs));
+
+ new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!new_pseqs) {
+ free_kick_pseqs();
+ return -ENOMEM;
+ }
+
+ rcu_assign_pointer(*pseqs, new_pseqs);
+ }
+
+ return 0;
+}
+
static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
{
struct scx_sched *sch;
@@ -4495,10 +4582,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock;
}
+ ret = alloc_kick_pseqs();
+ if (ret)
+ goto err_unlock;
+
sch = scx_alloc_and_add_sched(ops);
if (IS_ERR(sch)) {
ret = PTR_ERR(sch);
- goto err_unlock;
+ goto err_free_pseqs;
}
/*
@@ -4701,6 +4792,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return 0;
+err_free_pseqs:
+ free_kick_pseqs();
err_unlock:
mutex_unlock(&scx_enable_mutex);
return ret;
@@ -5082,10 +5175,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
{
struct rq *this_rq = this_rq();
struct scx_rq *this_scx = &this_rq->scx;
- unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+ struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs);
bool should_wait = false;
+ unsigned long *pseqs;
s32 cpu;
+ if (unlikely(!pseqs_pcpu)) {
+ pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs");
+ return;
+ }
+
+ pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs;
+
for_each_cpu(cpu, this_scx->cpus_to_kick) {
should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
@@ -5208,11 +5309,6 @@ void __init init_sched_ext_class(void)
scx_idle_init_masks();
- scx_kick_cpus_pnt_seqs =
- __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
- __alignof__(scx_kick_cpus_pnt_seqs[0]));
- BUG_ON(!scx_kick_cpus_pnt_seqs);
-
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
int n = cpu_to_node(cpu);
@@ -5688,8 +5784,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
@@ -5820,8 +5916,8 @@ __bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 361f9101cef9..adfb6e3409d7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -784,6 +784,7 @@ enum scx_rq_flags {
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
+ SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,