summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2025-01-16 10:30:22 -0800
committerJakub Kicinski <kuba@kernel.org>2025-01-16 10:34:59 -0800
commit2ee738e90e80850582cbe10f34c6447965c1d87b (patch)
tree75a5d764d283bfac0648710f1e8c51680ef992fa /kernel
parentb44e27b4df1a1cd3fd84cf26c82156ed0301575f (diff)
parentce69b4019001407f9cd738dd2ba217b3a8ab831b (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.13-rc8). Conflicts: drivers/net/ethernet/realtek/r8169_main.c 1f691a1fc4be ("r8169: remove redundant hwmon support") 152d00a91396 ("r8169: simplify setting hwmon attribute visibility") https://lore.kernel.org/20250115122152.760b4e8d@canb.auug.org.au Adjacent changes: drivers/net/ethernet/broadcom/bnxt/bnxt.c 152f4da05aee ("bnxt_en: add support for rx-copybreak ethtool command") f0aa6a37a3db ("eth: bnxt: always recalculate features after XDP clearing, fix null-deref") drivers/net/ethernet/intel/ice/ice_type.h 50327223a8bb ("ice: add lock to protect low latency interface") dc26548d729e ("ice: Fix quad registers read on E825") Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup/cpuset.c44
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/sched/ext.c87
-rw-r--r--kernel/sched/ext.h8
-rw-r--r--kernel/sched/idle.c5
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/workqueue.c7
7 files changed, 97 insertions, 62 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f321ed515f3a..0f910c828973 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -197,10 +197,8 @@ static struct cpuset top_cpuset = {
/*
* There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. We also require taking task_lock() when dereferencing a
- * task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
- * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
+ * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
* structures. Note that cpuset_mutex needs to be a mutex as it is used in
* paths that rely on priority inheritance (e.g. scheduler - on RT) for
* correctness.
@@ -229,9 +227,6 @@ static struct cpuset top_cpuset = {
* The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
- *
- * Accessing a task's cpuset should be done in accordance with the
- * guidelines for accessing subsystem state in kernel/cgroup.c
*/
static DEFINE_MUTEX(cpuset_mutex);
@@ -890,7 +885,15 @@ v2:
*/
if (cgrpv2) {
for (i = 0; i < ndoms; i++) {
- cpumask_copy(doms[i], csa[i]->effective_cpus);
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (csa[i] == &top_cpuset)
+ cpumask_and(doms[i], csa[i]->effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
if (dattr)
dattr[i] = SD_ATTR_INIT;
}
@@ -3121,29 +3124,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
int retval = -ENODEV;
buf = strstrip(buf);
-
- /*
- * CPU or memory hotunplug may leave @cs w/o any execution
- * resources, in which case the hotplug code asynchronously updates
- * configuration and transfers all tasks to the nearest ancestor
- * which can execute.
- *
- * As writes to "cpus" or "mems" may restore @cs's execution
- * resources, wait for the previously scheduled operations before
- * proceeding, so that we don't end up keep removing tasks added
- * after execution capability is restored.
- *
- * cpuset_handle_hotplug may call back into cgroup core asynchronously
- * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
- * operation like this one can lead to a deadlock through kernfs
- * active_ref protection. Let's break the protection. Losing the
- * protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
- * hierarchies.
- */
- css_get(&cs->css);
- kernfs_break_active_protection(of->kn);
-
cpus_read_lock();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
@@ -3176,8 +3156,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
out_unlock:
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
- kernfs_unbreak_active_protection(of->kn);
- css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
return retval ?: nbytes;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index fa04b14a7d72..5d71ef85420c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1915,6 +1915,7 @@ void uprobe_free_utask(struct task_struct *t)
if (!utask)
return;
+ t->utask = NULL;
WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
timer_delete_sync(&utask->ri_timer);
@@ -1924,7 +1925,6 @@ void uprobe_free_utask(struct task_struct *t)
ri = free_ret_instance(ri, true /* cleanup_hprobe */);
kfree(utask);
- t->utask = NULL;
}
#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 19d2699cf638..19813b387ef9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2747,6 +2747,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
int nr_loops = SCX_DSP_MAX_LOOPS;
lockdep_assert_rq_held(rq);
@@ -2779,8 +2780,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* See scx_ops_disable_workfn() for the explanation on the
* bypassing test.
*/
- if ((prev->scx.flags & SCX_TASK_QUEUED) &&
- prev->scx.slice && !scx_rq_bypassing(rq)) {
+ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks;
}
@@ -2813,6 +2813,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
flush_dispatch_buf(rq);
+ if (prev_on_rq && prev->scx.slice) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
if (rq->scx.local_dsq.nr)
goto has_tasks;
if (consume_global_dsq(rq))
@@ -2838,8 +2842,7 @@ no_tasks:
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if ((prev->scx.flags & SCX_TASK_QUEUED) &&
- (!static_branch_unlikely(&scx_ops_enq_last) ||
+ if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks;
@@ -3034,7 +3037,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
*/
if (p->scx.slice && !scx_rq_bypassing(rq)) {
dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
- return;
+ goto switch_class;
}
/*
@@ -3051,6 +3054,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
}
}
+switch_class:
if (next && next->sched_class != &ext_sched_class)
switch_class(rq, next);
}
@@ -3586,16 +3590,8 @@ static void reset_idle_masks(void)
cpumask_copy(idle_masks.smt, cpu_online_mask);
}
-void __scx_update_idle(struct rq *rq, bool idle)
+static void update_builtin_idle(int cpu, bool idle)
{
- int cpu = cpu_of(rq);
-
- if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
- if (!static_branch_unlikely(&scx_builtin_idle_enabled))
- return;
- }
-
if (idle)
cpumask_set_cpu(cpu, idle_masks.cpu);
else
@@ -3622,6 +3618,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
#endif
}
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ */
+ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+ /*
+ * Update the idle masks:
+ * - for real idle transitions (do_notify == true)
+ * - for idle-to-idle transitions (indicated by the previous task
+ * being the idle thread, managed by pick_task_idle())
+ *
+ * Skip updating idle masks if the previous task is not the idle
+ * thread, since set_next_task_idle() has already handled it when
+ * transitioning from a task to the idle thread (calling this
+ * function with do_notify == true).
+ *
+ * In this way we can avoid updating the idle masks twice,
+ * unnecessarily.
+ */
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ if (do_notify || is_idle_task(rq->curr))
+ update_builtin_idle(cpu, idle);
+}
+
static void handle_hotplug(struct rq *rq, bool online)
{
int cpu = cpu_of(rq);
@@ -4744,10 +4791,9 @@ static void scx_ops_bypass(bool bypass)
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
struct task_struct *p, *n;
- rq_lock(rq, &rf);
+ raw_spin_rq_lock(rq);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4763,7 +4809,7 @@ static void scx_ops_bypass(bool bypass)
* sees scx_rq_bypassing() before moving tasks to SCX.
*/
if (!scx_enabled()) {
- rq_unlock(rq, &rf);
+ raw_spin_rq_unlock(rq);
continue;
}
@@ -4783,10 +4829,11 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx);
}
- rq_unlock(rq, &rf);
-
/* resched to restore ticks and idle state */
- resched_cpu(cpu);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+
+ raw_spin_rq_unlock(rq);
}
atomic_dec(&scx_ops_breather_depth);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index b1675bb59fc4..4d022d17ac7d 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
-void __scx_update_idle(struct rq *rq, bool idle);
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
-static inline void scx_update_idle(struct rq *rq, bool idle)
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
if (scx_enabled())
- __scx_update_idle(rq, idle);
+ __scx_update_idle(rq, idle, do_notify);
}
#else
-static inline void scx_update_idle(struct rq *rq, bool idle) {}
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#endif
#ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 621696269584..2c85c86b455f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
dl_server_update_idle_time(rq, prev);
- scx_update_idle(rq, false);
+ scx_update_idle(rq, false, true);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
- scx_update_idle(rq, true);
+ scx_update_idle(rq, true, true);
schedstat_inc(rq->sched_goidle);
next->se.exec_start = rq_clock_task(rq);
}
struct task_struct *pick_task_idle(struct rq *rq)
{
+ scx_update_idle(rq, true, false);
return rq->idle;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 935a886af40c..0642ea174849 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -940,8 +940,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
/* a symbol specified */
symbol = kstrdup(argv[1], GFP_KERNEL);
- if (!symbol)
- return -ENOMEM;
+ if (!symbol) {
+ ret = -ENOMEM;
+ goto error;
+ }
tmp = strchr(symbol, '%');
if (tmp) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f7d8fc204579..9362484a653c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2508,6 +2508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
return;
}
+ WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
@@ -2533,6 +2534,12 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
+ * We queue the delayed_work to a specific CPU, for non-zero delays the
+ * caller must ensure it is online and can't go away. Callers that fail
+ * to ensure this, may get @dwork->timer queued to an offlined CPU and
+ * this will prevent queueing of @dwork->work unless the offlined CPU
+ * becomes online again.
+ *
* Return: %false if @work was already on a queue, %true otherwise. If
* @delay is zero and @dwork is idle, it will be scheduled for immediate
* execution.