author    | Jakub Kicinski <kuba@kernel.org> | 2025-01-16 10:30:22 -0800
committer | Jakub Kicinski <kuba@kernel.org> | 2025-01-16 10:34:59 -0800
commit    | 2ee738e90e80850582cbe10f34c6447965c1d87b (patch)
tree      | 75a5d764d283bfac0648710f1e8c51680ef992fa /kernel
parent    | b44e27b4df1a1cd3fd84cf26c82156ed0301575f (diff)
parent    | ce69b4019001407f9cd738dd2ba217b3a8ab831b (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.13-rc8).
Conflicts:

drivers/net/ethernet/realtek/r8169_main.c
  1f691a1fc4be ("r8169: remove redundant hwmon support")
  152d00a91396 ("r8169: simplify setting hwmon attribute visibility")
https://lore.kernel.org/20250115122152.760b4e8d@canb.auug.org.au

Adjacent changes:

drivers/net/ethernet/broadcom/bnxt/bnxt.c
  152f4da05aee ("bnxt_en: add support for rx-copybreak ethtool command")
  f0aa6a37a3db ("eth: bnxt: always recalculate features after XDP clearing, fix null-deref")

drivers/net/ethernet/intel/ice/ice_type.h
  50327223a8bb ("ice: add lock to protect low latency interface")
  dc26548d729e ("ice: Fix quad registers read on E825")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup/cpuset.c      | 44
-rw-r--r-- | kernel/events/uprobes.c     |  2
-rw-r--r-- | kernel/sched/ext.c          | 87
-rw-r--r-- | kernel/sched/ext.h          |  8
-rw-r--r-- | kernel/sched/idle.c         |  5
-rw-r--r-- | kernel/trace/trace_kprobe.c |  6
-rw-r--r-- | kernel/workqueue.c          |  7
7 files changed, 97 insertions, 62 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f321ed515f3a..0f910c828973 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -197,10 +197,8 @@ static struct cpuset top_cpuset = {
 /*
  * There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. We also require taking task_lock() when dereferencing a
- * task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
- * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
+ * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
  * structures. Note that cpuset_mutex needs to be a mutex as it is used in
  * paths that rely on priority inheritance (e.g. scheduler - on RT) for
  * correctness.
@@ -229,9 +227,6 @@ static struct cpuset top_cpuset = {
  * The cpuset_common_seq_show() handlers only hold callback_lock across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
- *
- * Accessing a task's cpuset should be done in accordance with the
- * guidelines for accessing subsystem state in kernel/cgroup.c
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
@@ -890,7 +885,15 @@ v2:
     */
    if (cgrpv2) {
        for (i = 0; i < ndoms; i++) {
-           cpumask_copy(doms[i], csa[i]->effective_cpus);
+           /*
+            * The top cpuset may contain some boot time isolated
+            * CPUs that need to be excluded from the sched domain.
+            */
+           if (csa[i] == &top_cpuset)
+               cpumask_and(doms[i], csa[i]->effective_cpus,
+                       housekeeping_cpumask(HK_TYPE_DOMAIN));
+           else
+               cpumask_copy(doms[i], csa[i]->effective_cpus);
            if (dattr)
                dattr[i] = SD_ATTR_INIT;
        }
@@ -3121,29 +3124,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
    int retval = -ENODEV;
 
    buf = strstrip(buf);
-
-   /*
-    * CPU or memory hotunplug may leave @cs w/o any execution
-    * resources, in which case the hotplug code asynchronously updates
-    * configuration and transfers all tasks to the nearest ancestor
-    * which can execute.
-    *
-    * As writes to "cpus" or "mems" may restore @cs's execution
-    * resources, wait for the previously scheduled operations before
-    * proceeding, so that we don't end up keep removing tasks added
-    * after execution capability is restored.
-    *
-    * cpuset_handle_hotplug may call back into cgroup core asynchronously
-    * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
-    * operation like this one can lead to a deadlock through kernfs
-    * active_ref protection. Let's break the protection. Losing the
-    * protection is okay as we check whether @cs is online after
-    * grabbing cpuset_mutex anyway. This only happens on the legacy
-    * hierarchies.
-    */
-   css_get(&cs->css);
-   kernfs_break_active_protection(of->kn);
-
    cpus_read_lock();
    mutex_lock(&cpuset_mutex);
    if (!is_cpuset_online(cs))
@@ -3176,8 +3156,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 out_unlock:
    mutex_unlock(&cpuset_mutex);
    cpus_read_unlock();
-   kernfs_unbreak_active_protection(of->kn);
-   css_put(&cs->css);
    flush_workqueue(cpuset_migrate_mm_wq);
    return retval ?: nbytes;
 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index fa04b14a7d72..5d71ef85420c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1915,6 +1915,7 @@ void uprobe_free_utask(struct task_struct *t)
    if (!utask)
        return;
 
+   t->utask = NULL;
    WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
 
    timer_delete_sync(&utask->ri_timer);
@@ -1924,7 +1925,6 @@ void uprobe_free_utask(struct task_struct *t)
        ri = free_ret_instance(ri, true /* cleanup_hprobe */);
 
    kfree(utask);
-   t->utask = NULL;
 }
 
 #define RI_TIMER_PERIOD    (HZ / 10)   /* 100 ms */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 19d2699cf638..19813b387ef9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2747,6 +2747,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 {
    struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
    bool prev_on_scx = prev->sched_class == &ext_sched_class;
+   bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
    int nr_loops = SCX_DSP_MAX_LOOPS;
 
    lockdep_assert_rq_held(rq);
@@ -2779,8 +2780,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
         * See scx_ops_disable_workfn() for the explanation on the
         * bypassing test.
         */
-       if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-           prev->scx.slice && !scx_rq_bypassing(rq)) {
+       if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
            rq->scx.flags |= SCX_RQ_BAL_KEEP;
            goto has_tasks;
        }
@@ -2813,6 +2813,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 
    flush_dispatch_buf(rq);
 
+   if (prev_on_rq && prev->scx.slice) {
+       rq->scx.flags |= SCX_RQ_BAL_KEEP;
+       goto has_tasks;
+   }
    if (rq->scx.local_dsq.nr)
        goto has_tasks;
    if (consume_global_dsq(rq))
@@ -2838,8 +2842,7 @@ no_tasks:
     * Didn't find another task to run. Keep running @prev unless
     * %SCX_OPS_ENQ_LAST is in effect.
     */
-   if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-       (!static_branch_unlikely(&scx_ops_enq_last) ||
+   if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
         scx_rq_bypassing(rq))) {
        rq->scx.flags |= SCX_RQ_BAL_KEEP;
        goto has_tasks;
@@ -3034,7 +3037,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
     */
    if (p->scx.slice && !scx_rq_bypassing(rq)) {
        dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
-       return;
+       goto switch_class;
    }
 
    /*
@@ -3051,6 +3054,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
        }
    }
 
+switch_class:
    if (next && next->sched_class != &ext_sched_class)
        switch_class(rq, next);
 }
@@ -3586,16 +3590,8 @@ static void reset_idle_masks(void)
    cpumask_copy(idle_masks.smt, cpu_online_mask);
 }
 
-void __scx_update_idle(struct rq *rq, bool idle)
+static void update_builtin_idle(int cpu, bool idle)
 {
-   int cpu = cpu_of(rq);
-
-   if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
-       SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-       if (!static_branch_unlikely(&scx_builtin_idle_enabled))
-           return;
-   }
-
    if (idle)
        cpumask_set_cpu(cpu, idle_masks.cpu);
    else
@@ -3622,6 +3618,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
 #endif
 }
 
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+   int cpu = cpu_of(rq);
+
+   lockdep_assert_rq_held(rq);
+
+   /*
+    * Trigger ops.update_idle() only when transitioning from a task to
+    * the idle thread and vice versa.
+    *
+    * Idle transitions are indicated by do_notify being set to true,
+    * managed by put_prev_task_idle()/set_next_task_idle().
+    */
+   if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+       SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+   /*
+    * Update the idle masks:
+    * - for real idle transitions (do_notify == true)
+    * - for idle-to-idle transitions (indicated by the previous task
+    *   being the idle thread, managed by pick_task_idle())
+    *
+    * Skip updating idle masks if the previous task is not the idle
+    * thread, since set_next_task_idle() has already handled it when
+    * transitioning from a task to the idle thread (calling this
+    * function with do_notify == true).
+    *
+    * In this way we can avoid updating the idle masks twice,
+    * unnecessarily.
+    */
+   if (static_branch_likely(&scx_builtin_idle_enabled))
+       if (do_notify || is_idle_task(rq->curr))
+           update_builtin_idle(cpu, idle);
+}
+
 static void handle_hotplug(struct rq *rq, bool online)
 {
    int cpu = cpu_of(rq);
@@ -4744,10 +4791,9 @@ static void scx_ops_bypass(bool bypass)
     */
    for_each_possible_cpu(cpu) {
        struct rq *rq = cpu_rq(cpu);
-       struct rq_flags rf;
        struct task_struct *p, *n;
 
-       rq_lock(rq, &rf);
+       raw_spin_rq_lock(rq);
 
        if (bypass) {
            WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4763,7 +4809,7 @@ static void scx_ops_bypass(bool bypass)
         * sees scx_rq_bypassing() before moving tasks to SCX.
         */
        if (!scx_enabled()) {
-           rq_unlock(rq, &rf);
+           raw_spin_rq_unlock(rq);
            continue;
        }
 
@@ -4783,10 +4829,11 @@ static void scx_ops_bypass(bool bypass)
            sched_enq_and_set_task(&ctx);
        }
 
-       rq_unlock(rq, &rf);
-
        /* resched to restore ticks and idle state */
-       resched_cpu(cpu);
+       if (cpu_online(cpu) || cpu == smp_processor_id())
+           resched_curr(rq);
+
+       raw_spin_rq_unlock(rq);
    }
 
    atomic_dec(&scx_ops_breather_depth);
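A note on the ops.update_idle()/@do_notify split introduced in kernel/sched/ext.c above: the distinction exists because a BPF scheduler can "reserve" an idle CPU and kick it without ever dispatching a task there. The following is only an illustrative sketch of that reserve-and-kick pattern as it might appear in a sched_ext scheduler's ops.select_cpu() callback; the "sketch_*" names are invented here, while scx_bpf_pick_idle_cpu(), scx_bpf_kick_cpu() and SCX_KICK_IDLE are the interfaces the new kernel comment refers to.

```c
/* SPDX-License-Identifier: GPL-2.0 */
/* Illustrative sketch only -- not part of this commit. */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
                   s32 prev_cpu, u64 wake_flags)
{
    s32 cpu;

    /*
     * Reserve an idle CPU: this clears its bit in the builtin idle
     * masks, marking it busy even though nothing has been dispatched
     * to it yet.
     */
    cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
    if (cpu >= 0) {
        /* Wake the reserved CPU; it may find no task and re-idle. */
        scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
        return cpu;
    }

    return prev_cpu;
}

SEC(".struct_ops.link")
struct sched_ext_ops sketch_ops = {
    .select_cpu = (void *)sketch_select_cpu,
    .name       = "sketch",
};
```

If the kicked CPU finds nothing to run, it drops straight back into idle. With the change above, pick_task_idle() refreshes the idle masks for that idle-to-idle case (do_notify == false) so the CPU becomes pickable again, without emitting a spurious ops.update_idle() notification.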
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index b1675bb59fc4..4d022d17ac7d 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
 #endif /* CONFIG_SCHED_CLASS_EXT */
 
 #if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
-void __scx_update_idle(struct rq *rq, bool idle);
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
 
-static inline void scx_update_idle(struct rq *rq, bool idle)
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
 {
    if (scx_enabled())
-       __scx_update_idle(rq, idle);
+       __scx_update_idle(rq, idle, do_notify);
 }
 #else
-static inline void scx_update_idle(struct rq *rq, bool idle) {}
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 621696269584..2c85c86b455f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
    dl_server_update_idle_time(rq, prev);
-   scx_update_idle(rq, false);
+   scx_update_idle(rq, false, true);
 }
 
 static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
 {
    update_idle_core(rq);
-   scx_update_idle(rq, true);
+   scx_update_idle(rq, true, true);
    schedstat_inc(rq->sched_goidle);
    next->se.exec_start = rq_clock_task(rq);
 }
 
 struct task_struct *pick_task_idle(struct rq *rq)
 {
+   scx_update_idle(rq, true, false);
    return rq->idle;
 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 935a886af40c..0642ea174849 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -940,8 +940,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
        }
        /* a symbol specified */
        symbol = kstrdup(argv[1], GFP_KERNEL);
-       if (!symbol)
-           return -ENOMEM;
+       if (!symbol) {
+           ret = -ENOMEM;
+           goto error;
+       }
 
        tmp = strchr(symbol, '%');
        if (tmp) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f7d8fc204579..9362484a653c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2508,6 +2508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
        return;
    }
 
+   WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));
    dwork->wq = wq;
    dwork->cpu = cpu;
    timer->expires = jiffies + delay;
@@ -2533,6 +2534,12 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
  * @dwork: work to queue
  * @delay: number of jiffies to wait before queueing
  *
+ * We queue the delayed_work to a specific CPU, for non-zero delays the
+ * caller must ensure it is online and can't go away. Callers that fail
+ * to ensure this, may get @dwork->timer queued to an offlined CPU and
+ * this will prevent queueing of @dwork->work unless the offlined CPU
+ * becomes online again.
+ *
  * Return: %false if @work was already on a queue, %true otherwise.  If
  * @delay is zero and @dwork is idle, it will be scheduled for immediate
  * execution.
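The kerneldoc paragraph added above makes the queue_delayed_work_on() contract explicit: with a non-zero delay the timer is armed on the chosen CPU, so the caller must make sure that CPU is online and stays online. Below is only a hedged sketch of how a caller might honor that at queueing time; the helper name and the work item are invented for illustration, system_wq is used as the target workqueue, and keeping the CPU online for the entire delay would still need something stronger (for example a CPU-hotplug callback that cancels or requeues the work).

```c
/* Illustrative sketch only -- names other than the workqueue APIs are invented. */
#include <linux/cpu.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
    /* ... do the deferred work ... */
}
static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

static bool queue_on_cpu_if_online(int cpu, unsigned long delay)
{
    bool queued;

    cpus_read_lock();   /* hold off CPU hotplug while we pick the target */
    if (cpu_online(cpu))
        queued = queue_delayed_work_on(cpu, system_wq, &my_dwork, delay);
    else
        /* Fall back to an unbound CPU rather than arming a timer on a dead one. */
        queued = queue_delayed_work_on(WORK_CPU_UNBOUND, system_wq,
                                       &my_dwork, delay);
    cpus_read_unlock();

    return queued;
}
```

This only guarantees the CPU is online at the moment the timer is armed, which matches the condition the new WARN_ON_ONCE() in __queue_delayed_work() checks on the timer path (the hunk arms the timer right after it via dwork->cpu and timer->expires).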