From 37eacb9f6e89fb399a79e952bc9c78eb3e16290e Mon Sep 17 00:00:00 2001
From: Anton Protopopov
Date: Fri, 12 Apr 2024 16:11:00 +0200
Subject: bpf: Fix a verifier verbose message

Long ago a map file descriptor in a pseudo ldimm64 instruction could
only be present as an immediate value insn[0].imm, and thus this value
was used in a verbose verifier message printed when the file descriptor
wasn't valid. Since addition of BPF_PSEUDO_MAP_IDX_VALUE/BPF_PSEUDO_MAP_IDX
the insn[0].imm field can also contain an index pointing to the file
descriptor in the attr.fd_array array. However, if the file descriptor
is invalid, the verifier still prints the verbose message containing the
value of insn[0].imm. Patch the verifier message to always print the
actual file descriptor value.

Fixes: 387544bfa291 ("bpf: Introduce fd_idx")
Signed-off-by: Anton Protopopov
Signed-off-by: Daniel Borkmann
Acked-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20240412141100.3562942-1-aspsk@isovalent.com
---
 kernel/bpf/verifier.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98188379d5c7..aa24beb65393 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18289,8 +18289,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 			f = fdget(fd);
 			map = __bpf_map_get(f);
 			if (IS_ERR(map)) {
-				verbose(env, "fd %d is not pointing to valid bpf_map\n",
-					insn[0].imm);
+				verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
 				return PTR_ERR(map);
 			}
--
cgit

From 11b1b8bc2b98e21ddf47e08b56c21502c685b2c3 Mon Sep 17 00:00:00 2001
From: Tianchen Ding
Date: Wed, 6 Mar 2024 10:21:32 +0800
Subject: sched/eevdf: Always update V if se->on_rq when reweighting

reweight_eevdf() needs the latest V to accurately calculate the new ve
and vd, so update V unconditionally when se is runnable.

Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
Suggested-by: Abel Wu
Signed-off-by: Tianchen Ding
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Abel Wu
Tested-by: K Prateek Nayak
Tested-by: Chen Yu
Link: https://lore.kernel.org/r/20240306022133.81008-2-dtcccc@linux.alibaba.com
---
 kernel/sched/fair.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03be0d1330a6..5551ce2af73e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3790,9 +3790,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
-		if (curr)
-			update_curr(cfs_rq);
-		else
+		update_curr(cfs_rq);
+		if (!curr)
 			__dequeue_entity(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
--
cgit

From afae8002b4fd3560c8f5f1567f3c3202c30a70fa Mon Sep 17 00:00:00 2001
From: Tianchen Ding
Date: Wed, 6 Mar 2024 10:21:33 +0800
Subject: sched/eevdf: Fix miscalculation in reweight_entity() when se is not
 curr

reweight_eevdf() only keeps V unchanged inside itself. When
se != cfs_rq->curr, se is dequeued from the rb tree first, so V changes
and the subsequent calculation is wrong. Pass the original V to
reweight_eevdf() to fix this issue.
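A rough userspace sketch of why this matters (illustrative values only;
the kernel computes V with fixed-point weights and offsets): V is the
load-weighted average of the queued entities' vruntimes, so removing an
entity from the aggregate shifts V unless the caller compensates.

  #include <stdio.h>
  #include <stdint.h>

  struct entity { int64_t vruntime; int64_t weight; };

  /* Model of avg_vruntime(): V = sum(w_i * v_i) / sum(w_i). */
  static int64_t avg_vruntime(const struct entity *e, int n)
  {
          int64_t sum = 0, load = 0;

          for (int i = 0; i < n; i++) {
                  sum  += e[i].vruntime * e[i].weight;
                  load += e[i].weight;
          }
          return load ? sum / load : 0;
  }

  int main(void)
  {
          struct entity rq[] = { { 100, 1024 }, { 200, 2048 }, { 50, 1024 } };

          printf("V with all entities: %lld\n", (long long)avg_vruntime(rq, 3));
          /* "Dequeue" the last entity: V over the remainder differs. */
          printf("V after dequeue:     %lld\n", (long long)avg_vruntime(rq, 2));
          return 0;
  }

Hence the fix below snapshots avruntime right after update_curr(),
before __dequeue_entity(), and passes it down so reweight_eevdf() sees
the V that was in effect while se was still enqueued.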
Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
Signed-off-by: Tianchen Ding
[peterz: flip if() condition for clarity]
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Abel Wu
Link: https://lkml.kernel.org/r/20240306022133.81008-3-dtcccc@linux.alibaba.com
---
 kernel/sched/fair.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5551ce2af73e..6d266917d38d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3676,11 +3676,10 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq,
 				    struct sched_entity *se) { }
 #endif
 
-static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
 			   unsigned long weight)
 {
 	unsigned long old_weight = se->load.weight;
-	u64 avruntime = avg_vruntime(cfs_rq);
 	s64 vlag, vslice;
 
 	/*
@@ -3787,24 +3786,26 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
 	bool curr = cfs_rq->curr == se;
+	u64 avruntime;
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		update_curr(cfs_rq);
+		avruntime = avg_vruntime(cfs_rq);
 		if (!curr)
 			__dequeue_entity(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
 
-	if (!se->on_rq) {
+	if (se->on_rq) {
+		reweight_eevdf(se, avruntime, weight);
+	} else {
 		/*
 		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
 		 * we need to scale se->vlag when w_i changes.
 		 */
 		se->vlag = div_s64(se->vlag * se->load.weight, weight);
-	} else {
-		reweight_eevdf(cfs_rq, se, weight);
 	}
 
 	update_load_set(&se->load, weight);
--
cgit

From 1560d1f6eb6b398bddd80c16676776c0325fe5fe Mon Sep 17 00:00:00 2001
From: Xuewen Yan
Date: Mon, 22 Apr 2024 16:22:38 +0800
Subject: sched/eevdf: Prevent vlag from going out of bounds in
 reweight_eevdf()

It was possible to have pick_eevdf() return NULL, which then causes a
NULL-deref. This turned out to be due to entity_eligible() returning
falsely negative because of an s64 multiplication overflow.

Specifically, reweight_eevdf() computes the vlag without considering the
limit placed upon vlag as update_entity_lag() does, and then the scaling
multiplication (remember that weight is 20bit fixed point) can overflow.
This then leads to the new vruntime being weird, which then causes the
above entity_eligible() to go side-ways and claim nothing is eligible.

Thus limit the range of vlag accordingly.

All this was quite rare, but fatal when it does happen.
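A standalone sketch of the overflow (magnitudes are illustrative; the
kernel's weights are 20-bit fixed point as noted above): with an
unclamped vlag on the order of 2^50, scaling by a ~2^20 weight exceeds
the s64 range, while the clamped value stays safely within it.

  #include <stdio.h>
  #include <stdint.h>

  /* Plain macro for this sketch; the kernel has its own clamp(). */
  #define clamp(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

  int main(void)
  {
          int64_t vlag       = 1LL << 50; /* huge unclamped vruntime delta */
          int64_t old_weight = 1LL << 20; /* ~20-bit fixed-point weight */
          int64_t limit      = 1LL << 30; /* stand-in for calc_delta_fair(2*slice) */

          /* Unclamped: 2^50 * 2^20 = 2^70, which overflows s64.
           * Clamped:   2^30 * 2^20 = 2^50, which fits. */
          vlag = clamp(vlag, -limit, limit);
          printf("scaled vlag: %lld\n", (long long)(vlag * old_weight));
          return 0;
  }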
Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/
Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/
Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/
Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
Reported-by: Sergei Trofimovich
Reported-by: Igor Raits
Reported-by: Breno Leitao
Reported-by: kernel test robot
Reported-by: Yujie Liu
Signed-off-by: Xuewen Yan
Reviewed-and-tested-by: Chen Yu
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20240422082238.5784-1-xuewen.yan@unisoc.com
---
 kernel/sched/fair.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6d266917d38d..c62805dbd608 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -696,15 +696,21 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
  *
  * XXX could add max_slice to the augmented data to track this.
  */
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(u64 avruntime, struct sched_entity *se)
 {
-	s64 lag, limit;
+	s64 vlag, limit;
+
+	vlag = avruntime - se->vruntime;
+	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+	return clamp(vlag, -limit, limit);
+}
 
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
 	SCHED_WARN_ON(!se->on_rq);
-	lag = avg_vruntime(cfs_rq) - se->vruntime;
 
-	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
-	se->vlag = clamp(lag, -limit, limit);
+	se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
 }
 
 /*
@@ -3760,7 +3766,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
 	 *	   = V  - vl'
 	 */
 	if (avruntime != se->vruntime) {
-		vlag = (s64)(avruntime - se->vruntime);
+		vlag = entity_lag(avruntime, se);
 		vlag = div_s64(vlag * old_weight, weight);
 		se->vruntime = avruntime - vlag;
 	}
--
cgit

From 57a01eafdcf78f6da34fad9ff075ed5dfdd9f420 Mon Sep 17 00:00:00 2001
From: Sven Schnelle
Date: Tue, 23 Apr 2024 08:19:05 +0200
Subject: workqueue: Fix selection of wake_cpu in kick_pool()

With cpu_possible_mask=0-63 and cpu_online_mask=0-7 the following kernel
oops was observed:

  smp: Bringing up secondary CPUs ...
  smp: Brought up 1 node, 8 CPUs

  Unable to handle kernel pointer dereference in virtual kernel address space
  Failing address: 0000000000000000 TEID: 0000000000000803
  [..]
  Call Trace:
   arch_vcpu_is_preempted+0x12/0x80
   select_idle_sibling+0x42/0x560
   select_task_rq_fair+0x29a/0x3b0
   try_to_wake_up+0x38e/0x6e0
   kick_pool+0xa4/0x198
   __queue_work.part.0+0x2bc/0x3a8
   call_timer_fn+0x36/0x160
   __run_timers+0x1e2/0x328
   __run_timer_base+0x5a/0x88
   run_timer_softirq+0x40/0x78
   __do_softirq+0x118/0x388
   irq_exit_rcu+0xc0/0xd8
   do_ext_irq+0xae/0x168
   ext_int_handler+0xbe/0xf0
   psw_idle_exit+0x0/0xc
   default_idle_call+0x3c/0x110
   do_idle+0xd4/0x158
   cpu_startup_entry+0x40/0x48
   rest_init+0xc6/0xc8
   start_kernel+0x3c4/0x5e0
   startup_continue+0x3c/0x50

The crash is caused by calling arch_vcpu_is_preempted() for an offline
CPU. To avoid this, select the CPU with cpumask_any_and_distribute(),
which masks __pod_cpumask with cpu_online_mask. If no CPU is left in the
pool, skip the assignment.

tj: This doesn't fully fix the bug as CPUs can still go down between
    picking the target CPU and the wake call. Fixing that likely
    requires adding cpu_online() test to either the sched or s390 arch
    code.
    However, regardless of how that is fixed, workqueue shouldn't be
    picking a CPU which isn't online as that would result in
    unpredictable and worse behavior.

Signed-off-by: Sven Schnelle
Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
Cc: stable@vger.kernel.org # v6.6+
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0066c8f6c154..a2af0aaf026b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1277,8 +1277,12 @@ static bool kick_pool(struct worker_pool *pool)
 	    !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
 		struct work_struct *work = list_first_entry(&pool->worklist,
 						struct work_struct, entry);
-		p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
-		get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+		int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
+							  cpu_online_mask);
+		if (wake_cpu < nr_cpu_ids) {
+			p->wake_cpu = wake_cpu;
+			get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+		}
 	}
 #endif
 	wake_up_process(p);
--
cgit

From d40f92020c7a225b77e68599e4b099a4a0823408 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 22 Apr 2024 14:43:48 -1000
Subject: workqueue: The default node_nr_active should have its max set to
 max_active

The default nna (node_nr_active) is used when the pool isn't tied to a
specific NUMA node. This can happen in the following cases:

1. On NUMA, if per-node pwq init fails and the fallback pwq is used.
2. On NUMA, if a pool is configured to span multiple nodes.
3. On single node setups.

5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement
for unbound workqueues") set the default nna->max to min_active because
only #1 was being considered. For #2 and #3, using min_active means that
the max concurrency in normal operation is pushed down to min_active,
which is currently 8, and can obviously lead to performance issues.

The exact value nna->max is set to doesn't really matter. #2 can only
happen if the workqueue is intentionally configured to ignore NUMA
boundaries and there's no good way to distribute max_active in this
case. #3 is the default behavior on single node machines.

Let's set the default nna->max to max_active. This fixes the
artificially lowered concurrency problem on single node machines and
shouldn't hurt anything for other cases.
Signed-off-by: Tejun Heo
Reported-by: Shinichiro Kawasaki
Fixes: 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues")
Link: https://lore.kernel.org/dm-devel/20240410084531.2134621-1-shinichiro.kawasaki@wdc.com/
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a2af0aaf026b..5f536c63a48d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1610,7 +1610,7 @@ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
 				   min_active, max_active);
 	}
 
-	wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
+	wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
 }
 
 /**
--
cgit

From 91f098704c25106d88706fc9f8bcfce01fdb97df Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Wed, 24 Apr 2024 21:51:54 +0800
Subject: workqueue: Fix divide error in wq_update_node_max_active()

Yue Sun and xingwei lee reported a divide error bug in
wq_update_node_max_active():

  divide error: 0000 [#1] PREEMPT SMP KASAN PTI
  CPU: 1 PID: 21 Comm: cpuhp/1 Not tainted 6.9.0-rc5 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
  RIP: 0010:wq_update_node_max_active+0x369/0x6b0 kernel/workqueue.c:1605
  Code: 24 bf 00 00 00 80 44 89 fe e8 83 27 33 00 41 83 fc ff 75 0d 41 81 ff 00 00 00 80 0f 84 68 01 00 00 e8 fb 22 33 00 44 89 f8 99 <41> f7 fc 89 c5 89 c7 44 89 ee e8 a8 24 33 00 89 ef 8b 5c 24 04 89
  RSP: 0018:ffffc9000018fbb0 EFLAGS: 00010293
  RAX: 00000000000000ff RBX: 0000000000000001 RCX: ffff888100ada500
  RDX: 0000000000000000 RSI: 00000000000000ff RDI: 0000000080000000
  RBP: 0000000000000001 R08: ffffffff815b1fcd R09: 1ffff1100364ad72
  R10: dffffc0000000000 R11: ffffed100364ad73 R12: 0000000000000000
  R13: 0000000000000100 R14: 0000000000000000 R15: 00000000000000ff
  FS:  0000000000000000(0000) GS:ffff888135c00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007fb8c06ca6f8 CR3: 000000010d6c6000 CR4: 0000000000750ef0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  PKRU: 55555554
  Call Trace:
   workqueue_offline_cpu+0x56f/0x600 kernel/workqueue.c:6525
   cpuhp_invoke_callback+0x4e1/0x870 kernel/cpu.c:194
   cpuhp_thread_fun+0x411/0x7d0 kernel/cpu.c:1092
   smpboot_thread_fn+0x544/0xa10 kernel/smpboot.c:164
   kthread+0x2ed/0x390 kernel/kthread.c:388
   ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
   ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244
  Modules linked in:
  ---[ end trace 0000000000000000 ]---

After analysis, it happens when all of the CPUs in a workqueue's
affinity get offline.

The problem can be easily reproduced by:

  # echo 8 > /sys/devices/virtual/workqueue//cpumask
  # echo 0 > /sys/devices/system/cpu/cpu3/online

Use the default max_actives for nodes when all of the CPUs in the
workqueue's affinity get offline to fix the problem.
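A minimal userspace sketch of the failure mode and the added guard
(names and values are illustrative, not the kernel's):

  #include <stdio.h>

  int main(void)
  {
          int max_active = 256, min_active = 8;
          int node_cpus = 0, total_cpus = 0; /* every CPU in the wq's affinity is offline */

          if (!total_cpus) { /* the guard the hunk below adds */
                  printf("fall back to defaults: min %d, max %d\n",
                         min_active, max_active);
                  return 0;
          }
          /* Without the guard this divides by zero. */
          printf("node max: %d\n", max_active * node_cpus / total_cpus);
          return 0;
  }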
Reported-by: Yue Sun
Reported-by: xingwei lee
Link: https://lore.kernel.org/lkml/CAEkJfYPGS1_4JqvpSo0=FM0S1ytB8CEbyreLTtWpR900dUZymw@mail.gmail.com/
Fixes: 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues")
Cc: stable@vger.kernel.org
Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f536c63a48d..d2dbe099286b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1598,6 +1598,15 @@ static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
 	if (off_cpu >= 0)
 		total_cpus--;
 
+	/* If all CPUs of the wq get offline, use the default values */
+	if (unlikely(!total_cpus)) {
+		for_each_node(node)
+			wq_node_nr_active(wq, node)->max = min_active;
+
+		wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
+		return;
+	}
+
 	for_each_node(node) {
 		int node_cpus;
--
cgit

From d99e3140a4d33e26066183ff727d8f02f56bec64 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Thu, 21 Mar 2024 14:24:43 +0000
Subject: mm: turn folio_test_hugetlb into a PageType

The current folio_test_hugetlb() can be fooled by a concurrent folio
split into returning true for a folio which has never belonged to
hugetlbfs. This can't happen if the caller holds a refcount on it, but
we have a few places (memory-failure, compaction, procfs) which do not
and should not take a speculative reference.

Since hugetlb pages do not use individual page mapcounts (they are
always fully mapped and use the entire_mapcount field to record the
number of mappings), the PageType field is available now that
page_mapcount() ignores the value in this field.

In compaction and with CONFIG_DEBUG_VM enabled, the current
implementation can result in an oops, as reported by Luis. This happens
since 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") effectively added
some VM_BUG_ON() checks in the PageHuge() testing path.
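An illustrative userspace model of the PageType encoding (constants are
stand-ins, not copied from page-flags.h): the type lives in the same
word as the page mapcount, using values that can never be a valid
mapcount, and setting a type clears its bit so the stored word equals
~PG_x, which is what the vmcoreinfo hunk below exports.

  #include <stdio.h>
  #include <stdint.h>

  #define PAGE_TYPE_BASE  0xf0000000u
  #define PG_hugetlb      0x00000004u   /* assumed bit for this sketch */

  /* Hypothetical test helper mirroring the PageType() check. */
  static int PageHugeType(uint32_t page_type)
  {
          return (page_type & (PAGE_TYPE_BASE | PG_hugetlb)) == PAGE_TYPE_BASE;
  }

  int main(void)
  {
          uint32_t page_type = 0xffffffffu;     /* no type set */

          printf("before: %d\n", PageHugeType(page_type));
          page_type &= ~PG_hugetlb;             /* set the hugetlb type */
          printf("after:  %d\n", PageHugeType(page_type));
          printf("stored word: 0x%x (= ~PG_hugetlb)\n", page_type);
          return 0;
  }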
[willy@infradead.org: update vmcoreinfo]
Link: https://lkml.kernel.org/r/ZgGZUvsdhaT1Va-T@casper.infradead.org
Link: https://lkml.kernel.org/r/20240321142448.1645400-6-willy@infradead.org
Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR")
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: David Hildenbrand
Acked-by: Vlastimil Babka
Reported-by: Luis Chamberlain
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218227
Cc: Miaohe Lin
Cc: Muchun Song
Cc: Oscar Salvador
Cc:
Signed-off-by: Andrew Morton
---
 kernel/vmcore_info.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index f95516cd45bb..23c125c2e243 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -205,11 +205,10 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_NUMBER(PG_head_mask);
 #define PAGE_BUDDY_MAPCOUNT_VALUE	(~PG_buddy)
 	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLB_PAGE
-	VMCOREINFO_NUMBER(PG_hugetlb);
+#define PAGE_HUGETLB_MAPCOUNT_VALUE	(~PG_hugetlb)
+	VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE);
 #define PAGE_OFFLINE_MAPCOUNT_VALUE	(~PG_offline)
 	VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
-#endif
 #ifdef CONFIG_KALLSYMS
 	VMCOREINFO_SYMBOL(kallsyms_names);
--
cgit

From fe42754b94a42d08cf9501790afc25c4f6a5f631 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Apr 2024 17:05:54 -0700
Subject: cpu: Re-enable CPU mitigations by default for !X86 architectures

Rename x86's SPECULATION_MITIGATIONS to CPU_MITIGATIONS, define it in
generic code, and force it on for all architectures except x86. A recent
commit to turn mitigations off by default if SPECULATION_MITIGATIONS=n
kinda sorta missed that "cpu_mitigations" is completely generic, whereas
SPECULATION_MITIGATIONS is x86-specific.

Rename x86's SPECULATION_MITIGATIONS instead of keeping both and have it
select CPU_MITIGATIONS, as having two configs for the same thing is
unnecessary and confusing. This will also allow x86 to use the knob to
manage mitigations that aren't strictly related to speculative
execution.

Use another Kconfig to communicate to common code that CPU_MITIGATIONS
is already defined instead of having x86's menu depend on the common
CPU_MITIGATIONS. This allows keeping a single point of contact for all
of x86's mitigations, and it's not clear that other architectures *want*
to allow disabling mitigations at compile-time.

Fixes: f337a6a21e2f ("x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n")
Closes: https://lkml.kernel.org/r/20240413115324.53303a68%40canb.auug.org.au
Reported-by: Stephen Rothwell
Reported-by: Michael Ellerman
Reported-by: Geert Uytterhoeven
Signed-off-by: Sean Christopherson
Signed-off-by: Borislav Petkov (AMD)
Acked-by: Josh Poimboeuf
Acked-by: Borislav Petkov (AMD)
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240420000556.2645001-2-seanjc@google.com
---
 kernel/cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 07ad53b7f119..bb0ff275fb46 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3207,8 +3207,8 @@ enum cpu_mitigations {
 };
 
 static enum cpu_mitigations cpu_mitigations __ro_after_init =
-	IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) ? CPU_MITIGATIONS_AUTO :
-						     CPU_MITIGATIONS_OFF;
+	IS_ENABLED(CONFIG_CPU_MITIGATIONS) ? CPU_MITIGATIONS_AUTO :
+					     CPU_MITIGATIONS_OFF;
 
 static int __init mitigations_parse_cmdline(char *arg)
 {
--
cgit

From ce0abef6a1d540acef85068e0e82bdf1fbeeb0e9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Fri, 19 Apr 2024 17:05:55 -0700
Subject: cpu: Ignore "mitigations" kernel parameter if CPU_MITIGATIONS=n

Explicitly disallow enabling mitigations at runtime for kernels that
were built with CONFIG_CPU_MITIGATIONS=n, as some architectures may omit
code entirely if mitigations are disabled at compile time.

E.g. on x86, a large pile of Kconfigs are buried behind CPU_MITIGATIONS,
and trying to provide sane behavior for retroactively enabling
mitigations is extremely difficult, bordering on impossible. E.g. page
table isolation and call depth tracking require build-time support, BHI
mitigations will still be off without additional kernel parameters, etc.

  [ bp: Touchups. ]

Signed-off-by: Sean Christopherson
Signed-off-by: Borislav Petkov (AMD)
Acked-by: Borislav Petkov (AMD)
Link: https://lore.kernel.org/r/20240420000556.2645001-3-seanjc@google.com
---
 kernel/cpu.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index bb0ff275fb46..63447eb85dab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3196,6 +3196,7 @@ void __init boot_cpu_hotplug_init(void)
 	this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
 }
 
+#ifdef CONFIG_CPU_MITIGATIONS
 /*
  * These are used for a global "mitigations=" cmdline option for toggling
  * optional CPU mitigations.
@@ -3206,9 +3207,7 @@ enum cpu_mitigations {
 	CPU_MITIGATIONS_AUTO_NOSMT,
 };
 
-static enum cpu_mitigations cpu_mitigations __ro_after_init =
-	IS_ENABLED(CONFIG_CPU_MITIGATIONS) ? CPU_MITIGATIONS_AUTO :
-					     CPU_MITIGATIONS_OFF;
+static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
 
 static int __init mitigations_parse_cmdline(char *arg)
 {
@@ -3224,7 +3223,6 @@ static int __init mitigations_parse_cmdline(char *arg)
 	return 0;
 }
-early_param("mitigations", mitigations_parse_cmdline);
 
 /* mitigations=off */
 bool cpu_mitigations_off(void)
@@ -3239,3 +3237,11 @@ bool cpu_mitigations_auto_nosmt(void)
 	return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
 }
 EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
+#else
+static int __init mitigations_parse_cmdline(char *arg)
+{
+	pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n");
+	return 0;
+}
+#endif
+early_param("mitigations", mitigations_parse_cmdline);
--
cgit

From 66e13b615a0ce76b785d780ecc9776ba71983629 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan
Date: Wed, 24 Apr 2024 10:02:08 +0000
Subject: bpf: verifier: prevent userspace memory access

With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To
thwart invalid memory accesses, the JITs add an exception table entry
for all such accesses. But in case the src_reg + offset is a userspace
address, the BPF program might read that memory if the user has mapped
it.

Make the verifier add guard instructions around such memory accesses and
skip the load if the address falls into the userspace region.

The JITs need to implement bpf_arch_uaddress_limit() to define where the
userspace addresses end for that architecture, or TASK_SIZE is taken as
default.
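As a rough userspace model of the guard (the limit constant below is an
illustrative x86-64-style TASK_SIZE, not taken from any JIT):

  #include <stdio.h>
  #include <stdint.h>

  #define UADDRESS_LIMIT 0x7ffffffff000ULL  /* assumed, for illustration */

  static uint64_t guarded_load(const uint64_t *addr)
  {
          /* Compare only the upper 32 bits of address and limit. */
          if ((uintptr_t)addr >> 32 <= UADDRESS_LIMIT >> 32)
                  return 0;   /* address looks like userspace: skip the load */
          return *addr;
  }

  int main(void)
  {
          uint64_t val = 42;

          /* In a userspace process every pointer is below the limit,
           * so the guard always skips the load and yields 0. */
          printf("%llu\n", (unsigned long long)guarded_load(&val));
          return 0;
  }

Comparing only the upper halves is why the commit describes the check as
aligning both values down to a 4GB boundary.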
The implementation is as follows:

  REG_AX = SRC_REG
  if(offset)
          REG_AX += offset;
  REG_AX >>= 32;
  if (REG_AX <= (uaddress_limit >> 32))
          DST_REG = 0;
  else
          DST_REG = *(size *)(SRC_REG + offset);

Comparing just the upper 32 bits of the load address with the upper 32
bits of uaddress_limit implies that the values are being aligned down to
a 4GB boundary before comparison.

The above means that all loads with address <= uaddress_limit + 4GB are
skipped. This is acceptable because there is a large hole (much larger
than 4GB) between userspace and kernel space memory, therefore a
correctly functioning BPF program should not access this 4GB memory
above the userspace.

Let's analyze what this patch does to the following fentry program
dereferencing an untrusted pointer:

  SEC("fentry/tcp_v4_connect")
  int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
  {
          *(volatile long *)sk;
          return 0;
  }

BPF program before:

  0: (79) r1 = *(u64 *)(r1 +0)
  1: (79) r1 = *(u64 *)(r1 +0)
  2: (b7) r0 = 0
  3: (95) exit

BPF program after:

  0: (79) r1 = *(u64 *)(r1 +0)
  1: (bf) r11 = r1
  2: (77) r11 >>= 32
  3: (b5) if r11 <= 0x8000 goto pc+2
  4: (79) r1 = *(u64 *)(r1 +0)
  5: (05) goto pc+1
  6: (b7) r1 = 0
  7: (b7) r0 = 0
  8: (95) exit

As you can see from above, in the best case (off=0), 5 extra
instructions are emitted.

Now, we analyze the same program after it has gone through the JITs of
ARM64 and RISC-V architectures. We follow the single load instruction
that has the untrusted pointer and see what instrumentation has been
added around it.

x86-64 JIT
==========
JIT's instrumentation (upstream):

   0:   nopl   0x0(%rax,%rax,1)
   5:   xchg   %ax,%ax
   7:   push   %rbp
   8:   mov    %rsp,%rbp
   b:   mov    0x0(%rdi),%rdi
   f:   movabs $0x800000000000,%r11
  19:   cmp    %r11,%rdi
  1c:   jb     0x000000000000002a
  1e:   mov    %rdi,%r11
  21:   add    $0x0,%r11
  28:   jae    0x000000000000002e
  2a:   xor    %edi,%edi
  2c:   jmp    0x0000000000000032
  2e:   mov    0x0(%rdi),%rdi
  32:   xor    %eax,%eax
  34:   leave
  35:   ret

The x86-64 JIT already emits some instructions to protect against user
memory access. This patch doesn't make any changes for the x86-64 JIT.

ARM64 JIT
=========
No instrumentation (upstream):

   0:   add    x9, x30, #0x0
   4:   nop
   8:   paciasp
   c:   stp    x29, x30, [sp, #-16]!
  10:   mov    x29, sp
  14:   stp    x19, x20, [sp, #-16]!
  18:   stp    x21, x22, [sp, #-16]!
  1c:   stp    x25, x26, [sp, #-16]!
  20:   stp    x27, x28, [sp, #-16]!
  24:   mov    x25, sp
  28:   mov    x26, #0x0
  2c:   sub    x27, x25, #0x0
  30:   sub    sp, sp, #0x0
  34:   ldr    x0, [x0]
  38:   ldr    x0, [x0]
  3c:   mov    x7, #0x0
  40:   mov    sp, sp
  44:   ldp    x27, x28, [sp], #16
  48:   ldp    x25, x26, [sp], #16
  4c:   ldp    x21, x22, [sp], #16
  50:   ldp    x19, x20, [sp], #16
  54:   ldp    x29, x30, [sp], #16
  58:   add    x0, x7, #0x0
  5c:   autiasp
  60:   ret
  64:   nop
  68:   ldr    x10, 0x0000000000000070
  6c:   br     x10

Verifier's instrumentation (this patch):

   0:   add    x9, x30, #0x0
   4:   nop
   8:   paciasp
   c:   stp    x29, x30, [sp, #-16]!
  10:   mov    x29, sp
  14:   stp    x19, x20, [sp, #-16]!
  18:   stp    x21, x22, [sp, #-16]!
  1c:   stp    x25, x26, [sp, #-16]!
  20:   stp    x27, x28, [sp, #-16]!
  24:   mov    x25, sp
  28:   mov    x26, #0x0
  2c:   sub    x27, x25, #0x0
  30:   sub    sp, sp, #0x0
  34:   ldr    x0, [x0]
  38:   add    x9, x0, #0x0
  3c:   lsr    x9, x9, #32
  40:   cmp    x9, #0x10, lsl #12
  44:   b.ls   0x0000000000000050
  48:   ldr    x0, [x0]
  4c:   b      0x0000000000000054
  50:   mov    x0, #0x0
  54:   mov    x7, #0x0
  58:   mov    sp, sp
  5c:   ldp    x27, x28, [sp], #16
  60:   ldp    x25, x26, [sp], #16
  64:   ldp    x21, x22, [sp], #16
  68:   ldp    x19, x20, [sp], #16
  6c:   ldp    x29, x30, [sp], #16
  70:   add    x0, x7, #0x0
  74:   autiasp
  78:   ret
  7c:   nop
  80:   ldr    x10, 0x0000000000000088
  84:   br     x10

There are 6 extra instructions added in ARM64 in the best case. This
will become 7 in the worst case (off != 0).

RISC-V JIT (RISCV_ISA_C Disabled)
==========
No instrumentation (upstream):

   0:   nop
   4:   nop
   8:   li     a6, 33
   c:   addi   sp, sp, -16
  10:   sd     s0, 8(sp)
  14:   addi   s0, sp, 16
  18:   ld     a0, 0(a0)
  1c:   ld     a0, 0(a0)
  20:   li     a5, 0
  24:   ld     s0, 8(sp)
  28:   addi   sp, sp, 16
  2c:   sext.w a0, a5
  30:   ret

Verifier's instrumentation (this patch):

   0:   nop
   4:   nop
   8:   li     a6, 33
   c:   addi   sp, sp, -16
  10:   sd     s0, 8(sp)
  14:   addi   s0, sp, 16
  18:   ld     a0, 0(a0)
  1c:   mv     t0, a0
  20:   srli   t0, t0, 32
  24:   lui    t1, 4096
  28:   sext.w t1, t1
  2c:   bgeu   t1, t0, 12
  30:   ld     a0, 0(a0)
  34:   j      8
  38:   li     a0, 0
  3c:   li     a5, 0
  40:   ld     s0, 8(sp)
  44:   addi   sp, sp, 16
  48:   sext.w a0, a5
  4c:   ret

There are 7 extra instructions added in RISC-V.

Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables")
Reported-by: Breno Leitao
Suggested-by: Alexei Starovoitov
Acked-by: Ilya Leoshkevich
Signed-off-by: Puranjay Mohan
Link: https://lore.kernel.org/r/20240424100210.11982-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/core.c     |  9 +++++++++
 kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 696bc55de8e8..1ea5ce5bb599 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2942,6 +2942,15 @@ bool __weak bpf_jit_supports_arena(void)
 	return false;
 }
 
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+	return TASK_SIZE;
+#else
+	return 0;
+#endif
+}
+
 /* Return TRUE if the JIT backend satisfies the following two conditions:
  * 1) JIT backend supports atomic_xchg() on pointer-sized words.
  * 2) Under the specific arch, the implementation of xchg() is the same
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aa24beb65393..cb7ad1f795e1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19675,6 +19675,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
+		/* Make it impossible to de-reference a userspace address */
+		if (BPF_CLASS(insn->code) == BPF_LDX &&
+		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+			struct bpf_insn *patch = &insn_buf[0];
+			u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+			if (!uaddress_limit)
+				goto next_insn;
+
+			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+			if (insn->off)
+				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+			*patch++ = *insn;
+			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+			cnt = patch - insn_buf;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto next_insn;
+		}
+
 		/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
 		if (BPF_CLASS(insn->code) == BPF_LD &&
 		    (BPF_MODE(insn->code) == BPF_ABS ||
--
cgit

From 2e5449f4f21a1b0bd9beec4c4b580eb1f9b9ed7f Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Sat, 27 Apr 2024 15:27:58 +0900
Subject: profiling: Remove create_prof_cpu_mask().

create_prof_cpu_mask() is no longer used after commit 1f44a225777e
("s390: convert interrupt handling to use generic hardirq").

Signed-off-by: Tetsuo Handa
Signed-off-by: Linus Torvalds
---
 kernel/profile.c | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index 8a77769bc4b4..2b775cc5c28f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -344,49 +344,6 @@ void profile_tick(int type)
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
-{
-	seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
-	return 0;
-}
-
-static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, prof_cpu_mask_proc_show, NULL);
-}
-
-static ssize_t prof_cpu_mask_proc_write(struct file *file,
-	const char __user *buffer, size_t count, loff_t *pos)
-{
-	cpumask_var_t new_value;
-	int err;
-
-	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
-		return -ENOMEM;
-
-	err = cpumask_parse_user(buffer, count, new_value);
-	if (!err) {
-		cpumask_copy(prof_cpu_mask, new_value);
-		err = count;
-	}
-	free_cpumask_var(new_value);
-	return err;
-}
-
-static const struct proc_ops prof_cpu_mask_proc_ops = {
-	.proc_open	= prof_cpu_mask_proc_open,
-	.proc_read	= seq_read,
-	.proc_lseek	= seq_lseek,
-	.proc_release	= single_release,
-	.proc_write	= prof_cpu_mask_proc_write,
-};
-
-void create_prof_cpu_mask(void)
-{
-	/* create /proc/irq/prof_cpu_mask */
-	proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
-}
-
 /*
  * This function accesses profiling information. The returned data is
  * binary: the sampling step and the actual contents of the profile
--
cgit

From 5097cbcb38e6e0d2627c9dde1985e91d2c9f880e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 11 Apr 2024 16:39:05 +0200
Subject: sched/isolation: Prevent boot crash when the boot CPU is nohz_full

Documentation/timers/no_hz.rst states that the "nohz_full=" mask must
not include the boot CPU, which is no longer true after:

  08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full").

However after:

  aae17ebb53cd ("workqueue: Avoid using isolated cpus' timers on queue_delayed_work")

the kernel will crash at boot time in this case; housekeeping_any_cpu()
returns an invalid CPU number until smp_init() brings the first
housekeeping CPU up.

Change housekeeping_any_cpu() to check the result of cpumask_any_and()
and return smp_processor_id() in this case.

This is just the simple and backportable workaround which fixes the
symptom, but smp_processor_id() at boot time should be safe at least for
type == HK_TYPE_TIMER; this more or less matches the
tick_do_timer_boot_cpu logic.

There is no worry about cpu_down(); tick_nohz_cpu_down() will not allow
to offline tick_do_timer_cpu (the 1st online housekeeping CPU).

Fixes: aae17ebb53cd ("workqueue: Avoid using isolated cpus' timers on queue_delayed_work")
Reported-by: Chris von Recklinghausen
Signed-off-by: Oleg Nesterov
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
Reviewed-by: Phil Auld
Acked-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20240411143905.GA19288@redhat.com
Closes: https://lore.kernel.org/all/20240402105847.GA24832@redhat.com/
---
 kernel/sched/isolation.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 373d42c707bc..2a262d3ecb3d 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -46,7 +46,16 @@ int housekeeping_any_cpu(enum hk_type type)
 			if (cpu < nr_cpu_ids)
 				return cpu;
 
-			return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+			cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+			if (likely(cpu < nr_cpu_ids))
+				return cpu;
+			/*
+			 * Unless we have another problem this can only happen
+			 * at boot time before start_secondary() brings the 1st
+			 * housekeeping CPU up.
+			 */
+			WARN_ON_ONCE(system_state == SYSTEM_RUNNING ||
+				     type != HK_TYPE_TIMER);
 		}
 	}
 	return smp_processor_id();
--
cgit

From 257bf89d84121280904800acd25cc2c444c717ae Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sat, 13 Apr 2024 16:17:46 +0200
Subject: sched/isolation: Fix boot crash when maxcpus < first housekeeping CPU

housekeeping_setup() checks cpumask_intersects(present, online) to
ensure that the kernel will have at least one housekeeping CPU after
smp_init(), but this doesn't work if the maxcpus= kernel parameter
limits the number of processors available after bootup.

For example, a kernel with "maxcpus=2 nohz_full=0-2" parameters crashes
at boot time on a virtual machine with 4 CPUs.

Change housekeeping_setup() to use cpumask_first_and() and check that
the returned CPU number is valid and less than setup_max_cpus.

Another corner case is "nohz_full=0" on a machine with a single CPU or
with the maxcpus=1 kernel argument. In this case non_housekeeping_mask
is empty and tick_nohz_full_setup() makes no sense. And indeed, the
kernel hits the WARN_ON(tick_nohz_full_running) in tick_sched_do_timer().

And how should the kernel interpret the "nohz_full=" parameter?
It should be silently ignored, but currently cpulist_parse() happily
returns the empty cpumask and this leads to the same problem. Change
housekeeping_setup() to check cpumask_empty(non_housekeeping_mask) and
do nothing in this case.

Signed-off-by: Oleg Nesterov
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
Reviewed-by: Phil Auld
Acked-by: Frederic Weisbecker
Link: https://lore.kernel.org/r/20240413141746.GA10008@redhat.com
---
 kernel/sched/isolation.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 2a262d3ecb3d..5891e715f00d 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -118,6 +118,7 @@ static void __init housekeeping_setup_type(enum hk_type type,
 static int __init housekeeping_setup(char *str, unsigned long flags)
 {
 	cpumask_var_t non_housekeeping_mask, housekeeping_staging;
+	unsigned int first_cpu;
 	int err = 0;
 
 	if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
@@ -138,7 +139,8 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
 	cpumask_andnot(housekeeping_staging,
 		       cpu_possible_mask, non_housekeeping_mask);
 
-	if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) {
+	first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging);
+	if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) {
 		__cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
 		__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
 		if (!housekeeping.flags) {
@@ -147,6 +149,9 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
 		}
 	}
 
+	if (cpumask_empty(non_housekeeping_mask))
+		goto free_housekeeping_staging;
+
 	if (!housekeeping.flags) {
 		/* First setup call ("nohz_full=" or "isolcpus=") */
 		enum hk_type type;
--
cgit

From 1dd1eff161bd55968d3d46bc36def62d71fb4785 Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Sat, 27 Apr 2024 18:28:08 +0800
Subject: softirq: Fix suspicious RCU usage in __do_softirq()

Currently, the condition "__this_cpu_read(ksoftirqd) == current" is used
to invoke rcu_softirq_qs() in ksoftirqd task context for non-RT kernels.

This works correctly as long as the context is actually task context but
this condition is wrong when:

  - the current task is ksoftirqd
  - the task is interrupted in an RCU read side critical section
  - __do_softirq() is invoked on return from interrupt

Syzkaller triggered the following scenario:

  -> finish_task_switch()
    -> put_task_struct_rcu_user()
      -> call_rcu(&task->rcu, delayed_put_task_struct)
        -> __kasan_record_aux_stack()
          -> pfn_valid()
            -> rcu_read_lock_sched()
                __irq_exit_rcu()
                -> __do_softirq()
                  -> if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
                         __this_cpu_read(ksoftirqd) == current)
                    -> rcu_softirq_qs()
                      -> RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map))

The rcu quiescent state is reported in the rcu-read critical section, so
the lockdep warning is triggered.

Fix this by splitting out the inner working of __do_softirq() into a
helper function which takes an argument to distinguish between ksoftirqd
task context and interrupted context, invoke it from the relevant call
sites with the proper context information, and use that for the
conditional invocation of rcu_softirq_qs().
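A standalone sketch of the refactoring pattern (names mirror the patch;
the bodies are illustrative): instead of guessing the context from
global state, the caller passes it explicitly.

  #include <stdio.h>
  #include <stdbool.h>

  static void handle_softirqs(bool ksirqd)
  {
          /* ... process pending softirqs ... */
          if (ksirqd)
                  printf("task context: safe to report an RCU quiescent state\n");
  }

  static void model__do_softirq(void)   /* interrupt-exit path */
  {
          handle_softirqs(false);
  }

  static void model_run_ksoftirqd(void) /* ksoftirqd task path */
  {
          handle_softirqs(true);
  }

  int main(void)
  {
          model__do_softirq();
          model_run_ksoftirqd();
          return 0;
  }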
Reported-by: syzbot+dce04ed6d1438ad69656@syzkaller.appspotmail.com
Suggested-by: Thomas Gleixner
Signed-off-by: Zqiang
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20240427102808.29356-1-qiang.zhang1211@gmail.com
Link: https://lore.kernel.org/lkml/8f281a10-b85a-4586-9586-5bbc12dc784f@paulmck-laptop/T/#mea8aba4abfcb97bbf499d169ce7f30c4cff1b0e3
---
 kernel/softirq.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index b315b21fb28c..02582017759a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -508,7 +508,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
 static inline void lockdep_softirq_end(bool in_hardirq) { }
 #endif
 
-asmlinkage __visible void __softirq_entry __do_softirq(void)
+static void handle_softirqs(bool ksirqd)
 {
 	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
 	unsigned long old_flags = current->flags;
@@ -563,8 +563,7 @@ restart:
 		pending >>= softirq_bit;
 	}
 
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
-	    __this_cpu_read(ksoftirqd) == current)
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && ksirqd)
 		rcu_softirq_qs();
 
 	local_irq_disable();
@@ -584,6 +583,11 @@ restart:
 	current_restore_flags(old_flags, PF_MEMALLOC);
 }
 
+asmlinkage __visible void __softirq_entry __do_softirq(void)
+{
+	handle_softirqs(false);
+}
+
 /**
  * irq_enter_rcu - Enter an interrupt context with RCU watching
  */
@@ -921,7 +925,7 @@ static void run_ksoftirqd(unsigned int cpu)
 		 * We can safely run softirq on inline stack, as we are not deep
 		 * in the task stack here.
 		 */
-		__do_softirq();
+		handle_softirqs(true);
 		ksoftirqd_run_end();
 		cond_resched();
 		return;
--
cgit

From dce3696271af7765f04428ec31b1b87dc7d016c6 Mon Sep 17 00:00:00 2001
From: LuMingYin
Date: Sat, 27 Apr 2024 08:23:47 +0100
Subject: tracing/probes: Fix memory leak in traceprobe_parse_probe_arg_body()

If traceprobe_parse_probe_arg_body() fails to allocate 'parg->fmt', it
jumps to the label 'out' instead of 'fail' by mistake. As a result, the
buffer 'tmp' is not freed in this case and leaks its memory.

Thus jump to the label 'fail' in that error case.

Link: https://lore.kernel.org/all/20240427072347.1421053-1-lumingyindetect@126.com/
Fixes: 032330abd08b ("tracing/probes: Cleanup probe argument parser")
Signed-off-by: LuMingYin
Acked-by: Masami Hiramatsu (Google)
Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/trace_probe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index dfe3ee6035ec..42bc0f362226 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1466,7 +1466,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 		parg->fmt = kmalloc(len, GFP_KERNEL);
 		if (!parg->fmt) {
 			ret = -ENOMEM;
-			goto out;
+			goto fail;
 		}
 		snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
 			 parg->count);
--
cgit

From 5af385f5f4cddf908f663974847a4083b2ff2c79 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Mon, 29 Apr 2024 15:47:51 +0100
Subject: bounds: Use the right number of bits for power-of-two CONFIG_NR_CPUS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

bits_per() rounds up to the next power of two when passed a power of
two. This causes crashes on some machines and configurations.
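A worked example (userspace re-implementations for illustration; the
kernel's versions live in include/linux/log2.h):

  #include <stdio.h>

  static int ilog2(unsigned long n)             /* floor(log2(n)) */
  {
          int r = -1;

          while (n) { r++; n >>= 1; }
          return r;
  }

  static int order_base_2(unsigned long n)      /* round-up log2 */
  {
          return n > 1 ? ilog2(n - 1) + 1 : 0;
  }

  static int bits_per(unsigned long n)          /* bits to represent the value n */
  {
          if (n < 2)
                  return 1;
          return (n & (n - 1)) ? order_base_2(n) : order_base_2(n) + 1;
  }

  int main(void)
  {
          /* With CONFIG_NR_CPUS=16, CPU ids 0..15 need only 4 bits: */
          printf("bits_per(16)     = %d\n", bits_per(16));     /* 5: one too many */
          printf("order_base_2(16) = %d\n", order_base_2(16)); /* 4: correct */
          return 0;
  }

With a power-of-two NR_CPUS, bits_per() yields one bit more than the
field sized by NR_CPUS_BITS needs, which is why the fix below switches
to order_base_2().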
Reported-by: Михаил Новоселов
Tested-by: Ильфат Гаптрахманов
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3347
Link: https://lore.kernel.org/all/1c978cf1-2934-4e66-e4b3-e81b04cb3571@rosalinux.ru/
Fixes: f2d5dcb48f7b ("bounds: support non-power-of-two CONFIG_NR_CPUS")
Cc:
Signed-off-by: Matthew Wilcox (Oracle)
Cc: Rik van Riel
Cc: Mel Gorman
Cc: Peter Zijlstra
Cc: Ingo Molnar
Cc: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/bounds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bounds.c b/kernel/bounds.c
index c5a9fcd2d622..29b2cd00df2c 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -19,7 +19,7 @@ int main(void)
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
 #ifdef CONFIG_SMP
-	DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS));
+	DEFINE(NR_CPUS_BITS, order_base_2(CONFIG_NR_CPUS));
 #endif
 	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
 #ifdef CONFIG_LRU_GEN
--
cgit

From 75961ffb5cb3e5196f19cae7683f35cc88b50800 Mon Sep 17 00:00:00 2001
From: Will Deacon
Date: Thu, 2 May 2024 10:37:23 +0100
Subject: swiotlb: initialise restricted pool list_head when SWIOTLB_DYNAMIC=y

Using restricted DMA pools (CONFIG_DMA_RESTRICTED_POOL=y) in conjunction
with dynamic SWIOTLB (CONFIG_SWIOTLB_DYNAMIC=y) leads to the following
crash when initialising the restricted pools at boot-time:

  | Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008
  | Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP
  | pc : rmem_swiotlb_device_init+0xfc/0x1ec
  | lr : rmem_swiotlb_device_init+0xf0/0x1ec
  | Call trace:
  |  rmem_swiotlb_device_init+0xfc/0x1ec
  |  of_reserved_mem_device_init_by_idx+0x18c/0x238
  |  of_dma_configure_id+0x31c/0x33c
  |  platform_dma_configure+0x34/0x80

faddr2line reveals that the crash is in the list validation code:

  include/linux/list.h:83
  include/linux/rculist.h:79
  include/linux/rculist.h:106
  kernel/dma/swiotlb.c:306
  kernel/dma/swiotlb.c:1695

because add_mem_pool() is trying to list_add_rcu() to a NULL
'mem->pools'.

Fix the crash by initialising the 'mem->pools' list_head in
rmem_swiotlb_device_init() before calling add_mem_pool().

Reported-by: Nikita Ioffe
Tested-by: Nikita Ioffe
Fixes: 1aaa736815eb ("swiotlb: allocate a new memory pool when existing pools are full")
Signed-off-by: Will Deacon
Signed-off-by: Christoph Hellwig
---
 kernel/dma/swiotlb.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index a5e0dfc44d24..0de66f0ff43a 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1798,6 +1798,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 	mem->for_alloc = true;
 #ifdef CONFIG_SWIOTLB_DYNAMIC
 	spin_lock_init(&mem->lock);
+	INIT_LIST_HEAD_RCU(&mem->pools);
 #endif
 	add_mem_pool(mem, pool);
--
cgit

From b63db58e2fa5d6963db9c45df88e60060f0ff35f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)"
Date: Thu, 2 May 2024 09:03:15 -0400
Subject: eventfs/tracing: Add callback for release of an eventfs_inode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Synthetic events create and destroy tracefs files when they are created
and removed. The tracing subsystem has its own file descriptor
representing the state of the events attached to the tracefs files.
There's a race between the eventfs files and this file descriptor of the
tracing system where the following can cause an issue:

With two scripts 'A' and 'B' doing:

  Script 'A':
    echo "hello int aaa" > /sys/kernel/tracing/synthetic_events
    while :
    do
      echo 0 > /sys/kernel/tracing/events/synthetic/hello/enable
    done

  Script 'B':
    echo > /sys/kernel/tracing/synthetic_events

Script 'A' creates a synthetic event "hello" and then just writes zero
into its enable file.

Script 'B' removes all synthetic events (including the newly created
"hello" event).

What happens is that the opening of the "enable" file has:

  {
	struct trace_event_file *file = inode->i_private;
	int ret;

	ret = tracing_check_open_get_tr(file->tr);
  [..]

But deleting the events frees the "file" descriptor, and a "use after
free" happens with the dereference at "file->tr".

The file descriptor does have a reference counter, but there needs to be
a way to decrement it from the eventfs when the eventfs_inode is removed
that represents this file descriptor.

Add an optional "release" callback to the eventfs_entry array structure,
that gets called when the eventfs file is about to be removed. This
allows the creation of the eventfs file to increment the tracing file
descriptor's ref counter. When the eventfs file is deleted, it can call
the release function that will call the put function for the tracing
file descriptor.

This will protect the tracing file from being freed while an eventfs
file that references it is being opened.

Link: https://lore.kernel.org/linux-trace-kernel/20240426073410.17154-1-Tze-nan.Wu@mediatek.com/
Link: https://lore.kernel.org/linux-trace-kernel/20240502090315.448cba46@gandalf.local.home
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Reported-by: Tze-nan wu
Tested-by: Tze-nan Wu (吳澤南)
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_events.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 52f75c36bbca..6ef29eba90ce 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2552,6 +2552,14 @@ static int event_callback(const char *name, umode_t *mode, void **data,
 	return 0;
 }
 
+/* The file is incremented on creation and freeing the enable file decrements it */
+static void event_release(const char *name, void *data)
+{
+	struct trace_event_file *file = data;
+
+	event_file_put(file);
+}
+
 static int
 event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 {
@@ -2566,6 +2574,7 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 		{
 			.name		= "enable",
 			.callback	= event_callback,
+			.release	= event_release,
 		},
 		{
 			.name		= "filter",
@@ -2634,6 +2643,9 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 		return ret;
 	}
 
+	/* Gets decremented on freeing of the "enable" file */
+	event_file_get(file);
+
 	return 0;
 }
--
cgit
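The pattern the fix above relies on, as a standalone userspace sketch
(names are illustrative): take a reference when the file that points at
the object is created, and drop it from the release callback, so the
object cannot be freed while the file still exists.

  #include <stdio.h>
  #include <stdlib.h>

  struct trace_file {
          int refcnt;
  };

  static void file_get(struct trace_file *f) { f->refcnt++; }

  static void file_put(struct trace_file *f)
  {
          if (--f->refcnt == 0) {
                  printf("freeing trace_file\n");
                  free(f);
          }
  }

  /* eventfs-style release hook, invoked when the VFS file goes away. */
  static void event_release(void *data)
  {
          file_put(data);
  }

  int main(void)
  {
          struct trace_file *f = calloc(1, sizeof(*f));

          file_get(f);          /* creation reference */
          file_get(f);          /* reference held by the eventfs file */
          file_put(f);          /* "synthetic event removed" drops its ref */
          event_release(f);     /* eventfs file deleted last: frees it */
          return 0;
  }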