From eb3182ef0405ff2f6668fd3e5ff9883f60ce8801 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 15 Oct 2025 13:18:28 +0800 Subject: perf/core: Fix system hang caused by cpu-clock usage cpu-clock usage by the async-profiler tool can trigger a system hang, which Octavia Togami bisected back to the following commit: 18dbcbfabfff ("perf: Fix the POLL_HUP delivery breakage") The root cause of the hang is that cpu-clock is a special type of SW event which relies on hrtimers. The __perf_event_overflow() callback is invoked from the hrtimer handler for cpu-clock events, and __perf_event_overflow() tries to call cpu_clock_event_stop() to stop the event, which calls hrtimer_cancel() to cancel the hrtimer. But that's a recursion into the hrtimer code from a hrtimer handler, which (unsurprisingly) deadlocks. To fix this bug, use hrtimer_try_to_cancel() instead, and set the PERF_HES_STOPPED flag, which causes perf_swevent_hrtimer() to stop the event once it sees the PERF_HES_STOPPED flag. [ mingo: Fixed the comments and improved the changelog. ] Closes: https://lore.kernel.org/all/CAHPNGSQpXEopYreir+uDDEbtXTBvBvi8c6fYXJvceqtgTPao3Q@mail.gmail.com/ Fixes: 18dbcbfabfff ("perf: Fix the POLL_HUP delivery breakage") Reported-by: Octavia Togami Suggested-by: Peter Zijlstra Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Octavia Togami Cc: stable@vger.kernel.org Link: https://github.com/lucko/spark/issues/530 Link: https://patch.msgid.link/20251015051828.12809-1-dapeng1.mi@linux.intel.com --- kernel/events/core.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 177e57c1a362..1fd347da9026 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11773,7 +11773,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event = container_of(hrtimer, struct perf_event, hw.hrtimer); - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE || + event->hw.state & PERF_HES_STOPPED) return HRTIMER_NORESTART; event->pmu->read(event); @@ -11819,15 +11820,20 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; /* - * The throttle can be triggered in the hrtimer handler. - * The HRTIMER_NORESTART should be used to stop the timer, - * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + * Careful: this function can be triggered in the hrtimer handler, + * for cpu-clock events, so hrtimer_cancel() would cause a + * deadlock. + * + * So use hrtimer_try_to_cancel() to try to stop the hrtimer, + * and the cpu-clock handler also sets the PERF_HES_STOPPED flag, + * which guarantees that perf_swevent_hrtimer() will stop the + * hrtimer once it sees the PERF_HES_STOPPED flag.
*/ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); - hrtimer_cancel(&hwc->hrtimer); + hrtimer_try_to_cancel(&hwc->hrtimer); } } @@ -11871,12 +11877,14 @@ static void cpu_clock_event_update(struct perf_event *event) static void cpu_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) cpu_clock_event_update(event); @@ -11950,12 +11958,14 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) task_clock_event_update(event, event->ctx->time); -- cgit From 56b3c85e153b84f27e6cff39623ba40a1ad299d3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 27 Oct 2025 10:50:21 -0700 Subject: ftrace: Fix BPF fexit with livepatch When livepatch is attached to the same function as bpf trampoline with a fexit program, bpf trampoline code calls register_ftrace_direct() twice. The first time will fail with -EAGAIN, and the second time it will succeed. This requires register_ftrace_direct() to unregister the address on the first attempt. Otherwise, the bpf trampoline cannot attach. Here is an easy way to reproduce this issue: insmod samples/livepatch/livepatch-sample.ko bpftrace -e 'fexit:cmdline_proc_show {}' ERROR: Unable to attach probe: fexit:vmlinux:cmdline_proc_show... Fix this by cleaning up the hash when register_ftrace_function_nolock hits errors. Also, move the code that resets ops->func and ops->trampoline to the error path of register_ftrace_direct(); and add a helper function reset_direct() in register_ftrace_direct() and unregister_ftrace_direct(). Fixes: d05cb470663a ("ftrace: Fix modification of direct_function hash while in use") Cc: stable@vger.kernel.org # v6.6+ Reported-by: Andrey Grodzovsky Closes: https://lore.kernel.org/live-patching/c5058315a39d4615b333e485893345be@crowdstrike.com/ Cc: Steven Rostedt (Google) Cc: Masami Hiramatsu (Google) Acked-and-tested-by: Andrey Grodzovsky Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20251027175023.1521602-2-song@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- kernel/bpf/trampoline.c | 5 ----- kernel/trace/ftrace.c | 20 ++++++++++++++------ 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5949095e51c3..f2cb0b097093 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -479,11 +479,6 @@ again: * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. 
*/ - /* reset fops->func and fops->trampoline for re-register */ - tr->fops->func = NULL; - tr->fops->trampoline = 0; - - /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 42bd2ba68a82..cbeb7e833131 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5953,6 +5953,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp) free_ftrace_hash(fhp); } +static void reset_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + remove_direct_functions_hash(hash, addr); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -6048,6 +6059,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) ops->direct_call = addr; err = register_ftrace_function_nolock(ops); + if (err) + reset_direct(ops, addr); out_unlock: mutex_unlock(&direct_mutex); @@ -6080,7 +6093,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, bool free_filters) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; if (check_direct_multi(ops)) @@ -6090,13 +6102,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, mutex_lock(&direct_mutex); err = unregister_ftrace_function(ops); - remove_direct_functions_hash(hash, addr); + reset_direct(ops, addr); mutex_unlock(&direct_mutex); - /* cleanup for possible another register call */ - ops->func = NULL; - ops->trampoline = 0; - if (free_filters) ftrace_free_filter(ops); return err; -- cgit From 3e9a18e1c3e931abecf501cbb23d28d69f85bb56 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 27 Oct 2025 10:50:22 -0700 Subject: ftrace: bpf: Fix IPMODIFY + DIRECT in modify_ftrace_direct() ftrace_hash_ipmodify_enable() checks IPMODIFY and DIRECT ftrace_ops on the same kernel function. When needed, ftrace_hash_ipmodify_enable() calls ops->ops_func() to prepare the direct ftrace (BPF trampoline) to share the same function as the IPMODIFY ftrace (livepatch). ftrace_hash_ipmodify_enable() is called in register_ftrace_direct() path, but not called in modify_ftrace_direct() path. As a result, the following operations will break livepatch: 1. Load livepatch to a kernel function; 2. Attach fentry program to the kernel function; 3. Attach fexit program to the kernel function. After 3, the kernel function being used will not be the livepatched version, but the original version. Fix this by adding __ftrace_hash_update_ipmodify() to __modify_ftrace_direct() and adjust some logic around the call. 
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20251027175023.1521602-3-song@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cbeb7e833131..59cfacb8a5bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1971,7 +1971,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, - struct ftrace_hash *new_hash) + struct ftrace_hash *new_hash, + bool update_target) { struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; @@ -2006,10 +2007,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (rec->flags & FTRACE_FL_DISABLED) continue; - /* We need to update only differences of filter_hash */ + /* + * Unless we are updating the target of a direct function, + * we only need to update differences of filter_hash + */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); - if (in_old == in_new) + if (!update_target && (in_old == in_new)) continue; if (in_new) { @@ -2020,7 +2024,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (is_ipmodify) goto rollback; - FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + /* + * If this is called by __modify_ftrace_direct() + * then it is only changing where the direct + * pointer is jumping to, and the record already + * points to a direct trampoline. If it isn't, + * then it is a bug to update ipmodify on a direct + * caller. + */ + FTRACE_WARN_ON(!update_target && + (rec->flags & FTRACE_FL_DIRECT)); /* * Another ops with IPMODIFY is already @@ -2076,7 +2089,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false); } /* Disabling always succeeds */ @@ -2087,7 +2100,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false); } static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, @@ -2101,7 +2114,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, if (ftrace_hash_empty(new_hash)) new_hash = NULL; - return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false); } static void print_ip_ins(const char *fmt, const unsigned char *p) @@ -6114,7 +6127,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash; + struct ftrace_hash *hash = ops->func_hash->filter_hash; struct ftrace_func_entry *entry, *iter; static struct ftrace_ops tmp_ops = { .func = ftrace_stub, @@ -6134,13 +6147,21 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (err) return err; + /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. 
+ */ + err = __ftrace_hash_update_ipmodify(ops, hash, hash, true); + if (err) + goto out; + /* * Now the ftrace_ops_list_func() is called to do the direct callers. * We can safely change the direct functions attached to each entry. */ mutex_lock(&ftrace_lock); - hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -6155,6 +6176,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) mutex_unlock(&ftrace_lock); +out: /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); -- cgit From ea0714d61dea6e00b853a0116d0afe2b2fe70ef3 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 4 Nov 2025 22:54:25 +0000 Subject: bpf:add _impl suffix for bpf_task_work_schedule* kfuncs Rename: bpf_task_work_schedule_resume()->bpf_task_work_schedule_resume_impl() bpf_task_work_schedule_signal()->bpf_task_work_schedule_signal_impl() This aligns task work scheduling kfuncs with the established naming scheme for kfuncs with the bpf_prog_aux argument provided by the verifier implicitly. This convention will be taken advantage of with the upcoming KF_IMPLICIT_ARGS feature to preserve backwards compatibility to BPF programs. Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20251104-implv2-v3-1-4772b9ae0e06@meta.com Signed-off-by: Alexei Starovoitov Acked-by: Ihor Solodrai --- kernel/bpf/helpers.c | 24 ++++++++++++++---------- kernel/bpf/verifier.c | 12 ++++++------ 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index eb25e70e0bdc..33173b027ccf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4169,7 +4169,8 @@ release_prog: } /** - * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4178,15 +4179,17 @@ release_prog: * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4195,9 +4198,10 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, - void 
*map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } @@ -4377,8 +4381,8 @@ BTF_ID_FLAGS(func, bpf_strnstr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff40e5e65c43..8314518c8d93 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12259,8 +12259,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, - KF_bpf_task_work_schedule_signal, - KF_bpf_task_work_schedule_resume, + KF_bpf_task_work_schedule_signal_impl, + KF_bpf_task_work_schedule_resume_impl, }; BTF_ID_LIST(special_kfunc_list) @@ -12331,13 +12331,13 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal) -BTF_ID(func, bpf_task_work_schedule_resume) +BTF_ID(func, bpf_task_work_schedule_signal_impl) +BTF_ID(func, bpf_task_work_schedule_resume_impl) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) -- cgit From 137cc92ffe2e71705fce112656a460d924934ebe Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 4 Nov 2025 22:54:26 +0000 Subject: bpf: add _impl suffix for bpf_stream_vprintk() kfunc Rename bpf_stream_vprintk() to bpf_stream_vprintk_impl(). This makes bpf_stream_vprintk() follow the already established "_impl" suffix-based naming convention for kfuncs with the bpf_prog_aux argument provided by the verifier implicitly. This convention will be taken advantage of with the upcoming KF_IMPLICIT_ARGS feature to preserve backwards compatibility to BPF programs. 
Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20251104-implv2-v3-2-4772b9ae0e06@meta.com Signed-off-by: Alexei Starovoitov Acked-by: Ihor Solodrai --- kernel/bpf/helpers.c | 2 +- kernel/bpf/stream.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 33173b027ccf..e4007fea4909 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4380,7 +4380,7 @@ BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index eb6c5a21c2ef..ff16c631951b 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -355,7 +355,8 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, void *aux__prog) { struct bpf_bprintf_data data = { .get_bin_args = true, -- cgit From 956dfda6a70885f18c0f8236a461aa2bc4f556ad Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 30 Oct 2025 11:27:55 +0800 Subject: sched/fair: Prevent cfs_rq from being unthrottled with zero runtime_remaining When a cfs_rq is to be throttled, its limbo list should be empty and that's why there is a warning in tg_throttle_down() for a non-empty cfs_rq->throttled_limbo_list. When running a test with the following hierarchy: root / \ A* ... / | \ ... B / \ C* where both A and C have quota settings, that warning on a non-empty limbo list is triggered for a cfs_rq of C, let's call it cfs_rq_c (and ignore the cpu part of the cfs_rq for the sake of simpler representation). Debug showed it happened like this: Task group C is created and quota is set, so in tg_set_cfs_bandwidth(), cfs_rq_c is initialized with runtime_enabled set, runtime_remaining equal to 0 and *unthrottled*. Before any tasks are enqueued to cfs_rq_c, *multiple* throttled tasks can migrate to cfs_rq_c (e.g., due to task group changes). When enqueue_task_fair(cfs_rq_c, throttled_task) is called and cfs_rq_c is in a throttled hierarchy (e.g., A is throttled), these throttled tasks are directly placed into cfs_rq_c's limbo list by enqueue_throttled_task(). Later, when A is unthrottled, tg_unthrottle_up(cfs_rq_c) enqueues these tasks. The first enqueue triggers check_enqueue_throttle(), and with zero runtime_remaining, cfs_rq_c can be throttled in throttle_cfs_rq() if it can't get more runtime and enters tg_throttle_down(), where the warning is hit due to remaining tasks in the limbo list. Triggering a throttle on the unthrottle path is chaotic: a cfs_rq that is being unthrottled can end up in a mixed state. Fix this by granting 1ns of runtime to the cfs_rq in tg_set_cfs_bandwidth(). This ensures cfs_rq_c has a positive runtime_remaining when initialized as unthrottled and cannot enter tg_unthrottle_up() with zero runtime_remaining.
Also, update outdated comments in tg_throttle_down() since unthrottle_cfs_rq() is no longer called with zero runtime_remaining. While at it, remove a redundant assignment to se in tg_throttle_down(). Fixes: e1fad12dcb66 ("sched/fair: Switch to task based throttle model") Reviewed-By: Benjamin Segall Suggested-by: Benjamin Segall Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Tested-by: Hao Jia Link: https://patch.msgid.link/20251030032755.560-1-ziqianlu@bytedance.com --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ebf67b48e2..f754a60de848 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9606,7 +9606,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; + cfs_rq->runtime_remaining = 1; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25970dbbb279..5b752324270b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6024,20 +6024,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; /* - * It's possible we are called with !runtime_remaining due to things - * like user changed quota setting(see tg_set_cfs_bandwidth()) or async - * unthrottled us with a positive runtime_remaining but other still - * running entities consumed those runtime before we reached here. + * It's possible we are called with runtime_remaining < 0 due to things + * like async unthrottled us with a positive runtime_remaining but other + * still running entities consumed those runtime before we reached here. * - * Anyway, we can't unthrottle this cfs_rq without any runtime remaining - * because any enqueue in tg_unthrottle_up() will immediately trigger a - * throttle, which is not supposed to happen on unthrottle path. + * We can't unthrottle this cfs_rq without any runtime remaining because + * any enqueue in tg_unthrottle_up() will immediately trigger a throttle, + * which is not supposed to happen on unthrottle path. */ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) return; - se = cfs_rq->tg->se[cpu_of(rq)]; - cfs_rq->throttled = 0; update_rq_clock(rq); -- cgit From 4cb5ac2626b5704ed712ac1d46b9d89fdfc12c5d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Jul 2025 16:29:46 +0200 Subject: futex: Optimize per-cpu reference counting Shrikanth noted that the per-cpu reference counter was still some 10% slower than the old immutable option (which removes the reference counting entirely). Further optimize the per-cpu reference counter by: - switching from RCU to preempt; - using __this_cpu_*() since we now have preempt disabled; - switching from smp_load_acquire() to READ_ONCE(). This is all safe because disabling preemption inhibits the RCU grace period exactly like rcu_read_lock(). Having preemption disabled allows using __this_cpu_*() provided the only access to the variable is in task context -- which is the case here. Furthermore, since we know changing fph->state to FR_ATOMIC demands a full RCU grace period we can rely on the implied smp_mb() from that to replace the acquire barrier(). This is very similar to the percpu_down_read_internal() fast-path. 
The reason this is significant for PowerPC is that it uses the generic this_cpu_*() implementation which relies on local_irq_disable() (the x86 implementation relies on it being a single memop instruction to be IRQ-safe). Switching to preempt_disable() and __this_cpu_*() avoids this IRQ state swizzling. Also, PowerPC needs LWSYNC for the ACQUIRE barrier, so not having to use explicit barriers saves a bunch. Combined, this reduces the performance gap by half, down to some 5%. Fixes: 760e6f7befba ("futex: Remove support for IMMUTABLE") Reported-by: Shrikanth Hegde Tested-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20251106092929.GR4067720@noisy.programming.kicks-ass.net --- kernel/futex/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 125804fbb5cb..2e77a6e5c865 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1680,10 +1680,10 @@ static bool futex_ref_get(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; - guard(rcu)(); + guard(preempt)(); - if (smp_load_acquire(&fph->state) == FR_PERCPU) { - this_cpu_inc(*mm->futex_ref); + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_inc(*mm->futex_ref); return true; } @@ -1694,10 +1694,10 @@ static bool futex_ref_put(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; - guard(rcu)(); + guard(preempt)(); - if (smp_load_acquire(&fph->state) == FR_PERCPU) { - this_cpu_dec(*mm->futex_ref); + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_dec(*mm->futex_ref); return false; } -- cgit From 10d9dda426d684e98b17161f02f77894c6de9b60 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 7 Nov 2025 01:52:15 +0900 Subject: tracing: tprobe-events: Fix to register tracepoint correctly Since __tracepoint_user_init() calls tracepoint_user_register() without initializing tuser->tpoint with the given tracepoint, it does not register the tracepoint stub function as a callback correctly, and tprobe does not work. Initialize tuser->tpoint correctly before tracepoint_user_register() so that it sets up the tracepoint callback. I confirmed the example below works fine again. echo "t sched_switch preempt prev_pid=prev->pid next_pid=next->pid" > /sys/kernel/tracing/dynamic_events echo 1 > /sys/kernel/tracing/events/tracepoints/sched_switch/enable cat /sys/kernel/tracing/trace_pipe Link: https://lore.kernel.org/all/176244793514.155515.6466348656998627773.stgit@devnote2/ Fixes: 2867495dea86 ("tracing: tprobe-events: Register tracepoint when enable tprobe event") Reported-by: Beau Belgrave Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Tested-by: Beau Belgrave Reviewed-by: Beau Belgrave --- kernel/trace/trace_fprobe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index ad9d6347b5fa..fd1b108ab639 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -106,13 +106,14 @@ static struct tracepoint_user *__tracepoint_user_init(const char *name, struct t if (!tuser->name) return NULL; + /* Register tracepoint if it is loaded.
*/ if (tpoint) { + tuser->tpoint = tpoint; ret = tracepoint_user_register(tuser); if (ret) return ERR_PTR(ret); } - tuser->tpoint = tpoint; tuser->refcount = 1; INIT_LIST_HEAD(&tuser->list); list_add(&tuser->list, &tracepoint_user_list); -- cgit From c91afa7610235f89a5e8f5686aac23892ab227ed Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 7 Nov 2025 01:52:24 +0900 Subject: tracing: tprobe-events: Fix to put tracepoint_user when disable the tprobe __unregister_trace_fprobe() checks tf->tuser to put it when removing tprobe. However, disable_trace_fprobe() does not use it and only calls unregister_fprobe(). Thus it forgets to disable tracepoint_user. If the trace_fprobe has tuser, put it for unregistering the tracepoint callbacks when disabling tprobe correctly. Link: https://lore.kernel.org/all/176244794466.155515.3971904050506100243.stgit@devnote2/ Fixes: 2867495dea86 ("tracing: tprobe-events: Register tracepoint when enable tprobe event") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Tested-by: Beau Belgrave Reviewed-by: Beau Belgrave --- kernel/trace/trace_fprobe.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index fd1b108ab639..8001dbf16891 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1514,6 +1514,10 @@ static int disable_trace_fprobe(struct trace_event_call *call, if (!trace_probe_is_enabled(tp)) { list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { unregister_fprobe(&tf->fp); + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; + } } } -- cgit From aa997d2d2a0b2e76f4df0f1f12829f02acb4fb6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Oct 2025 13:28:48 -0400 Subject: ring-buffer: Do not warn in ring_buffer_map_get_reader() when reader catches up The function ring_buffer_map_get_reader() is a bit more strict than the other get reader functions, and except for certain situations the rb_get_reader_page() should not return NULL. If it does, it triggers a warning. This warning was triggering but after looking at why, it was because another acceptable situation was happening and it wasn't checked for. If the reader catches up to the writer and there's still data to be read on the reader page, then the rb_get_reader_page() will return NULL as there's no new page to get. In this situation, the reader page should not be updated and no warning should trigger. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Reported-by: syzbot+92a3745cea5ec6360309@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/690babec.050a0220.baf87.0064.GAE@google.com/ Link: https://lore.kernel.org/20251016132848.1b11bb37@gandalf.local.home Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1244d2c5c384..afcd3747264d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7344,6 +7344,10 @@ consume: goto out; } + /* Did the reader catch up with the writer? 
*/ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + goto out; + reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; -- cgit From 80f0d631dcc76ee1b7755bfca1d8417d91d71414 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Thu, 6 Nov 2025 12:01:32 +0000 Subject: tracing: Fix memory leaks in create_field_var() The function create_field_var() allocates memory for 'val' through create_hist_field() inside parse_atom(), and for 'var' through create_var(), which in turn allocates var->type and var->var.name internally. Simply calling kfree() to release these structures will result in memory leaks. Use destroy_hist_field() to properly free 'val', and explicitly release the memory of var->type and var->var.name before freeing 'var' itself. Link: https://patch.msgid.link/20251106120132.3639920-1-zilin@seu.edu.cn Fixes: 02205a6752f22 ("tracing: Add support for 'field variables'") Signed-off-by: Zilin Guan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1d536219b624..6bfaf1210dd2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3272,14 +3272,16 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); - kfree(val); + destroy_hist_field(val, 0); ret = PTR_ERR(var); goto err; } field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); if (!field_var) { - kfree(val); + destroy_hist_field(val, 0); + kfree_const(var->type); + kfree(var->var.name); kfree(var); ret = -ENOMEM; goto err; -- cgit From 62b9ca1706e1bbb60d945a58de7c7b5826f6b2a2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 5 Nov 2025 22:51:05 -0600 Subject: PM: hibernate: Emit an error when image writing fails If image writing fails, a return code is passed up to the caller, but none of the callers log anything to the log and so the only record of it is the return code that userspace gets. Adjust the logging so that the image size and speed of writing is only emitted on success and if there is an error, it's saved to the logs. Fixes: a06c6f5d3cc9 ("PM: hibernate: Move to crypto APIs for LZO compression") Reported-by: Askar Safin Closes: https://lore.kernel.org/linux-pm/20251105180506.137448-1-safinaskar@gmail.com/ Signed-off-by: Mario Limonciello (AMD) Tested-by: Askar Safin Cc: 6.9+ # 6.9+ [ rjw: Added missing braces after "else", changelog edits ] Link: https://patch.msgid.link/20251106045158.3198061-2-superm1@kernel.org Signed-off-by: Rafael J. 
Wysocki --- kernel/power/swap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 0beff7eeaaba..7daa716d2cb1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -877,11 +877,14 @@ out_finish: stop = ktime_get(); if (!ret) ret = err2; - if (!ret) + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %d kbytes\n", + (atomic_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); - swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + } else { + pr_err("Image saving failed: %d\n", ret); + } out_clean: hib_finish_batch(&hb); -- cgit From 66ededc694f1d06a71ca35a3c8e3689e9b85b3ce Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 5 Nov 2025 22:51:06 -0600 Subject: PM: hibernate: Use atomic64_t for compressed_size variable `compressed_size` can overflow, showing nonsensical values. Change from `atomic_t` to `atomic64_t` to prevent overflow. Fixes: a06c6f5d3cc9 ("PM: hibernate: Move to crypto APIs for LZO compression") Reported-by: Askar Safin Closes: https://lore.kernel.org/linux-pm/20251105180506.137448-1-safinaskar@gmail.com/ Signed-off-by: Mario Limonciello (AMD) Tested-by: Askar Safin Cc: 6.9+ # 6.9+ Link: https://patch.msgid.link/20251106045158.3198061-3-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7daa716d2cb1..e0441483dbee 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -635,7 +635,7 @@ struct cmp_data { }; /* Indicates the image size after compression */ -static atomic_t compressed_size = ATOMIC_INIT(0); +static atomic64_t compressed_size = ATOMIC_INIT(0); /* * Compression function that runs in its own thread. @@ -664,7 +664,7 @@ static int compress_threadfn(void *data) d->ret = crypto_acomp_compress(d->cr); d->cmp_len = d->cr->dlen; - atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); + atomic64_add(d->cmp_len, &compressed_size); atomic_set_release(&d->stop, 1); wake_up(&d->done); } @@ -696,7 +696,7 @@ static int save_compressed_image(struct swap_map_handle *handle, hib_init_batch(&hb); - atomic_set(&compressed_size, 0); + atomic64_set(&compressed_size, 0); /* * We'll limit the number of threads for compression to limit memory @@ -879,8 +879,8 @@ out_finish: ret = err2; if (!ret) { swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); } else { pr_err("Image saving failed: %d\n", ret); -- cgit From 0b6c10cb8479d0d1b7b208277df2e2afe082d4bd Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 5 Nov 2025 22:51:07 -0600 Subject: PM: hibernate: Fix style issues in save_compressed_image() Address two issues indicated by checkpatch: - Trailing statements should be on next line. - Prefer 'unsigned int' to bare use of 'unsigned'. Signed-off-by: Mario Limonciello (AMD) [ rjw: Changelog edits ] Link: https://patch.msgid.link/20251106045158.3198061-4-superm1@kernel.org Signed-off-by: Rafael J. 
Wysocki --- kernel/power/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e0441483dbee..70ae21f7370d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -689,7 +689,7 @@ static int save_compressed_image(struct swap_map_handle *handle, ktime_t start; ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; @@ -902,7 +902,8 @@ out_clean: } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } -- cgit From e38f65d317df1fd2dcafe614d9c537475ecf9992 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 20 Oct 2025 20:08:50 -0400 Subject: kho: warn and fail on metadata or preserved memory in scratch area Patch series "KHO: kfence + KHO memory corruption fix", v3. This series fixes a memory corruption bug in KHO that occurs when KFENCE is enabled. The root cause is that KHO metadata, allocated via kzalloc(), can be randomly serviced by kfence_alloc(). When a kernel boots via KHO, the early memblock allocator is restricted to a "scratch area". This forces the KFENCE pool to be allocated within this scratch area, creating a conflict. If KHO metadata is subsequently placed in this pool, it gets corrupted during the next kexec operation. Google is using KHO and have had obscure crashes due to this memory corruption, with stacks all over the place. I would prefer this fix to be properly backported to stable so we can also automatically consume it once we switch to the upstream KHO. Patch 1/3 introduces a debug-only feature (CONFIG_KEXEC_HANDOVER_DEBUG) that adds checks to detect and fail any operation that attempts to place KHO metadata or preserved memory within the scratch area. This serves as a validation and diagnostic tool to confirm the problem without affecting production builds. Patch 2/3 Increases bitmap to PAGE_SIZE, so buddy allocator can be used. Patch 3/3 Provides the fix by modifying KHO to allocate its metadata directly from the buddy allocator instead of slab. This bypasses the KFENCE interception entirely. This patch (of 3): It is invalid for KHO metadata or preserved memory regions to be located within the KHO scratch area, as this area is overwritten when the next kernel is loaded, and used early in boot by the next kernel. This can lead to memory corruption. Add checks to kho_preserve_* and KHO's internal metadata allocators (xa_load_or_alloc, new_chunk) to verify that the physical address of the memory does not overlap with any defined scratch region. If an overlap is detected, the operation will fail and a WARN_ON is triggered. To avoid performance overhead in production kernels, these checks are enabled only when CONFIG_KEXEC_HANDOVER_DEBUG is selected. 
[rppt@kernel.org: fix KEXEC_HANDOVER_DEBUG Kconfig dependency] Link: https://lkml.kernel.org/r/aQHUyyFtiNZhx8jo@kernel.org [pasha.tatashin@soleen.com: build fix] Link: https://lkml.kernel.org/r/CA+CK2bBnorfsTymKtv4rKvqGBHs=y=MjEMMRg_tE-RME6n-zUw@mail.gmail.com Link: https://lkml.kernel.org/r/20251021000852.2924827-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20251021000852.2924827-2-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Signed-off-by: Mike Rapoport Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Christian Brauner Cc: David Matlack Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Samiullah Khawaja Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 9 +++++++ kernel/Makefile | 1 + kernel/kexec_handover.c | 57 ++++++++++++++++++++++++++-------------- kernel/kexec_handover_debug.c | 25 ++++++++++++++++++ kernel/kexec_handover_internal.h | 20 ++++++++++++++ 5 files changed, 93 insertions(+), 19 deletions(-) create mode 100644 kernel/kexec_handover_debug.c create mode 100644 kernel/kexec_handover_internal.h (limited to 'kernel') diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 422270d64820..54e581072617 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -109,6 +109,15 @@ config KEXEC_HANDOVER to keep data or state alive across the kexec. For this to work, both source and target kernels need to have this option enabled. +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index df3dd8291bb6..9fe722305c9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76f0940fb485..0bc9001e532a 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "KHO: " fmt +#include #include #include #include @@ -22,6 +23,7 @@ #include +#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. 
@@ -133,26 +135,26 @@ static struct kho_out kho_out = { static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) { - void *elm, *res; + void *res = xa_load(xa, index); - elm = xa_load(xa, index); - if (elm) - return elm; + if (res) + return res; + + void *elm __free(kfree) = kzalloc(sz, GFP_KERNEL); - elm = kzalloc(sz, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), sz))) + return ERR_PTR(-EINVAL); + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); if (xa_is_err(res)) - res = ERR_PTR(xa_err(res)); - - if (res) { - kfree(elm); + return ERR_PTR(xa_err(res)); + else if (res) return res; - } - return elm; + return no_free_ptr(elm); } static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, @@ -345,15 +347,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk; + struct khoser_mem_chunk *chunk __free(kfree) = NULL; chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!chunk) - return NULL; + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + chunk->hdr.order = order; if (cur_chunk) KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return chunk; + return no_free_ptr(chunk); } static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) @@ -374,14 +380,17 @@ static int kho_mem_serialize(struct kho_serialization *ser) struct khoser_mem_chunk *chunk = NULL; struct kho_mem_phys *physxa; unsigned long order; + int err = -ENOMEM; xa_for_each(&ser->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } if (!first_chunk) first_chunk = chunk; @@ -391,8 +400,10 @@ static int kho_mem_serialize(struct kho_serialization *ser) if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } } elm = &chunk->bitmaps[chunk->hdr.num_elms]; @@ -409,7 +420,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) err_free: kho_mem_ser_free(first_chunk); - return -ENOMEM; + return err; } static void __init deserialize_bitmap(unsigned int order, @@ -465,8 +476,8 @@ static void __init kho_mem_deserialize(const void *fdt) * area for early allocations that happen before page allocator is * initialized. 
*/ -static struct kho_scratch *kho_scratch; -static unsigned int kho_scratch_cnt; +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt; /* * The scratch areas are scaled by default as percent of memory allocated from @@ -752,6 +763,9 @@ int kho_preserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.ser.track; + if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) + return -EINVAL; + return __kho_preserve_order(track, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -775,6 +789,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) unsigned long failed_pfn = 0; int err = 0; + if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT))) { + return -EINVAL; + } + while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); diff --git a/kernel/kexec_handover_debug.c b/kernel/kexec_handover_debug.c new file mode 100644 index 000000000000..6efb696f5426 --- /dev/null +++ b/kernel/kexec_handover_debug.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debug.c - kexec handover optional debug functionality + * Copyright (C) 2025 Google LLC, Pasha Tatashin + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include "kexec_handover_internal.h" + +bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + phys_addr_t scratch_start, scratch_end; + unsigned int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + scratch_start = kho_scratch[i].addr; + scratch_end = kho_scratch[i].addr + kho_scratch[i].size; + + if (phys < scratch_end && (phys + size) > scratch_start) + return true; + } + + return false; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h new file mode 100644 index 000000000000..3c3c7148ceed --- /dev/null +++ b/kernel/kexec_handover_internal.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include +#include + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ -- cgit From a2fff99f92dae9c0eaf0d75de3def70ec68dad92 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 20 Oct 2025 20:08:51 -0400 Subject: kho: increase metadata bitmap size to PAGE_SIZE KHO memory preservation metadata is preserved in 512 byte chunks which requires their allocation from slab allocator. Slabs are not safe to be used with KHO because of kfence, and because partial slabs may lead leaks to the next kernel. Change the size to be PAGE_SIZE. The kfence specifically may cause memory corruption, where it randomly provides slab objects that can be within the scratch area. The reason for that is that kfence allocates its objects prior to KHO scratch is marked as CMA region. While this change could potentially increase metadata overhead on systems with sparsely preserved memory, this is being mitigated by ongoing work to reduce sparseness during preservation via 1G guest pages. Furthermore, this change aligns with future work on a stateless KHO, which will also use page-sized bitmaps for its radix tree metadata. 
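For reference, the coverage figures quoted in the updated comment in the diff below work out as follows (assuming PAGE_SIZE = 4096):

  bits per bitmap              = PAGE_SIZE * 8    = 32768
  order-0 coverage per bitmap  = 32768 * 4 KiB    = 128 MiB
  order-0 bitmaps for 16 GiB   = 16 GiB / 128 MiB = 128, i.e. 128 * 4 KiB = 512 KiB of bitmap memory
  1G-order coverage per bitmap = 32768 * 1 GiB    = 32 TiB, so an 8 TB system fits in a single bitmap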
Link: https://lkml.kernel.org/r/20251021000852.2924827-3-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Christian Brauner Cc: David Matlack Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Samiullah Khawaja Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 0bc9001e532a..9217d2fdd2d3 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -69,10 +69,10 @@ early_param("kho", kho_parse_enable); * Keep track of memory that is to be preserved across KHO. * * The serializing side uses two levels of xarrays to manage chunks of per-order - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most - * 512K of bitmap memory will be needed for order 0. + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of + * memory at most 512K of bitmap memory will be needed for order 0. * * This approach is fully incremental, as the serialization progresses folios * can continue be aggregated to the tracker. The final step, immediately prior @@ -80,12 +80,14 @@ early_param("kho", kho_parse_enable); * successor kernel to parse. */ -#define PRESERVE_BITS (512 * 8) +#define PRESERVE_BITS (PAGE_SIZE * 8) struct kho_mem_phys_bits { DECLARE_BITMAP(preserve, PRESERVE_BITS); }; +static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); + struct kho_mem_phys { /* * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized @@ -133,19 +135,19 @@ static struct kho_out kho_out = { .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) { void *res = xa_load(xa, index); if (res) return res; - void *elm __free(kfree) = kzalloc(sz, GFP_KERNEL); + void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); - if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), sz))) + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) return ERR_PTR(-EINVAL); res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); @@ -218,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } } - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, - sizeof(*bits)); + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); if (IS_ERR(bits)) return PTR_ERR(bits); -- cgit From fa759cd75bce5489eed34596daa53f721849a86f Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 20 Oct 2025 20:08:52 -0400 Subject: kho: allocate metadata directly from the buddy allocator KHO allocates metadata for its preserved memory map using the slab allocator via kzalloc(). This metadata is temporary and is used by the next kernel during early boot to find preserved memory. A problem arises when KFENCE is enabled. 
kzalloc() calls can be randomly intercepted by kfence_alloc(), which services the allocation from a dedicated KFENCE memory pool. This pool is allocated early in boot via memblock. When booting via KHO, the memblock allocator is restricted to a "scratch area", forcing the KFENCE pool to be allocated within it. This creates a conflict, as the scratch area is expected to be ephemeral and overwriteable by a subsequent kexec. If KHO metadata is placed in this KFENCE pool, it leads to memory corruption when the next kernel is loaded. To fix this, modify KHO to allocate its metadata directly from the buddy allocator instead of slab. Link: https://lkml.kernel.org/r/20251021000852.2924827-4-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: David Matlack Cc: Alexander Graf Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Samiullah Khawaja Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 9217d2fdd2d3..2a8c20c238a8 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -142,7 +142,7 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) if (res) return res; - void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL); + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); @@ -348,9 +348,9 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk __free(kfree) = NULL; + struct khoser_mem_chunk *chunk __free(free_page) = NULL; - chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + chunk = (void *)get_zeroed_page(GFP_KERNEL); if (!chunk) return ERR_PTR(-ENOMEM); -- cgit From ec4d11fc4b2dd4a2fa8c9d801ee9753b74623554 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 28 Oct 2025 12:51:25 +0100 Subject: gcov: add support for GCC 15 Using gcov on kernels compiled with GCC 15 results in truncated 16-byte long .gcda files with no usable data. To fix this, update GCOV_COUNTERS to match the value defined by GCC 15. Tested with GCC 14.3.0 and GCC 15.2.0. Link: https://lkml.kernel.org/r/20251028115125.1319410-1-oberpar@linux.ibm.com Signed-off-by: Peter Oberparleiter Reported-by: Matthieu Baerts Closes: https://github.com/linux-test-project/lcov/issues/445 Tested-by: Matthieu Baerts Cc: Signed-off-by: Andrew Morton --- kernel/gcov/gcc_4_7.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index a08cc076f332..ffde93d051a4 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include #include "gcov.h" -#if (__GNUC__ >= 14) +#if (__GNUC__ >= 15) +#define GCOV_COUNTERS 10 +#elif (__GNUC__ >= 14) #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 -- cgit From 0b07092d09e54e49b85379a9c60f82d54a881514 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 3 Nov 2025 12:01:57 +0100 Subject: kho: fix out-of-bounds access of vmalloc chunk The list of pages in a vmalloc chunk is NULL-terminated. 
So when looping through the pages in a vmalloc chunk, both kho_restore_vmalloc() and kho_vmalloc_unpreserve_chunk() rightly make sure to stop when encountering a NULL page. But when the chunk is full, the loops do not stop and go past the bounds of chunk->phys, resulting in out-of-bounds memory access, and possibly the restoration or unpreservation of an invalid page. Fix this by making sure the processing of the chunk stops at the end of the array. Link: https://lkml.kernel.org/r/20251103110159.8399-1-pratyush@kernel.org Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations") Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 2a8c20c238a8..36fdce2667c5 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -889,7 +889,7 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) __kho_unpreserve(track, pfn, pfn + 1); - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); __kho_unpreserve(track, pfn, pfn + 1); } @@ -1012,7 +1012,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) while (chunk) { struct page *page; - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { phys_addr_t phys = chunk->phys[i]; if (idx + contig_pages > total_pages) -- cgit From 7ecd2e439d1272ac02d798b0033a426e3b00dff5 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 3 Nov 2025 19:02:31 +0100 Subject: kho: fix unpreservation of higher-order vmalloc preservations kho_vmalloc_unpreserve_chunk() calls __kho_unpreserve() with end_pfn as pfn + 1. This happens to work for 0-order pages, but leaks higher-order pages. For example, say order 2 pages back the allocation. During preservation, they get preserved in the order 2 bitmaps, but kho_vmalloc_unpreserve_chunk() would try to unpreserve them from the order 0 bitmaps, which should not have these bits set anyway, leaving the order 2 bitmaps untouched. This results in the pages being carried over to the next kernel. Nothing will free those pages in the next boot, leaking them. Fix this by taking the order into account when calculating the end PFN for __kho_unpreserve().
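For reference, a minimal, self-contained sketch of the range arithmetic behind this fix (illustrative user-space C, not the kernel code; unpreserve_end_pfn() is a made-up helper): an order-N page spans (1 << N) PFNs, so the half-open range passed to __kho_unpreserve() must end at pfn + (1 << order) rather than pfn + 1.

#include <stdio.h>

/*
 * Illustrative only: an order-N page covers (1 << N) PFNs, so the half-open
 * unpreserve range must end at pfn + (1UL << order). Using pfn + 1, as the
 * buggy code did, leaves (1 << order) - 1 PFNs still marked as preserved.
 */
static unsigned long unpreserve_end_pfn(unsigned long pfn, unsigned short order)
{
	return pfn + (1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x1000;	/* hypothetical starting PFN */
	unsigned short order = 2;	/* order-2 page: 4 PFNs, 16 KiB with 4 KiB pages */

	printf("unpreserve range: [%#lx, %#lx)\n", pfn, unpreserve_end_pfn(pfn, order));
	printf("pfn + 1 covers 1 PFN instead of %lu\n", 1UL << order);
	return 0;
}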
Link: https://lkml.kernel.org/r/20251103180235.71409-2-pratyush@kernel.org Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations") Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 36fdce2667c5..e0bafe7c0ded 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -882,7 +882,8 @@ err_free: return NULL; } -static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, + unsigned short order) { struct kho_mem_track *track = &kho_out.ser.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); @@ -891,7 +892,7 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(track, pfn, pfn + (1 << order)); } } @@ -902,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) while (chunk) { struct kho_vmalloc_chunk *tmp = chunk; - kho_vmalloc_unpreserve_chunk(chunk); + kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); free_page((unsigned long)tmp); -- cgit From b05addf6f0596edb1f82ab4059438c7ef2d2686d Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 3 Nov 2025 19:02:32 +0100 Subject: kho: warn and exit when unpreserved page wasn't preserved Calling __kho_unpreserve() on a pair of (pfn, end_pfn) that wasn't preserved is a bug. Currently, if that is done, the physxa or bits can be NULL. This results in a soft lockup since a NULL physxa or bits results in redoing the loop without ever making any progress. Return when physxa or bits are not found, but WARN first to loudly indicate invalid behaviour. Link: https://lkml.kernel.org/r/20251103180235.71409-3-pratyush@kernel.org Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Pasha Tatashin Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index e0bafe7c0ded..03d12e27189f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -171,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; physxa = xa_load(&track->orders, order); - if (!physxa) - continue; + if (WARN_ON_ONCE(!physxa)) + return; bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (!bits) - continue; + if (WARN_ON_ONCE(!bits)) + return; clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); -- cgit From e0fd4d42e27f761e9cc82801b3f183e658dc749d Mon Sep 17 00:00:00 2001 From: Eslam Khafagy Date: Fri, 14 Nov 2025 14:27:39 +0200 Subject: posix-timers: Plug potential memory leak in do_timer_create() When posix timer creation is set to allocate a given timer ID and the access to the user space value faults, the function terminates without freeing the already allocated posix timer structure. Move the allocation after the user space access to cure that. 
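A simplified user-space model of the reordering (hypothetical names; simulate_copy_from_user() stands in for the copy_from_user() of the CRIU-requested timer ID, malloc() for alloc_posix_timer()): performing the user-space access before the allocation means a faulting copy can no longer leak the timer object.

#include <stdio.h>
#include <stdlib.h>

struct timer_obj { int id; };

/* Hypothetical stand-in: may "fault" like copy_from_user(). */
static int simulate_copy_from_user(int *id, int fault)
{
	if (fault)
		return -14;	/* -EFAULT */
	*id = 42;
	return 0;
}

/*
 * Fixed ordering: read the user-supplied ID first, allocate last, so every
 * early-error path runs before any allocation exists and nothing can leak.
 */
static int create_timer(int fault)
{
	struct timer_obj *t;
	int id, err;

	err = simulate_copy_from_user(&id, fault);
	if (err)
		return err;	/* nothing allocated yet, nothing to leak */

	t = malloc(sizeof(*t));
	if (!t)
		return -12;	/* -ENOMEM */
	t->id = id;
	free(t);		/* placeholder for the rest of timer setup */
	return 0;
}

int main(void)
{
	printf("faulting copy: %d\n", create_timer(1));	/* -14, no leak */
	printf("good copy:     %d\n", create_timer(0));	/* 0 */
	return 0;
}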
[ tglx: Massaged change log ] Fixes: ec2d0c04624b3 ("posix-timers: Provide a mechanism to allocate a given timer ID") Reported-by: syzbot+9c47ad18f978d4394986@syzkaller.appspotmail.com Suggested-by: Cyrill Gorcunov Signed-off-by: Eslam Khafagy Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://patch.msgid.link/20251114122739.994326-1-eslam.medhat1993@gmail.com Closes: https://lore.kernel.org/all/69155df4.a70a0220.3124cb.0017.GAE@google.com/T/ --- kernel/time/posix-timers.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index aa3120104a51..56e17b625c72 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -475,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - /* Special case for CRIU to restore timers with a given timer ID. */ if (unlikely(current->signal->timer_create_restore_ids)) { if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) @@ -490,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EINVAL; } + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. -- cgit From b0c8e6d3d866b6a7f73877f71968dbffd27b7785 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 13 Nov 2025 18:57:29 -0800 Subject: bpf: account for current allocated stack depth in widen_imprecise_scalars() The usage pattern for widen_imprecise_scalars() looks as follows: prev_st = find_prev_entry(env, ...); queued_st = push_stack(...); widen_imprecise_scalars(env, prev_st, queued_st); Where prev_st is an ancestor of the queued_st in the explored states tree. This ancestor is not guaranteed to have the same allocated stack depth as queued_st. E.g. in the following case: def main(): for i in 1..2: foo(i) // same callsite, different param def foo(i): if i == 1: use 128 bytes of stack iterator based loop Here, for the second 'foo' call prev_st->allocated_stack is 128, while queued_st->allocated_stack is much smaller. widen_imprecise_scalars() needs to take this into account and avoid accessing bpf_verifier_state->frame[*]->stack out of bounds.
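A minimal sketch of the bound the fix introduces (illustrative values, not verifier code): the widening loop may only visit stack slots that exist in both the ancestor state and the queued state.

#include <stdio.h>

#define BPF_REG_SIZE 8

/*
 * Illustrative only: two states reached through the same callsite can have
 * different allocated stack depths, so widening must stay within both.
 */
static int widenable_slots(int old_allocated_stack, int cur_allocated_stack)
{
	int old_slots = old_allocated_stack / BPF_REG_SIZE;
	int cur_slots = cur_allocated_stack / BPF_REG_SIZE;

	return old_slots < cur_slots ? old_slots : cur_slots;
}

int main(void)
{
	/* e.g. foo(1) used 128 bytes of stack, foo(2) only 16 */
	printf("slots to widen: %d\n", widenable_slots(128, 16)); /* 2, not 16 */
	return 0;
}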
Fixes: 2793a8b015f7 ("bpf: exact states comparison for iterator convergence checks") Reported-by: Emil Tsalapatis Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251114025730.772723-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8314518c8d93..fbe4bb91c564 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8866,7 +8866,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; - int i, fr; + int i, fr, num_slots; reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { @@ -8879,7 +8879,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, &fcur->regs[i], &env->idmap_scratch); - for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + num_slots = min(fold->allocated_stack / BPF_REG_SIZE, + fcur->allocated_stack / BPF_REG_SIZE); + for (i = 0; i < num_slots; i++) { if (!is_spilled_reg(&fold->stack[i]) || !is_spilled_reg(&fcur->stack[i])) continue; -- cgit From 00fbff75c5acb4755f06f08bd1071879c63940c5 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Sun, 2 Nov 2025 01:07:41 +0530 Subject: crash: fix crashkernel resource shrink When crashkernel is configured with a high reservation, shrinking its value below the low crashkernel reservation causes two issues: 1. Invalid crashkernel resource objects 2. Kernel crash if crashkernel shrinking is done twice For example, with crashkernel=200M,high, the kernel reserves 200MB of high memory and some default low memory (say 256MB). The reservation appears as: cat /proc/iomem | grep -i crash af000000-beffffff : Crash kernel 433000000-43f7fffff : Crash kernel If crashkernel is then shrunk to 50MB (echo 52428800 > /sys/kernel/kexec_crash_size), /proc/iomem still shows 256MB reserved: af000000-beffffff : Crash kernel Instead, it should show 50MB: af000000-b21fffff : Crash kernel Further shrinking crashkernel to 40MB causes a kernel crash with the following trace (x86): BUG: kernel NULL pointer dereference, address: 0000000000000038 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI Call Trace: ? __die_body.cold+0x19/0x27 ? page_fault_oops+0x15a/0x2f0 ? search_module_extables+0x19/0x60 ? search_bpf_extables+0x5f/0x80 ? exc_page_fault+0x7e/0x180 ? asm_exc_page_fault+0x26/0x30 ? __release_resource+0xd/0xb0 release_resource+0x26/0x40 __crash_shrink_memory+0xe5/0x110 crash_shrink_memory+0x12a/0x190 kexec_crash_size_store+0x41/0x80 kernfs_fop_write_iter+0x141/0x1f0 vfs_write+0x294/0x460 ksys_write+0x6d/0xf0 This happens because __crash_shrink_memory()/kernel/crash_core.c incorrectly updates the crashk_res resource object even when crashk_low_res should be updated. Fix this by ensuring the correct crashkernel resource object is updated when shrinking crashkernel memory. 
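A small user-space model of the intended behaviour (hypothetical values taken from the example above, not the kernel code): whichever resource is passed in as old_res, be it crashk_res or crashk_low_res, is the one whose end must be trimmed to just below the freed tail.

#include <stdio.h>

struct res { unsigned long long start, end; };

/*
 * Illustrative only: when a region (high or low) is shrunk, it is the region
 * being shrunk whose ->end gets trimmed, not unconditionally crashk_res.
 */
static void shrink_res(struct res *old_res, const struct res *ram_res)
{
	old_res->end = ram_res->start - 1;
}

int main(void)
{
	/* hypothetical low region shrunk from 256MB to 50MB at 0xaf000000 */
	struct res low = { 0xaf000000ULL, 0xbeffffffULL };
	struct res freed = { 0xb2200000ULL, 0xbeffffffULL };

	shrink_res(&low, &freed);
	printf("low: %#llx-%#llx\n", low.start, low.end); /* 0xaf000000-0xb21fffff */
	return 0;
}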
Link: https://lkml.kernel.org/r/20251101193741.289252-1-sourabhjain@linux.ibm.com Fixes: 16c6006af4d4 ("kexec: enable kexec_crash_size to support two crash kernel regions") Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Zhen Lei Cc: Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 3b1c43382eec..99dac1aa972a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -373,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res, old_res->start = 0; old_res->end = 0; } else { - crashk_res.end = ram_res->start - 1; + old_res->end = ram_res->start - 1; } crash_free_reserved_phys_range(ram_res->start, ram_res->end); -- cgit