Diffstat (limited to 'kernel')
55 files changed, 3268 insertions, 667 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 54e581072617..15632358bcf7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -94,30 +94,6 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC -config KEXEC_HANDOVER - bool "kexec handover" - depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE - depends on !DEFERRED_STRUCT_PAGE_INIT - select MEMBLOCK_KHO_SCRATCH - select KEXEC_FILE - select DEBUG_FS - select LIBFDT - select CMA - help - Allow kexec to hand over state across kernels by generating and - passing additional metadata to the target kernel. This is useful - to keep data or state alive across the kexec. For this to work, - both source and target kernels need to have this option enabled. - -config KEXEC_HANDOVER_DEBUG - bool "Enable Kexec Handover debug checks" - depends on KEXEC_HANDOVER - help - This option enables extra sanity checks for the Kexec Handover - subsystem. Since, KHO performance is crucial in live update - scenarios and the extra code might be adding overhead it is - only optionally enabled. - config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index 9fe722305c9b..e83669841b8c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -52,6 +52,7 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += liveupdate/ obj-y += dma/ obj-y += entry/ obj-y += unwind/ @@ -82,8 +83,6 @@ obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o -obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o -obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/bounds.c b/kernel/bounds.c index 29b2cd00df2c..02b619eb6106 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -6,6 +6,7 @@ */ #define __GENERATING_BOUNDS_H +#define COMPILE_OFFSETS /* Include headers that define the enum constants of interest */ #include <linux/page-flags.h> #include <linux/mmzone.h> diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e81327d2cd63..9f6ab7dabf67 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y # # Debug Oops, Lockups and Hangs # -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index b0f0d15085db..7481fbb947d3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -173,7 +173,7 @@ int cpu_cluster_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); #ifdef CONFIG_PM -static int cpu_pm_suspend(void) +static int cpu_pm_suspend(void *data) { int ret; @@ -185,20 +185,24 @@ static int cpu_pm_suspend(void) return ret; } -static void cpu_pm_resume(void) +static void cpu_pm_resume(void *data) { cpu_cluster_pm_exit(); cpu_pm_exit(); } -static struct syscore_ops cpu_pm_syscore_ops = { +static const struct syscore_ops cpu_pm_syscore_ops = { .suspend = cpu_pm_suspend, .resume = cpu_pm_resume, }; +static struct syscore cpu_pm_syscore = { + .ops = &cpu_pm_syscore_ops, +}; + static int cpu_pm_init(void) { - register_syscore_ops(&cpu_pm_syscore_ops); + register_syscore(&cpu_pm_syscore); return 0; } 
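The cpu_pm conversion above (and the matching irq/generic-chip.c and irq/pm.c hunks later in this diff) replaces register_syscore_ops() with a wrapper object: the callbacks now take a void *data context, the ops table becomes const, and a struct syscore instance pointing at that table is what gets registered. A minimal sketch of the new registration pattern, assuming the declarations stay in <linux/syscore_ops.h>; the foo_* names are illustrative, not part of this patch:

/* Sketch only: mirrors the register_syscore() pattern shown in this diff. */
#include <linux/init.h>
#include <linux/syscore_ops.h>	/* assumed home of struct syscore / syscore_ops */

static int foo_suspend(void *data)
{
	/* quiesce device/core state; syscore suspend runs late with IRQs off */
	return 0;
}

static void foo_resume(void *data)
{
	/* undo whatever foo_suspend() did */
}

static const struct syscore_ops foo_syscore_ops = {
	.suspend = foo_suspend,
	.resume  = foo_resume,
};

static struct syscore foo_syscore = {
	.ops = &foo_syscore_ops,
};

static int __init foo_syscore_init(void)
{
	register_syscore(&foo_syscore);
	return 0;
}
core_initcall(foo_syscore_init);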
core_initcall(cpu_pm_init); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 87bf4d41eabb..62e60e0223cf 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size) #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { + if (!arch_add_crash_res_to_iomem()) + return 0; + if (crashk_res.start < crashk_res.end) insert_resource(&iomem_resource, &crashk_res); diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 92de80e5b057..16a51736a2a3 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -11,17 +11,16 @@ static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; } -static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +static dma_addr_t dma_dummy_map_phys(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } -static void dma_dummy_unmap_page(struct device *dev, dma_addr_t dma_handle, +static void dma_dummy_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { /* - * Dummy ops doesn't support map_page, so unmap_page should never be + * Dummy ops doesn't support map_phys, so unmap_page should never be * called. */ WARN_ON_ONCE(true); @@ -51,8 +50,8 @@ static int dma_dummy_supported(struct device *hwdev, u64 mask) const struct dma_map_ops dma_dummy_ops = { .mmap = dma_dummy_mmap, - .map_page = dma_dummy_map_page, - .unmap_page = dma_dummy_unmap_page, + .map_phys = dma_dummy_map_phys, + .unmap_phys = dma_dummy_unmap_phys, .map_sg = dma_dummy_map_sg, .unmap_sg = dma_dummy_unmap_sg, .dma_supported = dma_dummy_supported, diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index cc19a3efea89..794041a39e65 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -11,13 +11,13 @@ #include <linux/dma-mapping.h> #include <linux/kernel.h> #include <linux/kthread.h> -#include <linux/map_benchmark.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/timekeeping.h> +#include <uapi/linux/map_benchmark.h> struct map_benchmark_data { struct map_benchmark bparam; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index fe7472f13b10..37163eb49f9f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -157,7 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; - dma_addr_t addr; + dma_addr_t addr = DMA_MAPPING_ERROR; BUG_ON(!valid_dma_direction(dir)); @@ -169,21 +169,8 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); - else if (is_mmio) { - if (!ops->map_resource) - return DMA_MAPPING_ERROR; - - addr = ops->map_resource(dev, phys, size, dir, attrs); - } else { - struct page *page = phys_to_page(phys); - size_t offset = offset_in_page(phys); - - /* - * The dma_ops API contract for ops->map_page() requires - * kmappable memory, while ops->map_resource() does not. 
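The dma_dummy conversion above shows the new .map_phys/.unmap_phys dma_map_ops callbacks that replace .map_page/.unmap_page and absorb the old .map_resource path, as the kernel/dma/mapping.c dispatch change in this diff confirms. A hedged sketch of the callback shapes for a hypothetical identity-mapped bus; foo_* and the 1:1 address assumption are illustrative only, real implementations translate to a bus address and honour DMA_ATTR_MMIO:

#include <linux/dma-map-ops.h>

static dma_addr_t foo_map_phys(struct device *dev, phys_addr_t phys,
			       size_t size, enum dma_data_direction dir,
			       unsigned long attrs)
{
	/* assumption for this sketch: the device sees physical addresses directly */
	return (dma_addr_t)phys;
}

static void foo_unmap_phys(struct device *dev, dma_addr_t dma_handle,
			   size_t size, enum dma_data_direction dir,
			   unsigned long attrs)
{
	/* nothing to undo for a 1:1 mapping */
}

static const struct dma_map_ops foo_dma_ops = {
	.map_phys	= foo_map_phys,
	.unmap_phys	= foo_unmap_phys,
};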
- */ - addr = ops->map_page(dev, page, offset, size, dir, attrs); - } + else if (ops->map_phys) + addr = ops->map_phys(dev, phys, size, dir, attrs); if (!is_mmio) kmsan_handle_dma(phys, size, dir); @@ -223,11 +210,8 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); - else if (is_mmio) { - if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - } else - ops->unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_phys) + ops->unmap_phys(dev, addr, size, dir, attrs); trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); } diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 6f9d604d9d40..20caf9cabf69 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -64,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); struct page *page; + phys_addr_t phys; page = dma_alloc_contiguous(dev, size, gfp); if (!page) @@ -71,11 +72,12 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + phys = page_to_phys(page); if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size, - dir, DMA_ATTR_SKIP_CPU_SYNC); + *dma_handle = iommu_dma_map_phys(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); else - *dma_handle = ops->map_page(dev, page, 0, size, dir, + *dma_handle = ops->map_phys(dev, phys, size, dir, DMA_ATTR_SKIP_CPU_SYNC); if (*dma_handle == DMA_MAPPING_ERROR) { dma_free_contiguous(dev, page, size); @@ -94,8 +96,8 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); - else if (ops->unmap_page) - ops->unmap_page(dev, dma_handle, size, dir, + else if (ops->unmap_phys) + ops->unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); dma_free_contiguous(dev, page, size); } diff --git a/kernel/exit.c b/kernel/exit.c index 4dc1918db67b..8a87021211ae 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -251,10 +251,8 @@ repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials. But shut RCU-lockdep up */ - rcu_read_lock(); + * can't be modifying its own credentials. */ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - rcu_read_unlock(); pidfs_exit(p); cgroup_task_release(p); diff --git a/kernel/fork.c b/kernel/fork.c index 198e02e21e6e..b1f3915d5f8e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -208,15 +208,62 @@ struct vm_stack { struct vm_struct *stack_vm_area; }; +static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node) +{ + struct vm_struct *vm_area; + unsigned int i; + + /* + * If the node has memory, we are guaranteed the stacks are backed by local pages. + * Otherwise the pages are arbitrary. + * + * Note that depending on cpuset it is possible we will get migrated to a different + * node immediately after allocating here, so this does *not* guarantee locality for + * arbitrary callers. 
+ */ + scoped_guard(preempt) { + if (node != NUMA_NO_NODE && numa_node_id() != node) + return NULL; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (vm_area) + return vm_area; + } + } + + return NULL; +} + static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; + int nid; - for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *tmp = NULL; + /* + * Don't cache stacks if any of the pages don't match the local domain, unless + * there is no local memory to begin with. + * + * Note that lack of local memory does not automatically mean it makes no difference + * performance-wise which other domain backs the stack. In this case we are merely + * trying to avoid constantly going to vmalloc. + */ + scoped_guard(preempt) { + nid = numa_node_id(); + if (node_state(nid, N_MEMORY)) { + for (i = 0; i < vm_area->nr_pages; i++) { + struct page *page = vm_area->pages[i]; + if (page_to_nid(page) != nid) + return false; + } + } + + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) - return true; + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) + return true; + } } return false; } @@ -283,13 +330,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct vm_struct *vm_area; void *stack; - int i; - - for (i = 0; i < NR_CACHED_STACKS; i++) { - vm_area = this_cpu_xchg(cached_stacks[i], NULL); - if (!vm_area) - continue; + vm_area = alloc_thread_stack_node_from_cache(tsk, node); + if (vm_area) { if (memcg_charge_kernel_stack(vm_area)) { vfree(vm_area->addr); return -ENOMEM; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b2c1f14b8129..d2254c91450b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -24,6 +24,7 @@ #include <linux/sched/sysctl.h> #include <linux/hung_task.h> #include <linux/rwsem.h> +#include <linux/sys_info.h> #include <trace/events/sched.h> @@ -50,7 +51,6 @@ static unsigned long __read_mostly sysctl_hung_task_detect_count; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; -EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: @@ -60,12 +60,17 @@ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; static int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; -static bool hung_task_show_lock; static bool hung_task_call_panic; -static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +/* + * A bitmask to control what kinds of system info to be printed when + * a hung task is detected, it could be task, memory, lock etc. Refer + * include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long hung_task_si_mask; + #ifdef CONFIG_SMP /* * Should we dump all CPUs backtraces in a hung task event? 
@@ -81,7 +86,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ static unsigned int __read_mostly sysctl_hung_task_panic = - IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); + CONFIG_BOOTPARAM_HUNG_TASK_PANIC; static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) @@ -218,8 +223,11 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti } #endif -static void check_hung_task(struct task_struct *t, unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout, + unsigned long prev_detect_count) { + unsigned long total_hung_task; + if (!task_is_hung(t, timeout)) return; @@ -229,11 +237,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) */ sysctl_hung_task_detect_count++; + total_hung_task = sysctl_hung_task_detect_count - prev_detect_count; trace_sched_process_hang(t); - if (sysctl_hung_task_panic) { + if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) { console_verbose(); - hung_task_show_lock = true; hung_task_call_panic = true; } @@ -256,10 +264,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); debug_show_blocker(t, timeout); - hung_task_show_lock = true; - if (sysctl_hung_task_all_cpu_backtrace) - hung_task_show_all_bt = true; if (!sysctl_hung_task_warnings) pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } @@ -300,6 +305,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; + unsigned long prev_detect_count = sysctl_hung_task_detect_count; + int need_warning = sysctl_hung_task_warnings; + unsigned long si_mask = hung_task_si_mask; /* * If the system crashed already then all bets are off, @@ -308,7 +316,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - hung_task_show_lock = false; + rcu_read_lock(); for_each_process_thread(g, t) { @@ -320,18 +328,23 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) last_break = jiffies; } - check_hung_task(t, timeout); + check_hung_task(t, timeout, prev_detect_count); } unlock: rcu_read_unlock(); - if (hung_task_show_lock) - debug_show_all_locks(); - if (hung_task_show_all_bt) { - hung_task_show_all_bt = false; - trigger_all_cpu_backtrace(); + if (!(sysctl_hung_task_detect_count - prev_detect_count)) + return; + + if (need_warning || hung_task_call_panic) { + si_mask |= SYS_INFO_LOCKS; + + if (sysctl_hung_task_all_cpu_backtrace) + si_mask |= SYS_INFO_ALL_BT; } + sys_info(si_mask); + if (hung_task_call_panic) panic("hung_task: blocked tasks"); } @@ -389,7 +402,7 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "hung_task_check_count", @@ -430,6 +443,13 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "hung_task_sys_info", + .data = &hung_task_si_mask, + .maxlen = sizeof(hung_task_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static void __init hung_task_sysctl_init(void) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 
bf59e37d650a..3cd0c40282c0 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -650,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) } #ifdef CONFIG_PM -static int irq_gc_suspend(void) +static int irq_gc_suspend(void *data) { struct irq_chip_generic *gc; @@ -670,7 +670,7 @@ static int irq_gc_suspend(void) return 0; } -static void irq_gc_resume(void) +static void irq_gc_resume(void *data) { struct irq_chip_generic *gc; @@ -693,7 +693,7 @@ static void irq_gc_resume(void) #define irq_gc_resume NULL #endif -static void irq_gc_shutdown(void) +static void irq_gc_shutdown(void *data) { struct irq_chip_generic *gc; @@ -709,15 +709,19 @@ static void irq_gc_shutdown(void) } } -static struct syscore_ops irq_gc_syscore_ops = { +static const struct syscore_ops irq_gc_syscore_ops = { .suspend = irq_gc_suspend, .resume = irq_gc_resume, .shutdown = irq_gc_shutdown, }; +static struct syscore irq_gc_syscore = { + .ops = &irq_gc_syscore_ops, +}; + static int __init irq_gc_init_ops(void) { - register_syscore_ops(&irq_gc_syscore_ops); + register_syscore(&irq_gc_syscore); return 0; } device_initcall(irq_gc_init_ops); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f7394729cedc..99ff65466d87 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -211,21 +211,26 @@ void rearm_wake_irq(unsigned int irq) /** * irq_pm_syscore_resume - enable interrupt lines early + * @data: syscore context * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ -static void irq_pm_syscore_resume(void) +static void irq_pm_syscore_resume(void *data) { resume_irqs(true); } -static struct syscore_ops irq_pm_syscore_ops = { +static const struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; +static struct syscore irq_pm_syscore = { + .ops = &irq_pm_syscore_ops, +}; + static int __init irq_pm_init_ops(void) { - register_syscore_ops(&irq_pm_syscore_ops); + register_syscore(&irq_pm_syscore); return 0; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fa00b239c5d9..0f92acdd354d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -15,6 +15,7 @@ #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/liveupdate.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> @@ -41,6 +42,7 @@ #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <linux/dma-map-ops.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/sections.h> @@ -742,7 +744,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) struct kexec_segment *segment = &image->segment[idx]; struct page *cma = image->segment_cma[idx]; char *ptr = page_address(cma); - unsigned long maddr; size_t ubytes, mbytes; int result = 0; unsigned char __user *buf = NULL; @@ -754,15 +755,12 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; - maddr = segment->mem; /* Then copy from source buffer to the CMA one */ while (mbytes) { size_t uchunk, mchunk; - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -784,7 +782,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) } ptr += mchunk; - maddr += mchunk; mbytes -= mchunk; cond_resched(); @@ -839,9 +836,7 @@ static int kimage_load_normal_segment(struct kimage *image, int idx) ptr = kmap_local_page(page); 
/* Start with a clear page */ clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -904,9 +899,7 @@ static int kimage_load_crash_segment(struct kimage *image, int idx) } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ @@ -1146,6 +1139,10 @@ int kernel_kexec(void) goto Unlock; } + error = liveupdate_reboot(); + if (error) + goto Unlock; + #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* @@ -1229,3 +1226,143 @@ int kernel_kexec(void) kexec_unlock(); return error; } + +static ssize_t loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!kexec_image); +} +static struct kobj_attribute loaded_attr = __ATTR_RO(loaded); + +#ifdef CONFIG_CRASH_DUMP +static ssize_t crash_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); +} +static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded); + +#ifdef CONFIG_CRASH_RESERVE +static ssize_t crash_cma_ranges_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + + ssize_t len = 0; + int i; + + for (i = 0; i < crashk_cma_cnt; ++i) { + len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", + crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + } + return len; +} +static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges); +#endif + +static ssize_t crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sysfs_emit(buf, "%zd\n", size); +} +static ssize_t crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (kstrtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 ? 
ret : count; +} +static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size); + +#ifdef CONFIG_CRASH_HOTPLUG +static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int sz = crash_get_elfcorehdr_size(); + + return sysfs_emit(buf, "%u\n", sz); +} +static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size); + +#endif /* CONFIG_CRASH_HOTPLUG */ +#endif /* CONFIG_CRASH_DUMP */ + +static struct attribute *kexec_attrs[] = { + &loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP + &crash_loaded_attr.attr, + &crash_size_attr.attr, +#ifdef CONFIG_CRASH_RESERVE + &crash_cma_ranges_attr.attr, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + &crash_elfcorehdr_size_attr.attr, +#endif +#endif + NULL +}; + +struct kexec_link_entry { + const char *target; + const char *name; +}; + +static struct kexec_link_entry kexec_links[] = { + { "loaded", "kexec_loaded" }, +#ifdef CONFIG_CRASH_DUMP + { "crash_loaded", "kexec_crash_loaded" }, + { "crash_size", "kexec_crash_size" }, +#ifdef CONFIG_CRASH_RESERVE + {"crash_cma_ranges", "kexec_crash_cma_ranges"}, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + { "crash_elfcorehdr_size", "crash_elfcorehdr_size" }, +#endif +#endif +}; + +static struct kobject *kexec_kobj; +ATTRIBUTE_GROUPS(kexec); + +static int __init init_kexec_sysctl(void) +{ + int error; + int i; + + kexec_kobj = kobject_create_and_add("kexec", kernel_kobj); + if (!kexec_kobj) { + pr_err("failed to create kexec kobject\n"); + return -ENOMEM; + } + + error = sysfs_create_groups(kexec_kobj, kexec_groups); + if (error) + goto kset_exit; + + for (i = 0; i < ARRAY_SIZE(kexec_links); i++) { + error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj, + kexec_links[i].target, + kexec_links[i].name); + if (error) + pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error); + } + + return 0; + +kset_exit: + kobject_put(kexec_kobj); + return error; +} + +subsys_initcall(init_kexec_sysctl); diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h deleted file mode 100644 index 3c3c7148ceed..000000000000 --- a/kernel/kexec_handover_internal.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H -#define LINUX_KEXEC_HANDOVER_INTERNAL_H - -#include <linux/kexec_handover.h> -#include <linux/types.h> - -extern struct kho_scratch *kho_scratch; -extern unsigned int kho_scratch_cnt; - -#ifdef CONFIG_KEXEC_HANDOVER_DEBUG -bool kho_scratch_overlap(phys_addr_t phys, size_t size); -#else -static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) -{ - return false; -} -#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ - -#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index eefb67d9883c..a9e6354d9e25 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -12,7 +12,7 @@ #include <linux/sysfs.h> #include <linux/export.h> #include <linux/init.h> -#include <linux/kexec.h> +#include <linux/vmcore_info.h> #include <linux/profile.h> #include <linux/stat.h> #include <linux/sched.h> @@ -119,50 +119,6 @@ static ssize_t profiling_store(struct kobject *kobj, KERNEL_ATTR_RW(profiling); #endif -#ifdef CONFIG_KEXEC_CORE -static ssize_t kexec_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", !!kexec_image); -} -KERNEL_ATTR_RO(kexec_loaded); - -#ifdef CONFIG_CRASH_DUMP -static ssize_t kexec_crash_loaded_show(struct kobject *kobj, - struct kobj_attribute 
*attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); -} -KERNEL_ATTR_RO(kexec_crash_loaded); - -static ssize_t kexec_crash_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - ssize_t size = crash_get_memory_size(); - - if (size < 0) - return size; - - return sysfs_emit(buf, "%zd\n", size); -} -static ssize_t kexec_crash_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long cnt; - int ret; - - if (kstrtoul(buf, 0, &cnt)) - return -EINVAL; - - ret = crash_shrink_memory(cnt); - return ret < 0 ? ret : count; -} -KERNEL_ATTR_RW(kexec_crash_size); - -#endif /* CONFIG_CRASH_DUMP*/ -#endif /* CONFIG_KEXEC_CORE */ - #ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, @@ -174,18 +130,6 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#ifdef CONFIG_CRASH_HOTPLUG -static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - unsigned int sz = crash_get_elfcorehdr_size(); - - return sysfs_emit(buf, "%u\n", sz); -} -KERNEL_ATTR_RO(crash_elfcorehdr_size); - -#endif - #endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ @@ -255,18 +199,8 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -#ifdef CONFIG_KEXEC_CORE - &kexec_loaded_attr.attr, -#ifdef CONFIG_CRASH_DUMP - &kexec_crash_loaded_attr.attr, - &kexec_crash_size_attr.attr, -#endif -#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, -#ifdef CONFIG_CRASH_HOTPLUG - &crash_elfcorehdr_size_attr.attr, -#endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig new file mode 100644 index 000000000000..9b2515f31afb --- /dev/null +++ b/kernel/liveupdate/Kconfig @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Live Update and Kexec HandOver" + depends on !DEFERRED_STRUCT_PAGE_INIT + +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. + +config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + default KEXEC_HANDOVER + depends on KEXEC_HANDOVER + select DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. + Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + +config KEXEC_HANDOVER_ENABLE_DEFAULT + bool "Enable kexec handover by default" + depends on KEXEC_HANDOVER + help + Enable Kexec Handover by default. This avoids the need to + explicitly pass 'kho=on' on the kernel command line. 
+ + This is useful for systems where KHO is a prerequisite for other + features, such as Live Update, ensuring the mechanism is always + active. + + The default behavior can still be overridden at boot time by + passing 'kho=off'. + +config LIVEUPDATE + bool "Live Update Orchestrator" + depends on KEXEC_HANDOVER + help + Enable the Live Update Orchestrator. Live Update is a mechanism, + typically based on kexec, that allows the kernel to be updated + while keeping selected devices operational across the transition. + These devices are intended to be reclaimed by the new kernel and + re-attached to their original workload without requiring a device + reset. + + Ability to handover a device from current to the next kernel depends + on specific support within device drivers and related kernel + subsystems. + + This feature primarily targets virtual machine hosts to quickly update + the kernel hypervisor with minimal disruption to the running virtual + machines. + + If unsure, say N. + +endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile new file mode 100644 index 000000000000..7cad2eece32d --- /dev/null +++ b/kernel/liveupdate/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +luo-y := \ + luo_core.o \ + luo_file.o \ + luo_session.o + +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o + +obj-$(CONFIG_LIVEUPDATE) += luo.o diff --git a/kernel/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 03d12e27189f..9dc51fab604f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -4,21 +4,22 @@ * Copyright (C) 2023 Alexander Graf <graf@amazon.com> * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com> */ #define pr_fmt(fmt) "KHO: " fmt #include <linux/cleanup.h> #include <linux/cma.h> +#include <linux/kmemleak.h> #include <linux/count_zeros.h> -#include <linux/debugfs.h> #include <linux/kexec.h> #include <linux/kexec_handover.h> #include <linux/libfdt.h> #include <linux/list.h> #include <linux/memblock.h> -#include <linux/notifier.h> #include <linux/page-isolation.h> +#include <linux/unaligned.h> #include <linux/vmalloc.h> #include <asm/early_ioremap.h> @@ -28,8 +29,9 @@ * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. 
*/ -#include "../mm/internal.h" -#include "kexec_internal.h" +#include "../../mm/internal.h" +#include "../kexec_internal.h" +#include "kexec_handover_internal.h" #define KHO_FDT_COMPATIBLE "kho-v1" #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" @@ -51,7 +53,7 @@ union kho_page_info { static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private)); -static bool kho_enable __ro_after_init; +static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT); bool kho_is_enabled(void) { @@ -103,34 +105,19 @@ struct kho_mem_track { struct khoser_mem_chunk; -struct kho_serialization { - struct page *fdt; - struct list_head fdt_list; - struct dentry *sub_fdt_dir; - struct kho_mem_track track; - /* First chunk of serialized preserved memory map */ - struct khoser_mem_chunk *preserved_mem_map; -}; - struct kho_out { - struct blocking_notifier_head chain_head; - - struct dentry *dir; - + void *fdt; + bool finalized; struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; - bool finalized; + struct kho_mem_track track; + struct kho_debugfs dbg; }; static struct kho_out kho_out = { - .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), - .ser = { - .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), - .track = { - .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), - }, + .track = { + .orders = XARRAY_INIT(kho_out.track.orders, 0), }, .finalized = false, }; @@ -159,26 +146,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) return no_free_ptr(elm); } -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) { struct kho_mem_phys_bits *bits; struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; - while (pfn < end_pfn) { - const unsigned int order = - min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - const unsigned long pfn_high = pfn >> order; + physxa = xa_load(&track->orders, order); + if (WARN_ON_ONCE(!physxa)) + return; - physxa = xa_load(&track->orders, order); - if (WARN_ON_ONCE(!physxa)) - return; + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (WARN_ON_ONCE(!bits)) + return; - bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (WARN_ON_ONCE(!bits)) - return; + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); +} - clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + unsigned int order; + + while (pfn < end_pfn) { + order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + __kho_unpreserve_order(track, pfn, order); pfn += 1 << order; } @@ -192,10 +186,6 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; might_sleep(); - - if (kho_out.finalized) - return -EBUSY; - physxa = xa_load(&track->orders, order); if (!physxa) { int err; @@ -229,11 +219,11 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, return 0; } -static struct page *kho_restore_page(phys_addr_t phys) +static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) { struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned int nr_pages, ref_cnt; union kho_page_info info; - unsigned int nr_pages; if (!page) return NULL; @@ -253,11 
+243,16 @@ static struct page *kho_restore_page(phys_addr_t phys) /* Head page gets refcount of 1. */ set_page_count(page, 1); - /* For higher order folios, tail pages get a page count of zero. */ + /* + * For higher order folios, tail pages get a page count of zero. + * For physically contiguous order-0 pages every pages gets a page + * count of 1 + */ + ref_cnt = is_folio ? 0 : 1; for (unsigned int i = 1; i < nr_pages; i++) - set_page_count(page + i, 0); + set_page_count(page + i, ref_cnt); - if (info.order > 0) + if (is_folio && info.order) prep_compound_page(page, info.order); adjust_managed_page_count(page, nr_pages); @@ -272,7 +267,7 @@ static struct page *kho_restore_page(phys_addr_t phys) */ struct folio *kho_restore_folio(phys_addr_t phys) { - struct page *page = kho_restore_page(phys); + struct page *page = kho_restore_page(phys, true); return page ? page_folio(page) : NULL; } @@ -297,11 +292,10 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - struct page *page = kho_restore_page(PFN_PHYS(pfn)); + struct page *page = kho_restore_page(PFN_PHYS(pfn), false); if (!page) return NULL; - split_page(page, order); pfn += 1 << order; } @@ -371,11 +365,32 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) struct khoser_mem_chunk *tmp = chunk; chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - kfree(tmp); + free_page((unsigned long)tmp); } } -static int kho_mem_serialize(struct kho_serialization *ser) +/* + * Update memory map property, if old one is found discard it via + * kho_mem_ser_free(). + */ +static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) +{ + void *ptr; + u64 phys; + + ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL); + + /* Check and discard previous memory map */ + phys = get_unaligned((u64 *)ptr); + if (phys) + kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys)); + + /* Update with the new value */ + phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0; + put_unaligned(phys, (u64 *)ptr); +} + +static int kho_mem_serialize(struct kho_out *kho_out) { struct khoser_mem_chunk *first_chunk = NULL; struct khoser_mem_chunk *chunk = NULL; @@ -383,7 +398,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) unsigned long order; int err = -ENOMEM; - xa_for_each(&ser->track.orders, order, physxa) { + xa_for_each(&kho_out->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; @@ -415,7 +430,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) } } - ser->preserved_mem_map = first_chunk; + kho_update_memory_map(first_chunk); return 0; @@ -445,20 +460,27 @@ static void __init deserialize_bitmap(unsigned int order, } } -static void __init kho_mem_deserialize(const void *fdt) +/* Return true if memory was deserizlied */ +static bool __init kho_mem_deserialize(const void *fdt) { struct khoser_mem_chunk *chunk; - const phys_addr_t *mem; + const void *mem_ptr; + u64 mem; int len; - mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); - - if (!mem || len != sizeof(*mem)) { + mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + if (!mem_ptr || len != sizeof(u64)) { pr_err("failed to get preserved memory bitmaps\n"); - return; + return false; } - chunk = *mem ? phys_to_virt(*mem) : NULL; + mem = get_unaligned((const u64 *)mem_ptr); + chunk = mem ? 
phys_to_virt(mem) : NULL; + + /* No preserved physical pages were passed, no deserialization */ + if (!chunk) + return false; + while (chunk) { unsigned int i; @@ -467,6 +489,8 @@ static void __init kho_mem_deserialize(const void *fdt) &chunk->bitmaps[i]); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); } + + return true; } /* @@ -674,40 +698,8 @@ err_disable_kho: kho_enable = false; } -struct fdt_debugfs { - struct list_head list; - struct debugfs_blob_wrapper wrapper; - struct dentry *file; -}; - -static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt) -{ - struct fdt_debugfs *f; - struct dentry *file; - - f = kmalloc(sizeof(*f), GFP_KERNEL); - if (!f) - return -ENOMEM; - - f->wrapper.data = (void *)fdt; - f->wrapper.size = fdt_totalsize(fdt); - - file = debugfs_create_blob(name, 0400, dir, &f->wrapper); - if (IS_ERR(file)) { - kfree(f); - return PTR_ERR(file); - } - - f->file = file; - list_add(&f->list, list); - - return 0; -} - /** * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. - * @ser: serialization control object passed by KHO notifiers. * @name: name of the sub tree. * @fdt: the sub tree blob. * @@ -716,38 +708,76 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, * by KHO for the new kernel to retrieve it after kexec. * * A debugfs blob entry is also created at - * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. + * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with + * CONFIG_KEXEC_HANDOVER_DEBUGFS * * Return: 0 on success, error code on failure */ -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +int kho_add_subtree(const char *name, void *fdt) { - int err = 0; - u64 phys = (u64)virt_to_phys(fdt); - void *root = page_to_virt(ser->fdt); + phys_addr_t phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int err = -ENOMEM; + int off, fdt_err; - err |= fdt_begin_node(root, name); - err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); - err |= fdt_end_node(root); + guard(mutex)(&kho_out.lock); - if (err) + fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (fdt_err < 0) return err; - return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); + off = fdt_add_subnode(root_fdt, 0, name); + if (off < 0) { + if (off == -FDT_ERR_EXISTS) + err = -EEXIST; + goto out_pack; + } + + err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys)); + if (err < 0) + goto out_pack; + + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false)); + +out_pack: + fdt_pack(root_fdt); + + return err; } EXPORT_SYMBOL_GPL(kho_add_subtree); -int register_kho_notifier(struct notifier_block *nb) +void kho_remove_subtree(void *fdt) { - return blocking_notifier_chain_register(&kho_out.chain_head, nb); -} -EXPORT_SYMBOL_GPL(register_kho_notifier); + phys_addr_t target_phys = virt_to_phys(fdt); + void *root_fdt = kho_out.fdt; + int off; + int err; -int unregister_kho_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); + guard(mutex)(&kho_out.lock); + + err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE); + if (err < 0) + return; + + for (off = fdt_first_subnode(root_fdt, 0); off >= 0; + off = fdt_next_subnode(root_fdt, off)) { + const u64 *val; + int len; + + val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len); + if (!val || len != sizeof(phys_addr_t)) + continue; + + if ((phys_addr_t)*val == target_phys) { + fdt_del_node(root_fdt, off); + 
kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + break; + } + } + + fdt_pack(root_fdt); } -EXPORT_SYMBOL_GPL(unregister_kho_notifier); +EXPORT_SYMBOL_GPL(kho_remove_subtree); /** * kho_preserve_folio - preserve a folio across kexec. @@ -762,7 +792,7 @@ int kho_preserve_folio(struct folio *folio) { const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) return -EINVAL; @@ -772,6 +802,24 @@ int kho_preserve_folio(struct folio *folio) EXPORT_SYMBOL_GPL(kho_preserve_folio); /** + * kho_unpreserve_folio - unpreserve a folio. + * @folio: folio to unpreserve. + * + * Instructs KHO to unpreserve a folio that was preserved by + * kho_preserve_folio() before. The provided @folio (pfn and order) + * must exactly match a previously preserved folio. + */ +void kho_unpreserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.track; + + __kho_unpreserve_order(track, pfn, order); +} +EXPORT_SYMBOL_GPL(kho_unpreserve_folio); + +/** * kho_preserve_pages - preserve contiguous pages across kexec * @page: first page in the list. * @nr_pages: number of pages. @@ -783,7 +831,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); */ int kho_preserve_pages(struct page *page, unsigned int nr_pages) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn = start_pfn; @@ -815,6 +863,26 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_preserve_pages); +/** + * kho_unpreserve_pages - unpreserve contiguous pages. + * @page: first page in the list. + * @nr_pages: number of pages. + * + * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page. + * This must be called with the same @page and @nr_pages as the corresponding + * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger + * preserved blocks is not supported. 
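The kho_preserve_pages()/kho_unpreserve_pages() pair described here (and the folio variants above) must be called with matching arguments. A hedged usage sketch; the foo_* caller, the four-page size and the error handling are illustrative assumptions, not from this patch:

#include <linux/gfp.h>
#include <linux/kexec_handover.h>	/* assumed to declare the (un)preserve helpers */

#define FOO_NR_PAGES	4

static struct page *foo_pages;

/* Preserve a small contiguous allocation across kexec. */
static int foo_save_state(void)
{
	int err;

	foo_pages = alloc_pages(GFP_KERNEL | __GFP_ZERO,
				get_order(FOO_NR_PAGES * PAGE_SIZE));
	if (!foo_pages)
		return -ENOMEM;

	err = kho_preserve_pages(foo_pages, FOO_NR_PAGES);
	if (err) {
		__free_pages(foo_pages, get_order(FOO_NR_PAGES * PAGE_SIZE));
		foo_pages = NULL;
	}
	return err;
}

/* Change of plan before kexec: drop the preservation, then free. */
static void foo_discard_state(void)
{
	/* must mirror the exact page/nr_pages pair given to kho_preserve_pages() */
	kho_unpreserve_pages(foo_pages, FOO_NR_PAGES);
	__free_pages(foo_pages, get_order(FOO_NR_PAGES * PAGE_SIZE));
	foo_pages = NULL;
}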
+ */ +void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + struct kho_mem_track *track = &kho_out.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + + __kho_unpreserve(track, start_pfn, end_pfn); +} +EXPORT_SYMBOL_GPL(kho_unpreserve_pages); + struct kho_vmalloc_hdr { DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); }; @@ -885,7 +953,7 @@ err_free: static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, unsigned short order) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); __kho_unpreserve(track, pfn, pfn + 1); @@ -896,20 +964,6 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, } } -static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) -{ - struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first); - - while (chunk) { - struct kho_vmalloc_chunk *tmp = chunk; - - kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); - - chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - free_page((unsigned long)tmp); - } -} - /** * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec * @ptr: pointer to the area in vmalloc address space @@ -971,12 +1025,34 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) return 0; err_free: - kho_vmalloc_free_chunks(preservation); + kho_unpreserve_vmalloc(preservation); return err; } EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); /** + * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc() + * @preservation: preservation metadata returned by kho_preserve_vmalloc() + * + * Instructs KHO to unpreserve the area in vmalloc address space that was + * previously preserved with kho_preserve_vmalloc(). + */ +void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + + while (chunk) { + struct kho_vmalloc_chunk *tmp = chunk; + + kho_vmalloc_unpreserve_chunk(chunk, preservation->order); + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + free_page((unsigned long)tmp); + } +} +EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc); + +/** * kho_restore_vmalloc - recreates and populates an area in vmalloc address * space from the preserved memory. * @preservation: preservation metadata. @@ -1024,7 +1100,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) goto err_free_pages_array; for (int j = 0; j < contig_pages; j++) - pages[idx++] = page; + pages[idx++] = page + j; phys += contig_pages * PAGE_SIZE; } @@ -1065,217 +1141,122 @@ err_free_pages_array: } EXPORT_SYMBOL_GPL(kho_restore_vmalloc); -/* Handling for debug/kho/out */ - -static struct dentry *debugfs_root; - -static int kho_out_update_debugfs_fdt(void) -{ - int err = 0; - struct fdt_debugfs *ff, *tmp; - - if (kho_out.finalized) { - err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, - "fdt", page_to_virt(kho_out.ser.fdt)); - } else { - list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { - debugfs_remove(ff->file); - list_del(&ff->list); - kfree(ff); - } - } - - return err; -} - -static int kho_abort(void) +/** + * kho_alloc_preserve - Allocate, zero, and preserve memory. + * @size: The number of bytes to allocate. + * + * Allocates a physically contiguous block of zeroed pages that is large + * enough to hold @size bytes. The allocated memory is then registered with + * KHO for preservation across a kexec. 
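kho_unpreserve_vmalloc() above completes the vmalloc() round trip together with the existing kho_preserve_vmalloc() and kho_restore_vmalloc(). A sketch under the assumption that the struct kho_vmalloc descriptor itself reaches the next kernel through some other preserved structure; struct foo_handover and the foo_* helpers are hypothetical:

#include <linux/vmalloc.h>
#include <linux/kexec_handover.h>

struct foo_handover {
	struct kho_vmalloc vmal;	/* descriptor handed over to the next kernel */
};

/* Old kernel: allocate and preserve a vmalloc()ed buffer. */
static int foo_preserve(struct foo_handover *ho, void **bufp, size_t size)
{
	void *buf = vmalloc(size);
	int err;

	if (!buf)
		return -ENOMEM;

	err = kho_preserve_vmalloc(buf, &ho->vmal);
	if (err) {
		vfree(buf);
		return err;
	}

	*bufp = buf;
	return 0;
}

/* Old kernel: cancel the handover, drop the preservation, then free. */
static void foo_abort(struct foo_handover *ho, void *buf)
{
	kho_unpreserve_vmalloc(&ho->vmal);
	vfree(buf);
}

/* New kernel: recreate the area from the preserved pages. */
static void *foo_restore(struct foo_handover *ho)
{
	return kho_restore_vmalloc(&ho->vmal);
}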
+ * + * Note: The actual allocated size will be rounded up to the nearest + * power-of-two page boundary. + * + * @return A virtual pointer to the allocated and preserved memory on success, + * or an ERR_PTR() encoded error on failure. + */ +void *kho_alloc_preserve(size_t size) { - int err; - unsigned long order; - struct kho_mem_phys *physxa; + struct folio *folio; + int order, ret; - xa_for_each(&kho_out.ser.track.orders, order, physxa) { - struct kho_mem_phys_bits *bits; - unsigned long phys; + if (!size) + return ERR_PTR(-EINVAL); - xa_for_each(&physxa->phys_bits, phys, bits) - kfree(bits); + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-E2BIG); - xa_destroy(&physxa->phys_bits); - kfree(physxa); - } - xa_destroy(&kho_out.ser.track.orders); + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order); + if (!folio) + return ERR_PTR(-ENOMEM); - if (kho_out.ser.preserved_mem_map) { - kho_mem_ser_free(kho_out.ser.preserved_mem_map); - kho_out.ser.preserved_mem_map = NULL; + ret = kho_preserve_folio(folio); + if (ret) { + folio_put(folio); + return ERR_PTR(ret); } - err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, - NULL); - err = notifier_to_errno(err); - - if (err) - pr_err("Failed to abort KHO finalization: %d\n", err); - - return err; + return folio_address(folio); } +EXPORT_SYMBOL_GPL(kho_alloc_preserve); -static int kho_finalize(void) +/** + * kho_unpreserve_free - Unpreserve and free memory. + * @mem: Pointer to the memory allocated by kho_alloc_preserve(). + * + * Unregisters the memory from KHO preservation and frees the underlying + * pages back to the system. This function should be called to clean up + * memory allocated with kho_alloc_preserve(). + */ +void kho_unpreserve_free(void *mem) { - int err = 0; - u64 *preserved_mem_map; - void *fdt = page_to_virt(kho_out.ser.fdt); - - err |= fdt_create(fdt, PAGE_SIZE); - err |= fdt_finish_reservemap(fdt); - err |= fdt_begin_node(fdt, ""); - err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); - /** - * Reserve the preserved-memory-map property in the root FDT, so - * that all property definitions will precede subnodes created by - * KHO callers. - */ - err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, - sizeof(*preserved_mem_map), - (void **)&preserved_mem_map); - if (err) - goto abort; - - err = kho_preserve_folio(page_folio(kho_out.ser.fdt)); - if (err) - goto abort; - - err = blocking_notifier_call_chain(&kho_out.chain_head, - KEXEC_KHO_FINALIZE, &kho_out.ser); - err = notifier_to_errno(err); - if (err) - goto abort; - - err = kho_mem_serialize(&kho_out.ser); - if (err) - goto abort; - - *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); - - err |= fdt_end_node(fdt); - err |= fdt_finish(fdt); + struct folio *folio; -abort: - if (err) { - pr_err("Failed to convert KHO state tree: %d\n", err); - kho_abort(); - } + if (!mem) + return; - return err; + folio = virt_to_folio(mem); + kho_unpreserve_folio(folio); + folio_put(folio); } +EXPORT_SYMBOL_GPL(kho_unpreserve_free); -static int kho_out_finalize_get(void *data, u64 *val) +/** + * kho_restore_free - Restore and free memory after kexec. + * @mem: Pointer to the memory (in the new kernel's address space) + * that was allocated by the old kernel. + * + * This function is intended to be called in the new kernel (post-kexec) + * to take ownership of and free a memory region that was preserved by the + * old kernel using kho_alloc_preserve(). 
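kho_alloc_preserve(), kho_unpreserve_free() and kho_restore_free() added here form a small lifecycle for fixed-size state blobs. A hedged sketch of how a subsystem might use it; the foo_* names and the idea of publishing the physical address via a sub-FDT are assumptions for illustration, not something this patch mandates:

#include <linux/err.h>
#include <linux/kexec_handover.h>

static void *foo_state;

/* Old kernel: allocate a preserved, zeroed blob and fill it in. */
static int foo_prepare_handover(size_t size)
{
	foo_state = kho_alloc_preserve(size);
	if (IS_ERR(foo_state))
		return PTR_ERR(foo_state);

	/* ... fill foo_state, publish virt_to_phys(foo_state), e.g. in a sub-FDT ... */
	return 0;
}

/* Old kernel: abort the handover and release the memory. */
static void foo_abort_handover(void)
{
	kho_unpreserve_free(foo_state);
	foo_state = NULL;
}

/* New kernel: consume the blob found at the physical address recovered from
 * the handover metadata, then hand the pages back to the page allocator.
 */
static void foo_consume_handover(phys_addr_t phys)
{
	void *mem = phys_to_virt(phys);

	/* ... read the preserved state ... */
	kho_restore_free(mem);
}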
+ * + * It first restores the pages from KHO (using their physical address) + * and then frees the pages back to the new kernel's page allocator. + */ +void kho_restore_free(void *mem) { - mutex_lock(&kho_out.lock); - *val = kho_out.finalized; - mutex_unlock(&kho_out.lock); + struct folio *folio; - return 0; + if (!mem) + return; + + folio = kho_restore_folio(__pa(mem)); + if (!WARN_ON(!folio)) + folio_put(folio); } +EXPORT_SYMBOL_GPL(kho_restore_free); -static int kho_out_finalize_set(void *data, u64 _val) +int kho_finalize(void) { - int ret = 0; - bool val = !!_val; - - mutex_lock(&kho_out.lock); - - if (val == kho_out.finalized) { - if (kho_out.finalized) - ret = -EEXIST; - else - ret = -ENOENT; - goto unlock; - } + int ret; - if (val) - ret = kho_finalize(); - else - ret = kho_abort(); + if (!kho_enable) + return -EOPNOTSUPP; + guard(mutex)(&kho_out.lock); + ret = kho_mem_serialize(&kho_out); if (ret) - goto unlock; - - kho_out.finalized = val; - ret = kho_out_update_debugfs_fdt(); - -unlock: - mutex_unlock(&kho_out.lock); - return ret; -} - -DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, - kho_out_finalize_set, "%llu\n"); + return ret; -static int scratch_phys_show(struct seq_file *m, void *v) -{ - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + kho_out.finalized = true; return 0; } -DEFINE_SHOW_ATTRIBUTE(scratch_phys); -static int scratch_len_show(struct seq_file *m, void *v) +bool kho_finalized(void) { - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].size); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(scratch_len); - -static __init int kho_out_debugfs_init(void) -{ - struct dentry *dir, *f, *sub_fdt_dir; - - dir = debugfs_create_dir("out", debugfs_root); - if (IS_ERR(dir)) - return -ENOMEM; - - sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); - if (IS_ERR(sub_fdt_dir)) - goto err_rmdir; - - f = debugfs_create_file("scratch_phys", 0400, dir, NULL, - &scratch_phys_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("scratch_len", 0400, dir, NULL, - &scratch_len_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("finalize", 0600, dir, NULL, - &fops_kho_out_finalize); - if (IS_ERR(f)) - goto err_rmdir; - - kho_out.dir = dir; - kho_out.ser.sub_fdt_dir = sub_fdt_dir; - return 0; - -err_rmdir: - debugfs_remove_recursive(dir); - return -ENOENT; + guard(mutex)(&kho_out.lock); + return kho_out.finalized; } struct kho_in { - struct dentry *dir; phys_addr_t fdt_phys; phys_addr_t scratch_phys; - struct list_head fdt_list; + struct kho_debugfs dbg; }; static struct kho_in kho_in = { - .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), }; static const void *kho_get_fdt(void) @@ -1339,91 +1320,52 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); -/* Handling for debugfs/kho/in */ - -static __init int kho_in_debugfs_init(const void *fdt) +static __init int kho_out_fdt_setup(void) { - struct dentry *sub_fdt_dir; - int err, child; - - kho_in.dir = debugfs_create_dir("in", debugfs_root); - if (IS_ERR(kho_in.dir)) - return PTR_ERR(kho_in.dir); - - sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); - if (IS_ERR(sub_fdt_dir)) { - err = PTR_ERR(sub_fdt_dir); - goto err_rmdir; - } - - err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); - if (err) - goto err_rmdir; - - fdt_for_each_subnode(child, fdt, 0) { - int len = 0; - const char *name = fdt_get_name(fdt, child, NULL); - const u64 *fdt_phys; 
- - fdt_phys = fdt_getprop(fdt, child, "fdt", &len); - if (!fdt_phys) - continue; - if (len != sizeof(*fdt_phys)) { - pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", - name, len); - continue; - } - err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, - phys_to_virt(*fdt_phys)); - if (err) { - pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, - err); - continue; - } - } + void *root = kho_out.fdt; + u64 empty_mem_map = 0; + int err; - return 0; + err = fdt_create(root, PAGE_SIZE); + err |= fdt_finish_reservemap(root); + err |= fdt_begin_node(root, ""); + err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); + err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + sizeof(empty_mem_map)); + err |= fdt_end_node(root); + err |= fdt_finish(root); -err_rmdir: - debugfs_remove_recursive(kho_in.dir); return err; } static __init int kho_init(void) { - int err = 0; const void *fdt = kho_get_fdt(); + int err = 0; if (!kho_enable) return 0; - kho_out.ser.fdt = alloc_page(GFP_KERNEL); - if (!kho_out.ser.fdt) { - err = -ENOMEM; + kho_out.fdt = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(kho_out.fdt)) { + err = PTR_ERR(kho_out.fdt); goto err_free_scratch; } - debugfs_root = debugfs_create_dir("kho", NULL); - if (IS_ERR(debugfs_root)) { - err = -ENOENT; + err = kho_debugfs_init(); + if (err) goto err_free_fdt; - } - err = kho_out_debugfs_init(); + err = kho_out_debugfs_init(&kho_out.dbg); if (err) goto err_free_fdt; - if (fdt) { - err = kho_in_debugfs_init(fdt); - /* - * Failure to create /sys/kernel/debug/kho/in does not prevent - * reviving state from KHO and setting up KHO for the next - * kexec. - */ - if (err) - pr_err("failed exposing handover FDT in debugfs: %d\n", - err); + err = kho_out_fdt_setup(); + if (err) + goto err_free_fdt; + if (fdt) { + kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; } @@ -1432,17 +1374,29 @@ static __init int kho_init(void) unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; unsigned long pfn; + /* + * When debug_pagealloc is enabled, __free_pages() clears the + * corresponding PRESENT bit in the kernel page table. + * Subsequent kmemleak scans of these pages cause the + * non-PRESENT page faults. + * Mark scratch areas with kmemleak_ignore_phys() to exclude + * them from kmemleak scanning. 
+ */ + kmemleak_ignore_phys(kho_scratch[i].addr); for (pfn = base_pfn; pfn < base_pfn + count; pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); } + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + kho_out.fdt, true)); + return 0; err_free_fdt: - put_page(kho_out.ser.fdt); - kho_out.ser.fdt = NULL; + kho_unpreserve_free(kho_out.fdt); err_free_scratch: + kho_out.fdt = NULL; for (int i = 0; i < kho_scratch_cnt; i++) { void *start = __va(kho_scratch[i].addr); void *end = start + kho_scratch[i].size; @@ -1452,7 +1406,7 @@ err_free_scratch: kho_enable = false; return err; } -late_initcall(kho_init); +fs_initcall(kho_init); static void __init kho_release_scratch(void) { @@ -1480,16 +1434,12 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { - struct folio *folio; - if (kho_in.scratch_phys) { kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); - kho_mem_deserialize(kho_get_fdt()); - folio = kho_restore_folio(kho_in.fdt_phys); - if (!folio) - pr_warn("failed to restore folio for KHO fdt\n"); + if (!kho_mem_deserialize(kho_get_fdt())) + kho_in.fdt_phys = 0; } else { kho_reserve_scratch(); } @@ -1545,8 +1495,8 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, memblock_add(area->addr, size); err = memblock_mark_kho_scratch(area->addr, size); if (WARN_ON(err)) { - pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d", - &area->addr, &size, err); + pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe", + &area->addr, &size, ERR_PTR(err)); goto out; } pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); @@ -1566,7 +1516,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_in.fdt_phys = fdt_phys; kho_in.scratch_phys = scratch_phys; kho_scratch_cnt = scratch_cnt; - pr_info("found kexec handover data. 
Will skip init for some devices\n"); + pr_info("found kexec handover data.\n"); out: if (fdt) @@ -1585,10 +1535,10 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_out.finalized) + if (!kho_enable) return 0; - image->kho.fdt = page_to_phys(kho_out.ser.fdt); + image->kho.fdt = virt_to_phys(kho_out.fdt); scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; scratch = (struct kexec_buf){ diff --git a/kernel/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c index 6efb696f5426..6efb696f5426 100644 --- a/kernel/kexec_handover_debug.c +++ b/kernel/liveupdate/kexec_handover_debug.c diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c new file mode 100644 index 000000000000..2abbf62ba942 --- /dev/null +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debugfs.c - kexec handover debugfs interfaces + * Copyright (C) 2023 Alexander Graf <graf@amazon.com> + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> + * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include "kexec_handover_internal.h" + +static struct dentry *debugfs_root; + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) +{ + struct dentry *dir; + + if (root) + dir = dbg->dir; + else + dir = dbg->sub_fdt_dir; + + return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); +} + +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) +{ + struct fdt_debugfs *ff; + + list_for_each_entry(ff, &dbg->fdt_list, list) { + if (ff->wrapper.data == fdt) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + break; + } + } +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + *val = kho_finalized(); + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 val) +{ + if (val) + return kho_finalize(); + else + return -EINVAL; +} + +DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) +{ + struct dentry *dir, *sub_fdt_dir; + int err, child; + + 
INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto err_out; + } + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node %s prop fdt has invalid length: %d\n", + name, len); + continue; + } + err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt %s to debugfs: %pe\n", name, + ERR_PTR(err)); + continue; + } + } + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + + return; +err_rmdir: + debugfs_remove_recursive(dir); +err_out: + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. + */ + if (err) { + pr_err("failed exposing handover FDT in debugfs: %pe\n", + ERR_PTR(err)); + } +} + +__init int kho_out_debugfs_init(struct kho_debugfs *dbg) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &kho_out_finalize_fops); + if (IS_ERR(f)) + goto err_rmdir; + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +__init int kho_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) + return -ENOENT; + return 0; +} diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h new file mode 100644 index 000000000000..0202c85ad14f --- /dev/null +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include <linux/kexec_handover.h> +#include <linux/list.h> +#include <linux/types.h> + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +#include <linux/debugfs.h> + +struct kho_debugfs { + struct dentry *dir; + struct dentry *sub_fdt_dir; + struct list_head fdt_list; +}; + +#else +struct kho_debugfs {}; +#endif + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +bool kho_finalized(void); +int kho_finalize(void); + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +int kho_debugfs_init(void); +void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); +int kho_out_debugfs_init(struct kho_debugfs *dbg); +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root); +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); +#else +static inline int kho_debugfs_init(void) { return 0; } +static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, + const void *fdt) { } +static 
inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } +static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) { return 0; } +static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, + void *fdt) { } +#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c new file mode 100644 index 000000000000..f7ecaf7740d1 --- /dev/null +++ b/kernel/liveupdate/luo_core.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: Live Update Orchestrator (LUO) + * + * Live Update is a specialized, kexec-based reboot process that allows a + * running kernel to be updated from one version to another while preserving + * the state of selected resources and keeping designated hardware devices + * operational. For these devices, DMA activity may continue throughout the + * kernel transition. + * + * While the primary use case driving this work is supporting live updates of + * the Linux kernel when it is used as a hypervisor in cloud environments, the + * LUO framework itself is designed to be workload-agnostic. Live Update + * facilitates a full kernel version upgrade for any type of system. + * + * For example, a non-hypervisor system running an in-memory cache like + * memcached with many gigabytes of data can use LUO. The userspace service + * can place its cache into a memfd, have its state preserved by LUO, and + * restore it immediately after the kernel kexec. + * + * Whether the system is running virtual machines, containers, a + * high-performance database, or networking services, LUO's primary goal is to + * enable a full kernel update by preserving critical userspace state and + * keeping essential devices operational. + * + * The core of LUO is a mechanism that tracks the progress of a live update, + * along with a callback API that allows other kernel subsystems to participate + * in the process. Example subsystems that can hook into LUO include: kvm, + * iommu, interrupts, vfio, participating filesystems, and memory management. + * + * LUO uses Kexec Handover to transfer memory state from the current kernel to + * the next kernel. For more details see + * Documentation/core-api/kho/concepts.rst. 
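+ *
+ * LUO is opt-in: it is controlled by the "liveupdate" kernel command line
+ * parameter, which is parsed with kstrtobool(), so booting with
+ * liveupdate=on (or liveupdate=1) enables it. If KHO is not enabled, LUO
+ * disables itself during early boot.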
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/kobject.h> +#include <linux/libfdt.h> +#include <linux/liveupdate.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/sizes.h> +#include <linux/string.h> +#include <linux/unaligned.h> + +#include "kexec_handover_internal.h" +#include "luo_internal.h" + +static struct { + bool enabled; + void *fdt_out; + void *fdt_in; + u64 liveupdate_num; +} luo_global; + +static int __init early_liveupdate_param(char *buf) +{ + return kstrtobool(buf, &luo_global.enabled); +} +early_param("liveupdate", early_liveupdate_param); + +static int __init luo_early_startup(void) +{ + phys_addr_t fdt_phys; + int err, ln_size; + const void *ptr; + + if (!kho_is_enabled()) { + if (liveupdate_enabled()) + pr_warn("Disabling liveupdate because KHO is disabled\n"); + luo_global.enabled = false; + return 0; + } + + /* Retrieve LUO subtree, and verify its format. */ + err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys); + if (err) { + if (err != -ENOENT) { + pr_err("failed to retrieve FDT '%s' from KHO: %pe\n", + LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err)); + return err; + } + + return 0; + } + + luo_global.fdt_in = phys_to_virt(fdt_phys); + err = fdt_node_check_compatible(luo_global.fdt_in, 0, + LUO_FDT_COMPATIBLE); + if (err) { + pr_err("FDT '%s' is incompatible with '%s' [%d]\n", + LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err); + + return -EINVAL; + } + + ln_size = 0; + ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM, + &ln_size); + if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) { + pr_err("Unable to get live update number '%s' [%d]\n", + LUO_FDT_LIVEUPDATE_NUM, ln_size); + + return -EINVAL; + } + + luo_global.liveupdate_num = get_unaligned((u64 *)ptr); + pr_info("Retrieved live update data, liveupdate number: %lld\n", + luo_global.liveupdate_num); + + err = luo_session_setup_incoming(luo_global.fdt_in); + if (err) + return err; + + return 0; +} + +static int __init liveupdate_early_init(void) +{ + int err; + + err = luo_early_startup(); + if (err) { + luo_global.enabled = false; + luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n", + ERR_PTR(err)); + } + + return err; +} +early_initcall(liveupdate_early_init); + +/* Called during boot to create outgoing LUO fdt tree */ +static int __init luo_fdt_setup(void) +{ + const u64 ln = luo_global.liveupdate_num + 1; + void *fdt_out; + int err; + + fdt_out = kho_alloc_preserve(LUO_FDT_SIZE); + if (IS_ERR(fdt_out)) { + pr_err("failed to allocate/preserve FDT memory\n"); + return PTR_ERR(fdt_out); + } + + err = fdt_create(fdt_out, LUO_FDT_SIZE); + err |= fdt_finish_reservemap(fdt_out); + err |= fdt_begin_node(fdt_out, ""); + err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln)); + err |= luo_session_setup_outgoing(fdt_out); + err |= fdt_end_node(fdt_out); + err |= fdt_finish(fdt_out); + if (err) + goto exit_free; + + err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out); + if (err) + goto exit_free; + luo_global.fdt_out = fdt_out; + + return 0; + +exit_free: + kho_unpreserve_free(fdt_out); + pr_err("failed to prepare LUO FDT: %d\n", err); + + return err; +} + +/* + * late initcall 
because it initializes the outgoing tree that is needed only + * once userspace starts using /dev/liveupdate. + */ +static int __init luo_late_startup(void) +{ + int err; + + if (!liveupdate_enabled()) + return 0; + + err = luo_fdt_setup(); + if (err) + luo_global.enabled = false; + + return err; +} +late_initcall(luo_late_startup); + +/* Public Functions */ + +/** + * liveupdate_reboot() - Kernel reboot notifier for live update final + * serialization. + * + * This function is invoked directly from the reboot() syscall pathway + * if kexec is in progress. + * + * If any callback fails, this function aborts KHO, undoes the freeze() + * callbacks, and returns an error. + */ +int liveupdate_reboot(void) +{ + int err; + + if (!liveupdate_enabled()) + return 0; + + err = luo_session_serialize(); + if (err) + return err; + + err = kho_finalize(); + if (err) { + pr_err("kho_finalize failed %d\n", err); + /* + * kho_finalize() may return libfdt errors, to aboid passing to + * userspace unknown errors, change this to EAGAIN. + */ + err = -EAGAIN; + } + + return err; +} + +/** + * liveupdate_enabled - Check if the live update feature is enabled. + * + * This function returns the state of the live update feature flag, which + * can be controlled via the ``liveupdate`` kernel command-line parameter. + * + * @return true if live update is enabled, false otherwise. + */ +bool liveupdate_enabled(void) +{ + return luo_global.enabled; +} + +/** + * DOC: LUO ioctl Interface + * + * The IOCTL user-space control interface for the LUO subsystem. + * It registers a character device, typically found at ``/dev/liveupdate``, + * which allows a userspace agent to manage the LUO state machine and its + * associated resources, such as preservable file descriptors. + * + * To ensure that the state machine is controlled by a single entity, access + * to this device is exclusive: only one process is permitted to have + * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will + * fail with -EBUSY until the first process closes its file descriptor. + * This singleton model simplifies state management by preventing conflicting + * commands from multiple userspace agents. 
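+ *
+ * Illustrative userspace sketch (not part of this patch; the exact uapi
+ * layout lives in the liveupdate uapi header, and the leading size member
+ * shown here is an assumption based on how luo_ioctl() reads the first
+ * u32 of the user buffer with get_user() and copy_struct_from_user())::
+ *
+ *	int lu = open("/dev/liveupdate", O_RDWR);	// second open returns -EBUSY
+ *	struct liveupdate_ioctl_create_session create = {
+ *		.size = sizeof(create),			// assumed size field
+ *		.name = "db-cache",
+ *	};
+ *	int session_fd;
+ *
+ *	if (lu < 0 || ioctl(lu, LIVEUPDATE_IOCTL_CREATE_SESSION, &create))
+ *		return -1;
+ *	session_fd = create.fd;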
+ */ + +struct luo_device_state { + struct miscdevice miscdev; + atomic_t in_use; +}; + +static int luo_ioctl_create_session(struct luo_ucmd *ucmd) +{ + struct liveupdate_ioctl_create_session *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + err = luo_session_create(argp->name, &file); + if (err) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd) +{ + struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + err = luo_session_retrieve(argp->name, &file); + if (err < 0) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_open(struct inode *inodep, struct file *filep) +{ + struct luo_device_state *ldev = container_of(filep->private_data, + struct luo_device_state, + miscdev); + + if (atomic_cmpxchg(&ldev->in_use, 0, 1)) + return -EBUSY; + + /* Always return -EIO to user if deserialization fail */ + if (luo_session_deserialize()) { + atomic_set(&ldev->in_use, 0); + return -EIO; + } + + return 0; +} + +static int luo_release(struct inode *inodep, struct file *filep) +{ + struct luo_device_state *ldev = container_of(filep->private_data, + struct luo_device_state, + miscdev); + atomic_set(&ldev->in_use, 0); + + return 0; +} + +union ucmd_buffer { + struct liveupdate_ioctl_create_session create; + struct liveupdate_ioctl_retrieve_session retrieve; +}; + +struct luo_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct luo_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } + +static const struct luo_ioctl_op luo_ioctl_ops[] = { + IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session, + struct liveupdate_ioctl_create_session, name), + IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session, + struct liveupdate_ioctl_retrieve_session, name), +}; + +static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + const struct luo_ioctl_op *op; + struct luo_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int err; + + nr = _IOC_NR(cmd); + if (nr < LIVEUPDATE_CMD_BASE || + (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) { + return -EINVAL; + } + + ucmd.ubuffer = (void __user *)arg; + err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (err) + return err; + + op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (err) + return err; + + return op->execute(&ucmd); +} + +static const struct file_operations luo_fops = { + .owner = THIS_MODULE, + .open = 
luo_open, + .release = luo_release, + .unlocked_ioctl = luo_ioctl, +}; + +static struct luo_device_state luo_dev = { + .miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "liveupdate", + .fops = &luo_fops, + }, + .in_use = ATOMIC_INIT(0), +}; + +static int __init liveupdate_ioctl_init(void) +{ + if (!liveupdate_enabled()) + return 0; + + return misc_register(&luo_dev.miscdev); +} +late_initcall(liveupdate_ioctl_init); diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c new file mode 100644 index 000000000000..ddff87917b21 --- /dev/null +++ b/kernel/liveupdate/luo_file.c @@ -0,0 +1,889 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: LUO File Descriptors + * + * LUO provides the infrastructure to preserve specific, stateful file + * descriptors across a kexec-based live update. The primary goal is to allow + * workloads, such as virtual machines using vfio, memfd, or iommufd, to + * retain access to their essential resources without interruption. + * + * The framework is built around a callback-based handler model and a well- + * defined lifecycle for each preserved file. + * + * Handler Registration: + * Kernel modules responsible for a specific file type (e.g., memfd, vfio) + * register a &struct liveupdate_file_handler. This handler provides a set of + * callbacks that LUO invokes at different stages of the update process, most + * notably: + * + * - can_preserve(): A lightweight check to determine if the handler is + * compatible with a given 'struct file'. + * - preserve(): The heavyweight operation that saves the file's state and + * returns an opaque u64 handle. This is typically performed while the + * workload is still active to minimize the downtime during the + * actual reboot transition. + * - unpreserve(): Cleans up any resources allocated by .preserve(), called + * if the preservation process is aborted before the reboot (i.e. session is + * closed). + * - freeze(): A final pre-reboot opportunity to prepare the state for kexec. + * We are already in reboot syscall, and therefore userspace cannot mutate + * the file anymore. + * - unfreeze(): Undoes the actions of .freeze(), called if the live update + * is aborted after the freeze phase. + * - retrieve(): Reconstructs the file in the new kernel from the preserved + * handle. + * - finish(): Performs final check and cleanup in the new kernel. After + * succesul finish call, LUO gives up ownership to this file. + * + * File Preservation Lifecycle happy path: + * + * 1. Preserve (Normal Operation): A userspace agent preserves files one by one + * via an ioctl. For each file, luo_preserve_file() finds a compatible + * handler, calls its .preserve() operation, and creates an internal &struct + * luo_file to track the live state. + * + * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called. + * It iterates through all preserved files, calls their respective .freeze() + * operation, and serializes their final metadata (compatible string, token, + * and data handle) into a contiguous memory block for KHO. + * + * 3. Deserialize: After kexec, luo_file_deserialize() runs when session gets + * deserialized (which is when /dev/liveupdate is first opened). It reads the + * serialized data from the KHO memory region and reconstructs the in-memory + * list of &struct luo_file instances for the new kernel, linking them to + * their corresponding handlers. + * + * 4. 
Retrieve (New Kernel - Userspace Ready): The userspace agent can now + * restore file descriptors by providing a token. luo_retrieve_file() + * searches for the matching token, calls the handler's .retrieve() op to + * re-create the 'struct file', and returns a new FD. Files can be + * retrieved in ANY order. + * + * 5. Finish (New Kernel - Cleanup): Once a session retrival is complete, + * luo_file_finish() is called. It iterates through all files, invokes their + * .finish() operations for final cleanup, and releases all associated kernel + * resources. + * + * File Preservation Lifecycle unhappy paths: + * + * 1. Abort Before Reboot: If the userspace agent aborts the live update + * process before calling reboot (e.g., by closing the session file + * descriptor), the session's release handler calls + * luo_file_unpreserve_files(). This invokes the .unpreserve() callback on + * all preserved files, ensuring all allocated resources are cleaned up and + * returning the system to a clean state. + * + * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze() + * op fails, the .unfreeze() op is invoked on all previously *successful* + * freezes to roll back their state. The reboot() syscall then returns an + * error to userspace, canceling the live update. + * + * 3. Finish Failure: In the new kernel, if a handler's .finish() op fails, + * the luo_file_finish() operation is aborted. LUO retains ownership of + * all files within that session, including those that were not yet + * processed. The userspace agent can attempt to call the finish operation + * again later. If the issue cannot be resolved, these resources will be held + * by LUO until the next live update cycle, at which point they will be + * discarded. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cleanup.h> +#include <linux/compiler.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/liveupdate.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/string.h> +#include "luo_internal.h" + +static LIST_HEAD(luo_file_handler_list); + +/* 2 4K pages, give space for 128 files per file_set */ +#define LUO_FILE_PGCNT 2ul +#define LUO_FILE_MAX \ + ((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser)) + +/** + * struct luo_file - Represents a single preserved file instance. + * @fh: Pointer to the &struct liveupdate_file_handler that manages + * this type of file. + * @file: Pointer to the kernel's &struct file that is being preserved. + * This is NULL in the new kernel until the file is successfully + * retrieved. + * @serialized_data: The opaque u64 handle to the serialized state of the file. + * This handle is passed back to the handler's .freeze(), + * .retrieve(), and .finish() callbacks, allowing it to track + * and update its serialized state across phases. + * @private_data: Pointer to the private data for the file used to hold runtime + * state that is not preserved. Set by the handler's .preserve() + * callback, and must be freed in the handler's .unpreserve() + * callback. + * @retrieved: A flag indicating whether a user/kernel in the new kernel has + * successfully called retrieve() on this file. This prevents + * multiple retrieval attempts. 
+ * @mutex: A mutex that protects the fields of this specific instance + * (e.g., @retrieved, @file), ensuring that operations like + * retrieving or finishing a file are atomic. + * @list: The list_head linking this instance into its parent + * file_set's list of preserved files. + * @token: The user-provided unique token used to identify this file. + * + * This structure is the core in-kernel representation of a single file being + * managed through a live update. An instance is created by luo_preserve_file() + * to link a 'struct file' to its corresponding handler, a user-provided token, + * and the serialized state handle returned by the handler's .preserve() + * operation. + * + * These instances are tracked in a per-file_set list. The @serialized_data + * field, which holds a handle to the file's serialized state, may be updated + * during the .freeze() callback before being serialized for the next kernel. + * After reboot, these structures are recreated by luo_file_deserialize() and + * are finally cleaned up by luo_file_finish(). + */ +struct luo_file { + struct liveupdate_file_handler *fh; + struct file *file; + u64 serialized_data; + void *private_data; + bool retrieved; + struct mutex mutex; + struct list_head list; + u64 token; +}; + +static int luo_alloc_files_mem(struct luo_file_set *file_set) +{ + size_t size; + void *mem; + + if (file_set->files) + return 0; + + WARN_ON_ONCE(file_set->count); + + size = LUO_FILE_PGCNT << PAGE_SHIFT; + mem = kho_alloc_preserve(size); + if (IS_ERR(mem)) + return PTR_ERR(mem); + + file_set->files = mem; + + return 0; +} + +static void luo_free_files_mem(struct luo_file_set *file_set) +{ + /* If file_set has files, no need to free preservation memory */ + if (file_set->count) + return; + + if (!file_set->files) + return; + + kho_unpreserve_free(file_set->files); + file_set->files = NULL; +} + +static bool luo_token_is_used(struct luo_file_set *file_set, u64 token) +{ + struct luo_file *iter; + + list_for_each_entry(iter, &file_set->files_list, list) { + if (iter->token == token) + return true; + } + + return false; +} + +/** + * luo_preserve_file - Initiate the preservation of a file descriptor. + * @file_set: The file_set to which the preserved file will be added. + * @token: A unique, user-provided identifier for the file. + * @fd: The file descriptor to be preserved. + * + * This function orchestrates the first phase of preserving a file. Upon entry, + * it takes a reference to the 'struct file' via fget(), effectively making LUO + * a co-owner of the file. This reference is held until the file is either + * unpreserved or successfully finished in the next kernel, preventing the file + * from being prematurely destroyed. + * + * This function orchestrates the first phase of preserving a file. It performs + * the following steps: + * + * 1. Validates that the @token is not already in use within the file_set. + * 2. Ensures the file_set's memory for files serialization is allocated + * (allocates if needed). + * 3. Iterates through registered handlers, calling can_preserve() to find one + * compatible with the given @fd. + * 4. Calls the handler's .preserve() operation, which saves the file's state + * and returns an opaque private data handle. + * 5. Adds the new instance to the file_set's internal list. + * + * On success, LUO takes a reference to the 'struct file' and considers it + * under its management until it is unpreserved or finished. 
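+ *
+ * Call-site sketch (this mirrors the session ioctl path implemented by
+ * luo_session_preserve_fd() in luo_session.c)::
+ *
+ *	guard(mutex)(&session->mutex);
+ *	err = luo_preserve_file(&session->file_set, argp->token, argp->fd);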
+ * + * In case of any failure, all intermediate allocations (file reference, memory + * for the 'luo_file' struct, etc.) are cleaned up before returning an error. + * + * Context: Can be called from an ioctl handler during normal system operation. + * Return: 0 on success. Returns a negative errno on failure: + * -EEXIST if the token is already used. + * -EBADF if the file descriptor is invalid. + * -ENOSPC if the file_set is full. + * -ENOENT if no compatible handler is found. + * -ENOMEM on memory allocation failure. + * Other erros might be returned by .preserve(). + */ +int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) +{ + struct liveupdate_file_op_args args = {0}; + struct liveupdate_file_handler *fh; + struct luo_file *luo_file; + struct file *file; + int err; + + if (luo_token_is_used(file_set, token)) + return -EEXIST; + + if (file_set->count == LUO_FILE_MAX) + return -ENOSPC; + + file = fget(fd); + if (!file) + return -EBADF; + + err = luo_alloc_files_mem(file_set); + if (err) + goto err_fput; + + err = -ENOENT; + luo_list_for_each_private(fh, &luo_file_handler_list, list) { + if (fh->ops->can_preserve(fh, file)) { + err = 0; + break; + } + } + + /* err is still -ENOENT if no handler was found */ + if (err) + goto err_free_files_mem; + + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); + if (!luo_file) { + err = -ENOMEM; + goto err_free_files_mem; + } + + luo_file->file = file; + luo_file->fh = fh; + luo_file->token = token; + luo_file->retrieved = false; + mutex_init(&luo_file->mutex); + + args.handler = fh; + args.file = file; + err = fh->ops->preserve(&args); + if (err) + goto err_kfree; + + luo_file->serialized_data = args.serialized_data; + luo_file->private_data = args.private_data; + list_add_tail(&luo_file->list, &file_set->files_list); + file_set->count++; + + return 0; + +err_kfree: + kfree(luo_file); +err_free_files_mem: + luo_free_files_mem(file_set); +err_fput: + fput(file); + + return err; +} + +/** + * luo_file_unpreserve_files - Unpreserves all files from a file_set. + * @file_set: The files to be cleaned up. + * + * This function serves as the primary cleanup path for a file_set. It is + * invoked when the userspace agent closes the file_set's file descriptor. + * + * For each file, it performs the following cleanup actions: + * 1. Calls the handler's .unpreserve() callback to allow the handler to + * release any resources it allocated. + * 2. Removes the file from the file_set's internal tracking list. + * 3. Releases the reference to the 'struct file' that was taken by + * luo_preserve_file() via fput(), returning ownership. + * 4. Frees the memory associated with the internal 'struct luo_file'. + * + * After all individual files are unpreserved, it frees the contiguous memory + * block that was allocated to hold their serialization data. 
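+ *
+ * Call-site sketch (mirrors the session release path, see
+ * luo_session_release() in luo_session.c)::
+ *
+ *	scoped_guard(mutex, &session->mutex)
+ *		luo_file_unpreserve_files(&session->file_set);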
+ */ +void luo_file_unpreserve_files(struct luo_file_set *file_set) +{ + struct luo_file *luo_file; + + while (!list_empty(&file_set->files_list)) { + struct liveupdate_file_op_args args = {0}; + + luo_file = list_last_entry(&file_set->files_list, + struct luo_file, list); + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + luo_file->fh->ops->unpreserve(&args); + + list_del(&luo_file->list); + file_set->count--; + + fput(luo_file->file); + mutex_destroy(&luo_file->mutex); + kfree(luo_file); + } + + luo_free_files_mem(file_set); +} + +static int luo_file_freeze_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + int err = 0; + + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->freeze) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + + err = luo_file->fh->ops->freeze(&args); + if (!err) + luo_file->serialized_data = args.serialized_data; + } + + return err; +} + +static void luo_file_unfreeze_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->unfreeze) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.private_data = luo_file->private_data; + + luo_file->fh->ops->unfreeze(&args); + } + + luo_file->serialized_data = 0; +} + +static void __luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file *failed_entry) +{ + struct list_head *files_list = &file_set->files_list; + struct luo_file *luo_file; + + list_for_each_entry(luo_file, files_list, list) { + if (luo_file == failed_entry) + break; + + luo_file_unfreeze_one(file_set, luo_file); + } + + memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT); +} + +/** + * luo_file_freeze - Freezes all preserved files and serializes their metadata. + * @file_set: The file_set whose files are to be frozen. + * @file_set_ser: Where to put the serialized file_set. + * + * This function is called from the reboot() syscall path, just before the + * kernel transitions to the new image via kexec. Its purpose is to perform the + * final preparation and serialization of all preserved files in the file_set. + * + * It iterates through each preserved file in FIFO order (the order of + * preservation) and performs two main actions: + * + * 1. Freezes the File: It calls the handler's .freeze() callback for each + * file. This gives the handler a final opportunity to quiesce the device or + * prepare its state for the upcoming reboot. The handler may update its + * private data handle during this step. + * + * 2. Serializes Metadata: After a successful freeze, it copies the final file + * metadata—the handler's compatible string, the user token, and the final + * private data handle—into the pre-allocated contiguous memory buffer + * (file_set->files) that will be handed over to the next kernel via KHO. + * + * Error Handling (Rollback): + * This function is atomic. If any handler's .freeze() operation fails, the + * entire live update is aborted. The __luo_file_unfreeze() helper is + * immediately called to invoke the .unfreeze() op on all files that were + * successfully frozen before the point of failure, rolling them back to a + * running state. 
The function then returns an error, causing the reboot() + * syscall to fail. + * + * Context: Called only from the liveupdate_reboot() path. + * Return: 0 on success, or a negative errno on failure. + */ +int luo_file_freeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + struct luo_file_ser *file_ser = file_set->files; + struct luo_file *luo_file; + int err; + int i; + + if (!file_set->count) + return 0; + + if (WARN_ON(!file_ser)) + return -EINVAL; + + i = 0; + list_for_each_entry(luo_file, &file_set->files_list, list) { + err = luo_file_freeze_one(file_set, luo_file); + if (err < 0) { + pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n", + luo_file->token, luo_file->fh->compatible, + ERR_PTR(err)); + goto err_unfreeze; + } + + strscpy(file_ser[i].compatible, luo_file->fh->compatible, + sizeof(file_ser[i].compatible)); + file_ser[i].data = luo_file->serialized_data; + file_ser[i].token = luo_file->token; + i++; + } + + file_set_ser->count = file_set->count; + if (file_set->files) + file_set_ser->files = virt_to_phys(file_set->files); + + return 0; + +err_unfreeze: + __luo_file_unfreeze(file_set, luo_file); + + return err; +} + +/** + * luo_file_unfreeze - Unfreezes all files in a file_set and clear serialization + * @file_set: The file_set whose files are to be unfrozen. + * @file_set_ser: Serialized file_set. + * + * This function rolls back the state of all files in a file_set after the + * freeze phase has begun but must be aborted. It is the counterpart to + * luo_file_freeze(). + * + * It invokes the __luo_file_unfreeze() helper with a NULL argument, which + * signals the helper to iterate through all files in the file_set and call + * their respective .unfreeze() handler callbacks. + * + * Context: This is called when the live update is aborted during + * the reboot() syscall, after luo_file_freeze() has been called. + */ +void luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + if (!file_set->count) + return; + + __luo_file_unfreeze(file_set, NULL); + memset(file_set_ser, 0, sizeof(*file_set_ser)); +} + +/** + * luo_retrieve_file - Restores a preserved file from a file_set by its token. + * @file_set: The file_set from which to retrieve the file. + * @token: The unique token identifying the file to be restored. + * @filep: Output parameter; on success, this is populated with a pointer + * to the newly retrieved 'struct file'. + * + * This function is the primary mechanism for recreating a file in the new + * kernel after a live update. It searches the file_set's list of deserialized + * files for an entry matching the provided @token. + * + * The operation is idempotent: if a file has already been successfully + * retrieved, this function will simply return a pointer to the existing + * 'struct file' and report success without re-executing the retrieve + * operation. This is handled by checking the 'retrieved' flag under a lock. + * + * File retrieval can happen in any order; it is not bound by the order of + * preservation. + * + * Context: Can be called from an ioctl or other in-kernel code in the new + * kernel. + * Return: 0 on success. Returns a negative errno on failure: + * -ENOENT if no file with the matching token is found. + * Any error code returned by the handler's .retrieve() op. 
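+ *
+ * Call-site sketch (mirrors luo_session_retrieve_fd(); the returned file
+ * already carries a reference for the caller, which fd_install() consumes)::
+ *
+ *	struct file *file;
+ *	int err;
+ *
+ *	err = luo_retrieve_file(&session->file_set, token, &file);
+ *	if (!err)
+ *		fd_install(fd, file);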
+ */ +int luo_retrieve_file(struct luo_file_set *file_set, u64 token, + struct file **filep) +{ + struct liveupdate_file_op_args args = {0}; + struct luo_file *luo_file; + int err; + + if (list_empty(&file_set->files_list)) + return -ENOENT; + + list_for_each_entry(luo_file, &file_set->files_list, list) { + if (luo_file->token == token) + break; + } + + if (luo_file->token != token) + return -ENOENT; + + guard(mutex)(&luo_file->mutex); + if (luo_file->retrieved) { + /* + * Someone is asking for this file again, so get a reference + * for them. + */ + get_file(luo_file->file); + *filep = luo_file->file; + return 0; + } + + args.handler = luo_file->fh; + args.serialized_data = luo_file->serialized_data; + err = luo_file->fh->ops->retrieve(&args); + if (!err) { + luo_file->file = args.file; + + /* Get reference so we can keep this file in LUO until finish */ + get_file(luo_file->file); + *filep = luo_file->file; + luo_file->retrieved = true; + } + + return err; +} + +static int luo_file_can_finish_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + bool can_finish = true; + + guard(mutex)(&luo_file->mutex); + + if (luo_file->fh->ops->can_finish) { + struct liveupdate_file_op_args args = {0}; + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.retrieved = luo_file->retrieved; + can_finish = luo_file->fh->ops->can_finish(&args); + } + + return can_finish ? 0 : -EBUSY; +} + +static void luo_file_finish_one(struct luo_file_set *file_set, + struct luo_file *luo_file) +{ + struct liveupdate_file_op_args args = {0}; + + guard(mutex)(&luo_file->mutex); + + args.handler = luo_file->fh; + args.file = luo_file->file; + args.serialized_data = luo_file->serialized_data; + args.retrieved = luo_file->retrieved; + + luo_file->fh->ops->finish(&args); +} + +/** + * luo_file_finish - Completes the lifecycle for all files in a file_set. + * @file_set: The file_set to be finalized. + * + * This function orchestrates the final teardown of a live update file_set in + * the new kernel. It should be called after all necessary files have been + * retrieved and the userspace agent is ready to release the preserved state. + * + * The function iterates through all tracked files. For each file, it performs + * the following sequence of cleanup actions: + * + * 1. If file is not yet retrieved, retrieves it, and calls can_finish() on + * every file in the file_set. If all can_finish return true, continue to + * finish. + * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to + * allow for final resource cleanup within the handler. + * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This + * is the counterpart to the get_file() call in luo_retrieve_file(). + * 4. Removes the 'struct luo_file' from the file_set's internal list. + * 5. Frees the memory for the 'struct luo_file' instance itself. + * + * After successfully finishing all individual files, it frees the + * contiguous memory block that was used to transfer the serialized metadata + * from the previous kernel. + * + * Error Handling (Atomic Failure): + * This operation is atomic. If any handler's .can_finish() op fails, the entire + * function aborts immediately and returns an error. + * + * Context: Can be called from an ioctl handler in the new kernel. + * Return: 0 on success, or a negative errno on failure. 
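+ *
+ * Call-site sketch (mirrors luo_session_finish_one(), which is invoked
+ * both from the session finish ioctl and from session release)::
+ *
+ *	guard(mutex)(&session->mutex);
+ *	return luo_file_finish(&session->file_set);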
+ */ +int luo_file_finish(struct luo_file_set *file_set) +{ + struct list_head *files_list = &file_set->files_list; + struct luo_file *luo_file; + int err; + + if (!file_set->count) + return 0; + + list_for_each_entry(luo_file, files_list, list) { + err = luo_file_can_finish_one(file_set, luo_file); + if (err) + return err; + } + + while (!list_empty(&file_set->files_list)) { + luo_file = list_last_entry(&file_set->files_list, + struct luo_file, list); + + luo_file_finish_one(file_set, luo_file); + + if (luo_file->file) + fput(luo_file->file); + list_del(&luo_file->list); + file_set->count--; + mutex_destroy(&luo_file->mutex); + kfree(luo_file); + } + + if (file_set->files) { + kho_restore_free(file_set->files); + file_set->files = NULL; + } + + return 0; +} + +/** + * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel. + * @file_set: The incoming file_set to fill with deserialized data. + * @file_set_ser: Serialized KHO file_set data from the previous kernel. + * + * This function is called during the early boot process of the new kernel. It + * takes the raw, contiguous memory block of 'struct luo_file_ser' entries, + * provided by the previous kernel, and transforms it back into a live, + * in-memory linked list of 'struct luo_file' instances. + * + * For each serialized entry, it performs the following steps: + * 1. Reads the 'compatible' string. + * 2. Searches the global list of registered file handlers for one that + * matches the compatible string. + * 3. Allocates a new 'struct luo_file'. + * 4. Populates the new structure with the deserialized data (token, private + * data handle) and links it to the found handler. The 'file' pointer is + * initialized to NULL, as the file has not been retrieved yet. + * 5. Adds the new 'struct luo_file' to the file_set's files_list. + * + * This prepares the file_set for userspace, which can later call + * luo_retrieve_file() to restore the actual file descriptors. + * + * Context: Called from session deserialization. + */ +int luo_file_deserialize(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser) +{ + struct luo_file_ser *file_ser; + u64 i; + + if (!file_set_ser->files) { + WARN_ON(file_set_ser->count); + return 0; + } + + file_set->count = file_set_ser->count; + file_set->files = phys_to_virt(file_set_ser->files); + + /* + * Note on error handling: + * + * If deserialization fails (e.g., allocation failure or corrupt data), + * we intentionally skip cleanup of files that were already restored. + * + * A partial failure leaves the preserved state inconsistent. + * Implementing a safe "undo" to unwind complex dependencies (sessions, + * files, hardware state) is error-prone and provides little value, as + * the system is effectively in a broken state. + * + * We treat these resources as leaked. The expected recovery path is for + * userspace to detect the failure and trigger a reboot, which will + * reliably reset devices and reclaim memory. 
+ */ + file_ser = file_set->files; + for (i = 0; i < file_set->count; i++) { + struct liveupdate_file_handler *fh; + bool handler_found = false; + struct luo_file *luo_file; + + luo_list_for_each_private(fh, &luo_file_handler_list, list) { + if (!strcmp(fh->compatible, file_ser[i].compatible)) { + handler_found = true; + break; + } + } + + if (!handler_found) { + pr_warn("No registered handler for compatible '%s'\n", + file_ser[i].compatible); + return -ENOENT; + } + + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); + if (!luo_file) + return -ENOMEM; + + luo_file->fh = fh; + luo_file->file = NULL; + luo_file->serialized_data = file_ser[i].data; + luo_file->token = file_ser[i].token; + luo_file->retrieved = false; + mutex_init(&luo_file->mutex); + list_add_tail(&luo_file->list, &file_set->files_list); + } + + return 0; +} + +void luo_file_set_init(struct luo_file_set *file_set) +{ + INIT_LIST_HEAD(&file_set->files_list); +} + +void luo_file_set_destroy(struct luo_file_set *file_set) +{ + WARN_ON(file_set->count); + WARN_ON(!list_empty(&file_set->files_list)); +} + +/** + * liveupdate_register_file_handler - Register a file handler with LUO. + * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler. + * The caller must initialize this structure, including a unique + * 'compatible' string and a valid 'fh' callbacks. This function adds the + * handler to the global list of supported file handlers. + * + * Context: Typically called during module initialization for file types that + * support live update preservation. + * + * Return: 0 on success. Negative errno on failure. + */ +int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) +{ + struct liveupdate_file_handler *fh_iter; + int err; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + /* Sanity check that all required callbacks are set */ + if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve || + !fh->ops->finish || !fh->ops->can_preserve) { + return -EINVAL; + } + + /* + * Ensure the system is quiescent (no active sessions). + * This prevents registering new handlers while sessions are active or + * while deserialization is in progress. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Check for duplicate compatible strings */ + luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) { + if (!strcmp(fh_iter->compatible, fh->compatible)) { + pr_err("File handler registration failed: Compatible string '%s' already registered.\n", + fh->compatible); + err = -EEXIST; + goto err_resume; + } + } + + /* Pin the module implementing the handler */ + if (!try_module_get(fh->ops->owner)) { + err = -EAGAIN; + goto err_resume; + } + + INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list)); + list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_unregister_file_handler - Unregister a liveupdate file handler + * @fh: The file handler to unregister + * + * Unregisters the file handler from the liveupdate core. This function + * reverses the operations of liveupdate_register_file_handler(). + * + * It ensures safe removal by checking that: + * No live update session is currently in progress. + * + * If the unregistration fails, the internal test state is reverted. + * + * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live + * update is in progress, can't quiesce live update. 
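+ *
+ * Registration/unregistration sketch (illustrative only: the handler and
+ * callback names are hypothetical, and the type name of the ops structure
+ * is assumed; see the liveupdate header for the real definitions)::
+ *
+ *	static const struct liveupdate_file_ops memfd_luo_ops = {
+ *		.owner		= THIS_MODULE,
+ *		.can_preserve	= memfd_luo_can_preserve,
+ *		.preserve	= memfd_luo_preserve,
+ *		.unpreserve	= memfd_luo_unpreserve,
+ *		.retrieve	= memfd_luo_retrieve,
+ *		.finish		= memfd_luo_finish,
+ *	};
+ *
+ *	static struct liveupdate_file_handler memfd_luo_handler = {
+ *		.compatible	= "memfd-v1",
+ *		.ops		= &memfd_luo_ops,
+ *	};
+ *
+ *	err = liveupdate_register_file_handler(&memfd_luo_handler);
+ *	...
+ *	liveupdate_unregister_file_handler(&memfd_luo_handler);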
+ */ +int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +{ + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (!luo_session_quiesce()) + return -EBUSY; + + list_del(&ACCESS_PRIVATE(fh, list)); + module_put(fh->ops->owner); + luo_session_resume(); + + return 0; +} diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h new file mode 100644 index 000000000000..c8973b543d1d --- /dev/null +++ b/kernel/liveupdate/luo_internal.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#ifndef _LINUX_LUO_INTERNAL_H +#define _LINUX_LUO_INTERNAL_H + +#include <linux/liveupdate.h> +#include <linux/uaccess.h> + +struct luo_ucmd { + void __user *ubuffer; + u32 user_size; + void *cmd; +}; + +static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, + size_t kernel_cmd_size) +{ + /* + * Copy the minimum of what the user provided and what we actually + * have. + */ + if (copy_to_user(ucmd->ubuffer, ucmd->cmd, + min_t(size_t, ucmd->user_size, kernel_cmd_size))) { + return -EFAULT; + } + return 0; +} + +/* + * Handles a deserialization failure: devices and memory is in unpredictable + * state. + * + * Continuing the boot process after a failure is dangerous because it could + * lead to leaks of private data. + */ +#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) + +/* Mimics list_for_each_entry() but for private list head entries */ +#define luo_list_for_each_private(pos, head, member) \ + for (struct list_head *__iter = (head)->next; \ + __iter != (head) && \ + ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \ + __iter = __iter->next) + +/** + * struct luo_file_set - A set of files that belong to the same sessions. + * @files_list: An ordered list of files associated with this session, it is + * ordered by preservation time. + * @files: The physically contiguous memory block that holds the serialized + * state of files. + * @count: A counter tracking the number of files currently stored in the + * @files_list for this session. + */ +struct luo_file_set { + struct list_head files_list; + struct luo_file_ser *files; + long count; +}; + +/** + * struct luo_session - Represents an active or incoming Live Update session. + * @name: A unique name for this session, used for identification and + * retrieval. + * @ser: Pointer to the serialized data for this session. + * @list: A list_head member used to link this session into a global list + * of either outgoing (to be preserved) or incoming (restored from + * previous kernel) sessions. + * @retrieved: A boolean flag indicating whether this session has been + * retrieved by a consumer in the new kernel. + * @file_set: A set of files that belong to this session. + * @mutex: protects fields in the luo_session. 
+ */ +struct luo_session { + char name[LIVEUPDATE_SESSION_NAME_LENGTH]; + struct luo_session_ser *ser; + struct list_head list; + bool retrieved; + struct luo_file_set file_set; + struct mutex mutex; +}; + +int luo_session_create(const char *name, struct file **filep); +int luo_session_retrieve(const char *name, struct file **filep); +int __init luo_session_setup_outgoing(void *fdt); +int __init luo_session_setup_incoming(void *fdt); +int luo_session_serialize(void); +int luo_session_deserialize(void); +bool luo_session_quiesce(void); +void luo_session_resume(void); + +int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd); +void luo_file_unpreserve_files(struct luo_file_set *file_set); +int luo_file_freeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +void luo_file_unfreeze(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +int luo_retrieve_file(struct luo_file_set *file_set, u64 token, + struct file **filep); +int luo_file_finish(struct luo_file_set *file_set); +int luo_file_deserialize(struct luo_file_set *file_set, + struct luo_file_set_ser *file_set_ser); +void luo_file_set_init(struct luo_file_set *file_set); +void luo_file_set_destroy(struct luo_file_set *file_set); + +#endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c new file mode 100644 index 000000000000..dbdbc3bd7929 --- /dev/null +++ b/kernel/liveupdate/luo_session.c @@ -0,0 +1,646 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: LUO Sessions + * + * LUO Sessions provide the core mechanism for grouping and managing `struct + * file *` instances that need to be preserved across a kexec-based live + * update. Each session acts as a named container for a set of file objects, + * allowing a userspace agent to manage the lifecycle of resources critical to a + * workload. + * + * Core Concepts: + * + * - Named Containers: Sessions are identified by a unique, user-provided name, + * which is used for both creation in the current kernel and retrieval in the + * next kernel. + * + * - Userspace Interface: Session management is driven from userspace via + * ioctls on /dev/liveupdate. + * + * - Serialization: Session metadata is preserved using the KHO framework. When + * a live update is triggered via kexec, an array of `struct luo_session_ser` + * is populated and placed in a preserved memory region. An FDT node is also + * created, containing the count of sessions and the physical address of this + * array. + * + * Session Lifecycle: + * + * 1. Creation: A userspace agent calls `luo_session_create()` to create a + * new, empty session and receives a file descriptor for it. + * + * 2. Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is + * made, `luo_session_serialize()` is called. It iterates through all + * active sessions and writes their metadata into a memory area preserved + * by KHO. + * + * 3. Deserialization (in new kernel): After kexec, `luo_session_deserialize()` + * runs, reading the serialized data and creating a list of `struct + * luo_session` objects representing the preserved sessions. + * + * 4. Retrieval: A userspace agent in the new kernel can then call + * `luo_session_retrieve()` with a session name to get a new file + * descriptor and access the preserved state. 
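+ *
+ * Illustrative userspace flow (sketch only; the session ioctl macro names
+ * and the leading size members are assumptions here, the authoritative
+ * definitions live in the liveupdate uapi header). Before kexec::
+ *
+ *	struct liveupdate_session_preserve_fd p = {
+ *		.size = sizeof(p), .token = 0x1234, .fd = memfd,
+ *	};
+ *	ioctl(session_fd, LIVEUPDATE_SESSION_PRESERVE_FD, &p);
+ *	reboot(LINUX_REBOOT_CMD_KEXEC);
+ *
+ * After kexec, in the new kernel::
+ *
+ *	struct liveupdate_session_retrieve_fd r = {
+ *		.size = sizeof(r), .token = 0x1234,
+ *	};
+ *	ioctl(session_fd, LIVEUPDATE_SESSION_RETRIEVE_FD, &r);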
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/anon_inodes.h> +#include <linux/cleanup.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/libfdt.h> +#include <linux/list.h> +#include <linux/liveupdate.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/unaligned.h> +#include <uapi/linux/liveupdate.h> +#include "luo_internal.h" + +/* 16 4K pages, give space for 744 sessions */ +#define LUO_SESSION_PGCNT 16ul +#define LUO_SESSION_MAX (((LUO_SESSION_PGCNT << PAGE_SHIFT) - \ + sizeof(struct luo_session_header_ser)) / \ + sizeof(struct luo_session_ser)) + +/** + * struct luo_session_header - Header struct for managing LUO sessions. + * @count: The number of sessions currently tracked in the @list. + * @list: The head of the linked list of `struct luo_session` instances. + * @rwsem: A read-write semaphore providing synchronized access to the + * session list and other fields in this structure. + * @header_ser: The header data of serialization array. + * @ser: The serialized session data (an array of + * `struct luo_session_ser`). + * @active: Set to true when first initialized. If previous kernel did not + * send session data, active stays false for incoming. + */ +struct luo_session_header { + long count; + struct list_head list; + struct rw_semaphore rwsem; + struct luo_session_header_ser *header_ser; + struct luo_session_ser *ser; + bool active; +}; + +/** + * struct luo_session_global - Global container for managing LUO sessions. + * @incoming: The sessions passed from the previous kernel. + * @outgoing: The sessions that are going to be passed to the next kernel. + */ +struct luo_session_global { + struct luo_session_header incoming; + struct luo_session_header outgoing; +}; + +static struct luo_session_global luo_session_global = { + .incoming = { + .list = LIST_HEAD_INIT(luo_session_global.incoming.list), + .rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem), + }, + .outgoing = { + .list = LIST_HEAD_INIT(luo_session_global.outgoing.list), + .rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem), + }, +}; + +static struct luo_session *luo_session_alloc(const char *name) +{ + struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL); + + if (!session) + return ERR_PTR(-ENOMEM); + + strscpy(session->name, name, sizeof(session->name)); + INIT_LIST_HEAD(&session->file_set.files_list); + luo_file_set_init(&session->file_set); + INIT_LIST_HEAD(&session->list); + mutex_init(&session->mutex); + + return session; +} + +static void luo_session_free(struct luo_session *session) +{ + luo_file_set_destroy(&session->file_set); + mutex_destroy(&session->mutex); + kfree(session); +} + +static int luo_session_insert(struct luo_session_header *sh, + struct luo_session *session) +{ + struct luo_session *it; + + guard(rwsem_write)(&sh->rwsem); + + /* + * For outgoing we should make sure there is room in serialization array + * for new session. + */ + if (sh == &luo_session_global.outgoing) { + if (sh->count == LUO_SESSION_MAX) + return -ENOMEM; + } + + /* + * For small number of sessions this loop won't hurt performance + * but if we ever start using a lot of sessions, this might + * become a bottle neck during deserialization time, as it would + * cause O(n*n) complexity. 
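A note on the locking idiom used throughout luo_session.c: guard(rwsem_write)(&sh->rwsem) and scoped_guard() come from <linux/cleanup.h> and drop the lock automatically when the enclosing scope ends, which is why functions such as luo_session_insert() above can return -ENOMEM or -EEXIST without an explicit up_write(). A rough sketch of the equivalence, with allowed() and do_insert() as hypothetical stand-ins:

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/rwsem.h>

static bool allowed(void) { return true; }      /* hypothetical predicate */
static void do_insert(void) { }                 /* hypothetical insertion */

static int insert_guarded(struct rw_semaphore *sem)
{
        guard(rwsem_write)(sem);        /* down_write(sem) here ... */

        if (!allowed())
                return -EBUSY;          /* ... up_write(sem) runs automatically */

        do_insert();
        return 0;                       /* ... and on this path as well */
}

/* ... roughly equivalent to the traditional open-coded form: */
static int insert_open_coded(struct rw_semaphore *sem)
{
        int ret = 0;

        down_write(sem);
        if (!allowed())
                ret = -EBUSY;
        else
                do_insert();
        up_write(sem);

        return ret;
}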
+ */ + list_for_each_entry(it, &sh->list, list) { + if (!strncmp(it->name, session->name, sizeof(it->name))) + return -EEXIST; + } + list_add_tail(&session->list, &sh->list); + sh->count++; + + return 0; +} + +static void luo_session_remove(struct luo_session_header *sh, + struct luo_session *session) +{ + guard(rwsem_write)(&sh->rwsem); + list_del(&session->list); + sh->count--; +} + +static int luo_session_finish_one(struct luo_session *session) +{ + guard(mutex)(&session->mutex); + return luo_file_finish(&session->file_set); +} + +static void luo_session_unfreeze_one(struct luo_session *session, + struct luo_session_ser *ser) +{ + guard(mutex)(&session->mutex); + luo_file_unfreeze(&session->file_set, &ser->file_set_ser); +} + +static int luo_session_freeze_one(struct luo_session *session, + struct luo_session_ser *ser) +{ + guard(mutex)(&session->mutex); + return luo_file_freeze(&session->file_set, &ser->file_set_ser); +} + +static int luo_session_release(struct inode *inodep, struct file *filep) +{ + struct luo_session *session = filep->private_data; + struct luo_session_header *sh; + + /* If retrieved is set, it means this session is from incoming list */ + if (session->retrieved) { + int err = luo_session_finish_one(session); + + if (err) { + pr_warn("Unable to finish session [%s] on release\n", + session->name); + return err; + } + sh = &luo_session_global.incoming; + } else { + scoped_guard(mutex, &session->mutex) + luo_file_unpreserve_files(&session->file_set); + sh = &luo_session_global.outgoing; + } + + luo_session_remove(sh, session); + luo_session_free(session); + + return 0; +} + +static int luo_session_preserve_fd(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_preserve_fd *argp = ucmd->cmd; + int err; + + guard(mutex)(&session->mutex); + err = luo_preserve_file(&session->file_set, argp->token, argp->fd); + if (err) + return err; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + pr_warn("The file was successfully preserved, but response to user failed\n"); + + return err; +} + +static int luo_session_retrieve_fd(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_retrieve_fd *argp = ucmd->cmd; + struct file *file; + int err; + + argp->fd = get_unused_fd_flags(O_CLOEXEC); + if (argp->fd < 0) + return argp->fd; + + guard(mutex)(&session->mutex); + err = luo_retrieve_file(&session->file_set, argp->token, &file); + if (err < 0) + goto err_put_fd; + + err = luo_ucmd_respond(ucmd, sizeof(*argp)); + if (err) + goto err_put_file; + + fd_install(argp->fd, file); + + return 0; + +err_put_file: + fput(file); +err_put_fd: + put_unused_fd(argp->fd); + + return err; +} + +static int luo_session_finish(struct luo_session *session, + struct luo_ucmd *ucmd) +{ + struct liveupdate_session_finish *argp = ucmd->cmd; + int err = luo_session_finish_one(session); + + if (err) + return err; + + return luo_ucmd_respond(ucmd, sizeof(*argp)); +} + +union ucmd_buffer { + struct liveupdate_session_finish finish; + struct liveupdate_session_preserve_fd preserve; + struct liveupdate_session_retrieve_fd retrieve; +}; + +struct luo_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, 
_last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } + +static const struct luo_ioctl_op luo_session_ioctl_ops[] = { + IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish, + struct liveupdate_session_finish, reserved), + IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd, + struct liveupdate_session_preserve_fd, token), + IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd, + struct liveupdate_session_retrieve_fd, token), +}; + +static long luo_session_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct luo_session *session = filep->private_data; + const struct luo_ioctl_op *op; + struct luo_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int ret; + + nr = _IOC_NR(cmd); + if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >= + ARRAY_SIZE(luo_session_ioctl_ops)) { + return -EINVAL; + } + + ucmd.ubuffer = (void __user *)arg; + ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (ret) + return ret; + + op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (ret) + return ret; + + return op->execute(session, &ucmd); +} + +static const struct file_operations luo_session_fops = { + .owner = THIS_MODULE, + .release = luo_session_release, + .unlocked_ioctl = luo_session_ioctl, +}; + +/* Create a "struct file" for session */ +static int luo_session_getfile(struct luo_session *session, struct file **filep) +{ + char name_buf[128]; + struct file *file; + + lockdep_assert_held(&session->mutex); + snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name); + file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR); + if (IS_ERR(file)) + return PTR_ERR(file); + + *filep = file; + + return 0; +} + +int luo_session_create(const char *name, struct file **filep) +{ + struct luo_session *session; + int err; + + session = luo_session_alloc(name); + if (IS_ERR(session)) + return PTR_ERR(session); + + err = luo_session_insert(&luo_session_global.outgoing, session); + if (err) + goto err_free; + + scoped_guard(mutex, &session->mutex) + err = luo_session_getfile(session, filep); + if (err) + goto err_remove; + + return 0; + +err_remove: + luo_session_remove(&luo_session_global.outgoing, session); +err_free: + luo_session_free(session); + + return err; +} + +int luo_session_retrieve(const char *name, struct file **filep) +{ + struct luo_session_header *sh = &luo_session_global.incoming; + struct luo_session *session = NULL; + struct luo_session *it; + int err; + + scoped_guard(rwsem_read, &sh->rwsem) { + list_for_each_entry(it, &sh->list, list) { + if (!strncmp(it->name, name, sizeof(it->name))) { + session = it; + break; + } + } + } + + if (!session) + return -ENOENT; + + guard(mutex)(&session->mutex); + if (session->retrieved) + return -EINVAL; + + err = luo_session_getfile(session, filep); + if (!err) + session->retrieved = true; + + return err; +} + +int __init luo_session_setup_outgoing(void *fdt_out) +{ + struct luo_session_header_ser *header_ser; + u64 header_ser_pa; + int err; + + header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT); + if (IS_ERR(header_ser)) + return PTR_ERR(header_ser); + header_ser_pa = virt_to_phys(header_ser); + + err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME); + err |= fdt_property_string(fdt_out, 
"compatible", + LUO_FDT_SESSION_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa, + sizeof(header_ser_pa)); + err |= fdt_end_node(fdt_out); + + if (err) + goto err_unpreserve; + + luo_session_global.outgoing.header_ser = header_ser; + luo_session_global.outgoing.ser = (void *)(header_ser + 1); + luo_session_global.outgoing.active = true; + + return 0; + +err_unpreserve: + kho_unpreserve_free(header_ser); + return err; +} + +int __init luo_session_setup_incoming(void *fdt_in) +{ + struct luo_session_header_ser *header_ser; + int err, header_size, offset; + u64 header_ser_pa; + const void *ptr; + + offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME); + if (offset < 0) { + pr_err("Unable to get session node: [%s]\n", + LUO_FDT_SESSION_NODE_NAME); + return -EINVAL; + } + + err = fdt_node_check_compatible(fdt_in, offset, + LUO_FDT_SESSION_COMPATIBLE); + if (err) { + pr_err("Session node incompatible [%s]\n", + LUO_FDT_SESSION_COMPATIBLE); + return -EINVAL; + } + + header_size = 0; + ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size); + if (!ptr || header_size != sizeof(u64)) { + pr_err("Unable to get session header '%s' [%d]\n", + LUO_FDT_SESSION_HEADER, header_size); + return -EINVAL; + } + + header_ser_pa = get_unaligned((u64 *)ptr); + header_ser = phys_to_virt(header_ser_pa); + + luo_session_global.incoming.header_ser = header_ser; + luo_session_global.incoming.ser = (void *)(header_ser + 1); + luo_session_global.incoming.active = true; + + return 0; +} + +int luo_session_deserialize(void) +{ + struct luo_session_header *sh = &luo_session_global.incoming; + static bool is_deserialized; + static int err; + + /* If has been deserialized, always return the same error code */ + if (is_deserialized) + return err; + + is_deserialized = true; + if (!sh->active) + return 0; + + /* + * Note on error handling: + * + * If deserialization fails (e.g., allocation failure or corrupt data), + * we intentionally skip cleanup of sessions that were already restored. + * + * A partial failure leaves the preserved state inconsistent. + * Implementing a safe "undo" to unwind complex dependencies (sessions, + * files, hardware state) is error-prone and provides little value, as + * the system is effectively in a broken state. + * + * We treat these resources as leaked. The expected recovery path is for + * userspace to detect the failure and trigger a reboot, which will + * reliably reset devices and reclaim memory. 
+ */ + for (int i = 0; i < sh->header_ser->count; i++) { + struct luo_session *session; + + session = luo_session_alloc(sh->ser[i].name); + if (IS_ERR(session)) { + pr_warn("Failed to allocate session [%s] during deserialization %pe\n", + sh->ser[i].name, session); + return PTR_ERR(session); + } + + err = luo_session_insert(sh, session); + if (err) { + pr_warn("Failed to insert session [%s] %pe\n", + session->name, ERR_PTR(err)); + luo_session_free(session); + return err; + } + + scoped_guard(mutex, &session->mutex) { + luo_file_deserialize(&session->file_set, + &sh->ser[i].file_set_ser); + } + } + + kho_restore_free(sh->header_ser); + sh->header_ser = NULL; + sh->ser = NULL; + + return 0; +} + +int luo_session_serialize(void) +{ + struct luo_session_header *sh = &luo_session_global.outgoing; + struct luo_session *session; + int i = 0; + int err; + + guard(rwsem_write)(&sh->rwsem); + list_for_each_entry(session, &sh->list, list) { + err = luo_session_freeze_one(session, &sh->ser[i]); + if (err) + goto err_undo; + + strscpy(sh->ser[i].name, session->name, + sizeof(sh->ser[i].name)); + i++; + } + sh->header_ser->count = sh->count; + + return 0; + +err_undo: + list_for_each_entry_continue_reverse(session, &sh->list, list) { + i--; + luo_session_unfreeze_one(session, &sh->ser[i]); + memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name)); + } + + return err; +} + +/** + * luo_session_quiesce - Ensure no active sessions exist and lock session lists. + * + * Acquires exclusive write locks on both incoming and outgoing session lists. + * It then validates no sessions exist in either list. + * + * This mechanism is used during file handler un/registration to ensure that no + * sessions are currently using the handler, and no new sessions can be created + * while un/registration is in progress. + * + * This prevents registering new handlers while sessions are active or + * while deserialization is in progress. + * + * Return: + * true - System is quiescent (0 sessions) and locked. + * false - Active sessions exist. The locks are released internally. + */ +bool luo_session_quiesce(void) +{ + down_write(&luo_session_global.incoming.rwsem); + down_write(&luo_session_global.outgoing.rwsem); + + if (luo_session_global.incoming.count || + luo_session_global.outgoing.count) { + up_write(&luo_session_global.outgoing.rwsem); + up_write(&luo_session_global.incoming.rwsem); + return false; + } + + return true; +} + +/** + * luo_session_resume - Unlock session lists and resume normal activity. + * + * Releases the exclusive locks acquired by a successful call to + * luo_session_quiesce(). 
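Stepping back to the ioctl plumbing earlier in this file: the size/min_size pair in the IOCTL_OP() table, together with copy_struct_from_user(), gives the usual extensible-struct ABI, where older userspace may pass a shorter struct (the kernel zero-fills the tail) and newer userspace may pass a longer one (accepted only if the extra bytes are zero). A condensed sketch of that dispatch pattern, using a hypothetical demo_cmd struct and do_something() helper:

#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/uaccess.h>

struct demo_cmd {
        __u32 size;     /* userspace sets this to sizeof() of *its* struct */
        __u32 flags;
        __u64 token;
};

static long do_something(u64 token, u32 flags) { return 0; }    /* hypothetical */

static long demo_ioctl(void __user *uarg)
{
        struct demo_cmd cmd;
        u32 user_size;
        int ret;

        if (get_user(user_size, (u32 __user *)uarg))
                return -EFAULT;

        /* Reject structs too short to carry the required fields. */
        if (user_size < offsetofend(struct demo_cmd, token))
                return -EINVAL;

        /* Zero-extends short structs; returns -E2BIG on non-zero tail bytes. */
        ret = copy_struct_from_user(&cmd, sizeof(cmd), uarg, user_size);
        if (ret)
                return ret;

        return do_something(cmd.token, cmd.flags);
}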
+ */ +void luo_session_resume(void) +{ + up_write(&luo_session_global.outgoing.rwsem); + up_write(&luo_session_global.incoming.rwsem); +} diff --git a/kernel/module/main.c b/kernel/module/main.c index 7b3ec2fa6e7c..710ee30b3bea 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -954,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf) int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &taints)) + if (test_bit(i, &taints)) buf[l++] = taint_flags[i].c_true; } diff --git a/kernel/panic.c b/kernel/panic.c index b2f2470af7e5..0d52210a9e2b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -401,7 +401,7 @@ static void panic_trigger_all_cpu_backtrace(void) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & SYS_INFO_ALL_CPU_BT) + if (panic_print & SYS_INFO_ALL_BT) panic_trigger_all_cpu_backtrace(); /* @@ -628,38 +628,40 @@ void panic(const char *fmt, ...) } EXPORT_SYMBOL(panic); -#define TAINT_FLAG(taint, _c_true, _c_false, _module) \ +#define TAINT_FLAG(taint, _c_true, _c_false) \ [ TAINT_##taint ] = { \ .c_true = _c_true, .c_false = _c_false, \ - .module = _module, \ .desc = #taint, \ } /* - * TAINT_FORCED_RMMOD could be a per-module flag but the module - * is being removed anyway. + * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT, + * please also modify tools/debugging/kernel-chktaint and + * Documentation/admin-guide/tainted-kernels.rst, including its + * small shell script that prints the TAINT_FLAGS_COUNT bits of + * /proc/sys/kernel/tainted. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true), - TAINT_FLAG(FORCED_MODULE, 'F', ' ', true), - TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false), - TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false), - TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false), - TAINT_FLAG(BAD_PAGE, 'B', ' ', false), - TAINT_FLAG(USER, 'U', ' ', false), - TAINT_FLAG(DIE, 'D', ' ', false), - TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false), - TAINT_FLAG(WARN, 'W', ' ', false), - TAINT_FLAG(CRAP, 'C', ' ', true), - TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false), - TAINT_FLAG(OOT_MODULE, 'O', ' ', true), - TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true), - TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false), - TAINT_FLAG(LIVEPATCH, 'K', ' ', true), - TAINT_FLAG(AUX, 'X', ' ', true), - TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), - TAINT_FLAG(TEST, 'N', ' ', true), - TAINT_FLAG(FWCTL, 'J', ' ', true), + TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'), + TAINT_FLAG(FORCED_MODULE, 'F', ' '), + TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '), + TAINT_FLAG(FORCED_RMMOD, 'R', ' '), + TAINT_FLAG(MACHINE_CHECK, 'M', ' '), + TAINT_FLAG(BAD_PAGE, 'B', ' '), + TAINT_FLAG(USER, 'U', ' '), + TAINT_FLAG(DIE, 'D', ' '), + TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '), + TAINT_FLAG(WARN, 'W', ' '), + TAINT_FLAG(CRAP, 'C', ' '), + TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '), + TAINT_FLAG(OOT_MODULE, 'O', ' '), + TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '), + TAINT_FLAG(SOFTLOCKUP, 'L', ' '), + TAINT_FLAG(LIVEPATCH, 'K', ' '), + TAINT_FLAG(AUX, 'X', ' '), + TAINT_FLAG(RANDSTRUCT, 'T', ' '), + TAINT_FLAG(TEST, 'N', ' '), + TAINT_FLAG(FWCTL, 'J', ' '), }; #undef TAINT_FLAG diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7394f1b6033b..1d765ad242b8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3705,12 +3705,13 @@ static bool legacy_kthread_create(void) /** * printk_kthreads_shutdown - shutdown all threaded printers + * @data: syscore context * * On 
system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ -static void printk_kthreads_shutdown(void) +static void printk_kthreads_shutdown(void *data) { struct console *con; @@ -3732,10 +3733,14 @@ static void printk_kthreads_shutdown(void) console_list_unlock(); } -static struct syscore_ops printk_syscore_ops = { +static const struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; +static struct syscore printk_syscore = { + .ops = &printk_syscore_ops, +}; + /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. @@ -3803,7 +3808,7 @@ static void printk_kthreads_check_locked(void) static int __init printk_set_kthreads_ready(void) { - register_syscore_ops(&printk_syscore_ops); + register_syscore(&printk_syscore); console_list_lock(); printk_kthreads_ready = true; diff --git a/kernel/resource.c b/kernel/resource.c index b9fa2a4ce089..e4e9bac12e6e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -341,6 +341,8 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, struct resource *res) { + /* Skip children until we find a top level range that matches */ + bool skip_children = true; struct resource *p; if (!res) @@ -351,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, false) { + for_each_resource(&iomem_resource, p, skip_children) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -362,6 +364,12 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; + /* + * We found a top level range that matches what we are looking + * for. Time to start checking children too. 
+ */ + skip_children = false; + /* Found a match, break */ if (is_type_match(p, flags, desc)) break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7801cd05d5a..41ba0be16911 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -878,7 +878,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->donor->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->donor, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; @@ -7360,15 +7360,12 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->prio = prio; } out_unlock: - /* Avoid rq from going away on us: */ - preempt_disable(); + /* Caller holds task_struct::pi_lock, IRQs are still disabled */ rq_unpin_lock(rq, &rf); __balance_callbacks(rq); rq_repin_lock(rq, &rf); __task_rq_unlock(rq, p, &rf); - - preempt_enable(); } #endif /* CONFIG_RT_MUTEXES */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 769d7b7990df..da46c3164537 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4034,6 +4034,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (child_cfs_rq_on_list(cfs_rq)) return false; + if (cfs_rq->tg_load_avg_contrib) + return false; + return true; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bbf513b3e76c..d30cca6870f5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1167,7 +1167,7 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; + unsigned long nr_uninterruptible; #ifdef CONFIG_SCHED_PROXY_EXEC struct task_struct __rcu *donor; /* Scheduling context */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index cbf7206b3f9d..c903f1a42891 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -180,8 +180,13 @@ static inline void psi_dequeue(struct task_struct *p, int flags) * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. + * + * In the SCHED_PROXY_EXECUTION case we may do sleeping + * dequeues that are not followed by a task switch, so check + * TSK_ONCPU is set to ensure the task switch is imminent. + * Otherwise clear the flags as usual. 
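The syscore conversions above (printk, sched_clock, timekeeping) all follow the same shape: the callbacks now take a void *data context, the ops table becomes const, and registration goes through a small struct syscore wrapper handed to register_syscore(). A minimal sketch of a converted user, with hypothetical my_* names and the header assumed to be the one providing struct syscore / register_syscore():

#include <linux/init.h>
#include <linux/syscore_ops.h>  /* assumed to declare struct syscore / register_syscore() */

static int my_suspend(void *data)
{
        /* quiesce the subsystem; return 0 on success */
        return 0;
}

static void my_resume(void *data)
{
        /* undo whatever my_suspend() did */
}

static const struct syscore_ops my_syscore_ops = {
        .suspend        = my_suspend,
        .resume         = my_resume,
};

static struct syscore my_syscore = {
        .ops = &my_syscore_ops,
};

static int __init my_syscore_init(void)
{
        register_syscore(&my_syscore);
        return 0;
}
device_initcall(my_syscore_init);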
*/ - if (flags & DEQUEUE_SLEEP) + if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU)) return; /* diff --git a/kernel/scs.c b/kernel/scs.c index d7809affe740..772488afd5b9 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -135,7 +135,7 @@ static void scs_check_usage(struct task_struct *tsk) if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) return; - for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) { if (!READ_ONCE_NOCHECK(*p)) break; used += sizeof(*p); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cc1afec306b3..f39111830ca3 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -296,6 +296,11 @@ int sched_clock_suspend(void) return 0; } +static int sched_clock_syscore_suspend(void *data) +{ + return sched_clock_suspend(); +} + void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -305,14 +310,23 @@ void sched_clock_resume(void) rd->read_sched_clock = cd.actual_read_sched_clock; } -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, +static void sched_clock_syscore_resume(void *data) +{ + sched_clock_resume(); +} + +static const struct syscore_ops sched_clock_syscore_ops = { + .suspend = sched_clock_syscore_suspend, + .resume = sched_clock_syscore_resume, +}; + +static struct syscore sched_clock_syscore = { + .ops = &sched_clock_syscore_ops, }; static int __init sched_clock_syscore_init(void) { - register_syscore_ops(&sched_clock_ops); + register_syscore(&sched_clock_syscore); return 0; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4790da895203..3ec3daa4acab 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1994,6 +1994,11 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; @@ -2061,15 +2066,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 65ac0f04a946..cc48d16be43e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -163,7 +163,7 @@ enum { #define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset])) /* - * Each fgraph_ops has a reservered unsigned long at the end (top) of the + * Each fgraph_ops has a reserved unsigned long at the end (top) of the * ret_stack to store task specific state. */ #define SHADOW_STACK_TASK_VARS(ret_stack) \ diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 0b1ee8e585f2..1188eefef07c 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -30,7 +30,7 @@ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still * exists. 
The key is the address of fprobe instance. * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe - * instance related to the funciton address. The key is the ftrace IP + * instance related to the function address. The key is the ftrace IP * address. * * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8688c88534de..41c9f5d079be 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1770,7 +1770,7 @@ static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) bmeta->total_size = total_size; bmeta->buffers_offset = (void *)ptr - (void *)bmeta; - /* Zero out the scatch pad */ + /* Zero out the scratch pad */ memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); return false; @@ -6089,7 +6089,7 @@ static void rb_clear_buffer_page(struct buffer_page *page) * id field, and updated via this function. * * But for a fixed memory mapped buffer, the id is already assigned for - * fixed memory ording in the memory layout and can not be used. Instead + * fixed memory ordering in the memory layout and can not be used. Instead * the index of where the page lies in the memory layout is used. * * For the normal pages, set the buffer page id with the passed in @id @@ -7669,7 +7669,7 @@ static __init int test_ringbuffer(void) /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be - * dropped and the thread wont catch it. But when a ring + * dropped and the thread won't catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index cdc3aea12c93..593e3b59e42e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -433,7 +433,7 @@ static int __init ring_buffer_benchmark_init(void) { int ret; - /* make a one meg buffer in overwite mode */ + /* make a one meg buffer in overwrite mode */ buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); if (!buffer) return -ENOMEM; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c9fbb316dcbd..e575956ef9b5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -125,7 +125,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * If there is an oops (or kernel panic) and the ftrace_dump_on_oops * is set, then ftrace_dump is called. This will output the contents * of the ftrace buffers to the console. This is very useful for - * capturing traces that lead to crashes and outputing it to a + * capturing traces that lead to crashes and outputting it to a * serial console. * * It is default off, but you can enable it with either specifying @@ -134,7 +134,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops * Set instance name if you want to dump the specific trace instance - * Multiple instance dump is also supported, and instances are seperated + * Multiple instance dump is also supported, and instances are separated * by commas. 
*/ /* Set to string format zero to disable by default */ @@ -4709,8 +4709,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) * If pause-on-trace is enabled, then stop the trace while * dumping, unless this is the "snapshot" file */ - if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) { + iter->iter_flags |= TRACE_FILE_PAUSE; tracing_stop_tr(tr); + } if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { @@ -4842,7 +4844,7 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - if (!iter->snapshot && tr->stop_count) + if (iter->iter_flags & TRACE_FILE_PAUSE) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); @@ -5276,7 +5278,7 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled) return -EINVAL; /* * An instance must always have it set. - * by default, that's the global_trace instane. + * by default, that's the global_trace instance. */ if (printk_trace == tr) update_printk_trace(&global_trace); @@ -7554,7 +7556,7 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo, migrate_disable(); /* - * Now preemption is being enabed and another task can come in + * Now preemption is being enabled and another task can come in * and use the same buffer and corrupt our data. */ preempt_enable_notrace(); @@ -11329,7 +11331,7 @@ __init static void do_allocate_snapshot(const char *name) /* * When allocate_snapshot is set, the next call to * allocate_trace_buffers() (called by trace_array_get_by_name()) - * will allocate the snapshot buffer. That will alse clear + * will allocate the snapshot buffer. That will also clear * this flag. */ allocate_snapshot = true; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9b07ad9eb284..b16a5a158040 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -360,7 +360,7 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca /* Anything else, this isn't a function */ break; } - /* A function could be wrapped in parethesis, try the next one */ + /* A function could be wrapped in parenthesis, try the next one */ s = r + 1; } while (s < e); @@ -567,7 +567,7 @@ static void test_event_printk(struct trace_event_call *call) * If start_arg is zero, then this is the start of the * first argument. The processing of the argument happens * when the end of the argument is found, as it needs to - * handle paranthesis and such. + * handle parenthesis and such. */ if (!start_arg) { start_arg = i; @@ -785,7 +785,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * * When soft_disable is not set but the soft_mode is, * we do nothing. Do not disable the tracepoint, otherwise - * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + * "soft enable"s (clearing the SOFT_DISABLED bit) won't work. 
*/ if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) @@ -1394,7 +1394,7 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) if (!tr) return -ENOENT; - /* Modules events can be appened with :mod:<module> */ + /* Modules events can be appended with :mod:<module> */ mod = strstr(buf, ":mod:"); if (mod) { *mod = '\0'; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 54226b48b2d1..385af8405392 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -142,7 +142,7 @@ static bool is_not(const char *str) } /** - * struct prog_entry - a singe entry in the filter program + * struct prog_entry - a single entry in the filter program * @target: Index to jump to on a branch (actually one minus the index) * @when_to_branch: The value of the result of the predicate to do a branch * @pred: The predicate to execute. diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 289bdea98776..5e6e70540eef 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5283,7 +5283,7 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, * on the stack, so when the histogram trigger is initialized * a percpu array of 4 hist_pad structures is allocated. * This will cover every context from normal, softirq, irq and NMI - * in the very unlikely event that a tigger happens at each of + * in the very unlikely event that a trigger happens at each of * these contexts and interrupts a currently active trigger. */ struct hist_pad { diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 2f19bbe73d27..4554c458b78c 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -375,7 +375,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)&entry->fields[n_u64].as_u64, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 96aad82b1628..06b75bcfc7b8 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -732,7 +732,7 @@ static void unregister_trigger(char *glob, * param - text following cmd and ':' and stripped of filter * filter - the optional filter text following (and including) 'if' * - * To illustrate the use of these componenents, here are some concrete + * To illustrate the use of these components, here are some concrete * examples. 
For the following triggers: * * echo 'traceon:5 if pid == 0' > trigger diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b15854c75d4f..dca6e50b3b21 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1041,7 +1041,7 @@ static int user_field_array_size(const char *type) static int user_field_size(const char *type) { - /* long is not allowed from a user, since it's ambigious in size */ + /* long is not allowed from a user, since it's ambiguous in size */ if (strcmp(type, "s64") == 0) return sizeof(s64); if (strcmp(type, "u64") == 0) @@ -1079,7 +1079,7 @@ static int user_field_size(const char *type) if (str_has_prefix(type, "__rel_loc ")) return sizeof(u32); - /* Uknown basic type, error */ + /* Unknown basic type, error */ return -EINVAL; } @@ -2465,7 +2465,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, /* * Prevent users from using the same address and bit multiple times * within the same mm address space. This can cause unexpected behavior - * for user processes that is far easier to debug if this is explictly + * for user processes that is far easier to debug if this is explicitly * an error upon registering. */ if (current_user_event_enabler_exists((unsigned long)reg.enable_addr, diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a9962d4497e8..827104d00bc0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -329,7 +329,7 @@ static struct osnoise_data { u64 print_stack; /* print IRQ stack if total > */ int timerlat_tracer; /* timerlat tracer */ #endif - bool tainted; /* infor users and developers about a problem */ + bool tainted; /* info users and developers about a problem */ } osnoise_data = { .sample_period = DEFAULT_SAMPLE_PERIOD, .sample_runtime = DEFAULT_SAMPLE_RUNTIME, @@ -738,7 +738,7 @@ cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) /* * get_int_safe_duration - Get the duration of a window * - * The irq, softirq and thread varaibles need to have its duration without + * The irq, softirq and thread variables need to have its duration without * the interference from higher priority interrupts. Instead of keeping a * variable to discount the interrupt interference from these variables, the * starting time of these variables are pushed forward with the interrupt's @@ -1460,7 +1460,7 @@ static int run_osnoise(void) stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; /* - * Start timestemp + * Start timestamp */ start = time_get(); @@ -1881,7 +1881,7 @@ static int timerlat_main(void *data) tlat->kthread = current; osn_var->pid = current->pid; /* - * Anotate the arrival time. + * Annotate the arrival time. */ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); @@ -1978,7 +1978,7 @@ static void stop_per_cpu_kthreads(void) } /* - * start_kthread - Start a workload tread + * start_kthread - Start a workload thread */ static int start_kthread(unsigned int cpu) { @@ -2705,7 +2705,7 @@ static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) * Why not using tracing instance per_cpu/ dir? * * Because osnoise/timerlat have a single workload, having - * multiple files like these are wast of memory. + * multiple files like these are waste of memory. 
*/ per_cpu = tracefs_create_dir("per_cpu", top_dir); if (!per_cpu) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index bb67f6a2136c..2f571083ce9e 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -517,7 +517,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx) } } -/* Return 1 if the field separater is arrow operator ('->') */ +/* Return 1 if the field separator is arrow operator ('->') */ static int split_next_field(char *varname, char **next_field, struct traceprobe_parse_context *ctx) { diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index c158d65a8a88..32684ef4fb9d 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -15,7 +15,7 @@ * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into - * the buffer but it wont update the pointers). This allows users to + * the buffer but it won't update the pointers). This allows users to * try to write something into the trace_seq buffer and if it fails * they can flush it and try again. * diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index e066d31d08f8..fe9bf8db1922 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -31,6 +31,13 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; +struct hwerr_info { + atomic_t count; + time64_t timestamp; +}; + +static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { @@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); +void hwerr_log_error_type(enum hwerr_error_type src) +{ + if (src < 0 || src >= HWERR_RECOV_MAX) + return; + + atomic_inc(&hwerr_data[src].count); + WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds()); +} +EXPORT_SYMBOL_GPL(hwerr_log_error_type); + static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a567600cf3ed..0685e3a8aa0a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -25,6 +25,7 @@ #include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/tick.h> +#include <linux/sys_info.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> @@ -65,6 +66,13 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); +/* + * bitmasks to control what kinds of system info to be printed when + * hard lockup is detected, it could be task, memory, lock etc. + * Refer include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long hardlockup_si_mask; + #ifdef CONFIG_SYSFS static unsigned int hardlockup_count; @@ -178,11 +186,15 @@ static void watchdog_hardlockup_kick(void) void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { + int hardlockup_all_cpu_backtrace; + if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; } + hardlockup_all_cpu_backtrace = (hardlockup_si_mask & SYS_INFO_ALL_BT) ? + 1 : sysctl_hardlockup_all_cpu_backtrace; /* * Check for a hardlockup by making sure the CPU's timer * interrupt is incrementing. 
The timer interrupt should have @@ -214,7 +226,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) * Prevent multiple hard-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ - if (sysctl_hardlockup_all_cpu_backtrace) { + if (hardlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) return; } @@ -243,12 +255,13 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) trigger_single_cpu_backtrace(cpu); } - if (sysctl_hardlockup_all_cpu_backtrace) { + if (hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); if (!hardlockup_panic) clear_bit_unlock(0, &hard_lockup_nmi_warn); } + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -339,6 +352,13 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +/* + * bitmasks to control what kinds of system info to be printed when + * soft lockup is detected, it could be task, memory, lock etc. + * Refer include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long softlockup_si_mask; + static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ @@ -755,7 +775,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int duration; - int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + int softlockup_all_cpu_backtrace; unsigned long flags; if (!watchdog_enabled) @@ -767,6 +787,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (panic_in_progress()) return HRTIMER_NORESTART; + softlockup_all_cpu_backtrace = (softlockup_si_mask & SYS_INFO_ALL_BT) ? + 1 : sysctl_softlockup_all_cpu_backtrace; + watchdog_hardlockup_kick(); /* kick the softlockup detector */ @@ -855,6 +878,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); + sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); if (softlockup_panic) panic("softlockup: hung tasks"); } @@ -1206,6 +1230,13 @@ static const struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "softlockup_sys_info", + .data = &softlockup_si_mask, + .maxlen = sizeof(softlockup_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", @@ -1228,6 +1259,13 @@ static const struct ctl_table watchdog_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "hardlockup_sys_info", + .data = &hardlockup_si_mask, + .maxlen = sizeof(hardlockup_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, #ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", |
