Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.kexec                           |  24
-rw-r--r--  kernel/Makefile                                |   3
-rw-r--r--  kernel/bounds.c                                |   1
-rw-r--r--  kernel/configs/debug.config                    |   2
-rw-r--r--  kernel/cpu_pm.c                                |  12
-rw-r--r--  kernel/crash_reserve.c                         |   3
-rw-r--r--  kernel/dma/dummy.c                             |  13
-rw-r--r--  kernel/dma/map_benchmark.c                     |   2
-rw-r--r--  kernel/dma/mapping.c                           |  26
-rw-r--r--  kernel/dma/ops_helpers.c                       |  12
-rw-r--r--  kernel/exit.c                                  |   4
-rw-r--r--  kernel/fork.c                                  |  63
-rw-r--r--  kernel/hung_task.c                             |  56
-rw-r--r--  kernel/irq/generic-chip.c                      |  14
-rw-r--r--  kernel/irq/pm.c                                |  11
-rw-r--r--  kernel/kexec_core.c                            | 161
-rw-r--r--  kernel/kexec_handover_internal.h               |  20
-rw-r--r--  kernel/ksysfs.c                                |  68
-rw-r--r--  kernel/liveupdate/Kconfig                      |  75
-rw-r--r--  kernel/liveupdate/Makefile                     |  12
-rw-r--r--  kernel/liveupdate/kexec_handover.c (renamed from kernel/kexec_handover.c) | 716
-rw-r--r--  kernel/liveupdate/kexec_handover_debug.c (renamed from kernel/kexec_handover_debug.c) |   0
-rw-r--r--  kernel/liveupdate/kexec_handover_debugfs.c     | 221
-rw-r--r--  kernel/liveupdate/kexec_handover_internal.h    |  55
-rw-r--r--  kernel/liveupdate/luo_core.c                   | 450
-rw-r--r--  kernel/liveupdate/luo_file.c                   | 889
-rw-r--r--  kernel/liveupdate/luo_internal.h               | 110
-rw-r--r--  kernel/liveupdate/luo_session.c                | 646
-rw-r--r--  kernel/module/main.c                           |   2
-rw-r--r--  kernel/panic.c                                 |  52
-rw-r--r--  kernel/printk/printk.c                         |  11
-rw-r--r--  kernel/resource.c                              |  10
-rw-r--r--  kernel/sched/core.c                            |   7
-rw-r--r--  kernel/sched/fair.c                            |   3
-rw-r--r--  kernel/sched/sched.h                           |   2
-rw-r--r--  kernel/sched/stats.h                           |   7
-rw-r--r--  kernel/scs.c                                   |   2
-rw-r--r--  kernel/time/sched_clock.c                      |  22
-rw-r--r--  kernel/time/timekeeping.c                      |  22
-rw-r--r--  kernel/trace/fgraph.c                          |   2
-rw-r--r--  kernel/trace/fprobe.c                          |   2
-rw-r--r--  kernel/trace/ring_buffer.c                     |   6
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c           |   2
-rw-r--r--  kernel/trace/trace.c                           |  16
-rw-r--r--  kernel/trace/trace_events.c                    |   8
-rw-r--r--  kernel/trace/trace_events_filter.c             |   2
-rw-r--r--  kernel/trace/trace_events_hist.c               |   2
-rw-r--r--  kernel/trace/trace_events_synth.c              |   1
-rw-r--r--  kernel/trace/trace_events_trigger.c            |   2
-rw-r--r--  kernel/trace/trace_events_user.c               |   6
-rw-r--r--  kernel/trace/trace_osnoise.c                   |  12
-rw-r--r--  kernel/trace/trace_probe.c                     |   2
-rw-r--r--  kernel/trace/trace_seq.c                       |   2
-rw-r--r--  kernel/vmcore_info.c                           |  17
-rw-r--r--  kernel/watchdog.c                              |  44
55 files changed, 3268 insertions, 667 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 54e581072617..15632358bcf7 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -94,30 +94,6 @@ config KEXEC_JUMP
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
-config KEXEC_HANDOVER
- bool "kexec handover"
- depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
- depends on !DEFERRED_STRUCT_PAGE_INIT
- select MEMBLOCK_KHO_SCRATCH
- select KEXEC_FILE
- select DEBUG_FS
- select LIBFDT
- select CMA
- help
- Allow kexec to hand over state across kernels by generating and
- passing additional metadata to the target kernel. This is useful
- to keep data or state alive across the kexec. For this to work,
- both source and target kernels need to have this option enabled.
-
-config KEXEC_HANDOVER_DEBUG
- bool "Enable Kexec Handover debug checks"
- depends on KEXEC_HANDOVER
- help
- This option enables extra sanity checks for the Kexec Handover
- subsystem. Since, KHO performance is crucial in live update
- scenarios and the extra code might be adding overhead it is
- only optionally enabled.
-
config CRASH_DUMP
bool "kernel crash dumps"
default ARCH_DEFAULT_CRASH_DUMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 9fe722305c9b..e83669841b8c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -52,6 +52,7 @@ obj-y += printk/
obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
+obj-y += liveupdate/
obj-y += dma/
obj-y += entry/
obj-y += unwind/
@@ -82,8 +83,6 @@ obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
-obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
-obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 29b2cd00df2c..02b619eb6106 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -6,6 +6,7 @@
*/
#define __GENERATING_BOUNDS_H
+#define COMPILE_OFFSETS
/* Include headers that define the enum constants of interest */
#include <linux/page-flags.h>
#include <linux/mmzone.h>
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e81327d2cd63..9f6ab7dabf67 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y
#
# Debug Oops, Lockups and Hangs
#
-# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index b0f0d15085db..7481fbb947d3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -173,7 +173,7 @@ int cpu_cluster_pm_exit(void)
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
#ifdef CONFIG_PM
-static int cpu_pm_suspend(void)
+static int cpu_pm_suspend(void *data)
{
int ret;
@@ -185,20 +185,24 @@ static int cpu_pm_suspend(void)
return ret;
}
-static void cpu_pm_resume(void)
+static void cpu_pm_resume(void *data)
{
cpu_cluster_pm_exit();
cpu_pm_exit();
}
-static struct syscore_ops cpu_pm_syscore_ops = {
+static const struct syscore_ops cpu_pm_syscore_ops = {
.suspend = cpu_pm_suspend,
.resume = cpu_pm_resume,
};
+static struct syscore cpu_pm_syscore = {
+ .ops = &cpu_pm_syscore_ops,
+};
+
static int cpu_pm_init(void)
{
- register_syscore_ops(&cpu_pm_syscore_ops);
+ register_syscore(&cpu_pm_syscore);
return 0;
}
core_initcall(cpu_pm_init);
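
The hunks above show the new syscore registration pattern: callbacks now take a void *data context argument, the ops table becomes const, and a separate struct syscore instance carrying the ops pointer is what gets registered. A minimal sketch of the pattern, with illustrative "foo" names rather than anything from this diff:

	static int foo_suspend(void *data)
	{
		/* quiesce state with interrupts disabled; 0 on success */
		return 0;
	}

	static void foo_resume(void *data)
	{
		/* undo foo_suspend() */
	}

	static const struct syscore_ops foo_syscore_ops = {
		.suspend = foo_suspend,
		.resume  = foo_resume,
	};

	static struct syscore foo_syscore = {
		.ops = &foo_syscore_ops,
	};

	static int __init foo_init(void)
	{
		register_syscore(&foo_syscore);
		return 0;
	}
	core_initcall(foo_init);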
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 87bf4d41eabb..62e60e0223cf 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size)
#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
+ if (!arch_add_crash_res_to_iomem())
+ return 0;
+
if (crashk_res.start < crashk_res.end)
insert_resource(&iomem_resource, &crashk_res);
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index 92de80e5b057..16a51736a2a3 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -11,17 +11,16 @@ static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
}
-static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+static dma_addr_t dma_dummy_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
{
return DMA_MAPPING_ERROR;
}
-static void dma_dummy_unmap_page(struct device *dev, dma_addr_t dma_handle,
+static void dma_dummy_unmap_phys(struct device *dev, dma_addr_t dma_handle,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
/*
- * Dummy ops doesn't support map_page, so unmap_page should never be
+	 * Dummy ops doesn't support map_phys, so unmap_phys should never be
* called.
*/
WARN_ON_ONCE(true);
@@ -51,8 +50,8 @@ static int dma_dummy_supported(struct device *hwdev, u64 mask)
const struct dma_map_ops dma_dummy_ops = {
.mmap = dma_dummy_mmap,
- .map_page = dma_dummy_map_page,
- .unmap_page = dma_dummy_unmap_page,
+ .map_phys = dma_dummy_map_phys,
+ .unmap_phys = dma_dummy_unmap_phys,
.map_sg = dma_dummy_map_sg,
.unmap_sg = dma_dummy_unmap_sg,
.dma_supported = dma_dummy_supported,
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index cc19a3efea89..794041a39e65 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,13 +11,13 @@
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
-#include <linux/map_benchmark.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/timekeeping.h>
+#include <uapi/linux/map_benchmark.h>
struct map_benchmark_data {
struct map_benchmark bparam;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index fe7472f13b10..37163eb49f9f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -157,7 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
bool is_mmio = attrs & DMA_ATTR_MMIO;
- dma_addr_t addr;
+ dma_addr_t addr = DMA_MAPPING_ERROR;
BUG_ON(!valid_dma_direction(dir));
@@ -169,21 +169,8 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
else if (use_dma_iommu(dev))
addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
- else if (is_mmio) {
- if (!ops->map_resource)
- return DMA_MAPPING_ERROR;
-
- addr = ops->map_resource(dev, phys, size, dir, attrs);
- } else {
- struct page *page = phys_to_page(phys);
- size_t offset = offset_in_page(phys);
-
- /*
- * The dma_ops API contract for ops->map_page() requires
- * kmappable memory, while ops->map_resource() does not.
- */
- addr = ops->map_page(dev, page, offset, size, dir, attrs);
- }
+ else if (ops->map_phys)
+ addr = ops->map_phys(dev, phys, size, dir, attrs);
if (!is_mmio)
kmsan_handle_dma(phys, size, dir);
@@ -223,11 +210,8 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
dma_direct_unmap_phys(dev, addr, size, dir, attrs);
else if (use_dma_iommu(dev))
iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
- else if (is_mmio) {
- if (ops->unmap_resource)
- ops->unmap_resource(dev, addr, size, dir, attrs);
- } else
- ops->unmap_page(dev, addr, size, dir, attrs);
+ else if (ops->unmap_phys)
+ ops->unmap_phys(dev, addr, size, dir, attrs);
trace_dma_unmap_phys(dev, addr, size, dir, attrs);
debug_dma_unmap_phys(dev, addr, size, dir);
}
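
With ops->map_phys()/ops->unmap_phys() in place, the core no longer dispatches between map_page() for kmappable memory and map_resource() for MMIO; implementations receive the physical address directly. A hedged sketch of a converted dma_map_ops for a hypothetical identity-mapped "foo" bus (illustrative, not from this diff):

	static dma_addr_t foo_map_phys(struct device *dev, phys_addr_t phys,
			size_t size, enum dma_data_direction dir,
			unsigned long attrs)
	{
		/* no page/offset splitting needed: the core passes the
		 * physical address straight through */
		return (dma_addr_t)phys;
	}

	static void foo_unmap_phys(struct device *dev, dma_addr_t handle,
			size_t size, enum dma_data_direction dir,
			unsigned long attrs)
	{
		/* nothing to tear down for an identity mapping */
	}

	static const struct dma_map_ops foo_dma_ops = {
		.map_phys   = foo_map_phys,
		.unmap_phys = foo_unmap_phys,
	};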
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 6f9d604d9d40..20caf9cabf69 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -64,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
struct page *page;
+ phys_addr_t phys;
page = dma_alloc_contiguous(dev, size, gfp);
if (!page)
@@ -71,11 +72,12 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
if (!page)
return NULL;
+ phys = page_to_phys(page);
if (use_dma_iommu(dev))
- *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size,
- dir, DMA_ATTR_SKIP_CPU_SYNC);
+ *dma_handle = iommu_dma_map_phys(dev, phys, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
else
- *dma_handle = ops->map_page(dev, page, 0, size, dir,
+ *dma_handle = ops->map_phys(dev, phys, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
if (*dma_handle == DMA_MAPPING_ERROR) {
dma_free_contiguous(dev, page, size);
@@ -94,8 +96,8 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
if (use_dma_iommu(dev))
iommu_dma_unmap_phys(dev, dma_handle, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
- else if (ops->unmap_page)
- ops->unmap_page(dev, dma_handle, size, dir,
+ else if (ops->unmap_phys)
+ ops->unmap_phys(dev, dma_handle, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
dma_free_contiguous(dev, page, size);
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 4dc1918db67b..8a87021211ae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -251,10 +251,8 @@ repeat:
memset(&post, 0, sizeof(post));
/* don't need to get the RCU readlock here - the process is dead and
- * can't be modifying its own credentials. But shut RCU-lockdep up */
- rcu_read_lock();
+ * can't be modifying its own credentials. */
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
- rcu_read_unlock();
pidfs_exit(p);
cgroup_task_release(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 198e02e21e6e..b1f3915d5f8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -208,15 +208,62 @@ struct vm_stack {
struct vm_struct *stack_vm_area;
};
+static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm_area;
+ unsigned int i;
+
+ /*
+ * If the node has memory, we are guaranteed the stacks are backed by local pages.
+ * Otherwise the pages are arbitrary.
+ *
+ * Note that depending on cpuset it is possible we will get migrated to a different
+ * node immediately after allocating here, so this does *not* guarantee locality for
+ * arbitrary callers.
+ */
+ scoped_guard(preempt) {
+ if (node != NUMA_NO_NODE && numa_node_id() != node)
+ return NULL;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+ if (vm_area)
+ return vm_area;
+ }
+ }
+
+ return NULL;
+}
+
static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
{
unsigned int i;
+ int nid;
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *tmp = NULL;
+ /*
+ * Don't cache stacks if any of the pages don't match the local domain, unless
+ * there is no local memory to begin with.
+ *
+ * Note that lack of local memory does not automatically mean it makes no difference
+ * performance-wise which other domain backs the stack. In this case we are merely
+ * trying to avoid constantly going to vmalloc.
+ */
+ scoped_guard(preempt) {
+ nid = numa_node_id();
+ if (node_state(nid, N_MEMORY)) {
+ for (i = 0; i < vm_area->nr_pages; i++) {
+ struct page *page = vm_area->pages[i];
+ if (page_to_nid(page) != nid)
+ return false;
+ }
+ }
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *tmp = NULL;
- if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
- return true;
+ if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
+ return true;
+ }
}
return false;
}
@@ -283,13 +330,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct vm_struct *vm_area;
void *stack;
- int i;
-
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- vm_area = this_cpu_xchg(cached_stacks[i], NULL);
- if (!vm_area)
- continue;
+ vm_area = alloc_thread_stack_node_from_cache(tsk, node);
+ if (vm_area) {
if (memcg_charge_kernel_stack(vm_area)) {
vfree(vm_area->addr);
return -ENOMEM;
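
The stack cache itself is an array of per-CPU slots claimed and released with lock-free per-CPU exchanges; condensed from the code above, minus the NUMA checks:

	/* claim: take the first occupied slot, if any */
	for (i = 0; i < NR_CACHED_STACKS; i++) {
		vm_area = this_cpu_xchg(cached_stacks[i], NULL);
		if (vm_area)
			break;
	}

	/* release: park the stack in the first empty slot */
	for (i = 0; i < NR_CACHED_STACKS; i++) {
		struct vm_struct *tmp = NULL;

		if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
			break;
	}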
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index b2c1f14b8129..d2254c91450b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -24,6 +24,7 @@
#include <linux/sched/sysctl.h>
#include <linux/hung_task.h>
#include <linux/rwsem.h>
+#include <linux/sys_info.h>
#include <trace/events/sched.h>
@@ -50,7 +51,6 @@ static unsigned long __read_mostly sysctl_hung_task_detect_count;
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
-EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
@@ -60,12 +60,17 @@ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
static int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
-static bool hung_task_show_lock;
static bool hung_task_call_panic;
-static bool hung_task_show_all_bt;
static struct task_struct *watchdog_task;
+/*
+ * A bitmask controlling what kinds of system info are printed when a
+ * hung task is detected: task, memory, lock state, etc. See
+ * include/linux/sys_info.h for the detailed bit definitions.
+ */
+static unsigned long hung_task_si_mask;
+
#ifdef CONFIG_SMP
/*
* Should we dump all CPUs backtraces in a hung task event?
@@ -81,7 +86,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
* hung task is detected:
*/
static unsigned int __read_mostly sysctl_hung_task_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC;
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -218,8 +223,11 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti
}
#endif
-static void check_hung_task(struct task_struct *t, unsigned long timeout)
+static void check_hung_task(struct task_struct *t, unsigned long timeout,
+ unsigned long prev_detect_count)
{
+ unsigned long total_hung_task;
+
if (!task_is_hung(t, timeout))
return;
@@ -229,11 +237,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
*/
sysctl_hung_task_detect_count++;
+ total_hung_task = sysctl_hung_task_detect_count - prev_detect_count;
trace_sched_process_hang(t);
- if (sysctl_hung_task_panic) {
+ if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) {
console_verbose();
- hung_task_show_lock = true;
hung_task_call_panic = true;
}
@@ -256,10 +264,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
" disables this message.\n");
sched_show_task(t);
debug_show_blocker(t, timeout);
- hung_task_show_lock = true;
- if (sysctl_hung_task_all_cpu_backtrace)
- hung_task_show_all_bt = true;
if (!sysctl_hung_task_warnings)
pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
}
@@ -300,6 +305,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
struct task_struct *g, *t;
+ unsigned long prev_detect_count = sysctl_hung_task_detect_count;
+ int need_warning = sysctl_hung_task_warnings;
+ unsigned long si_mask = hung_task_si_mask;
/*
* If the system crashed already then all bets are off,
@@ -308,7 +316,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;
- hung_task_show_lock = false;
+
rcu_read_lock();
for_each_process_thread(g, t) {
@@ -320,18 +328,23 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
last_break = jiffies;
}
- check_hung_task(t, timeout);
+ check_hung_task(t, timeout, prev_detect_count);
}
unlock:
rcu_read_unlock();
- if (hung_task_show_lock)
- debug_show_all_locks();
- if (hung_task_show_all_bt) {
- hung_task_show_all_bt = false;
- trigger_all_cpu_backtrace();
+ if (!(sysctl_hung_task_detect_count - prev_detect_count))
+ return;
+
+ if (need_warning || hung_task_call_panic) {
+ si_mask |= SYS_INFO_LOCKS;
+
+ if (sysctl_hung_task_all_cpu_backtrace)
+ si_mask |= SYS_INFO_ALL_BT;
}
+ sys_info(si_mask);
+
if (hung_task_call_panic)
panic("hung_task: blocked tasks");
}
@@ -389,7 +402,7 @@ static const struct ctl_table hung_task_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "hung_task_check_count",
@@ -430,6 +443,13 @@ static const struct ctl_table hung_task_sysctls[] = {
.mode = 0444,
.proc_handler = proc_doulongvec_minmax,
},
+ {
+ .procname = "hung_task_sys_info",
+ .data = &hung_task_si_mask,
+ .maxlen = sizeof(hung_task_si_mask),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
};
static void __init hung_task_sysctl_init(void)
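
The former one-shot booleans (hung_task_show_lock, hung_task_show_all_bt) collapse into a single sys_info mask seeded from the new kernel.hung_task_sys_info sysctl; condensed from the code above, the detector now ends up doing:

	unsigned long si_mask = hung_task_si_mask;	/* user-requested dumps */

	if (need_warning || hung_task_call_panic) {
		si_mask |= SYS_INFO_LOCKS;		/* always show lock state */
		if (sysctl_hung_task_all_cpu_backtrace)
			si_mask |= SYS_INFO_ALL_BT;	/* plus per-CPU backtraces */
	}
	sys_info(si_mask);				/* one call emits everything */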
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index bf59e37d650a..3cd0c40282c0 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -650,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
}
#ifdef CONFIG_PM
-static int irq_gc_suspend(void)
+static int irq_gc_suspend(void *data)
{
struct irq_chip_generic *gc;
@@ -670,7 +670,7 @@ static int irq_gc_suspend(void)
return 0;
}
-static void irq_gc_resume(void)
+static void irq_gc_resume(void *data)
{
struct irq_chip_generic *gc;
@@ -693,7 +693,7 @@ static void irq_gc_resume(void)
#define irq_gc_resume NULL
#endif
-static void irq_gc_shutdown(void)
+static void irq_gc_shutdown(void *data)
{
struct irq_chip_generic *gc;
@@ -709,15 +709,19 @@ static void irq_gc_shutdown(void)
}
}
-static struct syscore_ops irq_gc_syscore_ops = {
+static const struct syscore_ops irq_gc_syscore_ops = {
.suspend = irq_gc_suspend,
.resume = irq_gc_resume,
.shutdown = irq_gc_shutdown,
};
+static struct syscore irq_gc_syscore = {
+ .ops = &irq_gc_syscore_ops,
+};
+
static int __init irq_gc_init_ops(void)
{
- register_syscore_ops(&irq_gc_syscore_ops);
+ register_syscore(&irq_gc_syscore);
return 0;
}
device_initcall(irq_gc_init_ops);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f7394729cedc..99ff65466d87 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -211,21 +211,26 @@ void rearm_wake_irq(unsigned int irq)
/**
* irq_pm_syscore_resume - enable interrupt lines early
+ * @data: syscore context
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
*/
-static void irq_pm_syscore_resume(void)
+static void irq_pm_syscore_resume(void *data)
{
resume_irqs(true);
}
-static struct syscore_ops irq_pm_syscore_ops = {
+static const struct syscore_ops irq_pm_syscore_ops = {
.resume = irq_pm_syscore_resume,
};
+static struct syscore irq_pm_syscore = {
+ .ops = &irq_pm_syscore_ops,
+};
+
static int __init irq_pm_init_ops(void)
{
- register_syscore_ops(&irq_pm_syscore_ops);
+ register_syscore(&irq_pm_syscore);
return 0;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index fa00b239c5d9..0f92acdd354d 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -15,6 +15,7 @@
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
+#include <linux/liveupdate.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
@@ -41,6 +42,7 @@
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>
#include <linux/dma-map-ops.h>
+#include <linux/sysfs.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -742,7 +744,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx)
struct kexec_segment *segment = &image->segment[idx];
struct page *cma = image->segment_cma[idx];
char *ptr = page_address(cma);
- unsigned long maddr;
size_t ubytes, mbytes;
int result = 0;
unsigned char __user *buf = NULL;
@@ -754,15 +755,12 @@ static int kimage_load_cma_segment(struct kimage *image, int idx)
buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
- maddr = segment->mem;
/* Then copy from source buffer to the CMA one */
while (mbytes) {
size_t uchunk, mchunk;
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
uchunk = min(ubytes, mchunk);
if (uchunk) {
@@ -784,7 +782,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx)
}
ptr += mchunk;
- maddr += mchunk;
mbytes -= mchunk;
cond_resched();
@@ -839,9 +836,7 @@ static int kimage_load_normal_segment(struct kimage *image, int idx)
ptr = kmap_local_page(page);
/* Start with a clear page */
clear_page(ptr);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
uchunk = min(ubytes, mchunk);
if (uchunk) {
@@ -904,9 +899,7 @@ static int kimage_load_crash_segment(struct kimage *image, int idx)
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap_local_page(page);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
uchunk = min(ubytes, mchunk);
if (mchunk > uchunk) {
/* Zero the trailing part of the page */
@@ -1146,6 +1139,10 @@ int kernel_kexec(void)
goto Unlock;
}
+ error = liveupdate_reboot();
+ if (error)
+ goto Unlock;
+
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
/*
@@ -1229,3 +1226,143 @@ int kernel_kexec(void)
kexec_unlock();
return error;
}
+
+static ssize_t loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!kexec_image);
+}
+static struct kobj_attribute loaded_attr = __ATTR_RO(loaded);
+
+#ifdef CONFIG_CRASH_DUMP
+static ssize_t crash_loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
+}
+static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded);
+
+#ifdef CONFIG_CRASH_RESERVE
+static ssize_t crash_cma_ranges_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+
+ ssize_t len = 0;
+ int i;
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ len += sysfs_emit_at(buf, len, "%08llx-%08llx\n",
+ crashk_cma_ranges[i].start,
+ crashk_cma_ranges[i].end);
+ }
+ return len;
+}
+static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges);
+#endif
+
+static ssize_t crash_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ ssize_t size = crash_get_memory_size();
+
+ if (size < 0)
+ return size;
+
+ return sysfs_emit(buf, "%zd\n", size);
+}
+static ssize_t crash_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long cnt;
+ int ret;
+
+ if (kstrtoul(buf, 0, &cnt))
+ return -EINVAL;
+
+ ret = crash_shrink_memory(cnt);
+ return ret < 0 ? ret : count;
+}
+static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ unsigned int sz = crash_get_elfcorehdr_size();
+
+ return sysfs_emit(buf, "%u\n", sz);
+}
+static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size);
+
+#endif /* CONFIG_CRASH_HOTPLUG */
+#endif /* CONFIG_CRASH_DUMP */
+
+static struct attribute *kexec_attrs[] = {
+ &loaded_attr.attr,
+#ifdef CONFIG_CRASH_DUMP
+ &crash_loaded_attr.attr,
+ &crash_size_attr.attr,
+#ifdef CONFIG_CRASH_RESERVE
+ &crash_cma_ranges_attr.attr,
+#endif
+#ifdef CONFIG_CRASH_HOTPLUG
+ &crash_elfcorehdr_size_attr.attr,
+#endif
+#endif
+ NULL
+};
+
+struct kexec_link_entry {
+ const char *target;
+ const char *name;
+};
+
+static struct kexec_link_entry kexec_links[] = {
+ { "loaded", "kexec_loaded" },
+#ifdef CONFIG_CRASH_DUMP
+ { "crash_loaded", "kexec_crash_loaded" },
+ { "crash_size", "kexec_crash_size" },
+#ifdef CONFIG_CRASH_RESERVE
+	{ "crash_cma_ranges", "kexec_crash_cma_ranges" },
+#endif
+#ifdef CONFIG_CRASH_HOTPLUG
+ { "crash_elfcorehdr_size", "crash_elfcorehdr_size" },
+#endif
+#endif
+};
+
+static struct kobject *kexec_kobj;
+ATTRIBUTE_GROUPS(kexec);
+
+static int __init init_kexec_sysctl(void)
+{
+ int error;
+ int i;
+
+ kexec_kobj = kobject_create_and_add("kexec", kernel_kobj);
+ if (!kexec_kobj) {
+ pr_err("failed to create kexec kobject\n");
+ return -ENOMEM;
+ }
+
+ error = sysfs_create_groups(kexec_kobj, kexec_groups);
+ if (error)
+ goto kset_exit;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_links); i++) {
+ error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj,
+ kexec_links[i].target,
+ kexec_links[i].name);
+ if (error)
+			pr_err("Unable to create %s symlink (%d)\n", kexec_links[i].name, error);
+ }
+
+ return 0;
+
+kset_exit:
+ kobject_put(kexec_kobj);
+ return error;
+}
+
+subsys_initcall(init_kexec_sysctl);
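
The attributes thus move from flat /sys/kernel/kexec_* files into a dedicated /sys/kernel/kexec/ directory, with compat_only_sysfs_link_entry_to_kobj() keeping the old names alive as symlinks. A hedged userspace sketch, with paths derived from the table above:

	#include <stdio.h>

	int main(void)
	{
		char buf[16];
		/* new canonical location */
		FILE *f = fopen("/sys/kernel/kexec/loaded", "r");

		/* legacy name, now a compatibility symlink */
		if (!f)
			f = fopen("/sys/kernel/kexec_loaded", "r");
		if (f && fgets(buf, sizeof(buf), f))
			printf("kexec image loaded: %s", buf);
		if (f)
			fclose(f);
		return 0;
	}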
diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h
deleted file mode 100644
index 3c3c7148ceed..000000000000
--- a/kernel/kexec_handover_internal.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
-#define LINUX_KEXEC_HANDOVER_INTERNAL_H
-
-#include <linux/kexec_handover.h>
-#include <linux/types.h>
-
-extern struct kho_scratch *kho_scratch;
-extern unsigned int kho_scratch_cnt;
-
-#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
-bool kho_scratch_overlap(phys_addr_t phys, size_t size);
-#else
-static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
-{
- return false;
-}
-#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
-
-#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index eefb67d9883c..a9e6354d9e25 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -12,7 +12,7 @@
#include <linux/sysfs.h>
#include <linux/export.h>
#include <linux/init.h>
-#include <linux/kexec.h>
+#include <linux/vmcore_info.h>
#include <linux/profile.h>
#include <linux/stat.h>
#include <linux/sched.h>
@@ -119,50 +119,6 @@ static ssize_t profiling_store(struct kobject *kobj,
KERNEL_ATTR_RW(profiling);
#endif
-#ifdef CONFIG_KEXEC_CORE
-static ssize_t kexec_loaded_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sysfs_emit(buf, "%d\n", !!kexec_image);
-}
-KERNEL_ATTR_RO(kexec_loaded);
-
-#ifdef CONFIG_CRASH_DUMP
-static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
-}
-KERNEL_ATTR_RO(kexec_crash_loaded);
-
-static ssize_t kexec_crash_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- ssize_t size = crash_get_memory_size();
-
- if (size < 0)
- return size;
-
- return sysfs_emit(buf, "%zd\n", size);
-}
-static ssize_t kexec_crash_size_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned long cnt;
- int ret;
-
- if (kstrtoul(buf, 0, &cnt))
- return -EINVAL;
-
- ret = crash_shrink_memory(cnt);
- return ret < 0 ? ret : count;
-}
-KERNEL_ATTR_RW(kexec_crash_size);
-
-#endif /* CONFIG_CRASH_DUMP*/
-#endif /* CONFIG_KEXEC_CORE */
-
#ifdef CONFIG_VMCORE_INFO
static ssize_t vmcoreinfo_show(struct kobject *kobj,
@@ -174,18 +130,6 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
-#ifdef CONFIG_CRASH_HOTPLUG
-static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- unsigned int sz = crash_get_elfcorehdr_size();
-
- return sysfs_emit(buf, "%u\n", sz);
-}
-KERNEL_ATTR_RO(crash_elfcorehdr_size);
-
-#endif
-
#endif /* CONFIG_VMCORE_INFO */
/* whether file capabilities are enabled */
@@ -255,18 +199,8 @@ static struct attribute * kernel_attrs[] = {
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
-#ifdef CONFIG_KEXEC_CORE
- &kexec_loaded_attr.attr,
-#ifdef CONFIG_CRASH_DUMP
- &kexec_crash_loaded_attr.attr,
- &kexec_crash_size_attr.attr,
-#endif
-#endif
#ifdef CONFIG_VMCORE_INFO
&vmcoreinfo_attr.attr,
-#ifdef CONFIG_CRASH_HOTPLUG
- &crash_elfcorehdr_size_attr.attr,
-#endif
#endif
#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
new file mode 100644
index 000000000000..9b2515f31afb
--- /dev/null
+++ b/kernel/liveupdate/Kconfig
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Live Update and Kexec HandOver"
+ depends on !DEFERRED_STRUCT_PAGE_INIT
+
+config KEXEC_HANDOVER
+ bool "kexec handover"
+ depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ depends on !DEFERRED_STRUCT_PAGE_INIT
+ select MEMBLOCK_KHO_SCRATCH
+ select KEXEC_FILE
+ select LIBFDT
+ select CMA
+ help
+ Allow kexec to hand over state across kernels by generating and
+ passing additional metadata to the target kernel. This is useful
+ to keep data or state alive across the kexec. For this to work,
+ both source and target kernels need to have this option enabled.
+
+config KEXEC_HANDOVER_DEBUG
+ bool "Enable Kexec Handover debug checks"
+ depends on KEXEC_HANDOVER
+ help
+ This option enables extra sanity checks for the Kexec Handover
+	  subsystem. Since KHO performance is crucial in live update
+	  scenarios and the extra code might add overhead, it is only
+	  optionally enabled.
+
+config KEXEC_HANDOVER_DEBUGFS
+ bool "kexec handover debugfs interface"
+ default KEXEC_HANDOVER
+ depends on KEXEC_HANDOVER
+ select DEBUG_FS
+ help
+	  Allow controlling the kexec handover device tree via the debugfs
+	  interface, e.g. finalizing the state or aborting the finalization.
+	  Also enables inspecting the KHO FDT trees through the debugfs
+	  binary blobs.
+
+config KEXEC_HANDOVER_ENABLE_DEFAULT
+ bool "Enable kexec handover by default"
+ depends on KEXEC_HANDOVER
+ help
+ Enable Kexec Handover by default. This avoids the need to
+ explicitly pass 'kho=on' on the kernel command line.
+
+ This is useful for systems where KHO is a prerequisite for other
+ features, such as Live Update, ensuring the mechanism is always
+ active.
+
+ The default behavior can still be overridden at boot time by
+ passing 'kho=off'.
+
+config LIVEUPDATE
+ bool "Live Update Orchestrator"
+ depends on KEXEC_HANDOVER
+ help
+ Enable the Live Update Orchestrator. Live Update is a mechanism,
+ typically based on kexec, that allows the kernel to be updated
+ while keeping selected devices operational across the transition.
+ These devices are intended to be reclaimed by the new kernel and
+ re-attached to their original workload without requiring a device
+ reset.
+
+	  The ability to hand over a device from the current to the next
+	  kernel depends on specific support within device drivers and
+	  related kernel subsystems.
+
+ This feature primarily targets virtual machine hosts to quickly update
+ the kernel hypervisor with minimal disruption to the running virtual
+ machines.
+
+ If unsure, say N.
+
+endmenu
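
For reference, a .config fragment enabling the full stack introduced by this menu could look like the following (an illustrative selection, not part of this diff):

	CONFIG_KEXEC_HANDOVER=y
	CONFIG_KEXEC_HANDOVER_DEBUGFS=y
	CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT=y
	CONFIG_LIVEUPDATE=y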
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
new file mode 100644
index 000000000000..7cad2eece32d
--- /dev/null
+++ b/kernel/liveupdate/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+luo-y := \
+ luo_core.o \
+ luo_file.o \
+ luo_session.o
+
+obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o
+
+obj-$(CONFIG_LIVEUPDATE) += luo.o
diff --git a/kernel/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 03d12e27189f..9dc51fab604f 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -4,21 +4,22 @@
* Copyright (C) 2023 Alexander Graf <graf@amazon.com>
* Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
* Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#define pr_fmt(fmt) "KHO: " fmt
#include <linux/cleanup.h>
#include <linux/cma.h>
+#include <linux/kmemleak.h>
#include <linux/count_zeros.h>
-#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
-#include <linux/notifier.h>
#include <linux/page-isolation.h>
+#include <linux/unaligned.h>
#include <linux/vmalloc.h>
#include <asm/early_ioremap.h>
@@ -28,8 +29,9 @@
* KHO is tightly coupled with mm init and needs access to some of mm
* internal APIs.
*/
-#include "../mm/internal.h"
-#include "kexec_internal.h"
+#include "../../mm/internal.h"
+#include "../kexec_internal.h"
+#include "kexec_handover_internal.h"
#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
@@ -51,7 +53,7 @@ union kho_page_info {
static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));
-static bool kho_enable __ro_after_init;
+static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);
bool kho_is_enabled(void)
{
@@ -103,34 +105,19 @@ struct kho_mem_track {
struct khoser_mem_chunk;
-struct kho_serialization {
- struct page *fdt;
- struct list_head fdt_list;
- struct dentry *sub_fdt_dir;
- struct kho_mem_track track;
- /* First chunk of serialized preserved memory map */
- struct khoser_mem_chunk *preserved_mem_map;
-};
-
struct kho_out {
- struct blocking_notifier_head chain_head;
-
- struct dentry *dir;
-
+ void *fdt;
+ bool finalized;
struct mutex lock; /* protects KHO FDT finalization */
- struct kho_serialization ser;
- bool finalized;
+ struct kho_mem_track track;
+ struct kho_debugfs dbg;
};
static struct kho_out kho_out = {
- .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
.lock = __MUTEX_INITIALIZER(kho_out.lock),
- .ser = {
- .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
- .track = {
- .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
- },
+ .track = {
+ .orders = XARRAY_INIT(kho_out.track.orders, 0),
},
.finalized = false,
};
@@ -159,26 +146,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
return no_free_ptr(elm);
}
-static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
- unsigned long end_pfn)
+static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
+ unsigned int order)
{
struct kho_mem_phys_bits *bits;
struct kho_mem_phys *physxa;
+ const unsigned long pfn_high = pfn >> order;
- while (pfn < end_pfn) {
- const unsigned int order =
- min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
- const unsigned long pfn_high = pfn >> order;
+ physxa = xa_load(&track->orders, order);
+ if (WARN_ON_ONCE(!physxa))
+ return;
- physxa = xa_load(&track->orders, order);
- if (WARN_ON_ONCE(!physxa))
- return;
+ bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+ if (WARN_ON_ONCE(!bits))
+ return;
- bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
- if (WARN_ON_ONCE(!bits))
- return;
+ clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+}
- clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
+ unsigned long end_pfn)
+{
+ unsigned int order;
+
+ while (pfn < end_pfn) {
+ order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ __kho_unpreserve_order(track, pfn, order);
pfn += 1 << order;
}
@@ -192,10 +186,6 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
const unsigned long pfn_high = pfn >> order;
might_sleep();
-
- if (kho_out.finalized)
- return -EBUSY;
-
physxa = xa_load(&track->orders, order);
if (!physxa) {
int err;
@@ -229,11 +219,11 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
return 0;
}
-static struct page *kho_restore_page(phys_addr_t phys)
+static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
{
struct page *page = pfn_to_online_page(PHYS_PFN(phys));
+ unsigned int nr_pages, ref_cnt;
union kho_page_info info;
- unsigned int nr_pages;
if (!page)
return NULL;
@@ -253,11 +243,16 @@ static struct page *kho_restore_page(phys_addr_t phys)
/* Head page gets refcount of 1. */
set_page_count(page, 1);
- /* For higher order folios, tail pages get a page count of zero. */
+ /*
+ * For higher order folios, tail pages get a page count of zero.
+	 * For physically contiguous order-0 pages, every page gets a page
+	 * count of 1.
+ */
+ ref_cnt = is_folio ? 0 : 1;
for (unsigned int i = 1; i < nr_pages; i++)
- set_page_count(page + i, 0);
+ set_page_count(page + i, ref_cnt);
- if (info.order > 0)
+ if (is_folio && info.order)
prep_compound_page(page, info.order);
adjust_managed_page_count(page, nr_pages);
@@ -272,7 +267,7 @@ static struct page *kho_restore_page(phys_addr_t phys)
*/
struct folio *kho_restore_folio(phys_addr_t phys)
{
- struct page *page = kho_restore_page(phys);
+ struct page *page = kho_restore_page(phys, true);
return page ? page_folio(page) : NULL;
}
@@ -297,11 +292,10 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
while (pfn < end_pfn) {
const unsigned int order =
min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
- struct page *page = kho_restore_page(PFN_PHYS(pfn));
+ struct page *page = kho_restore_page(PFN_PHYS(pfn), false);
if (!page)
return NULL;
- split_page(page, order);
pfn += 1 << order;
}
@@ -371,11 +365,32 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
struct khoser_mem_chunk *tmp = chunk;
chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
- kfree(tmp);
+ free_page((unsigned long)tmp);
}
}
-static int kho_mem_serialize(struct kho_serialization *ser)
+/*
+ * Update the memory map property; if an old one is found, discard it via
+ * kho_mem_ser_free().
+ */
+static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
+{
+ void *ptr;
+ u64 phys;
+
+ ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL);
+
+ /* Check and discard previous memory map */
+ phys = get_unaligned((u64 *)ptr);
+ if (phys)
+ kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));
+
+ /* Update with the new value */
+ phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
+ put_unaligned(phys, (u64 *)ptr);
+}
+
+static int kho_mem_serialize(struct kho_out *kho_out)
{
struct khoser_mem_chunk *first_chunk = NULL;
struct khoser_mem_chunk *chunk = NULL;
@@ -383,7 +398,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
unsigned long order;
int err = -ENOMEM;
- xa_for_each(&ser->track.orders, order, physxa) {
+ xa_for_each(&kho_out->track.orders, order, physxa) {
struct kho_mem_phys_bits *bits;
unsigned long phys;
@@ -415,7 +430,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
}
}
- ser->preserved_mem_map = first_chunk;
+ kho_update_memory_map(first_chunk);
return 0;
@@ -445,20 +460,27 @@ static void __init deserialize_bitmap(unsigned int order,
}
}
-static void __init kho_mem_deserialize(const void *fdt)
+/* Return true if memory was deserialized */
+static bool __init kho_mem_deserialize(const void *fdt)
{
struct khoser_mem_chunk *chunk;
- const phys_addr_t *mem;
+ const void *mem_ptr;
+ u64 mem;
int len;
- mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
-
- if (!mem || len != sizeof(*mem)) {
+ mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+ if (!mem_ptr || len != sizeof(u64)) {
pr_err("failed to get preserved memory bitmaps\n");
- return;
+ return false;
}
- chunk = *mem ? phys_to_virt(*mem) : NULL;
+ mem = get_unaligned((const u64 *)mem_ptr);
+ chunk = mem ? phys_to_virt(mem) : NULL;
+
+ /* No preserved physical pages were passed, no deserialization */
+ if (!chunk)
+ return false;
+
while (chunk) {
unsigned int i;
@@ -467,6 +489,8 @@ static void __init kho_mem_deserialize(const void *fdt)
&chunk->bitmaps[i]);
chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
}
+
+ return true;
}
/*
@@ -674,40 +698,8 @@ err_disable_kho:
kho_enable = false;
}
-struct fdt_debugfs {
- struct list_head list;
- struct debugfs_blob_wrapper wrapper;
- struct dentry *file;
-};
-
-static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
- const char *name, const void *fdt)
-{
- struct fdt_debugfs *f;
- struct dentry *file;
-
- f = kmalloc(sizeof(*f), GFP_KERNEL);
- if (!f)
- return -ENOMEM;
-
- f->wrapper.data = (void *)fdt;
- f->wrapper.size = fdt_totalsize(fdt);
-
- file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
- if (IS_ERR(file)) {
- kfree(f);
- return PTR_ERR(file);
- }
-
- f->file = file;
- list_add(&f->list, list);
-
- return 0;
-}
-
/**
* kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
- * @ser: serialization control object passed by KHO notifiers.
* @name: name of the sub tree.
* @fdt: the sub tree blob.
*
@@ -716,38 +708,76 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
* by KHO for the new kernel to retrieve it after kexec.
*
* A debugfs blob entry is also created at
- * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
+ * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when the kernel is configured
+ * with CONFIG_KEXEC_HANDOVER_DEBUGFS.
*
* Return: 0 on success, error code on failure
*/
-int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
+int kho_add_subtree(const char *name, void *fdt)
{
- int err = 0;
- u64 phys = (u64)virt_to_phys(fdt);
- void *root = page_to_virt(ser->fdt);
+ phys_addr_t phys = virt_to_phys(fdt);
+ void *root_fdt = kho_out.fdt;
+ int err = -ENOMEM;
+ int off, fdt_err;
- err |= fdt_begin_node(root, name);
- err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
- err |= fdt_end_node(root);
+ guard(mutex)(&kho_out.lock);
- if (err)
+ fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
+ if (fdt_err < 0)
return err;
- return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
+ off = fdt_add_subnode(root_fdt, 0, name);
+ if (off < 0) {
+ if (off == -FDT_ERR_EXISTS)
+ err = -EEXIST;
+ goto out_pack;
+ }
+
+ err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys));
+ if (err < 0)
+ goto out_pack;
+
+ WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));
+
+out_pack:
+ fdt_pack(root_fdt);
+
+ return err;
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
-int register_kho_notifier(struct notifier_block *nb)
+void kho_remove_subtree(void *fdt)
{
- return blocking_notifier_chain_register(&kho_out.chain_head, nb);
-}
-EXPORT_SYMBOL_GPL(register_kho_notifier);
+ phys_addr_t target_phys = virt_to_phys(fdt);
+ void *root_fdt = kho_out.fdt;
+ int off;
+ int err;
-int unregister_kho_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
+ guard(mutex)(&kho_out.lock);
+
+ err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
+ if (err < 0)
+ return;
+
+ for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
+ off = fdt_next_subnode(root_fdt, off)) {
+ const u64 *val;
+ int len;
+
+ val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len);
+ if (!val || len != sizeof(phys_addr_t))
+ continue;
+
+ if ((phys_addr_t)*val == target_phys) {
+ fdt_del_node(root_fdt, off);
+ kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
+ break;
+ }
+ }
+
+ fdt_pack(root_fdt);
}
-EXPORT_SYMBOL_GPL(unregister_kho_notifier);
+EXPORT_SYMBOL_GPL(kho_remove_subtree);
/**
* kho_preserve_folio - preserve a folio across kexec.
@@ -762,7 +792,7 @@ int kho_preserve_folio(struct folio *folio)
{
const unsigned long pfn = folio_pfn(folio);
const unsigned int order = folio_order(folio);
- struct kho_mem_track *track = &kho_out.ser.track;
+ struct kho_mem_track *track = &kho_out.track;
if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
return -EINVAL;
@@ -772,6 +802,24 @@ int kho_preserve_folio(struct folio *folio)
EXPORT_SYMBOL_GPL(kho_preserve_folio);
/**
+ * kho_unpreserve_folio - unpreserve a folio.
+ * @folio: folio to unpreserve.
+ *
+ * Instructs KHO to unpreserve a folio that was preserved by
+ * kho_preserve_folio() before. The provided @folio (pfn and order)
+ * must exactly match a previously preserved folio.
+ */
+void kho_unpreserve_folio(struct folio *folio)
+{
+ const unsigned long pfn = folio_pfn(folio);
+ const unsigned int order = folio_order(folio);
+ struct kho_mem_track *track = &kho_out.track;
+
+ __kho_unpreserve_order(track, pfn, order);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
+
+/**
* kho_preserve_pages - preserve contiguous pages across kexec
* @page: first page in the list.
* @nr_pages: number of pages.
@@ -783,7 +831,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio);
*/
int kho_preserve_pages(struct page *page, unsigned int nr_pages)
{
- struct kho_mem_track *track = &kho_out.ser.track;
+ struct kho_mem_track *track = &kho_out.track;
const unsigned long start_pfn = page_to_pfn(page);
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn = start_pfn;
@@ -815,6 +863,26 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);
+/**
+ * kho_unpreserve_pages - unpreserve contiguous pages.
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
+ *
+ * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
+ * This must be called with the same @page and @nr_pages as the corresponding
+ * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
+ * preserved blocks is not supported.
+ */
+void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+{
+ struct kho_mem_track *track = &kho_out.track;
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+
+ __kho_unpreserve(track, start_pfn, end_pfn);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
+
struct kho_vmalloc_hdr {
DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
};
@@ -885,7 +953,7 @@ err_free:
static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
unsigned short order)
{
- struct kho_mem_track *track = &kho_out.ser.track;
+ struct kho_mem_track *track = &kho_out.track;
unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
__kho_unpreserve(track, pfn, pfn + 1);
@@ -896,20 +964,6 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
}
}
-static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
-{
- struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first);
-
- while (chunk) {
- struct kho_vmalloc_chunk *tmp = chunk;
-
- kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order);
-
- chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
- free_page((unsigned long)tmp);
- }
-}
-
/**
* kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
* @ptr: pointer to the area in vmalloc address space
@@ -971,12 +1025,34 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
return 0;
err_free:
- kho_vmalloc_free_chunks(preservation);
+ kho_unpreserve_vmalloc(preservation);
return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
/**
+ * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
+ * @preservation: preservation metadata returned by kho_preserve_vmalloc()
+ *
+ * Instructs KHO to unpreserve the area in vmalloc address space that was
+ * previously preserved with kho_preserve_vmalloc().
+ */
+void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+
+ while (chunk) {
+ struct kho_vmalloc_chunk *tmp = chunk;
+
+ kho_vmalloc_unpreserve_chunk(chunk, preservation->order);
+
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ free_page((unsigned long)tmp);
+ }
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);
+
+/**
* kho_restore_vmalloc - recreates and populates an area in vmalloc address
* space from the preserved memory.
* @preservation: preservation metadata.
@@ -1024,7 +1100,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
goto err_free_pages_array;
for (int j = 0; j < contig_pages; j++)
- pages[idx++] = page;
+ pages[idx++] = page + j;
phys += contig_pages * PAGE_SIZE;
}
@@ -1065,217 +1141,122 @@ err_free_pages_array:
}
EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
-/* Handling for debug/kho/out */
-
-static struct dentry *debugfs_root;
-
-static int kho_out_update_debugfs_fdt(void)
-{
- int err = 0;
- struct fdt_debugfs *ff, *tmp;
-
- if (kho_out.finalized) {
- err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
- "fdt", page_to_virt(kho_out.ser.fdt));
- } else {
- list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
- debugfs_remove(ff->file);
- list_del(&ff->list);
- kfree(ff);
- }
- }
-
- return err;
-}
-
-static int kho_abort(void)
+/**
+ * kho_alloc_preserve - Allocate, zero, and preserve memory.
+ * @size: The number of bytes to allocate.
+ *
+ * Allocates a physically contiguous block of zeroed pages that is large
+ * enough to hold @size bytes. The allocated memory is then registered with
+ * KHO for preservation across a kexec.
+ *
+ * Note: The actual allocated size will be rounded up to the nearest
+ * power-of-two page boundary.
+ *
+ * Return: A virtual pointer to the allocated and preserved memory on success,
+ *	   or an ERR_PTR() encoded error on failure.
+ */
+void *kho_alloc_preserve(size_t size)
{
- int err;
- unsigned long order;
- struct kho_mem_phys *physxa;
+ struct folio *folio;
+ int order, ret;
- xa_for_each(&kho_out.ser.track.orders, order, physxa) {
- struct kho_mem_phys_bits *bits;
- unsigned long phys;
+ if (!size)
+ return ERR_PTR(-EINVAL);
- xa_for_each(&physxa->phys_bits, phys, bits)
- kfree(bits);
+ order = get_order(size);
+ if (order > MAX_PAGE_ORDER)
+ return ERR_PTR(-E2BIG);
- xa_destroy(&physxa->phys_bits);
- kfree(physxa);
- }
- xa_destroy(&kho_out.ser.track.orders);
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+ if (!folio)
+ return ERR_PTR(-ENOMEM);
- if (kho_out.ser.preserved_mem_map) {
- kho_mem_ser_free(kho_out.ser.preserved_mem_map);
- kho_out.ser.preserved_mem_map = NULL;
+ ret = kho_preserve_folio(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
}
- err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
- NULL);
- err = notifier_to_errno(err);
-
- if (err)
- pr_err("Failed to abort KHO finalization: %d\n", err);
-
- return err;
+ return folio_address(folio);
}
+EXPORT_SYMBOL_GPL(kho_alloc_preserve);
-static int kho_finalize(void)
+/**
+ * kho_unpreserve_free - Unpreserve and free memory.
+ * @mem: Pointer to the memory allocated by kho_alloc_preserve().
+ *
+ * Unregisters the memory from KHO preservation and frees the underlying
+ * pages back to the system. This function should be called to clean up
+ * memory allocated with kho_alloc_preserve().
+ */
+void kho_unpreserve_free(void *mem)
{
- int err = 0;
- u64 *preserved_mem_map;
- void *fdt = page_to_virt(kho_out.ser.fdt);
-
- err |= fdt_create(fdt, PAGE_SIZE);
- err |= fdt_finish_reservemap(fdt);
- err |= fdt_begin_node(fdt, "");
- err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
- /**
- * Reserve the preserved-memory-map property in the root FDT, so
- * that all property definitions will precede subnodes created by
- * KHO callers.
- */
- err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
- sizeof(*preserved_mem_map),
- (void **)&preserved_mem_map);
- if (err)
- goto abort;
-
- err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
- if (err)
- goto abort;
-
- err = blocking_notifier_call_chain(&kho_out.chain_head,
- KEXEC_KHO_FINALIZE, &kho_out.ser);
- err = notifier_to_errno(err);
- if (err)
- goto abort;
-
- err = kho_mem_serialize(&kho_out.ser);
- if (err)
- goto abort;
-
- *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);
-
- err |= fdt_end_node(fdt);
- err |= fdt_finish(fdt);
+ struct folio *folio;
-abort:
- if (err) {
- pr_err("Failed to convert KHO state tree: %d\n", err);
- kho_abort();
- }
+ if (!mem)
+ return;
- return err;
+ folio = virt_to_folio(mem);
+ kho_unpreserve_folio(folio);
+ folio_put(folio);
}
+EXPORT_SYMBOL_GPL(kho_unpreserve_free);
-static int kho_out_finalize_get(void *data, u64 *val)
+/**
+ * kho_restore_free - Restore and free memory after kexec.
+ * @mem: Pointer to the memory (in the new kernel's address space)
+ * that was allocated by the old kernel.
+ *
+ * This function is intended to be called in the new kernel (post-kexec)
+ * to take ownership of and free a memory region that was preserved by the
+ * old kernel using kho_alloc_preserve().
+ *
+ * It first restores the pages from KHO (using their physical address)
+ * and then frees the pages back to the new kernel's page allocator.
+ */
+void kho_restore_free(void *mem)
{
- mutex_lock(&kho_out.lock);
- *val = kho_out.finalized;
- mutex_unlock(&kho_out.lock);
+ struct folio *folio;
- return 0;
+ if (!mem)
+ return;
+
+ folio = kho_restore_folio(__pa(mem));
+ if (!WARN_ON(!folio))
+ folio_put(folio);
}
+EXPORT_SYMBOL_GPL(kho_restore_free);
-static int kho_out_finalize_set(void *data, u64 _val)
+int kho_finalize(void)
{
- int ret = 0;
- bool val = !!_val;
-
- mutex_lock(&kho_out.lock);
-
- if (val == kho_out.finalized) {
- if (kho_out.finalized)
- ret = -EEXIST;
- else
- ret = -ENOENT;
- goto unlock;
- }
+ int ret;
- if (val)
- ret = kho_finalize();
- else
- ret = kho_abort();
+ if (!kho_enable)
+ return -EOPNOTSUPP;
+ guard(mutex)(&kho_out.lock);
+ ret = kho_mem_serialize(&kho_out);
if (ret)
- goto unlock;
-
- kho_out.finalized = val;
- ret = kho_out_update_debugfs_fdt();
-
-unlock:
- mutex_unlock(&kho_out.lock);
- return ret;
-}
-
-DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
- kho_out_finalize_set, "%llu\n");
+ return ret;
-static int scratch_phys_show(struct seq_file *m, void *v)
-{
- for (int i = 0; i < kho_scratch_cnt; i++)
- seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+ kho_out.finalized = true;
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(scratch_phys);
-static int scratch_len_show(struct seq_file *m, void *v)
+bool kho_finalized(void)
{
- for (int i = 0; i < kho_scratch_cnt; i++)
- seq_printf(m, "0x%llx\n", kho_scratch[i].size);
-
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(scratch_len);
-
-static __init int kho_out_debugfs_init(void)
-{
- struct dentry *dir, *f, *sub_fdt_dir;
-
- dir = debugfs_create_dir("out", debugfs_root);
- if (IS_ERR(dir))
- return -ENOMEM;
-
- sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
- if (IS_ERR(sub_fdt_dir))
- goto err_rmdir;
-
- f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
- &scratch_phys_fops);
- if (IS_ERR(f))
- goto err_rmdir;
-
- f = debugfs_create_file("scratch_len", 0400, dir, NULL,
- &scratch_len_fops);
- if (IS_ERR(f))
- goto err_rmdir;
-
- f = debugfs_create_file("finalize", 0600, dir, NULL,
- &fops_kho_out_finalize);
- if (IS_ERR(f))
- goto err_rmdir;
-
- kho_out.dir = dir;
- kho_out.ser.sub_fdt_dir = sub_fdt_dir;
- return 0;
-
-err_rmdir:
- debugfs_remove_recursive(dir);
- return -ENOENT;
+ guard(mutex)(&kho_out.lock);
+ return kho_out.finalized;
}
struct kho_in {
- struct dentry *dir;
phys_addr_t fdt_phys;
phys_addr_t scratch_phys;
- struct list_head fdt_list;
+ struct kho_debugfs dbg;
};
static struct kho_in kho_in = {
- .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};
static const void *kho_get_fdt(void)
@@ -1339,91 +1320,52 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
-/* Handling for debugfs/kho/in */
-
-static __init int kho_in_debugfs_init(const void *fdt)
+static __init int kho_out_fdt_setup(void)
{
- struct dentry *sub_fdt_dir;
- int err, child;
-
- kho_in.dir = debugfs_create_dir("in", debugfs_root);
- if (IS_ERR(kho_in.dir))
- return PTR_ERR(kho_in.dir);
-
- sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
- if (IS_ERR(sub_fdt_dir)) {
- err = PTR_ERR(sub_fdt_dir);
- goto err_rmdir;
- }
-
- err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
- if (err)
- goto err_rmdir;
-
- fdt_for_each_subnode(child, fdt, 0) {
- int len = 0;
- const char *name = fdt_get_name(fdt, child, NULL);
- const u64 *fdt_phys;
-
- fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
- if (!fdt_phys)
- continue;
- if (len != sizeof(*fdt_phys)) {
- pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
- name, len);
- continue;
- }
- err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
- phys_to_virt(*fdt_phys));
- if (err) {
- pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
- err);
- continue;
- }
- }
+ void *root = kho_out.fdt;
+ u64 empty_mem_map = 0;
+ int err;
- return 0;
+ err = fdt_create(root, PAGE_SIZE);
+ err |= fdt_finish_reservemap(root);
+ err |= fdt_begin_node(root, "");
+ err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
+ err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map,
+ sizeof(empty_mem_map));
+ err |= fdt_end_node(root);
+ err |= fdt_finish(root);
-err_rmdir:
- debugfs_remove_recursive(kho_in.dir);
return err;
}
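+/*
+ * Rendered as DTS, the tree built by kho_out_fdt_setup() looks roughly like
+ * the sketch below (macro names shown in place of their string values; the
+ * memory-map property starts as a zero placeholder and is expected to be
+ * updated when kho_finalize() serializes the preserved memory):
+ *
+ *	/ {
+ *		compatible = KHO_FDT_COMPATIBLE;
+ *		PROP_PRESERVED_MEMORY_MAP = <0x0 0x0>;
+ *	};
+ */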
static __init int kho_init(void)
{
- int err = 0;
const void *fdt = kho_get_fdt();
+ int err = 0;
if (!kho_enable)
return 0;
- kho_out.ser.fdt = alloc_page(GFP_KERNEL);
- if (!kho_out.ser.fdt) {
- err = -ENOMEM;
+ kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
+ if (IS_ERR(kho_out.fdt)) {
+ err = PTR_ERR(kho_out.fdt);
goto err_free_scratch;
}
- debugfs_root = debugfs_create_dir("kho", NULL);
- if (IS_ERR(debugfs_root)) {
- err = -ENOENT;
+ err = kho_debugfs_init();
+ if (err)
goto err_free_fdt;
- }
- err = kho_out_debugfs_init();
+ err = kho_out_debugfs_init(&kho_out.dbg);
if (err)
goto err_free_fdt;
- if (fdt) {
- err = kho_in_debugfs_init(fdt);
- /*
- * Failure to create /sys/kernel/debug/kho/in does not prevent
- * reviving state from KHO and setting up KHO for the next
- * kexec.
- */
- if (err)
- pr_err("failed exposing handover FDT in debugfs: %d\n",
- err);
+ err = kho_out_fdt_setup();
+ if (err)
+ goto err_free_fdt;
+ if (fdt) {
+ kho_in_debugfs_init(&kho_in.dbg, fdt);
return 0;
}
@@ -1432,17 +1374,29 @@ static __init int kho_init(void)
unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
unsigned long pfn;
+ /*
+ * When debug_pagealloc is enabled, __free_pages() clears the
+ * corresponding PRESENT bit in the kernel page table.
+ * Subsequent kmemleak scans of these pages cause the
+ * non-PRESENT page faults.
+ * Mark scratch areas with kmemleak_ignore_phys() to exclude
+ * them from kmemleak scanning.
+ */
+ kmemleak_ignore_phys(kho_scratch[i].addr);
for (pfn = base_pfn; pfn < base_pfn + count;
pfn += pageblock_nr_pages)
init_cma_reserved_pageblock(pfn_to_page(pfn));
}
+ WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
+ kho_out.fdt, true));
+
return 0;
err_free_fdt:
- put_page(kho_out.ser.fdt);
- kho_out.ser.fdt = NULL;
+ kho_unpreserve_free(kho_out.fdt);
err_free_scratch:
+ kho_out.fdt = NULL;
for (int i = 0; i < kho_scratch_cnt; i++) {
void *start = __va(kho_scratch[i].addr);
void *end = start + kho_scratch[i].size;
@@ -1452,7 +1406,7 @@ err_free_scratch:
kho_enable = false;
return err;
}
-late_initcall(kho_init);
+fs_initcall(kho_init);
static void __init kho_release_scratch(void)
{
@@ -1480,16 +1434,12 @@ static void __init kho_release_scratch(void)
void __init kho_memory_init(void)
{
- struct folio *folio;
-
if (kho_in.scratch_phys) {
kho_scratch = phys_to_virt(kho_in.scratch_phys);
kho_release_scratch();
- kho_mem_deserialize(kho_get_fdt());
- folio = kho_restore_folio(kho_in.fdt_phys);
- if (!folio)
- pr_warn("failed to restore folio for KHO fdt\n");
+ if (!kho_mem_deserialize(kho_get_fdt()))
+ kho_in.fdt_phys = 0;
} else {
kho_reserve_scratch();
}
@@ -1545,8 +1495,8 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
memblock_add(area->addr, size);
err = memblock_mark_kho_scratch(area->addr, size);
if (WARN_ON(err)) {
- pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
- &area->addr, &size, err);
+ pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
+ &area->addr, &size, ERR_PTR(err));
goto out;
}
pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
@@ -1566,7 +1516,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
kho_in.fdt_phys = fdt_phys;
kho_in.scratch_phys = scratch_phys;
kho_scratch_cnt = scratch_cnt;
- pr_info("found kexec handover data. Will skip init for some devices\n");
+ pr_info("found kexec handover data.\n");
out:
if (fdt)
@@ -1585,10 +1535,10 @@ int kho_fill_kimage(struct kimage *image)
int err = 0;
struct kexec_buf scratch;
- if (!kho_out.finalized)
+ if (!kho_enable)
return 0;
- image->kho.fdt = page_to_phys(kho_out.ser.fdt);
+ image->kho.fdt = virt_to_phys(kho_out.fdt);
scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
scratch = (struct kexec_buf){
diff --git a/kernel/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c
index 6efb696f5426..6efb696f5426 100644
--- a/kernel/kexec_handover_debug.c
+++ b/kernel/liveupdate/kexec_handover_debug.c
diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c
new file mode 100644
index 000000000000..2abbf62ba942
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_debugfs.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover_debugfs.c - kexec handover debugfs interfaces
+ * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
+ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
+ * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/libfdt.h>
+#include <linux/mm.h>
+#include "kexec_handover_internal.h"
+
+static struct dentry *debugfs_root;
+
+struct fdt_debugfs {
+ struct list_head list;
+ struct debugfs_blob_wrapper wrapper;
+ struct dentry *file;
+};
+
+static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
+ const char *name, const void *fdt)
+{
+ struct fdt_debugfs *f;
+ struct dentry *file;
+
+ f = kmalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ f->wrapper.data = (void *)fdt;
+ f->wrapper.size = fdt_totalsize(fdt);
+
+ file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
+ if (IS_ERR(file)) {
+ kfree(f);
+ return PTR_ERR(file);
+ }
+
+ f->file = file;
+ list_add(&f->list, list);
+
+ return 0;
+}
+
+int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root)
+{
+ struct dentry *dir;
+
+ if (root)
+ dir = dbg->dir;
+ else
+ dir = dbg->sub_fdt_dir;
+
+ return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt);
+}
+
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt)
+{
+ struct fdt_debugfs *ff;
+
+ list_for_each_entry(ff, &dbg->fdt_list, list) {
+ if (ff->wrapper.data == fdt) {
+ debugfs_remove(ff->file);
+ list_del(&ff->list);
+ kfree(ff);
+ break;
+ }
+ }
+}
+
+static int kho_out_finalize_get(void *data, u64 *val)
+{
+ *val = kho_finalized();
+
+ return 0;
+}
+
+static int kho_out_finalize_set(void *data, u64 val)
+{
+ if (val)
+ return kho_finalize();
+ else
+ return -EINVAL;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get,
+ kho_out_finalize_set, "%llu\n");
+
+static int scratch_phys_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_phys);
+
+static int scratch_len_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].size);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_len);
+
+__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt)
+{
+ struct dentry *dir, *sub_fdt_dir;
+ int err, child;
+
+ INIT_LIST_HEAD(&dbg->fdt_list);
+
+ dir = debugfs_create_dir("in", debugfs_root);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto err_out;
+ }
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir)) {
+ err = PTR_ERR(sub_fdt_dir);
+ goto err_rmdir;
+ }
+
+ err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt);
+ if (err)
+ goto err_rmdir;
+
+ fdt_for_each_subnode(child, fdt, 0) {
+ int len = 0;
+ const char *name = fdt_get_name(fdt, child, NULL);
+ const u64 *fdt_phys;
+
+ fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
+ if (!fdt_phys)
+ continue;
+ if (len != sizeof(*fdt_phys)) {
+ pr_warn("node %s prop fdt has invalid length: %d\n",
+ name, len);
+ continue;
+ }
+ err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name,
+ phys_to_virt(*fdt_phys));
+ if (err) {
+ pr_warn("failed to add fdt %s to debugfs: %pe\n", name,
+ ERR_PTR(err));
+ continue;
+ }
+ }
+
+ dbg->dir = dir;
+ dbg->sub_fdt_dir = sub_fdt_dir;
+
+ return;
+err_rmdir:
+ debugfs_remove_recursive(dir);
+err_out:
+ /*
+ * Failure to create /sys/kernel/debug/kho/in does not prevent
+ * reviving state from KHO and setting up KHO for the next
+ * kexec.
+ */
+ if (err) {
+ pr_err("failed exposing handover FDT in debugfs: %pe\n",
+ ERR_PTR(err));
+ }
+}
+
+__init int kho_out_debugfs_init(struct kho_debugfs *dbg)
+{
+ struct dentry *dir, *f, *sub_fdt_dir;
+
+ INIT_LIST_HEAD(&dbg->fdt_list);
+
+ dir = debugfs_create_dir("out", debugfs_root);
+ if (IS_ERR(dir))
+ return -ENOMEM;
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
+ &scratch_phys_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_len", 0400, dir, NULL,
+ &scratch_len_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("finalize", 0600, dir, NULL,
+ &kho_out_finalize_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ dbg->dir = dir;
+ dbg->sub_fdt_dir = sub_fdt_dir;
+ return 0;
+
+err_rmdir:
+ debugfs_remove_recursive(dir);
+ return -ENOENT;
+}
+
+__init int kho_debugfs_init(void)
+{
+ debugfs_root = debugfs_create_dir("kho", NULL);
+ if (IS_ERR(debugfs_root))
+ return -ENOENT;
+ return 0;
+}
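+/*
+ * Resulting debugfs layout (illustrative; the "in" side exists only when the
+ * kernel booted with handover data, and out/fdt is added later by
+ * kho_init()):
+ *
+ *	/sys/kernel/debug/kho/
+ *	├── in/
+ *	│   ├── fdt
+ *	│   └── sub_fdts/
+ *	└── out/
+ *	    ├── fdt
+ *	    ├── finalize
+ *	    ├── scratch_len
+ *	    ├── scratch_phys
+ *	    └── sub_fdts/
+ */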
diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h
new file mode 100644
index 000000000000..0202c85ad14f
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_internal.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
+#define LINUX_KEXEC_HANDOVER_INTERNAL_H
+
+#include <linux/kexec_handover.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
+#include <linux/debugfs.h>
+
+struct kho_debugfs {
+ struct dentry *dir;
+ struct dentry *sub_fdt_dir;
+ struct list_head fdt_list;
+};
+
+#else
+struct kho_debugfs {};
+#endif
+
+extern struct kho_scratch *kho_scratch;
+extern unsigned int kho_scratch_cnt;
+
+bool kho_finalized(void);
+int kho_finalize(void);
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
+int kho_debugfs_init(void);
+void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt);
+int kho_out_debugfs_init(struct kho_debugfs *dbg);
+int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root);
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt);
+#else
+static inline int kho_debugfs_init(void) { return 0; }
+static inline void kho_in_debugfs_init(struct kho_debugfs *dbg,
+ const void *fdt) { }
+static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; }
+static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root) { return 0; }
+static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg,
+ void *fdt) { }
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
+bool kho_scratch_overlap(phys_addr_t phys, size_t size);
+#else
+static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
+
+#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
new file mode 100644
index 000000000000..f7ecaf7740d1
--- /dev/null
+++ b/kernel/liveupdate/luo_core.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator (LUO)
+ *
+ * Live Update is a specialized, kexec-based reboot process that allows a
+ * running kernel to be updated from one version to another while preserving
+ * the state of selected resources and keeping designated hardware devices
+ * operational. For these devices, DMA activity may continue throughout the
+ * kernel transition.
+ *
+ * While the primary use case driving this work is supporting live updates of
+ * the Linux kernel when it is used as a hypervisor in cloud environments, the
+ * LUO framework itself is designed to be workload-agnostic. Live Update
+ * facilitates a full kernel version upgrade for any type of system.
+ *
+ * For example, a non-hypervisor system running an in-memory cache like
+ * memcached with many gigabytes of data can use LUO. The userspace service
+ * can place its cache into a memfd, have its state preserved by LUO, and
+ * restore it immediately after the kernel kexec.
+ *
+ * Whether the system is running virtual machines, containers, a
+ * high-performance database, or networking services, LUO's primary goal is to
+ * enable a full kernel update by preserving critical userspace state and
+ * keeping essential devices operational.
+ *
+ * The core of LUO is a mechanism that tracks the progress of a live update,
+ * along with a callback API that allows other kernel subsystems to participate
+ * in the process. Example subsystems that can hook into LUO include: kvm,
+ * iommu, interrupts, vfio, participating filesystems, and memory management.
+ *
+ * LUO uses Kexec Handover to transfer memory state from the current kernel to
+ * the next kernel. For more details see
+ * Documentation/core-api/kho/concepts.rst.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/kobject.h>
+#include <linux/libfdt.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+#include "kexec_handover_internal.h"
+#include "luo_internal.h"
+
+static struct {
+ bool enabled;
+ void *fdt_out;
+ void *fdt_in;
+ u64 liveupdate_num;
+} luo_global;
+
+static int __init early_liveupdate_param(char *buf)
+{
+ return kstrtobool(buf, &luo_global.enabled);
+}
+early_param("liveupdate", early_liveupdate_param);
+
+static int __init luo_early_startup(void)
+{
+ phys_addr_t fdt_phys;
+ int err, ln_size;
+ const void *ptr;
+
+ if (!kho_is_enabled()) {
+ if (liveupdate_enabled())
+ pr_warn("Disabling liveupdate because KHO is disabled\n");
+ luo_global.enabled = false;
+ return 0;
+ }
+
+ /* Retrieve LUO subtree, and verify its format. */
+ err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT) {
+ pr_err("failed to retrieve FDT '%s' from KHO: %pe\n",
+ LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+ }
+
+ luo_global.fdt_in = phys_to_virt(fdt_phys);
+ err = fdt_node_check_compatible(luo_global.fdt_in, 0,
+ LUO_FDT_COMPATIBLE);
+ if (err) {
+ pr_err("FDT '%s' is incompatible with '%s' [%d]\n",
+ LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err);
+
+ return -EINVAL;
+ }
+
+ ln_size = 0;
+ ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM,
+ &ln_size);
+ if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) {
+ pr_err("Unable to get live update number '%s' [%d]\n",
+ LUO_FDT_LIVEUPDATE_NUM, ln_size);
+
+ return -EINVAL;
+ }
+
+ luo_global.liveupdate_num = get_unaligned((u64 *)ptr);
+ pr_info("Retrieved live update data, liveupdate number: %lld\n",
+ luo_global.liveupdate_num);
+
+ err = luo_session_setup_incoming(luo_global.fdt_in);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int __init liveupdate_early_init(void)
+{
+ int err;
+
+ err = luo_early_startup();
+ if (err) {
+ luo_global.enabled = false;
+ luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n",
+ ERR_PTR(err));
+ }
+
+ return err;
+}
+early_initcall(liveupdate_early_init);
+
+/* Called during boot to create outgoing LUO fdt tree */
+static int __init luo_fdt_setup(void)
+{
+ const u64 ln = luo_global.liveupdate_num + 1;
+ void *fdt_out;
+ int err;
+
+ fdt_out = kho_alloc_preserve(LUO_FDT_SIZE);
+ if (IS_ERR(fdt_out)) {
+ pr_err("failed to allocate/preserve FDT memory\n");
+ return PTR_ERR(fdt_out);
+ }
+
+ err = fdt_create(fdt_out, LUO_FDT_SIZE);
+ err |= fdt_finish_reservemap(fdt_out);
+ err |= fdt_begin_node(fdt_out, "");
+ err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+ err |= luo_session_setup_outgoing(fdt_out);
+ err |= fdt_end_node(fdt_out);
+ err |= fdt_finish(fdt_out);
+ if (err)
+ goto exit_free;
+
+ err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out);
+ if (err)
+ goto exit_free;
+ luo_global.fdt_out = fdt_out;
+
+ return 0;
+
+exit_free:
+ kho_unpreserve_free(fdt_out);
+ pr_err("failed to prepare LUO FDT: %d\n", err);
+
+ return err;
+}
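+/*
+ * Sketch of the resulting outgoing tree as DTS (macro names shown in place
+ * of their string values; the session node added by
+ * luo_session_setup_outgoing() is elided):
+ *
+ *	/ {
+ *		compatible = LUO_FDT_COMPATIBLE;
+ *		LUO_FDT_LIVEUPDATE_NUM = <ln>;
+ *	};
+ */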
+
+/*
+ * late initcall because it initializes the outgoing tree that is needed only
+ * once userspace starts using /dev/liveupdate.
+ */
+static int __init luo_late_startup(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_fdt_setup();
+ if (err)
+ luo_global.enabled = false;
+
+ return err;
+}
+late_initcall(luo_late_startup);
+
+/* Public Functions */
+
+/**
+ * liveupdate_reboot() - Kernel reboot notifier for live update final
+ * serialization.
+ *
+ * This function is invoked directly from the reboot() syscall pathway
+ * if kexec is in progress.
+ *
+ * If any callback fails, this function aborts KHO, undoes the freeze()
+ * callbacks, and returns an error.
+ */
+int liveupdate_reboot(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_session_serialize();
+ if (err)
+ return err;
+
+ err = kho_finalize();
+ if (err) {
+ pr_err("kho_finalize failed %d\n", err);
+ /*
+		 * kho_finalize() may return libfdt errors; to avoid passing
+		 * unknown errors to userspace, change this to EAGAIN.
+ */
+ err = -EAGAIN;
+ }
+
+ return err;
+}
+
+/**
+ * liveupdate_enabled - Check if the live update feature is enabled.
+ *
+ * This function returns the state of the live update feature flag, which
+ * can be controlled via the ``liveupdate`` kernel command-line parameter.
+ *
+ * Return: true if live update is enabled, false otherwise.
+ */
+bool liveupdate_enabled(void)
+{
+ return luo_global.enabled;
+}
+
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a character device, typically found at ``/dev/liveupdate``,
+ * which allows a userspace agent to manage the LUO state machine and its
+ * associated resources, such as preservable file descriptors.
+ *
+ * To ensure that the state machine is controlled by a single entity, access
+ * to this device is exclusive: only one process is permitted to have
+ * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will
+ * fail with -EBUSY until the first process closes its file descriptor.
+ * This singleton model simplifies state management by preventing conflicting
+ * commands from multiple userspace agents.
+ */
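+/*
+ * A minimal userspace sketch (hypothetical: the exact uAPI layout lives in
+ * the header; the leading .size field is inferred from the get_user() of the
+ * first u32 in luo_ioctl() below):
+ *
+ *	int lu = open("/dev/liveupdate", O_RDWR);	// -EBUSY if already open
+ *	struct liveupdate_ioctl_create_session c = {
+ *		.size = sizeof(c),
+ *		.name = "db-cache",
+ *	};
+ *
+ *	if (!ioctl(lu, LIVEUPDATE_IOCTL_CREATE_SESSION, &c))
+ *		use(c.fd);	// c.fd now refers to the new session
+ */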
+
+struct luo_device_state {
+ struct miscdevice miscdev;
+ atomic_t in_use;
+};
+
+static int luo_ioctl_create_session(struct luo_ucmd *ucmd)
+{
+ struct liveupdate_ioctl_create_session *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ err = luo_session_create(argp->name, &file);
+ if (err)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd)
+{
+ struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ err = luo_session_retrieve(argp->name, &file);
+ if (err < 0)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+ struct luo_device_state *ldev = container_of(filep->private_data,
+ struct luo_device_state,
+ miscdev);
+
+ if (atomic_cmpxchg(&ldev->in_use, 0, 1))
+ return -EBUSY;
+
+	/* Always return -EIO to user if deserialization fails */
+ if (luo_session_deserialize()) {
+ atomic_set(&ldev->in_use, 0);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int luo_release(struct inode *inodep, struct file *filep)
+{
+ struct luo_device_state *ldev = container_of(filep->private_data,
+ struct luo_device_state,
+ miscdev);
+ atomic_set(&ldev->in_use, 0);
+
+ return 0;
+}
+
+union ucmd_buffer {
+ struct liveupdate_ioctl_create_session create;
+ struct liveupdate_ioctl_retrieve_session retrieve;
+};
+
+struct luo_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+
+static const struct luo_ioctl_op luo_ioctl_ops[] = {
+ IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session,
+ struct liveupdate_ioctl_create_session, name),
+ IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session,
+ struct liveupdate_ioctl_retrieve_session, name),
+};
+
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ const struct luo_ioctl_op *op;
+ struct luo_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int err;
+
+ nr = _IOC_NR(cmd);
+ if (nr < LIVEUPDATE_CMD_BASE ||
+ (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) {
+ return -EINVAL;
+ }
+
+ ucmd.ubuffer = (void __user *)arg;
+ err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (err)
+ return err;
+
+ op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (err)
+ return err;
+
+ return op->execute(&ucmd);
+}
+
+static const struct file_operations luo_fops = {
+ .owner = THIS_MODULE,
+ .open = luo_open,
+ .release = luo_release,
+ .unlocked_ioctl = luo_ioctl,
+};
+
+static struct luo_device_state luo_dev = {
+ .miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "liveupdate",
+ .fops = &luo_fops,
+ },
+ .in_use = ATOMIC_INIT(0),
+};
+
+static int __init liveupdate_ioctl_init(void)
+{
+ if (!liveupdate_enabled())
+ return 0;
+
+ return misc_register(&luo_dev.miscdev);
+}
+late_initcall(liveupdate_ioctl_init);
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
new file mode 100644
index 000000000000..ddff87917b21
--- /dev/null
+++ b/kernel/liveupdate/luo_file.c
@@ -0,0 +1,889 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO File Descriptors
+ *
+ * LUO provides the infrastructure to preserve specific, stateful file
+ * descriptors across a kexec-based live update. The primary goal is to allow
+ * workloads, such as virtual machines using vfio, memfd, or iommufd, to
+ * retain access to their essential resources without interruption.
+ *
+ * The framework is built around a callback-based handler model and a well-
+ * defined lifecycle for each preserved file.
+ *
+ * Handler Registration:
+ * Kernel modules responsible for a specific file type (e.g., memfd, vfio)
+ * register a &struct liveupdate_file_handler. This handler provides a set of
+ * callbacks that LUO invokes at different stages of the update process, most
+ * notably:
+ *
+ * - can_preserve(): A lightweight check to determine if the handler is
+ * compatible with a given 'struct file'.
+ * - preserve(): The heavyweight operation that saves the file's state and
+ * returns an opaque u64 handle. This is typically performed while the
+ * workload is still active to minimize the downtime during the
+ * actual reboot transition.
+ * - unpreserve(): Cleans up any resources allocated by .preserve(), called
+ *   if the preservation process is aborted before the reboot (i.e., the
+ *   session is closed).
+ * - freeze(): A final pre-reboot opportunity to prepare the state for kexec.
+ *   We are already in the reboot() syscall, so userspace can no longer
+ *   mutate the file.
+ * - unfreeze(): Undoes the actions of .freeze(), called if the live update
+ * is aborted after the freeze phase.
+ * - retrieve(): Reconstructs the file in the new kernel from the preserved
+ * handle.
+ * - finish(): Performs a final check and cleanup in the new kernel. After a
+ *   successful finish call, LUO gives up ownership of the file.
+ *
+ * File Preservation Lifecycle happy path:
+ *
+ * 1. Preserve (Normal Operation): A userspace agent preserves files one by one
+ * via an ioctl. For each file, luo_preserve_file() finds a compatible
+ * handler, calls its .preserve() operation, and creates an internal &struct
+ * luo_file to track the live state.
+ *
+ * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called.
+ * It iterates through all preserved files, calls their respective .freeze()
+ * operation, and serializes their final metadata (compatible string, token,
+ * and data handle) into a contiguous memory block for KHO.
+ *
+ * 3. Deserialize: After kexec, luo_file_deserialize() runs when the session is
+ * deserialized (which is when /dev/liveupdate is first opened). It reads the
+ * serialized data from the KHO memory region and reconstructs the in-memory
+ * list of &struct luo_file instances for the new kernel, linking them to
+ * their corresponding handlers.
+ *
+ * 4. Retrieve (New Kernel - Userspace Ready): The userspace agent can now
+ * restore file descriptors by providing a token. luo_retrieve_file()
+ * searches for the matching token, calls the handler's .retrieve() op to
+ * re-create the 'struct file', and returns a new FD. Files can be
+ * retrieved in ANY order.
+ *
+ * 5. Finish (New Kernel - Cleanup): Once a session's retrieval is complete,
+ * luo_file_finish() is called. It iterates through all files, invokes their
+ * .finish() operations for final cleanup, and releases all associated kernel
+ * resources.
+ *
+ * File Preservation Lifecycle unhappy paths:
+ *
+ * 1. Abort Before Reboot: If the userspace agent aborts the live update
+ * process before calling reboot (e.g., by closing the session file
+ * descriptor), the session's release handler calls
+ * luo_file_unpreserve_files(). This invokes the .unpreserve() callback on
+ * all preserved files, ensuring all allocated resources are cleaned up and
+ * returning the system to a clean state.
+ *
+ * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze()
+ * op fails, the .unfreeze() op is invoked on all previously *successful*
+ * freezes to roll back their state. The reboot() syscall then returns an
+ * error to userspace, canceling the live update.
+ *
+ * 3. Finish Failure: In the new kernel, if a handler's .finish() op fails,
+ * the luo_file_finish() operation is aborted. LUO retains ownership of
+ * all files within that session, including those that were not yet
+ * processed. The userspace agent can attempt to call the finish operation
+ * again later. If the issue cannot be resolved, these resources will be held
+ * by LUO until the next live update cycle, at which point they will be
+ * discarded.
+ */
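+/*
+ * A registration sketch for a hypothetical "example" handler (the ops struct
+ * type name is assumed here; only the callbacks that
+ * liveupdate_register_file_handler() requires are filled in, while
+ * .freeze()/.unfreeze()/.can_finish() remain optional):
+ *
+ *	static const struct liveupdate_file_ops example_ops = {
+ *		.owner		= THIS_MODULE,
+ *		.can_preserve	= example_can_preserve,
+ *		.preserve	= example_preserve,
+ *		.unpreserve	= example_unpreserve,
+ *		.retrieve	= example_retrieve,
+ *		.finish		= example_finish,
+ *	};
+ *
+ *	static struct liveupdate_file_handler example_fh = {
+ *		.compatible	= "example-v1",
+ *		.ops		= &example_ops,
+ *	};
+ *
+ *	err = liveupdate_register_file_handler(&example_fh);
+ */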
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cleanup.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/liveupdate.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "luo_internal.h"
+
+static LIST_HEAD(luo_file_handler_list);
+
+/* 2 4K pages, give space for 128 files per file_set */
+#define LUO_FILE_PGCNT 2ul
+#define LUO_FILE_MAX \
+ ((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser))
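+/*
+ * With 4K pages and a 64-byte struct luo_file_ser (assumed), that works out
+ * to 2 * 4096 / 64 = 128 entries.
+ */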
+
+/**
+ * struct luo_file - Represents a single preserved file instance.
+ * @fh: Pointer to the &struct liveupdate_file_handler that manages
+ * this type of file.
+ * @file: Pointer to the kernel's &struct file that is being preserved.
+ * This is NULL in the new kernel until the file is successfully
+ * retrieved.
+ * @serialized_data: The opaque u64 handle to the serialized state of the file.
+ * This handle is passed back to the handler's .freeze(),
+ * .retrieve(), and .finish() callbacks, allowing it to track
+ * and update its serialized state across phases.
+ * @private_data: Pointer to the private data for the file used to hold runtime
+ * state that is not preserved. Set by the handler's .preserve()
+ * callback, and must be freed in the handler's .unpreserve()
+ * callback.
+ * @retrieved: A flag indicating whether a consumer in the new kernel has
+ *              successfully called retrieve() on this file. This prevents
+ *              redundant retrieval attempts.
+ * @mutex: A mutex that protects the fields of this specific instance
+ * (e.g., @retrieved, @file), ensuring that operations like
+ * retrieving or finishing a file are atomic.
+ * @list: The list_head linking this instance into its parent
+ * file_set's list of preserved files.
+ * @token: The user-provided unique token used to identify this file.
+ *
+ * This structure is the core in-kernel representation of a single file being
+ * managed through a live update. An instance is created by luo_preserve_file()
+ * to link a 'struct file' to its corresponding handler, a user-provided token,
+ * and the serialized state handle returned by the handler's .preserve()
+ * operation.
+ *
+ * These instances are tracked in a per-file_set list. The @serialized_data
+ * field, which holds a handle to the file's serialized state, may be updated
+ * during the .freeze() callback before being serialized for the next kernel.
+ * After reboot, these structures are recreated by luo_file_deserialize() and
+ * are finally cleaned up by luo_file_finish().
+ */
+struct luo_file {
+ struct liveupdate_file_handler *fh;
+ struct file *file;
+ u64 serialized_data;
+ void *private_data;
+ bool retrieved;
+ struct mutex mutex;
+ struct list_head list;
+ u64 token;
+};
+
+static int luo_alloc_files_mem(struct luo_file_set *file_set)
+{
+ size_t size;
+ void *mem;
+
+ if (file_set->files)
+ return 0;
+
+ WARN_ON_ONCE(file_set->count);
+
+ size = LUO_FILE_PGCNT << PAGE_SHIFT;
+ mem = kho_alloc_preserve(size);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+
+ file_set->files = mem;
+
+ return 0;
+}
+
+static void luo_free_files_mem(struct luo_file_set *file_set)
+{
+ /* If file_set has files, no need to free preservation memory */
+ if (file_set->count)
+ return;
+
+ if (!file_set->files)
+ return;
+
+ kho_unpreserve_free(file_set->files);
+ file_set->files = NULL;
+}
+
+static bool luo_token_is_used(struct luo_file_set *file_set, u64 token)
+{
+ struct luo_file *iter;
+
+ list_for_each_entry(iter, &file_set->files_list, list) {
+ if (iter->token == token)
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * luo_preserve_file - Initiate the preservation of a file descriptor.
+ * @file_set: The file_set to which the preserved file will be added.
+ * @token: A unique, user-provided identifier for the file.
+ * @fd: The file descriptor to be preserved.
+ *
+ * This function orchestrates the first phase of preserving a file. Upon entry,
+ * it takes a reference to the 'struct file' via fget(), effectively making LUO
+ * a co-owner of the file. This reference is held until the file is either
+ * unpreserved or successfully finished in the next kernel, preventing the file
+ * from being prematurely destroyed.
+ *
+ * It performs the following steps:
+ *
+ * 1. Validates that the @token is not already in use within the file_set.
+ * 2. Ensures the file_set's memory for files serialization is allocated
+ * (allocates if needed).
+ * 3. Iterates through registered handlers, calling can_preserve() to find one
+ * compatible with the given @fd.
+ * 4. Calls the handler's .preserve() operation, which saves the file's state
+ * and returns an opaque private data handle.
+ * 5. Adds the new instance to the file_set's internal list.
+ *
+ * On success, LUO takes a reference to the 'struct file' and considers it
+ * under its management until it is unpreserved or finished.
+ *
+ * In case of any failure, all intermediate allocations (file reference, memory
+ * for the 'luo_file' struct, etc.) are cleaned up before returning an error.
+ *
+ * Context: Can be called from an ioctl handler during normal system operation.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -EEXIST if the token is already used.
+ * -EBADF if the file descriptor is invalid.
+ * -ENOSPC if the file_set is full.
+ * -ENOENT if no compatible handler is found.
+ * -ENOMEM on memory allocation failure.
+ *         Other errors may be returned by .preserve().
+ */
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
+{
+ struct liveupdate_file_op_args args = {0};
+ struct liveupdate_file_handler *fh;
+ struct luo_file *luo_file;
+ struct file *file;
+ int err;
+
+ if (luo_token_is_used(file_set, token))
+ return -EEXIST;
+
+ if (file_set->count == LUO_FILE_MAX)
+ return -ENOSPC;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ err = luo_alloc_files_mem(file_set);
+ if (err)
+ goto err_fput;
+
+ err = -ENOENT;
+ luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ if (fh->ops->can_preserve(fh, file)) {
+ err = 0;
+ break;
+ }
+ }
+
+ /* err is still -ENOENT if no handler was found */
+ if (err)
+ goto err_free_files_mem;
+
+ luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+ if (!luo_file) {
+ err = -ENOMEM;
+ goto err_free_files_mem;
+ }
+
+ luo_file->file = file;
+ luo_file->fh = fh;
+ luo_file->token = token;
+ luo_file->retrieved = false;
+ mutex_init(&luo_file->mutex);
+
+ args.handler = fh;
+ args.file = file;
+ err = fh->ops->preserve(&args);
+ if (err)
+ goto err_kfree;
+
+ luo_file->serialized_data = args.serialized_data;
+ luo_file->private_data = args.private_data;
+ list_add_tail(&luo_file->list, &file_set->files_list);
+ file_set->count++;
+
+ return 0;
+
+err_kfree:
+ kfree(luo_file);
+err_free_files_mem:
+ luo_free_files_mem(file_set);
+err_fput:
+ fput(file);
+
+ return err;
+}
+
+/**
+ * luo_file_unpreserve_files - Unpreserves all files from a file_set.
+ * @file_set: The files to be cleaned up.
+ *
+ * This function serves as the primary cleanup path for a file_set. It is
+ * invoked when the userspace agent closes the file_set's file descriptor.
+ *
+ * For each file, it performs the following cleanup actions:
+ * 1. Calls the handler's .unpreserve() callback to allow the handler to
+ * release any resources it allocated.
+ * 2. Removes the file from the file_set's internal tracking list.
+ * 3. Releases the reference to the 'struct file' that was taken by
+ * luo_preserve_file() via fput(), returning ownership.
+ * 4. Frees the memory associated with the internal 'struct luo_file'.
+ *
+ * After all individual files are unpreserved, it frees the contiguous memory
+ * block that was allocated to hold their serialization data.
+ */
+void luo_file_unpreserve_files(struct luo_file_set *file_set)
+{
+ struct luo_file *luo_file;
+
+ while (!list_empty(&file_set->files_list)) {
+ struct liveupdate_file_op_args args = {0};
+
+ luo_file = list_last_entry(&file_set->files_list,
+ struct luo_file, list);
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+ luo_file->fh->ops->unpreserve(&args);
+
+ list_del(&luo_file->list);
+ file_set->count--;
+
+ fput(luo_file->file);
+ mutex_destroy(&luo_file->mutex);
+ kfree(luo_file);
+ }
+
+ luo_free_files_mem(file_set);
+}
+
+static int luo_file_freeze_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ int err = 0;
+
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->freeze) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+
+ err = luo_file->fh->ops->freeze(&args);
+ if (!err)
+ luo_file->serialized_data = args.serialized_data;
+ }
+
+ return err;
+}
+
+static void luo_file_unfreeze_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->unfreeze) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+
+ luo_file->fh->ops->unfreeze(&args);
+ }
+
+ luo_file->serialized_data = 0;
+}
+
+static void __luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file *failed_entry)
+{
+ struct list_head *files_list = &file_set->files_list;
+ struct luo_file *luo_file;
+
+ list_for_each_entry(luo_file, files_list, list) {
+ if (luo_file == failed_entry)
+ break;
+
+ luo_file_unfreeze_one(file_set, luo_file);
+ }
+
+ memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT);
+}
+
+/**
+ * luo_file_freeze - Freezes all preserved files and serializes their metadata.
+ * @file_set: The file_set whose files are to be frozen.
+ * @file_set_ser: Where to put the serialized file_set.
+ *
+ * This function is called from the reboot() syscall path, just before the
+ * kernel transitions to the new image via kexec. Its purpose is to perform the
+ * final preparation and serialization of all preserved files in the file_set.
+ *
+ * It iterates through each preserved file in FIFO order (the order of
+ * preservation) and performs two main actions:
+ *
+ * 1. Freezes the File: It calls the handler's .freeze() callback for each
+ * file. This gives the handler a final opportunity to quiesce the device or
+ * prepare its state for the upcoming reboot. The handler may update its
+ * private data handle during this step.
+ *
+ * 2. Serializes Metadata: After a successful freeze, it copies the final file
+ * metadata—the handler's compatible string, the user token, and the final
+ * private data handle—into the pre-allocated contiguous memory buffer
+ * (file_set->files) that will be handed over to the next kernel via KHO.
+ *
+ * Error Handling (Rollback):
+ * This function is atomic. If any handler's .freeze() operation fails, the
+ * entire live update is aborted. The __luo_file_unfreeze() helper is
+ * immediately called to invoke the .unfreeze() op on all files that were
+ * successfully frozen before the point of failure, rolling them back to a
+ * running state. The function then returns an error, causing the reboot()
+ * syscall to fail.
+ *
+ * Context: Called only from the liveupdate_reboot() path.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_freeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ struct luo_file_ser *file_ser = file_set->files;
+ struct luo_file *luo_file;
+ int err;
+ int i;
+
+ if (!file_set->count)
+ return 0;
+
+ if (WARN_ON(!file_ser))
+ return -EINVAL;
+
+ i = 0;
+ list_for_each_entry(luo_file, &file_set->files_list, list) {
+ err = luo_file_freeze_one(file_set, luo_file);
+ if (err < 0) {
+ pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n",
+ luo_file->token, luo_file->fh->compatible,
+ ERR_PTR(err));
+ goto err_unfreeze;
+ }
+
+ strscpy(file_ser[i].compatible, luo_file->fh->compatible,
+ sizeof(file_ser[i].compatible));
+ file_ser[i].data = luo_file->serialized_data;
+ file_ser[i].token = luo_file->token;
+ i++;
+ }
+
+ file_set_ser->count = file_set->count;
+ if (file_set->files)
+ file_set_ser->files = virt_to_phys(file_set->files);
+
+ return 0;
+
+err_unfreeze:
+ __luo_file_unfreeze(file_set, luo_file);
+
+ return err;
+}
+
+/**
+ * luo_file_unfreeze - Unfreezes all files in a file_set and clears serialization
+ * @file_set: The file_set whose files are to be unfrozen.
+ * @file_set_ser: Serialized file_set.
+ *
+ * This function rolls back the state of all files in a file_set after the
+ * freeze phase has begun but must be aborted. It is the counterpart to
+ * luo_file_freeze().
+ *
+ * It invokes the __luo_file_unfreeze() helper with a NULL argument, which
+ * signals the helper to iterate through all files in the file_set and call
+ * their respective .unfreeze() handler callbacks.
+ *
+ * Context: This is called when the live update is aborted during
+ * the reboot() syscall, after luo_file_freeze() has been called.
+ */
+void luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ if (!file_set->count)
+ return;
+
+ __luo_file_unfreeze(file_set, NULL);
+ memset(file_set_ser, 0, sizeof(*file_set_ser));
+}
+
+/**
+ * luo_retrieve_file - Restores a preserved file from a file_set by its token.
+ * @file_set: The file_set from which to retrieve the file.
+ * @token: The unique token identifying the file to be restored.
+ * @filep: Output parameter; on success, this is populated with a pointer
+ * to the newly retrieved 'struct file'.
+ *
+ * This function is the primary mechanism for recreating a file in the new
+ * kernel after a live update. It searches the file_set's list of deserialized
+ * files for an entry matching the provided @token.
+ *
+ * The operation is idempotent: if a file has already been successfully
+ * retrieved, this function will simply return a pointer to the existing
+ * 'struct file' and report success without re-executing the retrieve
+ * operation. This is handled by checking the 'retrieved' flag under a lock.
+ *
+ * File retrieval can happen in any order; it is not bound by the order of
+ * preservation.
+ *
+ * Context: Can be called from an ioctl or other in-kernel code in the new
+ * kernel.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -ENOENT if no file with the matching token is found.
+ * Any error code returned by the handler's .retrieve() op.
+ */
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+ struct file **filep)
+{
+ struct liveupdate_file_op_args args = {0};
+	struct luo_file *luo_file = NULL, *iter;
+ int err;
+
+ if (list_empty(&file_set->files_list))
+ return -ENOENT;
+
+	list_for_each_entry(iter, &file_set->files_list, list) {
+		if (iter->token == token) {
+			luo_file = iter;
+			break;
+		}
+	}
+
+	/* luo_file is only set when a matching token was found above. */
+	if (!luo_file)
+		return -ENOENT;
+
+ guard(mutex)(&luo_file->mutex);
+ if (luo_file->retrieved) {
+ /*
+ * Someone is asking for this file again, so get a reference
+ * for them.
+ */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ return 0;
+ }
+
+ args.handler = luo_file->fh;
+ args.serialized_data = luo_file->serialized_data;
+ err = luo_file->fh->ops->retrieve(&args);
+ if (!err) {
+ luo_file->file = args.file;
+
+ /* Get reference so we can keep this file in LUO until finish */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ luo_file->retrieved = true;
+ }
+
+ return err;
+}
+
+static int luo_file_can_finish_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ bool can_finish = true;
+
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->can_finish) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.retrieved = luo_file->retrieved;
+ can_finish = luo_file->fh->ops->can_finish(&args);
+ }
+
+ return can_finish ? 0 : -EBUSY;
+}
+
+static void luo_file_finish_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ struct liveupdate_file_op_args args = {0};
+
+ guard(mutex)(&luo_file->mutex);
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.retrieved = luo_file->retrieved;
+
+ luo_file->fh->ops->finish(&args);
+}
+
+/**
+ * luo_file_finish - Completes the lifecycle for all files in a file_set.
+ * @file_set: The file_set to be finalized.
+ *
+ * This function orchestrates the final teardown of a live update file_set in
+ * the new kernel. It should be called after all necessary files have been
+ * retrieved and the userspace agent is ready to release the preserved state.
+ *
+ * The function iterates through all tracked files. For each file, it performs
+ * the following sequence of cleanup actions:
+ *
+ * 1. Calls can_finish() on every file in the file_set; the finish phase
+ *    proceeds only if every handler agrees.
+ * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to
+ * allow for final resource cleanup within the handler.
+ * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This
+ * is the counterpart to the get_file() call in luo_retrieve_file().
+ * 4. Removes the 'struct luo_file' from the file_set's internal list.
+ * 5. Frees the memory for the 'struct luo_file' instance itself.
+ *
+ * After successfully finishing all individual files, it frees the
+ * contiguous memory block that was used to transfer the serialized metadata
+ * from the previous kernel.
+ *
+ * Error Handling (Atomic Failure):
+ * This operation is atomic. If any handler's .can_finish() op fails, the entire
+ * function aborts immediately and returns an error.
+ *
+ * Context: Can be called from an ioctl handler in the new kernel.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_finish(struct luo_file_set *file_set)
+{
+ struct list_head *files_list = &file_set->files_list;
+ struct luo_file *luo_file;
+ int err;
+
+ if (!file_set->count)
+ return 0;
+
+ list_for_each_entry(luo_file, files_list, list) {
+ err = luo_file_can_finish_one(file_set, luo_file);
+ if (err)
+ return err;
+ }
+
+ while (!list_empty(&file_set->files_list)) {
+ luo_file = list_last_entry(&file_set->files_list,
+ struct luo_file, list);
+
+ luo_file_finish_one(file_set, luo_file);
+
+ if (luo_file->file)
+ fput(luo_file->file);
+ list_del(&luo_file->list);
+ file_set->count--;
+ mutex_destroy(&luo_file->mutex);
+ kfree(luo_file);
+ }
+
+ if (file_set->files) {
+ kho_restore_free(file_set->files);
+ file_set->files = NULL;
+ }
+
+ return 0;
+}
+
+/**
+ * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel.
+ * @file_set: The incoming file_set to fill with deserialized data.
+ * @file_set_ser: Serialized KHO file_set data from the previous kernel.
+ *
+ * This function is called during the early boot process of the new kernel. It
+ * takes the raw, contiguous memory block of 'struct luo_file_ser' entries,
+ * provided by the previous kernel, and transforms it back into a live,
+ * in-memory linked list of 'struct luo_file' instances.
+ *
+ * For each serialized entry, it performs the following steps:
+ * 1. Reads the 'compatible' string.
+ * 2. Searches the global list of registered file handlers for one that
+ * matches the compatible string.
+ * 3. Allocates a new 'struct luo_file'.
+ * 4. Populates the new structure with the deserialized data (token, private
+ * data handle) and links it to the found handler. The 'file' pointer is
+ * initialized to NULL, as the file has not been retrieved yet.
+ * 5. Adds the new 'struct luo_file' to the file_set's files_list.
+ *
+ * This prepares the file_set for userspace, which can later call
+ * luo_retrieve_file() to restore the actual file descriptors.
+ *
+ * Context: Called from session deserialization.
+ */
+int luo_file_deserialize(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ struct luo_file_ser *file_ser;
+ u64 i;
+
+ if (!file_set_ser->files) {
+ WARN_ON(file_set_ser->count);
+ return 0;
+ }
+
+ file_set->count = file_set_ser->count;
+ file_set->files = phys_to_virt(file_set_ser->files);
+
+ /*
+ * Note on error handling:
+ *
+ * If deserialization fails (e.g., allocation failure or corrupt data),
+ * we intentionally skip cleanup of files that were already restored.
+ *
+ * A partial failure leaves the preserved state inconsistent.
+ * Implementing a safe "undo" to unwind complex dependencies (sessions,
+ * files, hardware state) is error-prone and provides little value, as
+ * the system is effectively in a broken state.
+ *
+ * We treat these resources as leaked. The expected recovery path is for
+ * userspace to detect the failure and trigger a reboot, which will
+ * reliably reset devices and reclaim memory.
+ */
+ file_ser = file_set->files;
+ for (i = 0; i < file_set->count; i++) {
+ struct liveupdate_file_handler *fh;
+ bool handler_found = false;
+ struct luo_file *luo_file;
+
+ luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ if (!strcmp(fh->compatible, file_ser[i].compatible)) {
+ handler_found = true;
+ break;
+ }
+ }
+
+ if (!handler_found) {
+ pr_warn("No registered handler for compatible '%s'\n",
+ file_ser[i].compatible);
+ return -ENOENT;
+ }
+
+ luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+ if (!luo_file)
+ return -ENOMEM;
+
+ luo_file->fh = fh;
+ luo_file->file = NULL;
+ luo_file->serialized_data = file_ser[i].data;
+ luo_file->token = file_ser[i].token;
+ luo_file->retrieved = false;
+ mutex_init(&luo_file->mutex);
+ list_add_tail(&luo_file->list, &file_set->files_list);
+ }
+
+ return 0;
+}
+
+void luo_file_set_init(struct luo_file_set *file_set)
+{
+ INIT_LIST_HEAD(&file_set->files_list);
+}
+
+void luo_file_set_destroy(struct luo_file_set *file_set)
+{
+ WARN_ON(file_set->count);
+ WARN_ON(!list_empty(&file_set->files_list));
+}
+
+/**
+ * liveupdate_register_file_handler - Register a file handler with LUO.
+ * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler.
+ * The caller must initialize this structure, including a unique
+ *      'compatible' string and valid 'ops' callbacks. This function adds the
+ * handler to the global list of supported file handlers.
+ *
+ * Context: Typically called during module initialization for file types that
+ * support live update preservation.
+ *
+ * Return: 0 on success. Negative errno on failure.
+ */
+int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
+{
+ struct liveupdate_file_handler *fh_iter;
+ int err;
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ /* Sanity check that all required callbacks are set */
+ if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve ||
+ !fh->ops->finish || !fh->ops->can_preserve) {
+ return -EINVAL;
+ }
+
+ /*
+ * Ensure the system is quiescent (no active sessions).
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ */
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ /* Check for duplicate compatible strings */
+ luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) {
+ if (!strcmp(fh_iter->compatible, fh->compatible)) {
+ pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
+ fh->compatible);
+ err = -EEXIST;
+ goto err_resume;
+ }
+ }
+
+ /* Pin the module implementing the handler */
+ if (!try_module_get(fh->ops->owner)) {
+ err = -EAGAIN;
+ goto err_resume;
+ }
+
+ INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
+ list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
+ luo_session_resume();
+
+ return 0;
+
+err_resume:
+ luo_session_resume();
+ return err;
+}
+
+/**
+ * liveupdate_unregister_file_handler - Unregister a liveupdate file handler
+ * @fh: The file handler to unregister
+ *
+ * Unregisters the file handler from the liveupdate core. This function
+ * reverses the operations of liveupdate_register_file_handler().
+ *
+ * It ensures safe removal by checking that no live update session is
+ * currently in progress.
+ *
+ * If the unregistration fails, the handler remains registered.
+ *
+ * Return: 0 on success. -EOPNOTSUPP when live update is not enabled. -EBUSY
+ * when a live update is in progress and cannot be quiesced.
+ */
+int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+{
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ list_del(&ACCESS_PRIVATE(fh, list));
+ module_put(fh->ops->owner);
+ luo_session_resume();
+
+ return 0;
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
new file mode 100644
index 000000000000..c8973b543d1d
--- /dev/null
+++ b/kernel/liveupdate/luo_internal.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _LINUX_LUO_INTERNAL_H
+#define _LINUX_LUO_INTERNAL_H
+
+#include <linux/liveupdate.h>
+#include <linux/uaccess.h>
+
+struct luo_ucmd {
+ void __user *ubuffer;
+ u32 user_size;
+ void *cmd;
+};
+
+static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
+ size_t kernel_cmd_size)
+{
+ /*
+ * Copy the minimum of what the user provided and what we actually
+ * have.
+ */
+ if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
+ min_t(size_t, ucmd->user_size, kernel_cmd_size))) {
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Handles a deserialization failure: devices and memory are in an
+ * unpredictable state.
+ *
+ * Continuing the boot process after a failure is dangerous because it could
+ * lead to leaks of private data.
+ */
+#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
+
+/* Mimics list_for_each_entry() but for private list head entries */
+#define luo_list_for_each_private(pos, head, member) \
+ for (struct list_head *__iter = (head)->next; \
+ __iter != (head) && \
+ ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \
+ __iter = __iter->next)
+
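+/*
+ * Usage mirrors list_for_each_entry(); for example, the handler scan in
+ * luo_file.c:
+ *
+ *	struct liveupdate_file_handler *fh;
+ *
+ *	luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ *		if (fh->ops->can_preserve(fh, file))
+ *			break;
+ *	}
+ */
+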
+/**
+ * struct luo_file_set - A set of files that belong to the same sessions.
+ * @files_list: A list of files associated with this session, ordered by
+ *              preservation time.
+ * @files: The physically contiguous memory block that holds the serialized
+ * state of files.
+ * @count: A counter tracking the number of files currently stored in the
+ * @files_list for this session.
+ */
+struct luo_file_set {
+ struct list_head files_list;
+ struct luo_file_ser *files;
+ long count;
+};
+
+/**
+ * struct luo_session - Represents an active or incoming Live Update session.
+ * @name: A unique name for this session, used for identification and
+ * retrieval.
+ * @ser: Pointer to the serialized data for this session.
+ * @list: A list_head member used to link this session into a global list
+ * of either outgoing (to be preserved) or incoming (restored from
+ * previous kernel) sessions.
+ * @retrieved: A boolean flag indicating whether this session has been
+ * retrieved by a consumer in the new kernel.
+ * @file_set: A set of files that belong to this session.
+ * @mutex: protects fields in the luo_session.
+ */
+struct luo_session {
+ char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+ struct luo_session_ser *ser;
+ struct list_head list;
+ bool retrieved;
+ struct luo_file_set file_set;
+ struct mutex mutex;
+};
+
+int luo_session_create(const char *name, struct file **filep);
+int luo_session_retrieve(const char *name, struct file **filep);
+int __init luo_session_setup_outgoing(void *fdt);
+int __init luo_session_setup_incoming(void *fdt);
+int luo_session_serialize(void);
+int luo_session_deserialize(void);
+bool luo_session_quiesce(void);
+void luo_session_resume(void);
+
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd);
+void luo_file_unpreserve_files(struct luo_file_set *file_set);
+int luo_file_freeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+void luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+ struct file **filep);
+int luo_file_finish(struct luo_file_set *file_set);
+int luo_file_deserialize(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+void luo_file_set_init(struct luo_file_set *file_set);
+void luo_file_set_destroy(struct luo_file_set *file_set);
+
+#endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
new file mode 100644
index 000000000000..dbdbc3bd7929
--- /dev/null
+++ b/kernel/liveupdate/luo_session.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO Sessions
+ *
+ * LUO Sessions provide the core mechanism for grouping and managing `struct
+ * file *` instances that need to be preserved across a kexec-based live
+ * update. Each session acts as a named container for a set of file objects,
+ * allowing a userspace agent to manage the lifecycle of resources critical to a
+ * workload.
+ *
+ * Core Concepts:
+ *
+ * - Named Containers: Sessions are identified by a unique, user-provided name,
+ * which is used for both creation in the current kernel and retrieval in the
+ * next kernel.
+ *
+ * - Userspace Interface: Session management is driven from userspace via
+ * ioctls on /dev/liveupdate.
+ *
+ * - Serialization: Session metadata is preserved using the KHO framework. When
+ * a live update is triggered via kexec, an array of `struct luo_session_ser`
+ * is populated and placed in a preserved memory region. An FDT node is also
+ * created, containing the count of sessions and the physical address of this
+ * array.
+ *
+ * Session Lifecycle:
+ *
+ * 1. Creation: A userspace agent calls `luo_session_create()` to create a
+ * new, empty session and receives a file descriptor for it.
+ *
+ * 2. Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is
+ * made, `luo_session_serialize()` is called. It iterates through all
+ * active sessions and writes their metadata into a memory area preserved
+ * by KHO.
+ *
+ * 3. Deserialization (in new kernel): After kexec, `luo_session_deserialize()`
+ * runs, reading the serialized data and creating a list of `struct
+ * luo_session` objects representing the preserved sessions.
+ *
+ * 4. Retrieval: A userspace agent in the new kernel can then call
+ * `luo_session_retrieve()` with a session name to get a new file
+ * descriptor and access the preserved state.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/liveupdate.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+/* 16 4K pages give space for up to 744 sessions */
+#define LUO_SESSION_PGCNT 16ul
+#define LUO_SESSION_MAX (((LUO_SESSION_PGCNT << PAGE_SHIFT) - \
+ sizeof(struct luo_session_header_ser)) / \
+ sizeof(struct luo_session_ser))
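+
+/*
+ * For example, assuming an 88-byte struct luo_session_ser and a 64-byte
+ * struct luo_session_header_ser (illustrative sizes, not taken from the
+ * ABI headers), LUO_SESSION_MAX works out to (16 * 4096 - 64) / 88 = 744.
+ */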
+
+/**
+ * struct luo_session_header - Header struct for managing LUO sessions.
+ * @count: The number of sessions currently tracked in the @list.
+ * @list: The head of the linked list of `struct luo_session` instances.
+ * @rwsem: A read-write semaphore providing synchronized access to the
+ * session list and other fields in this structure.
+ * @header_ser: The header of the serialization array.
+ * @ser: The serialized session data (an array of
+ * `struct luo_session_ser`).
+ * @active: Set to true once initialized. If the previous kernel did not
+ * send session data, @active stays false for the incoming header.
+ */
+struct luo_session_header {
+ long count;
+ struct list_head list;
+ struct rw_semaphore rwsem;
+ struct luo_session_header_ser *header_ser;
+ struct luo_session_ser *ser;
+ bool active;
+};
+
+/**
+ * struct luo_session_global - Global container for managing LUO sessions.
+ * @incoming: The sessions passed from the previous kernel.
+ * @outgoing: The sessions that are going to be passed to the next kernel.
+ */
+struct luo_session_global {
+ struct luo_session_header incoming;
+ struct luo_session_header outgoing;
+};
+
+static struct luo_session_global luo_session_global = {
+ .incoming = {
+ .list = LIST_HEAD_INIT(luo_session_global.incoming.list),
+ .rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem),
+ },
+ .outgoing = {
+ .list = LIST_HEAD_INIT(luo_session_global.outgoing.list),
+ .rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem),
+ },
+};
+
+static struct luo_session *luo_session_alloc(const char *name)
+{
+ struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL);
+
+ if (!session)
+ return ERR_PTR(-ENOMEM);
+
+ strscpy(session->name, name, sizeof(session->name));
+ INIT_LIST_HEAD(&session->file_set.files_list);
+ luo_file_set_init(&session->file_set);
+ INIT_LIST_HEAD(&session->list);
+ mutex_init(&session->mutex);
+
+ return session;
+}
+
+static void luo_session_free(struct luo_session *session)
+{
+ luo_file_set_destroy(&session->file_set);
+ mutex_destroy(&session->mutex);
+ kfree(session);
+}
+
+static int luo_session_insert(struct luo_session_header *sh,
+ struct luo_session *session)
+{
+ struct luo_session *it;
+
+ guard(rwsem_write)(&sh->rwsem);
+
+ /*
+	 * For the outgoing list, make sure there is room in the serialization
+	 * array for the new session.
+ */
+ if (sh == &luo_session_global.outgoing) {
+ if (sh->count == LUO_SESSION_MAX)
+ return -ENOMEM;
+ }
+
+ /*
+	 * For a small number of sessions this loop won't hurt performance,
+	 * but if we ever start using a lot of sessions it might become a
+	 * bottleneck during deserialization, as it makes deserialization
+	 * O(n^2).
+ */
+ list_for_each_entry(it, &sh->list, list) {
+ if (!strncmp(it->name, session->name, sizeof(it->name)))
+ return -EEXIST;
+ }
+ list_add_tail(&session->list, &sh->list);
+ sh->count++;
+
+ return 0;
+}
+
+static void luo_session_remove(struct luo_session_header *sh,
+ struct luo_session *session)
+{
+ guard(rwsem_write)(&sh->rwsem);
+ list_del(&session->list);
+ sh->count--;
+}
+
+static int luo_session_finish_one(struct luo_session *session)
+{
+ guard(mutex)(&session->mutex);
+ return luo_file_finish(&session->file_set);
+}
+
+static void luo_session_unfreeze_one(struct luo_session *session,
+ struct luo_session_ser *ser)
+{
+ guard(mutex)(&session->mutex);
+ luo_file_unfreeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_freeze_one(struct luo_session *session,
+ struct luo_session_ser *ser)
+{
+ guard(mutex)(&session->mutex);
+ return luo_file_freeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_release(struct inode *inodep, struct file *filep)
+{
+ struct luo_session *session = filep->private_data;
+ struct luo_session_header *sh;
+
+	/* If retrieved is set, this session is from the incoming list */
+ if (session->retrieved) {
+ int err = luo_session_finish_one(session);
+
+ if (err) {
+ pr_warn("Unable to finish session [%s] on release\n",
+ session->name);
+ return err;
+ }
+ sh = &luo_session_global.incoming;
+ } else {
+ scoped_guard(mutex, &session->mutex)
+ luo_file_unpreserve_files(&session->file_set);
+ sh = &luo_session_global.outgoing;
+ }
+
+ luo_session_remove(sh, session);
+ luo_session_free(session);
+
+ return 0;
+}
+
+static int luo_session_preserve_fd(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_preserve_fd *argp = ucmd->cmd;
+ int err;
+
+ guard(mutex)(&session->mutex);
+ err = luo_preserve_file(&session->file_set, argp->token, argp->fd);
+ if (err)
+ return err;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ pr_warn("The file was successfully preserved, but response to user failed\n");
+
+ return err;
+}
+
+static int luo_session_retrieve_fd(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_retrieve_fd *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ guard(mutex)(&session->mutex);
+ err = luo_retrieve_file(&session->file_set, argp->token, &file);
+ if (err < 0)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_session_finish(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_finish *argp = ucmd->cmd;
+ int err = luo_session_finish_one(session);
+
+ if (err)
+ return err;
+
+ return luo_ucmd_respond(ucmd, sizeof(*argp));
+}
+
+union ucmd_buffer {
+ struct liveupdate_session_finish finish;
+ struct liveupdate_session_preserve_fd preserve;
+ struct liveupdate_session_retrieve_fd retrieve;
+};
+
+struct luo_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
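+
+/*
+ * This table keeps the uAPI forward-extensible: if a new trailing member is
+ * added to, e.g., struct liveupdate_session_preserve_fd, old binaries keep
+ * passing the old, smaller size, and copy_struct_from_user() zero-fills the
+ * missing tail, so the kernel sees the new member as zero.
+ */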
+
+static const struct luo_ioctl_op luo_session_ioctl_ops[] = {
+ IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish,
+ struct liveupdate_session_finish, reserved),
+ IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd,
+ struct liveupdate_session_preserve_fd, token),
+ IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd,
+ struct liveupdate_session_retrieve_fd, token),
+};
+
+static long luo_session_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ struct luo_session *session = filep->private_data;
+ const struct luo_ioctl_op *op;
+ struct luo_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int ret;
+
+ nr = _IOC_NR(cmd);
+ if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >=
+ ARRAY_SIZE(luo_session_ioctl_ops)) {
+ return -EINVAL;
+ }
+
+ ucmd.ubuffer = (void __user *)arg;
+ ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (ret)
+ return ret;
+
+ op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (ret)
+ return ret;
+
+ return op->execute(session, &ucmd);
+}
+
+static const struct file_operations luo_session_fops = {
+ .owner = THIS_MODULE,
+ .release = luo_session_release,
+ .unlocked_ioctl = luo_session_ioctl,
+};
+
+/* Create a "struct file" for session */
+static int luo_session_getfile(struct luo_session *session, struct file **filep)
+{
+ char name_buf[128];
+ struct file *file;
+
+ lockdep_assert_held(&session->mutex);
+ snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name);
+ file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ *filep = file;
+
+ return 0;
+}
+
+int luo_session_create(const char *name, struct file **filep)
+{
+ struct luo_session *session;
+ int err;
+
+ session = luo_session_alloc(name);
+ if (IS_ERR(session))
+ return PTR_ERR(session);
+
+ err = luo_session_insert(&luo_session_global.outgoing, session);
+ if (err)
+ goto err_free;
+
+ scoped_guard(mutex, &session->mutex)
+ err = luo_session_getfile(session, filep);
+ if (err)
+ goto err_remove;
+
+ return 0;
+
+err_remove:
+ luo_session_remove(&luo_session_global.outgoing, session);
+err_free:
+ luo_session_free(session);
+
+ return err;
+}
+
+int luo_session_retrieve(const char *name, struct file **filep)
+{
+ struct luo_session_header *sh = &luo_session_global.incoming;
+ struct luo_session *session = NULL;
+ struct luo_session *it;
+ int err;
+
+ scoped_guard(rwsem_read, &sh->rwsem) {
+ list_for_each_entry(it, &sh->list, list) {
+ if (!strncmp(it->name, name, sizeof(it->name))) {
+ session = it;
+ break;
+ }
+ }
+ }
+
+ if (!session)
+ return -ENOENT;
+
+ guard(mutex)(&session->mutex);
+ if (session->retrieved)
+ return -EINVAL;
+
+ err = luo_session_getfile(session, filep);
+ if (!err)
+ session->retrieved = true;
+
+ return err;
+}
+
+int __init luo_session_setup_outgoing(void *fdt_out)
+{
+ struct luo_session_header_ser *header_ser;
+ u64 header_ser_pa;
+ int err;
+
+ header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT);
+ if (IS_ERR(header_ser))
+ return PTR_ERR(header_ser);
+ header_ser_pa = virt_to_phys(header_ser);
+
+ err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME);
+ err |= fdt_property_string(fdt_out, "compatible",
+ LUO_FDT_SESSION_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa,
+ sizeof(header_ser_pa));
+ err |= fdt_end_node(fdt_out);
+
+ if (err)
+ goto err_unpreserve;
+
+ luo_session_global.outgoing.header_ser = header_ser;
+ luo_session_global.outgoing.ser = (void *)(header_ser + 1);
+ luo_session_global.outgoing.active = true;
+
+ return 0;
+
+err_unpreserve:
+ kho_unpreserve_free(header_ser);
+ return err;
+}
+
+int __init luo_session_setup_incoming(void *fdt_in)
+{
+ struct luo_session_header_ser *header_ser;
+ int err, header_size, offset;
+ u64 header_ser_pa;
+ const void *ptr;
+
+ offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME);
+ if (offset < 0) {
+ pr_err("Unable to get session node: [%s]\n",
+ LUO_FDT_SESSION_NODE_NAME);
+ return -EINVAL;
+ }
+
+ err = fdt_node_check_compatible(fdt_in, offset,
+ LUO_FDT_SESSION_COMPATIBLE);
+ if (err) {
+ pr_err("Session node incompatible [%s]\n",
+ LUO_FDT_SESSION_COMPATIBLE);
+ return -EINVAL;
+ }
+
+ header_size = 0;
+ ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size);
+ if (!ptr || header_size != sizeof(u64)) {
+ pr_err("Unable to get session header '%s' [%d]\n",
+ LUO_FDT_SESSION_HEADER, header_size);
+ return -EINVAL;
+ }
+
+ header_ser_pa = get_unaligned((u64 *)ptr);
+ header_ser = phys_to_virt(header_ser_pa);
+
+ luo_session_global.incoming.header_ser = header_ser;
+ luo_session_global.incoming.ser = (void *)(header_ser + 1);
+ luo_session_global.incoming.active = true;
+
+ return 0;
+}
+
+int luo_session_deserialize(void)
+{
+ struct luo_session_header *sh = &luo_session_global.incoming;
+ static bool is_deserialized;
+ static int err;
+
+	/* If deserialization already ran, always return the same error code */
+ if (is_deserialized)
+ return err;
+
+ is_deserialized = true;
+ if (!sh->active)
+ return 0;
+
+ /*
+ * Note on error handling:
+ *
+ * If deserialization fails (e.g., allocation failure or corrupt data),
+ * we intentionally skip cleanup of sessions that were already restored.
+ *
+ * A partial failure leaves the preserved state inconsistent.
+ * Implementing a safe "undo" to unwind complex dependencies (sessions,
+ * files, hardware state) is error-prone and provides little value, as
+ * the system is effectively in a broken state.
+ *
+ * We treat these resources as leaked. The expected recovery path is for
+ * userspace to detect the failure and trigger a reboot, which will
+ * reliably reset devices and reclaim memory.
+ */
+ for (int i = 0; i < sh->header_ser->count; i++) {
+ struct luo_session *session;
+
+ session = luo_session_alloc(sh->ser[i].name);
+ if (IS_ERR(session)) {
+ pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
+ sh->ser[i].name, session);
+			err = PTR_ERR(session);
+			return err;
+ }
+
+ err = luo_session_insert(sh, session);
+ if (err) {
+ pr_warn("Failed to insert session [%s] %pe\n",
+ session->name, ERR_PTR(err));
+ luo_session_free(session);
+ return err;
+ }
+
+ scoped_guard(mutex, &session->mutex) {
+ luo_file_deserialize(&session->file_set,
+ &sh->ser[i].file_set_ser);
+ }
+ }
+
+ kho_restore_free(sh->header_ser);
+ sh->header_ser = NULL;
+ sh->ser = NULL;
+
+ return 0;
+}
+
+int luo_session_serialize(void)
+{
+ struct luo_session_header *sh = &luo_session_global.outgoing;
+ struct luo_session *session;
+ int i = 0;
+ int err;
+
+ guard(rwsem_write)(&sh->rwsem);
+ list_for_each_entry(session, &sh->list, list) {
+ err = luo_session_freeze_one(session, &sh->ser[i]);
+ if (err)
+ goto err_undo;
+
+ strscpy(sh->ser[i].name, session->name,
+ sizeof(sh->ser[i].name));
+ i++;
+ }
+ sh->header_ser->count = sh->count;
+
+ return 0;
+
+err_undo:
+ list_for_each_entry_continue_reverse(session, &sh->list, list) {
+ i--;
+ luo_session_unfreeze_one(session, &sh->ser[i]);
+ memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name));
+ }
+
+ return err;
+}
+
+/**
+ * luo_session_quiesce - Ensure no active sessions exist and lock session lists.
+ *
+ * Acquires exclusive write locks on both the incoming and outgoing session
+ * lists, then checks that no sessions exist in either list.
+ *
+ * This mechanism is used during file handler un/registration to ensure that no
+ * sessions are currently using the handler, and no new sessions can be created
+ * while un/registration is in progress.
+ *
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ *
+ * Return:
+ * true - System is quiescent (0 sessions) and locked.
+ * false - Active sessions exist. The locks are released before returning.
+ */
+bool luo_session_quiesce(void)
+{
+ down_write(&luo_session_global.incoming.rwsem);
+ down_write(&luo_session_global.outgoing.rwsem);
+
+ if (luo_session_global.incoming.count ||
+ luo_session_global.outgoing.count) {
+ up_write(&luo_session_global.outgoing.rwsem);
+ up_write(&luo_session_global.incoming.rwsem);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * luo_session_resume - Unlock session lists and resume normal activity.
+ *
+ * Releases the exclusive locks acquired by a successful call to
+ * luo_session_quiesce().
+ */
+void luo_session_resume(void)
+{
+ up_write(&luo_session_global.outgoing.rwsem);
+ up_write(&luo_session_global.incoming.rwsem);
+}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 7b3ec2fa6e7c..710ee30b3bea 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -954,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf)
int i;
for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
- if (taint_flags[i].module && test_bit(i, &taints))
+ if (test_bit(i, &taints))
buf[l++] = taint_flags[i].c_true;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index b2f2470af7e5..0d52210a9e2b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -401,7 +401,7 @@ static void panic_trigger_all_cpu_backtrace(void)
*/
static void panic_other_cpus_shutdown(bool crash_kexec)
{
- if (panic_print & SYS_INFO_ALL_CPU_BT)
+ if (panic_print & SYS_INFO_ALL_BT)
panic_trigger_all_cpu_backtrace();
/*
@@ -628,38 +628,40 @@ void panic(const char *fmt, ...)
}
EXPORT_SYMBOL(panic);
-#define TAINT_FLAG(taint, _c_true, _c_false, _module) \
+#define TAINT_FLAG(taint, _c_true, _c_false) \
[ TAINT_##taint ] = { \
.c_true = _c_true, .c_false = _c_false, \
- .module = _module, \
.desc = #taint, \
}
/*
- * TAINT_FORCED_RMMOD could be a per-module flag but the module
- * is being removed anyway.
+ * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT,
+ * please also modify tools/debugging/kernel-chktaint and
+ * Documentation/admin-guide/tainted-kernels.rst, including its
+ * small shell script that prints the TAINT_FLAGS_COUNT bits of
+ * /proc/sys/kernel/tainted.
*/
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
- TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true),
- TAINT_FLAG(FORCED_MODULE, 'F', ' ', true),
- TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false),
- TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false),
- TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false),
- TAINT_FLAG(BAD_PAGE, 'B', ' ', false),
- TAINT_FLAG(USER, 'U', ' ', false),
- TAINT_FLAG(DIE, 'D', ' ', false),
- TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false),
- TAINT_FLAG(WARN, 'W', ' ', false),
- TAINT_FLAG(CRAP, 'C', ' ', true),
- TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false),
- TAINT_FLAG(OOT_MODULE, 'O', ' ', true),
- TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true),
- TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false),
- TAINT_FLAG(LIVEPATCH, 'K', ' ', true),
- TAINT_FLAG(AUX, 'X', ' ', true),
- TAINT_FLAG(RANDSTRUCT, 'T', ' ', true),
- TAINT_FLAG(TEST, 'N', ' ', true),
- TAINT_FLAG(FWCTL, 'J', ' ', true),
+ TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'),
+ TAINT_FLAG(FORCED_MODULE, 'F', ' '),
+ TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '),
+ TAINT_FLAG(FORCED_RMMOD, 'R', ' '),
+ TAINT_FLAG(MACHINE_CHECK, 'M', ' '),
+ TAINT_FLAG(BAD_PAGE, 'B', ' '),
+ TAINT_FLAG(USER, 'U', ' '),
+ TAINT_FLAG(DIE, 'D', ' '),
+ TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '),
+ TAINT_FLAG(WARN, 'W', ' '),
+ TAINT_FLAG(CRAP, 'C', ' '),
+ TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '),
+ TAINT_FLAG(OOT_MODULE, 'O', ' '),
+ TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '),
+ TAINT_FLAG(SOFTLOCKUP, 'L', ' '),
+ TAINT_FLAG(LIVEPATCH, 'K', ' '),
+ TAINT_FLAG(AUX, 'X', ' '),
+ TAINT_FLAG(RANDSTRUCT, 'T', ' '),
+ TAINT_FLAG(TEST, 'N', ' '),
+ TAINT_FLAG(FWCTL, 'J', ' '),
};
#undef TAINT_FLAG
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7394f1b6033b..1d765ad242b8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3705,12 +3705,13 @@ static bool legacy_kthread_create(void)
/**
* printk_kthreads_shutdown - shutdown all threaded printers
+ * @data: syscore context
*
* On system shutdown all threaded printers are stopped. This allows printk
* to transition back to atomic printing, thus providing a robust mechanism
* for the final shutdown/reboot messages to be output.
*/
-static void printk_kthreads_shutdown(void)
+static void printk_kthreads_shutdown(void *data)
{
struct console *con;
@@ -3732,10 +3733,14 @@ static void printk_kthreads_shutdown(void)
console_list_unlock();
}
-static struct syscore_ops printk_syscore_ops = {
+static const struct syscore_ops printk_syscore_ops = {
.shutdown = printk_kthreads_shutdown,
};
+static struct syscore printk_syscore = {
+ .ops = &printk_syscore_ops,
+};
+
/*
* If appropriate, start nbcon kthreads and set @printk_kthreads_running.
* If any kthreads fail to start, those consoles are unregistered.
@@ -3803,7 +3808,7 @@ static void printk_kthreads_check_locked(void)
static int __init printk_set_kthreads_ready(void)
{
- register_syscore_ops(&printk_syscore_ops);
+ register_syscore(&printk_syscore);
console_list_lock();
printk_kthreads_ready = true;
diff --git a/kernel/resource.c b/kernel/resource.c
index b9fa2a4ce089..e4e9bac12e6e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -341,6 +341,8 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
struct resource *res)
{
+ /* Skip children until we find a top level range that matches */
+ bool skip_children = true;
struct resource *p;
if (!res)
@@ -351,7 +353,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for_each_resource(&iomem_resource, p, false) {
+ for_each_resource(&iomem_resource, p, skip_children) {
/* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
@@ -362,6 +364,12 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
if (p->end < start)
continue;
+ /*
+ * We found a top level range that matches what we are looking
+ * for. Time to start checking children too.
+ */
+ skip_children = false;
+
/* Found a match, break */
if (is_type_match(p, flags, desc))
break;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7801cd05d5a..41ba0be16911 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -878,7 +878,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
rq_lock(rq, &rf);
update_rq_clock(rq);
- rq->donor->sched_class->task_tick(rq, rq->curr, 1);
+ rq->donor->sched_class->task_tick(rq, rq->donor, 1);
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
@@ -7360,15 +7360,12 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->prio = prio;
}
out_unlock:
- /* Avoid rq from going away on us: */
- preempt_disable();
+ /* Caller holds task_struct::pi_lock, IRQs are still disabled */
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
rq_repin_lock(rq, &rf);
__task_rq_unlock(rq, p, &rf);
-
- preempt_enable();
}
#endif /* CONFIG_RT_MUTEXES */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 769d7b7990df..da46c3164537 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4034,6 +4034,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (child_cfs_rq_on_list(cfs_rq))
return false;
+ if (cfs_rq->tg_load_avg_contrib)
+ return false;
+
return true;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bbf513b3e76c..d30cca6870f5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1167,7 +1167,7 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned long nr_uninterruptible;
+ unsigned long nr_uninterruptible;
#ifdef CONFIG_SCHED_PROXY_EXEC
struct task_struct __rcu *donor; /* Scheduling context */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index cbf7206b3f9d..c903f1a42891 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -180,8 +180,13 @@ static inline void psi_dequeue(struct task_struct *p, int flags)
* avoid walking all ancestors twice, psi_task_switch() handles
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
+ *
+ * In the SCHED_PROXY_EXECUTION case we may do sleeping
+ * dequeues that are not followed by a task switch, so check
+ * TSK_ONCPU is set to ensure the task switch is imminent.
+ * Otherwise clear the flags as usual.
*/
- if (flags & DEQUEUE_SLEEP)
+ if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU))
return;
/*
diff --git a/kernel/scs.c b/kernel/scs.c
index d7809affe740..772488afd5b9 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -135,7 +135,7 @@ static void scs_check_usage(struct task_struct *tsk)
if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE))
return;
- for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) {
+ for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) {
if (!READ_ONCE_NOCHECK(*p))
break;
used += sizeof(*p);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index cc1afec306b3..f39111830ca3 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -296,6 +296,11 @@ int sched_clock_suspend(void)
return 0;
}
+static int sched_clock_syscore_suspend(void *data)
+{
+ return sched_clock_suspend();
+}
+
void sched_clock_resume(void)
{
struct clock_read_data *rd = &cd.read_data[0];
@@ -305,14 +310,23 @@ void sched_clock_resume(void)
rd->read_sched_clock = cd.actual_read_sched_clock;
}
-static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+static void sched_clock_syscore_resume(void *data)
+{
+ sched_clock_resume();
+}
+
+static const struct syscore_ops sched_clock_syscore_ops = {
+ .suspend = sched_clock_syscore_suspend,
+ .resume = sched_clock_syscore_resume,
+};
+
+static struct syscore sched_clock_syscore = {
+ .ops = &sched_clock_syscore_ops,
};
static int __init sched_clock_syscore_init(void)
{
- register_syscore_ops(&sched_clock_ops);
+ register_syscore(&sched_clock_syscore);
return 0;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4790da895203..3ec3daa4acab 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1994,6 +1994,11 @@ void timekeeping_resume(void)
timerfd_resume();
}
+static void timekeeping_syscore_resume(void *data)
+{
+ timekeeping_resume();
+}
+
int timekeeping_suspend(void)
{
struct timekeeper *tks = &tk_core.shadow_timekeeper;
@@ -2061,15 +2066,24 @@ int timekeeping_suspend(void)
return 0;
}
+static int timekeeping_syscore_suspend(void *data)
+{
+ return timekeeping_suspend();
+}
+
/* sysfs resume/suspend bits for timekeeping */
-static struct syscore_ops timekeeping_syscore_ops = {
- .resume = timekeeping_resume,
- .suspend = timekeeping_suspend,
+static const struct syscore_ops timekeeping_syscore_ops = {
+ .resume = timekeeping_syscore_resume,
+ .suspend = timekeeping_syscore_suspend,
+};
+
+static struct syscore timekeeping_syscore = {
+ .ops = &timekeeping_syscore_ops,
};
static int __init timekeeping_init_ops(void)
{
- register_syscore_ops(&timekeeping_syscore_ops);
+ register_syscore(&timekeeping_syscore);
return 0;
}
device_initcall(timekeeping_init_ops);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 65ac0f04a946..cc48d16be43e 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -163,7 +163,7 @@ enum {
#define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset]))
/*
- * Each fgraph_ops has a reservered unsigned long at the end (top) of the
+ * Each fgraph_ops has a reserved unsigned long at the end (top) of the
* ret_stack to store task specific state.
*/
#define SHADOW_STACK_TASK_VARS(ret_stack) \
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 0b1ee8e585f2..1188eefef07c 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -30,7 +30,7 @@
* fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
* exists. The key is the address of fprobe instance.
* fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
- * instance related to the funciton address. The key is the ftrace IP
+ * instance related to the function address. The key is the ftrace IP
* address.
*
* When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8688c88534de..41c9f5d079be 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1770,7 +1770,7 @@ static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size)
bmeta->total_size = total_size;
bmeta->buffers_offset = (void *)ptr - (void *)bmeta;
- /* Zero out the scatch pad */
+ /* Zero out the scratch pad */
memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta));
return false;
@@ -6089,7 +6089,7 @@ static void rb_clear_buffer_page(struct buffer_page *page)
* id field, and updated via this function.
*
* But for a fixed memory mapped buffer, the id is already assigned for
- * fixed memory ording in the memory layout and can not be used. Instead
+ * fixed memory ordering in the memory layout and can not be used. Instead
* the index of where the page lies in the memory layout is used.
*
* For the normal pages, set the buffer page id with the passed in @id
@@ -7669,7 +7669,7 @@ static __init int test_ringbuffer(void)
/*
* Show buffer is enabled before setting rb_test_started.
* Yes there's a small race window where events could be
- * dropped and the thread wont catch it. But when a ring
+ * dropped and the thread won't catch it. But when a ring
* buffer gets enabled, there will always be some kind of
* delay before other CPUs see it. Thus, we don't care about
* those dropped events. We care about events dropped after
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index cdc3aea12c93..593e3b59e42e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -433,7 +433,7 @@ static int __init ring_buffer_benchmark_init(void)
{
int ret;
- /* make a one meg buffer in overwite mode */
+ /* make a one meg buffer in overwrite mode */
buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
if (!buffer)
return -ENOMEM;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c9fbb316dcbd..e575956ef9b5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,7 +125,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
* If there is an oops (or kernel panic) and the ftrace_dump_on_oops
* is set, then ftrace_dump is called. This will output the contents
* of the ftrace buffers to the console. This is very useful for
- * capturing traces that lead to crashes and outputing it to a
+ * capturing traces that lead to crashes and outputting it to a
* serial console.
*
* It is default off, but you can enable it with either specifying
@@ -134,7 +134,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
* Set 1 if you want to dump buffers of all CPUs
* Set 2 if you want to dump the buffer of the CPU that triggered oops
* Set instance name if you want to dump the specific trace instance
- * Multiple instance dump is also supported, and instances are seperated
+ * Multiple instance dump is also supported, and instances are separated
* by commas.
*/
/* Set to string format zero to disable by default */
@@ -4709,8 +4709,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
* If pause-on-trace is enabled, then stop the trace while
* dumping, unless this is the "snapshot" file
*/
- if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE)))
+ if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) {
+ iter->iter_flags |= TRACE_FILE_PAUSE;
tracing_stop_tr(tr);
+ }
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
@@ -4842,7 +4844,7 @@ static int tracing_release(struct inode *inode, struct file *file)
if (iter->trace && iter->trace->close)
iter->trace->close(iter);
- if (!iter->snapshot && tr->stop_count)
+ if (iter->iter_flags & TRACE_FILE_PAUSE)
/* reenable tracing if it was previously enabled */
tracing_start_tr(tr);
@@ -5276,7 +5278,7 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled)
return -EINVAL;
/*
* An instance must always have it set.
- * by default, that's the global_trace instane.
+ * by default, that's the global_trace instance.
*/
if (printk_trace == tr)
update_printk_trace(&global_trace);
@@ -7554,7 +7556,7 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
migrate_disable();
/*
- * Now preemption is being enabed and another task can come in
+ * Now preemption is being enabled and another task can come in
* and use the same buffer and corrupt our data.
*/
preempt_enable_notrace();
@@ -11329,7 +11331,7 @@ __init static void do_allocate_snapshot(const char *name)
/*
* When allocate_snapshot is set, the next call to
* allocate_trace_buffers() (called by trace_array_get_by_name())
- * will allocate the snapshot buffer. That will alse clear
+ * will allocate the snapshot buffer. That will also clear
* this flag.
*/
allocate_snapshot = true;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9b07ad9eb284..b16a5a158040 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -360,7 +360,7 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca
/* Anything else, this isn't a function */
break;
}
- /* A function could be wrapped in parethesis, try the next one */
+ /* A function could be wrapped in parenthesis, try the next one */
s = r + 1;
} while (s < e);
@@ -567,7 +567,7 @@ static void test_event_printk(struct trace_event_call *call)
* If start_arg is zero, then this is the start of the
* first argument. The processing of the argument happens
* when the end of the argument is found, as it needs to
- * handle paranthesis and such.
+ * handle parenthesis and such.
*/
if (!start_arg) {
start_arg = i;
@@ -785,7 +785,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
*
* When soft_disable is not set but the soft_mode is,
* we do nothing. Do not disable the tracepoint, otherwise
- * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
+ * "soft enable"s (clearing the SOFT_DISABLED bit) won't work.
*/
if (soft_disable) {
if (atomic_dec_return(&file->sm_ref) > 0)
@@ -1394,7 +1394,7 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
if (!tr)
return -ENOENT;
- /* Modules events can be appened with :mod:<module> */
+ /* Modules events can be appended with :mod:<module> */
mod = strstr(buf, ":mod:");
if (mod) {
*mod = '\0';
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 54226b48b2d1..385af8405392 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -142,7 +142,7 @@ static bool is_not(const char *str)
}
/**
- * struct prog_entry - a singe entry in the filter program
+ * struct prog_entry - a single entry in the filter program
* @target: Index to jump to on a branch (actually one minus the index)
* @when_to_branch: The value of the result of the predicate to do a branch
* @pred: The predicate to execute.
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 289bdea98776..5e6e70540eef 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5283,7 +5283,7 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
* on the stack, so when the histogram trigger is initialized
* a percpu array of 4 hist_pad structures is allocated.
* This will cover every context from normal, softirq, irq and NMI
- * in the very unlikely event that a tigger happens at each of
+ * in the very unlikely event that a trigger happens at each of
* these contexts and interrupts a currently active trigger.
*/
struct hist_pad {
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 2f19bbe73d27..4554c458b78c 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -375,7 +375,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
n_u64++;
} else {
trace_seq_printf(s, print_fmt, se->fields[i]->name,
- STR_VAR_LEN_MAX,
(char *)&entry->fields[n_u64].as_u64,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 96aad82b1628..06b75bcfc7b8 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -732,7 +732,7 @@ static void unregister_trigger(char *glob,
* param - text following cmd and ':' and stripped of filter
* filter - the optional filter text following (and including) 'if'
*
- * To illustrate the use of these componenents, here are some concrete
+ * To illustrate the use of these components, here are some concrete
* examples. For the following triggers:
*
* echo 'traceon:5 if pid == 0' > trigger
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index b15854c75d4f..dca6e50b3b21 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1041,7 +1041,7 @@ static int user_field_array_size(const char *type)
static int user_field_size(const char *type)
{
- /* long is not allowed from a user, since it's ambigious in size */
+ /* long is not allowed from a user, since it's ambiguous in size */
if (strcmp(type, "s64") == 0)
return sizeof(s64);
if (strcmp(type, "u64") == 0)
@@ -1079,7 +1079,7 @@ static int user_field_size(const char *type)
if (str_has_prefix(type, "__rel_loc "))
return sizeof(u32);
- /* Uknown basic type, error */
+ /* Unknown basic type, error */
return -EINVAL;
}
@@ -2465,7 +2465,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
/*
* Prevent users from using the same address and bit multiple times
* within the same mm address space. This can cause unexpected behavior
- * for user processes that is far easier to debug if this is explictly
+ * for user processes that is far easier to debug if this is explicitly
* an error upon registering.
*/
if (current_user_event_enabler_exists((unsigned long)reg.enable_addr,
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a9962d4497e8..827104d00bc0 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -329,7 +329,7 @@ static struct osnoise_data {
u64 print_stack; /* print IRQ stack if total > */
int timerlat_tracer; /* timerlat tracer */
#endif
- bool tainted; /* infor users and developers about a problem */
+	bool tainted;			/* inform users and developers about a problem */
} osnoise_data = {
.sample_period = DEFAULT_SAMPLE_PERIOD,
.sample_runtime = DEFAULT_SAMPLE_RUNTIME,
@@ -738,7 +738,7 @@ cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration)
/*
* get_int_safe_duration - Get the duration of a window
*
- * The irq, softirq and thread varaibles need to have its duration without
+ * The irq, softirq and thread variables need to have its duration without
* the interference from higher priority interrupts. Instead of keeping a
* variable to discount the interrupt interference from these variables, the
* starting time of these variables are pushed forward with the interrupt's
@@ -1460,7 +1460,7 @@ static int run_osnoise(void)
stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC;
/*
- * Start timestemp
+ * Start timestamp
*/
start = time_get();
@@ -1881,7 +1881,7 @@ static int timerlat_main(void *data)
tlat->kthread = current;
osn_var->pid = current->pid;
/*
- * Anotate the arrival time.
+ * Annotate the arrival time.
*/
tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
@@ -1978,7 +1978,7 @@ static void stop_per_cpu_kthreads(void)
}
/*
- * start_kthread - Start a workload tread
+ * start_kthread - Start a workload thread
*/
static int start_kthread(unsigned int cpu)
{
@@ -2705,7 +2705,7 @@ static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir)
* Why not using tracing instance per_cpu/ dir?
*
* Because osnoise/timerlat have a single workload, having
- * multiple files like these are wast of memory.
+ * multiple files like these are waste of memory.
*/
per_cpu = tracefs_create_dir("per_cpu", top_dir);
if (!per_cpu)
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index bb67f6a2136c..2f571083ce9e 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -517,7 +517,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
}
}
-/* Return 1 if the field separater is arrow operator ('->') */
+/* Return 1 if the field separator is arrow operator ('->') */
static int split_next_field(char *varname, char **next_field,
struct traceprobe_parse_context *ctx)
{
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index c158d65a8a88..32684ef4fb9d 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -15,7 +15,7 @@
*
* A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
- * the buffer but it wont update the pointers). This allows users to
+ * the buffer but it won't update the pointers). This allows users to
* try to write something into the trace_seq buffer and if it fails
* they can flush it and try again.
*
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index e066d31d08f8..fe9bf8db1922 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -31,6 +31,13 @@ u32 *vmcoreinfo_note;
/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
static unsigned char *vmcoreinfo_data_safecopy;
+struct hwerr_info {
+ atomic_t count;
+ time64_t timestamp;
+};
+
+static struct hwerr_info hwerr_data[HWERR_RECOV_MAX];
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
@@ -118,6 +125,16 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+void hwerr_log_error_type(enum hwerr_error_type src)
+{
+ if (src < 0 || src >= HWERR_RECOV_MAX)
+ return;
+
+ atomic_inc(&hwerr_data[src].count);
+ WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds());
+}
+EXPORT_SYMBOL_GPL(hwerr_log_error_type);
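+
+/*
+ * Example (sketch; HWERR_RECOV_MCE is a hypothetical enum value, only
+ * HWERR_RECOV_MAX is visible here): a recovery path that handled a
+ * hardware error would record it with:
+ *
+ *	hwerr_log_error_type(HWERR_RECOV_MCE);
+ */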
+
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a567600cf3ed..0685e3a8aa0a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,7 @@
#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>
+#include <linux/sys_info.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
@@ -65,6 +66,13 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
unsigned int __read_mostly hardlockup_panic =
IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
+/*
+ * Bitmask controlling what kinds of system info are printed when a hard
+ * lockup is detected: tasks, memory, locks, etc.
+ * See include/linux/sys_info.h for the bit definitions.
+ */
+static unsigned long hardlockup_si_mask;
+
#ifdef CONFIG_SYSFS
static unsigned int hardlockup_count;
@@ -178,11 +186,15 @@ static void watchdog_hardlockup_kick(void)
void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
{
+ int hardlockup_all_cpu_backtrace;
+
if (per_cpu(watchdog_hardlockup_touched, cpu)) {
per_cpu(watchdog_hardlockup_touched, cpu) = false;
return;
}
+ hardlockup_all_cpu_backtrace = (hardlockup_si_mask & SYS_INFO_ALL_BT) ?
+ 1 : sysctl_hardlockup_all_cpu_backtrace;
/*
* Check for a hardlockup by making sure the CPU's timer
* interrupt is incrementing. The timer interrupt should have
@@ -214,7 +226,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
* Prevent multiple hard-lockup reports if one cpu is already
* engaged in dumping all cpu back traces.
*/
- if (sysctl_hardlockup_all_cpu_backtrace) {
+ if (hardlockup_all_cpu_backtrace) {
if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
return;
}
@@ -243,12 +255,13 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
trigger_single_cpu_backtrace(cpu);
}
- if (sysctl_hardlockup_all_cpu_backtrace) {
+ if (hardlockup_all_cpu_backtrace) {
trigger_allbutcpu_cpu_backtrace(cpu);
if (!hardlockup_panic)
clear_bit_unlock(0, &hard_lockup_nmi_warn);
}
+ sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT);
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
@@ -339,6 +352,13 @@ static void lockup_detector_update_enable(void)
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#endif
+/*
+ * Bitmask controlling what kinds of system info are printed when a soft
+ * lockup is detected: tasks, memory, locks, etc.
+ * See include/linux/sys_info.h for the bit definitions.
+ */
+static unsigned long softlockup_si_mask;
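+
+/*
+ * Usage sketch (the proc path is assumed from this file's sysctl
+ * registration; the bit values come from include/linux/sys_info.h):
+ *
+ *	echo <mask> > /proc/sys/kernel/softlockup_sys_info
+ */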
+
static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
@@ -755,7 +775,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
unsigned long touch_ts, period_ts, now;
struct pt_regs *regs = get_irq_regs();
int duration;
- int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ int softlockup_all_cpu_backtrace;
unsigned long flags;
if (!watchdog_enabled)
@@ -767,6 +787,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (panic_in_progress())
return HRTIMER_NORESTART;
+ softlockup_all_cpu_backtrace = (softlockup_si_mask & SYS_INFO_ALL_BT) ?
+ 1 : sysctl_softlockup_all_cpu_backtrace;
+
watchdog_hardlockup_kick();
/* kick the softlockup detector */
@@ -855,6 +878,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
+ sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
if (softlockup_panic)
panic("softlockup: hung tasks");
}
@@ -1206,6 +1230,13 @@ static const struct ctl_table watchdog_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "softlockup_sys_info",
+ .data = &softlockup_si_mask,
+ .maxlen = sizeof(softlockup_si_mask),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
#ifdef CONFIG_SMP
{
.procname = "softlockup_all_cpu_backtrace",
@@ -1228,6 +1259,13 @@ static const struct ctl_table watchdog_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "hardlockup_sys_info",
+ .data = &hardlockup_si_mask,
+ .maxlen = sizeof(hardlockup_si_mask),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
#ifdef CONFIG_SMP
{
.procname = "hardlockup_all_cpu_backtrace",