diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.kexec | 34 | ||||
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 2 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 40 | ||||
-rw-r--r-- | kernel/crash_dump_dm_crypt.c | 464 | ||||
-rw-r--r-- | kernel/crash_reserve.c | 2 | ||||
-rw-r--r-- | kernel/delayacct.c | 51 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 357 | ||||
-rw-r--r-- | kernel/exit.c | 68 | ||||
-rw-r--r-- | kernel/fork.c | 277 | ||||
-rw-r--r-- | kernel/gcov/gcc_4_7.c | 4 | ||||
-rw-r--r-- | kernel/hung_task.c | 55 | ||||
-rw-r--r-- | kernel/kexec_file.c | 94 | ||||
-rw-r--r-- | kernel/kexec_handover.c | 1266 | ||||
-rw-r--r-- | kernel/kexec_internal.h | 16 | ||||
-rw-r--r-- | kernel/locking/mutex.c | 5 | ||||
-rw-r--r-- | kernel/locking/semaphore.c | 57 | ||||
-rw-r--r-- | kernel/panic.c | 8 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 42 | ||||
-rw-r--r-- | kernel/ptrace.c | 179 | ||||
-rw-r--r-- | kernel/relay.c | 111 | ||||
-rw-r--r-- | kernel/sched/fair.c | 9 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 213 | ||||
-rw-r--r-- | kernel/trace/trace.c | 45 | ||||
-rw-r--r-- | kernel/vmcore_info.c | 4 | ||||
-rw-r--r-- | kernel/watchdog.c | 94 |
26 files changed, 2603 insertions, 896 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 4d111f871951..e64ce21f9a80 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -38,8 +38,7 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE - select CRYPTO - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select KEXEC_CORE help This is new version of kexec system call. This system call is @@ -95,6 +94,20 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select DEBUG_FS + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP @@ -116,6 +129,23 @@ config CRASH_DUMP For s390, this option also enables zfcpdump. See also <file:Documentation/arch/s390/zfcpdump.rst> +config CRASH_DM_CRYPT + bool "Support saving crash dump to dm-crypt encrypted volume" + depends on KEXEC_FILE + depends on CRASH_DUMP + depends on DM_CRYPT + help + With this option enabled, user space can intereact with + /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys + persistent for the dump-capture kernel. + +config CRASH_DM_CRYPT_CONFIGS + def_tristate CRASH_DM_CRYPT + select CONFIGFS_FS + help + CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that + is required to be built-in. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Makefile b/kernel/Makefile index 434929de17ef..32e80dd626af 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -77,9 +77,11 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o +obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4b5f29168618..dd5304c6ac3c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -579,7 +579,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return try_alloc_pages(nid, 0); + return alloc_pages_nolock(nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6d3ac19cc2ac..3bc4301466f3 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4202,7 +4202,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /* - * cpuset_node_allowed - Can we allocate on a memory node? + * cpuset_current_node_allowed - Can current task allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -4241,7 +4241,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -bool cpuset_node_allowed(int node, gfp_t gfp_mask) +bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ @@ -4275,6 +4275,42 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask) return allowed; } +bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +{ + struct cgroup_subsys_state *css; + struct cpuset *cs; + bool allowed; + + /* + * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy + * and mems_allowed is likely to be empty even if we could get to it, + * so return true to avoid taking a global lock on the empty check. + */ + if (!cpuset_v2()) + return true; + + css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); + if (!css) + return true; + + /* + * Normally, accessing effective_mems would require the cpuset_mutex + * or callback_lock - but node_isset is atomic and the reference + * taken via cgroup_get_e_css is sufficient to protect css. + * + * Since this interface is intended for use by migration paths, we + * relax locking here to avoid taking global locks - while accepting + * there may be rare scenarios where the result may be innaccurate. + * + * Reclaim and migration are subject to these same race conditions, and + * cannot make strong isolation guarantees, so this is acceptable. + */ + cs = container_of(css, struct cpuset, css); + allowed = node_isset(nid, cs->effective_mems); + css_put(css); + return allowed; +} + /** * cpuset_spread_node() - On which node to begin search for a page * @rotor: round robin rotor diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c new file mode 100644 index 000000000000..401423ba477d --- /dev/null +++ b/kernel/crash_dump_dm_crypt.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/key.h> +#include <linux/keyctl.h> +#include <keys/user-type.h> +#include <linux/crash_dump.h> +#include <linux/cc_platform.h> +#include <linux/configfs.h> +#include <linux/module.h> + +#define KEY_NUM_MAX 128 /* maximum dm crypt keys */ +#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */ +#define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */ + +static unsigned int key_count; + +struct dm_crypt_key { + unsigned int key_size; + char key_desc[KEY_DESC_MAX_LEN]; + u8 data[KEY_SIZE_MAX]; +}; + +static struct keys_header { + unsigned int total_keys; + struct dm_crypt_key keys[] __counted_by(total_keys); +} *keys_header; + +static size_t get_keys_header_size(size_t total_keys) +{ + return struct_size(keys_header, keys, total_keys); +} + +unsigned long long dm_crypt_keys_addr; +EXPORT_SYMBOL_GPL(dm_crypt_keys_addr); + +static int __init setup_dmcryptkeys(char *arg) +{ + char *end; + + if (!arg) + return -EINVAL; + dm_crypt_keys_addr = memparse(arg, &end); + if (end > arg) + return 0; + + dm_crypt_keys_addr = 0; + return -EINVAL; +} + +early_param("dmcryptkeys", setup_dmcryptkeys); + +/* + * Architectures may override this function to read dm crypt keys + */ +ssize_t __weak dm_crypt_keys_read(char *buf, size_t count, u64 *ppos) +{ + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); +} + +static int add_key_to_keyring(struct dm_crypt_key *dm_key, + key_ref_t keyring_ref) +{ + key_ref_t key_ref; + int r; + + /* create or update the requested key and add it to the target keyring */ + key_ref = key_create_or_update(keyring_ref, "user", dm_key->key_desc, + dm_key->data, dm_key->key_size, + KEY_USR_ALL, KEY_ALLOC_IN_QUOTA); + + if (!IS_ERR(key_ref)) { + r = key_ref_to_ptr(key_ref)->serial; + key_ref_put(key_ref); + kexec_dprintk("Success adding key %s", dm_key->key_desc); + } else { + r = PTR_ERR(key_ref); + kexec_dprintk("Error when adding key"); + } + + key_ref_put(keyring_ref); + return r; +} + +static void get_keys_from_kdump_reserved_memory(void) +{ + struct keys_header *keys_header_loaded; + + arch_kexec_unprotect_crashkres(); + + keys_header_loaded = kmap_local_page(pfn_to_page( + kexec_crash_image->dm_crypt_keys_addr >> PAGE_SHIFT)); + + memcpy(keys_header, keys_header_loaded, get_keys_header_size(key_count)); + kunmap_local(keys_header_loaded); + arch_kexec_protect_crashkres(); +} + +static int restore_dm_crypt_keys_to_thread_keyring(void) +{ + struct dm_crypt_key *key; + size_t keys_header_size; + key_ref_t keyring_ref; + u64 addr; + + /* find the target keyring (which must be writable) */ + keyring_ref = + lookup_user_key(KEY_SPEC_USER_KEYRING, 0x01, KEY_NEED_WRITE); + if (IS_ERR(keyring_ref)) { + kexec_dprintk("Failed to get the user keyring\n"); + return PTR_ERR(keyring_ref); + } + + addr = dm_crypt_keys_addr; + dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr); + if (key_count < 0 || key_count > KEY_NUM_MAX) { + kexec_dprintk("Failed to read the number of dm-crypt keys\n"); + return -1; + } + + kexec_dprintk("There are %u keys\n", key_count); + addr = dm_crypt_keys_addr; + + keys_header_size = get_keys_header_size(key_count); + keys_header = kzalloc(keys_header_size, GFP_KERNEL); + if (!keys_header) + return -ENOMEM; + + dm_crypt_keys_read((char *)keys_header, keys_header_size, &addr); + + for (int i = 0; i < keys_header->total_keys; i++) { + key = &keys_header->keys[i]; + kexec_dprintk("Get key (size=%u)\n", key->key_size); + add_key_to_keyring(key, keyring_ref); + } + + return 0; +} + +static int read_key_from_user_keying(struct dm_crypt_key *dm_key) +{ + const struct user_key_payload *ukp; + struct key *key; + + kexec_dprintk("Requesting logon key %s", dm_key->key_desc); + key = request_key(&key_type_logon, dm_key->key_desc, NULL); + + if (IS_ERR(key)) { + pr_warn("No such logon key %s\n", dm_key->key_desc); + return PTR_ERR(key); + } + + ukp = user_key_payload_locked(key); + if (!ukp) + return -EKEYREVOKED; + + if (ukp->datalen > KEY_SIZE_MAX) { + pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX); + return -EINVAL; + } + + memcpy(dm_key->data, ukp->data, ukp->datalen); + dm_key->key_size = ukp->datalen; + kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size, + dm_key->key_desc, dm_key->data); + return 0; +} + +struct config_key { + struct config_item item; + const char *description; +}; + +static inline struct config_key *to_config_key(struct config_item *item) +{ + return container_of(item, struct config_key, item); +} + +static ssize_t config_key_description_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_config_key(item)->description); +} + +static ssize_t config_key_description_store(struct config_item *item, + const char *page, size_t count) +{ + struct config_key *config_key = to_config_key(item); + size_t len; + int ret; + + ret = -EINVAL; + len = strcspn(page, "\n"); + + if (len > KEY_DESC_MAX_LEN) { + pr_err("The key description shouldn't exceed %u characters", KEY_DESC_MAX_LEN); + return ret; + } + + if (!len) + return ret; + + kfree(config_key->description); + ret = -ENOMEM; + config_key->description = kmemdup_nul(page, len, GFP_KERNEL); + if (!config_key->description) + return ret; + + return count; +} + +CONFIGFS_ATTR(config_key_, description); + +static struct configfs_attribute *config_key_attrs[] = { + &config_key_attr_description, + NULL, +}; + +static void config_key_release(struct config_item *item) +{ + kfree(to_config_key(item)); + key_count--; +} + +static struct configfs_item_operations config_key_item_ops = { + .release = config_key_release, +}; + +static const struct config_item_type config_key_type = { + .ct_item_ops = &config_key_item_ops, + .ct_attrs = config_key_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item *config_keys_make_item(struct config_group *group, + const char *name) +{ + struct config_key *config_key; + + if (key_count > KEY_NUM_MAX) { + pr_err("Only %u keys at maximum to be created\n", KEY_NUM_MAX); + return ERR_PTR(-EINVAL); + } + + config_key = kzalloc(sizeof(struct config_key), GFP_KERNEL); + if (!config_key) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&config_key->item, name, &config_key_type); + + key_count++; + + return &config_key->item; +} + +static ssize_t config_keys_count_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", key_count); +} + +CONFIGFS_ATTR_RO(config_keys_, count); + +static bool is_dm_key_reused; + +static ssize_t config_keys_reuse_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", is_dm_key_reused); +} + +static ssize_t config_keys_reuse_store(struct config_item *item, + const char *page, size_t count) +{ + if (!kexec_crash_image || !kexec_crash_image->dm_crypt_keys_addr) { + kexec_dprintk( + "dm-crypt keys haven't be saved to crash-reserved memory\n"); + return -EINVAL; + } + + if (kstrtobool(page, &is_dm_key_reused)) + return -EINVAL; + + if (is_dm_key_reused) + get_keys_from_kdump_reserved_memory(); + + return count; +} + +CONFIGFS_ATTR(config_keys_, reuse); + +static struct configfs_attribute *config_keys_attrs[] = { + &config_keys_attr_count, + &config_keys_attr_reuse, + NULL, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. + */ +static struct configfs_group_operations config_keys_group_ops = { + .make_item = config_keys_make_item, +}; + +static const struct config_item_type config_keys_type = { + .ct_group_ops = &config_keys_group_ops, + .ct_attrs = config_keys_attrs, + .ct_owner = THIS_MODULE, +}; + +static bool restore; + +static ssize_t config_keys_restore_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", restore); +} + +static ssize_t config_keys_restore_store(struct config_item *item, + const char *page, size_t count) +{ + if (!restore) + restore_dm_crypt_keys_to_thread_keyring(); + + if (kstrtobool(page, &restore)) + return -EINVAL; + + return count; +} + +CONFIGFS_ATTR(config_keys_, restore); + +static struct configfs_attribute *kdump_config_keys_attrs[] = { + &config_keys_attr_restore, + NULL, +}; + +static const struct config_item_type kdump_config_keys_type = { + .ct_attrs = kdump_config_keys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem config_keys_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "crash_dm_crypt_keys", + .ci_type = &config_keys_type, + }, + }, +}; + +static int build_keys_header(void) +{ + struct config_item *item = NULL; + struct config_key *key; + int i, r; + + if (keys_header != NULL) + kvfree(keys_header); + + keys_header = kzalloc(get_keys_header_size(key_count), GFP_KERNEL); + if (!keys_header) + return -ENOMEM; + + keys_header->total_keys = key_count; + + i = 0; + list_for_each_entry(item, &config_keys_subsys.su_group.cg_children, + ci_entry) { + if (item->ci_type != &config_key_type) + continue; + + key = to_config_key(item); + + if (!key->description) { + pr_warn("No key description for key %s\n", item->ci_name); + return -EINVAL; + } + + strscpy(keys_header->keys[i].key_desc, key->description, + KEY_DESC_MAX_LEN); + r = read_key_from_user_keying(&keys_header->keys[i]); + if (r != 0) { + kexec_dprintk("Failed to read key %s\n", + keys_header->keys[i].key_desc); + return r; + } + i++; + kexec_dprintk("Found key: %s\n", item->ci_name); + } + + return 0; +} + +int crash_load_dm_crypt_keys(struct kimage *image) +{ + struct kexec_buf kbuf = { + .image = image, + .buf_min = 0, + .buf_max = ULONG_MAX, + .top_down = false, + .random = true, + }; + int r; + + + if (key_count <= 0) { + kexec_dprintk("No dm-crypt keys\n"); + return -ENOENT; + } + + if (!is_dm_key_reused) { + image->dm_crypt_keys_addr = 0; + r = build_keys_header(); + if (r) + return r; + } + + kbuf.buffer = keys_header; + kbuf.bufsz = get_keys_header_size(key_count); + + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + r = kexec_add_buffer(&kbuf); + if (r) { + kvfree((void *)kbuf.buffer); + return r; + } + image->dm_crypt_keys_addr = kbuf.mem; + image->dm_crypt_keys_sz = kbuf.bufsz; + kexec_dprintk( + "Loaded dm crypt keys to kexec_buffer bufsz=0x%lx memsz=0x%lx\n", + kbuf.bufsz, kbuf.memsz); + + return r; +} + +static int __init configfs_dmcrypt_keys_init(void) +{ + int ret; + + if (is_kdump_kernel()) { + config_keys_subsys.su_group.cg_item.ci_type = + &kdump_config_keys_type; + } + + config_group_init(&config_keys_subsys.su_group); + mutex_init(&config_keys_subsys.su_mutex); + ret = configfs_register_subsystem(&config_keys_subsys); + if (ret) { + pr_err("Error %d while registering subsystem %s\n", ret, + config_keys_subsys.su_group.cg_item.ci_namebuf); + goto out_unregister; + } + + return 0; + +out_unregister: + configfs_unregister_subsystem(&config_keys_subsys); + + return ret; +} + +module_init(configfs_dmcrypt_keys_init); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index aff7c0fdbefa..acb6bf42e30d 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -131,7 +131,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("crahskernel: Memory value expected after '@'\n"); + pr_warn("crashkernel: Memory value expected after '@'\n"); return -EINVAL; } } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index eb63a021ac04..30e7912ebb0d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,6 +14,15 @@ #include <linux/delayacct.h> #include <linux/module.h> +#define UPDATE_DELAY(type) \ +do { \ + d->type##_delay_max = tsk->delays->type##_delay_max; \ + d->type##_delay_min = tsk->delays->type##_delay_min; \ + tmp = d->type##_delay_total + tsk->delays->type##_delay; \ + d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \ + d->type##_count += tsk->delays->type##_count; \ +} while (0) + DEFINE_STATIC_KEY_FALSE(delayacct_key); int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; @@ -173,41 +182,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ raw_spin_lock_irqsave(&tsk->delays->lock, flags); - d->blkio_delay_max = tsk->delays->blkio_delay_max; - d->blkio_delay_min = tsk->delays->blkio_delay_min; - tmp = d->blkio_delay_total + tsk->delays->blkio_delay; - d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; - d->swapin_delay_max = tsk->delays->swapin_delay_max; - d->swapin_delay_min = tsk->delays->swapin_delay_min; - tmp = d->swapin_delay_total + tsk->delays->swapin_delay; - d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; - d->freepages_delay_max = tsk->delays->freepages_delay_max; - d->freepages_delay_min = tsk->delays->freepages_delay_min; - tmp = d->freepages_delay_total + tsk->delays->freepages_delay; - d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; - d->thrashing_delay_max = tsk->delays->thrashing_delay_max; - d->thrashing_delay_min = tsk->delays->thrashing_delay_min; - tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; - d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; - d->compact_delay_max = tsk->delays->compact_delay_max; - d->compact_delay_min = tsk->delays->compact_delay_min; - tmp = d->compact_delay_total + tsk->delays->compact_delay; - d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; - d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max; - d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min; - tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; - d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; - d->irq_delay_max = tsk->delays->irq_delay_max; - d->irq_delay_min = tsk->delays->irq_delay_min; - tmp = d->irq_delay_total + tsk->delays->irq_delay; - d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp; - d->blkio_count += tsk->delays->blkio_count; - d->swapin_count += tsk->delays->swapin_count; - d->freepages_count += tsk->delays->freepages_count; - d->thrashing_count += tsk->delays->thrashing_count; - d->compact_count += tsk->delays->compact_count; - d->wpcopy_count += tsk->delays->wpcopy_count; - d->irq_count += tsk->delays->irq_count; + UPDATE_DELAY(blkio); + UPDATE_DELAY(swapin); + UPDATE_DELAY(freepages); + UPDATE_DELAY(thrashing); + UPDATE_DELAY(compact); + UPDATE_DELAY(wpcopy); + UPDATE_DELAY(irq); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d783b5882b6..4c965ba77f9f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -29,6 +29,7 @@ #include <linux/workqueue.h> #include <linux/srcu.h> #include <linux/oom.h> /* check_stable_address_space */ +#include <linux/pagewalk.h> #include <linux/uprobes.h> @@ -152,91 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) } /** - * __replace_page - replace page in vma by new page. - * based on replace_page in mm/ksm.c - * - * @vma: vma that holds the pte pointing to page - * @addr: address the old @page is mapped at - * @old_page: the page we are replacing by new_page - * @new_page: the modified page we replace page by - * - * If @new_page is NULL, only unmap @old_page. - * - * Returns 0 on success, negative error code otherwise. - */ -static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *old_page, struct page *new_page) -{ - struct folio *old_folio = page_folio(old_page); - struct folio *new_folio; - struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); - int err; - struct mmu_notifier_range range; - pte_t pte; - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, - addr + PAGE_SIZE); - - if (new_page) { - new_folio = page_folio(new_page); - err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); - if (err) - return err; - } - - /* For folio_free_swap() below */ - folio_lock(old_folio); - - mmu_notifier_invalidate_range_start(&range); - err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) - goto unlock; - VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - pte = ptep_get(pvmw.pte); - - /* - * Handle PFN swap PTES, such as device-exclusive ones, that actually - * map pages: simply trigger GUP again to fix it up. - */ - if (unlikely(!pte_present(pte))) { - page_vma_mapped_walk_done(&pvmw); - goto unlock; - } - - if (new_page) { - folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(new_folio, vma); - } else - /* no new page, just dec_mm_counter for old_page */ - dec_mm_counter(mm, MM_ANONPAGES); - - if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_folio)); - inc_mm_counter(mm, MM_ANONPAGES); - } - - flush_cache_page(vma, addr, pte_pfn(pte)); - ptep_clear_flush(vma, addr, pvmw.pte); - if (new_page) - set_pte_at(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); - - folio_remove_rmap_pte(old_folio, old_page, vma); - if (!folio_mapped(old_folio)) - folio_free_swap(old_folio); - page_vma_mapped_walk_done(&pvmw); - folio_put(old_folio); - - err = 0; - unlock: - mmu_notifier_invalidate_range_end(&range); - folio_unlock(old_folio); - return err; -} - -/** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn @@ -463,6 +379,95 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, return ret; } +static bool orig_page_is_identical(struct vm_area_struct *vma, + unsigned long vaddr, struct page *page, bool *pmd_mappable) +{ + const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; + struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, + index); + struct page *orig_page; + bool identical; + + if (IS_ERR(orig_folio)) + return false; + orig_page = folio_file_page(orig_folio, index); + + *pmd_mappable = folio_test_pmd_mappable(orig_folio); + identical = folio_test_uptodate(orig_folio) && + pages_identical(page, orig_page); + folio_put(orig_folio); + return identical; +} + +static int __uprobe_write_opcode(struct vm_area_struct *vma, + struct folio_walk *fw, struct folio *folio, + unsigned long opcode_vaddr, uprobe_opcode_t opcode) +{ + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + const bool is_register = !!is_swbp_insn(&opcode); + bool pmd_mappable; + + /* For now, we'll only handle PTE-mapped folios. */ + if (fw->level != FW_LEVEL_PTE) + return -EFAULT; + + /* + * See can_follow_write_pte(): we'd actually prefer a writable PTE here, + * but the VMA might not be writable. + */ + if (!pte_write(fw->pte)) { + if (!PageAnonExclusive(fw->page)) + return -EFAULT; + if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) + return -EFAULT; + /* SOFTDIRTY is handled via pte_mkdirty() below. */ + } + + /* + * We'll temporarily unmap the page and flush the TLB, such that we can + * modify the page atomically. + */ + flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); + fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); + copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + + /* + * When unregistering, we may only zap a PTE if uffd is disabled and + * there are no unexpected folio references ... + */ + if (is_register || userfaultfd_missing(vma) || + (folio_ref_count(folio) != folio_mapcount(folio) + 1 + + folio_test_swapcache(folio) * folio_nr_pages(folio))) + goto remap; + + /* + * ... and the mapped page is identical to the original page that + * would get faulted in on next access. + */ + if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) + goto remap; + + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); + folio_remove_rmap_pte(folio, fw->page, vma); + if (!folio_mapped(folio) && folio_test_swapcache(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); + } + folio_put(folio); + + return pmd_mappable; +remap: + /* + * Make sure that our copy_to_page() changes become visible before the + * set_pte_at() write. + */ + smp_wmb(); + /* We modified the page. Make sure to mark the PTE dirty. */ + set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); + return 0; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -474,146 +479,146 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. - * @vaddr: the virtual address to store the opcode. - * @opcode: opcode to be written at @vaddr. + * @vma: the probed virtual memory area. + * @opcode_vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ -int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - struct page *old_page, *new_page; - struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; - bool orig_page_huge = false; unsigned int gup_flags = FOLL_FORCE; + struct mmu_notifier_range range; + struct folio_walk fw; + struct folio *folio; + struct page *page; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); -retry: + if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) + return -EINVAL; + + /* + * When registering, we have to break COW to get an exclusive anonymous + * page that we can safely modify. Use FOLL_WRITE to trigger a write + * fault if required. When unregistering, we might be lucky and the + * anon page is already gone. So defer write faults until really + * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * cannot deal with PMDs yet. + */ if (is_register) - gup_flags |= FOLL_SPLIT_PMD; - /* Read the page with vaddr into memory */ - old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR(old_page)) - return PTR_ERR(old_page); + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; - ret = verify_opcode(old_page, vaddr, &opcode); +retry: + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) - goto put_old; - - if (is_zero_page(old_page)) { - ret = -EINVAL; - goto put_old; - } + goto out; + folio = page_folio(page); - if (WARN(!is_register && PageCompound(old_page), - "uprobe unregister should never work on compound page\n")) { - ret = -EINVAL; - goto put_old; + ret = verify_opcode(page, opcode_vaddr, &opcode); + if (ret <= 0) { + folio_put(folio); + goto out; } /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); - if (ret) - goto put_old; + if (ret) { + folio_put(folio); + goto out; + } ref_ctr_updated = 1; } ret = 0; - if (!is_register && !PageAnon(old_page)) - goto put_old; - - ret = anon_vma_prepare(vma); - if (ret) - goto put_old; - - ret = -ENOMEM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); - if (!new_page) - goto put_old; - - __SetPageUptodate(new_page); - copy_highpage(new_page, old_page); - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (unlikely(!folio_test_anon(folio))) { + VM_WARN_ON_ONCE(is_register); + folio_put(folio); + goto out; + } if (!is_register) { - struct page *orig_page; - pgoff_t index; - - VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); - - index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; - orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, - index); - - if (orig_page) { - if (PageUptodate(orig_page) && - pages_identical(new_page, orig_page)) { - /* let go new_page */ - put_page(new_page); - new_page = NULL; - - if (PageCompound(orig_page)) - orig_page_huge = true; - } - put_page(orig_page); - } + /* + * In the common case, we'll be able to zap the page when + * unregistering. So trigger MMU notifiers now, as we won't + * be able to do it under PTL. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + vaddr, vaddr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); } - ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); - if (new_page) - put_page(new_page); -put_old: - put_page(old_page); + ret = -EAGAIN; + /* Walk the page tables again, to perform the actual update. */ + if (folio_walk_start(&fw, vma, vaddr, 0)) { + if (fw.page == page) + ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + folio_walk_end(&fw, vma); + } + + if (!is_register) + mmu_notifier_invalidate_range_end(&range); - if (unlikely(ret == -EAGAIN)) + folio_put(folio); + switch (ret) { + case -EFAULT: + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; + fallthrough; + case -EAGAIN: goto retry; + default: + break; + } +out: /* Revert back reference counter if instruction update failed. */ - if (ret && is_register && ref_ctr_updated) + if (ret < 0 && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); /* try collapse pmd for compound page */ - if (!ret && orig_page_huge) + if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); - return ret; + return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); } /** * set_orig_insn - Restore the original instruction. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_orig_insn(struct arch_uprobe *auprobe, + struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } @@ -1134,10 +1139,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) return ret; } -static int -install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long vaddr) +static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; @@ -1153,7 +1158,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) @@ -1162,11 +1167,13 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; } -static int -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; + set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { @@ -1296,10 +1303,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); + err = install_breakpoint(uprobe, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) - err |= remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: @@ -1472,7 +1479,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); - err |= remove_breakpoint(uprobe, mm, vaddr); + err |= remove_breakpoint(uprobe, vma, vaddr); } mmap_read_unlock(mm); @@ -1610,7 +1617,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } diff --git a/kernel/exit.c b/kernel/exit.c index 38645039dd8f..bd743900354c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -421,44 +421,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -static void coredump_task_exit(struct task_struct *tsk) +static void coredump_task_exit(struct task_struct *tsk, + struct core_state *core_state) { - struct core_state *core_state; + struct core_thread self; + self.task = tsk; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* - * Serialize with any possible pending coredump. - * We must hold siglock around checking core_state - * and setting PF_POSTCOREDUMP. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group without PF_POSTCOREDUMP set. + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. */ - spin_lock_irq(&tsk->sighand->siglock); - tsk->flags |= PF_POSTCOREDUMP; - core_state = tsk->signal->core_state; - spin_unlock_irq(&tsk->sighand->siglock); - if (core_state) { - struct core_thread self; - - self.task = current; - if (self.task->flags & PF_SIGNALED) - self.next = xchg(&core_state->dumper.next, &self); - else - self.task = NULL; - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); - for (;;) { - set_current_state(TASK_IDLE|TASK_FREEZABLE); - if (!self.task) /* see coredump_finish() */ - break; - schedule(); - } - __set_current_state(TASK_RUNNING); + for (;;) { + set_current_state(TASK_IDLE|TASK_FREEZABLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); } + __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG @@ -882,6 +868,7 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; + struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; @@ -891,7 +878,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) signal->group_exit_code = code; signal->group_stop_count = 0; } + /* + * Serialize with any possible pending coredump. + * We must hold siglock around checking core_state + * and setting PF_POSTCOREDUMP. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group without PF_POSTCOREDUMP set. + */ + tsk->flags |= PF_POSTCOREDUMP; + core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); + + if (unlikely(core_state)) + coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) @@ -900,15 +899,12 @@ void __noreturn do_exit(long code) int group_dead; WARN_ON(irqs_disabled()); - - synchronize_group_exit(tsk, code); - WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); - coredump_task_exit(tsk); + synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 85afccfdf3b1..1ee8eb11f38b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -112,6 +112,9 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +/* For dup_mmap(). */ +#include "../mm/internal.h" + #include <trace/events/sched.h> #define CREATE_TRACE_POINTS @@ -428,88 +431,9 @@ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; -/* SLAB cache for vm_area_struct structures */ -static struct kmem_cache *vm_area_cachep; - /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!vma) - return NULL; - - vma_init(vma, mm); - - return vma; -} - -static void vm_area_init_from(const struct vm_area_struct *src, - struct vm_area_struct *dest) -{ - dest->vm_mm = src->vm_mm; - dest->vm_ops = src->vm_ops; - dest->vm_start = src->vm_start; - dest->vm_end = src->vm_end; - dest->anon_vma = src->anon_vma; - dest->vm_pgoff = src->vm_pgoff; - dest->vm_file = src->vm_file; - dest->vm_private_data = src->vm_private_data; - vm_flags_init(dest, src->vm_flags); - memcpy(&dest->vm_page_prot, &src->vm_page_prot, - sizeof(dest->vm_page_prot)); - /* - * src->shared.rb may be modified concurrently when called from - * dup_mmap(), but the clone will reinitialize it. - */ - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, - sizeof(dest->vm_userfaultfd_ctx)); -#ifdef CONFIG_ANON_VMA_NAME - dest->anon_name = src->anon_name; -#endif -#ifdef CONFIG_SWAP - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, - sizeof(dest->swap_readahead_info)); -#endif -#ifndef CONFIG_MMU - dest->vm_region = src->vm_region; -#endif -#ifdef CONFIG_NUMA - dest->vm_policy = src->vm_policy; -#endif -} - -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) -{ - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - - if (!new) - return NULL; - - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - vm_area_init_from(orig, new); - vma_lock_init(new, true); - INIT_LIST_HEAD(&new->anon_vma_chain); - vma_numab_state_init(new); - dup_anon_vma_name(orig, new); - - return new; -} - -void vm_area_free(struct vm_area_struct *vma) -{ - /* The vma should be detached while being destroyed. */ - vma_assert_detached(vma); - vma_numab_state_free(vma); - free_anon_vma_name(vma); - kmem_cache_free(vm_area_cachep, vma); -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -589,7 +513,7 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; @@ -604,183 +528,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) } #ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp; - int retval; - unsigned long charge = 0; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, 0); - - if (mmap_write_lock_killable(oldmm)) - return -EINTR; - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - dup_mm_exe_file(mm, oldmm); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - /* Use __mt_dup() to efficiently build an identical maple tree. */ - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); - if (unlikely(retval)) - goto out; - - mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(vmi, mpnt) { - struct file *file; - - vma_start_write(mpnt); - if (mpnt->vm_flags & VM_DONTCOPY) { - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, - mpnt->vm_end, GFP_KERNEL); - if (retval) - goto loop_out; - - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - /* - * Don't duplicate many vmas if we've been oom-killed (for - * example) - */ - if (fatal_signal_pending(current)) { - retval = -EINTR; - goto loop_out; - } - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = vm_area_dup(mpnt); - if (!tmp) - goto fail_nomem; - - /* track_pfn_copy() will later take care of copying internal state. */ - if (unlikely(tmp->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(tmp); - - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* - * VM_WIPEONFORK gets a clean slate in the child. - * Don't prepare anon_vma until fault since we don't - * copy page for current vma. - */ - tmp->anon_vma = NULL; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - vm_flags_clear(tmp, VM_LOCKED_MASK); - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - file = tmp->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - get_file(file); - i_mmap_lock_write(mapping); - if (vma_is_shared_maywrite(tmp)) - mapping_allow_writable(mapping); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(tmp, mpnt); - - if (retval) { - mpnt = vma_next(&vmi); - goto loop_out; - } - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -loop_out: - vma_iter_free(&vmi); - if (!retval) { - mt_set_in_rcu(vmi.mas.tree); - ksm_fork(mm, oldmm); - khugepaged_fork(mm, oldmm); - } else { - - /* - * The entire maple tree has already been duplicated. If the - * mmap duplication fails, mark the failure point with - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, - * stop releasing VMAs that have not been duplicated after this - * point. - */ - if (mpnt) { - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); - mas_store(&vmi.mas, XA_ZERO_ENTRY); - /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); - } - /* - * The mm_struct is going to exit, but the locks will be dropped - * first. Set the mm_struct as unstable is advisable as it is - * not fully initialised. - */ - set_bit(MMF_UNSTABLE, &mm->flags); - } -out: - mmap_write_unlock(mm); - flush_tlb_mm(oldmm); - mmap_write_unlock(oldmm); - if (!retval) - dup_userfaultfd_complete(&uf); - else - dup_userfaultfd_fail(&uf); - return retval; - -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - vm_area_free(tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto loop_out; -} - static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); @@ -794,13 +541,6 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - mmap_write_lock(oldmm); - dup_mm_exe_file(mm, oldmm); - mmap_write_unlock(oldmm); - return 0; -} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ @@ -3228,11 +2968,6 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { - struct kmem_cache_args args = { - .use_freeptr_offset = true, - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), - }; - sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3249,10 +2984,6 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), &args, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index fd75b4a484d7..a08cc076f332 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -22,10 +22,6 @@ #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 -#elif (__GNUC__ >= 7) -#define GCOV_COUNTERS 9 -#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) -#define GCOV_COUNTERS 10 #else #define GCOV_COUNTERS 9 #endif diff --git a/kernel/hung_task.c b/kernel/hung_task.c index dc898ec93463..d2432df2b905 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -22,6 +22,7 @@ #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> +#include <linux/hung_task.h> #include <trace/events/sched.h> @@ -98,30 +99,62 @@ static struct notifier_block panic_block = { static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; - unsigned long owner; - struct mutex *lock; + unsigned long owner, blocker, blocker_type; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); - lock = READ_ONCE(task->blocker_mutex); - if (!lock) + blocker = READ_ONCE(task->blocker); + if (!blocker) return; - owner = mutex_get_owner(lock); + blocker_type = hung_task_get_blocker_type(blocker); + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + owner = mutex_get_owner( + (struct mutex *)hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_SEM: + owner = sem_last_holder( + (struct semaphore *)hung_task_blocker_to_lock(blocker)); + break; + default: + WARN_ON_ONCE(1); + return; + } + + if (unlikely(!owner)) { - pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", - task->comm, task->pid); + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", + task->comm, task->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", + task->comm, task->pid); + break; + } return; } /* Ensure the owner information is correct. */ for_each_process_thread(g, t) { - if ((unsigned long)t == owner) { + if ((unsigned long)t != owner) + continue; + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", - task->comm, task->pid, t->comm, t->pid); - sched_show_task(t); - return; + task->comm, task->pid, t->comm, t->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", + task->comm, task->pid, t->comm, t->pid); + break; } + sched_show_task(t); + return; } } #else diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 0adb645072aa..69fe76fd9233 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -19,7 +19,6 @@ #include <linux/list.h> #include <linux/fs.h> #include <linux/ima.h> -#include <crypto/hash.h> #include <crypto/sha2.h> #include <linux/elf.h> #include <linux/elfcore.h> @@ -277,6 +276,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* IMA needs to pass the measurement list to the next kernel. */ ima_add_kexec_buffer(image); + /* If KHO is active, add its images to the list */ + ret = kho_fill_kimage(image); + if (ret) + goto out; + /* Call image load handler */ ldata = kexec_image_load_default(image); @@ -469,6 +473,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, temp_end = min(end, kbuf->buf_max); temp_start = temp_end - kbuf->memsz + 1; + kexec_random_range_start(temp_start, temp_end, kbuf, &temp_start); do { /* align down start */ @@ -513,6 +518,8 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, temp_start = max(start, kbuf->buf_min); + kexec_random_range_start(temp_start, end, kbuf, &temp_start); + do { temp_start = ALIGN(temp_start, kbuf->buf_align); temp_end = temp_start + kbuf->memsz - 1; @@ -672,6 +679,14 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; + /* + * If KHO is active, only use KHO scratch memory. All other memory + * could potentially be handed over. + */ + ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback); + if (ret <= 0) + return ret; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -736,11 +751,10 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Calculate and store the digest of segments */ static int kexec_calculate_store_digests(struct kimage *image) { - struct crypto_shash *tfm; - struct shash_desc *desc; + struct sha256_state state; int ret = 0, i, j, zero_buf_sz, sha_region_sz; - size_t desc_size, nullsz; - char *digest; + size_t nullsz; + u8 digest[SHA256_DIGEST_SIZE]; void *zero_buf; struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; @@ -751,37 +765,12 @@ static int kexec_calculate_store_digests(struct kimage *image) zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out; - } - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - desc = kzalloc(desc_size, GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto out_free_tfm; - } - sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) { - ret = -ENOMEM; - goto out_free_desc; - } - - desc->tfm = tfm; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto out_free_sha_regions; + if (!sha_regions) + return -ENOMEM; - digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); - if (!digest) { - ret = -ENOMEM; - goto out_free_sha_regions; - } + sha256_init(&state); for (j = i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -807,10 +796,7 @@ static int kexec_calculate_store_digests(struct kimage *image) if (check_ima_segment_index(image, i)) continue; - ret = crypto_shash_update(desc, ksegment->kbuf, - ksegment->bufsz); - if (ret) - break; + sha256_update(&state, ksegment->kbuf, ksegment->bufsz); /* * Assume rest of the buffer is filled with zero and @@ -822,44 +808,26 @@ static int kexec_calculate_store_digests(struct kimage *image) if (bytes > zero_buf_sz) bytes = zero_buf_sz; - ret = crypto_shash_update(desc, zero_buf, bytes); - if (ret) - break; + sha256_update(&state, zero_buf, bytes); nullsz -= bytes; } - if (ret) - break; - sha_regions[j].start = ksegment->mem; sha_regions[j].len = ksegment->memsz; j++; } - if (!ret) { - ret = crypto_shash_final(desc, digest); - if (ret) - goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", - sha_regions, sha_region_sz, 0); - if (ret) - goto out_free_digest; + sha256_final(&state, digest); - ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); - if (ret) - goto out_free_digest; - } + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_sha_regions; -out_free_digest: - kfree(digest); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); out_free_sha_regions: vfree(sha_regions); -out_free_desc: - kfree(desc); -out_free_tfm: - kfree(tfm); -out: return ret; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c new file mode 100644 index 000000000000..69b953551677 --- /dev/null +++ b/kernel/kexec_handover.c @@ -0,0 +1,1266 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover.c - kexec handover metadata processing + * Copyright (C) 2023 Alexander Graf <graf@amazon.com> + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> + * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/cma.h> +#include <linux/count_zeros.h> +#include <linux/debugfs.h> +#include <linux/kexec.h> +#include <linux/kexec_handover.h> +#include <linux/libfdt.h> +#include <linux/list.h> +#include <linux/memblock.h> +#include <linux/notifier.h> +#include <linux/page-isolation.h> + +#include <asm/early_ioremap.h> + +/* + * KHO is tightly coupled with mm init and needs access to some of mm + * internal APIs. + */ +#include "../mm/internal.h" +#include "kexec_internal.h" + +#define KHO_FDT_COMPATIBLE "kho-v1" +#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" +#define PROP_SUB_FDT "fdt" + +static bool kho_enable __ro_after_init; + +bool kho_is_enabled(void) +{ + return kho_enable; +} +EXPORT_SYMBOL_GPL(kho_is_enabled); + +static int __init kho_parse_enable(char *p) +{ + return kstrtobool(p, &kho_enable); +} +early_param("kho", kho_parse_enable); + +/* + * Keep track of memory that is to be preserved across KHO. + * + * The serializing side uses two levels of xarrays to manage chunks of per-order + * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a + * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations + * each bitmap will cover 16M of address space. Thus, for 16G of memory at most + * 512K of bitmap memory will be needed for order 0. + * + * This approach is fully incremental, as the serialization progresses folios + * can continue be aggregated to the tracker. The final step, immediately prior + * to kexec would serialize the xarray information into a linked list for the + * successor kernel to parse. + */ + +#define PRESERVE_BITS (512 * 8) + +struct kho_mem_phys_bits { + DECLARE_BITMAP(preserve, PRESERVE_BITS); +}; + +struct kho_mem_phys { + /* + * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized + * to order. + */ + struct xarray phys_bits; +}; + +struct kho_mem_track { + /* Points to kho_mem_phys, each order gets its own bitmap tree */ + struct xarray orders; +}; + +struct khoser_mem_chunk; + +struct kho_serialization { + struct page *fdt; + struct list_head fdt_list; + struct dentry *sub_fdt_dir; + struct kho_mem_track track; + /* First chunk of serialized preserved memory map */ + struct khoser_mem_chunk *preserved_mem_map; +}; + +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +{ + void *elm, *res; + + elm = xa_load(xa, index); + if (elm) + return elm; + + elm = kzalloc(sz, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); + + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); + if (xa_is_err(res)) + res = ERR_PTR(xa_err(res)); + + if (res) { + kfree(elm); + return res; + } + + return elm; +} + +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + const unsigned long pfn_high = pfn >> order; + + physxa = xa_load(&track->orders, order); + if (!physxa) + continue; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (!bits) + continue; + + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + pfn += 1 << order; + } +} + +static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; + + might_sleep(); + + physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa)); + if (IS_ERR(physxa)) + return PTR_ERR(physxa); + + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, + sizeof(*bits)); + if (IS_ERR(bits)) + return PTR_ERR(bits); + + set_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + return 0; +} + +/* almost as free_reserved_page(), just don't free the page */ +static void kho_restore_page(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + adjust_managed_page_count(page, 1); +} + +/** + * kho_restore_folio - recreates the folio from the preserved memory. + * @phys: physical address of the folio. + * + * Return: pointer to the struct folio on success, NULL on failure. + */ +struct folio *kho_restore_folio(phys_addr_t phys) +{ + struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned long order; + + if (!page) + return NULL; + + order = page->private; + if (order) { + if (order > MAX_PAGE_ORDER) + return NULL; + + prep_compound_page(page, order); + } else { + kho_restore_page(page); + } + + return page_folio(page); +} +EXPORT_SYMBOL_GPL(kho_restore_folio); + +/* Serialize and deserialize struct kho_mem_phys across kexec + * + * Record all the bitmaps in a linked list of pages for the next kernel to + * process. Each chunk holds bitmaps of the same order and each block of bitmaps + * starts at a given physical address. This allows the bitmaps to be sparse. The + * xarray is used to store them in a tree while building up the data structure, + * but the KHO successor kernel only needs to process them once in order. + * + * All of this memory is normal kmalloc() memory and is not marked for + * preservation. The successor kernel will remain isolated to the scratch space + * until it completes processing this list. Once processed all the memory + * storing these ranges will be marked as free. + */ + +struct khoser_mem_bitmap_ptr { + phys_addr_t phys_start; + DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *); +}; + +struct khoser_mem_chunk_hdr { + DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *); + unsigned int order; + unsigned int num_elms; +}; + +#define KHOSER_BITMAP_SIZE \ + ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \ + sizeof(struct khoser_mem_bitmap_ptr)) + +struct khoser_mem_chunk { + struct khoser_mem_chunk_hdr hdr; + struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE]; +}; + +static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); + +static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, + unsigned long order) +{ + struct khoser_mem_chunk *chunk; + + chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!chunk) + return NULL; + chunk->hdr.order = order; + if (cur_chunk) + KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); + return chunk; +} + +static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) +{ + struct khoser_mem_chunk *chunk = first_chunk; + + while (chunk) { + struct khoser_mem_chunk *tmp = chunk; + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + kfree(tmp); + } +} + +static int kho_mem_serialize(struct kho_serialization *ser) +{ + struct khoser_mem_chunk *first_chunk = NULL; + struct khoser_mem_chunk *chunk = NULL; + struct kho_mem_phys *physxa; + unsigned long order; + + xa_for_each(&ser->track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + chunk = new_chunk(chunk, order); + if (!chunk) + goto err_free; + + if (!first_chunk) + first_chunk = chunk; + + xa_for_each(&physxa->phys_bits, phys, bits) { + struct khoser_mem_bitmap_ptr *elm; + + if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { + chunk = new_chunk(chunk, order); + if (!chunk) + goto err_free; + } + + elm = &chunk->bitmaps[chunk->hdr.num_elms]; + chunk->hdr.num_elms++; + elm->phys_start = (phys * PRESERVE_BITS) + << (order + PAGE_SHIFT); + KHOSER_STORE_PTR(elm->bitmap, bits); + } + } + + ser->preserved_mem_map = first_chunk; + + return 0; + +err_free: + kho_mem_ser_free(first_chunk); + return -ENOMEM; +} + +static void deserialize_bitmap(unsigned int order, + struct khoser_mem_bitmap_ptr *elm) +{ + struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap); + unsigned long bit; + + for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) { + int sz = 1 << (order + PAGE_SHIFT); + phys_addr_t phys = + elm->phys_start + (bit << (order + PAGE_SHIFT)); + struct page *page = phys_to_page(phys); + + memblock_reserve(phys, sz); + memblock_reserved_mark_noinit(phys, sz); + page->private = order; + } +} + +static void __init kho_mem_deserialize(const void *fdt) +{ + struct khoser_mem_chunk *chunk; + const phys_addr_t *mem; + int len; + + mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + + if (!mem || len != sizeof(*mem)) { + pr_err("failed to get preserved memory bitmaps\n"); + return; + } + + chunk = *mem ? phys_to_virt(*mem) : NULL; + while (chunk) { + unsigned int i; + + for (i = 0; i != chunk->hdr.num_elms; i++) + deserialize_bitmap(chunk->hdr.order, + &chunk->bitmaps[i]); + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + } +} + +/* + * With KHO enabled, memory can become fragmented because KHO regions may + * be anywhere in physical address space. The scratch regions give us a + * safe zones that we will never see KHO allocations from. This is where we + * can later safely load our new kexec images into and then use the scratch + * area for early allocations that happen before page allocator is + * initialized. + */ +static struct kho_scratch *kho_scratch; +static unsigned int kho_scratch_cnt; + +/* + * The scratch areas are scaled by default as percent of memory allocated from + * memblock. A user can override the scale with command line parameter: + * + * kho_scratch=N% + * + * It is also possible to explicitly define size for a lowmem, a global and + * per-node scratch areas: + * + * kho_scratch=l[KMG],n[KMG],m[KMG] + * + * The explicit size definition takes precedence over scale definition. + */ +static unsigned int scratch_scale __initdata = 200; +static phys_addr_t scratch_size_global __initdata; +static phys_addr_t scratch_size_pernode __initdata; +static phys_addr_t scratch_size_lowmem __initdata; + +static int __init kho_parse_scratch_size(char *p) +{ + size_t len; + unsigned long sizes[3]; + int i; + + if (!p) + return -EINVAL; + + len = strlen(p); + if (!len) + return -EINVAL; + + /* parse nn% */ + if (p[len - 1] == '%') { + /* unsigned int max is 4,294,967,295, 10 chars */ + char s_scale[11] = {}; + int ret = 0; + + if (len > ARRAY_SIZE(s_scale)) + return -EINVAL; + + memcpy(s_scale, p, len - 1); + ret = kstrtouint(s_scale, 10, &scratch_scale); + if (!ret) + pr_notice("scratch scale is %d%%\n", scratch_scale); + return ret; + } + + /* parse ll[KMG],mm[KMG],nn[KMG] */ + for (i = 0; i < ARRAY_SIZE(sizes); i++) { + char *endp = p; + + if (i > 0) { + if (*p != ',') + return -EINVAL; + p += 1; + } + + sizes[i] = memparse(p, &endp); + if (!sizes[i] || endp == p) + return -EINVAL; + p = endp; + } + + scratch_size_lowmem = sizes[0]; + scratch_size_global = sizes[1]; + scratch_size_pernode = sizes[2]; + scratch_scale = 0; + + pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n", + (u64)(scratch_size_lowmem >> 20), + (u64)(scratch_size_global >> 20), + (u64)(scratch_size_pernode >> 20)); + + return 0; +} +early_param("kho_scratch", kho_parse_scratch_size); + +static void __init scratch_size_update(void) +{ + phys_addr_t size; + + if (!scratch_scale) + return; + + size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT, + NUMA_NO_NODE); + size = size * scratch_scale / 100; + scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES); + + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + NUMA_NO_NODE); + size = size * scratch_scale / 100 - scratch_size_lowmem; + scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +static phys_addr_t __init scratch_size_node(int nid) +{ + phys_addr_t size; + + if (scratch_scale) { + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + nid); + size = size * scratch_scale / 100; + } else { + size = scratch_size_pernode; + } + + return round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +/** + * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec + * + * With KHO we can preserve arbitrary pages in the system. To ensure we still + * have a large contiguous region of memory when we search the physical address + * space for target memory, let's make sure we always have a large CMA region + * active. This CMA region will only be used for movable pages which are not a + * problem for us during KHO because we can just move them somewhere else. + */ +static void __init kho_reserve_scratch(void) +{ + phys_addr_t addr, size; + int nid, i = 0; + + if (!kho_enable) + return; + + scratch_size_update(); + + /* FIXME: deal with node hot-plug/remove */ + kho_scratch_cnt = num_online_nodes() + 2; + size = kho_scratch_cnt * sizeof(*kho_scratch); + kho_scratch = memblock_alloc(size, PAGE_SIZE); + if (!kho_scratch) + goto err_disable_kho; + + /* + * reserve scratch area in low memory for lowmem allocations in the + * next kernel + */ + size = scratch_size_lowmem; + addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, + ARCH_LOW_ADDRESS_LIMIT); + if (!addr) + goto err_free_scratch_desc; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + /* reserve large contiguous area for allocations without nid */ + size = scratch_size_global; + addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + for_each_online_node(nid) { + size = scratch_size_node(nid); + addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, + 0, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, true); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + } + + return; + +err_free_scratch_areas: + for (i--; i >= 0; i--) + memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size); +err_free_scratch_desc: + memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch)); +err_disable_kho: + kho_enable = false; +} + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +/** + * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. + * @ser: serialization control object passed by KHO notifiers. + * @name: name of the sub tree. + * @fdt: the sub tree blob. + * + * Creates a new child node named @name in KHO root FDT and records + * the physical address of @fdt. The pages of @fdt must also be preserved + * by KHO for the new kernel to retrieve it after kexec. + * + * A debugfs blob entry is also created at + * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. + * + * Return: 0 on success, error code on failure + */ +int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +{ + int err = 0; + u64 phys = (u64)virt_to_phys(fdt); + void *root = page_to_virt(ser->fdt); + + err |= fdt_begin_node(root, name); + err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); + err |= fdt_end_node(root); + + if (err) + return err; + + return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); +} +EXPORT_SYMBOL_GPL(kho_add_subtree); + +struct kho_out { + struct blocking_notifier_head chain_head; + + struct dentry *dir; + + struct mutex lock; /* protects KHO FDT finalization */ + + struct kho_serialization ser; + bool finalized; +}; + +static struct kho_out kho_out = { + .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), + .lock = __MUTEX_INITIALIZER(kho_out.lock), + .ser = { + .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), + .track = { + .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), + }, + }, + .finalized = false, +}; + +int register_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(register_kho_notifier); + +int unregister_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_kho_notifier); + +/** + * kho_preserve_folio - preserve a folio across kexec. + * @folio: folio to preserve. + * + * Instructs KHO to preserve the whole folio across kexec. The order + * will be preserved as well. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.ser.track; + + if (kho_out.finalized) + return -EBUSY; + + return __kho_preserve_order(track, pfn, order); +} +EXPORT_SYMBOL_GPL(kho_preserve_folio); + +/** + * kho_preserve_phys - preserve a physically contiguous range across kexec. + * @phys: physical address of the range. + * @size: size of the range. + * + * Instructs KHO to preserve the memory range from @phys to @phys + @size + * across kexec. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_phys(phys_addr_t phys, size_t size) +{ + unsigned long pfn = PHYS_PFN(phys); + unsigned long failed_pfn = 0; + const unsigned long start_pfn = pfn; + const unsigned long end_pfn = PHYS_PFN(phys + size); + int err = 0; + struct kho_mem_track *track = &kho_out.ser.track; + + if (kho_out.finalized) + return -EBUSY; + + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + err = __kho_preserve_order(track, pfn, order); + if (err) { + failed_pfn = pfn; + break; + } + + pfn += 1 << order; + } + + if (err) + __kho_unpreserve(track, start_pfn, failed_pfn); + + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_phys); + +/* Handling for debug/kho/out */ + +static struct dentry *debugfs_root; + +static int kho_out_update_debugfs_fdt(void) +{ + int err = 0; + struct fdt_debugfs *ff, *tmp; + + if (kho_out.finalized) { + err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, + "fdt", page_to_virt(kho_out.ser.fdt)); + } else { + list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + } + } + + return err; +} + +static int kho_abort(void) +{ + int err; + unsigned long order; + struct kho_mem_phys *physxa; + + xa_for_each(&kho_out.ser.track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + xa_for_each(&physxa->phys_bits, phys, bits) + kfree(bits); + + xa_destroy(&physxa->phys_bits); + kfree(physxa); + } + xa_destroy(&kho_out.ser.track.orders); + + if (kho_out.ser.preserved_mem_map) { + kho_mem_ser_free(kho_out.ser.preserved_mem_map); + kho_out.ser.preserved_mem_map = NULL; + } + + err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, + NULL); + err = notifier_to_errno(err); + + if (err) + pr_err("Failed to abort KHO finalization: %d\n", err); + + return err; +} + +static int kho_finalize(void) +{ + int err = 0; + u64 *preserved_mem_map; + void *fdt = page_to_virt(kho_out.ser.fdt); + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); + /** + * Reserve the preserved-memory-map property in the root FDT, so + * that all property definitions will precede subnodes created by + * KHO callers. + */ + err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, + sizeof(*preserved_mem_map), + (void **)&preserved_mem_map); + if (err) + goto abort; + + err = kho_preserve_folio(page_folio(kho_out.ser.fdt)); + if (err) + goto abort; + + err = blocking_notifier_call_chain(&kho_out.chain_head, + KEXEC_KHO_FINALIZE, &kho_out.ser); + err = notifier_to_errno(err); + if (err) + goto abort; + + err = kho_mem_serialize(&kho_out.ser); + if (err) + goto abort; + + *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); + + err |= fdt_end_node(fdt); + err |= fdt_finish(fdt); + +abort: + if (err) { + pr_err("Failed to convert KHO state tree: %d\n", err); + kho_abort(); + } + + return err; +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + mutex_lock(&kho_out.lock); + *val = kho_out.finalized; + mutex_unlock(&kho_out.lock); + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 _val) +{ + int ret = 0; + bool val = !!_val; + + mutex_lock(&kho_out.lock); + + if (val == kho_out.finalized) { + if (kho_out.finalized) + ret = -EEXIST; + else + ret = -ENOENT; + goto unlock; + } + + if (val) + ret = kho_finalize(); + else + ret = kho_abort(); + + if (ret) + goto unlock; + + kho_out.finalized = val; + ret = kho_out_update_debugfs_fdt(); + +unlock: + mutex_unlock(&kho_out.lock); + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +static __init int kho_out_debugfs_init(void) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &fops_kho_out_finalize); + if (IS_ERR(f)) + goto err_rmdir; + + kho_out.dir = dir; + kho_out.ser.sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +struct kho_in { + struct dentry *dir; + phys_addr_t fdt_phys; + phys_addr_t scratch_phys; + struct list_head fdt_list; +}; + +static struct kho_in kho_in = { + .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), +}; + +static const void *kho_get_fdt(void) +{ + return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; +} + +/** + * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. + * @name: the name of the sub FDT passed to kho_add_subtree(). + * @phys: if found, the physical address of the sub FDT is stored in @phys. + * + * Retrieve a preserved sub FDT named @name and store its physical + * address in @phys. + * + * Return: 0 on success, error code on failure + */ +int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +{ + const void *fdt = kho_get_fdt(); + const u64 *val; + int offset, len; + + if (!fdt) + return -ENOENT; + + if (!phys) + return -EINVAL; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) + return -ENOENT; + + val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len); + if (!val || len != sizeof(*val)) + return -EINVAL; + + *phys = (phys_addr_t)*val; + + return 0; +} +EXPORT_SYMBOL_GPL(kho_retrieve_subtree); + +/* Handling for debugfs/kho/in */ + +static __init int kho_in_debugfs_init(const void *fdt) +{ + struct dentry *sub_fdt_dir; + int err, child; + + kho_in.dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(kho_in.dir)) + return PTR_ERR(kho_in.dir); + + sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", + name, len); + continue; + } + err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, + err); + continue; + } + } + + return 0; + +err_rmdir: + debugfs_remove_recursive(kho_in.dir); + return err; +} + +static __init int kho_init(void) +{ + int err = 0; + const void *fdt = kho_get_fdt(); + + if (!kho_enable) + return 0; + + kho_out.ser.fdt = alloc_page(GFP_KERNEL); + if (!kho_out.ser.fdt) { + err = -ENOMEM; + goto err_free_scratch; + } + + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) { + err = -ENOENT; + goto err_free_fdt; + } + + err = kho_out_debugfs_init(); + if (err) + goto err_free_fdt; + + if (fdt) { + err = kho_in_debugfs_init(fdt); + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. + */ + if (err) + pr_err("failed exposing handover FDT in debugfs: %d\n", + err); + + return 0; + } + + for (int i = 0; i < kho_scratch_cnt; i++) { + unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr); + unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; + unsigned long pfn; + + for (pfn = base_pfn; pfn < base_pfn + count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); + } + + return 0; + +err_free_fdt: + put_page(kho_out.ser.fdt); + kho_out.ser.fdt = NULL; +err_free_scratch: + for (int i = 0; i < kho_scratch_cnt; i++) { + void *start = __va(kho_scratch[i].addr); + void *end = start + kho_scratch[i].size; + + free_reserved_area(start, end, -1, ""); + } + kho_enable = false; + return err; +} +late_initcall(kho_init); + +static void __init kho_release_scratch(void) +{ + phys_addr_t start, end; + u64 i; + + memmap_init_kho_scratch_pages(); + + /* + * Mark scratch mem as CMA before we return it. That way we + * ensure that no kernel allocations happen on it. That means + * we can reuse it as scratch memory again later. + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) { + ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start)); + ulong end_pfn = pageblock_align(PFN_UP(end)); + ulong pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) + set_pageblock_migratetype(pfn_to_page(pfn), + MIGRATE_CMA); + } +} + +void __init kho_memory_init(void) +{ + struct folio *folio; + + if (kho_in.scratch_phys) { + kho_scratch = phys_to_virt(kho_in.scratch_phys); + kho_release_scratch(); + + kho_mem_deserialize(kho_get_fdt()); + folio = kho_restore_folio(kho_in.fdt_phys); + if (!folio) + pr_warn("failed to restore folio for KHO fdt\n"); + } else { + kho_reserve_scratch(); + } +} + +void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, + phys_addr_t scratch_phys, u64 scratch_len) +{ + void *fdt = NULL; + struct kho_scratch *scratch = NULL; + int err = 0; + unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); + + /* Validate the input FDT */ + fdt = early_memremap(fdt_phys, fdt_len); + if (!fdt) { + pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); + err = -EFAULT; + goto out; + } + err = fdt_check_header(fdt); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", + fdt_phys, err); + err = -EINVAL; + goto out; + } + err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", + fdt_phys, KHO_FDT_COMPATIBLE, err); + err = -EINVAL; + goto out; + } + + scratch = early_memremap(scratch_phys, scratch_len); + if (!scratch) { + pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", + scratch_phys, scratch_len); + err = -EFAULT; + goto out; + } + + /* + * We pass a safe contiguous blocks of memory to use for early boot + * purporses from the previous kernel so that we can resize the + * memblock array as needed. + */ + for (int i = 0; i < scratch_cnt; i++) { + struct kho_scratch *area = &scratch[i]; + u64 size = area->size; + + memblock_add(area->addr, size); + err = memblock_mark_kho_scratch(area->addr, size); + if (WARN_ON(err)) { + pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d", + &area->addr, &size, err); + goto out; + } + pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); + } + + memblock_reserve(scratch_phys, scratch_len); + + /* + * Now that we have a viable region of scratch memory, let's tell + * the memblocks allocator to only use that for any allocations. + * That way we ensure that nothing scribbles over in use data while + * we initialize the page tables which we will need to ingest all + * memory reservations from the previous kernel. + */ + memblock_set_kho_scratch_only(); + + kho_in.fdt_phys = fdt_phys; + kho_in.scratch_phys = scratch_phys; + kho_scratch_cnt = scratch_cnt; + pr_info("found kexec handover data. Will skip init for some devices\n"); + +out: + if (fdt) + early_memunmap(fdt, fdt_len); + if (scratch) + early_memunmap(scratch, scratch_len); + if (err) + pr_warn("disabling KHO revival: %d\n", err); +} + +/* Helper functions for kexec_file_load */ + +int kho_fill_kimage(struct kimage *image) +{ + ssize_t scratch_size; + int err = 0; + struct kexec_buf scratch; + + if (!kho_enable) + return 0; + + image->kho.fdt = page_to_phys(kho_out.ser.fdt); + + scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; + scratch = (struct kexec_buf){ + .image = image, + .buffer = kho_scratch, + .bufsz = scratch_size, + .mem = KEXEC_BUF_MEM_UNKNOWN, + .memsz = scratch_size, + .buf_align = SZ_64K, /* Makes it easier to map */ + .buf_max = ULONG_MAX, + .top_down = true, + }; + err = kexec_add_buffer(&scratch); + if (err) + return err; + image->kho.scratch = &image->segment[image->nr_segments - 1]; + + return 0; +} + +static int kho_walk_scratch(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret = 0; + int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + struct resource res = { + .start = kho_scratch[i].addr, + .end = kho_scratch[i].addr + kho_scratch[i].size - 1, + }; + + /* Try to fit the kimage into our KHO scratch region */ + ret = func(&res, kbuf); + if (ret) + break; + } + + return ret; +} + +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret; + + if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH) + return 1; + + ret = kho_walk_scratch(kbuf, func); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index d35d9792402d..30a733a55a67 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -39,4 +39,20 @@ extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ + +struct kexec_buf; + +#ifdef CONFIG_KEXEC_HANDOVER +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)); +int kho_fill_kimage(struct kimage *image); +#else +static inline int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 1; +} + +static inline int kho_fill_kimage(struct kimage *image) { return 0; } +#endif /* CONFIG_KEXEC_HANDOVER */ #endif /* LINUX_KEXEC_INTERNAL_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 555e2b3a665a..61fa97da7989 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -29,6 +29,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#include <linux/hung_task.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> @@ -191,7 +192,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - WRITE_ONCE(current->blocker_mutex, lock); + hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); #endif debug_mutex_add_waiter(lock, waiter, current); @@ -209,7 +210,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) debug_mutex_remove_waiter(lock, waiter, current); #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - WRITE_ONCE(current->blocker_mutex, NULL); + hung_task_clear_blocker(); #endif } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index de9117c0e671..3ef032e22f7e 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -34,6 +34,7 @@ #include <linux/spinlock.h> #include <linux/ftrace.h> #include <trace/events/lock.h> +#include <linux/hung_task.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -41,6 +42,41 @@ static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ + WRITE_ONCE((sem)->last_holder, (unsigned long)current); +} + +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ + if (READ_ONCE((sem)->last_holder) == (unsigned long)current) + WRITE_ONCE((sem)->last_holder, 0UL); +} + +unsigned long sem_last_holder(struct semaphore *sem) +{ + return READ_ONCE(sem->last_holder); +} +#else +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ +} +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ +} +unsigned long sem_last_holder(struct semaphore *sem) +{ + return 0UL; +} +#endif + +static inline void __sem_acquire(struct semaphore *sem) +{ + sem->count--; + hung_task_sem_set_holder(sem); +} + /** * down - acquire the semaphore * @sem: the semaphore to be acquired @@ -59,7 +95,7 @@ void __sched down(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else __down(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -83,7 +119,7 @@ int __sched down_interruptible(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_interruptible(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -110,7 +146,7 @@ int __sched down_killable(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_killable(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -140,7 +176,7 @@ int __sched down_trylock(struct semaphore *sem) raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) - sem->count = count; + __sem_acquire(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); @@ -165,7 +201,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -187,6 +223,9 @@ void __sched up(struct semaphore *sem) DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); + + hung_task_sem_clear_if_holder(sem); + if (likely(list_empty(&sem->wait_list))) sem->count++; else @@ -228,8 +267,10 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); - if (waiter.up) + if (waiter.up) { + hung_task_sem_set_holder(sem); return 0; + } } timed_out: @@ -246,10 +287,14 @@ static inline int __sched __down_common(struct semaphore *sem, long state, { int ret; + hung_task_set_blocker(sem, BLOCKER_TYPE_SEM); + trace_contention_begin(sem, 0); ret = ___down_common(sem, state, timeout); trace_contention_end(sem, ret); + hung_task_clear_blocker(); + return ret; } diff --git a/kernel/panic.c b/kernel/panic.c index 047ea3215312..b0b9a8bf4560 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -307,12 +307,10 @@ static void panic_other_cpus_shutdown(bool crash_kexec) } /** - * panic - halt the system - * @fmt: The text string to print + * panic - halt the system + * @fmt: The text string to print * - * Display a message, then perform cleanups. - * - * This function never returns. + * Display a message, then perform cleanups. This function never returns. */ void panic(const char *fmt, ...) { diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4e6e24e8b854..2af36cfe35cd 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1094,16 +1094,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) ((unsigned long long) region->end_pfn << PAGE_SHIFT) - 1); - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - /* - * It is safe to ignore the result of - * mem_bm_set_bit_check() here, since we won't - * touch the PFNs for which the error is - * returned anyway. - */ - mem_bm_set_bit_check(bm, pfn); - } + for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } @@ -1255,21 +1254,20 @@ static void mark_free_pages(struct zone *zone) spin_lock_irqsave(&zone->lock, flags); max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); + for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) { + page = pfn_to_page(pfn); - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } - if (page_zone(page) != zone) - continue; + if (page_zone(page) != zone) + continue; - if (!swsusp_page_is_forbidden(page)) - swsusp_unset_page_free(page); - } + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } for_each_migratetype_order(order, t) { list_for_each_entry(page, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d5f89f9ef29f..75a84efad40f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -921,7 +921,6 @@ ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, unsigned long args[ARRAY_SIZE(info->entry.args)]; int i; - info->op = PTRACE_SYSCALL_INFO_ENTRY; info->entry.nr = syscall_get_nr(child, regs); syscall_get_arguments(child, regs, args); for (i = 0; i < ARRAY_SIZE(args); i++) @@ -943,10 +942,12 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, * diverge significantly enough. */ ptrace_get_syscall_info_entry(child, regs, info); - info->op = PTRACE_SYSCALL_INFO_SECCOMP; info->seccomp.ret_data = child->ptrace_message; - /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + /* + * ret_data is the last non-reserved field + * in struct ptrace_syscall_info.seccomp + */ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); } @@ -954,7 +955,6 @@ static unsigned long ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { - info->op = PTRACE_SYSCALL_INFO_EXIT; info->exit.rval = syscall_get_error(child, regs); info->exit.is_error = !!info->exit.rval; if (!info->exit.is_error) @@ -965,19 +965,8 @@ ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, } static int -ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, - void __user *datavp) +ptrace_get_syscall_info_op(struct task_struct *child) { - struct pt_regs *regs = task_pt_regs(child); - struct ptrace_syscall_info info = { - .op = PTRACE_SYSCALL_INFO_NONE, - .arch = syscall_get_arch(child), - .instruction_pointer = instruction_pointer(regs), - .stack_pointer = user_stack_pointer(regs), - }; - unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); - unsigned long write_size; - /* * This does not need lock_task_sighand() to access * child->last_siginfo because ptrace_freeze_traced() @@ -988,24 +977,160 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, case SIGTRAP | 0x80: switch (child->ptrace_message) { case PTRACE_EVENTMSG_SYSCALL_ENTRY: - actual_size = ptrace_get_syscall_info_entry(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_ENTRY; case PTRACE_EVENTMSG_SYSCALL_EXIT: - actual_size = ptrace_get_syscall_info_exit(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_EXIT; + default: + return PTRACE_SYSCALL_INFO_NONE; } - break; case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): - actual_size = ptrace_get_syscall_info_seccomp(child, regs, - &info); + return PTRACE_SYSCALL_INFO_SECCOMP; + default: + return PTRACE_SYSCALL_INFO_NONE; + } +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = ptrace_get_syscall_info_op(child), + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_SECCOMP: + actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info); break; } write_size = min(actual_size, user_size); return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; } + +static int +ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int nr = info->entry.nr; + int i; + + /* + * Check that the syscall number specified in info->entry.nr + * is either a value of type "int" or a sign-extended value + * of type "int". + */ + if (nr != info->entry.nr) + return -ERANGE; + + for (i = 0; i < ARRAY_SIZE(args); i++) { + args[i] = info->entry.args[i]; + /* + * Check that the syscall argument specified in + * info->entry.args[i] is either a value of type + * "unsigned long" or a sign-extended value of type "long". + */ + if (args[i] != info->entry.args[i]) + return -ERANGE; + } + + syscall_set_nr(child, regs, nr); + /* + * If the syscall number is set to -1, setting syscall arguments is not + * just pointless, it would also clobber the syscall return value on + * those architectures that share the same register both for the first + * argument of syscall and its return value. + */ + if (nr != -1) + syscall_set_arguments(child, regs, args); + + return 0; +} + +static int +ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * info->entry is currently a subset of info->seccomp, + * info->seccomp.ret_data is currently ignored. + */ + return ptrace_set_syscall_info_entry(child, regs, info); +} + +static int +ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + long rval = info->exit.rval; + + /* + * Check that the return value specified in info->exit.rval + * is either a value of type "long" or a sign-extended value + * of type "long". + */ + if (rval != info->exit.rval) + return -ERANGE; + + if (info->exit.is_error) + syscall_set_return_value(child, regs, rval, 0); + else + syscall_set_return_value(child, regs, 0, rval); + + return 0; +} + +static int +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size, + const void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info; + + if (user_size < sizeof(info)) + return -EINVAL; + + /* + * The compatibility is tracked by info.op and info.flags: if user-space + * does not instruct us to use unknown extra bits from future versions + * of ptrace_syscall_info, we are not going to read them either. + */ + if (copy_from_user(&info, datavp, sizeof(info))) + return -EFAULT; + + /* Reserved for future use. */ + if (info.flags || info.reserved) + return -EINVAL; + + /* Changing the type of the system call stop is not supported yet. */ + if (ptrace_get_syscall_info_op(child) != info.op) + return -EINVAL; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + return ptrace_set_syscall_info_entry(child, regs, &info); + case PTRACE_SYSCALL_INFO_EXIT: + return ptrace_set_syscall_info_exit(child, regs, &info); + case PTRACE_SYSCALL_INFO_SECCOMP: + return ptrace_set_syscall_info_seccomp(child, regs, &info); + default: + /* Other types of system call stops are not supported yet. */ + return -EINVAL; + } +} #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, @@ -1224,6 +1349,10 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GET_SYSCALL_INFO: ret = ptrace_get_syscall_info(child, addr, datavp); break; + + case PTRACE_SET_SYSCALL_INFO: + ret = ptrace_set_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/kernel/relay.c b/kernel/relay.c index 5ac7e711e4b6..c0c93a04d4ce 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -452,7 +452,7 @@ int relay_prepare_cpu(unsigned int cpu) /** * relay_open - create a new relay channel - * @base_filename: base name of files to create, %NULL for buffering only + * @base_filename: base name of files to create * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers @@ -465,10 +465,6 @@ int relay_prepare_cpu(unsigned int cpu) * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File * permissions will be %S_IRUSR. - * - * If opening a buffer (@parent = NULL) that you later wish to register - * in a filesystem, call relay_late_setup_files() once the @parent dentry - * is available. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -540,111 +536,6 @@ struct rchan_percpu_buf_dispatcher { struct dentry *dentry; }; -/* Called in atomic context. */ -static void __relay_set_buf_dentry(void *info) -{ - struct rchan_percpu_buf_dispatcher *p = info; - - relay_set_buf_dentry(p->buf, p->dentry); -} - -/** - * relay_late_setup_files - triggers file creation - * @chan: channel to operate on - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory - * - * Returns 0 if successful, non-zero otherwise. - * - * Use to setup files for a previously buffer-only channel created - * by relay_open() with a NULL parent dentry. - * - * For example, this is useful for perfomring early tracing in kernel, - * before VFS is up and then exposing the early results once the dentry - * is available. - */ -int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent) -{ - int err = 0; - unsigned int i, curr_cpu; - unsigned long flags; - struct dentry *dentry; - struct rchan_buf *buf; - struct rchan_percpu_buf_dispatcher disp; - - if (!chan || !base_filename) - return -EINVAL; - - strscpy(chan->base_filename, base_filename, NAME_MAX); - - mutex_lock(&relay_channels_mutex); - /* Is chan already set up? */ - if (unlikely(chan->has_base_filename)) { - mutex_unlock(&relay_channels_mutex); - return -EEXIST; - } - chan->has_base_filename = 1; - chan->parent = parent; - - if (chan->is_global) { - err = -EINVAL; - buf = *per_cpu_ptr(chan->buf, 0); - if (!WARN_ON_ONCE(!buf)) { - dentry = relay_create_buf_file(chan, buf, 0); - if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(buf, dentry); - err = 0; - } - } - mutex_unlock(&relay_channels_mutex); - return err; - } - - curr_cpu = get_cpu(); - /* - * The CPU hotplug notifier ran before us and created buffers with - * no files associated. So it's safe to call relay_setup_buf_file() - * on all currently online CPUs. - */ - for_each_online_cpu(i) { - buf = *per_cpu_ptr(chan->buf, i); - if (unlikely(!buf)) { - WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); - err = -EINVAL; - break; - } - - dentry = relay_create_buf_file(chan, buf, i); - if (unlikely(!dentry)) { - err = -EINVAL; - break; - } - - if (curr_cpu == i) { - local_irq_save(flags); - relay_set_buf_dentry(buf, dentry); - local_irq_restore(flags); - } else { - disp.buf = buf; - disp.dentry = dentry; - smp_mb(); - /* relay_channels_mutex must be held, so wait. */ - err = smp_call_function_single(i, - __relay_set_buf_dentry, - &disp, 1); - } - if (unlikely(err)) - break; - } - put_cpu(); - mutex_unlock(&relay_channels_mutex); - - return err; -} -EXPORT_SYMBOL_GPL(relay_late_setup_files); - /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 125912c0e9dd..b4326827e326 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3329,6 +3329,15 @@ static void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Memory is pinned to only one NUMA node via cpuset.mems, naturally + * no page can be migrated. + */ + if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) { + trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed); + return; + } + if (!mm->numa_next_scan) { mm->numa_next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6859008ca34d..e24509bd0af5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2226,7 +2226,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL; struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; struct page *page; @@ -2252,7 +2252,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) - goto fail_free_buffer; + return NULL; rb_check_bpage(cpu_buffer, bpage); @@ -2318,13 +2318,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_head_page_activate(cpu_buffer); } - return cpu_buffer; + return_ptr(cpu_buffer); fail_free_reader: free_buffer_page(cpu_buffer->reader_page); - fail_free_buffer: - kfree(cpu_buffer); return NULL; } @@ -2359,7 +2357,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, unsigned long scratch_size, struct lock_class_key *key) { - struct trace_buffer *buffer; + struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; int subbuf_size; int bsize; @@ -2373,7 +2371,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) - goto fail_free_buffer; + return NULL; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); @@ -2472,7 +2470,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); - return buffer; + return_ptr(buffer); fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { @@ -2484,8 +2482,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - fail_free_buffer: - kfree(buffer); return NULL; } @@ -2849,6 +2845,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; + /* + * Keep CPUs from coming online while resizing to synchronize + * with new per CPU buffers being created. + */ + guard(cpus_read_lock)(); + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); @@ -2893,7 +2895,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cond_resched(); } - cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2933,7 +2934,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2961,8 +2961,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - cpus_read_lock(); - /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); @@ -2981,7 +2979,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - cpus_read_unlock(); } out: @@ -4684,10 +4681,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); - if (rb_try_to_discard(cpu_buffer, event)) - goto out; - - out: + rb_try_to_discard(cpu_buffer, event); rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); @@ -6020,6 +6014,39 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +/* + * When the buffer is memory mapped to user space, each sub buffer + * has a unique id that is used by the meta data to tell the user + * where the current reader page is. + * + * For a normal allocated ring buffer, the id is saved in the buffer page + * id field, and updated via this function. + * + * But for a fixed memory mapped buffer, the id is already assigned for + * fixed memory ording in the memory layout and can not be used. Instead + * the index of where the page lies in the memory layout is used. + * + * For the normal pages, set the buffer page id with the passed in @id + * value and return that. + * + * For fixed memory mapped pages, get the page index in the memory layout + * and return that as the id. + */ +static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage, int id) +{ + /* + * For boot buffers, the id is the index, + * otherwise, set the buffer page with this id + */ + if (cpu_buffer->ring_meta) + id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page); + else + bpage->id = id; + + return id; +} + static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; @@ -6028,7 +6055,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) return; meta->reader.read = cpu_buffer->reader_page->read; - meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, + cpu_buffer->reader_page->id); + meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); @@ -6098,21 +6127,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + return; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); - - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -6300,37 +6324,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - goto out; + return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ - if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { - ret = -EBUSY; - goto out; - } + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) + return -EBUSY; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) - goto out; + return -EINVAL; if (buffer_a->subbuf_order != buffer_b->subbuf_order) - goto out; - - ret = -EAGAIN; + return -EINVAL; if (atomic_read(&buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&buffer_b->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_b->record_disabled)) - goto out; + return -EAGAIN; /* * We can't do a synchronize_rcu here because this @@ -6367,7 +6387,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); -out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -6526,38 +6545,37 @@ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; - unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -1; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) - goto out; + return -1; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) - goto out; + return -1; + if (data_page->order != buffer->subbuf_order) - goto out; + return -1; bpage = data_page->data; if (!bpage) - goto out; + return -1; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out_unlock; + return -1; event = rb_reader_event(cpu_buffer); @@ -6591,7 +6609,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) - goto out_unlock; + return -1; if (len > (commit - read)) len = (commit - read); @@ -6600,7 +6618,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, size = rb_event_ts_length(event); if (len < size) - goto out_unlock; + return -1; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -6658,7 +6676,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (reader->real_end) local_set(&bpage->commit, reader->real_end); } - ret = read; cpu_buffer->lost_events = 0; @@ -6685,11 +6702,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); - out_unlock: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - out: - return ret; + return read; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); @@ -6944,23 +6957,29 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; + int cnt = 0; int id = 0; - subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; - cpu_buffer->reader_page->id = id++; + id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); + subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; + cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { + id = rb_page_id(cpu_buffer, subbuf, id); + if (WARN_ON(id >= nr_subbufs)) break; subbuf_ids[id] = (unsigned long)subbuf->page; - subbuf->id = id; rb_inc_page(&subbuf); id++; + cnt++; } while (subbuf != first_subbuf); + WARN_ON(cnt != nr_subbufs); + /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; @@ -7052,7 +7071,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; - struct page **pages; + struct page **pages __free(kfree) = NULL; int p = 0, s = 0; int err; @@ -7120,10 +7139,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct page *page; int off = 0; - if (WARN_ON_ONCE(s >= nr_subbufs)) { - err = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(s >= nr_subbufs)) + return -EINVAL; page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); @@ -7138,9 +7155,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -out: - kfree(pages); - return err; } #else @@ -7156,36 +7170,34 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags, *subbuf_ids; - int err = 0; + int err; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); - mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) - goto unlock; + return err; /* subbuf_ids include the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); - err = -ENOMEM; - goto unlock; + return -ENOMEM; } atomic_inc(&cpu_buffer->resize_disabled); @@ -7213,35 +7225,29 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, atomic_dec(&cpu_buffer->resize_disabled); } -unlock: - mutex_unlock(&buffer->mutex); - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { - err = -ENODEV; - goto out; + return -ENODEV; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); - goto out; + return 0; } - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ @@ -7256,12 +7262,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); - mutex_unlock(&buffer->mutex); - -out: - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) @@ -7302,8 +7303,8 @@ consume: /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; - if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - if (missed_events) { + if (missed_events) { + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* @@ -7324,13 +7325,23 @@ consume: local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); + } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, + "Reader on commit with %ld missed events", + missed_events)) { + /* + * There shouldn't be any missed events if the tail_page + * is on the reader page. But if the tail page is not on the + * reader page and the commit_page is, that would mean that + * there's a commit_overrun (an interrupt preempted an + * addition of an event and then filled the buffer + * with new events). In this case it's not an + * error, but it should still be reported. + * + * TODO: Add missed events to the page for user space to know. + */ + pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", + cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } - } else { - /* - * There really shouldn't be any missed events if the commit - * is on the reader page. - */ - WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d70b0351fb61..95ae7c4e5835 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -51,6 +51,7 @@ #include <linux/workqueue.h> #include <linux/sort.h> #include <linux/io.h> /* vmap_page_range() */ +#include <linux/fs_context.h> #include <asm/setup.h> /* COMMAND_LINE_SIZE */ @@ -6711,6 +6712,22 @@ static int tracing_wait_pipe(struct file *filp) return 1; } +static bool update_last_data_if_empty(struct trace_array *tr) +{ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return false; + + if (!ring_buffer_empty(tr->array_buffer.buffer)) + return false; + + /* + * If the buffer contains the last boot data and all per-cpu + * buffers are empty, reset it from the kernel side. + */ + update_last_data(tr); + return true; +} + /* * Consumer reader. */ @@ -6742,6 +6759,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } waitagain: + if (update_last_data_if_empty(iter->tr)) + return 0; + sret = tracing_wait_pipe(filp); if (sret <= 0) return sret; @@ -8320,6 +8340,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter) && !iter->closed) { + if (update_last_data_if_empty(iter->tr)) + return 0; + if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; @@ -8661,10 +8684,6 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) return -ENODEV; - /* Currently the boot mapped buffer is not supported for mmap */ - if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) - return -ENODEV; - ret = get_snapshot_map(iter->tr); if (ret) return ret; @@ -10241,6 +10260,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; + struct fs_context *fc; + int ret; /* * To maintain backward compatibility for tools that mount @@ -10250,12 +10271,20 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_submount(mntpt, type, "tracefs", NULL); + + fc = fs_context_for_submount(type, mntpt); put_filesystem(type); - if (IS_ERR(mnt)) - return NULL; - mntget(mnt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ret = vfs_parse_fs_string(fc, "source", + "tracefs", strlen("tracefs")); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + put_fs_context(fc); return mnt; } diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 1fec61603ef3..e066d31d08f8 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -210,6 +210,10 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); +#ifdef CONFIG_UNACCEPTED_MEMORY +#define PAGE_UNACCEPTED_MAPCOUNT_VALUE (PGTY_unaccepted << 24) + VMCOREINFO_NUMBER(PAGE_UNACCEPTED_MAPCOUNT_VALUE); +#endif #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9fa2af9dbf2c..80b56c002c7f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_thresh_next; static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; @@ -63,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; */ unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); + +#ifdef CONFIG_SYSFS + +static unsigned int hardlockup_count; + +static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", hardlockup_count); +} + +static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count); + +static __init int kernel_hardlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_hardlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -169,6 +193,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) unsigned int this_cpu = smp_processor_id(); unsigned long flags; +#ifdef CONFIG_SYSFS + ++hardlockup_count; +#endif + /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; @@ -311,6 +339,28 @@ unsigned int __read_mostly softlockup_panic = static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; +#ifdef CONFIG_SYSFS + +static unsigned int softlockup_count; + +static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", softlockup_count); +} + +static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count); + +static __init int kernel_softlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_softlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); /* Timestamp of the last softlockup report. */ @@ -742,6 +792,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) touch_ts = __this_cpu_read(watchdog_touch_ts); duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { +#ifdef CONFIG_SYSFS + ++softlockup_count; +#endif + /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. @@ -870,12 +924,20 @@ int lockup_detector_offline_cpu(unsigned int cpu) return 0; } -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); softlockup_stop_all(); + /* + * To prevent watchdog_timer_fn from using the old interval and + * the new watchdog_thresh at the same time, which could lead to + * false softlockup reports, it is necessary to update the + * watchdog_thresh after the softlockup is completed. + */ + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) @@ -888,7 +950,7 @@ static void __lockup_detector_reconfigure(void) void lockup_detector_reconfigure(void) { mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); mutex_unlock(&watchdog_mutex); } @@ -908,27 +970,29 @@ static __init void lockup_detector_setup(void) return; mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); lockup_detector_update_enable(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } static inline void lockup_detector_setup(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -946,11 +1010,11 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL /* Propagate any changes to the watchdog infrastructure */ -static void proc_watchdog_update(void) +static void proc_watchdog_update(bool thresh_changed) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(thresh_changed); } /* @@ -984,7 +1048,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr } else { err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) - proc_watchdog_update(); + proc_watchdog_update(false); } mutex_unlock(&watchdog_mutex); return err; @@ -1035,11 +1099,13 @@ static int proc_watchdog_thresh(const struct ctl_table *table, int write, mutex_lock(&watchdog_mutex); - old = READ_ONCE(watchdog_thresh); + watchdog_thresh_next = READ_ONCE(watchdog_thresh); + + old = watchdog_thresh_next; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!err && write && old != READ_ONCE(watchdog_thresh)) - proc_watchdog_update(); + if (!err && write && old != READ_ONCE(watchdog_thresh_next)) + proc_watchdog_update(true); mutex_unlock(&watchdog_mutex); return err; @@ -1060,7 +1126,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) - proc_watchdog_update(); + proc_watchdog_update(false); mutex_unlock(&watchdog_mutex); return err; @@ -1080,7 +1146,7 @@ static const struct ctl_table watchdog_sysctls[] = { }, { .procname = "watchdog_thresh", - .data = &watchdog_thresh, + .data = &watchdog_thresh_next, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, |