summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec34
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/bpf/syscall.c2
-rw-r--r--kernel/cgroup/cpuset.c40
-rw-r--r--kernel/crash_dump_dm_crypt.c464
-rw-r--r--kernel/crash_reserve.c2
-rw-r--r--kernel/delayacct.c51
-rw-r--r--kernel/events/uprobes.c357
-rw-r--r--kernel/exit.c68
-rw-r--r--kernel/fork.c277
-rw-r--r--kernel/gcov/gcc_4_7.c4
-rw-r--r--kernel/hung_task.c55
-rw-r--r--kernel/kexec_file.c94
-rw-r--r--kernel/kexec_handover.c1266
-rw-r--r--kernel/kexec_internal.h16
-rw-r--r--kernel/locking/mutex.c5
-rw-r--r--kernel/locking/semaphore.c57
-rw-r--r--kernel/panic.c8
-rw-r--r--kernel/power/snapshot.c42
-rw-r--r--kernel/ptrace.c179
-rw-r--r--kernel/relay.c111
-rw-r--r--kernel/sched/fair.c9
-rw-r--r--kernel/trace/ring_buffer.c213
-rw-r--r--kernel/trace/trace.c45
-rw-r--r--kernel/vmcore_info.c4
-rw-r--r--kernel/watchdog.c94
26 files changed, 2603 insertions, 896 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 4d111f871951..e64ce21f9a80 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -38,8 +38,7 @@ config KEXEC
config KEXEC_FILE
bool "Enable kexec file based system call"
depends on ARCH_SUPPORTS_KEXEC_FILE
- select CRYPTO
- select CRYPTO_SHA256
+ select CRYPTO_LIB_SHA256
select KEXEC_CORE
help
This is new version of kexec system call. This system call is
@@ -95,6 +94,20 @@ config KEXEC_JUMP
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
+config KEXEC_HANDOVER
+ bool "kexec handover"
+ depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ select MEMBLOCK_KHO_SCRATCH
+ select KEXEC_FILE
+ select DEBUG_FS
+ select LIBFDT
+ select CMA
+ help
+ Allow kexec to hand over state across kernels by generating and
+ passing additional metadata to the target kernel. This is useful
+ to keep data or state alive across the kexec. For this to work,
+ both source and target kernels need to have this option enabled.
+
config CRASH_DUMP
bool "kernel crash dumps"
default ARCH_DEFAULT_CRASH_DUMP
@@ -116,6 +129,23 @@ config CRASH_DUMP
For s390, this option also enables zfcpdump.
See also <file:Documentation/arch/s390/zfcpdump.rst>
+config CRASH_DM_CRYPT
+ bool "Support saving crash dump to dm-crypt encrypted volume"
+ depends on KEXEC_FILE
+ depends on CRASH_DUMP
+ depends on DM_CRYPT
+ help
+ With this option enabled, user space can intereact with
+ /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys
+ persistent for the dump-capture kernel.
+
+config CRASH_DM_CRYPT_CONFIGS
+ def_tristate CRASH_DM_CRYPT
+ select CONFIGFS_FS
+ help
+ CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that
+ is required to be built-in.
+
config CRASH_HOTPLUG
bool "Update the crash elfcorehdr on system configuration changes"
default y
diff --git a/kernel/Makefile b/kernel/Makefile
index 434929de17ef..32e80dd626af 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -77,9 +77,11 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o
obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_CRASH_DUMP) += crash_core.o
+obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
+obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4b5f29168618..dd5304c6ac3c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -579,7 +579,7 @@ static bool can_alloc_pages(void)
static struct page *__bpf_alloc_page(int nid)
{
if (!can_alloc_pages())
- return try_alloc_pages(nid, 0);
+ return alloc_pages_nolock(nid, 0);
return alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6d3ac19cc2ac..3bc4301466f3 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4202,7 +4202,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}
/*
- * cpuset_node_allowed - Can we allocate on a memory node?
+ * cpuset_current_node_allowed - Can current task allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -4241,7 +4241,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-bool cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
bool allowed; /* is allocation in zone z allowed? */
@@ -4275,6 +4275,42 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
return allowed;
}
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+ bool allowed;
+
+ /*
+ * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
+ * and mems_allowed is likely to be empty even if we could get to it,
+ * so return true to avoid taking a global lock on the empty check.
+ */
+ if (!cpuset_v2())
+ return true;
+
+ css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+ if (!css)
+ return true;
+
+ /*
+ * Normally, accessing effective_mems would require the cpuset_mutex
+ * or callback_lock - but node_isset is atomic and the reference
+ * taken via cgroup_get_e_css is sufficient to protect css.
+ *
+ * Since this interface is intended for use by migration paths, we
+ * relax locking here to avoid taking global locks - while accepting
+ * there may be rare scenarios where the result may be innaccurate.
+ *
+ * Reclaim and migration are subject to these same race conditions, and
+ * cannot make strong isolation guarantees, so this is acceptable.
+ */
+ cs = container_of(css, struct cpuset, css);
+ allowed = node_isset(nid, cs->effective_mems);
+ css_put(css);
+ return allowed;
+}
+
/**
* cpuset_spread_node() - On which node to begin search for a page
* @rotor: round robin rotor
diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c
new file mode 100644
index 000000000000..401423ba477d
--- /dev/null
+++ b/kernel/crash_dump_dm_crypt.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <keys/user-type.h>
+#include <linux/crash_dump.h>
+#include <linux/cc_platform.h>
+#include <linux/configfs.h>
+#include <linux/module.h>
+
+#define KEY_NUM_MAX 128 /* maximum dm crypt keys */
+#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */
+#define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */
+
+static unsigned int key_count;
+
+struct dm_crypt_key {
+ unsigned int key_size;
+ char key_desc[KEY_DESC_MAX_LEN];
+ u8 data[KEY_SIZE_MAX];
+};
+
+static struct keys_header {
+ unsigned int total_keys;
+ struct dm_crypt_key keys[] __counted_by(total_keys);
+} *keys_header;
+
+static size_t get_keys_header_size(size_t total_keys)
+{
+ return struct_size(keys_header, keys, total_keys);
+}
+
+unsigned long long dm_crypt_keys_addr;
+EXPORT_SYMBOL_GPL(dm_crypt_keys_addr);
+
+static int __init setup_dmcryptkeys(char *arg)
+{
+ char *end;
+
+ if (!arg)
+ return -EINVAL;
+ dm_crypt_keys_addr = memparse(arg, &end);
+ if (end > arg)
+ return 0;
+
+ dm_crypt_keys_addr = 0;
+ return -EINVAL;
+}
+
+early_param("dmcryptkeys", setup_dmcryptkeys);
+
+/*
+ * Architectures may override this function to read dm crypt keys
+ */
+ssize_t __weak dm_crypt_keys_read(char *buf, size_t count, u64 *ppos)
+{
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+}
+
+static int add_key_to_keyring(struct dm_crypt_key *dm_key,
+ key_ref_t keyring_ref)
+{
+ key_ref_t key_ref;
+ int r;
+
+ /* create or update the requested key and add it to the target keyring */
+ key_ref = key_create_or_update(keyring_ref, "user", dm_key->key_desc,
+ dm_key->data, dm_key->key_size,
+ KEY_USR_ALL, KEY_ALLOC_IN_QUOTA);
+
+ if (!IS_ERR(key_ref)) {
+ r = key_ref_to_ptr(key_ref)->serial;
+ key_ref_put(key_ref);
+ kexec_dprintk("Success adding key %s", dm_key->key_desc);
+ } else {
+ r = PTR_ERR(key_ref);
+ kexec_dprintk("Error when adding key");
+ }
+
+ key_ref_put(keyring_ref);
+ return r;
+}
+
+static void get_keys_from_kdump_reserved_memory(void)
+{
+ struct keys_header *keys_header_loaded;
+
+ arch_kexec_unprotect_crashkres();
+
+ keys_header_loaded = kmap_local_page(pfn_to_page(
+ kexec_crash_image->dm_crypt_keys_addr >> PAGE_SHIFT));
+
+ memcpy(keys_header, keys_header_loaded, get_keys_header_size(key_count));
+ kunmap_local(keys_header_loaded);
+ arch_kexec_protect_crashkres();
+}
+
+static int restore_dm_crypt_keys_to_thread_keyring(void)
+{
+ struct dm_crypt_key *key;
+ size_t keys_header_size;
+ key_ref_t keyring_ref;
+ u64 addr;
+
+ /* find the target keyring (which must be writable) */
+ keyring_ref =
+ lookup_user_key(KEY_SPEC_USER_KEYRING, 0x01, KEY_NEED_WRITE);
+ if (IS_ERR(keyring_ref)) {
+ kexec_dprintk("Failed to get the user keyring\n");
+ return PTR_ERR(keyring_ref);
+ }
+
+ addr = dm_crypt_keys_addr;
+ dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr);
+ if (key_count < 0 || key_count > KEY_NUM_MAX) {
+ kexec_dprintk("Failed to read the number of dm-crypt keys\n");
+ return -1;
+ }
+
+ kexec_dprintk("There are %u keys\n", key_count);
+ addr = dm_crypt_keys_addr;
+
+ keys_header_size = get_keys_header_size(key_count);
+ keys_header = kzalloc(keys_header_size, GFP_KERNEL);
+ if (!keys_header)
+ return -ENOMEM;
+
+ dm_crypt_keys_read((char *)keys_header, keys_header_size, &addr);
+
+ for (int i = 0; i < keys_header->total_keys; i++) {
+ key = &keys_header->keys[i];
+ kexec_dprintk("Get key (size=%u)\n", key->key_size);
+ add_key_to_keyring(key, keyring_ref);
+ }
+
+ return 0;
+}
+
+static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
+{
+ const struct user_key_payload *ukp;
+ struct key *key;
+
+ kexec_dprintk("Requesting logon key %s", dm_key->key_desc);
+ key = request_key(&key_type_logon, dm_key->key_desc, NULL);
+
+ if (IS_ERR(key)) {
+ pr_warn("No such logon key %s\n", dm_key->key_desc);
+ return PTR_ERR(key);
+ }
+
+ ukp = user_key_payload_locked(key);
+ if (!ukp)
+ return -EKEYREVOKED;
+
+ if (ukp->datalen > KEY_SIZE_MAX) {
+ pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX);
+ return -EINVAL;
+ }
+
+ memcpy(dm_key->data, ukp->data, ukp->datalen);
+ dm_key->key_size = ukp->datalen;
+ kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size,
+ dm_key->key_desc, dm_key->data);
+ return 0;
+}
+
+struct config_key {
+ struct config_item item;
+ const char *description;
+};
+
+static inline struct config_key *to_config_key(struct config_item *item)
+{
+ return container_of(item, struct config_key, item);
+}
+
+static ssize_t config_key_description_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%s\n", to_config_key(item)->description);
+}
+
+static ssize_t config_key_description_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct config_key *config_key = to_config_key(item);
+ size_t len;
+ int ret;
+
+ ret = -EINVAL;
+ len = strcspn(page, "\n");
+
+ if (len > KEY_DESC_MAX_LEN) {
+ pr_err("The key description shouldn't exceed %u characters", KEY_DESC_MAX_LEN);
+ return ret;
+ }
+
+ if (!len)
+ return ret;
+
+ kfree(config_key->description);
+ ret = -ENOMEM;
+ config_key->description = kmemdup_nul(page, len, GFP_KERNEL);
+ if (!config_key->description)
+ return ret;
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_key_, description);
+
+static struct configfs_attribute *config_key_attrs[] = {
+ &config_key_attr_description,
+ NULL,
+};
+
+static void config_key_release(struct config_item *item)
+{
+ kfree(to_config_key(item));
+ key_count--;
+}
+
+static struct configfs_item_operations config_key_item_ops = {
+ .release = config_key_release,
+};
+
+static const struct config_item_type config_key_type = {
+ .ct_item_ops = &config_key_item_ops,
+ .ct_attrs = config_key_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item *config_keys_make_item(struct config_group *group,
+ const char *name)
+{
+ struct config_key *config_key;
+
+ if (key_count > KEY_NUM_MAX) {
+ pr_err("Only %u keys at maximum to be created\n", KEY_NUM_MAX);
+ return ERR_PTR(-EINVAL);
+ }
+
+ config_key = kzalloc(sizeof(struct config_key), GFP_KERNEL);
+ if (!config_key)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&config_key->item, name, &config_key_type);
+
+ key_count++;
+
+ return &config_key->item;
+}
+
+static ssize_t config_keys_count_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", key_count);
+}
+
+CONFIGFS_ATTR_RO(config_keys_, count);
+
+static bool is_dm_key_reused;
+
+static ssize_t config_keys_reuse_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", is_dm_key_reused);
+}
+
+static ssize_t config_keys_reuse_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ if (!kexec_crash_image || !kexec_crash_image->dm_crypt_keys_addr) {
+ kexec_dprintk(
+ "dm-crypt keys haven't be saved to crash-reserved memory\n");
+ return -EINVAL;
+ }
+
+ if (kstrtobool(page, &is_dm_key_reused))
+ return -EINVAL;
+
+ if (is_dm_key_reused)
+ get_keys_from_kdump_reserved_memory();
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_keys_, reuse);
+
+static struct configfs_attribute *config_keys_attrs[] = {
+ &config_keys_attr_count,
+ &config_keys_attr_reuse,
+ NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations config_keys_group_ops = {
+ .make_item = config_keys_make_item,
+};
+
+static const struct config_item_type config_keys_type = {
+ .ct_group_ops = &config_keys_group_ops,
+ .ct_attrs = config_keys_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static bool restore;
+
+static ssize_t config_keys_restore_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", restore);
+}
+
+static ssize_t config_keys_restore_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ if (!restore)
+ restore_dm_crypt_keys_to_thread_keyring();
+
+ if (kstrtobool(page, &restore))
+ return -EINVAL;
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_keys_, restore);
+
+static struct configfs_attribute *kdump_config_keys_attrs[] = {
+ &config_keys_attr_restore,
+ NULL,
+};
+
+static const struct config_item_type kdump_config_keys_type = {
+ .ct_attrs = kdump_config_keys_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem config_keys_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "crash_dm_crypt_keys",
+ .ci_type = &config_keys_type,
+ },
+ },
+};
+
+static int build_keys_header(void)
+{
+ struct config_item *item = NULL;
+ struct config_key *key;
+ int i, r;
+
+ if (keys_header != NULL)
+ kvfree(keys_header);
+
+ keys_header = kzalloc(get_keys_header_size(key_count), GFP_KERNEL);
+ if (!keys_header)
+ return -ENOMEM;
+
+ keys_header->total_keys = key_count;
+
+ i = 0;
+ list_for_each_entry(item, &config_keys_subsys.su_group.cg_children,
+ ci_entry) {
+ if (item->ci_type != &config_key_type)
+ continue;
+
+ key = to_config_key(item);
+
+ if (!key->description) {
+ pr_warn("No key description for key %s\n", item->ci_name);
+ return -EINVAL;
+ }
+
+ strscpy(keys_header->keys[i].key_desc, key->description,
+ KEY_DESC_MAX_LEN);
+ r = read_key_from_user_keying(&keys_header->keys[i]);
+ if (r != 0) {
+ kexec_dprintk("Failed to read key %s\n",
+ keys_header->keys[i].key_desc);
+ return r;
+ }
+ i++;
+ kexec_dprintk("Found key: %s\n", item->ci_name);
+ }
+
+ return 0;
+}
+
+int crash_load_dm_crypt_keys(struct kimage *image)
+{
+ struct kexec_buf kbuf = {
+ .image = image,
+ .buf_min = 0,
+ .buf_max = ULONG_MAX,
+ .top_down = false,
+ .random = true,
+ };
+ int r;
+
+
+ if (key_count <= 0) {
+ kexec_dprintk("No dm-crypt keys\n");
+ return -ENOENT;
+ }
+
+ if (!is_dm_key_reused) {
+ image->dm_crypt_keys_addr = 0;
+ r = build_keys_header();
+ if (r)
+ return r;
+ }
+
+ kbuf.buffer = keys_header;
+ kbuf.bufsz = get_keys_header_size(key_count);
+
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ r = kexec_add_buffer(&kbuf);
+ if (r) {
+ kvfree((void *)kbuf.buffer);
+ return r;
+ }
+ image->dm_crypt_keys_addr = kbuf.mem;
+ image->dm_crypt_keys_sz = kbuf.bufsz;
+ kexec_dprintk(
+ "Loaded dm crypt keys to kexec_buffer bufsz=0x%lx memsz=0x%lx\n",
+ kbuf.bufsz, kbuf.memsz);
+
+ return r;
+}
+
+static int __init configfs_dmcrypt_keys_init(void)
+{
+ int ret;
+
+ if (is_kdump_kernel()) {
+ config_keys_subsys.su_group.cg_item.ci_type =
+ &kdump_config_keys_type;
+ }
+
+ config_group_init(&config_keys_subsys.su_group);
+ mutex_init(&config_keys_subsys.su_mutex);
+ ret = configfs_register_subsystem(&config_keys_subsys);
+ if (ret) {
+ pr_err("Error %d while registering subsystem %s\n", ret,
+ config_keys_subsys.su_group.cg_item.ci_namebuf);
+ goto out_unregister;
+ }
+
+ return 0;
+
+out_unregister:
+ configfs_unregister_subsystem(&config_keys_subsys);
+
+ return ret;
+}
+
+module_init(configfs_dmcrypt_keys_init);
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index aff7c0fdbefa..acb6bf42e30d 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -131,7 +131,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
cur++;
*crash_base = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warn("crahskernel: Memory value expected after '@'\n");
+ pr_warn("crashkernel: Memory value expected after '@'\n");
return -EINVAL;
}
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index eb63a021ac04..30e7912ebb0d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -14,6 +14,15 @@
#include <linux/delayacct.h>
#include <linux/module.h>
+#define UPDATE_DELAY(type) \
+do { \
+ d->type##_delay_max = tsk->delays->type##_delay_max; \
+ d->type##_delay_min = tsk->delays->type##_delay_min; \
+ tmp = d->type##_delay_total + tsk->delays->type##_delay; \
+ d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
+ d->type##_count += tsk->delays->type##_count; \
+} while (0)
+
DEFINE_STATIC_KEY_FALSE(delayacct_key);
int delayacct_on __read_mostly; /* Delay accounting turned on/off */
struct kmem_cache *delayacct_cache;
@@ -173,41 +182,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
- d->blkio_delay_max = tsk->delays->blkio_delay_max;
- d->blkio_delay_min = tsk->delays->blkio_delay_min;
- tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
- d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
- d->swapin_delay_max = tsk->delays->swapin_delay_max;
- d->swapin_delay_min = tsk->delays->swapin_delay_min;
- tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
- d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
- d->freepages_delay_max = tsk->delays->freepages_delay_max;
- d->freepages_delay_min = tsk->delays->freepages_delay_min;
- tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
- d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
- d->thrashing_delay_max = tsk->delays->thrashing_delay_max;
- d->thrashing_delay_min = tsk->delays->thrashing_delay_min;
- tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
- d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
- d->compact_delay_max = tsk->delays->compact_delay_max;
- d->compact_delay_min = tsk->delays->compact_delay_min;
- tmp = d->compact_delay_total + tsk->delays->compact_delay;
- d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
- d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max;
- d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min;
- tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
- d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
- d->irq_delay_max = tsk->delays->irq_delay_max;
- d->irq_delay_min = tsk->delays->irq_delay_min;
- tmp = d->irq_delay_total + tsk->delays->irq_delay;
- d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp;
- d->blkio_count += tsk->delays->blkio_count;
- d->swapin_count += tsk->delays->swapin_count;
- d->freepages_count += tsk->delays->freepages_count;
- d->thrashing_count += tsk->delays->thrashing_count;
- d->compact_count += tsk->delays->compact_count;
- d->wpcopy_count += tsk->delays->wpcopy_count;
- d->irq_count += tsk->delays->irq_count;
+ UPDATE_DELAY(blkio);
+ UPDATE_DELAY(swapin);
+ UPDATE_DELAY(freepages);
+ UPDATE_DELAY(thrashing);
+ UPDATE_DELAY(compact);
+ UPDATE_DELAY(wpcopy);
+ UPDATE_DELAY(irq);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8d783b5882b6..4c965ba77f9f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -29,6 +29,7 @@
#include <linux/workqueue.h>
#include <linux/srcu.h>
#include <linux/oom.h> /* check_stable_address_space */
+#include <linux/pagewalk.h>
#include <linux/uprobes.h>
@@ -152,91 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
}
/**
- * __replace_page - replace page in vma by new page.
- * based on replace_page in mm/ksm.c
- *
- * @vma: vma that holds the pte pointing to page
- * @addr: address the old @page is mapped at
- * @old_page: the page we are replacing by new_page
- * @new_page: the modified page we replace page by
- *
- * If @new_page is NULL, only unmap @old_page.
- *
- * Returns 0 on success, negative error code otherwise.
- */
-static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *old_page, struct page *new_page)
-{
- struct folio *old_folio = page_folio(old_page);
- struct folio *new_folio;
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
- int err;
- struct mmu_notifier_range range;
- pte_t pte;
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
- addr + PAGE_SIZE);
-
- if (new_page) {
- new_folio = page_folio(new_page);
- err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
- if (err)
- return err;
- }
-
- /* For folio_free_swap() below */
- folio_lock(old_folio);
-
- mmu_notifier_invalidate_range_start(&range);
- err = -EAGAIN;
- if (!page_vma_mapped_walk(&pvmw))
- goto unlock;
- VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- pte = ptep_get(pvmw.pte);
-
- /*
- * Handle PFN swap PTES, such as device-exclusive ones, that actually
- * map pages: simply trigger GUP again to fix it up.
- */
- if (unlikely(!pte_present(pte))) {
- page_vma_mapped_walk_done(&pvmw);
- goto unlock;
- }
-
- if (new_page) {
- folio_get(new_folio);
- folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
- folio_add_lru_vma(new_folio, vma);
- } else
- /* no new page, just dec_mm_counter for old_page */
- dec_mm_counter(mm, MM_ANONPAGES);
-
- if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(old_folio));
- inc_mm_counter(mm, MM_ANONPAGES);
- }
-
- flush_cache_page(vma, addr, pte_pfn(pte));
- ptep_clear_flush(vma, addr, pvmw.pte);
- if (new_page)
- set_pte_at(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
-
- folio_remove_rmap_pte(old_folio, old_page, vma);
- if (!folio_mapped(old_folio))
- folio_free_swap(old_folio);
- page_vma_mapped_walk_done(&pvmw);
- folio_put(old_folio);
-
- err = 0;
- unlock:
- mmu_notifier_invalidate_range_end(&range);
- folio_unlock(old_folio);
- return err;
-}
-
-/**
* is_swbp_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_swbp_insn
@@ -463,6 +379,95 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
+static bool orig_page_is_identical(struct vm_area_struct *vma,
+ unsigned long vaddr, struct page *page, bool *pmd_mappable)
+{
+ const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
+ struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping,
+ index);
+ struct page *orig_page;
+ bool identical;
+
+ if (IS_ERR(orig_folio))
+ return false;
+ orig_page = folio_file_page(orig_folio, index);
+
+ *pmd_mappable = folio_test_pmd_mappable(orig_folio);
+ identical = folio_test_uptodate(orig_folio) &&
+ pages_identical(page, orig_page);
+ folio_put(orig_folio);
+ return identical;
+}
+
+static int __uprobe_write_opcode(struct vm_area_struct *vma,
+ struct folio_walk *fw, struct folio *folio,
+ unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+{
+ const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ const bool is_register = !!is_swbp_insn(&opcode);
+ bool pmd_mappable;
+
+ /* For now, we'll only handle PTE-mapped folios. */
+ if (fw->level != FW_LEVEL_PTE)
+ return -EFAULT;
+
+ /*
+ * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
+ * but the VMA might not be writable.
+ */
+ if (!pte_write(fw->pte)) {
+ if (!PageAnonExclusive(fw->page))
+ return -EFAULT;
+ if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
+ return -EFAULT;
+ /* SOFTDIRTY is handled via pte_mkdirty() below. */
+ }
+
+ /*
+ * We'll temporarily unmap the page and flush the TLB, such that we can
+ * modify the page atomically.
+ */
+ flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
+ fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
+ copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ /*
+ * When unregistering, we may only zap a PTE if uffd is disabled and
+ * there are no unexpected folio references ...
+ */
+ if (is_register || userfaultfd_missing(vma) ||
+ (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
+ folio_test_swapcache(folio) * folio_nr_pages(folio)))
+ goto remap;
+
+ /*
+ * ... and the mapped page is identical to the original page that
+ * would get faulted in on next access.
+ */
+ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
+ goto remap;
+
+ dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ folio_remove_rmap_pte(folio, fw->page, vma);
+ if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+
+ return pmd_mappable;
+remap:
+ /*
+ * Make sure that our copy_to_page() changes become visible before the
+ * set_pte_at() write.
+ */
+ smp_wmb();
+ /* We modified the page. Make sure to mark the PTE dirty. */
+ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
+ return 0;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
@@ -474,146 +479,146 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
*
* uprobe_write_opcode - write the opcode at a given virtual address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
- * @vaddr: the virtual address to store the opcode.
- * @opcode: opcode to be written at @vaddr.
+ * @vma: the probed virtual memory area.
+ * @opcode_vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @opcode_vaddr.
*
* Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
-int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
- unsigned long vaddr, uprobe_opcode_t opcode)
+int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
{
+ const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
- struct page *old_page, *new_page;
- struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
- bool orig_page_huge = false;
unsigned int gup_flags = FOLL_FORCE;
+ struct mmu_notifier_range range;
+ struct folio_walk fw;
+ struct folio *folio;
+ struct page *page;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
-retry:
+ if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
+ return -EINVAL;
+
+ /*
+ * When registering, we have to break COW to get an exclusive anonymous
+ * page that we can safely modify. Use FOLL_WRITE to trigger a write
+ * fault if required. When unregistering, we might be lucky and the
+ * anon page is already gone. So defer write faults until really
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
+ * cannot deal with PMDs yet.
+ */
if (is_register)
- gup_flags |= FOLL_SPLIT_PMD;
- /* Read the page with vaddr into memory */
- old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
- if (IS_ERR(old_page))
- return PTR_ERR(old_page);
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
- ret = verify_opcode(old_page, vaddr, &opcode);
+retry:
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
if (ret <= 0)
- goto put_old;
-
- if (is_zero_page(old_page)) {
- ret = -EINVAL;
- goto put_old;
- }
+ goto out;
+ folio = page_folio(page);
- if (WARN(!is_register && PageCompound(old_page),
- "uprobe unregister should never work on compound page\n")) {
- ret = -EINVAL;
- goto put_old;
+ ret = verify_opcode(page, opcode_vaddr, &opcode);
+ if (ret <= 0) {
+ folio_put(folio);
+ goto out;
}
/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
- if (ret)
- goto put_old;
+ if (ret) {
+ folio_put(folio);
+ goto out;
+ }
ref_ctr_updated = 1;
}
ret = 0;
- if (!is_register && !PageAnon(old_page))
- goto put_old;
-
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_old;
-
- ret = -ENOMEM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
- if (!new_page)
- goto put_old;
-
- __SetPageUptodate(new_page);
- copy_highpage(new_page, old_page);
- copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (unlikely(!folio_test_anon(folio))) {
+ VM_WARN_ON_ONCE(is_register);
+ folio_put(folio);
+ goto out;
+ }
if (!is_register) {
- struct page *orig_page;
- pgoff_t index;
-
- VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
-
- index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
- orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
- index);
-
- if (orig_page) {
- if (PageUptodate(orig_page) &&
- pages_identical(new_page, orig_page)) {
- /* let go new_page */
- put_page(new_page);
- new_page = NULL;
-
- if (PageCompound(orig_page))
- orig_page_huge = true;
- }
- put_page(orig_page);
- }
+ /*
+ * In the common case, we'll be able to zap the page when
+ * unregistering. So trigger MMU notifiers now, as we won't
+ * be able to do it under PTL.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ vaddr, vaddr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
}
- ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
- if (new_page)
- put_page(new_page);
-put_old:
- put_page(old_page);
+ ret = -EAGAIN;
+ /* Walk the page tables again, to perform the actual update. */
+ if (folio_walk_start(&fw, vma, vaddr, 0)) {
+ if (fw.page == page)
+ ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
+ folio_walk_end(&fw, vma);
+ }
+
+ if (!is_register)
+ mmu_notifier_invalidate_range_end(&range);
- if (unlikely(ret == -EAGAIN))
+ folio_put(folio);
+ switch (ret) {
+ case -EFAULT:
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
+ fallthrough;
+ case -EAGAIN:
goto retry;
+ default:
+ break;
+ }
+out:
/* Revert back reference counter if instruction update failed. */
- if (ret && is_register && ref_ctr_updated)
+ if (ret < 0 && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
/* try collapse pmd for compound page */
- if (!ret && orig_page_huge)
+ if (ret > 0)
collapse_pte_mapped_thp(mm, vaddr, false);
- return ret;
+ return ret < 0 ? ret : 0;
}
/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, store the breakpoint instruction at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
}
/**
* set_orig_insn - Restore the original instruction.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_orig_insn(struct arch_uprobe *auprobe,
+ struct vm_area_struct *vma, unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr,
+ return uprobe_write_opcode(auprobe, vma, vaddr,
*(uprobe_opcode_t *)&auprobe->insn);
}
@@ -1134,10 +1139,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
return ret;
}
-static int
-install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long vaddr)
+static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
bool first_uprobe;
int ret;
@@ -1153,7 +1158,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
if (first_uprobe)
set_bit(MMF_HAS_UPROBES, &mm->flags);
- ret = set_swbp(&uprobe->arch, mm, vaddr);
+ ret = set_swbp(&uprobe->arch, vma, vaddr);
if (!ret)
clear_bit(MMF_RECALC_UPROBES, &mm->flags);
else if (first_uprobe)
@@ -1162,11 +1167,13 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
-static int
-remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
+
set_bit(MMF_RECALC_UPROBES, &mm->flags);
- return set_orig_insn(&uprobe->arch, mm, vaddr);
+ return set_orig_insn(&uprobe->arch, vma, vaddr);
}
struct map_info {
@@ -1296,10 +1303,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
if (consumer_filter(new, mm))
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ err = install_breakpoint(uprobe, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
if (!filter_chain(uprobe, mm))
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ err |= remove_breakpoint(uprobe, vma, info->vaddr);
}
unlock:
@@ -1472,7 +1479,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
continue;
vaddr = offset_to_vaddr(vma, uprobe->offset);
- err |= remove_breakpoint(uprobe, mm, vaddr);
+ err |= remove_breakpoint(uprobe, vma, vaddr);
}
mmap_read_unlock(mm);
@@ -1610,7 +1617,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
- install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ install_breakpoint(uprobe, vma, vaddr);
}
put_uprobe(uprobe);
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 38645039dd8f..bd743900354c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -421,44 +421,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-static void coredump_task_exit(struct task_struct *tsk)
+static void coredump_task_exit(struct task_struct *tsk,
+ struct core_state *core_state)
{
- struct core_state *core_state;
+ struct core_thread self;
+ self.task = tsk;
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
/*
- * Serialize with any possible pending coredump.
- * We must hold siglock around checking core_state
- * and setting PF_POSTCOREDUMP. The core-inducing thread
- * will increment ->nr_threads for each thread in the
- * group without PF_POSTCOREDUMP set.
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
*/
- spin_lock_irq(&tsk->sighand->siglock);
- tsk->flags |= PF_POSTCOREDUMP;
- core_state = tsk->signal->core_state;
- spin_unlock_irq(&tsk->sighand->siglock);
- if (core_state) {
- struct core_thread self;
-
- self.task = current;
- if (self.task->flags & PF_SIGNALED)
- self.next = xchg(&core_state->dumper.next, &self);
- else
- self.task = NULL;
- /*
- * Implies mb(), the result of xchg() must be visible
- * to core_state->dumper.
- */
- if (atomic_dec_and_test(&core_state->nr_threads))
- complete(&core_state->startup);
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
- for (;;) {
- set_current_state(TASK_IDLE|TASK_FREEZABLE);
- if (!self.task) /* see coredump_finish() */
- break;
- schedule();
- }
- __set_current_state(TASK_RUNNING);
+ for (;;) {
+ set_current_state(TASK_IDLE|TASK_FREEZABLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ schedule();
}
+ __set_current_state(TASK_RUNNING);
}
#ifdef CONFIG_MEMCG
@@ -882,6 +868,7 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
{
struct sighand_struct *sighand = tsk->sighand;
struct signal_struct *signal = tsk->signal;
+ struct core_state *core_state;
spin_lock_irq(&sighand->siglock);
signal->quick_threads--;
@@ -891,7 +878,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
signal->group_exit_code = code;
signal->group_stop_count = 0;
}
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold siglock around checking core_state
+ * and setting PF_POSTCOREDUMP. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group without PF_POSTCOREDUMP set.
+ */
+ tsk->flags |= PF_POSTCOREDUMP;
+ core_state = signal->core_state;
spin_unlock_irq(&sighand->siglock);
+
+ if (unlikely(core_state))
+ coredump_task_exit(tsk, core_state);
}
void __noreturn do_exit(long code)
@@ -900,15 +899,12 @@ void __noreturn do_exit(long code)
int group_dead;
WARN_ON(irqs_disabled());
-
- synchronize_group_exit(tsk, code);
-
WARN_ON(tsk->plug);
kcov_task_exit(tsk);
kmsan_task_exit(tsk);
- coredump_task_exit(tsk);
+ synchronize_group_exit(tsk, code);
ptrace_event(PTRACE_EVENT_EXIT, code);
user_events_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 85afccfdf3b1..1ee8eb11f38b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -112,6 +112,9 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+/* For dup_mmap(). */
+#include "../mm/internal.h"
+
#include <trace/events/sched.h>
#define CREATE_TRACE_POINTS
@@ -428,88 +431,9 @@ struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
-/* SLAB cache for vm_area_struct structures */
-static struct kmem_cache *vm_area_cachep;
-
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
-
- vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!vma)
- return NULL;
-
- vma_init(vma, mm);
-
- return vma;
-}
-
-static void vm_area_init_from(const struct vm_area_struct *src,
- struct vm_area_struct *dest)
-{
- dest->vm_mm = src->vm_mm;
- dest->vm_ops = src->vm_ops;
- dest->vm_start = src->vm_start;
- dest->vm_end = src->vm_end;
- dest->anon_vma = src->anon_vma;
- dest->vm_pgoff = src->vm_pgoff;
- dest->vm_file = src->vm_file;
- dest->vm_private_data = src->vm_private_data;
- vm_flags_init(dest, src->vm_flags);
- memcpy(&dest->vm_page_prot, &src->vm_page_prot,
- sizeof(dest->vm_page_prot));
- /*
- * src->shared.rb may be modified concurrently when called from
- * dup_mmap(), but the clone will reinitialize it.
- */
- data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
- memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
- sizeof(dest->vm_userfaultfd_ctx));
-#ifdef CONFIG_ANON_VMA_NAME
- dest->anon_name = src->anon_name;
-#endif
-#ifdef CONFIG_SWAP
- memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
- sizeof(dest->swap_readahead_info));
-#endif
-#ifndef CONFIG_MMU
- dest->vm_region = src->vm_region;
-#endif
-#ifdef CONFIG_NUMA
- dest->vm_policy = src->vm_policy;
-#endif
-}
-
-struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
-{
- struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-
- if (!new)
- return NULL;
-
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- vm_area_init_from(orig, new);
- vma_lock_init(new, true);
- INIT_LIST_HEAD(&new->anon_vma_chain);
- vma_numab_state_init(new);
- dup_anon_vma_name(orig, new);
-
- return new;
-}
-
-void vm_area_free(struct vm_area_struct *vma)
-{
- /* The vma should be detached while being destroyed. */
- vma_assert_detached(vma);
- vma_numab_state_free(vma);
- free_anon_vma_name(vma);
- kmem_cache_free(vm_area_cachep, vma);
-}
-
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -589,7 +513,7 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
-static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct file *exe_file;
@@ -604,183 +528,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
}
#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
- struct mm_struct *oldmm)
-{
- struct vm_area_struct *mpnt, *tmp;
- int retval;
- unsigned long charge = 0;
- LIST_HEAD(uf);
- VMA_ITERATOR(vmi, mm, 0);
-
- if (mmap_write_lock_killable(oldmm))
- return -EINTR;
- flush_cache_dup_mm(oldmm);
- uprobe_dup_mmap(oldmm, mm);
- /*
- * Not linked in yet - no deadlock potential:
- */
- mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
-
- /* No ordering required: file already has been exposed. */
- dup_mm_exe_file(mm, oldmm);
-
- mm->total_vm = oldmm->total_vm;
- mm->data_vm = oldmm->data_vm;
- mm->exec_vm = oldmm->exec_vm;
- mm->stack_vm = oldmm->stack_vm;
-
- /* Use __mt_dup() to efficiently build an identical maple tree. */
- retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
- if (unlikely(retval))
- goto out;
-
- mt_clear_in_rcu(vmi.mas.tree);
- for_each_vma(vmi, mpnt) {
- struct file *file;
-
- vma_start_write(mpnt);
- if (mpnt->vm_flags & VM_DONTCOPY) {
- retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
- mpnt->vm_end, GFP_KERNEL);
- if (retval)
- goto loop_out;
-
- vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
- continue;
- }
- charge = 0;
- /*
- * Don't duplicate many vmas if we've been oom-killed (for
- * example)
- */
- if (fatal_signal_pending(current)) {
- retval = -EINTR;
- goto loop_out;
- }
- if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned long len = vma_pages(mpnt);
-
- if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
- goto fail_nomem;
- charge = len;
- }
- tmp = vm_area_dup(mpnt);
- if (!tmp)
- goto fail_nomem;
-
- /* track_pfn_copy() will later take care of copying internal state. */
- if (unlikely(tmp->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(tmp);
-
- retval = vma_dup_policy(mpnt, tmp);
- if (retval)
- goto fail_nomem_policy;
- tmp->vm_mm = mm;
- retval = dup_userfaultfd(tmp, &uf);
- if (retval)
- goto fail_nomem_anon_vma_fork;
- if (tmp->vm_flags & VM_WIPEONFORK) {
- /*
- * VM_WIPEONFORK gets a clean slate in the child.
- * Don't prepare anon_vma until fault since we don't
- * copy page for current vma.
- */
- tmp->anon_vma = NULL;
- } else if (anon_vma_fork(tmp, mpnt))
- goto fail_nomem_anon_vma_fork;
- vm_flags_clear(tmp, VM_LOCKED_MASK);
- /*
- * Copy/update hugetlb private vma information.
- */
- if (is_vm_hugetlb_page(tmp))
- hugetlb_dup_vma_private(tmp);
-
- /*
- * Link the vma into the MT. After using __mt_dup(), memory
- * allocation is not necessary here, so it cannot fail.
- */
- vma_iter_bulk_store(&vmi, tmp);
-
- mm->map_count++;
-
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
-
- file = tmp->vm_file;
- if (file) {
- struct address_space *mapping = file->f_mapping;
-
- get_file(file);
- i_mmap_lock_write(mapping);
- if (vma_is_shared_maywrite(tmp))
- mapping_allow_writable(mapping);
- flush_dcache_mmap_lock(mapping);
- /* insert tmp into the share list, just after mpnt */
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
- i_mmap_unlock_write(mapping);
- }
-
- if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(tmp, mpnt);
-
- if (retval) {
- mpnt = vma_next(&vmi);
- goto loop_out;
- }
- }
- /* a new mm has just been created */
- retval = arch_dup_mmap(oldmm, mm);
-loop_out:
- vma_iter_free(&vmi);
- if (!retval) {
- mt_set_in_rcu(vmi.mas.tree);
- ksm_fork(mm, oldmm);
- khugepaged_fork(mm, oldmm);
- } else {
-
- /*
- * The entire maple tree has already been duplicated. If the
- * mmap duplication fails, mark the failure point with
- * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
- * stop releasing VMAs that have not been duplicated after this
- * point.
- */
- if (mpnt) {
- mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
- mas_store(&vmi.mas, XA_ZERO_ENTRY);
- /* Avoid OOM iterating a broken tree */
- set_bit(MMF_OOM_SKIP, &mm->flags);
- }
- /*
- * The mm_struct is going to exit, but the locks will be dropped
- * first. Set the mm_struct as unstable is advisable as it is
- * not fully initialised.
- */
- set_bit(MMF_UNSTABLE, &mm->flags);
- }
-out:
- mmap_write_unlock(mm);
- flush_tlb_mm(oldmm);
- mmap_write_unlock(oldmm);
- if (!retval)
- dup_userfaultfd_complete(&uf);
- else
- dup_userfaultfd_fail(&uf);
- return retval;
-
-fail_nomem_anon_vma_fork:
- mpol_put(vma_policy(tmp));
-fail_nomem_policy:
- vm_area_free(tmp);
-fail_nomem:
- retval = -ENOMEM;
- vm_unacct_memory(charge);
- goto loop_out;
-}
-
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
mm->pgd = pgd_alloc(mm);
@@ -794,13 +541,6 @@ static inline void mm_free_pgd(struct mm_struct *mm)
pgd_free(mm, mm->pgd);
}
#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
- mmap_write_lock(oldmm);
- dup_mm_exe_file(mm, oldmm);
- mmap_write_unlock(oldmm);
- return 0;
-}
#define mm_alloc_pgd(mm) (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
@@ -3228,11 +2968,6 @@ void __init mm_cache_init(void)
void __init proc_caches_init(void)
{
- struct kmem_cache_args args = {
- .use_freeptr_offset = true,
- .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
- };
-
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3249,10 +2984,6 @@ void __init proc_caches_init(void)
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- vm_area_cachep = kmem_cache_create("vm_area_struct",
- sizeof(struct vm_area_struct), &args,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
- SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index fd75b4a484d7..a08cc076f332 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -22,10 +22,6 @@
#define GCOV_COUNTERS 9
#elif (__GNUC__ >= 10)
#define GCOV_COUNTERS 8
-#elif (__GNUC__ >= 7)
-#define GCOV_COUNTERS 9
-#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
-#define GCOV_COUNTERS 10
#else
#define GCOV_COUNTERS 9
#endif
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index dc898ec93463..d2432df2b905 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -22,6 +22,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
+#include <linux/hung_task.h>
#include <trace/events/sched.h>
@@ -98,30 +99,62 @@ static struct notifier_block panic_block = {
static void debug_show_blocker(struct task_struct *task)
{
struct task_struct *g, *t;
- unsigned long owner;
- struct mutex *lock;
+ unsigned long owner, blocker, blocker_type;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
- lock = READ_ONCE(task->blocker_mutex);
- if (!lock)
+ blocker = READ_ONCE(task->blocker);
+ if (!blocker)
return;
- owner = mutex_get_owner(lock);
+ blocker_type = hung_task_get_blocker_type(blocker);
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ owner = mutex_get_owner(
+ (struct mutex *)hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_SEM:
+ owner = sem_last_holder(
+ (struct semaphore *)hung_task_blocker_to_lock(blocker));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+
if (unlikely(!owner)) {
- pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
- task->comm, task->pid);
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
+ task->comm, task->pid);
+ break;
+ }
return;
}
/* Ensure the owner information is correct. */
for_each_process_thread(g, t) {
- if ((unsigned long)t == owner) {
+ if ((unsigned long)t != owner)
+ continue;
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
- task->comm, task->pid, t->comm, t->pid);
- sched_show_task(t);
- return;
+ task->comm, task->pid, t->comm, t->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
+ task->comm, task->pid, t->comm, t->pid);
+ break;
}
+ sched_show_task(t);
+ return;
}
}
#else
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 0adb645072aa..69fe76fd9233 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -19,7 +19,6 @@
#include <linux/list.h>
#include <linux/fs.h>
#include <linux/ima.h>
-#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
@@ -277,6 +276,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
/* IMA needs to pass the measurement list to the next kernel. */
ima_add_kexec_buffer(image);
+ /* If KHO is active, add its images to the list */
+ ret = kho_fill_kimage(image);
+ if (ret)
+ goto out;
+
/* Call image load handler */
ldata = kexec_image_load_default(image);
@@ -469,6 +473,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
temp_end = min(end, kbuf->buf_max);
temp_start = temp_end - kbuf->memsz + 1;
+ kexec_random_range_start(temp_start, temp_end, kbuf, &temp_start);
do {
/* align down start */
@@ -513,6 +518,8 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
temp_start = max(start, kbuf->buf_min);
+ kexec_random_range_start(temp_start, end, kbuf, &temp_start);
+
do {
temp_start = ALIGN(temp_start, kbuf->buf_align);
temp_end = temp_start + kbuf->memsz - 1;
@@ -672,6 +679,14 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
return 0;
+ /*
+ * If KHO is active, only use KHO scratch memory. All other memory
+ * could potentially be handed over.
+ */
+ ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback);
+ if (ret <= 0)
+ return ret;
+
if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
@@ -736,11 +751,10 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
/* Calculate and store the digest of segments */
static int kexec_calculate_store_digests(struct kimage *image)
{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
+ struct sha256_state state;
int ret = 0, i, j, zero_buf_sz, sha_region_sz;
- size_t desc_size, nullsz;
- char *digest;
+ size_t nullsz;
+ u8 digest[SHA256_DIGEST_SIZE];
void *zero_buf;
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
@@ -751,37 +765,12 @@ static int kexec_calculate_store_digests(struct kimage *image)
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
zero_buf_sz = PAGE_SIZE;
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto out;
- }
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- desc = kzalloc(desc_size, GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto out_free_tfm;
- }
-
sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions) {
- ret = -ENOMEM;
- goto out_free_desc;
- }
-
- desc->tfm = tfm;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto out_free_sha_regions;
+ if (!sha_regions)
+ return -ENOMEM;
- digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
- if (!digest) {
- ret = -ENOMEM;
- goto out_free_sha_regions;
- }
+ sha256_init(&state);
for (j = i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
@@ -807,10 +796,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (check_ima_segment_index(image, i))
continue;
- ret = crypto_shash_update(desc, ksegment->kbuf,
- ksegment->bufsz);
- if (ret)
- break;
+ sha256_update(&state, ksegment->kbuf, ksegment->bufsz);
/*
* Assume rest of the buffer is filled with zero and
@@ -822,44 +808,26 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (bytes > zero_buf_sz)
bytes = zero_buf_sz;
- ret = crypto_shash_update(desc, zero_buf, bytes);
- if (ret)
- break;
+ sha256_update(&state, zero_buf, bytes);
nullsz -= bytes;
}
- if (ret)
- break;
-
sha_regions[j].start = ksegment->mem;
sha_regions[j].len = ksegment->memsz;
j++;
}
- if (!ret) {
- ret = crypto_shash_final(desc, digest);
- if (ret)
- goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
- sha_regions, sha_region_sz, 0);
- if (ret)
- goto out_free_digest;
+ sha256_final(&state, digest);
- ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
- if (ret)
- goto out_free_digest;
- }
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_sha_regions;
-out_free_digest:
- kfree(digest);
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
out_free_sha_regions:
vfree(sha_regions);
-out_free_desc:
- kfree(desc);
-out_free_tfm:
- kfree(tfm);
-out:
return ret;
}
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
new file mode 100644
index 000000000000..69b953551677
--- /dev/null
+++ b/kernel/kexec_handover.c
@@ -0,0 +1,1266 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover.c - kexec handover metadata processing
+ * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
+ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
+ * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include <linux/cma.h>
+#include <linux/count_zeros.h>
+#include <linux/debugfs.h>
+#include <linux/kexec.h>
+#include <linux/kexec_handover.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/notifier.h>
+#include <linux/page-isolation.h>
+
+#include <asm/early_ioremap.h>
+
+/*
+ * KHO is tightly coupled with mm init and needs access to some of mm
+ * internal APIs.
+ */
+#include "../mm/internal.h"
+#include "kexec_internal.h"
+
+#define KHO_FDT_COMPATIBLE "kho-v1"
+#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
+#define PROP_SUB_FDT "fdt"
+
+static bool kho_enable __ro_after_init;
+
+bool kho_is_enabled(void)
+{
+ return kho_enable;
+}
+EXPORT_SYMBOL_GPL(kho_is_enabled);
+
+static int __init kho_parse_enable(char *p)
+{
+ return kstrtobool(p, &kho_enable);
+}
+early_param("kho", kho_parse_enable);
+
+/*
+ * Keep track of memory that is to be preserved across KHO.
+ *
+ * The serializing side uses two levels of xarrays to manage chunks of per-order
+ * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
+ * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
+ * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
+ * 512K of bitmap memory will be needed for order 0.
+ *
+ * This approach is fully incremental, as the serialization progresses folios
+ * can continue be aggregated to the tracker. The final step, immediately prior
+ * to kexec would serialize the xarray information into a linked list for the
+ * successor kernel to parse.
+ */
+
+#define PRESERVE_BITS (512 * 8)
+
+struct kho_mem_phys_bits {
+ DECLARE_BITMAP(preserve, PRESERVE_BITS);
+};
+
+struct kho_mem_phys {
+ /*
+ * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
+ * to order.
+ */
+ struct xarray phys_bits;
+};
+
+struct kho_mem_track {
+ /* Points to kho_mem_phys, each order gets its own bitmap tree */
+ struct xarray orders;
+};
+
+struct khoser_mem_chunk;
+
+struct kho_serialization {
+ struct page *fdt;
+ struct list_head fdt_list;
+ struct dentry *sub_fdt_dir;
+ struct kho_mem_track track;
+ /* First chunk of serialized preserved memory map */
+ struct khoser_mem_chunk *preserved_mem_map;
+};
+
+static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
+{
+ void *elm, *res;
+
+ elm = xa_load(xa, index);
+ if (elm)
+ return elm;
+
+ elm = kzalloc(sz, GFP_KERNEL);
+ if (!elm)
+ return ERR_PTR(-ENOMEM);
+
+ res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
+ if (xa_is_err(res))
+ res = ERR_PTR(xa_err(res));
+
+ if (res) {
+ kfree(elm);
+ return res;
+ }
+
+ return elm;
+}
+
+static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
+ unsigned long end_pfn)
+{
+ struct kho_mem_phys_bits *bits;
+ struct kho_mem_phys *physxa;
+
+ while (pfn < end_pfn) {
+ const unsigned int order =
+ min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+ const unsigned long pfn_high = pfn >> order;
+
+ physxa = xa_load(&track->orders, order);
+ if (!physxa)
+ continue;
+
+ bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+ if (!bits)
+ continue;
+
+ clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+
+ pfn += 1 << order;
+ }
+}
+
+static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
+ unsigned int order)
+{
+ struct kho_mem_phys_bits *bits;
+ struct kho_mem_phys *physxa;
+ const unsigned long pfn_high = pfn >> order;
+
+ might_sleep();
+
+ physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa));
+ if (IS_ERR(physxa))
+ return PTR_ERR(physxa);
+
+ bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
+ sizeof(*bits));
+ if (IS_ERR(bits))
+ return PTR_ERR(bits);
+
+ set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+
+ return 0;
+}
+
+/* almost as free_reserved_page(), just don't free the page */
+static void kho_restore_page(struct page *page)
+{
+ ClearPageReserved(page);
+ init_page_count(page);
+ adjust_managed_page_count(page, 1);
+}
+
+/**
+ * kho_restore_folio - recreates the folio from the preserved memory.
+ * @phys: physical address of the folio.
+ *
+ * Return: pointer to the struct folio on success, NULL on failure.
+ */
+struct folio *kho_restore_folio(phys_addr_t phys)
+{
+ struct page *page = pfn_to_online_page(PHYS_PFN(phys));
+ unsigned long order;
+
+ if (!page)
+ return NULL;
+
+ order = page->private;
+ if (order) {
+ if (order > MAX_PAGE_ORDER)
+ return NULL;
+
+ prep_compound_page(page, order);
+ } else {
+ kho_restore_page(page);
+ }
+
+ return page_folio(page);
+}
+EXPORT_SYMBOL_GPL(kho_restore_folio);
+
+/* Serialize and deserialize struct kho_mem_phys across kexec
+ *
+ * Record all the bitmaps in a linked list of pages for the next kernel to
+ * process. Each chunk holds bitmaps of the same order and each block of bitmaps
+ * starts at a given physical address. This allows the bitmaps to be sparse. The
+ * xarray is used to store them in a tree while building up the data structure,
+ * but the KHO successor kernel only needs to process them once in order.
+ *
+ * All of this memory is normal kmalloc() memory and is not marked for
+ * preservation. The successor kernel will remain isolated to the scratch space
+ * until it completes processing this list. Once processed all the memory
+ * storing these ranges will be marked as free.
+ */
+
+struct khoser_mem_bitmap_ptr {
+ phys_addr_t phys_start;
+ DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
+};
+
+struct khoser_mem_chunk_hdr {
+ DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
+ unsigned int order;
+ unsigned int num_elms;
+};
+
+#define KHOSER_BITMAP_SIZE \
+ ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
+ sizeof(struct khoser_mem_bitmap_ptr))
+
+struct khoser_mem_chunk {
+ struct khoser_mem_chunk_hdr hdr;
+ struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
+};
+
+static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
+
+static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
+ unsigned long order)
+{
+ struct khoser_mem_chunk *chunk;
+
+ chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+ chunk->hdr.order = order;
+ if (cur_chunk)
+ KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
+ return chunk;
+}
+
+static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
+{
+ struct khoser_mem_chunk *chunk = first_chunk;
+
+ while (chunk) {
+ struct khoser_mem_chunk *tmp = chunk;
+
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ kfree(tmp);
+ }
+}
+
+static int kho_mem_serialize(struct kho_serialization *ser)
+{
+ struct khoser_mem_chunk *first_chunk = NULL;
+ struct khoser_mem_chunk *chunk = NULL;
+ struct kho_mem_phys *physxa;
+ unsigned long order;
+
+ xa_for_each(&ser->track.orders, order, physxa) {
+ struct kho_mem_phys_bits *bits;
+ unsigned long phys;
+
+ chunk = new_chunk(chunk, order);
+ if (!chunk)
+ goto err_free;
+
+ if (!first_chunk)
+ first_chunk = chunk;
+
+ xa_for_each(&physxa->phys_bits, phys, bits) {
+ struct khoser_mem_bitmap_ptr *elm;
+
+ if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
+ chunk = new_chunk(chunk, order);
+ if (!chunk)
+ goto err_free;
+ }
+
+ elm = &chunk->bitmaps[chunk->hdr.num_elms];
+ chunk->hdr.num_elms++;
+ elm->phys_start = (phys * PRESERVE_BITS)
+ << (order + PAGE_SHIFT);
+ KHOSER_STORE_PTR(elm->bitmap, bits);
+ }
+ }
+
+ ser->preserved_mem_map = first_chunk;
+
+ return 0;
+
+err_free:
+ kho_mem_ser_free(first_chunk);
+ return -ENOMEM;
+}
+
+static void deserialize_bitmap(unsigned int order,
+ struct khoser_mem_bitmap_ptr *elm)
+{
+ struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
+ unsigned long bit;
+
+ for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
+ int sz = 1 << (order + PAGE_SHIFT);
+ phys_addr_t phys =
+ elm->phys_start + (bit << (order + PAGE_SHIFT));
+ struct page *page = phys_to_page(phys);
+
+ memblock_reserve(phys, sz);
+ memblock_reserved_mark_noinit(phys, sz);
+ page->private = order;
+ }
+}
+
+static void __init kho_mem_deserialize(const void *fdt)
+{
+ struct khoser_mem_chunk *chunk;
+ const phys_addr_t *mem;
+ int len;
+
+ mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+
+ if (!mem || len != sizeof(*mem)) {
+ pr_err("failed to get preserved memory bitmaps\n");
+ return;
+ }
+
+ chunk = *mem ? phys_to_virt(*mem) : NULL;
+ while (chunk) {
+ unsigned int i;
+
+ for (i = 0; i != chunk->hdr.num_elms; i++)
+ deserialize_bitmap(chunk->hdr.order,
+ &chunk->bitmaps[i]);
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ }
+}
+
+/*
+ * With KHO enabled, memory can become fragmented because KHO regions may
+ * be anywhere in physical address space. The scratch regions give us a
+ * safe zones that we will never see KHO allocations from. This is where we
+ * can later safely load our new kexec images into and then use the scratch
+ * area for early allocations that happen before page allocator is
+ * initialized.
+ */
+static struct kho_scratch *kho_scratch;
+static unsigned int kho_scratch_cnt;
+
+/*
+ * The scratch areas are scaled by default as percent of memory allocated from
+ * memblock. A user can override the scale with command line parameter:
+ *
+ * kho_scratch=N%
+ *
+ * It is also possible to explicitly define size for a lowmem, a global and
+ * per-node scratch areas:
+ *
+ * kho_scratch=l[KMG],n[KMG],m[KMG]
+ *
+ * The explicit size definition takes precedence over scale definition.
+ */
+static unsigned int scratch_scale __initdata = 200;
+static phys_addr_t scratch_size_global __initdata;
+static phys_addr_t scratch_size_pernode __initdata;
+static phys_addr_t scratch_size_lowmem __initdata;
+
+static int __init kho_parse_scratch_size(char *p)
+{
+ size_t len;
+ unsigned long sizes[3];
+ int i;
+
+ if (!p)
+ return -EINVAL;
+
+ len = strlen(p);
+ if (!len)
+ return -EINVAL;
+
+ /* parse nn% */
+ if (p[len - 1] == '%') {
+ /* unsigned int max is 4,294,967,295, 10 chars */
+ char s_scale[11] = {};
+ int ret = 0;
+
+ if (len > ARRAY_SIZE(s_scale))
+ return -EINVAL;
+
+ memcpy(s_scale, p, len - 1);
+ ret = kstrtouint(s_scale, 10, &scratch_scale);
+ if (!ret)
+ pr_notice("scratch scale is %d%%\n", scratch_scale);
+ return ret;
+ }
+
+ /* parse ll[KMG],mm[KMG],nn[KMG] */
+ for (i = 0; i < ARRAY_SIZE(sizes); i++) {
+ char *endp = p;
+
+ if (i > 0) {
+ if (*p != ',')
+ return -EINVAL;
+ p += 1;
+ }
+
+ sizes[i] = memparse(p, &endp);
+ if (!sizes[i] || endp == p)
+ return -EINVAL;
+ p = endp;
+ }
+
+ scratch_size_lowmem = sizes[0];
+ scratch_size_global = sizes[1];
+ scratch_size_pernode = sizes[2];
+ scratch_scale = 0;
+
+ pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
+ (u64)(scratch_size_lowmem >> 20),
+ (u64)(scratch_size_global >> 20),
+ (u64)(scratch_size_pernode >> 20));
+
+ return 0;
+}
+early_param("kho_scratch", kho_parse_scratch_size);
+
+static void __init scratch_size_update(void)
+{
+ phys_addr_t size;
+
+ if (!scratch_scale)
+ return;
+
+ size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
+ NUMA_NO_NODE);
+ size = size * scratch_scale / 100;
+ scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+
+ size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+ NUMA_NO_NODE);
+ size = size * scratch_scale / 100 - scratch_size_lowmem;
+ scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+static phys_addr_t __init scratch_size_node(int nid)
+{
+ phys_addr_t size;
+
+ if (scratch_scale) {
+ size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+ nid);
+ size = size * scratch_scale / 100;
+ } else {
+ size = scratch_size_pernode;
+ }
+
+ return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+/**
+ * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
+ *
+ * With KHO we can preserve arbitrary pages in the system. To ensure we still
+ * have a large contiguous region of memory when we search the physical address
+ * space for target memory, let's make sure we always have a large CMA region
+ * active. This CMA region will only be used for movable pages which are not a
+ * problem for us during KHO because we can just move them somewhere else.
+ */
+static void __init kho_reserve_scratch(void)
+{
+ phys_addr_t addr, size;
+ int nid, i = 0;
+
+ if (!kho_enable)
+ return;
+
+ scratch_size_update();
+
+ /* FIXME: deal with node hot-plug/remove */
+ kho_scratch_cnt = num_online_nodes() + 2;
+ size = kho_scratch_cnt * sizeof(*kho_scratch);
+ kho_scratch = memblock_alloc(size, PAGE_SIZE);
+ if (!kho_scratch)
+ goto err_disable_kho;
+
+ /*
+ * reserve scratch area in low memory for lowmem allocations in the
+ * next kernel
+ */
+ size = scratch_size_lowmem;
+ addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
+ ARCH_LOW_ADDRESS_LIMIT);
+ if (!addr)
+ goto err_free_scratch_desc;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+
+ /* reserve large contiguous area for allocations without nid */
+ size = scratch_size_global;
+ addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
+ if (!addr)
+ goto err_free_scratch_areas;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+
+ for_each_online_node(nid) {
+ size = scratch_size_node(nid);
+ addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid, true);
+ if (!addr)
+ goto err_free_scratch_areas;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+ }
+
+ return;
+
+err_free_scratch_areas:
+ for (i--; i >= 0; i--)
+ memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
+err_free_scratch_desc:
+ memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
+err_disable_kho:
+ kho_enable = false;
+}
+
+struct fdt_debugfs {
+ struct list_head list;
+ struct debugfs_blob_wrapper wrapper;
+ struct dentry *file;
+};
+
+static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
+ const char *name, const void *fdt)
+{
+ struct fdt_debugfs *f;
+ struct dentry *file;
+
+ f = kmalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ f->wrapper.data = (void *)fdt;
+ f->wrapper.size = fdt_totalsize(fdt);
+
+ file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
+ if (IS_ERR(file)) {
+ kfree(f);
+ return PTR_ERR(file);
+ }
+
+ f->file = file;
+ list_add(&f->list, list);
+
+ return 0;
+}
+
+/**
+ * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
+ * @ser: serialization control object passed by KHO notifiers.
+ * @name: name of the sub tree.
+ * @fdt: the sub tree blob.
+ *
+ * Creates a new child node named @name in KHO root FDT and records
+ * the physical address of @fdt. The pages of @fdt must also be preserved
+ * by KHO for the new kernel to retrieve it after kexec.
+ *
+ * A debugfs blob entry is also created at
+ * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
+{
+ int err = 0;
+ u64 phys = (u64)virt_to_phys(fdt);
+ void *root = page_to_virt(ser->fdt);
+
+ err |= fdt_begin_node(root, name);
+ err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
+ err |= fdt_end_node(root);
+
+ if (err)
+ return err;
+
+ return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
+}
+EXPORT_SYMBOL_GPL(kho_add_subtree);
+
+struct kho_out {
+ struct blocking_notifier_head chain_head;
+
+ struct dentry *dir;
+
+ struct mutex lock; /* protects KHO FDT finalization */
+
+ struct kho_serialization ser;
+ bool finalized;
+};
+
+static struct kho_out kho_out = {
+ .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
+ .lock = __MUTEX_INITIALIZER(kho_out.lock),
+ .ser = {
+ .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
+ .track = {
+ .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
+ },
+ },
+ .finalized = false,
+};
+
+int register_kho_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&kho_out.chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(register_kho_notifier);
+
+int unregister_kho_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_kho_notifier);
+
+/**
+ * kho_preserve_folio - preserve a folio across kexec.
+ * @folio: folio to preserve.
+ *
+ * Instructs KHO to preserve the whole folio across kexec. The order
+ * will be preserved as well.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_folio(struct folio *folio)
+{
+ const unsigned long pfn = folio_pfn(folio);
+ const unsigned int order = folio_order(folio);
+ struct kho_mem_track *track = &kho_out.ser.track;
+
+ if (kho_out.finalized)
+ return -EBUSY;
+
+ return __kho_preserve_order(track, pfn, order);
+}
+EXPORT_SYMBOL_GPL(kho_preserve_folio);
+
+/**
+ * kho_preserve_phys - preserve a physically contiguous range across kexec.
+ * @phys: physical address of the range.
+ * @size: size of the range.
+ *
+ * Instructs KHO to preserve the memory range from @phys to @phys + @size
+ * across kexec.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_phys(phys_addr_t phys, size_t size)
+{
+ unsigned long pfn = PHYS_PFN(phys);
+ unsigned long failed_pfn = 0;
+ const unsigned long start_pfn = pfn;
+ const unsigned long end_pfn = PHYS_PFN(phys + size);
+ int err = 0;
+ struct kho_mem_track *track = &kho_out.ser.track;
+
+ if (kho_out.finalized)
+ return -EBUSY;
+
+ if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
+ return -EINVAL;
+
+ while (pfn < end_pfn) {
+ const unsigned int order =
+ min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ err = __kho_preserve_order(track, pfn, order);
+ if (err) {
+ failed_pfn = pfn;
+ break;
+ }
+
+ pfn += 1 << order;
+ }
+
+ if (err)
+ __kho_unpreserve(track, start_pfn, failed_pfn);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_phys);
+
+/* Handling for debug/kho/out */
+
+static struct dentry *debugfs_root;
+
+static int kho_out_update_debugfs_fdt(void)
+{
+ int err = 0;
+ struct fdt_debugfs *ff, *tmp;
+
+ if (kho_out.finalized) {
+ err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
+ "fdt", page_to_virt(kho_out.ser.fdt));
+ } else {
+ list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
+ debugfs_remove(ff->file);
+ list_del(&ff->list);
+ kfree(ff);
+ }
+ }
+
+ return err;
+}
+
+static int kho_abort(void)
+{
+ int err;
+ unsigned long order;
+ struct kho_mem_phys *physxa;
+
+ xa_for_each(&kho_out.ser.track.orders, order, physxa) {
+ struct kho_mem_phys_bits *bits;
+ unsigned long phys;
+
+ xa_for_each(&physxa->phys_bits, phys, bits)
+ kfree(bits);
+
+ xa_destroy(&physxa->phys_bits);
+ kfree(physxa);
+ }
+ xa_destroy(&kho_out.ser.track.orders);
+
+ if (kho_out.ser.preserved_mem_map) {
+ kho_mem_ser_free(kho_out.ser.preserved_mem_map);
+ kho_out.ser.preserved_mem_map = NULL;
+ }
+
+ err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
+ NULL);
+ err = notifier_to_errno(err);
+
+ if (err)
+ pr_err("Failed to abort KHO finalization: %d\n", err);
+
+ return err;
+}
+
+static int kho_finalize(void)
+{
+ int err = 0;
+ u64 *preserved_mem_map;
+ void *fdt = page_to_virt(kho_out.ser.fdt);
+
+ err |= fdt_create(fdt, PAGE_SIZE);
+ err |= fdt_finish_reservemap(fdt);
+ err |= fdt_begin_node(fdt, "");
+ err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
+ /**
+ * Reserve the preserved-memory-map property in the root FDT, so
+ * that all property definitions will precede subnodes created by
+ * KHO callers.
+ */
+ err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
+ sizeof(*preserved_mem_map),
+ (void **)&preserved_mem_map);
+ if (err)
+ goto abort;
+
+ err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
+ if (err)
+ goto abort;
+
+ err = blocking_notifier_call_chain(&kho_out.chain_head,
+ KEXEC_KHO_FINALIZE, &kho_out.ser);
+ err = notifier_to_errno(err);
+ if (err)
+ goto abort;
+
+ err = kho_mem_serialize(&kho_out.ser);
+ if (err)
+ goto abort;
+
+ *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);
+
+ err |= fdt_end_node(fdt);
+ err |= fdt_finish(fdt);
+
+abort:
+ if (err) {
+ pr_err("Failed to convert KHO state tree: %d\n", err);
+ kho_abort();
+ }
+
+ return err;
+}
+
+static int kho_out_finalize_get(void *data, u64 *val)
+{
+ mutex_lock(&kho_out.lock);
+ *val = kho_out.finalized;
+ mutex_unlock(&kho_out.lock);
+
+ return 0;
+}
+
+static int kho_out_finalize_set(void *data, u64 _val)
+{
+ int ret = 0;
+ bool val = !!_val;
+
+ mutex_lock(&kho_out.lock);
+
+ if (val == kho_out.finalized) {
+ if (kho_out.finalized)
+ ret = -EEXIST;
+ else
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ if (val)
+ ret = kho_finalize();
+ else
+ ret = kho_abort();
+
+ if (ret)
+ goto unlock;
+
+ kho_out.finalized = val;
+ ret = kho_out_update_debugfs_fdt();
+
+unlock:
+ mutex_unlock(&kho_out.lock);
+ return ret;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
+ kho_out_finalize_set, "%llu\n");
+
+static int scratch_phys_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_phys);
+
+static int scratch_len_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].size);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_len);
+
+static __init int kho_out_debugfs_init(void)
+{
+ struct dentry *dir, *f, *sub_fdt_dir;
+
+ dir = debugfs_create_dir("out", debugfs_root);
+ if (IS_ERR(dir))
+ return -ENOMEM;
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
+ &scratch_phys_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_len", 0400, dir, NULL,
+ &scratch_len_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("finalize", 0600, dir, NULL,
+ &fops_kho_out_finalize);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ kho_out.dir = dir;
+ kho_out.ser.sub_fdt_dir = sub_fdt_dir;
+ return 0;
+
+err_rmdir:
+ debugfs_remove_recursive(dir);
+ return -ENOENT;
+}
+
+struct kho_in {
+ struct dentry *dir;
+ phys_addr_t fdt_phys;
+ phys_addr_t scratch_phys;
+ struct list_head fdt_list;
+};
+
+static struct kho_in kho_in = {
+ .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
+};
+
+static const void *kho_get_fdt(void)
+{
+ return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
+}
+
+/**
+ * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
+ * @name: the name of the sub FDT passed to kho_add_subtree().
+ * @phys: if found, the physical address of the sub FDT is stored in @phys.
+ *
+ * Retrieve a preserved sub FDT named @name and store its physical
+ * address in @phys.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+{
+ const void *fdt = kho_get_fdt();
+ const u64 *val;
+ int offset, len;
+
+ if (!fdt)
+ return -ENOENT;
+
+ if (!phys)
+ return -EINVAL;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0)
+ return -ENOENT;
+
+ val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
+ if (!val || len != sizeof(*val))
+ return -EINVAL;
+
+ *phys = (phys_addr_t)*val;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
+
+/* Handling for debugfs/kho/in */
+
+static __init int kho_in_debugfs_init(const void *fdt)
+{
+ struct dentry *sub_fdt_dir;
+ int err, child;
+
+ kho_in.dir = debugfs_create_dir("in", debugfs_root);
+ if (IS_ERR(kho_in.dir))
+ return PTR_ERR(kho_in.dir);
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
+ if (IS_ERR(sub_fdt_dir)) {
+ err = PTR_ERR(sub_fdt_dir);
+ goto err_rmdir;
+ }
+
+ err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
+ if (err)
+ goto err_rmdir;
+
+ fdt_for_each_subnode(child, fdt, 0) {
+ int len = 0;
+ const char *name = fdt_get_name(fdt, child, NULL);
+ const u64 *fdt_phys;
+
+ fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
+ if (!fdt_phys)
+ continue;
+ if (len != sizeof(*fdt_phys)) {
+ pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
+ name, len);
+ continue;
+ }
+ err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
+ phys_to_virt(*fdt_phys));
+ if (err) {
+ pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
+ err);
+ continue;
+ }
+ }
+
+ return 0;
+
+err_rmdir:
+ debugfs_remove_recursive(kho_in.dir);
+ return err;
+}
+
+static __init int kho_init(void)
+{
+ int err = 0;
+ const void *fdt = kho_get_fdt();
+
+ if (!kho_enable)
+ return 0;
+
+ kho_out.ser.fdt = alloc_page(GFP_KERNEL);
+ if (!kho_out.ser.fdt) {
+ err = -ENOMEM;
+ goto err_free_scratch;
+ }
+
+ debugfs_root = debugfs_create_dir("kho", NULL);
+ if (IS_ERR(debugfs_root)) {
+ err = -ENOENT;
+ goto err_free_fdt;
+ }
+
+ err = kho_out_debugfs_init();
+ if (err)
+ goto err_free_fdt;
+
+ if (fdt) {
+ err = kho_in_debugfs_init(fdt);
+ /*
+ * Failure to create /sys/kernel/debug/kho/in does not prevent
+ * reviving state from KHO and setting up KHO for the next
+ * kexec.
+ */
+ if (err)
+ pr_err("failed exposing handover FDT in debugfs: %d\n",
+ err);
+
+ return 0;
+ }
+
+ for (int i = 0; i < kho_scratch_cnt; i++) {
+ unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
+ unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
+ unsigned long pfn;
+
+ for (pfn = base_pfn; pfn < base_pfn + count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
+ }
+
+ return 0;
+
+err_free_fdt:
+ put_page(kho_out.ser.fdt);
+ kho_out.ser.fdt = NULL;
+err_free_scratch:
+ for (int i = 0; i < kho_scratch_cnt; i++) {
+ void *start = __va(kho_scratch[i].addr);
+ void *end = start + kho_scratch[i].size;
+
+ free_reserved_area(start, end, -1, "");
+ }
+ kho_enable = false;
+ return err;
+}
+late_initcall(kho_init);
+
+static void __init kho_release_scratch(void)
+{
+ phys_addr_t start, end;
+ u64 i;
+
+ memmap_init_kho_scratch_pages();
+
+ /*
+ * Mark scratch mem as CMA before we return it. That way we
+ * ensure that no kernel allocations happen on it. That means
+ * we can reuse it as scratch memory again later.
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
+ ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
+ ulong end_pfn = pageblock_align(PFN_UP(end));
+ ulong pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
+ set_pageblock_migratetype(pfn_to_page(pfn),
+ MIGRATE_CMA);
+ }
+}
+
+void __init kho_memory_init(void)
+{
+ struct folio *folio;
+
+ if (kho_in.scratch_phys) {
+ kho_scratch = phys_to_virt(kho_in.scratch_phys);
+ kho_release_scratch();
+
+ kho_mem_deserialize(kho_get_fdt());
+ folio = kho_restore_folio(kho_in.fdt_phys);
+ if (!folio)
+ pr_warn("failed to restore folio for KHO fdt\n");
+ } else {
+ kho_reserve_scratch();
+ }
+}
+
+void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
+ phys_addr_t scratch_phys, u64 scratch_len)
+{
+ void *fdt = NULL;
+ struct kho_scratch *scratch = NULL;
+ int err = 0;
+ unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
+
+ /* Validate the input FDT */
+ fdt = early_memremap(fdt_phys, fdt_len);
+ if (!fdt) {
+ pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
+ err = -EFAULT;
+ goto out;
+ }
+ err = fdt_check_header(fdt);
+ if (err) {
+ pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
+ fdt_phys, err);
+ err = -EINVAL;
+ goto out;
+ }
+ err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
+ if (err) {
+ pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
+ fdt_phys, KHO_FDT_COMPATIBLE, err);
+ err = -EINVAL;
+ goto out;
+ }
+
+ scratch = early_memremap(scratch_phys, scratch_len);
+ if (!scratch) {
+ pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
+ scratch_phys, scratch_len);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * We pass a safe contiguous blocks of memory to use for early boot
+ * purporses from the previous kernel so that we can resize the
+ * memblock array as needed.
+ */
+ for (int i = 0; i < scratch_cnt; i++) {
+ struct kho_scratch *area = &scratch[i];
+ u64 size = area->size;
+
+ memblock_add(area->addr, size);
+ err = memblock_mark_kho_scratch(area->addr, size);
+ if (WARN_ON(err)) {
+ pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
+ &area->addr, &size, err);
+ goto out;
+ }
+ pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
+ }
+
+ memblock_reserve(scratch_phys, scratch_len);
+
+ /*
+ * Now that we have a viable region of scratch memory, let's tell
+ * the memblocks allocator to only use that for any allocations.
+ * That way we ensure that nothing scribbles over in use data while
+ * we initialize the page tables which we will need to ingest all
+ * memory reservations from the previous kernel.
+ */
+ memblock_set_kho_scratch_only();
+
+ kho_in.fdt_phys = fdt_phys;
+ kho_in.scratch_phys = scratch_phys;
+ kho_scratch_cnt = scratch_cnt;
+ pr_info("found kexec handover data. Will skip init for some devices\n");
+
+out:
+ if (fdt)
+ early_memunmap(fdt, fdt_len);
+ if (scratch)
+ early_memunmap(scratch, scratch_len);
+ if (err)
+ pr_warn("disabling KHO revival: %d\n", err);
+}
+
+/* Helper functions for kexec_file_load */
+
+int kho_fill_kimage(struct kimage *image)
+{
+ ssize_t scratch_size;
+ int err = 0;
+ struct kexec_buf scratch;
+
+ if (!kho_enable)
+ return 0;
+
+ image->kho.fdt = page_to_phys(kho_out.ser.fdt);
+
+ scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
+ scratch = (struct kexec_buf){
+ .image = image,
+ .buffer = kho_scratch,
+ .bufsz = scratch_size,
+ .mem = KEXEC_BUF_MEM_UNKNOWN,
+ .memsz = scratch_size,
+ .buf_align = SZ_64K, /* Makes it easier to map */
+ .buf_max = ULONG_MAX,
+ .top_down = true,
+ };
+ err = kexec_add_buffer(&scratch);
+ if (err)
+ return err;
+ image->kho.scratch = &image->segment[image->nr_segments - 1];
+
+ return 0;
+}
+
+static int kho_walk_scratch(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < kho_scratch_cnt; i++) {
+ struct resource res = {
+ .start = kho_scratch[i].addr,
+ .end = kho_scratch[i].addr + kho_scratch[i].size - 1,
+ };
+
+ /* Try to fit the kimage into our KHO scratch region */
+ ret = func(&res, kbuf);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret;
+
+ if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
+ return 1;
+
+ ret = kho_walk_scratch(kbuf, func);
+
+ return ret == 1 ? 0 : -EADDRNOTAVAIL;
+}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index d35d9792402d..30a733a55a67 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -39,4 +39,20 @@ extern size_t kexec_purgatory_size;
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
#endif /* CONFIG_KEXEC_FILE */
+
+struct kexec_buf;
+
+#ifdef CONFIG_KEXEC_HANDOVER
+int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *));
+int kho_fill_kimage(struct kimage *image);
+#else
+static inline int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 1;
+}
+
+static inline int kho_fill_kimage(struct kimage *image) { return 0; }
+#endif /* CONFIG_KEXEC_HANDOVER */
#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 555e2b3a665a..61fa97da7989 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -29,6 +29,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#include <linux/hung_task.h>
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
@@ -191,7 +192,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct list_head *list)
{
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
- WRITE_ONCE(current->blocker_mutex, lock);
+ hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);
#endif
debug_mutex_add_waiter(lock, waiter, current);
@@ -209,7 +210,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
debug_mutex_remove_waiter(lock, waiter, current);
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
- WRITE_ONCE(current->blocker_mutex, NULL);
+ hung_task_clear_blocker();
#endif
}
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index de9117c0e671..3ef032e22f7e 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -34,6 +34,7 @@
#include <linux/spinlock.h>
#include <linux/ftrace.h>
#include <trace/events/lock.h>
+#include <linux/hung_task.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
@@ -41,6 +42,41 @@ static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+ WRITE_ONCE((sem)->last_holder, (unsigned long)current);
+}
+
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+ if (READ_ONCE((sem)->last_holder) == (unsigned long)current)
+ WRITE_ONCE((sem)->last_holder, 0UL);
+}
+
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return READ_ONCE(sem->last_holder);
+}
+#else
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+}
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+}
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return 0UL;
+}
+#endif
+
+static inline void __sem_acquire(struct semaphore *sem)
+{
+ sem->count--;
+ hung_task_sem_set_holder(sem);
+}
+
/**
* down - acquire the semaphore
* @sem: the semaphore to be acquired
@@ -59,7 +95,7 @@ void __sched down(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
__down(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -83,7 +119,7 @@ int __sched down_interruptible(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_interruptible(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -110,7 +146,7 @@ int __sched down_killable(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_killable(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -140,7 +176,7 @@ int __sched down_trylock(struct semaphore *sem)
raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
- sem->count = count;
+ __sem_acquire(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
@@ -165,7 +201,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -187,6 +223,9 @@ void __sched up(struct semaphore *sem)
DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
+
+ hung_task_sem_clear_if_holder(sem);
+
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
@@ -228,8 +267,10 @@ static inline int __sched ___down_common(struct semaphore *sem, long state,
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
- if (waiter.up)
+ if (waiter.up) {
+ hung_task_sem_set_holder(sem);
return 0;
+ }
}
timed_out:
@@ -246,10 +287,14 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
{
int ret;
+ hung_task_set_blocker(sem, BLOCKER_TYPE_SEM);
+
trace_contention_begin(sem, 0);
ret = ___down_common(sem, state, timeout);
trace_contention_end(sem, ret);
+ hung_task_clear_blocker();
+
return ret;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 047ea3215312..b0b9a8bf4560 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -307,12 +307,10 @@ static void panic_other_cpus_shutdown(bool crash_kexec)
}
/**
- * panic - halt the system
- * @fmt: The text string to print
+ * panic - halt the system
+ * @fmt: The text string to print
*
- * Display a message, then perform cleanups.
- *
- * This function never returns.
+ * Display a message, then perform cleanups. This function never returns.
*/
void panic(const char *fmt, ...)
{
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4e6e24e8b854..2af36cfe35cd 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1094,16 +1094,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
((unsigned long long) region->end_pfn << PAGE_SHIFT)
- 1);
- for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
- if (pfn_valid(pfn)) {
- /*
- * It is safe to ignore the result of
- * mem_bm_set_bit_check() here, since we won't
- * touch the PFNs for which the error is
- * returned anyway.
- */
- mem_bm_set_bit_check(bm, pfn);
- }
+ for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) {
+ /*
+ * It is safe to ignore the result of
+ * mem_bm_set_bit_check() here, since we won't
+ * touch the PFNs for which the error is
+ * returned anyway.
+ */
+ mem_bm_set_bit_check(bm, pfn);
+ }
}
}
@@ -1255,21 +1254,20 @@ static void mark_free_pages(struct zone *zone)
spin_lock_irqsave(&zone->lock, flags);
max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
+ for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) {
+ page = pfn_to_page(pfn);
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
- if (page_zone(page) != zone)
- continue;
+ if (page_zone(page) != zone)
+ continue;
- if (!swsusp_page_is_forbidden(page))
- swsusp_unset_page_free(page);
- }
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
+ }
for_each_migratetype_order(order, t) {
list_for_each_entry(page,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d5f89f9ef29f..75a84efad40f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -921,7 +921,6 @@ ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
unsigned long args[ARRAY_SIZE(info->entry.args)];
int i;
- info->op = PTRACE_SYSCALL_INFO_ENTRY;
info->entry.nr = syscall_get_nr(child, regs);
syscall_get_arguments(child, regs, args);
for (i = 0; i < ARRAY_SIZE(args); i++)
@@ -943,10 +942,12 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
* diverge significantly enough.
*/
ptrace_get_syscall_info_entry(child, regs, info);
- info->op = PTRACE_SYSCALL_INFO_SECCOMP;
info->seccomp.ret_data = child->ptrace_message;
- /* ret_data is the last field in struct ptrace_syscall_info.seccomp */
+ /*
+ * ret_data is the last non-reserved field
+ * in struct ptrace_syscall_info.seccomp
+ */
return offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
}
@@ -954,7 +955,6 @@ static unsigned long
ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
struct ptrace_syscall_info *info)
{
- info->op = PTRACE_SYSCALL_INFO_EXIT;
info->exit.rval = syscall_get_error(child, regs);
info->exit.is_error = !!info->exit.rval;
if (!info->exit.is_error)
@@ -965,19 +965,8 @@ ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
}
static int
-ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
- void __user *datavp)
+ptrace_get_syscall_info_op(struct task_struct *child)
{
- struct pt_regs *regs = task_pt_regs(child);
- struct ptrace_syscall_info info = {
- .op = PTRACE_SYSCALL_INFO_NONE,
- .arch = syscall_get_arch(child),
- .instruction_pointer = instruction_pointer(regs),
- .stack_pointer = user_stack_pointer(regs),
- };
- unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
- unsigned long write_size;
-
/*
* This does not need lock_task_sighand() to access
* child->last_siginfo because ptrace_freeze_traced()
@@ -988,24 +977,160 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
case SIGTRAP | 0x80:
switch (child->ptrace_message) {
case PTRACE_EVENTMSG_SYSCALL_ENTRY:
- actual_size = ptrace_get_syscall_info_entry(child, regs,
- &info);
- break;
+ return PTRACE_SYSCALL_INFO_ENTRY;
case PTRACE_EVENTMSG_SYSCALL_EXIT:
- actual_size = ptrace_get_syscall_info_exit(child, regs,
- &info);
- break;
+ return PTRACE_SYSCALL_INFO_EXIT;
+ default:
+ return PTRACE_SYSCALL_INFO_NONE;
}
- break;
case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8):
- actual_size = ptrace_get_syscall_info_seccomp(child, regs,
- &info);
+ return PTRACE_SYSCALL_INFO_SECCOMP;
+ default:
+ return PTRACE_SYSCALL_INFO_NONE;
+ }
+}
+
+static int
+ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
+ void __user *datavp)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ struct ptrace_syscall_info info = {
+ .op = ptrace_get_syscall_info_op(child),
+ .arch = syscall_get_arch(child),
+ .instruction_pointer = instruction_pointer(regs),
+ .stack_pointer = user_stack_pointer(regs),
+ };
+ unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
+ unsigned long write_size;
+
+ switch (info.op) {
+ case PTRACE_SYSCALL_INFO_ENTRY:
+ actual_size = ptrace_get_syscall_info_entry(child, regs, &info);
+ break;
+ case PTRACE_SYSCALL_INFO_EXIT:
+ actual_size = ptrace_get_syscall_info_exit(child, regs, &info);
+ break;
+ case PTRACE_SYSCALL_INFO_SECCOMP:
+ actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info);
break;
}
write_size = min(actual_size, user_size);
return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
}
+
+static int
+ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ unsigned long args[ARRAY_SIZE(info->entry.args)];
+ int nr = info->entry.nr;
+ int i;
+
+ /*
+ * Check that the syscall number specified in info->entry.nr
+ * is either a value of type "int" or a sign-extended value
+ * of type "int".
+ */
+ if (nr != info->entry.nr)
+ return -ERANGE;
+
+ for (i = 0; i < ARRAY_SIZE(args); i++) {
+ args[i] = info->entry.args[i];
+ /*
+ * Check that the syscall argument specified in
+ * info->entry.args[i] is either a value of type
+ * "unsigned long" or a sign-extended value of type "long".
+ */
+ if (args[i] != info->entry.args[i])
+ return -ERANGE;
+ }
+
+ syscall_set_nr(child, regs, nr);
+ /*
+ * If the syscall number is set to -1, setting syscall arguments is not
+ * just pointless, it would also clobber the syscall return value on
+ * those architectures that share the same register both for the first
+ * argument of syscall and its return value.
+ */
+ if (nr != -1)
+ syscall_set_arguments(child, regs, args);
+
+ return 0;
+}
+
+static int
+ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ /*
+ * info->entry is currently a subset of info->seccomp,
+ * info->seccomp.ret_data is currently ignored.
+ */
+ return ptrace_set_syscall_info_entry(child, regs, info);
+}
+
+static int
+ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ long rval = info->exit.rval;
+
+ /*
+ * Check that the return value specified in info->exit.rval
+ * is either a value of type "long" or a sign-extended value
+ * of type "long".
+ */
+ if (rval != info->exit.rval)
+ return -ERANGE;
+
+ if (info->exit.is_error)
+ syscall_set_return_value(child, regs, rval, 0);
+ else
+ syscall_set_return_value(child, regs, 0, rval);
+
+ return 0;
+}
+
+static int
+ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size,
+ const void __user *datavp)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ struct ptrace_syscall_info info;
+
+ if (user_size < sizeof(info))
+ return -EINVAL;
+
+ /*
+ * The compatibility is tracked by info.op and info.flags: if user-space
+ * does not instruct us to use unknown extra bits from future versions
+ * of ptrace_syscall_info, we are not going to read them either.
+ */
+ if (copy_from_user(&info, datavp, sizeof(info)))
+ return -EFAULT;
+
+ /* Reserved for future use. */
+ if (info.flags || info.reserved)
+ return -EINVAL;
+
+ /* Changing the type of the system call stop is not supported yet. */
+ if (ptrace_get_syscall_info_op(child) != info.op)
+ return -EINVAL;
+
+ switch (info.op) {
+ case PTRACE_SYSCALL_INFO_ENTRY:
+ return ptrace_set_syscall_info_entry(child, regs, &info);
+ case PTRACE_SYSCALL_INFO_EXIT:
+ return ptrace_set_syscall_info_exit(child, regs, &info);
+ case PTRACE_SYSCALL_INFO_SECCOMP:
+ return ptrace_set_syscall_info_seccomp(child, regs, &info);
+ default:
+ /* Other types of system call stops are not supported yet. */
+ return -EINVAL;
+ }
+}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
int ptrace_request(struct task_struct *child, long request,
@@ -1224,6 +1349,10 @@ int ptrace_request(struct task_struct *child, long request,
case PTRACE_GET_SYSCALL_INFO:
ret = ptrace_get_syscall_info(child, addr, datavp);
break;
+
+ case PTRACE_SET_SYSCALL_INFO:
+ ret = ptrace_set_syscall_info(child, addr, datavp);
+ break;
#endif
case PTRACE_SECCOMP_GET_FILTER:
diff --git a/kernel/relay.c b/kernel/relay.c
index 5ac7e711e4b6..c0c93a04d4ce 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -452,7 +452,7 @@ int relay_prepare_cpu(unsigned int cpu)
/**
* relay_open - create a new relay channel
- * @base_filename: base name of files to create, %NULL for buffering only
+ * @base_filename: base name of files to create
* @parent: dentry of parent directory, %NULL for root directory or buffer
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
@@ -465,10 +465,6 @@ int relay_prepare_cpu(unsigned int cpu)
* attributes specified. The created channel buffer files
* will be named base_filename0...base_filenameN-1. File
* permissions will be %S_IRUSR.
- *
- * If opening a buffer (@parent = NULL) that you later wish to register
- * in a filesystem, call relay_late_setup_files() once the @parent dentry
- * is available.
*/
struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
@@ -540,111 +536,6 @@ struct rchan_percpu_buf_dispatcher {
struct dentry *dentry;
};
-/* Called in atomic context. */
-static void __relay_set_buf_dentry(void *info)
-{
- struct rchan_percpu_buf_dispatcher *p = info;
-
- relay_set_buf_dentry(p->buf, p->dentry);
-}
-
-/**
- * relay_late_setup_files - triggers file creation
- * @chan: channel to operate on
- * @base_filename: base name of files to create
- * @parent: dentry of parent directory, %NULL for root directory
- *
- * Returns 0 if successful, non-zero otherwise.
- *
- * Use to setup files for a previously buffer-only channel created
- * by relay_open() with a NULL parent dentry.
- *
- * For example, this is useful for perfomring early tracing in kernel,
- * before VFS is up and then exposing the early results once the dentry
- * is available.
- */
-int relay_late_setup_files(struct rchan *chan,
- const char *base_filename,
- struct dentry *parent)
-{
- int err = 0;
- unsigned int i, curr_cpu;
- unsigned long flags;
- struct dentry *dentry;
- struct rchan_buf *buf;
- struct rchan_percpu_buf_dispatcher disp;
-
- if (!chan || !base_filename)
- return -EINVAL;
-
- strscpy(chan->base_filename, base_filename, NAME_MAX);
-
- mutex_lock(&relay_channels_mutex);
- /* Is chan already set up? */
- if (unlikely(chan->has_base_filename)) {
- mutex_unlock(&relay_channels_mutex);
- return -EEXIST;
- }
- chan->has_base_filename = 1;
- chan->parent = parent;
-
- if (chan->is_global) {
- err = -EINVAL;
- buf = *per_cpu_ptr(chan->buf, 0);
- if (!WARN_ON_ONCE(!buf)) {
- dentry = relay_create_buf_file(chan, buf, 0);
- if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
- relay_set_buf_dentry(buf, dentry);
- err = 0;
- }
- }
- mutex_unlock(&relay_channels_mutex);
- return err;
- }
-
- curr_cpu = get_cpu();
- /*
- * The CPU hotplug notifier ran before us and created buffers with
- * no files associated. So it's safe to call relay_setup_buf_file()
- * on all currently online CPUs.
- */
- for_each_online_cpu(i) {
- buf = *per_cpu_ptr(chan->buf, i);
- if (unlikely(!buf)) {
- WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
- err = -EINVAL;
- break;
- }
-
- dentry = relay_create_buf_file(chan, buf, i);
- if (unlikely(!dentry)) {
- err = -EINVAL;
- break;
- }
-
- if (curr_cpu == i) {
- local_irq_save(flags);
- relay_set_buf_dentry(buf, dentry);
- local_irq_restore(flags);
- } else {
- disp.buf = buf;
- disp.dentry = dentry;
- smp_mb();
- /* relay_channels_mutex must be held, so wait. */
- err = smp_call_function_single(i,
- __relay_set_buf_dentry,
- &disp, 1);
- }
- if (unlikely(err))
- break;
- }
- put_cpu();
- mutex_unlock(&relay_channels_mutex);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(relay_late_setup_files);
-
/**
* relay_switch_subbuf - switch to a new sub-buffer
* @buf: channel buffer
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 125912c0e9dd..b4326827e326 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3329,6 +3329,15 @@ static void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
+ /*
+ * Memory is pinned to only one NUMA node via cpuset.mems, naturally
+ * no page can be migrated.
+ */
+ if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
+ trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
+ return;
+ }
+
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6859008ca34d..e24509bd0af5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2226,7 +2226,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL;
struct ring_buffer_cpu_meta *meta;
struct buffer_page *bpage;
struct page *page;
@@ -2252,7 +2252,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
if (!bpage)
- goto fail_free_buffer;
+ return NULL;
rb_check_bpage(cpu_buffer, bpage);
@@ -2318,13 +2318,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_head_page_activate(cpu_buffer);
}
- return cpu_buffer;
+ return_ptr(cpu_buffer);
fail_free_reader:
free_buffer_page(cpu_buffer->reader_page);
- fail_free_buffer:
- kfree(cpu_buffer);
return NULL;
}
@@ -2359,7 +2357,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
unsigned long scratch_size,
struct lock_class_key *key)
{
- struct trace_buffer *buffer;
+ struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
int subbuf_size;
int bsize;
@@ -2373,7 +2371,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
return NULL;
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
- goto fail_free_buffer;
+ return NULL;
buffer->subbuf_order = order;
subbuf_size = (PAGE_SIZE << order);
@@ -2472,7 +2470,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
- return buffer;
+ return_ptr(buffer);
fail_free_buffers:
for_each_buffer_cpu(buffer, cpu) {
@@ -2484,8 +2482,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
- fail_free_buffer:
- kfree(buffer);
return NULL;
}
@@ -2849,6 +2845,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
+ /*
+ * Keep CPUs from coming online while resizing to synchronize
+ * with new per CPU buffers being created.
+ */
+ guard(cpus_read_lock)();
+
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
atomic_inc(&buffer->resizing);
@@ -2893,7 +2895,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cond_resched();
}
- cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -2933,7 +2934,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
@@ -2961,8 +2961,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
goto out_err;
}
- cpus_read_lock();
-
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
@@ -2981,7 +2979,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- cpus_read_unlock();
}
out:
@@ -4684,10 +4681,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
rb_decrement_entry(cpu_buffer, event);
- if (rb_try_to_discard(cpu_buffer, event))
- goto out;
-
- out:
+ rb_try_to_discard(cpu_buffer, event);
rb_end_commit(cpu_buffer);
trace_recursive_unlock(cpu_buffer);
@@ -6020,6 +6014,39 @@ static void rb_clear_buffer_page(struct buffer_page *page)
page->read = 0;
}
+/*
+ * When the buffer is memory mapped to user space, each sub buffer
+ * has a unique id that is used by the meta data to tell the user
+ * where the current reader page is.
+ *
+ * For a normal allocated ring buffer, the id is saved in the buffer page
+ * id field, and updated via this function.
+ *
+ * But for a fixed memory mapped buffer, the id is already assigned for
+ * fixed memory ording in the memory layout and can not be used. Instead
+ * the index of where the page lies in the memory layout is used.
+ *
+ * For the normal pages, set the buffer page id with the passed in @id
+ * value and return that.
+ *
+ * For fixed memory mapped pages, get the page index in the memory layout
+ * and return that as the id.
+ */
+static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage, int id)
+{
+ /*
+ * For boot buffers, the id is the index,
+ * otherwise, set the buffer page with this id
+ */
+ if (cpu_buffer->ring_meta)
+ id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page);
+ else
+ bpage->id = id;
+
+ return id;
+}
+
static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
@@ -6028,7 +6055,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
return;
meta->reader.read = cpu_buffer->reader_page->read;
- meta->reader.id = cpu_buffer->reader_page->id;
+ meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page,
+ cpu_buffer->reader_page->id);
+
meta->reader.lost_events = cpu_buffer->lost_events;
meta->entries = local_read(&cpu_buffer->entries);
@@ -6098,21 +6127,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
/* Must have disabled the cpu buffer then done a synchronize_rcu */
static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ return;
arch_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
arch_spin_unlock(&cpu_buffer->lock);
-
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
/**
@@ -6300,37 +6324,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
!cpumask_test_cpu(cpu, buffer_b->cpumask))
- goto out;
+ return -EINVAL;
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
/* It's up to the callers to not try to swap mapped buffers */
- if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
- ret = -EBUSY;
- goto out;
- }
+ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped))
+ return -EBUSY;
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
- goto out;
+ return -EINVAL;
if (buffer_a->subbuf_order != buffer_b->subbuf_order)
- goto out;
-
- ret = -EAGAIN;
+ return -EINVAL;
if (atomic_read(&buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
/*
* We can't do a synchronize_rcu here because this
@@ -6367,7 +6387,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
out_dec:
atomic_dec(&cpu_buffer_a->record_disabled);
atomic_dec(&cpu_buffer_b->record_disabled);
-out:
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
@@ -6526,38 +6545,37 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
struct buffer_data_page *bpage;
struct buffer_page *reader;
unsigned long missed_events;
- unsigned long flags;
unsigned int commit;
unsigned int read;
u64 save_timestamp;
- int ret = -1;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- goto out;
+ return -1;
/*
* If len is not big enough to hold the page header, then
* we can not copy anything.
*/
if (len <= BUF_PAGE_HDR_SIZE)
- goto out;
+ return -1;
len -= BUF_PAGE_HDR_SIZE;
if (!data_page || !data_page->data)
- goto out;
+ return -1;
+
if (data_page->order != buffer->subbuf_order)
- goto out;
+ return -1;
bpage = data_page->data;
if (!bpage)
- goto out;
+ return -1;
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
- goto out_unlock;
+ return -1;
event = rb_reader_event(cpu_buffer);
@@ -6591,7 +6609,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (full &&
(!read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page))
- goto out_unlock;
+ return -1;
if (len > (commit - read))
len = (commit - read);
@@ -6600,7 +6618,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
size = rb_event_ts_length(event);
if (len < size)
- goto out_unlock;
+ return -1;
/* save the current timestamp, since the user will need it */
save_timestamp = cpu_buffer->read_stamp;
@@ -6658,7 +6676,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (reader->real_end)
local_set(&bpage->commit, reader->real_end);
}
- ret = read;
cpu_buffer->lost_events = 0;
@@ -6685,11 +6702,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (commit < buffer->subbuf_size)
memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
- out_unlock:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- out:
- return ret;
+ return read;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
@@ -6944,23 +6957,29 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
struct buffer_page *first_subbuf, *subbuf;
+ int cnt = 0;
int id = 0;
- subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
- cpu_buffer->reader_page->id = id++;
+ id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
+ subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+ cnt++;
first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
do {
+ id = rb_page_id(cpu_buffer, subbuf, id);
+
if (WARN_ON(id >= nr_subbufs))
break;
subbuf_ids[id] = (unsigned long)subbuf->page;
- subbuf->id = id;
rb_inc_page(&subbuf);
id++;
+ cnt++;
} while (subbuf != first_subbuf);
+ WARN_ON(cnt != nr_subbufs);
+
/* install subbuf ID to kern VA translation */
cpu_buffer->subbuf_ids = subbuf_ids;
@@ -7052,7 +7071,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
unsigned int subbuf_pages, subbuf_order;
- struct page **pages;
+ struct page **pages __free(kfree) = NULL;
int p = 0, s = 0;
int err;
@@ -7120,10 +7139,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
struct page *page;
int off = 0;
- if (WARN_ON_ONCE(s >= nr_subbufs)) {
- err = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(s >= nr_subbufs))
+ return -EINVAL;
page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
@@ -7138,9 +7155,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
-out:
- kfree(pages);
-
return err;
}
#else
@@ -7156,36 +7170,34 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags, *subbuf_ids;
- int err = 0;
+ int err;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- mutex_lock(&cpu_buffer->mapping_lock);
+ guard(mutex)(&cpu_buffer->mapping_lock);
if (cpu_buffer->user_mapped) {
err = __rb_map_vma(cpu_buffer, vma);
if (!err)
err = __rb_inc_dec_mapped(cpu_buffer, true);
- mutex_unlock(&cpu_buffer->mapping_lock);
return err;
}
/* prevent another thread from changing buffer/sub-buffer sizes */
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
err = rb_alloc_meta_page(cpu_buffer);
if (err)
- goto unlock;
+ return err;
/* subbuf_ids include the reader while nr_pages does not */
subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
if (!subbuf_ids) {
rb_free_meta_page(cpu_buffer);
- err = -ENOMEM;
- goto unlock;
+ return -ENOMEM;
}
atomic_inc(&cpu_buffer->resize_disabled);
@@ -7213,35 +7225,29 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
atomic_dec(&cpu_buffer->resize_disabled);
}
-unlock:
- mutex_unlock(&buffer->mutex);
- mutex_unlock(&cpu_buffer->mapping_lock);
-
- return err;
+ return 0;
}
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
- int err = 0;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- mutex_lock(&cpu_buffer->mapping_lock);
+ guard(mutex)(&cpu_buffer->mapping_lock);
if (!cpu_buffer->user_mapped) {
- err = -ENODEV;
- goto out;
+ return -ENODEV;
} else if (cpu_buffer->user_mapped > 1) {
__rb_inc_dec_mapped(cpu_buffer, false);
- goto out;
+ return 0;
}
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/* This is the last user space mapping */
@@ -7256,12 +7262,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
rb_free_meta_page(cpu_buffer);
atomic_dec(&cpu_buffer->resize_disabled);
- mutex_unlock(&buffer->mutex);
-
-out:
- mutex_unlock(&cpu_buffer->mapping_lock);
-
- return err;
+ return 0;
}
int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
@@ -7302,8 +7303,8 @@ consume:
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
- if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
- if (missed_events) {
+ if (missed_events) {
+ if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
struct buffer_data_page *bpage = reader->page;
unsigned int commit;
/*
@@ -7324,13 +7325,23 @@ consume:
local_add(RB_MISSED_STORED, &bpage->commit);
}
local_add(RB_MISSED_EVENTS, &bpage->commit);
+ } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
+ "Reader on commit with %ld missed events",
+ missed_events)) {
+ /*
+ * There shouldn't be any missed events if the tail_page
+ * is on the reader page. But if the tail page is not on the
+ * reader page and the commit_page is, that would mean that
+ * there's a commit_overrun (an interrupt preempted an
+ * addition of an event and then filled the buffer
+ * with new events). In this case it's not an
+ * error, but it should still be reported.
+ *
+ * TODO: Add missed events to the page for user space to know.
+ */
+ pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n",
+ cpu, missed_events, cpu_buffer->reader_page->page->time_stamp);
}
- } else {
- /*
- * There really shouldn't be any missed events if the commit
- * is on the reader page.
- */
- WARN_ON_ONCE(missed_events);
}
cpu_buffer->lost_events = 0;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d70b0351fb61..95ae7c4e5835 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -51,6 +51,7 @@
#include <linux/workqueue.h>
#include <linux/sort.h>
#include <linux/io.h> /* vmap_page_range() */
+#include <linux/fs_context.h>
#include <asm/setup.h> /* COMMAND_LINE_SIZE */
@@ -6711,6 +6712,22 @@ static int tracing_wait_pipe(struct file *filp)
return 1;
}
+static bool update_last_data_if_empty(struct trace_array *tr)
+{
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return false;
+
+ if (!ring_buffer_empty(tr->array_buffer.buffer))
+ return false;
+
+ /*
+ * If the buffer contains the last boot data and all per-cpu
+ * buffers are empty, reset it from the kernel side.
+ */
+ update_last_data(tr);
+ return true;
+}
+
/*
* Consumer reader.
*/
@@ -6742,6 +6759,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
}
waitagain:
+ if (update_last_data_if_empty(iter->tr))
+ return 0;
+
sret = tracing_wait_pipe(filp);
if (sret <= 0)
return sret;
@@ -8320,6 +8340,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (ret < 0) {
if (trace_empty(iter) && !iter->closed) {
+ if (update_last_data_if_empty(iter->tr))
+ return 0;
+
if ((filp->f_flags & O_NONBLOCK))
return -EAGAIN;
@@ -8661,10 +8684,6 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP)
return -ENODEV;
- /* Currently the boot mapped buffer is not supported for mmap */
- if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
- return -ENODEV;
-
ret = get_snapshot_map(iter->tr);
if (ret)
return ret;
@@ -10241,6 +10260,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
struct file_system_type *type;
+ struct fs_context *fc;
+ int ret;
/*
* To maintain backward compatibility for tools that mount
@@ -10250,12 +10271,20 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
type = get_fs_type("tracefs");
if (!type)
return NULL;
- mnt = vfs_submount(mntpt, type, "tracefs", NULL);
+
+ fc = fs_context_for_submount(type, mntpt);
put_filesystem(type);
- if (IS_ERR(mnt))
- return NULL;
- mntget(mnt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ret = vfs_parse_fs_string(fc, "source",
+ "tracefs", strlen("tracefs"));
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+ put_fs_context(fc);
return mnt;
}
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index 1fec61603ef3..e066d31d08f8 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -210,6 +210,10 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE);
#define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
+#ifdef CONFIG_UNACCEPTED_MEMORY
+#define PAGE_UNACCEPTED_MAPCOUNT_VALUE (PGTY_unaccepted << 24)
+ VMCOREINFO_NUMBER(PAGE_UNACCEPTED_MAPCOUNT_VALUE);
+#endif
#ifdef CONFIG_KALLSYMS
VMCOREINFO_SYMBOL(kallsyms_names);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9fa2af9dbf2c..80b56c002c7f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_thresh_next;
static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
@@ -63,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
*/
unsigned int __read_mostly hardlockup_panic =
IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
+
+#ifdef CONFIG_SYSFS
+
+static unsigned int hardlockup_count;
+
+static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%u\n", hardlockup_count);
+}
+
+static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count);
+
+static __init int kernel_hardlockup_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL);
+ return 0;
+}
+
+late_initcall(kernel_hardlockup_sysfs_init);
+
+#endif // CONFIG_SYSFS
+
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -169,6 +193,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
unsigned int this_cpu = smp_processor_id();
unsigned long flags;
+#ifdef CONFIG_SYSFS
+ ++hardlockup_count;
+#endif
+
/* Only print hardlockups once. */
if (per_cpu(watchdog_hardlockup_warned, cpu))
return;
@@ -311,6 +339,28 @@ unsigned int __read_mostly softlockup_panic =
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
+#ifdef CONFIG_SYSFS
+
+static unsigned int softlockup_count;
+
+static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%u\n", softlockup_count);
+}
+
+static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count);
+
+static __init int kernel_softlockup_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL);
+ return 0;
+}
+
+late_initcall(kernel_softlockup_sysfs_init);
+
+#endif // CONFIG_SYSFS
+
/* Timestamp taken after the last successful reschedule. */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
/* Timestamp of the last softlockup report. */
@@ -742,6 +792,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
touch_ts = __this_cpu_read(watchdog_touch_ts);
duration = is_softlockup(touch_ts, period_ts, now);
if (unlikely(duration)) {
+#ifdef CONFIG_SYSFS
+ ++softlockup_count;
+#endif
+
/*
* Prevent multiple soft-lockup reports if one cpu is already
* engaged in dumping all cpu back traces.
@@ -870,12 +924,20 @@ int lockup_detector_offline_cpu(unsigned int cpu)
return 0;
}
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
softlockup_stop_all();
+ /*
+ * To prevent watchdog_timer_fn from using the old interval and
+ * the new watchdog_thresh at the same time, which could lead to
+ * false softlockup reports, it is necessary to update the
+ * watchdog_thresh after the softlockup is completed.
+ */
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
@@ -888,7 +950,7 @@ static void __lockup_detector_reconfigure(void)
void lockup_detector_reconfigure(void)
{
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
mutex_unlock(&watchdog_mutex);
}
@@ -908,27 +970,29 @@ static __init void lockup_detector_setup(void)
return;
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
lockup_detector_update_enable();
watchdog_hardlockup_start();
cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
static inline void lockup_detector_setup(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -946,11 +1010,11 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
/* Propagate any changes to the watchdog infrastructure */
-static void proc_watchdog_update(void)
+static void proc_watchdog_update(bool thresh_changed)
{
/* Remove impossible cpus to keep sysctl output clean. */
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(thresh_changed);
}
/*
@@ -984,7 +1048,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr
} else {
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!err && old != READ_ONCE(*param))
- proc_watchdog_update();
+ proc_watchdog_update(false);
}
mutex_unlock(&watchdog_mutex);
return err;
@@ -1035,11 +1099,13 @@ static int proc_watchdog_thresh(const struct ctl_table *table, int write,
mutex_lock(&watchdog_mutex);
- old = READ_ONCE(watchdog_thresh);
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
+
+ old = watchdog_thresh_next;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!err && write && old != READ_ONCE(watchdog_thresh))
- proc_watchdog_update();
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
+ proc_watchdog_update(true);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1060,7 +1126,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write,
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write)
- proc_watchdog_update();
+ proc_watchdog_update(false);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1080,7 +1146,7 @@ static const struct ctl_table watchdog_sysctls[] = {
},
{
.procname = "watchdog_thresh",
- .data = &watchdog_thresh,
+ .data = &watchdog_thresh_next,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,