summaryrefslogtreecommitdiff
path: root/mm/memfd_luo.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memfd_luo.c')
-rw-r--r--mm/memfd_luo.c516
1 files changed, 516 insertions, 0 deletions
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
new file mode 100644
index 000000000000..4f6ba63b4310
--- /dev/null
+++ b/mm/memfd_luo.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ *
+ * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
+ * Pratyush Yadav <ptyadav@amazon.de>
+ */
+
+/**
+ * DOC: Memfd Preservation via LUO
+ *
+ * Overview
+ * ========
+ *
+ * Memory file descriptors (memfd) can be preserved over a kexec using the Live
+ * Update Orchestrator (LUO) file preservation. This allows userspace to
+ * transfer its memory contents to the next kernel after a kexec.
+ *
+ * The preservation is not intended to be transparent. Only select properties of
+ * the file are preserved. All others are reset to default. The preserved
+ * properties are described below.
+ *
+ * .. note::
+ * The LUO API is not stabilized yet, so the preserved properties of a memfd
+ * are also not stable and are subject to backwards incompatible changes.
+ *
+ * .. note::
+ * Currently a memfd backed by Hugetlb is not supported. Memfds created
+ * with ``MFD_HUGETLB`` will be rejected.
+ *
+ * Preserved Properties
+ * ====================
+ *
+ * The following properties of the memfd are preserved across kexec:
+ *
+ * File Contents
+ * All data stored in the file is preserved.
+ *
+ * File Size
+ * The size of the file is preserved. Holes in the file are filled by
+ * allocating pages for them during preservation.
+ *
+ * File Position
+ * The current file position is preserved, allowing applications to continue
+ * reading/writing from their last position.
+ *
+ * File Status Flags
+ * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
+ * is maintained.
+ *
+ * Non-Preserved Properties
+ * ========================
+ *
+ * All properties which are not preserved must be assumed to be reset to
+ * default. This section describes some of those properties which may be more of
+ * note.
+ *
+ * ``FD_CLOEXEC`` flag
+ * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
+ * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
+ * again after restore via ``fcntl()``.
+ *
+ * Seals
+ * File seals are not preserved. The file is unsealed on restore and if
+ * needed, must be sealed again via ``fcntl()``.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bits.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/memfd.h>
+#include <linux/liveupdate.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+
+static int memfd_luo_preserve_folios(struct file *file,
+ struct kho_vmalloc *kho_vmalloc,
+ struct memfd_luo_folio_ser **out_folios_ser,
+ u64 *nr_foliosp)
+{
+ struct inode *inode = file_inode(file);
+ struct memfd_luo_folio_ser *folios_ser;
+ unsigned int max_folios;
+ long i, size, nr_pinned;
+ struct folio **folios;
+ int err = -EINVAL;
+ pgoff_t offset;
+ u64 nr_folios;
+
+ size = i_size_read(inode);
+ /*
+ * If the file has zero size, then the folios and nr_folios properties
+ * are not set.
+ */
+ if (!size) {
+ *nr_foliosp = 0;
+ *out_folios_ser = NULL;
+ memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
+ return 0;
+ }
+
+ /*
+ * Guess the number of folios based on inode size. Real number might end
+ * up being smaller if there are higher order folios.
+ */
+ max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
+ folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
+ if (!folios)
+ return -ENOMEM;
+
+ /*
+ * Pin the folios so they don't move around behind our back. This also
+ * ensures none of the folios are in CMA -- which ensures they don't
+ * fall in KHO scratch memory. It also moves swapped out folios back to
+ * memory.
+ *
+ * A side effect of doing this is that it allocates a folio for all
+ * indices in the file. This might waste memory on sparse memfds. If
+ * that is really a problem in the future, we can have a
+ * memfd_pin_folios() variant that does not allocate a page on empty
+ * slots.
+ */
+ nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
+ &offset);
+ if (nr_pinned < 0) {
+ err = nr_pinned;
+ pr_err("failed to pin folios: %d\n", err);
+ goto err_free_folios;
+ }
+ nr_folios = nr_pinned;
+
+ folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
+ if (!folios_ser) {
+ err = -ENOMEM;
+ goto err_unpin;
+ }
+
+ for (i = 0; i < nr_folios; i++) {
+ struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+ struct folio *folio = folios[i];
+ unsigned int flags = 0;
+
+ err = kho_preserve_folio(folio);
+ if (err)
+ goto err_unpreserve;
+
+ if (folio_test_dirty(folio))
+ flags |= MEMFD_LUO_FOLIO_DIRTY;
+ if (folio_test_uptodate(folio))
+ flags |= MEMFD_LUO_FOLIO_UPTODATE;
+
+ pfolio->pfn = folio_pfn(folio);
+ pfolio->flags = flags;
+ pfolio->index = folio->index;
+ }
+
+ err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
+ if (err)
+ goto err_unpreserve;
+
+ kvfree(folios);
+ *nr_foliosp = nr_folios;
+ *out_folios_ser = folios_ser;
+
+ /*
+ * Note: folios_ser is purposely not freed here. It is preserved
+ * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
+ * that is passed via private_data.
+ */
+ return 0;
+
+err_unpreserve:
+ for (i = i - 1; i >= 0; i--)
+ kho_unpreserve_folio(folios[i]);
+ vfree(folios_ser);
+err_unpin:
+ unpin_folios(folios, nr_folios);
+err_free_folios:
+ kvfree(folios);
+
+ return err;
+}
+
+static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
+ struct memfd_luo_folio_ser *folios_ser,
+ u64 nr_folios)
+{
+ long i;
+
+ if (!nr_folios)
+ return;
+
+ kho_unpreserve_vmalloc(kho_vmalloc);
+
+ for (i = 0; i < nr_folios; i++) {
+ const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+ struct folio *folio;
+
+ if (!pfolio->pfn)
+ continue;
+
+ folio = pfn_folio(pfolio->pfn);
+
+ kho_unpreserve_folio(folio);
+ unpin_folio(folio);
+ }
+
+ vfree(folios_ser);
+}
+
+static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
+{
+ struct inode *inode = file_inode(args->file);
+ struct memfd_luo_folio_ser *folios_ser;
+ struct memfd_luo_ser *ser;
+ u64 nr_folios;
+ int err = 0;
+
+ inode_lock(inode);
+ shmem_freeze(inode, true);
+
+ /* Allocate the main serialization structure in preserved memory */
+ ser = kho_alloc_preserve(sizeof(*ser));
+ if (IS_ERR(ser)) {
+ err = PTR_ERR(ser);
+ goto err_unlock;
+ }
+
+ ser->pos = args->file->f_pos;
+ ser->size = i_size_read(inode);
+
+ err = memfd_luo_preserve_folios(args->file, &ser->folios,
+ &folios_ser, &nr_folios);
+ if (err)
+ goto err_free_ser;
+
+ ser->nr_folios = nr_folios;
+ inode_unlock(inode);
+
+ args->private_data = folios_ser;
+ args->serialized_data = virt_to_phys(ser);
+
+ return 0;
+
+err_free_ser:
+ kho_unpreserve_free(ser);
+err_unlock:
+ shmem_freeze(inode, false);
+ inode_unlock(inode);
+ return err;
+}
+
+static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
+{
+ struct memfd_luo_ser *ser;
+
+ if (WARN_ON_ONCE(!args->serialized_data))
+ return -EINVAL;
+
+ ser = phys_to_virt(args->serialized_data);
+
+ /*
+ * The pos might have changed since prepare. Everything else stays the
+ * same.
+ */
+ ser->pos = args->file->f_pos;
+
+ return 0;
+}
+
+static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+ struct inode *inode = file_inode(args->file);
+ struct memfd_luo_ser *ser;
+
+ if (WARN_ON_ONCE(!args->serialized_data))
+ return;
+
+ inode_lock(inode);
+ shmem_freeze(inode, false);
+
+ ser = phys_to_virt(args->serialized_data);
+
+ memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
+ ser->nr_folios);
+
+ kho_unpreserve_free(ser);
+ inode_unlock(inode);
+}
+
+static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
+ u64 nr_folios)
+{
+ u64 i;
+
+ for (i = 0; i < nr_folios; i++) {
+ const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+ struct folio *folio;
+ phys_addr_t phys;
+
+ if (!pfolio->pfn)
+ continue;
+
+ phys = PFN_PHYS(pfolio->pfn);
+ folio = kho_restore_folio(phys);
+ if (!folio) {
+ pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
+ phys);
+ continue;
+ }
+
+ folio_put(folio);
+ }
+}
+
+static void memfd_luo_finish(struct liveupdate_file_op_args *args)
+{
+ struct memfd_luo_folio_ser *folios_ser;
+ struct memfd_luo_ser *ser;
+
+ if (args->retrieved)
+ return;
+
+ ser = phys_to_virt(args->serialized_data);
+ if (!ser)
+ return;
+
+ if (ser->nr_folios) {
+ folios_ser = kho_restore_vmalloc(&ser->folios);
+ if (!folios_ser)
+ goto out;
+
+ memfd_luo_discard_folios(folios_ser, ser->nr_folios);
+ vfree(folios_ser);
+ }
+
+out:
+ kho_restore_free(ser);
+}
+
+static int memfd_luo_retrieve_folios(struct file *file,
+ struct memfd_luo_folio_ser *folios_ser,
+ u64 nr_folios)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio;
+ int err = -EIO;
+ long i;
+
+ for (i = 0; i < nr_folios; i++) {
+ const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+ phys_addr_t phys;
+ u64 index;
+ int flags;
+
+ if (!pfolio->pfn)
+ continue;
+
+ phys = PFN_PHYS(pfolio->pfn);
+ folio = kho_restore_folio(phys);
+ if (!folio) {
+ pr_err("Unable to restore folio at physical address: %llx\n",
+ phys);
+ goto put_folios;
+ }
+ index = pfolio->index;
+ flags = pfolio->flags;
+
+ /* Set up the folio for insertion. */
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
+ if (err) {
+ pr_err("shmem: failed to charge folio index %ld: %d\n",
+ i, err);
+ goto unlock_folio;
+ }
+
+ err = shmem_add_to_page_cache(folio, mapping, index, NULL,
+ mapping_gfp_mask(mapping));
+ if (err) {
+ pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
+ i, err);
+ goto unlock_folio;
+ }
+
+ if (flags & MEMFD_LUO_FOLIO_UPTODATE)
+ folio_mark_uptodate(folio);
+ if (flags & MEMFD_LUO_FOLIO_DIRTY)
+ folio_mark_dirty(folio);
+
+ err = shmem_inode_acct_blocks(inode, 1);
+ if (err) {
+ pr_err("shmem: failed to account folio index %ld: %d\n",
+ i, err);
+ goto unlock_folio;
+ }
+
+ shmem_recalc_inode(inode, 1, 0);
+ folio_add_lru(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return 0;
+
+unlock_folio:
+ folio_unlock(folio);
+ folio_put(folio);
+put_folios:
+ /*
+ * Note: don't free the folios already added to the file. They will be
+ * freed when the file is freed. Free the ones not added yet here.
+ */
+ for (long j = i + 1; j < nr_folios; j++) {
+ const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
+
+ folio = kho_restore_folio(pfolio->pfn);
+ if (folio)
+ folio_put(folio);
+ }
+
+ return err;
+}
+
+static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+ struct memfd_luo_folio_ser *folios_ser;
+ struct memfd_luo_ser *ser;
+ struct file *file;
+ int err;
+
+ ser = phys_to_virt(args->serialized_data);
+ if (!ser)
+ return -EINVAL;
+
+ file = shmem_file_setup("", 0, VM_NORESERVE);
+
+ if (IS_ERR(file)) {
+ pr_err("failed to setup file: %pe\n", file);
+ return PTR_ERR(file);
+ }
+
+ vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
+ file->f_inode->i_size = ser->size;
+
+ if (ser->nr_folios) {
+ folios_ser = kho_restore_vmalloc(&ser->folios);
+ if (!folios_ser) {
+ err = -EINVAL;
+ goto put_file;
+ }
+
+ err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
+ vfree(folios_ser);
+ if (err)
+ goto put_file;
+ }
+
+ args->file = file;
+ kho_restore_free(ser);
+
+ return 0;
+
+put_file:
+ fput(file);
+
+ return err;
+}
+
+static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
+ struct file *file)
+{
+ struct inode *inode = file_inode(file);
+
+ return shmem_file(file) && !inode->i_nlink;
+}
+
+static const struct liveupdate_file_ops memfd_luo_file_ops = {
+ .freeze = memfd_luo_freeze,
+ .finish = memfd_luo_finish,
+ .retrieve = memfd_luo_retrieve,
+ .preserve = memfd_luo_preserve,
+ .unpreserve = memfd_luo_unpreserve,
+ .can_preserve = memfd_luo_can_preserve,
+ .owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler memfd_luo_handler = {
+ .ops = &memfd_luo_file_ops,
+ .compatible = MEMFD_LUO_FH_COMPATIBLE,
+};
+
+static int __init memfd_luo_init(void)
+{
+ int err = liveupdate_register_file_handler(&memfd_luo_handler);
+
+ if (err && err != -EOPNOTSUPP) {
+ pr_err("Could not register luo filesystem handler: %pe\n",
+ ERR_PTR(err));
+
+ return err;
+ }
+
+ return 0;
+}
+late_initcall(memfd_luo_init);