Diffstat (limited to 'mm/memfd_luo.c')
-rw-r--r--   mm/memfd_luo.c   516
1 files changed, 516 insertions, 0 deletions
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
new file mode 100644
index 000000000000..4f6ba63b4310
--- /dev/null
+++ b/mm/memfd_luo.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ *
+ * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
+ * Pratyush Yadav <ptyadav@amazon.de>
+ */
+
+/**
+ * DOC: Memfd Preservation via LUO
+ *
+ * Overview
+ * ========
+ *
+ * Memory file descriptors (memfd) can be preserved over a kexec using the Live
+ * Update Orchestrator (LUO) file preservation. This allows userspace to
+ * transfer its memory contents to the next kernel after a kexec.
+ *
+ * The preservation is not intended to be transparent. Only select properties of
+ * the file are preserved. All others are reset to default. The preserved
+ * properties are described below.
+ *
+ * .. note::
+ *   The LUO API is not stabilized yet, so the preserved properties of a memfd
+ *   are also not stable and are subject to backwards incompatible changes.
+ *
+ * .. note::
+ *   Currently a memfd backed by Hugetlb is not supported. Memfds created
+ *   with ``MFD_HUGETLB`` will be rejected.
+ *
+ * Preserved Properties
+ * ====================
+ *
+ * The following properties of the memfd are preserved across kexec:
+ *
+ * File Contents
+ *   All data stored in the file is preserved.
+ *
+ * File Size
+ *   The size of the file is preserved. Holes in the file are filled by
+ *   allocating pages for them during preservation.
+ *
+ * File Position
+ *   The current file position is preserved, allowing applications to continue
+ *   reading/writing from their last position.
+ *
+ * File Status Flags
+ *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This
+ *   property is maintained.
+ *
+ * Non-Preserved Properties
+ * ========================
+ *
+ * All properties which are not preserved must be assumed to be reset to
+ * default. This section describes some of the properties that are of
+ * particular note.
+ *
+ * ``FD_CLOEXEC`` flag
+ *   A memfd can be created with the ``MFD_CLOEXEC`` flag, which sets
+ *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
+ *   again after restore via ``fcntl()``.
+ *
+ * Seals
+ *   File seals are not preserved. The file is unsealed on restore and, if
+ *   needed, must be sealed again via ``fcntl()``.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bits.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/memfd.h>
+#include <linux/liveupdate.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+
+static int memfd_luo_preserve_folios(struct file *file,
+				     struct kho_vmalloc *kho_vmalloc,
+				     struct memfd_luo_folio_ser **out_folios_ser,
+				     u64 *nr_foliosp)
+{
+	struct inode *inode = file_inode(file);
+	struct memfd_luo_folio_ser *folios_ser;
+	unsigned int max_folios;
+	long i, size, nr_pinned;
+	struct folio **folios;
+	int err = -EINVAL;
+	pgoff_t offset;
+	u64 nr_folios;
+
+	size = i_size_read(inode);
+	/*
+	 * If the file has zero size, then the folios and nr_folios properties
+	 * are not set.
+	 */
+	if (!size) {
+		*nr_foliosp = 0;
+		*out_folios_ser = NULL;
+		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
+		return 0;
+	}
+
+	/*
+	 * Guess the number of folios based on the inode size. The real number
+	 * might end up being smaller if there are higher-order folios.
+	 */
+	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
+	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
+	if (!folios)
+		return -ENOMEM;
+
+	/*
+	 * Pin the folios so they don't move around behind our back. This also
+	 * ensures none of the folios are in CMA -- which ensures they don't
+	 * fall in KHO scratch memory. It also moves swapped-out folios back to
+	 * memory.
+	 *
+	 * A side effect of doing this is that it allocates a folio for all
+	 * indices in the file. This might waste memory on sparse memfds. If
+	 * that is really a problem in the future, we can have a
+	 * memfd_pin_folios() variant that does not allocate a page on empty
+	 * slots.
+	 */
+	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
+				     &offset);
+	if (nr_pinned < 0) {
+		err = nr_pinned;
+		pr_err("failed to pin folios: %d\n", err);
+		goto err_free_folios;
+	}
+	nr_folios = nr_pinned;
+
+	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
+	if (!folios_ser) {
+		err = -ENOMEM;
+		goto err_unpin;
+	}
+
+	for (i = 0; i < nr_folios; i++) {
+		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+		struct folio *folio = folios[i];
+		unsigned int flags = 0;
+
+		err = kho_preserve_folio(folio);
+		if (err)
+			goto err_unpreserve;
+
+		if (folio_test_dirty(folio))
+			flags |= MEMFD_LUO_FOLIO_DIRTY;
+		if (folio_test_uptodate(folio))
+			flags |= MEMFD_LUO_FOLIO_UPTODATE;
+
+		pfolio->pfn = folio_pfn(folio);
+		pfolio->flags = flags;
+		pfolio->index = folio->index;
+	}
+
+	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
+	if (err)
+		goto err_unpreserve;
+
+	kvfree(folios);
+	*nr_foliosp = nr_folios;
+	*out_folios_ser = folios_ser;
+
+	/*
+	 * Note: folios_ser is purposely not freed here. It is preserved
+	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
+	 * that is passed via private_data.
+	 */
+	return 0;
+
+err_unpreserve:
+	for (i = i - 1; i >= 0; i--)
+		kho_unpreserve_folio(folios[i]);
+	vfree(folios_ser);
+err_unpin:
+	unpin_folios(folios, nr_folios);
+err_free_folios:
+	kvfree(folios);
+
+	return err;
+}
+
+static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
+					struct memfd_luo_folio_ser *folios_ser,
+					u64 nr_folios)
+{
+	long i;
+
+	if (!nr_folios)
+		return;
+
+	kho_unpreserve_vmalloc(kho_vmalloc);
+
+	for (i = 0; i < nr_folios; i++) {
+		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+		struct folio *folio;
+
+		if (!pfolio->pfn)
+			continue;
+
+		folio = pfn_folio(pfolio->pfn);
+
+		kho_unpreserve_folio(folio);
+		unpin_folio(folio);
+	}
+
+	vfree(folios_ser);
+}
+
+static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
+{
+	struct inode *inode = file_inode(args->file);
+	struct memfd_luo_folio_ser *folios_ser;
+	struct memfd_luo_ser *ser;
+	u64 nr_folios;
+	int err = 0;
+
+	inode_lock(inode);
+	shmem_freeze(inode, true);
+
+	/* Allocate the main serialization structure in preserved memory. */
+	ser = kho_alloc_preserve(sizeof(*ser));
+	if (IS_ERR(ser)) {
+		err = PTR_ERR(ser);
+		goto err_unlock;
+	}
+
+	ser->pos = args->file->f_pos;
+	ser->size = i_size_read(inode);
+
+	err = memfd_luo_preserve_folios(args->file, &ser->folios,
+					&folios_ser, &nr_folios);
+	if (err)
+		goto err_free_ser;
+
+	ser->nr_folios = nr_folios;
+	inode_unlock(inode);
+
+	args->private_data = folios_ser;
+	args->serialized_data = virt_to_phys(ser);
+
+	return 0;
+
+err_free_ser:
+	kho_unpreserve_free(ser);
+err_unlock:
+	shmem_freeze(inode, false);
+	inode_unlock(inode);
+	return err;
+}
+
+static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
+{
+	struct memfd_luo_ser *ser;
+
+	if (WARN_ON_ONCE(!args->serialized_data))
+		return -EINVAL;
+
+	ser = phys_to_virt(args->serialized_data);
+
+	/*
+	 * The file position might have changed since the preserve callback
+	 * ran. Everything else stays the same.
+	 */
+	ser->pos = args->file->f_pos;
+
+	return 0;
+}
+
+static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+	struct inode *inode = file_inode(args->file);
+	struct memfd_luo_ser *ser;
+
+	if (WARN_ON_ONCE(!args->serialized_data))
+		return;
+
+	inode_lock(inode);
+	shmem_freeze(inode, false);
+
+	ser = phys_to_virt(args->serialized_data);
+
+	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
+				    ser->nr_folios);
+
+	kho_unpreserve_free(ser);
+	inode_unlock(inode);
+}
+
+static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
+				     u64 nr_folios)
+{
+	u64 i;
+
+	for (i = 0; i < nr_folios; i++) {
+		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+		struct folio *folio;
+		phys_addr_t phys;
+
+		if (!pfolio->pfn)
+			continue;
+
+		phys = PFN_PHYS(pfolio->pfn);
+		folio = kho_restore_folio(phys);
+		if (!folio) {
+			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
+					    phys);
+			continue;
+		}
+
+		folio_put(folio);
+	}
+}
+
+static void memfd_luo_finish(struct liveupdate_file_op_args *args)
+{
+	struct memfd_luo_folio_ser *folios_ser;
+	struct memfd_luo_ser *ser;
+
+	if (args->retrieved)
+		return;
+
+	ser = phys_to_virt(args->serialized_data);
+	if (!ser)
+		return;
+
+	if (ser->nr_folios) {
+		folios_ser = kho_restore_vmalloc(&ser->folios);
+		if (!folios_ser)
+			goto out;
+
+		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
+		vfree(folios_ser);
+	}
+
+out:
+	kho_restore_free(ser);
+}
+
+static int memfd_luo_retrieve_folios(struct file *file,
+				     struct memfd_luo_folio_ser *folios_ser,
+				     u64 nr_folios)
+{
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	struct folio *folio;
+	int err = -EIO;
+	long i;
+
+	for (i = 0; i < nr_folios; i++) {
+		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
+		phys_addr_t phys;
+		u64 index;
+		int flags;
+
+		if (!pfolio->pfn)
+			continue;
+
+		phys = PFN_PHYS(pfolio->pfn);
+		folio = kho_restore_folio(phys);
+		if (!folio) {
+			pr_err("Unable to restore folio at physical address: %llx\n",
+			       phys);
+			goto put_folios;
+		}
+		index = pfolio->index;
+		flags = pfolio->flags;
+
+		/* Set up the folio for insertion. */
+		__folio_set_locked(folio);
+		__folio_set_swapbacked(folio);
+
+		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
+		if (err) {
+			pr_err("shmem: failed to charge folio index %ld: %d\n",
+			       i, err);
+			goto unlock_folio;
+		}
+
+		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
+					      mapping_gfp_mask(mapping));
+		if (err) {
+			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
+			       i, err);
+			goto unlock_folio;
+		}
+
+		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
+			folio_mark_uptodate(folio);
+		if (flags & MEMFD_LUO_FOLIO_DIRTY)
+			folio_mark_dirty(folio);
+
+		err = shmem_inode_acct_blocks(inode, 1);
+		if (err) {
+			pr_err("shmem: failed to account folio index %ld: %d\n",
+			       i, err);
+			goto unlock_folio;
+		}
+
+		shmem_recalc_inode(inode, 1, 0);
+		folio_add_lru(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	return 0;
+
+unlock_folio:
+	folio_unlock(folio);
+	folio_put(folio);
+put_folios:
+	/*
+	 * Note: don't free the folios already added to the file. They will be
+	 * freed when the file is freed. Free the ones not added yet here.
+	 */
+	for (long j = i + 1; j < nr_folios; j++) {
+		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
+
+		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
+		if (folio)
+			folio_put(folio);
+	}
+
+	return err;
+}
+
+static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+	struct memfd_luo_folio_ser *folios_ser;
+	struct memfd_luo_ser *ser;
+	struct file *file;
+	int err;
+
+	ser = phys_to_virt(args->serialized_data);
+	if (!ser)
+		return -EINVAL;
+
+	file = shmem_file_setup("", 0, VM_NORESERVE);
+	if (IS_ERR(file)) {
+		pr_err("failed to setup file: %pe\n", file);
+		return PTR_ERR(file);
+	}
+
+	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
+	file->f_inode->i_size = ser->size;
+
+	if (ser->nr_folios) {
+		folios_ser = kho_restore_vmalloc(&ser->folios);
+		if (!folios_ser) {
+			err = -EINVAL;
+			goto put_file;
+		}
+
+		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
+		vfree(folios_ser);
+		if (err)
+			goto put_file;
+	}
+
+	args->file = file;
+	kho_restore_free(ser);
+
+	return 0;
+
+put_file:
+	fput(file);
+
+	return err;
+}
+
+static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
+				   struct file *file)
+{
+	struct inode *inode = file_inode(file);
+
+	return shmem_file(file) && !inode->i_nlink;
+}
+
+static const struct liveupdate_file_ops memfd_luo_file_ops = {
+	.freeze = memfd_luo_freeze,
+	.finish = memfd_luo_finish,
+	.retrieve = memfd_luo_retrieve,
+	.preserve = memfd_luo_preserve,
+	.unpreserve = memfd_luo_unpreserve,
+	.can_preserve = memfd_luo_can_preserve,
+	.owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler memfd_luo_handler = {
+	.ops = &memfd_luo_file_ops,
+	.compatible = MEMFD_LUO_FH_COMPATIBLE,
+};
+
+static int __init memfd_luo_init(void)
+{
+	int err = liveupdate_register_file_handler(&memfd_luo_handler);
+
+	if (err && err != -EOPNOTSUPP) {
+		pr_err("Could not register LUO memfd file handler: %pe\n",
+		       ERR_PTR(err));
+
+		return err;
+	}
+
+	return 0;
+}
+late_initcall(memfd_luo_init);
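
The DOC comment above says that ``FD_CLOEXEC`` and file seals are dropped on restore and must be re-applied by userspace via ``fcntl()``. A minimal userspace sketch of that post-restore fixup is shown below. It assumes the restored file descriptor has already been obtained through the LUO retrieval interface (not shown, since the LUO userspace API is not stabilized yet), and that sealing is permitted on the restored file; the seal set chosen here is only an example.

	/*
	 * Hypothetical post-restore fixup for a memfd retrieved via LUO.
	 * "fd" is assumed to come from the LUO retrieval path, which is not
	 * part of this file and is not a stable API yet.
	 */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>

	static int memfd_post_restore_fixup(int fd)
	{
		/* FD_CLOEXEC is not preserved; set it again if it was used. */
		if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1) {
			perror("fcntl(F_SETFD)");
			return -1;
		}

		/*
		 * Seals are not preserved either; re-apply whichever seals the
		 * application needs (example set shown, assuming the restored
		 * file allows sealing).
		 */
		if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) == -1) {
			perror("fcntl(F_ADD_SEALS)");
			return -1;
		}

		return 0;
	}
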
