Diffstat (limited to 'mm/madvise.c')
-rw-r--r--	mm/madvise.c	160
1 file changed, 106 insertions, 54 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index fb1c86e630b6..b617b1be0f53 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -29,7 +29,7 @@
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
@@ -167,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags,
range->start, range->end, anon_name);
else
vma = vma_modify_flags(&vmi, madv_behavior->prev, vma,
- range->start, range->end, new_flags);
+ range->start, range->end, &new_flags);
if (IS_ERR(vma))
return PTR_ERR(vma);
@@ -195,7 +195,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
for (addr = start; addr < end; addr += PAGE_SIZE) {
pte_t pte;
- swp_entry_t entry;
+ softleaf_t entry;
struct folio *folio;
if (!ptep++) {
@@ -205,10 +205,8 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
}
pte = ptep_get(ptep);
- if (!is_swap_pte(pte))
- continue;
- entry = pte_to_swp_entry(pte);
- if (unlikely(non_swap_entry(entry)))
+ entry = softleaf_from_pte(pte);
+ if (unlikely(!softleaf_is_swap(entry)))
continue;
pte_unmap_unlock(ptep, ptl);
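
The hunk above shows the idiom this series applies throughout mm/madvise.c: decode a non-present PTE once into a softleaf_t, then ask a single predicate, instead of chaining is_swap_pte(), pte_to_swp_entry() and non_swap_entry(). A minimal before/after sketch using only the helpers visible in this diff; the behaviour of softleaf_from_pte() on present PTEs is inferred from the conversions here, not from leafops.h itself:

	pte_t pte = ptep_get(ptep);

	/* Before: three steps to establish "this is a genuine swap entry". */
	swp_entry_t swp;
	if (!is_swap_pte(pte))
		continue;
	swp = pte_to_swp_entry(pte);
	if (unlikely(non_swap_entry(swp)))
		continue;

	/*
	 * After: decode once, ask one question. Present PTEs and non-swap
	 * leaves (migration entries, markers, hwpoison) all fail the
	 * softleaf_is_swap() test, so no is_swap_pte() pre-check is needed.
	 */
	softleaf_t entry = softleaf_from_pte(pte);
	if (unlikely(!softleaf_is_swap(entry)))
		continue;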
@@ -251,7 +249,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
continue;
entry = radix_to_swp_entry(folio);
/* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(entry))
+ if (!softleaf_is_swap(entry))
continue;
addr = vma->vm_start +
@@ -392,7 +390,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
+ !pmd_is_migration_entry(orig_pmd));
goto huge_unlock;
}
@@ -690,17 +688,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* (page allocation + zeroing).
*/
if (!pte_present(ptent)) {
- swp_entry_t entry;
+ softleaf_t entry = softleaf_from_pte(ptent);
- entry = pte_to_swp_entry(ptent);
- if (!non_swap_entry(entry)) {
+ if (softleaf_is_swap(entry)) {
max_nr = (end - addr) / PAGE_SIZE;
nr = swap_pte_batch(pte, max_nr, ptent);
nr_swap -= nr;
free_swap_and_cache_nr(entry, nr);
clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
- } else if (is_hwpoison_entry(entry) ||
- is_poisoned_swp_entry(entry)) {
+ } else if (softleaf_is_hwpoison(entry) ||
+ softleaf_is_poison_marker(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
}
continue;
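
For reference, the non-present classes MADV_FREE now distinguishes via the leaf helpers, restated with the editor's reading of what each branch handles; the helper names come from this diff, the semantics from the swap-entry code being replaced:

	softleaf_t entry = softleaf_from_pte(ptent);

	if (softleaf_is_swap(entry)) {
		/* Real swapped-out data: release the swap slots, drop the PTEs. */
		free_swap_and_cache_nr(entry, nr);
		clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
	} else if (softleaf_is_hwpoison(entry) || softleaf_is_poison_marker(entry)) {
		/* Hardware-poisoned entries and poison markers carry no data; clear. */
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	}
	/* Other leaf types (e.g. migration entries) are left untouched. */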
@@ -1071,8 +1068,9 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
static bool is_guard_pte_marker(pte_t ptent)
{
- return is_swap_pte(ptent) &&
- is_guard_swp_entry(pte_to_swp_entry(ptent));
+ const softleaf_t entry = softleaf_from_pte(ptent);
+
+ return softleaf_is_guard_marker(entry);
}
static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
@@ -1122,18 +1120,17 @@ static int guard_install_set_pte(unsigned long addr, unsigned long next,
return 0;
}
-static const struct mm_walk_ops guard_install_walk_ops = {
- .pud_entry = guard_install_pud_entry,
- .pmd_entry = guard_install_pmd_entry,
- .pte_entry = guard_install_pte_entry,
- .install_pte = guard_install_set_pte,
- .walk_lock = PGWALK_RDLOCK,
-};
-
static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
struct vm_area_struct *vma = madv_behavior->vma;
struct madvise_behavior_range *range = &madv_behavior->range;
+ struct mm_walk_ops walk_ops = {
+ .pud_entry = guard_install_pud_entry,
+ .pmd_entry = guard_install_pmd_entry,
+ .pte_entry = guard_install_pte_entry,
+ .install_pte = guard_install_set_pte,
+ .walk_lock = get_walk_lock(madv_behavior->lock_mode),
+ };
long err;
int i;
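
get_walk_lock() itself is not shown in this diff; the walk_lock field is now chosen at runtime from the madvise lock mode rather than hard-coded to PGWALK_RDLOCK. A purely illustrative sketch of the mapping one would expect: the MADVISE_* lock-mode names appear elsewhere in this diff, but the page_walk_lock value used for the per-VMA-lock case below is an assumption, since only PGWALK_RDLOCK is visible here.

	/* Illustrative only -- not the tree's actual get_walk_lock(). */
	static enum page_walk_lock get_walk_lock_sketch(enum madvise_lock_mode mode)
	{
		switch (mode) {
		case MADVISE_VMA_READ_LOCK:
			/* Assumed name: a walk mode trusting the caller's VMA read lock. */
			return PGWALK_VMA_RDLOCK_VERIFY;
		default:
			/* mmap read lock held by the caller, as these walks used before. */
			return PGWALK_RDLOCK;
		}
	}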
@@ -1141,24 +1138,38 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
return -EINVAL;
/*
- * If we install guard markers, then the range is no longer
- * empty from a page table perspective and therefore it's
- * appropriate to have an anon_vma.
+ * Set atomically under read lock. All pertinent readers will need to
+ * acquire an mmap/VMA write lock to read it. All remaining readers may
+ * or may not see the flag set, but we don't care.
+ */
+ vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);
+
+ /*
+	 * If anonymous and we are establishing page tables, the VMA ought to
+ * have an anon_vma associated with it.
*
- * This ensures that on fork, we copy page tables correctly.
+	 * We will hold an mmap read lock if this is necessary; this is checked
+ * as part of the VMA lock logic.
*/
- err = anon_vma_prepare(vma);
- if (err)
- return err;
+ if (vma_is_anonymous(vma)) {
+ VM_WARN_ON_ONCE(!vma->anon_vma &&
+ madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK);
+
+ err = anon_vma_prepare(vma);
+ if (err)
+ return err;
+ }
/*
* Optimistically try to install the guard marker pages first. If any
- * non-guard pages are encountered, give up and zap the range before
- * trying again.
+ * non-guard pages or THP huge pages are encountered, give up and zap
+ * the range before trying again.
*
* We try a few times before giving up and releasing back to userland to
- * loop around, releasing locks in the process to avoid contention. This
- * would only happen if there was a great many racing page faults.
+ * loop around, releasing locks in the process to avoid contention.
+ *
+ * This would only happen due to races with e.g. page faults or
+ * khugepaged.
*
* In most cases we should simply install the guard markers immediately
* with no zap or looping.
@@ -1167,8 +1178,13 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
unsigned long nr_pages = 0;
/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
- err = walk_page_range_mm(vma->vm_mm, range->start, range->end,
- &guard_install_walk_ops, &nr_pages);
+ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK)
+ err = walk_page_range_vma_unsafe(madv_behavior->vma,
+ range->start, range->end, &walk_ops,
+ &nr_pages);
+ else
+ err = walk_page_range_mm_unsafe(vma->vm_mm, range->start,
+ range->end, &walk_ops, &nr_pages);
if (err < 0)
return err;
@@ -1189,8 +1205,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
}
/*
- * We were unable to install the guard pages due to being raced by page
- * faults. This should not happen ordinarily. We return to userspace and
+	 * We were unable to install the guard pages; return to userspace and
* immediately retry, relieving lock contention.
*/
return restart_syscall();
@@ -1234,17 +1249,16 @@ static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops guard_remove_walk_ops = {
- .pud_entry = guard_remove_pud_entry,
- .pmd_entry = guard_remove_pmd_entry,
- .pte_entry = guard_remove_pte_entry,
- .walk_lock = PGWALK_RDLOCK,
-};
-
static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
{
struct vm_area_struct *vma = madv_behavior->vma;
struct madvise_behavior_range *range = &madv_behavior->range;
+	struct mm_walk_ops walk_ops = {
+ .pud_entry = guard_remove_pud_entry,
+ .pmd_entry = guard_remove_pmd_entry,
+ .pte_entry = guard_remove_pte_entry,
+ .walk_lock = get_walk_lock(madv_behavior->lock_mode),
+ };
/*
* We're ok with removing guards in mlock()'d ranges, as this is a
@@ -1254,7 +1268,7 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
return -EINVAL;
return walk_page_range_vma(vma, range->start, range->end,
- &guard_remove_walk_ops, NULL);
+			&walk_ops, NULL);
}
#ifdef CONFIG_64BIT
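
Both MADV_GUARD_INSTALL and MADV_GUARD_REMOVE are now walked under whichever lock get_walk_lock() selects, but nothing changes for userspace. A minimal usage sketch of the guard-region API these paths implement (error handling elided; assumes headers recent enough to define the MADV_GUARD_* constants). Touching the guarded page raises SIGSEGV until the guard is removed; after removal the page faults back in as zeroes:

	#include <sys/mman.h>

	size_t len = 16 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Turn the first page into a guard region: faults there now SIGSEGV. */
	madvise(buf, 4096, MADV_GUARD_INSTALL);

	/* ... use buf + 4096 .. buf + len as normal ... */

	/* Make the first page usable again. */
	madvise(buf, 4096, MADV_GUARD_REMOVE);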
@@ -1567,6 +1581,47 @@ static bool process_madvise_remote_valid(int behavior)
}
}
+/* Does this operation invoke anon_vma_prepare()? */
+static bool prepares_anon_vma(int behavior)
+{
+ switch (behavior) {
+ case MADV_GUARD_INSTALL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * We have acquired a VMA read lock. Now that we have a VMA to examine, is it
+ * valid to madvise it under the VMA read lock alone?
+ */
+static bool is_vma_lock_sufficient(struct vm_area_struct *vma,
+ struct madvise_behavior *madv_behavior)
+{
+	/* Must span only a single VMA. */
+ if (madv_behavior->range.end > vma->vm_end)
+ return false;
+ /* Remote processes unsupported. */
+ if (current->mm != vma->vm_mm)
+ return false;
+ /* Userfaultfd unsupported. */
+ if (userfaultfd_armed(vma))
+ return false;
+ /*
+ * anon_vma_prepare() explicitly requires an mmap lock for
+ * serialisation, so we cannot use a VMA lock in this case.
+ *
+	 * Note we might race with anon_vma being set; that only makes this
+	 * check overly paranoid, which is safe.
+ */
+ if (vma_is_anonymous(vma) &&
+ prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma)
+ return false;
+
+ return true;
+}
+
/*
* Try to acquire a VMA read lock if possible.
*
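
The "overly paranoid" remark leans on how anon_vma_prepare() behaves. In sketch form (the editor's reproduction of the existing helper in include/linux/rmap.h, from memory): once vma->anon_vma is set it is never cleared, and the helper only needs mmap-lock serialisation when it actually has to allocate, so a reader that sees a stale NULL merely declines the VMA-lock fast path and falls back to the mmap read lock.

	/* Shape of anon_vma_prepare() (editor's sketch, see rmap.h): */
	static inline int anon_vma_prepare(struct vm_area_struct *vma)
	{
		if (likely(vma->anon_vma))
			return 0;		/* already set up: nothing to serialise */

		return __anon_vma_prepare(vma);	/* allocation path: needs mmap lock */
	}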
@@ -1588,15 +1643,12 @@ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
if (!vma)
goto take_mmap_read_lock;
- /*
- * Must span only a single VMA; uffd and remote processes are
- * unsupported.
- */
- if (madv_behavior->range.end > vma->vm_end || current->mm != mm ||
- userfaultfd_armed(vma)) {
+
+ if (!is_vma_lock_sufficient(vma, madv_behavior)) {
vma_end_read(vma);
goto take_mmap_read_lock;
}
+
madv_behavior->vma = vma;
return true;
@@ -1709,9 +1761,9 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
+ return MADVISE_MMAP_READ_LOCK;
case MADV_GUARD_INSTALL:
case MADV_GUARD_REMOVE:
- return MADVISE_MMAP_READ_LOCK;
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
case MADV_FREE: