Diffstat (limited to 'mm/userfaultfd.c')
-rw-r--r--	mm/userfaultfd.c	127
1 file changed, 46 insertions(+), 81 deletions(-)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af61b95c89e4..e6dfd5f28acd 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
@@ -178,6 +178,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
spinlock_t *ptl;
struct folio *folio = page_folio(page);
bool page_in_cache = folio_mapping(folio);
+ pte_t dst_ptep;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
_dst_pte = pte_mkdirty(_dst_pte);
@@ -199,12 +200,15 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
}
ret = -EEXIST;
+
+ dst_ptep = ptep_get(dst_pte);
+
/*
- * We allow to overwrite a pte marker: consider when both MISSING|WP
- * registered, we firstly wr-protect a none pte which has no page cache
- * page backing it, then access the page.
+ * We are allowed to overwrite a UFFD pte marker: consider when both
+ * MISSING|WP registered, we firstly wr-protect a none pte which has no
+ * page cache page backing it, then access the page.
*/
- if (!pte_none_mostly(ptep_get(dst_pte)))
+ if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
goto out_unlock;
if (page_in_cache) {
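
For context: the old pte_none_mostly() treated both a truly empty pte and a pte marker as "no real mapping here", so !pte_none_mostly(pte) becomes !pte_none(pte) && !pte_is_uffd_marker(pte). In terms of the long-standing swapops.h helpers, the marker test amounts to roughly the sketch below; the _sketch name is made up here, and the real pte_is_uffd_marker() in leafops.h may narrow the check to the marker types userfaultfd itself installs (wr-protect, poison).

	/*
	 * Sketch only: "this non-present pte is a marker entry", expressed
	 * with the older swapops.h vocabulary.
	 */
	static inline bool pte_is_uffd_marker_sketch(pte_t pte)
	{
		swp_entry_t entry;

		if (!is_swap_pte(pte))		/* present, or completely none */
			return false;

		entry = pte_to_swp_entry(pte);
		return is_pte_marker_entry(entry);
	}
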
@@ -583,12 +587,15 @@ retry:
goto out_unlock;
}
- if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
- !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
- err = -EEXIST;
- hugetlb_vma_unlock_read(dst_vma);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- goto out_unlock;
+ if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+ const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
+
+ if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
+ err = -EEXIST;
+ hugetlb_vma_unlock_read(dst_vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
}
err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
@@ -1035,8 +1042,7 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
*/
static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
unsigned long src_addr,
- pte_t *src_pte, pte_t *dst_pte,
- struct anon_vma *src_anon_vma)
+ pte_t *src_pte, pte_t *dst_pte)
{
pte_t orig_dst_pte, orig_src_pte;
struct folio *folio;
@@ -1052,8 +1058,7 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
if (!folio || !folio_trylock(folio))
return NULL;
- if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
- folio_anon_vma(folio) != src_anon_vma) {
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
folio_unlock(folio);
return NULL;
}
@@ -1061,9 +1066,8 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
}
/*
- * Moves src folios to dst in a batch as long as they share the same
- * anon_vma as the first folio, are not large, and can successfully
- * take the lock via folio_trylock().
+ * Moves src folios to dst in a batch as long as they are not large, and can
+ * successfully take the lock via folio_trylock().
*/
static long move_present_ptes(struct mm_struct *mm,
struct vm_area_struct *dst_vma,
@@ -1073,8 +1077,7 @@ static long move_present_ptes(struct mm_struct *mm,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio **first_src_folio, unsigned long len,
- struct anon_vma *src_anon_vma)
+ struct folio **first_src_folio, unsigned long len)
{
int err = 0;
struct folio *src_folio = *first_src_folio;
@@ -1116,9 +1119,8 @@ static long move_present_ptes(struct mm_struct *mm,
orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
/* Set soft dirty bit so userspace can notice the pte was moved */
-#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
-#endif
+ if (pgtable_supports_soft_dirty())
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
if (pte_dirty(orig_src_pte))
orig_dst_pte = pte_mkdirty(orig_dst_pte);
orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
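
Dropping the #ifdef CONFIG_MEM_SOFT_DIRTY blocks in favour of a predicate keeps both branches visible to the compiler and type checker. A plausible minimal form of such a helper, assuming no additional per-architecture checks are needed (the real pgtable_supports_soft_dirty() may do more):

	/*
	 * Sketch: when this folds to a compile-time constant, the
	 * pte_mksoft_dirty() call is still eliminated on
	 * !CONFIG_MEM_SOFT_DIRTY builds; it just no longer vanishes from
	 * the parser's view the way an #ifdef block does.
	 */
	static inline bool pgtable_supports_soft_dirty_sketch(void)
	{
		return IS_ENABLED(CONFIG_MEM_SOFT_DIRTY);
	}
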
@@ -1132,8 +1134,8 @@ static long move_present_ptes(struct mm_struct *mm,
src_pte++;
folio_unlock(src_folio);
- src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
- dst_pte, src_anon_vma);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr,
+ src_pte, dst_pte);
if (!src_folio)
break;
}
@@ -1205,9 +1207,8 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
-#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
-#endif
+ if (pgtable_supports_soft_dirty())
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1253,7 +1254,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
unsigned long dst_addr, unsigned long src_addr,
unsigned long len, __u64 mode)
{
- swp_entry_t entry;
struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
@@ -1263,7 +1263,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
pmd_t dummy_pmdval;
pmd_t dst_pmdval;
struct folio *src_folio = NULL;
- struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
long ret = 0;
@@ -1347,9 +1346,9 @@ retry:
}
/*
- * Pin and lock both source folio and anon_vma. Since we are in
- * RCU read section, we can't block, so on contention have to
- * unmap the ptes, obtain the lock and retry.
+ * Pin and lock source folio. Since we are in RCU read section,
+ * we can't block, so on contention have to unmap the ptes,
+ * obtain the lock and retry.
*/
if (!src_folio) {
struct folio *folio;
@@ -1423,46 +1422,25 @@ retry:
goto retry;
}
- if (!src_anon_vma) {
- /*
- * folio_referenced walks the anon_vma chain
- * without the folio lock. Serialize against it with
- * the anon_vma lock, the folio lock is not enough.
- */
- src_anon_vma = folio_get_anon_vma(src_folio);
- if (!src_anon_vma) {
- /* page was unmapped from under us */
- ret = -EAGAIN;
- goto out;
- }
- if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(src_pte);
- pte_unmap(dst_pte);
- src_pte = dst_pte = NULL;
- /* now we can block and wait */
- anon_vma_lock_write(src_anon_vma);
- goto retry;
- }
- }
-
ret = move_present_ptes(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd,
dst_pmdval, dst_ptl, src_ptl, &src_folio,
- len, src_anon_vma);
- } else {
+ len);
+ } else { /* !pte_present() */
struct folio *folio = NULL;
+ const softleaf_t entry = softleaf_from_pte(orig_src_pte);
- entry = pte_to_swp_entry(orig_src_pte);
- if (non_swap_entry(entry)) {
- if (is_migration_entry(entry)) {
- pte_unmap(src_pte);
- pte_unmap(dst_pte);
- src_pte = dst_pte = NULL;
- migration_entry_wait(mm, src_pmd, src_addr);
- ret = -EAGAIN;
- } else
- ret = -EFAULT;
+ if (softleaf_is_migration(entry)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ migration_entry_wait(mm, src_pmd, src_addr);
+
+ ret = -EAGAIN;
+ goto out;
+ } else if (!softleaf_is_swap(entry)) {
+ ret = -EFAULT;
goto out;
}
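
The softleaf_t decoding above replaces the old open-coded swap-entry tests. A rough correspondence with the previous swapops.h terms, plus the resulting decision, is sketched below; classify_nonpresent_src() is a made-up name that simply mirrors the hunk:

	/*
	 * Sketch of the old-vs-new vocabulary for a non-present source pte:
	 *
	 *   softleaf_is_migration(entry)  ~  non_swap_entry(e) && is_migration_entry(e)
	 *   softleaf_is_swap(entry)       ~  !non_swap_entry(e)  (a genuine swap slot)
	 *
	 * Anything else (pte markers, hwpoison, device entries, ...) is
	 * neither, and the move fails with -EFAULT as before.
	 */
	static long classify_nonpresent_src(pte_t orig_src_pte)
	{
		const softleaf_t entry = softleaf_from_pte(orig_src_pte);

		if (softleaf_is_migration(entry))
			return -EAGAIN;		/* wait for migration, then retry */
		if (!softleaf_is_swap(entry))
			return -EFAULT;		/* not movable */
		return 0;			/* proceed to move_swap_pte() */
	}
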
@@ -1515,10 +1493,6 @@ retry:
}
out:
- if (src_anon_vma) {
- anon_vma_unlock_write(src_anon_vma);
- put_anon_vma(src_anon_vma);
- }
if (src_folio) {
folio_unlock(src_folio);
folio_put(src_folio);
@@ -1578,7 +1552,7 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
/*
* For now, we keep it simple and only move between writable VMAs.
- * Access flags are equal, therefore cheching only the source is enough.
+ * Access flags are equal, therefore checking only the source is enough.
*/
if (!(src_vma->vm_flags & VM_WRITE))
return -EINVAL;
@@ -1792,15 +1766,6 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma,
* virtual regions without knowing if there are transparent hugepage
* in the regions or not, but preventing the risk of having to split
* the hugepmd during the remap.
- *
- * If there's any rmap walk that is taking the anon_vma locks without
- * first obtaining the folio lock (the only current instance is
- * folio_referenced), they will have to verify if the folio->mapping
- * has changed after taking the anon_vma lock. If it changed they
- * should release the lock and retry obtaining a new anon_vma, because
- * it means the anon_vma was changed by move_pages() before the lock
- * could be obtained. This is the only additional complexity added to
- * the rmap code to provide this anonymous page remapping functionality.
*/
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len, __u64 mode)
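
move_pages() is the kernel backend of the UFFDIO_MOVE ioctl. For orientation, a minimal userspace sketch of driving it (error handling elided; assumes the destination range is anonymous memory registered with the userfaultfd, typically in UFFDIO_REGISTER_MODE_MISSING mode):

	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	/* Sketch: ask the kernel to remap len bytes from src to dst. */
	static int uffd_move_range(int uffd, unsigned long dst,
				   unsigned long src, unsigned long len)
	{
		struct uffdio_move mv = {
			.dst  = dst,
			.src  = src,
			.len  = len,
			.mode = 0,	/* or UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES */
		};

		if (ioctl(uffd, UFFDIO_MOVE, &mv) < 0)
			return -1;

		/* mv.move reports how many bytes were actually remapped. */
		return mv.move == (__s64)len ? 0 : -1;
	}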