Diffstat (limited to 'mm/userfaultfd.c')
| Mode | Path | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | mm/userfaultfd.c | 127 |
1 file changed, 46 insertions, 81 deletions
```diff
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af61b95c89e4..e6dfd5f28acd 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -10,7 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/mmu_notifier.h>
 #include <linux/hugetlb.h>
@@ -178,6 +178,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
         spinlock_t *ptl;
         struct folio *folio = page_folio(page);
         bool page_in_cache = folio_mapping(folio);
+        pte_t dst_ptep;
 
         _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
         _dst_pte = pte_mkdirty(_dst_pte);
@@ -199,12 +200,15 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
         }
 
         ret = -EEXIST;
+
+        dst_ptep = ptep_get(dst_pte);
+
         /*
-         * We allow to overwrite a pte marker: consider when both MISSING|WP
-         * registered, we firstly wr-protect a none pte which has no page cache
-         * page backing it, then access the page.
+         * We are allowed to overwrite a UFFD pte marker: consider when both
+         * MISSING|WP registered, we firstly wr-protect a none pte which has no
+         * page cache page backing it, then access the page.
          */
-        if (!pte_none_mostly(ptep_get(dst_pte)))
+        if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
                 goto out_unlock;
 
         if (page_in_cache) {
@@ -583,12 +587,15 @@ retry:
                         goto out_unlock;
                 }
 
-                if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
-                    !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
-                        err = -EEXIST;
-                        hugetlb_vma_unlock_read(dst_vma);
-                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                        goto out_unlock;
+                if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+                        const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
+
+                        if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
+                                err = -EEXIST;
+                                hugetlb_vma_unlock_read(dst_vma);
+                                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                                goto out_unlock;
+                        }
                 }
 
                 err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
@@ -1035,8 +1042,7 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
  */
 static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
                                                  unsigned long src_addr,
-                                                 pte_t *src_pte, pte_t *dst_pte,
-                                                 struct anon_vma *src_anon_vma)
+                                                 pte_t *src_pte, pte_t *dst_pte)
 {
         pte_t orig_dst_pte, orig_src_pte;
         struct folio *folio;
@@ -1052,8 +1058,7 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
         folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
         if (!folio || !folio_trylock(folio))
                 return NULL;
-        if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
-            folio_anon_vma(folio) != src_anon_vma) {
+        if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
                 folio_unlock(folio);
                 return NULL;
         }
@@ -1061,9 +1066,8 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
 }
 
 /*
- * Moves src folios to dst in a batch as long as they share the same
- * anon_vma as the first folio, are not large, and can successfully
- * take the lock via folio_trylock().
+ * Moves src folios to dst in a batch as long as they are not large, and can
+ * successfully take the lock via folio_trylock().
  */
 static long move_present_ptes(struct mm_struct *mm,
                               struct vm_area_struct *dst_vma,
@@ -1073,8 +1077,7 @@ static long move_present_ptes(struct mm_struct *mm,
                               pte_t orig_dst_pte, pte_t orig_src_pte,
                               pmd_t *dst_pmd, pmd_t dst_pmdval,
                               spinlock_t *dst_ptl, spinlock_t *src_ptl,
-                              struct folio **first_src_folio, unsigned long len,
-                              struct anon_vma *src_anon_vma)
+                              struct folio **first_src_folio, unsigned long len)
 {
         int err = 0;
         struct folio *src_folio = *first_src_folio;
@@ -1116,9 +1119,8 @@ static long move_present_ptes(struct mm_struct *mm,
 
                 orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
                 /* Set soft dirty bit so userspace can notice the pte was moved */
-#ifdef CONFIG_MEM_SOFT_DIRTY
-                orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
-#endif
+                if (pgtable_supports_soft_dirty())
+                        orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
                 if (pte_dirty(orig_src_pte))
                         orig_dst_pte = pte_mkdirty(orig_dst_pte);
                 orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
@@ -1132,8 +1134,8 @@ static long move_present_ptes(struct mm_struct *mm,
                 src_pte++;
 
                 folio_unlock(src_folio);
-                src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
-                                                        dst_pte, src_anon_vma);
+                src_folio = check_ptes_for_batched_move(src_vma, src_addr,
+                                                        src_pte, dst_pte);
                 if (!src_folio)
                         break;
         }
@@ -1205,9 +1207,8 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
         }
 
         orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
-#ifdef CONFIG_MEM_SOFT_DIRTY
-        orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
-#endif
+        if (pgtable_supports_soft_dirty())
+                orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
         set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
         double_pt_unlock(dst_ptl, src_ptl);
 
@@ -1253,7 +1254,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
                             unsigned long dst_addr, unsigned long src_addr,
                             unsigned long len, __u64 mode)
 {
-        swp_entry_t entry;
         struct swap_info_struct *si = NULL;
         pte_t orig_src_pte, orig_dst_pte;
         pte_t src_folio_pte;
@@ -1263,7 +1263,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
         pmd_t dummy_pmdval;
         pmd_t dst_pmdval;
         struct folio *src_folio = NULL;
-        struct anon_vma *src_anon_vma = NULL;
         struct mmu_notifier_range range;
         long ret = 0;
 
@@ -1347,9 +1346,9 @@ retry:
                 }
 
                 /*
-                 * Pin and lock both source folio and anon_vma. Since we are in
-                 * RCU read section, we can't block, so on contention have to
-                 * unmap the ptes, obtain the lock and retry.
+                 * Pin and lock source folio. Since we are in RCU read section,
+                 * we can't block, so on contention have to unmap the ptes,
+                 * obtain the lock and retry.
                  */
                 if (!src_folio) {
                         struct folio *folio;
@@ -1423,46 +1422,25 @@ retry:
                         goto retry;
                 }
 
-                if (!src_anon_vma) {
-                        /*
-                         * folio_referenced walks the anon_vma chain
-                         * without the folio lock. Serialize against it with
-                         * the anon_vma lock, the folio lock is not enough.
-                         */
-                        src_anon_vma = folio_get_anon_vma(src_folio);
-                        if (!src_anon_vma) {
-                                /* page was unmapped from under us */
-                                ret = -EAGAIN;
-                                goto out;
-                        }
-                        if (!anon_vma_trylock_write(src_anon_vma)) {
-                                pte_unmap(src_pte);
-                                pte_unmap(dst_pte);
-                                src_pte = dst_pte = NULL;
-                                /* now we can block and wait */
-                                anon_vma_lock_write(src_anon_vma);
-                                goto retry;
-                        }
-                }
-
                 ret = move_present_ptes(mm, dst_vma, src_vma, dst_addr,
                                         src_addr, dst_pte, src_pte,
                                         orig_dst_pte, orig_src_pte, dst_pmd,
                                         dst_pmdval, dst_ptl, src_ptl, &src_folio,
-                                        len, src_anon_vma);
-        } else {
+                                        len);
+        } else { /* !pte_present() */
                 struct folio *folio = NULL;
+                const softleaf_t entry = softleaf_from_pte(orig_src_pte);
 
-                entry = pte_to_swp_entry(orig_src_pte);
-                if (non_swap_entry(entry)) {
-                        if (is_migration_entry(entry)) {
-                                pte_unmap(src_pte);
-                                pte_unmap(dst_pte);
-                                src_pte = dst_pte = NULL;
-                                migration_entry_wait(mm, src_pmd, src_addr);
-                                ret = -EAGAIN;
-                        } else
-                                ret = -EFAULT;
+                if (softleaf_is_migration(entry)) {
+                        pte_unmap(src_pte);
+                        pte_unmap(dst_pte);
+                        src_pte = dst_pte = NULL;
+                        migration_entry_wait(mm, src_pmd, src_addr);
+
+                        ret = -EAGAIN;
+                        goto out;
+                } else if (!softleaf_is_swap(entry)) {
+                        ret = -EFAULT;
                         goto out;
                 }
 
@@ -1515,10 +1493,6 @@ retry:
         }
 out:
-        if (src_anon_vma) {
-                anon_vma_unlock_write(src_anon_vma);
-                put_anon_vma(src_anon_vma);
-        }
         if (src_folio) {
                 folio_unlock(src_folio);
                 folio_put(src_folio);
         }
@@ -1578,7 +1552,7 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
 
         /*
          * For now, we keep it simple and only move between writable VMAs.
-         * Access flags are equal, therefore cheching only the source is enough.
+         * Access flags are equal, therefore checking only the source is enough.
          */
         if (!(src_vma->vm_flags & VM_WRITE))
                 return -EINVAL;
@@ -1792,15 +1766,6 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma,
  * virtual regions without knowing if there are transparent hugepage
  * in the regions or not, but preventing the risk of having to split
  * the hugepmd during the remap.
- *
- * If there's any rmap walk that is taking the anon_vma locks without
- * first obtaining the folio lock (the only current instance is
- * folio_referenced), they will have to verify if the folio->mapping
- * has changed after taking the anon_vma lock. If it changed they
- * should release the lock and retry obtaining a new anon_vma, because
- * it means the anon_vma was changed by move_pages() before the lock
- * could be obtained. This is the only additional complexity added to
- * the rmap code to provide this anonymous page remapping functionality.
  */
 ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
                    unsigned long src_start, unsigned long len, __u64 mode)
```
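The core semantic point in the mfill hunks above is that the old `pte_none_mostly()` / `huge_pte_none_mostly()` helpers are replaced by spelling out the two cases the fill path may overwrite: a genuinely empty PTE, or a PTE that only carries a userfaultfd marker. The userspace sketch below only illustrates that decision; `mock_pte_t`, `mock_pte_none()` and `mock_pte_is_uffd_marker()` are invented stand-ins, not the kernel's `pte_t`, `pte_none()` or `pte_is_uffd_marker()`.

```c
/* Illustration only: mock of the "may we overwrite this dst PTE?" decision.
 * All types and helpers here are made up for the example. */
#include <stdbool.h>
#include <stdio.h>

typedef struct {
	bool present;     /* maps a physical page */
	bool swap;        /* holds a swap entry */
	bool uffd_marker; /* holds a userfaultfd marker (e.g. UFFD-WP on a none pte) */
} mock_pte_t;

static bool mock_pte_none(mock_pte_t pte)
{
	return !pte.present && !pte.swap && !pte.uffd_marker;
}

static bool mock_pte_is_uffd_marker(mock_pte_t pte)
{
	return pte.uffd_marker;
}

/* Old code lumped both cases into "none, mostly"; the patch spells them out. */
static bool may_install_over(mock_pte_t dst)
{
	return mock_pte_none(dst) || mock_pte_is_uffd_marker(dst);
}

int main(void)
{
	mock_pte_t empty = {0};
	mock_pte_t wp_marker = { .uffd_marker = true };
	mock_pte_t mapped = { .present = true };

	printf("empty:     %d\n", may_install_over(empty));     /* 1: ok to fill */
	printf("wp marker: %d\n", may_install_over(wp_marker)); /* 1: ok to fill */
	printf("mapped:    %d\n", may_install_over(mapped));    /* 0: -EEXIST */
	return 0;
}
```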
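Likewise, in the non-present branch of move_pages_ptes() the open-coded `pte_to_swp_entry()` / `non_swap_entry()` / `is_migration_entry()` sequence becomes a softleaf classification: wait and retry on a migration entry, fail with -EFAULT for anything that is not a real swap entry, and only then continue to the swap-move path. The sketch below mirrors that triage in plain userspace C; the `mock_leaf_kind` enum and `triage_nonpresent()` helper are made up for illustration and are not the `softleaf_*` API from `<linux/leafops.h>`.

```c
/* Illustration only: mock of the non-present source-PTE triage. */
#include <errno.h>
#include <stdio.h>

enum mock_leaf_kind {
	MOCK_LEAF_SWAP,       /* genuine swap entry: move it */
	MOCK_LEAF_MIGRATION,  /* migration entry: wait for migration, then retry */
	MOCK_LEAF_OTHER,      /* any other non-present kind: cannot be moved */
};

/* Mirrors the patched control flow: retry on migration entries, reject
 * everything that is not a real swap entry, otherwise proceed. */
static int triage_nonpresent(enum mock_leaf_kind kind)
{
	if (kind == MOCK_LEAF_MIGRATION)
		return -EAGAIN;  /* caller unmaps the PTEs and retries */
	if (kind != MOCK_LEAF_SWAP)
		return -EFAULT;  /* entry kind that UFFDIO_MOVE cannot handle */
	return 0;                /* fall through to the swap-move handling */
}

int main(void)
{
	printf("swap:      %d\n", triage_nonpresent(MOCK_LEAF_SWAP));
	printf("migration: %d\n", triage_nonpresent(MOCK_LEAF_MIGRATION));
	printf("other:     %d\n", triage_nonpresent(MOCK_LEAF_OTHER));
	return 0;
}
```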
