summaryrefslogtreecommitdiff
path: root/mm/rmap.c
diff options
context:
space:
mode:
authorDavid Hildenbrand <david@redhat.com>2025-02-10 20:37:45 +0100
committerAndrew Morton <akpm@linux-foundation.org>2025-03-16 22:05:57 -0700
commit599b684a7854e51a51358fe59bbdfea281f0b461 (patch)
tree7321c1080d168bbef1537a54de26245fc9b8ff3c /mm/rmap.c
parentbc3fe6805cf09a25a086573a17d40e525208c5d8 (diff)
mm/rmap: convert make_device_exclusive_range() to make_device_exclusive()
The single "real" user in the tree of make_device_exclusive_range() always requests making only a single address exclusive. The current implementation is hard to fix for properly supporting anonymous THP / large folios and for avoiding messing with rmap walks in weird ways. So let's always process a single address/page and return folio + page to minimize page -> folio lookups. This is a preparation for further changes. Reject any non-anonymous or hugetlb folios early, directly after GUP. While at it, extend the documentation of make_device_exclusive() to clarify some things. Link: https://lkml.kernel.org/r/20250210193801.781278-4-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Acked-by: Simona Vetter <simona.vetter@ffwll.ch> Reviewed-by: Alistair Popple <apopple@nvidia.com> Tested-by: Alistair Popple <apopple@nvidia.com> Cc: Alex Shi <alexs@kernel.org> Cc: Danilo Krummrich <dakr@kernel.org> Cc: Dave Airlie <airlied@gmail.com> Cc: Jann Horn <jannh@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Karol Herbst <kherbst@redhat.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Lyude <lyude@redhat.com> Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: SeongJae Park <sj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yanteng Si <si.yanteng@linux.dev> Cc: Barry Song <v-songbaohua@oppo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--mm/rmap.c103
1 files changed, 61 insertions, 42 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index 17fbfa61f7ef..7ccf850565d3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2495,70 +2495,89 @@ static bool folio_make_device_exclusive(struct folio *folio,
.arg = &args,
};
- /*
- * Restrict to anonymous folios for now to avoid potential writeback
- * issues.
- */
- if (!folio_test_anon(folio) || folio_test_hugetlb(folio))
- return false;
-
rmap_walk(folio, &rwc);
return args.valid && !folio_mapcount(folio);
}
/**
- * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * make_device_exclusive() - Mark a page for exclusive use by a device
* @mm: mm_struct of associated target process
- * @start: start of the region to mark for exclusive device access
- * @end: end address of region
- * @pages: returns the pages which were successfully marked for exclusive access
+ * @addr: the virtual address to mark for exclusive device access
* @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ * @foliop: folio pointer will be stored here on success.
+ *
+ * This function looks up the page mapped at the given address, grabs a
+ * folio reference, locks the folio and replaces the PTE with special
+ * device-exclusive PFN swap entry, preventing access through the process
+ * page tables. The function will return with the folio locked and referenced.
*
- * Returns: number of pages found in the range by GUP. A page is marked for
- * exclusive access only if the page pointer is non-NULL.
+ * On fault, the device-exclusive entries are replaced with the original PTE
+ * under folio lock, after calling MMU notifiers.
*
- * This function finds ptes mapping page(s) to the given address range, locks
- * them and replaces mappings with special swap entries preventing userspace CPU
- * access. On fault these entries are replaced with the original mapping after
- * calling MMU notifiers.
+ * Only anonymous non-hugetlb folios are supported and the VMA must have
+ * write permissions such that we can fault in the anonymous page writable
+ * in order to mark it exclusive. The caller must hold the mmap_lock in read
+ * mode.
*
* A driver using this to program access from a device must use a mmu notifier
* critical section to hold a device specific lock during programming. Once
- * programming is complete it should drop the page lock and reference after
+ * programming is complete it should drop the folio lock and reference after
* which point CPU access to the page will revoke the exclusive access.
+ *
+ * Notes:
+ * #. This function always operates on individual PTEs mapping individual
+ * pages. PMD-sized THPs are first remapped to be mapped by PTEs before
+ * the conversion happens on a single PTE corresponding to @addr.
+ * #. While concurrent access through the process page tables is prevented,
+ * concurrent access through other page references (e.g., earlier GUP
+ * invocation) is not handled and not supported.
+ * #. device-exclusive entries are considered "clean" and "old" by core-mm.
+ * Device drivers must update the folio state when informed by MMU
+ * notifiers.
+ *
+ * Returns: pointer to mapped page on success, otherwise a negative error.
*/
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *owner)
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop)
{
- long npages = (end - start) >> PAGE_SHIFT;
- long i;
+ struct folio *folio;
+ struct page *page;
+ long npages;
+
+ mmap_assert_locked(mm);
- npages = get_user_pages_remote(mm, start, npages,
+ /*
+ * Fault in the page writable and try to lock it; note that if the
+ * address would already be marked for exclusive use by a device,
+ * the GUP call would undo that first by triggering a fault.
+ */
+ npages = get_user_pages_remote(mm, addr, 1,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL);
- if (npages < 0)
- return npages;
-
- for (i = 0; i < npages; i++, start += PAGE_SIZE) {
- struct folio *folio = page_folio(pages[i]);
- if (PageTail(pages[i]) || !folio_trylock(folio)) {
- folio_put(folio);
- pages[i] = NULL;
- continue;
- }
+ &page, NULL);
+ if (npages != 1)
+ return ERR_PTR(npages);
+ folio = page_folio(page);
- if (!folio_make_device_exclusive(folio, mm, start, owner)) {
- folio_unlock(folio);
- folio_put(folio);
- pages[i] = NULL;
- }
+ if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EBUSY);
}
- return npages;
+ if (!folio_make_device_exclusive(folio, mm, addr, owner)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(-EBUSY);
+ }
+ *foliop = folio;
+ return page;
}
-EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
void __put_anon_vma(struct anon_vma *anon_vma)