diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 201 |
1 files changed, 140 insertions, 61 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 07abcb6eb203..f154019e6b84 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2050,19 +2050,23 @@ int PageHuge(struct page *page) } EXPORT_SYMBOL_GPL(PageHuge); -/* - * PageHeadHuge() only returns true for hugetlbfs head page, but not for - * normal or transparent huge pages. +/** + * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs + * @folio: The folio to test. + * + * Context: Any context. Caller should have a reference on the folio to + * prevent it from being turned into a tail page. + * Return: True for hugetlbfs folios, false for anon folios or folios + * belonging to other filesystems. */ -int PageHeadHuge(struct page *page_head) +bool folio_test_hugetlb(struct folio *folio) { - struct folio *folio = (struct folio *)page_head; if (!folio_test_large(folio)) - return 0; + return false; return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } -EXPORT_SYMBOL_GPL(PageHeadHuge); +EXPORT_SYMBOL_GPL(folio_test_hugetlb); /* * Find and lock address space (mapping) in write mode. @@ -2090,7 +2094,7 @@ pgoff_t hugetlb_basepage_index(struct page *page) pgoff_t index = page_index(page_head); unsigned long compound_idx; - if (compound_order(page_head) >= MAX_ORDER) + if (compound_order(page_head) > MAX_ORDER) compound_idx = page_to_pfn(page) - page_to_pfn(page_head); else compound_idx = page - page_head; @@ -4202,6 +4206,12 @@ static void __init hugetlb_sysfs_init(void) hugetlb_register_all_nodes(); } +#ifdef CONFIG_SYSCTL +static void hugetlb_sysctl_init(void); +#else +static inline void hugetlb_sysctl_init(void) { } +#endif + static int __init hugetlb_init(void) { int i; @@ -4257,6 +4267,7 @@ static int __init hugetlb_init(void) hugetlb_sysfs_init(); hugetlb_cgroup_file_init(); + hugetlb_sysctl_init(); #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); @@ -4497,7 +4508,7 @@ static int __init default_hugepagesz_setup(char *s) * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge - * page size is gigantic (>= MAX_ORDER), then the pages must be + * page size is gigantic (> MAX_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { @@ -4588,7 +4599,7 @@ out: return ret; } -int hugetlb_sysctl_handler(struct ctl_table *table, int write, +static int hugetlb_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { @@ -4597,7 +4608,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } #ifdef CONFIG_NUMA -int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, +static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, @@ -4605,7 +4616,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_overcommit_handler(struct ctl_table *table, int write, +static int hugetlb_overcommit_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -4634,6 +4645,44 @@ out: return ret; } +static struct ctl_table hugetlb_table[] = { + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + }, + { } +}; + +static void hugetlb_sysctl_init(void) +{ + register_sysctl_init("vm", hugetlb_table); +} #endif /* CONFIG_SYSCTL */ void hugetlb_report_meminfo(struct seq_file *m) @@ -4949,11 +4998,15 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, - struct folio *new_folio) + struct folio *new_folio, pte_t old) { + pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); + __folio_mark_uptodate(new_folio); hugepage_add_new_anon_rmap(new_folio, vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) + newpte = huge_pte_mkuffd_wp(newpte); + set_huge_pte_at(vma->vm_mm, addr, ptep, newpte); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); } @@ -5028,14 +5081,12 @@ again: */ ; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { - bool uffd_wp = huge_pte_uffd_wp(entry); - - if (!userfaultfd_wp(dst_vma) && uffd_wp) + if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_hugetlb_entry_migration(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); - bool uffd_wp = huge_pte_uffd_wp(entry); + bool uffd_wp = pte_swp_uffd_wp(entry); if (!is_readable_migration_entry(swp_entry) && cow) { /* @@ -5046,10 +5097,10 @@ again: swp_offset(swp_entry)); entry = swp_entry_to_pte(swp_entry); if (userfaultfd_wp(src_vma) && uffd_wp) - entry = huge_pte_mkuffd_wp(entry); + entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry); } - if (!userfaultfd_wp(dst_vma) && uffd_wp) + if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { @@ -5093,9 +5144,14 @@ again: ret = PTR_ERR(new_folio); break; } - copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma, - npages); + ret = copy_user_large_folio(new_folio, + page_folio(ptepage), + addr, dst_vma); put_page(ptepage); + if (ret) { + folio_put(new_folio); + break; + } /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); @@ -5109,7 +5165,8 @@ again: /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio); + hugetlb_install_folio(dst_vma, dst_pte, addr, + new_folio, src_pte_old); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@ -5127,6 +5184,9 @@ again: entry = huge_pte_wrprotect(entry); } + if (!userfaultfd_wp(dst_vma)) + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(dst, addr, dst_pte, entry); hugetlb_count_add(npages, dst); } @@ -5478,7 +5538,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte; + pte_t pte = huge_ptep_get(ptep); struct hstate *h = hstate_vma(vma); struct page *old_page; struct folio *new_folio; @@ -5488,6 +5548,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, struct mmu_notifier_range range; /* + * Never handle CoW for uffd-wp protected pages. It should be only + * handled when the uffd-wp protection is removed. + * + * Note that only the CoW optimization path (in hugetlb_no_page()) + * can trigger this, because hugetlb_fault() will always resolve + * uffd-wp bit first. + */ + if (!unshare && huge_pte_uffd_wp(pte)) + return 0; + + /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. */ @@ -5500,7 +5571,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - pte = huge_ptep_get(ptep); old_page = pte_page(pte); delayacct_wpcopy_start(); @@ -5602,8 +5672,10 @@ retry_avoidcopy: goto out_release_all; } - copy_user_huge_page(&new_folio->page, old_page, address, vma, - pages_per_huge_page(h)); + if (copy_user_large_folio(new_folio, page_folio(old_page), address, vma)) { + ret = VM_FAULT_HWPOISON_LARGE; + goto out_release_all; + } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, @@ -5617,13 +5689,16 @@ retry_avoidcopy: spin_lock(ptl); ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); + /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); hugepage_add_new_anon_rmap(new_folio, vma, haddr); - set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, &new_folio->page, !unshare)); + if (huge_pte_uffd_wp(pte)) + newpte = huge_pte_mkuffd_wp(newpte); + set_huge_pte_at(mm, haddr, ptep, newpte); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = page_folio(old_page); @@ -5780,7 +5855,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, */ new_folio = false; folio = filemap_lock_folio(mapping, idx); - if (!folio) { + if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -6071,6 +6146,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma_end_reservation(h, vma, haddr); pagecache_folio = filemap_lock_folio(mapping, idx); + if (IS_ERR(pagecache_folio)) + pagecache_folio = NULL; } ptl = huge_pte_lock(h, mm, ptep); @@ -6154,19 +6231,19 @@ out_mutex: #ifdef CONFIG_USERFAULTFD /* - * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with - * modifications for huge pages. + * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte + * with modifications for hugetlb pages. */ -int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, - pte_t *dst_pte, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy) -{ - bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); +int hugetlb_mfill_atomic_pte(pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + uffd_flags_t flags, + struct folio **foliop) +{ + struct mm_struct *dst_mm = dst_vma->vm_mm; + bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE); + bool wp_enabled = (flags & MFILL_ATOMIC_WP); struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); @@ -6182,11 +6259,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (is_continue) { ret = -EFAULT; folio = filemap_lock_folio(mapping, idx); - if (!folio) + if (IS_ERR(folio)) goto out; folio_in_pagecache = true; - } else if (!*pagep) { - /* If a page already exists, then it's UFFDIO_COPY for + } else if (!*foliop) { + /* If a folio already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. */ if (vm_shared && @@ -6201,9 +6278,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - ret = copy_huge_page_from_user(&folio->page, - (const void __user *) src_addr, - pages_per_huge_page(h), false); + ret = copy_folio_from_user(folio, (const void __user *) src_addr, + false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -6222,33 +6298,36 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ret = -ENOMEM; goto out; } - *pagep = &folio->page; - /* Set the outparam pagep and return to the caller to + *foliop = folio; + /* Set the outparam foliop and return to the caller to * copy the contents outside the lock. Don't free the - * page. + * folio. */ goto out; } } else { if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { - put_page(*pagep); + folio_put(*foliop); ret = -EEXIST; - *pagep = NULL; + *foliop = NULL; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); if (IS_ERR(folio)) { - put_page(*pagep); + folio_put(*foliop); ret = -ENOMEM; - *pagep = NULL; + *foliop = NULL; + goto out; + } + ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); + folio_put(*foliop); + *foliop = NULL; + if (ret) { + folio_put(folio); goto out; } - copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma, - pages_per_huge_page(h)); - put_page(*pagep); - *pagep = NULL; } /* @@ -6301,7 +6380,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - if (wp_copy || (is_continue && !vm_shared)) + if (wp_enabled || (is_continue && !vm_shared)) writable = 0; else writable = dst_vma->vm_flags & VM_WRITE; @@ -6316,7 +6395,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); - if (wp_copy) + if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); |