diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 195 |
1 files changed, 120 insertions, 75 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e3e6ac991b9c..32ab14aa4074 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -58,6 +58,7 @@ int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +__initdata nodemask_t hugetlb_bootmem_nodes; __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; @@ -1250,7 +1251,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. - * This function should be only used by move_vma() and operate on + * This function should be only used by mremap and operate on * same sized vma. It should never come here with last ref on the * reservation. */ @@ -1950,7 +1951,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; - bool retry = true; /* * By default we always try hard to allocate the folio with @@ -1965,22 +1965,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); -retry: - folio = __folio_alloc(gfp_mask, order, nid, nmask); - /* Ensure hugetlb folio won't have large_rmappable flag set. */ - if (folio) - folio_clear_large_rmappable(folio); - if (folio && !folio_ref_freeze(folio, 1)) { - folio_put(folio); - if (retry) { /* retry once */ - retry = false; - goto retry; - } - /* WOW! twice in a row. */ - pr_warn("HugeTLB unexpected inflated folio ref count\n"); - folio = NULL; - } + folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a @@ -2419,7 +2405,6 @@ static int gather_surplus_pages(struct hstate *h, long delta) long i; long needed, allocated; bool alloc_ok = true; - int node; nodemask_t *mbind_nodemask, alloc_nodemask; mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); @@ -2443,21 +2428,12 @@ retry: for (i = 0; i < needed; i++) { folio = NULL; - /* Prioritize current node */ - if (node_isset(numa_mem_id(), alloc_nodemask)) - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - numa_mem_id(), NULL); - - if (!folio) { - for_each_node_mask(node, alloc_nodemask) { - if (node == numa_mem_id()) - continue; - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - node, NULL); - if (folio) - break; - } - } + /* + * It is okay to use NUMA_NO_NODE because we use numa_mem_id() + * down the road to pick the current node if that is the case. + */ + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + NUMA_NO_NODE, &alloc_nodemask); if (!folio) { alloc_ok = false; break; @@ -2896,10 +2872,9 @@ free_new: return ret; } -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { struct hstate *h; - struct folio *folio = page_folio(page); int ret = -EBUSY; /* @@ -2949,12 +2924,20 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) while (start_pfn < end_pfn) { folio = pfn_folio(start_pfn); + + /* + * The folio might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + */ + spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { h = folio_hstate(folio); } else { + spin_unlock_irq(&hugetlb_lock); start_pfn++; continue; } + spin_unlock_irq(&hugetlb_lock); if (!folio_ref_count(folio)) { ret = alloc_and_dissolve_hugetlb_folio(h, folio, @@ -3010,7 +2993,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long retval, gbl_chg; + long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; @@ -3163,8 +3146,16 @@ out_uncharge_cgroup_reservation: hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg) - hugepage_subpool_put_pages(spool, 1); + /* + * put page to subpool iff the quota of subpool's rsv_hpages is used + * during hugepage_subpool_get_pages. + */ + if (map_chg && !gbl_chg) { + gbl_reserve = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -gbl_reserve); + } + + out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); @@ -3237,7 +3228,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, + &hugetlb_bootmem_nodes) { m = alloc_bootmem(h, node, false); if (!m) return 0; @@ -3701,6 +3693,15 @@ static void __init hugetlb_init_hstates(void) struct hstate *h, *h2; for_each_hstate(h) { + /* + * Always reset to first_memory_node here, even if + * next_nid_to_alloc was set before - we can't + * reference hugetlb_bootmem_nodes after init, and + * first_memory_node is right for all further allocations. + */ + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; + /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -4034,10 +4035,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, list_for_each_entry_safe(folio, next, src_list, lru) { int i; + bool cma; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; + cma = folio_test_hugetlb_cma(folio); + list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); @@ -4053,6 +4057,9 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, new_folio->mapping = NULL; init_new_hugetlb_folio(dst, new_folio); + /* Copy the CMA flag so that it is freed correctly */ + if (cma) + folio_set_hugetlb_cma(new_folio); list_add(&new_folio->lru, &dst_list); } } @@ -5007,6 +5014,20 @@ static int __init default_hugepagesz_setup(char *s) } hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); +void __init hugetlb_bootmem_set_nodes(void) +{ + int i, nid; + unsigned long start_pfn, end_pfn; + + if (!nodes_empty(hugetlb_bootmem_nodes)) + return; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + if (end_pfn > start_pfn) + node_set(nid, hugetlb_bootmem_nodes); + } +} + static bool __hugetlb_bootmem_allocated __initdata; bool __init hugetlb_bootmem_allocated(void) @@ -5022,6 +5043,8 @@ void __init hugetlb_bootmem_alloc(void) if (__hugetlb_bootmem_allocated) return; + hugetlb_bootmem_set_nodes(); + for (i = 0; i < MAX_NUMNODES; i++) INIT_LIST_HEAD(&huge_boot_pages[i]); @@ -5029,7 +5052,6 @@ void __init hugetlb_bootmem_alloc(void) for_each_hstate(h) { h->next_nid_to_alloc = first_online_node; - h->next_nid_to_free = first_online_node; if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -5458,18 +5480,16 @@ const struct vm_operations_struct hugetlb_vm_ops = { .pagesize = hugetlb_vm_op_pagesize, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, +static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, bool try_mkwrite) { - pte_t entry; + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); unsigned int shift = huge_page_shift(hstate_vma(vma)); if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { - entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, - vma->vm_page_prot))); + entry = pte_mkwrite_novma(pte_mkdirty(entry)); } else { - entry = huge_pte_wrprotect(mk_huge_pte(page, - vma->vm_page_prot)); + entry = pte_wrprotect(entry); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); @@ -5524,7 +5544,7 @@ static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, true); + pte_t newpte = make_huge_pte(vma, new_folio, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); @@ -5828,14 +5848,14 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page, zap_flags_t zap_flags) + struct folio *folio, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; + const bool folio_provided = !!folio; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; - struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); bool adjust_reservation = false; @@ -5899,14 +5919,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, continue; } - page = pte_page(pte); /* - * If a reference page is supplied, it is because a specific - * page is being unmapped, not a range. Ensure the page we - * are about to unmap is the actual page of interest. + * If a folio is supplied, it is because a specific + * folio is being unmapped, not a range. Ensure the folio we + * are about to unmap is the actual folio of interest. */ - if (ref_page) { - if (page != ref_page) { + if (folio_provided) { + if (folio != page_folio(pte_page(pte))) { spin_unlock(ptl); continue; } @@ -5916,12 +5935,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * looking like data was lost */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } else { + folio = page_folio(pte_page(pte)); } pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) - set_page_dirty(page); + folio_mark_dirty(folio); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) @@ -5929,7 +5950,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); - hugetlb_remove_rmap(page_folio(page)); + hugetlb_remove_rmap(folio); /* * Restore the reservation for anonymous page, otherwise the @@ -5938,8 +5959,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * reservation bit. */ if (!h->surplus_huge_pages && __vma_private_lock(vma) && - folio_test_anon(page_folio(page))) { - folio_set_hugetlb_restore_reserve(page_folio(page)); + folio_test_anon(folio)) { + folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } @@ -5963,16 +5984,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * count will not be incremented by free_huge_folio. * Act as if we consumed the reservation. */ - folio_clear_hugetlb_restore_reserve(page_folio(page)); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) vma_add_reservation(h, vma, address); } - tlb_remove_page_size(tlb, page, huge_page_size(h)); + tlb_remove_page_size(tlb, folio_page(folio, 0), + folio_size(folio)); /* - * Bail out after unmapping reference page if supplied + * If we were instructed to unmap a specific folio, we're done. */ - if (ref_page) + if (folio_provided) break; } tlb_end_vma(tlb, vma); @@ -6034,7 +6056,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page, + unsigned long end, struct folio *folio, zap_flags_t zap_flags) { struct mmu_notifier_range range; @@ -6046,7 +6068,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); - __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + __unmap_hugepage_range(&tlb, vma, start, end, + folio, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); @@ -6059,7 +6082,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * same region. */ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) + struct folio *folio, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -6103,7 +6126,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, - address + huge_page_size(h), page, 0); + address + huge_page_size(h), + folio, 0); } i_mmap_unlock_write(mapping); } @@ -6226,8 +6250,7 @@ retry_avoidcopy: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, - vmf->address); + unmap_ref_private(mm, vma, old_folio, vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); @@ -6274,7 +6297,7 @@ retry_avoidcopy: spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); + pte_t newpte = make_huge_pte(vma, new_folio, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); @@ -6554,7 +6577,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); + new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. @@ -7039,7 +7062,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - _dst_pte = make_huge_pte(dst_vma, &folio->page, + _dst_pte = make_huge_pte(dst_vma, folio, !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be @@ -7233,7 +7256,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg = -1, add = -1; + long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; @@ -7368,8 +7391,16 @@ bool hugetlb_reserve_pages(struct inode *inode, return true; out_put_pages: - /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + spool_resv = chg - gbl_reserve; + if (spool_resv) { + /* put sub pool's reservation back, chg - gbl_reserve */ + gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); + /* + * subpool's reserved pages can not be put back due to race, + * return to hstate. + */ + hugetlb_acct_memory(h, -gbl_resv); + } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); @@ -7909,3 +7940,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ALIGN_DOWN(vma->vm_end, PUD_SIZE)); } + +/* + * For hugetlb, mremap() is an odd edge case - while the VMA copying is + * performed, we permit both the old and new VMAs to reference the same + * reservation. + * + * We fix this up after the operation succeeds, or if a newly allocated VMA + * is closed as a result of a failure to allocate memory. + */ +void fixup_hugetlb_reservations(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + clear_vma_resv_huge_pages(vma); +} |