From d4056386aefdcd0637f9bb2bd52279af043f0381 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:33 +0000 Subject: mm/page_alloc: cache page_zone() result in free_unref_page() Patch series "Allocate and free frozen pages", v3. Slab does not need to use the page refcount at all, and it can avoid an atomic operation on page free. Hugetlb wants to delay setting the refcount until it has assembled a complete gigantic page. We already have the ability to freeze a page (safely reduce its reference count to 0), so this patchset adds APIs to allocate and free pages which are in a frozen state. This patchset is also a step towards the Glorious Future in which struct page doesn't have a refcount; the users which need a refcount will have one in their per-allocation memdesc. This patch (of 15): Save 17 bytes of text by calculating page_zone() once instead of twice. Link: https://lkml.kernel.org/r/20241125210149.2976098-1-willy@infradead.org Link: https://lkml.kernel.org/r/20241125210149.2976098-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Acked-by: Mel Gorman Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cae7b93864c2..a44f9ff04b1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2666,16 +2666,16 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ + zone = page_zone(page); migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } - zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { -- cgit From 520128a1d1f06657cf52d7d87252ea9a10729256 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:35 +0000 Subject: mm/page_alloc: export free_frozen_pages() instead of free_unref_page() We already have the concept of "frozen pages" (eg page_ref_freeze()), so let's not complicate things by also having the concept of "unref pages". 
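To make the "frozen" terminology concrete: page_ref_freeze() succeeds only if the refcount currently holds an expected value, atomically replacing it with zero, and page_ref_unfreeze() later restores it. Below is a minimal userspace sketch of those semantics only; the fake_page and fake_ref_* names are invented for illustration, and this is not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	atomic_int refcount;
};

/* Succeeds only when the refcount is exactly @expected, dropping it to 0. */
static bool fake_ref_freeze(struct fake_page *page, int expected)
{
	return atomic_compare_exchange_strong(&page->refcount, &expected, 0);
}

static void fake_ref_unfreeze(struct fake_page *page, int count)
{
	atomic_store(&page->refcount, count);
}

int main(void)
{
	struct fake_page page = { .refcount = 1 };

	if (fake_ref_freeze(&page, 1))
		printf("frozen, refcount now %d\n", atomic_load(&page.refcount));
	fake_ref_unfreeze(&page, 1);
	printf("unfrozen, refcount now %d\n", atomic_load(&page.refcount));
	return 0;
}

A frozen page has refcount 0 but is not free, which is exactly the state the new frozen-page allocation APIs in this series hand out directly.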
Link: https://lkml.kernel.org/r/20241125210149.2976098-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: William Kucharski Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a44f9ff04b1a..03f2491d13d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2592,9 +2592,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; } -static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, - struct page *page, int migratetype, - unsigned int order) +static void free_frozen_page_commit(struct zone *zone, + struct per_cpu_pages *pcp, struct page *page, int migratetype, + unsigned int order) { int high, batch; int pindex; @@ -2643,7 +2643,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, /* * Free a pcp page */ -void free_unref_page(struct page *page, unsigned int order) +void free_frozen_pages(struct page *page, unsigned int order) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2679,7 +2679,7 @@ void free_unref_page(struct page *page, unsigned int order) pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, FPI_NONE); @@ -2743,7 +2743,7 @@ void free_unref_folios(struct folio_batch *folios) /* * Free isolated pages directly to the - * allocator, see comment in free_unref_page. + * allocator, see comment in free_frozen_pages. */ if (is_migrate_isolate(migratetype)) { free_one_page(zone, &folio->page, pfn, @@ -2774,7 +2774,7 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, + free_frozen_page_commit(zone, pcp, &folio->page, migratetype, order); } @@ -4837,11 +4837,11 @@ void __free_pages(struct page *page, unsigned int order) struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_unref_page(page, order); + free_frozen_pages(page, order); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_unref_page(page + (1 << order), order); + free_frozen_pages(page + (1 << order), order); } } EXPORT_SYMBOL(__free_pages); -- cgit From 8fd10a892a8db797fffb59a9a60bce23a56eef46 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:36 +0000 Subject: mm/page_alloc: move set_page_refcounted() to callers of post_alloc_hook() In preparation for allocating frozen pages, stop initialising the page refcount in post_alloc_hook(). 
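The same hoisting pattern repeats through the next several patches in the series: the shared preparation helper stops touching the refcount, and each caller that wants an ordinary (refcounted) page sets it explicitly. A toy standalone sketch of the pattern, with invented toy_* names standing in for the kernel helpers:

#include <stdio.h>

struct toy_page {
	int refcount;
};

/* Common preparation now leaves the page frozen (refcount 0). */
static void toy_post_alloc_hook(struct toy_page *p)
{
	p->refcount = 0;
}

/* Callers wanting an ordinary page opt in, like set_page_refcounted(). */
static void toy_alloc_refcounted(struct toy_page *p)
{
	toy_post_alloc_hook(p);
	p->refcount = 1;
}

/* Frozen-page callers simply skip that step. */
static void toy_alloc_frozen(struct toy_page *p)
{
	toy_post_alloc_hook(p);
}

int main(void)
{
	struct toy_page a, b;

	toy_alloc_refcounted(&a);
	toy_alloc_frozen(&b);
	printf("refcounted=%d frozen=%d\n", a.refcount, b.refcount);
	return 0;
}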
Link: https://lkml.kernel.org/r/20241125210149.2976098-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03f2491d13d2..a3072047c705 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1508,7 +1508,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, int i; set_page_private(page, 0); - set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); @@ -1564,6 +1563,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { post_alloc_hook(page, order, gfp_flags); + set_page_refcounted(page); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -6360,6 +6360,7 @@ static void split_free_pages(struct list_head *list) int i; post_alloc_hook(page, order, __GFP_MOVABLE); + set_page_refcounted(page); if (!order) continue; -- cgit From ee66e9c34fd3aeab3a7c2cda300f852aa5363609 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:37 +0000 Subject: mm/page_alloc: move set_page_refcounted() to callers of prep_new_page() In preparation for allocating frozen pages, stop initialising the page refcount in prep_new_page(). Link: https://lkml.kernel.org/r/20241125210149.2976098-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3072047c705..08cc1e0bd950 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1563,7 +1563,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { post_alloc_hook(page, order, gfp_flags); - set_page_refcounted(page); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -3474,6 +3473,7 @@ try_this_zone: gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); + set_page_refcounted(page); /* * If this is a high-order atomic allocation then check @@ -3698,8 +3698,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, count_vm_event(COMPACTSTALL); /* Prep a captured page if available */ - if (page) + if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); + set_page_refcounted(page); + } /* Try get a page from the freelist if available */ if (!page) @@ -4678,6 +4680,7 @@ retry_this_zone: nr_account++; prep_new_page(page, 0, gfp, 0); + set_page_refcounted(page); if (page_list) list_add(&page->lru, page_list); else @@ -6500,6 +6503,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); + set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", -- cgit From efabfe1420f5245938871a9578160f32277c8e21 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" 
Date: Mon, 25 Nov 2024 21:01:38 +0000 Subject: mm/page_alloc: move set_page_refcounted() to callers of get_page_from_freelist() In preparation for allocating frozen pages, stop initialising the page refcount in get_page_from_freelist(). Link: https://lkml.kernel.org/r/20241125210149.2976098-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 08cc1e0bd950..2786117a50ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3473,7 +3473,6 @@ try_this_zone: gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); - set_page_refcounted(page); /* * If this is a high-order atomic allocation then check @@ -3568,6 +3567,8 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + set_page_refcounted(page); return page; } @@ -3606,8 +3607,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & ~__GFP_DIRECT_RECLAIM, order, ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); - if (page) + if (page) { + set_page_refcounted(page); goto out; + } /* Coredumps can quickly deplete all memory reserves */ if (current->flags & PF_DUMPCORE) @@ -3698,10 +3701,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, count_vm_event(COMPACTSTALL); /* Prep a captured page if available */ - if (page) { + if (page) prep_new_page(page, order, gfp_mask, alloc_flags); - set_page_refcounted(page); - } /* Try get a page from the freelist if available */ if (!page) @@ -3710,6 +3711,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (page) { struct zone *zone = page_zone(page); + set_page_refcounted(page); zone->compact_blockskip_flush = false; compaction_defer_reset(zone, order, true); count_vm_event(COMPACTSUCCESS); @@ -3968,6 +3970,7 @@ retry: drained = true; goto retry; } + set_page_refcounted(page); out: psi_memstall_leave(&pflags); @@ -4288,8 +4291,10 @@ restart: * that first */ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } /* * For costly allocations, try direct compaction first, as it's likely @@ -4369,8 +4374,10 @@ retry: /* Attempt with potentially adjusted zonelist and alloc_flags */ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) @@ -4754,8 +4761,10 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, /* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); - if (likely(page)) + if (likely(page)) { + set_page_refcounted(page); goto out; + } alloc_gfp = gfp; ac.spread_dirty_pages = false; -- cgit From 4c9017cc4c59627720b198e22757e17f67dc3590 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:39 +0000 Subject: mm/page_alloc: move set_page_refcounted() to callers of __alloc_pages_cpuset_fallback() In preparation for allocating frozen pages, stop 
initialising the page refcount in __alloc_pages_cpuset_fallback(). Link: https://lkml.kernel.org/r/20241125210149.2976098-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2786117a50ee..40ffc7f41b73 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3566,9 +3566,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, if (!page) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - - if (page) - set_page_refcounted(page); return page; } @@ -3655,6 +3652,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_NOFAIL) page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); + if (page) + set_page_refcounted(page); } out: mutex_unlock(&oom_lock); @@ -4483,8 +4482,10 @@ nopage: * the situation worse. */ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } cond_resched(); goto retry; -- cgit From df544c5eef4082d83713188b4de89e3ab2ed6772 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:40 +0000 Subject: mm/page_alloc: move set_page_refcounted() to callers of __alloc_pages_may_oom() In preparation for allocating frozen pages, stop initialising the page refcount in __alloc_pages_may_oom(). Link: https://lkml.kernel.org/r/20241125210149.2976098-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40ffc7f41b73..11cb1497f5ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3604,10 +3604,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & ~__GFP_DIRECT_RECLAIM, order, ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); - if (page) { - set_page_refcounted(page); + if (page) goto out; - } /* Coredumps can quickly deplete all memory reserves */ if (current->flags & PF_DUMPCORE) @@ -3652,8 +3650,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_NOFAIL) page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); - if (page) - set_page_refcounted(page); } out: mutex_unlock(&oom_lock); @@ -4437,8 +4433,10 @@ retry: /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } /* Avoid allocations with no watermarks from looping endlessly */ if (tsk_is_oom_victim(current) && -- cgit From 8e4c8a9702e79be37a42cd883e1008f18fe56959 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:41 +0000 Subject: mm/page_alloc: move set_page_refcounted() to callers of __alloc_pages_direct_compact() In preparation for allocating frozen pages, stop initialising the page refcount in 
__alloc_pages_direct_compact(). Link: https://lkml.kernel.org/r/20241125210149.2976098-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 11cb1497f5ba..b11cf4ad78e2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3706,7 +3706,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (page) { struct zone *zone = page_zone(page); - set_page_refcounted(page); zone->compact_blockskip_flush = false; compaction_defer_reset(zone, order, true); count_vm_event(COMPACTSUCCESS); @@ -4308,8 +4307,10 @@ restart: alloc_flags, ac, INIT_COMPACT_PRIORITY, &compact_result); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } /* * Checks for costly allocations with __GFP_NORETRY, which @@ -4391,8 +4392,10 @@ retry: /* Try direct compaction and then allocating */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, compact_priority, &compact_result); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) -- cgit From 30fdb6df4cb3641de4c5be9b87f3a2a1fe2fe72c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:42 +0000 Subject: mm/page_alloc: move set_page_refcounted() to callers of __alloc_pages_direct_reclaim() In preparation for allocating frozen pages, stop initialising the page refcount in __alloc_pages_direct_reclaim(). Link: https://lkml.kernel.org/r/20241125210149.2976098-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b11cf4ad78e2..835b1491610d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3964,7 +3964,6 @@ retry: drained = true; goto retry; } - set_page_refcounted(page); out: psi_memstall_leave(&pflags); @@ -4386,8 +4385,10 @@ retry: /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } /* Try direct compaction and then allocating */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, -- cgit From a88de400e3d22035a9bf6e818808013ccccfe410 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:43 +0000 Subject: mm/page_alloc: move set_page_refcounted() to callers of __alloc_pages_slowpath() In preparation for allocating frozen pages, stop initialising the page refcount in __alloc_pages_slowpath(). 
Link: https://lkml.kernel.org/r/20241125210149.2976098-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 835b1491610d..e2e5cd899abd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4284,10 +4284,8 @@ restart: * that first */ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - if (page) { - set_page_refcounted(page); + if (page) goto got_pg; - } /* * For costly allocations, try direct compaction first, as it's likely @@ -4306,10 +4304,8 @@ restart: alloc_flags, ac, INIT_COMPACT_PRIORITY, &compact_result); - if (page) { - set_page_refcounted(page); + if (page) goto got_pg; - } /* * Checks for costly allocations with __GFP_NORETRY, which @@ -4369,10 +4365,8 @@ retry: /* Attempt with potentially adjusted zonelist and alloc_flags */ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - if (page) { - set_page_refcounted(page); + if (page) goto got_pg; - } /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) @@ -4385,18 +4379,14 @@ retry: /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); - if (page) { - set_page_refcounted(page); + if (page) goto got_pg; - } /* Try direct compaction and then allocating */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, compact_priority, &compact_result); - if (page) { - set_page_refcounted(page); + if (page) goto got_pg; - } /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) @@ -4437,10 +4427,8 @@ retry: /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); - if (page) { - set_page_refcounted(page); + if (page) goto got_pg; - } /* Avoid allocations with no watermarks from looping endlessly */ if (tsk_is_oom_victim(current) && @@ -4484,10 +4472,8 @@ nopage: * the situation worse. */ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); - if (page) { - set_page_refcounted(page); + if (page) goto got_pg; - } cond_resched(); goto retry; @@ -4779,6 +4765,8 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, ac.nodemask = nodemask; page = __alloc_pages_slowpath(alloc_gfp, order, &ac); + if (page) + set_page_refcounted(page); out: if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && -- cgit From c972106db3550f7757c1f984ed9852d69cf1fd69 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:44 +0000 Subject: mm/page_alloc: move set_page_refcounted() to end of __alloc_pages() Remove some code duplication by calling set_page_refcounted() at the end of __alloc_pages() instead of after each call that can allocate a page. That means that we free a frozen page if we've exceeded the allowed memcg memory. 
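The key ordering consequence of this patch is that the memcg charge (__memcg_kmem_charge_page() in the diff below) is now attempted while the page is still frozen, so the failure path can use free_frozen_pages() with no atomic refcount drop, and the refcount is initialised in exactly one place on success. A toy standalone model of that tail, with all toy_* names invented for the sketch:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_page {
	int refcount;
};

/* Stand-in for the memcg charge; fails when asked to. */
static bool toy_charge(bool should_fail)
{
	return !should_fail;
}

static struct toy_page *toy_alloc(bool charge_fails)
{
	/* the page arrives frozen: refcount 0 */
	struct toy_page *page = calloc(1, sizeof(*page));

	if (page && !toy_charge(charge_fails)) {
		free(page);		/* analogous to free_frozen_pages() */
		page = NULL;
	}
	if (page)
		page->refcount = 1;	/* the single set_page_refcounted() site */
	return page;
}

int main(void)
{
	struct toy_page *page = toy_alloc(false);

	printf("charge ok: %s\n", page ? "refcounted page" : "NULL");
	free(page);
	printf("charge failed: %s\n", toy_alloc(true) ? "page" : "NULL");
	return 0;
}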
Link: https://lkml.kernel.org/r/20241125210149.2976098-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2e5cd899abd..df5b61592792 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4750,10 +4750,8 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, /* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); - if (likely(page)) { - set_page_refcounted(page); + if (likely(page)) goto out; - } alloc_gfp = gfp; ac.spread_dirty_pages = false; @@ -4765,15 +4763,15 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, ac.nodemask = nodemask; page = __alloc_pages_slowpath(alloc_gfp, order, &ac); - if (page) - set_page_refcounted(page); out: if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { - __free_pages(page, order); + free_frozen_pages(page, order); page = NULL; } + if (page) + set_page_refcounted(page); trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); kmsan_alloc_page(page, order, alloc_gfp); -- cgit From 49249a2a5eeb912e66951dfe9a5924eb0dd14d50 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:45 +0000 Subject: mm/page_alloc: add __alloc_frozen_pages() Defer the initialisation of the page refcount to the new __alloc_pages() wrapper and turn the old __alloc_pages() into __alloc_frozen_pages(). Link: https://lkml.kernel.org/r/20241125210149.2976098-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df5b61592792..7a2853b7967d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4713,8 +4713,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* * This is the 'heart' of the zoned buddy allocator. 
*/ -struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, - int preferred_nid, nodemask_t *nodemask) +struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4770,14 +4770,24 @@ out: free_frozen_pages(page, order); page = NULL; } - if (page) - set_page_refcounted(page); trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); kmsan_alloc_page(page, order, alloc_gfp); return page; } +EXPORT_SYMBOL(__alloc_frozen_pages_noprof); + +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct page *page; + + page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask); + if (page) + set_page_refcounted(page); + return page; +} EXPORT_SYMBOL(__alloc_pages_noprof); struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, -- cgit From 6025ea5abbe5d813d6a41c78e6ea14259fb503f4 Mon Sep 17 00:00:00 2001 From: gaoxiang17 Date: Fri, 20 Sep 2024 20:20:30 +0800 Subject: mm/page_alloc: add some detailed comments in can_steal_fallback [akpm@linux-foundation.org: tweak grammar, fit to 80 cols] Link: https://lkml.kernel.org/r/20240920122030.159751-1-gxxa03070307@gmail.com Signed-off-by: gaoxiang17 Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a2853b7967d..a887ba2cc91d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1855,6 +1855,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt) if (order >= pageblock_order) return true; + /* + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, you just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. Unmovable and + * reclaimable allocations need to actually steal pages. + */ if (order >= pageblock_order / 2 || start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE || -- cgit From dd467f92db404ca2e061889ad1b6dd6221390222 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 3 Dec 2024 11:20:50 +0100 Subject: mm/memory_hotplug: move debug_pagealloc_map_pages() into online_pages_range() In the near future, we want to have a single way to hand over PageOffline pages to the buddy, whereby they could have: (a) Never been exposed to the buddy before: kept PageOffline when onlining the memory block. (b) Been allocated from the buddy, for example using alloc_contig_range() to then be set PageOffline. Let's start by making generic_online_page()->__free_pages_core() less special compared to ordinary page freeing (e.g., free_contig_range()), and perform the debug_pagealloc_map_pages() call unconditionally, even when the online callback might decide to keep the pages offline. All pages are already initialized with PageOffline, so nobody touches them either way.
Link: https://lkml.kernel.org/r/20241203102050.223318-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a887ba2cc91d..75de3711784e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1295,12 +1295,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, set_page_count(p, 0); } - /* - * Freeing the page with debug_pagealloc enabled will try to - * unmap it; some archs don't like double-unmappings, so - * map it first. - */ - debug_pagealloc_map_pages(page, nr_pages); adjust_managed_page_count(page, nr_pages); } else { for (loop = 0; loop < nr_pages; loop++, p++) { -- cgit From b9e40605daa94ae1817ceb5ce9e9b34093c6d850 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 3 Dec 2024 10:47:28 +0100 Subject: mm/page_isolation: don't pass gfp flags to start_isolate_page_range() The parameter is unused, so let's stop passing it. Link: https://lkml.kernel.org/r/20241203094732.200195-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Reviewed-by: Vishal Moola (Oracle) Cc: Christophe Leroy Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 75de3711784e..98ffcd2bf153 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6447,7 +6447,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); + ret = start_isolate_page_range(start, end, migratetype, 0); if (ret) goto done; -- cgit From 2202f51e9215bf9601402d6aa09e10c1dbb2a72d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 3 Dec 2024 10:47:29 +0100 Subject: mm/page_alloc: make __alloc_contig_migrate_range() static The single user is in page_alloc.c. Link: https://lkml.kernel.org/r/20241203094732.200195-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Reviewed-by: Vishal Moola (Oracle) Cc: Christophe Leroy Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 98ffcd2bf153..84067bc29740 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6280,9 +6280,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list) * @migratetype: using migratetype to filter the type of migration in * trace_mm_alloc_contig_migrate_range_info. */ -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end, int migratetype) { /* This function is based on compact_zone() from compaction.c. 
*/ unsigned int nr_reclaimed; -- cgit From f6037a4a686523dee1967ef7620349822e019ff8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 3 Dec 2024 10:47:30 +0100 Subject: mm/page_alloc: sort out the alloc_contig_range() gfp flags mess It's all a bit complicated for alloc_contig_range(). For example, we don't support many flags, so let's start bailing out on unsupported ones -- ignoring the placement hints, as we are already given the range to allocate. While we currently set cc.gfp_mask, in __alloc_contig_migrate_range() we simply create yet another GFP mask whereby we ignore the reclaim flags specified by the caller. That looks very inconsistent. Let's clean it up, constructing the gfp flags used for compaction/migration exactly once. Update the documentation of the gfp_mask parameter for alloc_contig_range() and alloc_contig_pages(). Link: https://lkml.kernel.org/r/20241203094732.200195-5-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Cc: Christophe Leroy Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 84067bc29740..a40a81c57259 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6290,7 +6290,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, int ret = 0; struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = cc->gfp_mask, .reason = MR_CONTIG_RANGE, }; struct page *page; @@ -6386,6 +6386,39 @@ static void split_free_pages(struct list_head *list) } } +static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) +{ + const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + + /* + * We are given the range to allocate; node, mobility and placement + * hints are irrelevant at this point. We'll simply ignore them. + */ + gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE | + __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE); + + /* + * We only support most reclaim flags (but not NOFAIL/NORETRY), and + * selected action flags. + */ + if (gfp_mask & ~(reclaim_mask | action_mask)) + return -EINVAL; + + /* + * Flags to control page compaction/migration/reclaim, to free up our + * page range. Migratable pages are movable, __GFP_MOVABLE is implied + * for them. + * + * Traditionally we always had __GFP_HARDWALL|__GFP_RETRY_MAYFAIL set, + * keep doing that to not degrade callers. + */ + *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | + __GFP_HARDWALL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; + return 0; +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate * @migratetype: migratetype of the underlying pageblocks (either * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. - * @gfp_mask: GFP mask to use during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some + * action and reclaim modifiers are supported.
Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * * The PFN range does not have to be pageblock aligned. The PFN range must * belong to a single zone. @@ -6420,11 +6455,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .no_set_skip_hint = true, - .gfp_mask = current_gfp_context(gfp_mask), .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); + gfp_mask = current_gfp_context(gfp_mask); + if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) + return -EINVAL; + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -6567,7 +6605,9 @@ static bool zone_spans_last_pfn(const struct zone *zone, /** * alloc_contig_pages() -- tries to find and allocate contiguous range of pages * @nr_pages: Number of contiguous pages to allocate - * @gfp_mask: GFP mask to limit search and used during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * @nid: Target node * @nodemask: Mask for other possible nodes * -- cgit From 7b755570064fcb9cde37afd48f6bc65151097ba7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 3 Dec 2024 10:47:31 +0100 Subject: mm/page_alloc: forward the gfp flags from alloc_contig_range() to post_alloc_hook() In the __GFP_COMP case, we already pass the gfp_flags to prep_new_page()->post_alloc_hook(). However, in the !__GFP_COMP case, we essentially pass only hardcoded __GFP_MOVABLE to post_alloc_hook(), preventing some action modifiers from being effective. Let's pass our now properly adjusted gfp flags there as well. This way, we can now support __GFP_ZERO for alloc_contig_*(). As a side effect, we now also support __GFP_SKIP_ZERO and __GFP_ZEROTAGS; but we'll keep the more special stuff (KASAN, NOLOCKDEP) disabled for now. It's worth noting that with __GFP_ZERO, we might unnecessarily zero pages when we have to release part of our range using free_contig_range() again. This can be optimized in the future, if ever required; the caller we'll be converting (powernv/memtrace) next won't trigger this. Link: https://lkml.kernel.org/r/20241203094732.200195-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Cc: Christophe Leroy Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Vishal Moola (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a40a81c57259..a52c6022c65c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6360,7 +6360,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ?
ret : 0; } -static void split_free_pages(struct list_head *list) +static void split_free_pages(struct list_head *list, gfp_t gfp_mask) { int order; @@ -6371,7 +6371,7 @@ static void split_free_pages(struct list_head *list) list_for_each_entry_safe(page, next, &list[order], lru) { int i; - post_alloc_hook(page, order, __GFP_MOVABLE); + post_alloc_hook(page, order, gfp_mask); set_page_refcounted(page); if (!order) continue; @@ -6389,7 +6389,8 @@ static void split_free_pages(struct list_head *list) static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) { const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; - const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO; const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; /* @@ -6537,7 +6538,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, } if (!(gfp_mask & __GFP_COMP)) { - split_free_pages(cc.freepages); + split_free_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ if (start != outer_start) -- cgit From f58498b72638262bd1b8d63143c5cf71761d57b7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 5 Dec 2024 10:05:07 +0100 Subject: mm/page_alloc: don't use __GFP_HARDWALL when migrating pages via alloc_contig*() Patch series "mm: don't use __GFP_HARDWALL when migrating remote pages". __GFP_HARDWALL means that we will be respecting the cpuset of the caller when allocating a page. However, when we are migrating remote allocations (pages allocated from another context), the cpuset of the current context is irrelevant. For memory offlining + alloc_contig_*(), this is rather obvious. There might be other such page migration users; let's start with the obvious ones. This patch (of 2): We'll migrate pages allocated by other contexts; respecting the cpuset of the alloc_contig*() caller when allocating a migration target does not make sense. Drop the __GFP_HARDWALL. Note that in an ideal world, migration code could figure out the cpuset of the original context and take that into consideration. Link: https://lkml.kernel.org/r/20241205090508.2095225-1-david@redhat.com Link: https://lkml.kernel.org/r/20241205090508.2095225-2-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a52c6022c65c..dde19db204b2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6412,11 +6412,11 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) * page range. Migratable pages are movable, __GFP_MOVABLE is implied * for them. * - * Traditionally we always had __GFP_HARDWALL|__GFP_RETRY_MAYFAIL set, - * keep doing that to not degrade callers. + * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that + * to not degrade callers.
*/ *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | - __GFP_HARDWALL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; + __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; return 0; } -- cgit From 04f13d241b8b146b23038bffd907cb8278391d07 Mon Sep 17 00:00:00 2001 From: yangge Date: Sat, 11 Jan 2025 15:58:20 +0800 Subject: mm: replace free hugepage folios after migration My machine has 4 NUMA nodes, each equipped with 32GB of memory. I have configured each NUMA node with 16GB of CMA and 16GB of in-use hugetlb pages. The allocation of contiguous memory via cma_alloc() can fail probabilistically. When there are free hugetlb folios in the hugetlb pool, during the migration of in-use hugetlb folios, new folios are allocated from the free hugetlb pool. After the migration is completed, the old folios are released back to the free hugetlb pool instead of being returned to the buddy system. This can cause the test_pages_isolated() check to fail, ultimately leading to the failure of cma_alloc(). Call trace: cma_alloc() __alloc_contig_migrate_range() // migrate in-use hugepage test_pages_isolated() __test_page_isolated_in_pageblock() PageBuddy(page) // check if the page is in buddy To address this issue, we introduce a function named replace_free_hugepage_folios(). This function will replace the hugepage in the free hugepage pool with a new one and release the old one to the buddy system. After the migration of in-use hugetlb pages is completed, we will invoke replace_free_hugepage_folios() to ensure that these hugepages are properly released to the buddy system. Following this step, when test_pages_isolated() is executed for inspection, it will successfully pass. Additionally, when alloc_contig_range() is used to migrate multiple in-use hugetlb pages, it can result in some in-use hugetlb pages being released back to the free hugetlb pool and subsequently being reallocated and used again. For example: [huge 0] [huge 1] To migrate huge 0, we obtain huge x from the pool. After the migration is completed, we return the now-freed huge 0 back to the pool. When it's time to migrate huge 1, we can simply reuse the now-freed huge 0 from the pool. As a result, when replace_free_hugepage_folios() is executed, it cannot release huge 0 back to the buddy system. To address this issue, we should prevent the reuse of isolated free hugepages during the migration process. Link: https://lkml.kernel.org/r/1734503588-16254-1-git-send-email-yangge1116@126.com Link: https://lkml.kernel.org/r/1736582300-11364-1-git-send-email-yangge1116@126.com Signed-off-by: yangge Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dde19db204b2..df5d617738e7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6504,7 +6504,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; - ret = 0; + + /* + * When in-use hugetlb pages are migrated, they may simply be released + * back into the free hugepage pool instead of being returned to the + * buddy system. After the migration of in-use huge pages is completed, + * we will invoke replace_free_hugepage_folios() to ensure that these + * hugepages are properly released to the buddy system.
+ */ + ret = replace_free_hugepage_folios(start, end); + if (ret) + goto done; /* * Pages from [start, end) are within a pageblock_nr_pages -- cgit From c8b979530f27f90c0353a189b2faa6e50a0ea94a Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 23 Dec 2024 17:00:37 -0500 Subject: mm: alloc_pages_bulk_noprof: drop page_list argument Patch series "mm: alloc_pages_bulk: small API refactor", v2. Today, alloc_pages_bulk_noprof() supports two arguments to return allocated pages: a linked list and an array. There are also higher level APIs for both. However, the linked list API has apparently never been used. So, this series removes it along with the list API and also refactors the remaining API naming for consistency. This patch (of 2): commit 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator") added __alloc_pages_bulk() along with the page_list argument. The next commit 0f87d9d30f21 ("mm/page_alloc: add an array-based interface to the bulk page allocator") added the array-based argument. As it turns out, the page_list argument has no users in the current tree (if it ever had any). Dropping it allows for a slight simplification and eliminates some unnecessary checks, now that page_array is required. Also, note that the removal of the page_list argument was proposed before in the thread below, where Matthew Wilcox mentions that: """ Iterating a linked list is _expensive_. It is about 10x quicker to iterate an array than a linked list. """ (https://lore.kernel.org/linux-mm/20231025093254.xvomlctwhcuerzky@techsingularity.net) Link: https://lkml.kernel.org/r/cover.1734991165.git.luizcap@redhat.com Link: https://lkml.kernel.org/r/f1c75db91d08cafd211eca6a3b199b629d4ffe16.1734991165.git.luizcap@redhat.com Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- mm/page_alloc.c | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df5d617738e7..7cdf1ac3e32a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4531,28 +4531,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array * @gfp: GFP flags for the allocation * @preferred_nid: The preferred NUMA node ID to allocate from * @nodemask: Set of nodes to allocate from, may be NULL - * @nr_pages: The number of pages desired on the list or array - * @page_list: Optional list to store the allocated pages - * @page_array: Optional array to store the pages + * @nr_pages: The number of pages desired in the array + * @page_array: Array to store the pages * * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. Pages are added to page_list if page_list - * is not NULL, otherwise it is assumed that the page_array is valid. + * allocate nr_pages quickly. Pages are added to the page_array. * - * For lists, nr_pages is the number of pages that should be allocated. - * - * For arrays, only NULL elements are populated with pages and nr_pages + * Note that only NULL elements are populated with pages and nr_pages * is the maximum number of pages that will be stored in the array. * - * Returns the number of pages on the list or array. + * Returns the number of pages in the array. 
*/ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array) { struct page *page; @@ -4570,7 +4565,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, * Skip populated array elements to determine if any pages need * to be allocated before disabling IRQs. */ - while (page_array && nr_populated < nr_pages && page_array[nr_populated]) + while (nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; /* No pages requested? */ @@ -4578,7 +4573,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto out; /* Already populated array? */ - if (unlikely(page_array && nr_pages - nr_populated == 0)) + if (unlikely(nr_pages - nr_populated == 0)) goto out; /* Bulk allocator does not support memcg accounting. */ @@ -4660,7 +4655,7 @@ retry_this_zone: while (nr_populated < nr_pages) { /* Skip existing pages */ - if (page_array && page_array[nr_populated]) { + if (page_array[nr_populated]) { nr_populated++; continue; } @@ -4679,11 +4674,7 @@ retry_this_zone: prep_new_page(page, 0, gfp, 0); set_page_refcounted(page); - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; + page_array[nr_populated++] = page; } pcp_spin_unlock(pcp); @@ -4700,14 +4691,8 @@ failed_irq: failed: page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); - if (page) { - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; - } - + if (page) + page_array[nr_populated++] = page; goto out; } EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); -- cgit From 686fa9537d78d2f1bea42bf3891828510202be14 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 15 Jan 2025 12:16:34 +0800 Subject: mm/page_alloc: remove the incorrect and misleading comment The comment removed in this patch originally belonged to the build_zonelists_in_zone_order() function, which was introduced by commit f0c0b2b808f2 ("change zonelist order: zonelist order selection logic"). Later, commit c9bff3eebc09 ("mm, page_alloc: rip out ZONELIST_ORDER_ZONE") removed build_zonelists_in_zone_order() but left its comment behind. Subsequently, commit 9d3be21bf9c0 ("mm, page_alloc: simplify zonelist initialization") moved the node_order variable into build_zonelists(), making the comment that originally belonged to build_zonelists_in_zone_order() appear as if it were part of build_zonelists(). Remove this misleading comment. Link: https://lkml.kernel.org/r/20250115041634.63387-1-yuntao.wang@linux.dev Signed-off-by: Yuntao Wang Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cdf1ac3e32a..4d3972b99c93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5159,13 +5159,6 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) zonerefs->zone_idx = 0; } -/* - * Build zonelists ordered by zone and nodes within zones. - * This results in conserving DMA zone[s] until all Normal memory is - * exhausted, but results in overflowing to remote node while memory - * may still exist in local DMA zone. - */ - static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; -- cgit
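As a closing illustration of the array-only bulk-allocation contract adopted in the alloc_pages_bulk_noprof() patch above -- only NULL slots are filled, and the return value counts every populated entry, so callers can retry with the same array -- here is a toy userspace model with invented toy_* names, not the kernel code:

#include <stdio.h>
#include <stdlib.h>

/* Fills only NULL slots; returns the total number of populated entries. */
static int toy_alloc_bulk(int nr_pages, void **page_array)
{
	int nr_populated = 0;

	/* Skip already-populated leading slots, as the kernel version does. */
	while (nr_populated < nr_pages && page_array[nr_populated])
		nr_populated++;

	while (nr_populated < nr_pages) {
		if (page_array[nr_populated]) {	/* skip existing entries */
			nr_populated++;
			continue;
		}
		page_array[nr_populated] = malloc(4096); /* stand-in for a page */
		if (!page_array[nr_populated])
			break;
		nr_populated++;
	}
	return nr_populated;
}

int main(void)
{
	void *pages[4] = { NULL, NULL, NULL, NULL };
	int i;

	pages[1] = malloc(4096);	/* pre-populated slot is left alone */
	printf("populated: %d\n", toy_alloc_bulk(4, pages));	/* prints 4 */
	for (i = 0; i < 4; i++)
		free(pages[i]);
	return 0;
}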