From 51ff4d7486f0c0b4110a6da4af805b179dd7b11e Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Sat, 1 Feb 2025 15:18:00 -0800
Subject: mm: avoid extra mem_alloc_profiling_enabled() checks

Refactor code to avoid extra mem_alloc_profiling_enabled() checks inside
the pgalloc_tag_get() function, which is often called after that check has
already been done.

Link: https://lkml.kernel.org/r/20250201231803.2661189-1-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: David Wang <00107082@163.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Minchan Kim <minchan@google.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zhenhua Huang <quic_zhenhuah@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 542d25f77be8..38c2b2d20b1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4833,12 +4833,11 @@ void __free_pages(struct page *page, unsigned int order)
 {
 	/* get PageHead before we drop reference */
 	int head = PageHead(page);
-	struct alloc_tag *tag = pgalloc_tag_get(page);
 
 	if (put_page_testzero(page))
 		free_frozen_pages(page, order);
 	else if (!head) {
-		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+		pgalloc_tag_sub_pages(page, (1 << order) - 1);
 		while (order-- > 0)
 			free_frozen_pages(page + (1 << order), order);
 	}
-- 
cgit 


From 93d5440ece3c0aa341fb02e3a44a1b7ab44304c8 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Sat, 1 Feb 2025 15:18:02 -0800
Subject: alloc_tag: uninline code gated by mem_alloc_profiling_key in page
 allocator

When a sizable code section is protected by a disabled static key, that
code gets into the instruction cache even though it's not executed and
consumes the cache, increasing cache misses. This can be remedied by
moving such code into a separate uninlined function.
On a Pixel6 phone, page allocation profiling overhead measured with
CONFIG_MEM_ALLOC_PROFILING=y and profiling disabled is:

             baseline             modified
Big core     4.93%                1.53%
Medium core  4.39%                1.41%
Little core  1.02%                0.36%

This improvement comes at the expense of the configuration when profiling
gets enabled, since there is now an additional function call. The overhead
from this additional call on Pixel6 is:

Big core     0.24%
Medium core  0.63%
Little core  1.1%

However this is negligible when compared with the overall overhead of the
memory allocation profiling when it is enabled.
On x86 this patch does not make a noticeable difference because the overhead
with mem_alloc_profiling_key disabled is much lower (under 1%) to start
with, so any improvement is less visible and hard to distinguish from the
noise. The overhead from the additional call when profiling is enabled is
also within noise levels.

Link: https://lkml.kernel.org/r/20250201231803.2661189-3-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: David Wang <00107082@163.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Minchan Kim <minchan@google.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zhenhua Huang <quic_zhenhuah@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 38c2b2d20b1d..d875f055aa53 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1041,6 +1041,84 @@ static void kernel_init_pages(struct page *page, int numpages)
 	kasan_enable_current();
 }
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page)
+{
+	union pgtag_ref_handle handle;
+	union codetag_ref ref;
+
+	if (get_page_tag_ref(page, &ref, &handle)) {
+		set_codetag_empty(&ref);
+		update_page_tag_ref(handle, &ref);
+		put_page_tag_ref(handle);
+	}
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_add(struct page *page, struct task_struct *task,
+		       unsigned int nr)
+{
+	union pgtag_ref_handle handle;
+	union codetag_ref ref;
+
+	if (get_page_tag_ref(page, &ref, &handle)) {
+		alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+		update_page_tag_ref(handle, &ref);
+		put_page_tag_ref(handle);
+	}
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+				   unsigned int nr)
+{
+	if (mem_alloc_profiling_enabled())
+		__pgalloc_tag_add(page, task, nr);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+	union pgtag_ref_handle handle;
+	union codetag_ref ref;
+
+	if (get_page_tag_ref(page, &ref, &handle)) {
+		alloc_tag_sub(&ref, PAGE_SIZE * nr);
+		update_page_tag_ref(handle, &ref);
+		put_page_tag_ref(handle);
+	}
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+	if (mem_alloc_profiling_enabled())
+		__pgalloc_tag_sub(page, nr);
+}
+
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
+{
+	struct alloc_tag *tag;
+
+	if (!mem_alloc_profiling_enabled())
+		return;
+
+	tag = __pgalloc_tag_get(page);
+	if (tag)
+		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+				   unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
 __always_inline bool free_pages_prepare(struct page *page,
 			unsigned int order)
 {
-- 
cgit 


From c2f6ea38fc1b640aa7a2e155cc1c0410ff91afa2 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 24 Feb 2025 19:08:24 -0500
Subject: mm: page_alloc: don't steal single pages from biggest buddy

The fallback code searches for the biggest buddy first in an attempt to
steal the whole block and encourage type grouping down the line.

The approach used to be this:

- Non-movable requests will split the largest buddy and steal the
  remainder. This splits up contiguity, but it allows subsequent
  requests of this type to fall back into adjacent space.

- Movable requests go and look for the smallest buddy instead. The
  thinking is that movable requests can be compacted, so grouping is
  less important than retaining contiguity.

c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block
conversion") enforces freelist type hygiene, which restricts stealing to
either claiming the whole block or just taking the requested chunk; no
additional pages or buddy remainders can be stolen any more.

The patch mishandled when to switch to finding the smallest buddy in that
new reality.  As a result, it may steal the exact request size, but from
the biggest buddy.  This causes fracturing for no good reason.

Fix this by committing to the new behavior: either steal the whole block,
or fall back to the smallest buddy.

Remove single-page stealing from steal_suitable_fallback().  Rename it to
try_to_steal_block() to make the intentions clear.  If this fails, always
fall back to the smallest buddy.

The following is from 4 runs of mmtest's thpchallenge.  "Pollute" is
single page fallback, "steal" is conversion of a partially used block.
The numbers for free block conversions (omitted) are comparable.

				     vanilla	      patched

@pollute[unmovable from reclaimable]:	  27		  106
@pollute[unmovable from movable]:	  82		   46
@pollute[reclaimable from unmovable]:	 256		   83
@pollute[reclaimable from movable]:	  46		    8
@pollute[movable from unmovable]:	4841		  868
@pollute[movable from reclaimable]:	5278		12568

@steal[unmovable from reclaimable]:	  11		   12
@steal[unmovable from movable]:		 113		   49
@steal[reclaimable from unmovable]:	  19		   34
@steal[reclaimable from movable]:	  47		   21
@steal[movable from unmovable]:		 250		  183
@steal[movable from reclaimable]:	  81		   93

The allocator appears to do a better job at keeping stealing and polluting
to the first fallback preference.  As a result, the numbers for "from
movable" - the least preferred fallback option, and most detrimental to
compactability - are down across the board.

Link: https://lkml.kernel.org/r/20250225001023.1494422-2-hannes@cmpxchg.org
Fixes: c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Brendan Jackman <jackmanb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 80 ++++++++++++++++++++++++---------------------------------
 1 file changed, 34 insertions(+), 46 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d875f055aa53..462f0e5342e5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1986,13 +1986,12 @@ static inline bool boost_watermark(struct zone *zone)
  * can claim the whole pageblock for the requested migratetype. If not, we check
  * the pageblock for constituent pages; if at least half of the pages are free
  * or compatible, we can still claim the whole block, so pages freed in the
- * future will be put on the correct free list. Otherwise, we isolate exactly
- * the order we need from the fallback block and leave its migratetype alone.
+ * future will be put on the correct free list.
  */
 static struct page *
-steal_suitable_fallback(struct zone *zone, struct page *page,
-			int current_order, int order, int start_type,
-			unsigned int alloc_flags, bool whole_block)
+try_to_steal_block(struct zone *zone, struct page *page,
+		   int current_order, int order, int start_type,
+		   unsigned int alloc_flags)
 {
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
@@ -2005,7 +2004,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
 	 * highatomic accounting.
 	 */
 	if (is_migrate_highatomic(block_type))
-		goto single_page;
+		return NULL;
 
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
@@ -2026,14 +2025,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
 	if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
 		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
 
-	/* We are not allowed to try stealing from the whole block */
-	if (!whole_block)
-		goto single_page;
-
 	/* moving whole block can fail due to zone boundary conditions */
 	if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
 				       &movable_pages))
-		goto single_page;
+		return NULL;
 
 	/*
 	 * Determine how many pages are compatible with our allocation.
@@ -2066,9 +2061,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
 		return __rmqueue_smallest(zone, order, start_type);
 	}
 
-single_page:
-	page_del_and_expand(zone, page, order, current_order, block_type);
-	return page;
+	return NULL;
 }
 
 /*
@@ -2250,14 +2243,19 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 }
 
 /*
- * Try finding a free buddy page on the fallback list and put it on the free
- * list of requested migratetype, possibly along with other pages from the same
- * block, depending on fragmentation avoidance heuristics. Returns true if
- * fallback was found so that __rmqueue_smallest() can grab it.
+ * Try finding a free buddy page on the fallback list.
+ *
+ * This will attempt to steal a whole pageblock for the requested type
+ * to ensure grouping of such requests in the future.
+ *
+ * If a whole block cannot be stolen, regress to __rmqueue_smallest()
+ * logic to at least break up as little contiguity as possible.
  *
  * The use of signed ints for order and current_order is a deliberate
  * deviation from the rest of this file, to make the for loop
  * condition simpler.
+ *
+ * Return the stolen page, or NULL if none can be found.
  */
 static __always_inline struct page *
 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
@@ -2291,45 +2289,35 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 		if (fallback_mt == -1)
 			continue;
 
-		/*
-		 * We cannot steal all free pages from the pageblock and the
-		 * requested migratetype is movable. In that case it's better to
-		 * steal and split the smallest available page instead of the
-		 * largest available page, because even if the next movable
-		 * allocation falls back into a different pageblock than this
-		 * one, it won't cause permanent fragmentation.
-		 */
-		if (!can_steal && start_migratetype == MIGRATE_MOVABLE
-					&& current_order > order)
-			goto find_smallest;
+		if (!can_steal)
+			break;
 
-		goto do_steal;
+		page = get_page_from_free_area(area, fallback_mt);
+		page = try_to_steal_block(zone, page, current_order, order,
+					  start_migratetype, alloc_flags);
+		if (page)
+			goto got_one;
 	}
 
-	return NULL;
+	if (alloc_flags & ALLOC_NOFRAGMENT)
+		return NULL;
 
-find_smallest:
+	/* No luck stealing blocks. Find the smallest fallback page */
 	for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
 				start_migratetype, false, &can_steal);
-		if (fallback_mt != -1)
-			break;
-	}
-
-	/*
-	 * This should not happen - we already found a suitable fallback
-	 * when looking for the largest page.
-	 */
-	VM_BUG_ON(current_order > MAX_PAGE_ORDER);
+		if (fallback_mt == -1)
+			continue;
 
-do_steal:
-	page = get_page_from_free_area(area, fallback_mt);
+		page = get_page_from_free_area(area, fallback_mt);
+		page_del_and_expand(zone, page, order, current_order, fallback_mt);
+		goto got_one;
+	}
 
-	/* take off list, maybe claim block, expand remainder */
-	page = steal_suitable_fallback(zone, page, current_order, order,
-				       start_migratetype, alloc_flags, can_steal);
+	return NULL;
 
+got_one:
 	trace_mm_page_alloc_extfrag(page, order, current_order,
 		start_migratetype, fallback_mt);
 
-- 
cgit 


From 020396a581dc69be2d30939fabde6c029d847034 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 24 Feb 2025 19:08:25 -0500
Subject: mm: page_alloc: remove remnants of unlocked migratetype updates

The freelist hygiene patches made migratetype accesses fully protected
under the zone->lock.  Remove remnants of handling the race conditions
that existed before from the MIGRATE_HIGHATOMIC code.

Link: https://lkml.kernel.org/r/20250225001023.1494422-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Brendan Jackman <jackmanb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 50 ++++++++++++++++----------------------------------
 1 file changed, 16 insertions(+), 34 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 462f0e5342e5..9e6f0db6c79f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1991,20 +1991,10 @@ static inline bool boost_watermark(struct zone *zone)
 static struct page *
 try_to_steal_block(struct zone *zone, struct page *page,
 		   int current_order, int order, int start_type,
-		   unsigned int alloc_flags)
+		   int block_type, unsigned int alloc_flags)
 {
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
-	int block_type;
-
-	block_type = get_pageblock_migratetype(page);
-
-	/*
-	 * This can happen due to races and we want to prevent broken
-	 * highatomic accounting.
-	 */
-	if (is_migrate_highatomic(block_type))
-		return NULL;
 
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
@@ -2179,33 +2169,22 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < NR_PAGE_ORDERS; order++) {
 			struct free_area *area = &(zone->free_area[order]);
-			int mt;
+			unsigned long size;
 
 			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
 			if (!page)
 				continue;
 
-			mt = get_pageblock_migratetype(page);
 			/*
-			 * In page freeing path, migratetype change is racy so
-			 * we can counter several free pages in a pageblock
-			 * in this loop although we changed the pageblock type
-			 * from highatomic to ac->migratetype. So we should
-			 * adjust the count once.
+			 * It should never happen but changes to
+			 * locking could inadvertently allow a per-cpu
+			 * drain to add pages to MIGRATE_HIGHATOMIC
+			 * while unreserving so be safe and watch for
+			 * underflows.
 			 */
-			if (is_migrate_highatomic(mt)) {
-				unsigned long size;
-				/*
-				 * It should never happen but changes to
-				 * locking could inadvertently allow a per-cpu
-				 * drain to add pages to MIGRATE_HIGHATOMIC
-				 * while unreserving so be safe and watch for
-				 * underflows.
-				 */
-				size = max(pageblock_nr_pages, 1UL << order);
-				size = min(size, zone->nr_reserved_highatomic);
-				zone->nr_reserved_highatomic -= size;
-			}
+			size = max(pageblock_nr_pages, 1UL << order);
+			size = min(size, zone->nr_reserved_highatomic);
+			zone->nr_reserved_highatomic -= size;
 
 			/*
 			 * Convert to ac->migratetype and avoid the normal
@@ -2217,10 +2196,12 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * may increase.
 			 */
 			if (order < pageblock_order)
-				ret = move_freepages_block(zone, page, mt,
+				ret = move_freepages_block(zone, page,
+							   MIGRATE_HIGHATOMIC,
 							   ac->migratetype);
 			else {
-				move_to_free_list(page, zone, order, mt,
+				move_to_free_list(page, zone, order,
+						  MIGRATE_HIGHATOMIC,
 						  ac->migratetype);
 				change_pageblock_range(page, order,
 						       ac->migratetype);
@@ -2294,7 +2275,8 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 
 		page = get_page_from_free_area(area, fallback_mt);
 		page = try_to_steal_block(zone, page, current_order, order,
-					  start_migratetype, alloc_flags);
+					  start_migratetype, fallback_mt,
+					  alloc_flags);
 		if (page)
 			goto got_one;
 	}
-- 
cgit 


From a4138a2702a4428317ecdb115934554df4b788b4 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 24 Feb 2025 19:08:26 -0500
Subject: mm: page_alloc: group fallback functions together

The way the fallback rules are spread out makes them hard to follow.  Move
the functions next to each other at least.

Link: https://lkml.kernel.org/r/20250225001023.1494422-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Brendan Jackman <jackmanb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 394 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 197 insertions(+), 197 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9e6f0db6c79f..945437d7ac44 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1903,6 +1903,43 @@ static void change_pageblock_range(struct page *pageblock_page,
 	}
 }
 
+static inline bool boost_watermark(struct zone *zone)
+{
+	unsigned long max_boost;
+
+	if (!watermark_boost_factor)
+		return false;
+	/*
+	 * Don't bother in zones that are unlikely to produce results.
+	 * On small machines, including kdump capture kernels running
+	 * in a small area, boosting the watermark can cause an out of
+	 * memory situation immediately.
+	 */
+	if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
+		return false;
+
+	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+			watermark_boost_factor, 10000);
+
+	/*
+	 * high watermark may be uninitialised if fragmentation occurs
+	 * very early in boot so do not boost. We do not fall
+	 * through and boost by pageblock_nr_pages as failing
+	 * allocations that early means that reclaim is not going
+	 * to help and it may even be impossible to reclaim the
+	 * boosted watermark resulting in a hang.
+	 */
+	if (!max_boost)
+		return false;
+
+	max_boost = max(pageblock_nr_pages, max_boost);
+
+	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+		max_boost);
+
+	return true;
+}
+
 /*
  * When we are falling back to another migratetype during allocation, try to
  * steal extra free pages from the same pageblocks to satisfy further
@@ -1944,41 +1981,38 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 	return false;
 }
 
-static inline bool boost_watermark(struct zone *zone)
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If only_stealable is true, this function returns fallback_mt only if
+ * we can steal other freepages all together. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+			int migratetype, bool only_stealable, bool *can_steal)
 {
-	unsigned long max_boost;
+	int i;
+	int fallback_mt;
 
-	if (!watermark_boost_factor)
-		return false;
-	/*
-	 * Don't bother in zones that are unlikely to produce results.
-	 * On small machines, including kdump capture kernels running
-	 * in a small area, boosting the watermark can cause an out of
-	 * memory situation immediately.
-	 */
-	if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
-		return false;
+	if (area->nr_free == 0)
+		return -1;
 
-	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
-			watermark_boost_factor, 10000);
+	*can_steal = false;
+	for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
+		fallback_mt = fallbacks[migratetype][i];
+		if (free_area_empty(area, fallback_mt))
+			continue;
 
-	/*
-	 * high watermark may be uninitialised if fragmentation occurs
-	 * very early in boot so do not boost. We do not fall
-	 * through and boost by pageblock_nr_pages as failing
-	 * allocations that early means that reclaim is not going
-	 * to help and it may even be impossible to reclaim the
-	 * boosted watermark resulting in a hang.
-	 */
-	if (!max_boost)
-		return false;
+		if (can_steal_fallback(order, migratetype))
+			*can_steal = true;
 
-	max_boost = max(pageblock_nr_pages, max_boost);
+		if (!only_stealable)
+			return fallback_mt;
 
-	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
-		max_boost);
+		if (*can_steal)
+			return fallback_mt;
+	}
 
-	return true;
+	return -1;
 }
 
 /*
@@ -2054,175 +2088,6 @@ try_to_steal_block(struct zone *zone, struct page *page,
 	return NULL;
 }
 
-/*
- * Check whether there is a suitable fallback freepage with requested order.
- * If only_stealable is true, this function returns fallback_mt only if
- * we can steal other freepages all together. This would help to reduce
- * fragmentation due to mixed migratetype pages in one pageblock.
- */
-int find_suitable_fallback(struct free_area *area, unsigned int order,
-			int migratetype, bool only_stealable, bool *can_steal)
-{
-	int i;
-	int fallback_mt;
-
-	if (area->nr_free == 0)
-		return -1;
-
-	*can_steal = false;
-	for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
-		fallback_mt = fallbacks[migratetype][i];
-		if (free_area_empty(area, fallback_mt))
-			continue;
-
-		if (can_steal_fallback(order, migratetype))
-			*can_steal = true;
-
-		if (!only_stealable)
-			return fallback_mt;
-
-		if (*can_steal)
-			return fallback_mt;
-	}
-
-	return -1;
-}
-
-/*
- * Reserve the pageblock(s) surrounding an allocation request for
- * exclusive use of high-order atomic allocations if there are no
- * empty page blocks that contain a page with a suitable order
- */
-static void reserve_highatomic_pageblock(struct page *page, int order,
-					 struct zone *zone)
-{
-	int mt;
-	unsigned long max_managed, flags;
-
-	/*
-	 * The number reserved as: minimum is 1 pageblock, maximum is
-	 * roughly 1% of a zone. But if 1% of a zone falls below a
-	 * pageblock size, then don't reserve any pageblocks.
-	 * Check is race-prone but harmless.
-	 */
-	if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
-		return;
-	max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
-	if (zone->nr_reserved_highatomic >= max_managed)
-		return;
-
-	spin_lock_irqsave(&zone->lock, flags);
-
-	/* Recheck the nr_reserved_highatomic limit under the lock */
-	if (zone->nr_reserved_highatomic >= max_managed)
-		goto out_unlock;
-
-	/* Yoink! */
-	mt = get_pageblock_migratetype(page);
-	/* Only reserve normal pageblocks (i.e., they can merge with others) */
-	if (!migratetype_is_mergeable(mt))
-		goto out_unlock;
-
-	if (order < pageblock_order) {
-		if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
-			goto out_unlock;
-		zone->nr_reserved_highatomic += pageblock_nr_pages;
-	} else {
-		change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
-		zone->nr_reserved_highatomic += 1 << order;
-	}
-
-out_unlock:
-	spin_unlock_irqrestore(&zone->lock, flags);
-}
-
-/*
- * Used when an allocation is about to fail under memory pressure. This
- * potentially hurts the reliability of high-order allocations when under
- * intense memory pressure but failed atomic allocations should be easier
- * to recover from than an OOM.
- *
- * If @force is true, try to unreserve pageblocks even though highatomic
- * pageblock is exhausted.
- */
-static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
-						bool force)
-{
-	struct zonelist *zonelist = ac->zonelist;
-	unsigned long flags;
-	struct zoneref *z;
-	struct zone *zone;
-	struct page *page;
-	int order;
-	int ret;
-
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
-								ac->nodemask) {
-		/*
-		 * Preserve at least one pageblock unless memory pressure
-		 * is really high.
-		 */
-		if (!force && zone->nr_reserved_highatomic <=
-					pageblock_nr_pages)
-			continue;
-
-		spin_lock_irqsave(&zone->lock, flags);
-		for (order = 0; order < NR_PAGE_ORDERS; order++) {
-			struct free_area *area = &(zone->free_area[order]);
-			unsigned long size;
-
-			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
-			if (!page)
-				continue;
-
-			/*
-			 * It should never happen but changes to
-			 * locking could inadvertently allow a per-cpu
-			 * drain to add pages to MIGRATE_HIGHATOMIC
-			 * while unreserving so be safe and watch for
-			 * underflows.
-			 */
-			size = max(pageblock_nr_pages, 1UL << order);
-			size = min(size, zone->nr_reserved_highatomic);
-			zone->nr_reserved_highatomic -= size;
-
-			/*
-			 * Convert to ac->migratetype and avoid the normal
-			 * pageblock stealing heuristics. Minimally, the caller
-			 * is doing the work and needs the pages. More
-			 * importantly, if the block was always converted to
-			 * MIGRATE_UNMOVABLE or another type then the number
-			 * of pageblocks that cannot be completely freed
-			 * may increase.
-			 */
-			if (order < pageblock_order)
-				ret = move_freepages_block(zone, page,
-							   MIGRATE_HIGHATOMIC,
-							   ac->migratetype);
-			else {
-				move_to_free_list(page, zone, order,
-						  MIGRATE_HIGHATOMIC,
-						  ac->migratetype);
-				change_pageblock_range(page, order,
-						       ac->migratetype);
-				ret = 1;
-			}
-			/*
-			 * Reserving the block(s) already succeeded,
-			 * so this should not fail on zone boundaries.
-			 */
-			WARN_ON_ONCE(ret == -1);
-			if (ret > 0) {
-				spin_unlock_irqrestore(&zone->lock, flags);
-				return ret;
-			}
-		}
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
-
-	return false;
-}
-
 /*
  * Try finding a free buddy page on the fallback list.
  *
@@ -3143,6 +3008,141 @@ out:
 	return page;
 }
 
+/*
+ * Reserve the pageblock(s) surrounding an allocation request for
+ * exclusive use of high-order atomic allocations if there are no
+ * empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, int order,
+					 struct zone *zone)
+{
+	int mt;
+	unsigned long max_managed, flags;
+
+	/*
+	 * The number reserved as: minimum is 1 pageblock, maximum is
+	 * roughly 1% of a zone. But if 1% of a zone falls below a
+	 * pageblock size, then don't reserve any pageblocks.
+	 * Check is race-prone but harmless.
+	 */
+	if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
+		return;
+	max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
+	if (zone->nr_reserved_highatomic >= max_managed)
+		return;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	/* Recheck the nr_reserved_highatomic limit under the lock */
+	if (zone->nr_reserved_highatomic >= max_managed)
+		goto out_unlock;
+
+	/* Yoink! */
+	mt = get_pageblock_migratetype(page);
+	/* Only reserve normal pageblocks (i.e., they can merge with others) */
+	if (!migratetype_is_mergeable(mt))
+		goto out_unlock;
+
+	if (order < pageblock_order) {
+		if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
+			goto out_unlock;
+		zone->nr_reserved_highatomic += pageblock_nr_pages;
+	} else {
+		change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
+		zone->nr_reserved_highatomic += 1 << order;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve pageblocks even though highatomic
+ * pageblock is exhausted.
+ */
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+						bool force)
+{
+	struct zonelist *zonelist = ac->zonelist;
+	unsigned long flags;
+	struct zoneref *z;
+	struct zone *zone;
+	struct page *page;
+	int order;
+	int ret;
+
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
+								ac->nodemask) {
+		/*
+		 * Preserve at least one pageblock unless memory pressure
+		 * is really high.
+		 */
+		if (!force && zone->nr_reserved_highatomic <=
+					pageblock_nr_pages)
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		for (order = 0; order < NR_PAGE_ORDERS; order++) {
+			struct free_area *area = &(zone->free_area[order]);
+			unsigned long size;
+
+			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+			if (!page)
+				continue;
+
+			/*
+			 * It should never happen but changes to
+			 * locking could inadvertently allow a per-cpu
+			 * drain to add pages to MIGRATE_HIGHATOMIC
+			 * while unreserving so be safe and watch for
+			 * underflows.
+			 */
+			size = max(pageblock_nr_pages, 1UL << order);
+			size = min(size, zone->nr_reserved_highatomic);
+			zone->nr_reserved_highatomic -= size;
+
+			/*
+			 * Convert to ac->migratetype and avoid the normal
+			 * pageblock stealing heuristics. Minimally, the caller
+			 * is doing the work and needs the pages. More
+			 * importantly, if the block was always converted to
+			 * MIGRATE_UNMOVABLE or another type then the number
+			 * of pageblocks that cannot be completely freed
+			 * may increase.
+			 */
+			if (order < pageblock_order)
+				ret = move_freepages_block(zone, page,
+							   MIGRATE_HIGHATOMIC,
+							   ac->migratetype);
+			else {
+				move_to_free_list(page, zone, order,
+						  MIGRATE_HIGHATOMIC,
+						  ac->migratetype);
+				change_pageblock_range(page, order,
+						       ac->migratetype);
+				ret = 1;
+			}
+			/*
+			 * Reserving the block(s) already succeeded,
+			 * so this should not fail on zone boundaries.
+			 */
+			WARN_ON_ONCE(ret == -1);
+			if (ret > 0) {
+				spin_unlock_irqrestore(&zone->lock, flags);
+				return ret;
+			}
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	return false;
+}
+
 static inline long __zone_watermark_unusable_free(struct zone *z,
 				unsigned int order, unsigned int alloc_flags)
 {
-- 
cgit 


From ebc29409c2966bd6a6215b27fc654de7f55ce099 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <jackmanb@google.com>
Date: Tue, 25 Feb 2025 18:45:09 +0000
Subject: mm/page_alloc: warn on nr_reserved_highatomic underflow

As documented in the comment this underflow should not happen.  The
locking has indeed changed here since the comment was written, see the
migratetype hygiene patches[0].  However, those changes made the locking
_safer_, so the underflow _really_ shouldn't happen now.  So upgrade the
comment to a warning.

[0] https://lore.kernel.org/all/20240320180429.678181-7-hannes@cmpxchg.org/T/#m3da87e6cc3348a4640aa298137bc9f8f61b76c84

Link: https://lkml.kernel.org/r/20250225-warn-underflow-v1-1-3dc542941d3a@google.com
Signed-off-by: Brendan Jackman <jackmanb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 945437d7ac44..3c5624380b6c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3095,6 +3095,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			if (!page)
 				continue;
 
+			size = max(pageblock_nr_pages, 1UL << order);
 			/*
 			 * It should never happen but changes to
 			 * locking could inadvertently allow a per-cpu
@@ -3102,8 +3103,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * while unreserving so be safe and watch for
 			 * underflows.
 			 */
-			size = max(pageblock_nr_pages, 1UL << order);
-			size = min(size, zone->nr_reserved_highatomic);
+			if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic))
+				size = zone->nr_reserved_highatomic;
 			zone->nr_reserved_highatomic -= size;
 
 			/*
-- 
cgit 


From e47f1f56dd82cc6d91f5c4d914a534aa03cd12ca Mon Sep 17 00:00:00 2001
From: Brendan Jackman <jackmanb@google.com>
Date: Fri, 28 Feb 2025 09:52:17 +0000
Subject: mm/page_alloc: clarify terminology in migratetype fallback code

Patch series "mm/page_alloc: Some clarifications for migratetype
fallback", v4.

A couple of patches to try and make the code easier to follow.


This patch (of 2):

This code is rather confusing because:

 1. "Steal" is sometimes used to refer to the general concept of
    allocating from a block of a fallback migratetype
    (steal_suitable_fallback()) but sometimes it refers specifically to
    converting a whole block's migratetype (can_steal_fallback()).

 2. can_steal_fallback() sounds as though it's answering the question "am
    I functionally permitted to allocate from that other type" but in
    fact it is encoding a heuristic preference.

 3. The same piece of data has different names in different places:
    can_steal vs whole_block. This reinforces point 2 because it looks
    like the different names reflect a shift in intent from "am I
    allowed to steal" to "do I want to steal", but no such shift exists.

Fix 1. by avoiding the term "steal" in ambiguous contexts. Start using
the term "claim" to refer to the special case of stealing the entire
block.

Fix 2. by using "should" instead of "can", and also rename its
parameters and add some commentary to make it more explicit what they
mean.

Fix 3. by adopting the new "claim" terminology universally for this
set of variables.

Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-0-cb2ef1a4e610@google.com
Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-1-cb2ef1a4e610@google.com
Signed-off-by: Brendan Jackman <jackmanb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 72 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c5624380b6c..0f0ecfe82f5a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1942,22 +1942,22 @@ static inline bool boost_watermark(struct zone *zone)
 
 /*
  * When we are falling back to another migratetype during allocation, try to
- * steal extra free pages from the same pageblocks to satisfy further
- * allocations, instead of polluting multiple pageblocks.
+ * claim entire blocks to satisfy further allocations, instead of polluting
+ * multiple pageblocks.
  *
- * If we are stealing a relatively large buddy page, it is likely there will
- * be more free pages in the pageblock, so try to steal them all. For
- * reclaimable and unmovable allocations, we steal regardless of page size,
- * as fragmentation caused by those allocations polluting movable pageblocks
- * is worse than movable allocations stealing from unmovable and reclaimable
- * pageblocks.
+ * If we are stealing a relatively large buddy page, it is likely there will be
+ * more free pages in the pageblock, so try to claim the whole block. For
+ * reclaimable and unmovable allocations, we try to claim the whole block
+ * regardless of page size, as fragmentation caused by those allocations
+ * polluting movable pageblocks is worse than movable allocations stealing from
+ * unmovable and reclaimable pageblocks.
  */
-static bool can_steal_fallback(unsigned int order, int start_mt)
+static bool should_try_claim_block(unsigned int order, int start_mt)
 {
 	/*
 	 * Leaving this order check is intended, although there is
 	 * relaxed order check in next check. The reason is that
-	 * we can actually steal whole pageblock if this condition met,
+	 * we can actually claim the whole pageblock if this condition met,
 	 * but, below check doesn't guarantee it and that is just heuristic
 	 * so could be changed anytime.
 	 */
@@ -1970,7 +1970,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 	 * reclaimable pages that are closest to the request size.  After a
 	 * while, memory compaction may occur to form large contiguous pages,
 	 * and the next movable allocation may not need to steal.  Unmovable and
-	 * reclaimable allocations need to actually steal pages.
+	 * reclaimable allocations need to actually claim the whole block.
 	 */
 	if (order >= pageblock_order / 2 ||
 		start_mt == MIGRATE_RECLAIMABLE ||
@@ -1983,12 +1983,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 
 /*
  * Check whether there is a suitable fallback freepage with requested order.
- * If only_stealable is true, this function returns fallback_mt only if
- * we can steal other freepages all together. This would help to reduce
+ * Sets *claim_block to instruct the caller whether it should convert a whole
+ * pageblock to the returned migratetype.
+ * If only_claim is true, this function returns fallback_mt only if
+ * we would do this whole-block claiming. This would help to reduce
  * fragmentation due to mixed migratetype pages in one pageblock.
  */
 int find_suitable_fallback(struct free_area *area, unsigned int order,
-			int migratetype, bool only_stealable, bool *can_steal)
+			int migratetype, bool only_claim, bool *claim_block)
 {
 	int i;
 	int fallback_mt;
@@ -1996,19 +1998,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
 	if (area->nr_free == 0)
 		return -1;
 
-	*can_steal = false;
+	*claim_block = false;
 	for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
 		fallback_mt = fallbacks[migratetype][i];
 		if (free_area_empty(area, fallback_mt))
 			continue;
 
-		if (can_steal_fallback(order, migratetype))
-			*can_steal = true;
+		if (should_try_claim_block(order, migratetype))
+			*claim_block = true;
 
-		if (!only_stealable)
-			return fallback_mt;
-
-		if (*can_steal)
+		if (*claim_block || !only_claim)
 			return fallback_mt;
 	}
 
@@ -2016,14 +2015,14 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
 }
 
 /*
- * This function implements actual steal behaviour. If order is large enough, we
- * can claim the whole pageblock for the requested migratetype. If not, we check
- * the pageblock for constituent pages; if at least half of the pages are free
- * or compatible, we can still claim the whole block, so pages freed in the
- * future will be put on the correct free list.
+ * This function implements actual block claiming behaviour. If order is large
+ * enough, we can claim the whole pageblock for the requested migratetype. If
+ * not, we check the pageblock for constituent pages; if at least half of the
+ * pages are free or compatible, we can still claim the whole block, so pages
+ * freed in the future will be put on the correct free list.
  */
 static struct page *
-try_to_steal_block(struct zone *zone, struct page *page,
+try_to_claim_block(struct zone *zone, struct page *page,
 		   int current_order, int order, int start_type,
 		   int block_type, unsigned int alloc_flags)
 {
@@ -2091,11 +2090,12 @@ try_to_steal_block(struct zone *zone, struct page *page,
 /*
  * Try finding a free buddy page on the fallback list.
  *
- * This will attempt to steal a whole pageblock for the requested type
+ * This will attempt to claim a whole pageblock for the requested type
  * to ensure grouping of such requests in the future.
  *
- * If a whole block cannot be stolen, regress to __rmqueue_smallest()
- * logic to at least break up as little contiguity as possible.
+ * If a whole block cannot be claimed, steal an individual page, regressing to
+ * __rmqueue_smallest() logic to at least break up as little contiguity as
+ * possible.
  *
  * The use of signed ints for order and current_order is a deliberate
  * deviation from the rest of this file, to make the for loop
@@ -2112,7 +2112,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 	int min_order = order;
 	struct page *page;
 	int fallback_mt;
-	bool can_steal;
+	bool claim_block;
 
 	/*
 	 * Do not steal pages from freelists belonging to other pageblocks
@@ -2131,15 +2131,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 				--current_order) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
-				start_migratetype, false, &can_steal);
+				start_migratetype, false, &claim_block);
 		if (fallback_mt == -1)
 			continue;
 
-		if (!can_steal)
+		if (!claim_block)
 			break;
 
 		page = get_page_from_free_area(area, fallback_mt);
-		page = try_to_steal_block(zone, page, current_order, order,
+		page = try_to_claim_block(zone, page, current_order, order,
 					  start_migratetype, fallback_mt,
 					  alloc_flags);
 		if (page)
@@ -2149,11 +2149,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 	if (alloc_flags & ALLOC_NOFRAGMENT)
 		return NULL;
 
-	/* No luck stealing blocks. Find the smallest fallback page */
+	/* No luck claiming pageblock. Find the smallest fallback page */
 	for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
-				start_migratetype, false, &can_steal);
+				start_migratetype, false, &claim_block);
 		if (fallback_mt == -1)
 			continue;
 
-- 
cgit 


From a14efee04796dd3f614eaf5348ca1ac099c21349 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <jackmanb@google.com>
Date: Fri, 28 Feb 2025 09:52:18 +0000
Subject: mm/page_alloc: clarify should_claim_block() commentary

There's lots of text here but it's a little hard to follow, this is an
attempt to break it up and align its structure more closely with the code.

Reword the top-level function comment to just explain what question the
function answers from the point of view of the caller.

Break up the internal logic into different sections that can have their
own commentary describing why that part of the rationale is present.

Note the page_group_by_mobility_disabled logic is not explained in the
commentary, that is outside the scope of this patch...

Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-2-cb2ef1a4e610@google.com
Signed-off-by: Brendan Jackman <jackmanb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 46 ++++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0f0ecfe82f5a..57f959af79c5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1941,16 +1941,9 @@ static inline bool boost_watermark(struct zone *zone)
 }
 
 /*
- * When we are falling back to another migratetype during allocation, try to
- * claim entire blocks to satisfy further allocations, instead of polluting
- * multiple pageblocks.
- *
- * If we are stealing a relatively large buddy page, it is likely there will be
- * more free pages in the pageblock, so try to claim the whole block. For
- * reclaimable and unmovable allocations, we try to claim the whole block
- * regardless of page size, as fragmentation caused by those allocations
- * polluting movable pageblocks is worse than movable allocations stealing from
- * unmovable and reclaimable pageblocks.
+ * When we are falling back to another migratetype during allocation, should we
+ * try to claim an entire block to satisfy further allocations, instead of
+ * polluting multiple pageblocks?
  */
 static bool should_try_claim_block(unsigned int order, int start_mt)
 {
@@ -1965,19 +1958,32 @@ static bool should_try_claim_block(unsigned int order, int start_mt)
 		return true;
 
 	/*
-	 * Movable pages won't cause permanent fragmentation, so when you alloc
-	 * small pages, you just need to temporarily steal unmovable or
-	 * reclaimable pages that are closest to the request size.  After a
-	 * while, memory compaction may occur to form large contiguous pages,
-	 * and the next movable allocation may not need to steal.  Unmovable and
-	 * reclaimable allocations need to actually claim the whole block.
+	 * Above a certain threshold, always try to claim, as it's likely there
+	 * will be more free pages in the pageblock.
+	 */
+	if (order >= pageblock_order / 2)
+		return true;
+
+	/*
+	 * Unmovable/reclaimable allocations would cause permanent
+	 * fragmentations if they fell back to allocating from a movable block
+	 * (polluting it), so we try to claim the whole block regardless of the
+	 * allocation size. Later movable allocations can always steal from this
+	 * block, which is less problematic.
 	 */
-	if (order >= pageblock_order / 2 ||
-		start_mt == MIGRATE_RECLAIMABLE ||
-		start_mt == MIGRATE_UNMOVABLE ||
-		page_group_by_mobility_disabled)
+	if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE)
 		return true;
 
+	if (page_group_by_mobility_disabled)
+		return true;
+
+	/*
+	 * Movable pages won't cause permanent fragmentation, so when you alloc
+	 * small pages, we just need to temporarily steal unmovable or
+	 * reclaimable pages that are closest to the request size. After a
+	 * while, memory compaction may occur to form large contiguous pages,
+	 * and the next movable allocation may not need to steal.
+	 */
 	return false;
 }
 
-- 
cgit 


From 4996fc547f5b49f4a43c261dfadb02cf165cdb51 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 3 Mar 2025 17:29:56 +0100
Subject: mm: let _folio_nr_pages overlay memcg_data in first tail page

Let's free up some more of the "unconditionally available on 64BIT" space
in order-1 folios by letting _folio_nr_pages overlay memcg_data in the
first tail page (second folio page).  Consequently, we have the
optimization now whenever we have CONFIG_MEMCG, independent of 64BIT.

We have to make sure that page->memcg on tail pages does not return
"surprises".  page_memcg_check() already properly refuses PageTail().
Let's do that earlier in print_page_owner_memcg() to avoid printing wrong
"Slab cache page" information.  No other code should touch that field on
tail pages of compound pages.

Reset the "_nr_pages" to 0 when splitting folios, or when freeing them
back to the buddy (to avoid false page->memcg_data "bad page" reports).

Note that in __split_huge_page(), folio_nr_pages() would stop working
already as soon as we start messing with the subpages.

Most kernel configs should have at least CONFIG_MEMCG enabled, even if
disabled at runtime.  64byte "struct memmap" is what we usually have on
64BIT.

While at it, rename "_folio_nr_pages" to "_nr_pages".

Hopefully memdescs / dynamically allocating "struct folio" in the future
will further clean this up, e.g., making _nr_pages available in all
configs and maybe even in small folios.  Doing that should be fairly easy
on top of this change.

[david@redhat.com: make "make htmldoc" happy]
  Link: https://lkml.kernel.org/r/a97f8a91-ec41-4796-81e3-7c9e0e491ba4@redhat.com
Link: https://lkml.kernel.org/r/20250303163014.1128035-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andy Lutomirks^H^Hski <luto@kernel.org>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
Cc: Michal Koutn <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: tejun heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 57f959af79c5..e1135dff9a86 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1174,8 +1174,12 @@ __always_inline bool free_pages_prepare(struct page *page,
 	if (unlikely(order)) {
 		int i;
 
-		if (compound)
+		if (compound) {
 			page[1].flags &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+			folio->_nr_pages = 0;
+#endif
+		}
 		for (i = 1; i < (1 << order); i++) {
 			if (compound)
 				bad += free_tail_page_prepare(page, page + i);
-- 
cgit 


From 4eeec8c89a0c4a8c20fb13a4e7093cc8efce383d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 3 Mar 2025 17:29:57 +0100
Subject: mm: move hugetlb specific things in folio to page[3]

Let's just move the hugetlb specific stuff to a separate page, and stop
letting it overlay other fields for now.

This frees up some space in page[2], which we will use on 32bit to free up
some space in page[1].  While we could move these things to page[3]
instead, it's cleaner to just move the hugetlb specific things out of the
way and pack the core-folio stuff as tight as possible.  ...  and we can
minimize the work required in dump_folio.

We can now avoid re-initializing &folio->_deferred_list in hugetlb code.

Hopefully dynamically allocating "struct folio" in the future will further
clean this up.

Link: https://lkml.kernel.org/r/20250303163014.1128035-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Andy Lutomirks^H^Hski <luto@kernel.org>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
Cc: Michal Koutn <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: tejun heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e1135dff9a86..735192222c36 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -971,6 +971,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			goto out;
 		}
 		break;
+	case 3:
+		/* the third tail page: hugetlb specifics overlap ->mappings */
+		if (IS_ENABLED(CONFIG_HUGETLB_PAGE))
+			break;
+		fallthrough;
 	default:
 		if (page->mapping != TAIL_MAPPING) {
 			bad_page(page, "corrupted mapping in tail page");
-- 
cgit 


From 31a31da8a6187f1e5448ec73222e01d7d3fed4aa Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 3 Mar 2025 17:29:58 +0100
Subject: mm: move _pincount in folio to page[2] on 32bit

Let's free up some space on 32bit in page[1] by moving the _pincount to
page[2].

For order-1 folios (never anon folios!) on 32bit, we will now also use the
GUP_PIN_COUNTING_BIAS approach.  A fully-mapped order-1 folio requires 2
references.  With GUP_PIN_COUNTING_BIAS being 1024, we'd detect such
folios as "maybe pinned" with 512 full mappings, instead of 1024 for
order-0.  As anon folios are out of the picture (which are the most
relevant users of checking for pinnings on *mapped* pages) and we are
talking about 32bit, this is not expected to cause any trouble.

In __dump_page(), copy one additional folio page if we detect a folio with
an order > 1, so we can dump the pincount on order > 1 folios reliably.

Note that THPs on 32bit are not particularly common (and we don't care too
much about performance), but we want to keep it working reliably, because
likely we want to use large folios there as well in the future,
independent of PMD leaf support.

Once we dynamically allocate "struct folio", fortunately the 32bit
specifics will likely go away again; even small folios could then have a
pincount and folio_has_pincount() would essentially always return "true".

Link: https://lkml.kernel.org/r/20250303163014.1128035-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Andy Lutomirks^H^Hski <luto@kernel.org>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
Cc: Michal Koutn <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: tejun heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 735192222c36..2a9aa4439a66 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -959,9 +959,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			bad_page(page, "nonzero nr_pages_mapped");
 			goto out;
 		}
-		if (unlikely(atomic_read(&folio->_pincount))) {
-			bad_page(page, "nonzero pincount");
-			goto out;
+		if (IS_ENABLED(CONFIG_64BIT)) {
+			if (unlikely(atomic_read(&folio->_pincount))) {
+				bad_page(page, "nonzero pincount");
+				goto out;
+			}
 		}
 		break;
 	case 2:
@@ -970,6 +972,12 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			bad_page(page, "on deferred list");
 			goto out;
 		}
+		if (!IS_ENABLED(CONFIG_64BIT)) {
+			if (unlikely(atomic_read(&folio->_pincount))) {
+				bad_page(page, "nonzero pincount");
+				goto out;
+			}
+		}
 		break;
 	case 3:
 		/* the third tail page: hugetlb specifics overlap ->mappings */
-- 
cgit 


From 845d2be6d41f016da670dcc4c8f5357c22172be8 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 3 Mar 2025 17:29:59 +0100
Subject: mm: move _entire_mapcount in folio to page[2] on 32bit

Let's free up some space on 32bit in page[1] by moving the
_entire_mapcount to page[2].

Ordinary folios only use the entire mapcount with PMD mappings, so order-1
folios don't apply.  Similarly, hugetlb folios are always larger than
order-1, turning the entire mapcount essentially unused for all order-1
folios.  Moving it to order-1 folios will not change anything.

On 32bit, simply check in folio_entire_mapcount() whether we have an
order-1 folio, and return 0 in that case.

Note that THPs on 32bit are not particularly common (and we don't care too
much about performance), but we want to keep it working reliably, because
likely we want to use large folios there as well in the future,
independent of PMD leaf support.

Once we dynamically allocate "struct folio", the 32bit specifics will go
away again; even small folios could then have a pincount.

Link: https://lkml.kernel.org/r/20250303163014.1128035-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Andy Lutomirks^H^Hski <luto@kernel.org>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
Cc: Michal Koutn <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: tejun heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2a9aa4439a66..e456a43811fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -947,10 +947,6 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 	switch (page - head_page) {
 	case 1:
 		/* the first tail page: these may be in place of ->mapping */
-		if (unlikely(folio_entire_mapcount(folio))) {
-			bad_page(page, "nonzero entire_mapcount");
-			goto out;
-		}
 		if (unlikely(folio_large_mapcount(folio))) {
 			bad_page(page, "nonzero large_mapcount");
 			goto out;
@@ -960,6 +956,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			goto out;
 		}
 		if (IS_ENABLED(CONFIG_64BIT)) {
+			if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+				bad_page(page, "nonzero entire_mapcount");
+				goto out;
+			}
 			if (unlikely(atomic_read(&folio->_pincount))) {
 				bad_page(page, "nonzero pincount");
 				goto out;
@@ -973,6 +973,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			goto out;
 		}
 		if (!IS_ENABLED(CONFIG_64BIT)) {
+			if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+				bad_page(page, "nonzero entire_mapcount");
+				goto out;
+			}
 			if (unlikely(atomic_read(&folio->_pincount))) {
 				bad_page(page, "nonzero pincount");
 				goto out;
-- 
cgit 


From 6af8cb80d3a9a6bbd521d8a7c949b4eafb7dba5d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 3 Mar 2025 17:30:05 +0100
Subject: mm/rmap: basic MM owner tracking for large folios (!hugetlb)

For small folios, we traditionally use the mapcount to decide whether it
was "certainly mapped exclusively" by a single MM (mapcount == 1) or
whether it "maybe mapped shared" by multiple MMs (mapcount > 1).  For
PMD-sized folios that were PMD-mapped, we were able to use a similar
mechanism (single PMD mapping), but for PTE-mapped folios and in the
future folios that span multiple PMDs, this does not work.

So we need a different mechanism to handle large folios.  Let's add a new
mechanism to detect whether a large folio is "certainly mapped
exclusively", or whether it is "maybe mapped shared".

We'll use this information next to optimize CoW reuse for PTE-mapped
anonymous THP, and to convert folio_likely_mapped_shared() to
folio_maybe_mapped_shared(), independent of per-page mapcounts.

For each large folio, we'll have two slots, whereby a slot stores:
 (1) an MM id: unique id assigned to each MM
 (2) a per-MM mapcount

If a slot is unoccupied, it can be taken by the next MM that maps a folio
page.

In addition, we'll remember the current state -- "mapped exclusively" vs.
"maybe mapped shared" -- and use a bit spinlock to sync on updates and to
reduce the total number of atomic accesses on updates.  In the future, it
might be possible to squeeze a proper spinlock into "struct folio".  For
now, keep it simple, as we require the whole thing with THP only, that is
incompatible with RT.

As we have to squeeze this information into the "struct folio" of even
folios of order-1 (2 pages), and we generally want to reduce the required
metadata, we'll assign each MM a unique ID that can fit into an int.  In
total, we can squeeze everything into 4x int (2x long) on 64bit.

32bit support is a bit challenging, because we only have 2x long == 2x int
in order-1 folios.  But we can make it work for now, because we neither
expect many MMs nor very large folios on 32bit.

We will reliably detect folios as "mapped exclusively" vs.  "mapped
shared" as long as only two MMs map pages of a folio at one point in time
-- for example with fork() and short-lived child processes, or with apps
that hand over state from one instance to another.

As soon as three MMs are involved at the same time, we might detect "maybe
mapped shared" although the folio is "mapped exclusively".

Example 1:

(1) App1 faults in a (shmem/file-backed) folio page -> Tracked as MM0
(2) App2 faults in a folio page -> Tracked as MM1
(4) App1 unmaps all folio pages

 -> We will detect "mapped exclusively".

Example 2:

(1) App1 faults in a (shmem/file-backed) folio page -> Tracked as MM0
(2) App2 faults in a folio page -> Tracked as MM1
(3) App3 faults in a folio page -> No slot available, tracked as "unknown"
(4) App1 and App2 unmap all folio pages

 -> We will detect "maybe mapped shared".

Make use of __always_inline to keep possible performance degradation when
(un)mapping large folios to a minimum.

Note: by squeezing the two flags into the "unsigned long" that stores the
MM ids, we can use non-atomic __bit_spin_unlock() and non-atomic
setting/clearing of the "maybe mapped shared" bit, effectively not adding
any new atomics on the hot path when updating the large mapcount + new
metadata, which further helps reduce the runtime overhead in
micro-benchmarks.

Link: https://lkml.kernel.org/r/20250303163014.1128035-13-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Andy Lutomirks^H^Hski <luto@kernel.org>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
Cc: Michal Koutn <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: tejun heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e456a43811fd..c8daa3e64266 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -955,6 +955,16 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			bad_page(page, "nonzero nr_pages_mapped");
 			goto out;
 		}
+		if (IS_ENABLED(CONFIG_MM_ID)) {
+			if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
+				bad_page(page, "nonzero mm mapcount 0");
+				goto out;
+			}
+			if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
+				bad_page(page, "nonzero mm mapcount 1");
+				goto out;
+			}
+		}
 		if (IS_ENABLED(CONFIG_64BIT)) {
 			if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
 				bad_page(page, "nonzero entire_mapcount");
-- 
cgit 


From 749492229e3bd6222dda7267b8244135229d1fd8 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 3 Mar 2025 17:30:13 +0100
Subject: mm: stop maintaining the per-page mapcount of large folios
 (CONFIG_NO_PAGE_MAPCOUNT)

Everything is in place to stop using the per-page mapcounts in large
folios: the mapcount of tail pages will always be logically 0 (-1 value),
just like it currently is for hugetlb folios already, and the page
mapcount of the head page is either 0 (-1 value) or contains a page type
(e.g., hugetlb).

Maintaining _nr_pages_mapped without per-page mapcounts is impossible, so
that one also has to go with CONFIG_NO_PAGE_MAPCOUNT.

There are two remaining implications:

(1) Per-node, per-cgroup and per-lruvec stats of "NR_ANON_MAPPED"
    ("mapped anonymous memory") and "NR_FILE_MAPPED"
    ("mapped file memory"):

    As soon as any page of the folio is mapped -- folio_mapped() -- we
    now account the complete folio as mapped. Once the last page is
    unmapped -- !folio_mapped() -- we account the complete folio as
    unmapped.

    This implies that ...

    * "AnonPages" and "Mapped" in /proc/meminfo and
      /sys/devices/system/node/*/meminfo
    * cgroup v2: "anon" and "file_mapped" in "memory.stat" and
      "memory.numa_stat"
    * cgroup v1: "rss" and "mapped_file" in "memory.stat" and
      "memory.numa_stat"

    ... can now appear higher than before. But note that these folios do
    consume that memory, simply not all pages are actually currently
    mapped.

    It's worth noting that other accounting in the kernel (esp. cgroup
    charging on allocation) is not affected by this change.

    [why oh why is "anon" called "rss" in cgroup v1]

 (2) Detecting partial mappings

     Detecting whether anon THPs are partially mapped gets a bit more
     unreliable. As long as a single MM maps such a large folio
     ("exclusively mapped"), we can reliably detect it. Especially before
     fork() / after a short-lived child process quit, we will detect
     partial mappings reliably, which is the common case.

     In essence, if the average per-page mapcount in an anon THP is < 1,
     we know for sure that we have a partial mapping.

     However, as soon as multiple MMs are involved, we might miss detecting
     partial mappings: this might be relevant with long-lived child
     processes. If we have a fully-mapped anon folio before fork(), once
     our child processes and our parent all unmap (zap/COW) the same pages
     (but not the complete folio), we might not detect the partial mapping.
     However, once the child processes quit we would detect the partial
     mapping.

     How relevant this case is in practice remains to be seen.
     Swapout/migration will likely mitigate this.

     In the future, RMAP walkers could check for that case
     (e.g., when collecting access bits during reclaim) and simply flag
     them for deferred-splitting.

Link: https://lkml.kernel.org/r/20250303163014.1128035-21-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c8daa3e64266..2c6ae7e5aaad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -951,7 +951,8 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			bad_page(page, "nonzero large_mapcount");
 			goto out;
 		}
-		if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
+		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
+		    unlikely(atomic_read(&folio->_nr_pages_mapped))) {
 			bad_page(page, "nonzero nr_pages_mapped");
 			goto out;
 		}
-- 
cgit 


From 8c02048d1c6126527f15752a5e0849dc49cefeeb Mon Sep 17 00:00:00 2001
From: Martin Liu <liumartin@google.com>
Date: Sat, 8 Mar 2025 03:46:00 +0000
Subject: mm/page_alloc: add trace event for per-zone watermark setup

Patch series "Add tracepoints for lowmem reserves, watermarks and
totalreserve_pages", v2.

This patchset introduces tracepoints to track changes in the lowmem
reserves, watermarks and totalreserve_pages. This helps to track
the exact timing of such changes and understand their relation to
reclaim activities.

The tracepoints added are:

mm_setup_per_zone_lowmem_reserve
mm_setup_per_zone_wmarks
mm_calculate_totalreserve_pages


This patch (of 3):

This commit introduces the `mm_setup_per_zone_wmarks` trace event,
which provides detailed insights into the kernel's per-zone watermark
configuration, offering precise timing and the ability to correlate
watermark changes with specific kernel events.

While `/proc/zoneinfo` provides some information about zone watermarks,
this trace event offers:

1. The ability to link watermark changes to specific kernel events and
   logic.

2. The ability to capture rapid or short-lived changes in watermarks
   that may be missed by user-space polling

3. Diagnosing unexpected kswapd activity or excessive direct reclaim
   triggered by rapidly changing watermarks.

Link: https://lkml.kernel.org/r/20250308034606.2036033-1-liumartin@google.com
Link: https://lkml.kernel.org/r/20250308034606.2036033-2-liumartin@google.com
Signed-off-by: Martin Liu <liumartin@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Martin Liu <liumartin@google.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6ae7e5aaad..b739367434ce 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6006,6 +6006,7 @@ static void __setup_per_zone_wmarks(void)
 		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
 		zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
 		zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
+		trace_mm_setup_per_zone_wmarks(zone);
 
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
-- 
cgit 


From a293aba4a584709889f77a0ad0c45746aecf1b9f Mon Sep 17 00:00:00 2001
From: Martin Liu <liumartin@google.com>
Date: Sat, 8 Mar 2025 03:46:01 +0000
Subject: mm/page_alloc: add trace event for per-zone lowmem reserve setup

This commit introduces the `mm_setup_per_zone_lowmem_reserve` trace
event, which provides detailed insights into the kernel's per-zone lowmem
reserve configuration.

The trace event provides precise timestamps, allowing developers to

1. Correlate lowmem reserve changes with specific kernel events and
   diagnose unexpected kswapd or direct reclaim behavior triggered
   by dynamic changes in lowmem reserve.

2. Identify memory allocation failures that occur due to insufficient
   lowmem reserve, by precisely correlating allocation attempts with
   reserve adjustments.

Link: https://lkml.kernel.org/r/20250308034606.2036033-3-liumartin@google.com
Signed-off-by: Martin Liu <liumartin@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b739367434ce..a82d96cb3044 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5943,6 +5943,8 @@ static void setup_per_zone_lowmem_reserve(void)
 					zone->lowmem_reserve[j] = 0;
 				else
 					zone->lowmem_reserve[j] = managed_pages / ratio;
+				trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
+								       zone->lowmem_reserve[j]);
 			}
 		}
 	}
-- 
cgit 


From 15766485e4a51bec2dcce304c089a95550720033 Mon Sep 17 00:00:00 2001
From: Martin Liu <liumartin@google.com>
Date: Sat, 8 Mar 2025 03:46:02 +0000
Subject: mm/page_alloc: add trace event for totalreserve_pages calculation

This commit introduces a new trace event,
`mm_calculate_totalreserve_pages`, which reports the new reserve value at
the exact time when it takes effect.

The `totalreserve_pages` value represents the total amount of memory
reserved across all zones and nodes in the system.  This reserved memory
is crucial for ensuring that critical kernel operations have access to
sufficient memory, even under memory pressure.

By tracing the `totalreserve_pages` value, developers can gain insights
into how the total reserved memory changes over time.

Link: https://lkml.kernel.org/r/20250308034606.2036033-4-liumartin@google.com
Signed-off-by: Martin Liu <liumartin@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a82d96cb3044..0be1fedd1201 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5914,6 +5914,7 @@ static void calculate_totalreserve_pages(void)
 		}
 	}
 	totalreserve_pages = reserve_pages;
+	trace_mm_calculate_totalreserve_pages(totalreserve_pages);
 }
 
 /*
-- 
cgit 


From f46012c0ec9f544998b81b2e3c6c702b9277f596 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 13 Mar 2025 17:05:33 -0400
Subject: mm: page_alloc: trace type pollution from compaction capturing

When the page allocator places pages of a certain migratetype into blocks
of another type, it has lasting effects on the ability to compact and
defragment down the line.  For improving placement and compaction,
visibility into such events is crucial.

The most common case, allocator fallbacks, is already annotated, but
compaction capturing is also allowed to grab pages of a different type.
Extend the tracepoint to cover this case.

Link: https://lkml.kernel.org/r/20250313210647.1314586-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0be1fedd1201..5b92b1acda0e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -614,6 +614,10 @@ compaction_capture(struct capture_control *capc, struct page *page,
 	    capc->cc->migratetype != MIGRATE_MOVABLE)
 		return false;
 
+	if (migratetype != capc->cc->migratetype)
+		trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
+					    capc->cc->migratetype, migratetype);
+
 	capc->page = page;
 	return true;
 }
-- 
cgit 


From e3aa7df331bca08742a212764348246e8e8a874e Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 13 Mar 2025 17:05:34 -0400
Subject: mm: page_alloc: defrag_mode

The page allocator groups requests by migratetype to stave off
fragmentation.  However, in practice this is routinely defeated by the
fact that it gives up *before* invoking reclaim and compaction - which may
well produce suitable pages.  As a result, fragmentation of physical
memory is a common ongoing process in many load scenarios.

Fragmentation deteriorates compaction's ability to produce huge pages.
Depending on the lifetime of the fragmenting allocations, those effects
can be long-lasting or even permanent, requiring drastic measures like
forcible idle states or even reboots as the only reliable ways to recover
the address space for THP production.

In a kernel build test with supplemental THP pressure, the THP allocation
rate steadily declines over 15 runs:

    thp_fault_alloc
    61988
    56474
    57258
    50187
    52388
    55409
    52925
    47648
    43669
    40621
    36077
    41721
    36685
    34641
    33215

This is a hurdle in adopting THP in any environment where hosts are shared
between multiple overlapping workloads (cloud environments), and rarely
experience true idle periods.  To make THP a reliable and predictable
optimization, there needs to be a stronger guarantee to avoid such
fragmentation.

Introduce defrag_mode.  When enabled, reclaim/compaction is invoked to its
full extent *before* falling back.  Specifically, ALLOC_NOFRAGMENT is
enforced on the allocator fastpath and the reclaiming slowpath.

For now, fallbacks are permitted to avert OOMs.  There is a plan to add
defrag_mode=2 to prefer OOMs over fragmentation, but this requires
additional prep work in compaction and the reserve management to make it
ready for all possible allocation contexts.

The following test results are from a kernel build with periodic bursts of
THP allocations, over 15 runs:

                                        vanilla    defrag_mode=1
@claimer[unmovable]:                        189              103
@claimer[movable]:                           92              103
@claimer[reclaimable]:                      207               61
@pollute[unmovable from movable]:            25                0
@pollute[unmovable from reclaimable]:        28                0
@pollute[movable from unmovable]:         38835                0
@pollute[movable from reclaimable]:      147136                0
@pollute[reclaimable from unmovable]:       178                0
@pollute[reclaimable from movable]:          33                0
@steal[unmovable from movable]:              11                0
@steal[unmovable from reclaimable]:           5                0
@steal[reclaimable from unmovable]:         107                0
@steal[reclaimable from movable]:            90                0
@steal[movable from reclaimable]:           354                0
@steal[movable from unmovable]:             130                0

Both types of polluting fallbacks are eliminated in this workload.

Interestingly, whole block conversions are reduced as well.  This is
because once a block is claimed for a type, its empty space remains
available for future allocations, instead of being padded with fallbacks;
this allows the native type to group up instead of spreading out to new
blocks.  The assumption in the allocator has been that pollution from
movable allocations is less harmful than from other types, since they can
be reclaimed or migrated out should the space be needed.  However, since
fallbacks occur *before* reclaim/compaction is invoked, movable pollution
will still cause non-movable allocations to spread out and claim more
blocks.

Without fragmentation, THP rates hold steady with defrag_mode=1:

    thp_fault_alloc
    32478
    20725
    45045
    32130
    14018
    21711
    40791
    29134
    34458
    45381
    28305
    17265
    22584
    28454
    30850

While the downward trend is eliminated, the keen reader will of course
notice that the baseline rate is much smaller than the vanilla kernel's to
begin with.  This is due to deficiencies in how reclaim and compaction are
currently driven: ALLOC_NOFRAGMENT increases the extent to which smaller
allocations are competing with THPs for pageblocks, while making no effort
themselves to reclaim or compact beyond their own request size.  This
effect already exists with the current usage of ALLOC_NOFRAGMENT, but is
amplified by defrag_mode insisting on whole block stealing much more
strongly.

Subsequent patches will address defrag_mode reclaim strategy to raise the
THP success baseline above the vanilla kernel.

Link: https://lkml.kernel.org/r/20250313210647.1314586-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b92b1acda0e..f849eb7146b9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -273,6 +273,7 @@ int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
 static int watermark_boost_factor __read_mostly = 15000;
 static int watermark_scale_factor = 10;
+static int defrag_mode;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -3389,6 +3390,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
 	 */
 	alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
 
+	if (defrag_mode) {
+		alloc_flags |= ALLOC_NOFRAGMENT;
+		return alloc_flags;
+	}
+
 #ifdef CONFIG_ZONE_DMA32
 	if (!zone)
 		return alloc_flags;
@@ -3480,7 +3486,7 @@ retry:
 				continue;
 		}
 
-		if (no_fallback && nr_online_nodes > 1 &&
+		if (no_fallback && !defrag_mode && nr_online_nodes > 1 &&
 		    zone != zonelist_zone(ac->preferred_zoneref)) {
 			int local_nid;
 
@@ -3591,7 +3597,7 @@ try_this_zone:
 	 * It's possible on a UMA machine to get through all zones that are
 	 * fragmented. If avoiding fragmentation, reset and try again.
 	 */
-	if (no_fallback) {
+	if (no_fallback && !defrag_mode) {
 		alloc_flags &= ~ALLOC_NOFRAGMENT;
 		goto retry;
 	}
@@ -4128,6 +4134,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 
 	alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
 
+	if (defrag_mode)
+		alloc_flags |= ALLOC_NOFRAGMENT;
+
 	return alloc_flags;
 }
 
@@ -4510,6 +4519,11 @@ retry:
 				&compaction_retries))
 		goto retry;
 
+	/* Reclaim/compaction failed to prevent the fallback */
+	if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
+		alloc_flags &= ~ALLOC_NOFRAGMENT;
+		goto retry;
+	}
 
 	/*
 	 * Deal with possible cpuset update races or zonelist updates to avoid
@@ -6286,6 +6300,15 @@ static const struct ctl_table page_alloc_sysctl_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= SYSCTL_THREE_THOUSAND,
 	},
+	{
+		.procname	= "defrag_mode",
+		.data		= &defrag_mode,
+		.maxlen		= sizeof(defrag_mode),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{
 		.procname	= "percpu_pagelist_high_fraction",
 		.data		= &percpu_pagelist_high_fraction,
-- 
cgit 


From 101f9d666e4d730e80caabe02446e8592ac44592 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 13 Mar 2025 17:05:35 -0400
Subject: mm: page_alloc: defrag_mode kswapd/kcompactd assistance

When defrag_mode is enabled, allocation fallbacks strongly prefer whole
block conversions instead of polluting or stealing partially used blocks.
This means there is a demand for pageblocks even from sub-block requests.
Let kswapd/kcompactd help produce them.

By the time kswapd gets woken up, normal rmqueue and block conversion
fallbacks have been attempted and failed.  So always wake kswapd with the
block order; it will take care of producing a suitable compaction gap and
then chain-wake kcompactd with the block order when it's done.

                                                VANILLA        DEFRAGMODE-ASYNC
Hugealloc Time mean               52739.45 (    +0.00%)   34300.36 (   -34.96%)
Hugealloc Time stddev             56541.26 (    +0.00%)   36390.42 (   -35.64%)
Kbuild Real time                    197.47 (    +0.00%)     196.13 (    -0.67%)
Kbuild User time                   1240.49 (    +0.00%)    1234.74 (    -0.46%)
Kbuild System time                   70.08 (    +0.00%)      62.62 (   -10.50%)
THP fault alloc                   46727.07 (    +0.00%)   57054.53 (   +22.10%)
THP fault fallback                21910.60 (    +0.00%)   11581.40 (   -47.14%)
Direct compact fail                 195.80 (    +0.00%)     107.80 (   -44.72%)
Direct compact success                7.93 (    +0.00%)       4.53 (   -38.06%)
Direct compact success rate %         3.51 (    +0.00%)       3.20 (    -6.89%)
Compact daemon scanned migrate  3369601.27 (    +0.00%) 5461033.93 (   +62.07%)
Compact daemon scanned free     5075474.47 (    +0.00%) 5824897.93 (   +14.77%)
Compact direct scanned migrate   161787.27 (    +0.00%)   58336.93 (   -63.94%)
Compact direct scanned free      163467.53 (    +0.00%)   32791.87 (   -79.94%)
Compact total migrate scanned   3531388.53 (    +0.00%) 5519370.87 (   +56.29%)
Compact total free scanned      5238942.00 (    +0.00%) 5857689.80 (   +11.81%)
Alloc stall                        2371.07 (    +0.00%)    2424.60 (    +2.26%)
Pages kswapd scanned            2160926.73 (    +0.00%) 2657018.33 (   +22.96%)
Pages kswapd reclaimed           533191.07 (    +0.00%)  559583.07 (    +4.95%)
Pages direct scanned             400450.33 (    +0.00%)  722094.07 (   +80.32%)
Pages direct reclaimed            94441.73 (    +0.00%)  107257.80 (   +13.57%)
Pages total scanned             2561377.07 (    +0.00%) 3379112.40 (   +31.93%)
Pages total reclaimed            627632.80 (    +0.00%)  666840.87 (    +6.25%)
Swap out                          47959.53 (    +0.00%)   77238.20 (   +61.05%)
Swap in                            7276.00 (    +0.00%)   11712.80 (   +60.97%)
File refaults                    138043.00 (    +0.00%)  143438.80 (    +3.91%)

With this patch, defrag_mode=1 beats the vanilla kernel in THP success
rates and allocation latencies.  The trend holds over time:

  thp_fault_alloc

      VANILLA        DEFRAGMODE-ASYNC
        61988                   52066
        56474                   58844
        57258                   58233
        50187                   58476
        52388                   54516
        55409                   59938
        52925                   57204
        47648                   60238
        43669                   55733
        40621                   56211
        36077                   59861
        41721                   57771
        36685                   58579
        34641                   51868
        33215                   56280

DEFRAGMODE-ASYNC also wins on %sys as ~3/4 of the direct compaction work
is shifted to kcompactd.

Reclaim activity is higher.  Part of that is simply due to the increased
memory footprint from higher THP use.  The other aspect is that *direct*
reclaim/compaction are still going for requested orders rather than
targeting the page blocks required for fallbacks, which is less efficient
than it could be.  However, this is already a useful tradeoff to make, as
in many environments peak periods are short and retaining the ability to
produce THP through them is more important.

Link: https://lkml.kernel.org/r/20250313210647.1314586-5-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f849eb7146b9..5a2ee82f723e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4076,15 +4076,21 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
 	struct zone *zone;
 	pg_data_t *last_pgdat = NULL;
 	enum zone_type highest_zoneidx = ac->highest_zoneidx;
+	unsigned int reclaim_order;
+
+	if (defrag_mode)
+		reclaim_order = max(order, pageblock_order);
+	else
+		reclaim_order = order;
 
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
 					ac->nodemask) {
 		if (!managed_zone(zone))
 			continue;
-		if (last_pgdat != zone->zone_pgdat) {
-			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
-			last_pgdat = zone->zone_pgdat;
-		}
+		if (last_pgdat == zone->zone_pgdat)
+			continue;
+		wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx);
+		last_pgdat = zone->zone_pgdat;
 	}
 }
 
-- 
cgit 


From a211c6550efcc87aa2459ca347bda10721c7a46a Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 13 Mar 2025 17:05:36 -0400
Subject: mm: page_alloc: defrag_mode kswapd/kcompactd watermarks

The previous patch added pageblock_order reclaim to kswapd/kcompactd,
which helps, but produces only one block at a time.  Allocation stalls and
THP failure rates are still higher than they could be.

To adequately reflect ALLOC_NOFRAGMENT demand for pageblocks, change the
watermarking for kswapd & kcompactd: instead of targeting the high
watermark in order-0 pages and checking for one suitable block, simply
require that the high watermark is entirely met in pageblocks.

To this end, track the number of free pages within contiguous pageblocks,
then change pgdat_balanced() and compact_finished() to check watermarks
against this new value.

This further reduces THP latencies and allocation stalls, and improves THP
success rates against the previous patch:

                                       DEFRAGMODE-ASYNC DEFRAGMODE-ASYNC-WMARKS
Hugealloc Time mean               34300.36 (    +0.00%)   28904.00 (   -15.73%)
Hugealloc Time stddev             36390.42 (    +0.00%)   33464.37 (    -8.04%)
Kbuild Real time                    196.13 (    +0.00%)     196.59 (    +0.23%)
Kbuild User time                   1234.74 (    +0.00%)    1231.67 (    -0.25%)
Kbuild System time                   62.62 (    +0.00%)      59.10 (    -5.54%)
THP fault alloc                   57054.53 (    +0.00%)   63223.67 (   +10.81%)
THP fault fallback                11581.40 (    +0.00%)    5412.47 (   -53.26%)
Direct compact fail                 107.80 (    +0.00%)      59.07 (   -44.79%)
Direct compact success                4.53 (    +0.00%)       2.80 (   -31.33%)
Direct compact success rate %         3.20 (    +0.00%)       3.99 (   +18.66%)
Compact daemon scanned migrate  5461033.93 (    +0.00%) 2267500.33 (   -58.48%)
Compact daemon scanned free     5824897.93 (    +0.00%) 2339773.00 (   -59.83%)
Compact direct scanned migrate    58336.93 (    +0.00%)   47659.93 (   -18.30%)
Compact direct scanned free       32791.87 (    +0.00%)   40729.67 (   +24.21%)
Compact total migrate scanned   5519370.87 (    +0.00%) 2315160.27 (   -58.05%)
Compact total free scanned      5857689.80 (    +0.00%) 2380502.67 (   -59.36%)
Alloc stall                        2424.60 (    +0.00%)     638.87 (   -73.62%)
Pages kswapd scanned            2657018.33 (    +0.00%) 4002186.33 (   +50.63%)
Pages kswapd reclaimed           559583.07 (    +0.00%)  718577.80 (   +28.41%)
Pages direct scanned             722094.07 (    +0.00%)  355172.73 (   -50.81%)
Pages direct reclaimed           107257.80 (    +0.00%)   31162.80 (   -70.95%)
Pages total scanned             3379112.40 (    +0.00%) 4357359.07 (   +28.95%)
Pages total reclaimed            666840.87 (    +0.00%)  749740.60 (   +12.43%)
Swap out                          77238.20 (    +0.00%)  110084.33 (   +42.53%)
Swap in                           11712.80 (    +0.00%)   24457.00 (  +108.80%)
File refaults                    143438.80 (    +0.00%)  188226.93 (   +31.22%)

Also of note is that compaction work overall is reduced.  The reason for
this is that when free pageblocks are more readily available, allocations
are also much more likely to get physically placed in LRU order, instead
of being forced to scavenge free space here and there.  This means that
reclaim by itself has better chances of freeing up whole blocks, and the
system relies less on compaction.

Comparing all changes to the vanilla kernel:

                                                VANILLA DEFRAGMODE-ASYNC-WMARKS
Hugealloc Time mean               52739.45 (    +0.00%)   28904.00 (   -45.19%)
Hugealloc Time stddev             56541.26 (    +0.00%)   33464.37 (   -40.81%)
Kbuild Real time                    197.47 (    +0.00%)     196.59 (    -0.44%)
Kbuild User time                   1240.49 (    +0.00%)    1231.67 (    -0.71%)
Kbuild System time                   70.08 (    +0.00%)      59.10 (   -15.45%)
THP fault alloc                   46727.07 (    +0.00%)   63223.67 (   +35.30%)
THP fault fallback                21910.60 (    +0.00%)    5412.47 (   -75.29%)
Direct compact fail                 195.80 (    +0.00%)      59.07 (   -69.48%)
Direct compact success                7.93 (    +0.00%)       2.80 (   -57.46%)
Direct compact success rate %         3.51 (    +0.00%)       3.99 (   +10.49%)
Compact daemon scanned migrate  3369601.27 (    +0.00%) 2267500.33 (   -32.71%)
Compact daemon scanned free     5075474.47 (    +0.00%) 2339773.00 (   -53.90%)
Compact direct scanned migrate   161787.27 (    +0.00%)   47659.93 (   -70.54%)
Compact direct scanned free      163467.53 (    +0.00%)   40729.67 (   -75.08%)
Compact total migrate scanned   3531388.53 (    +0.00%) 2315160.27 (   -34.44%)
Compact total free scanned      5238942.00 (    +0.00%) 2380502.67 (   -54.56%)
Alloc stall                        2371.07 (    +0.00%)     638.87 (   -73.02%)
Pages kswapd scanned            2160926.73 (    +0.00%) 4002186.33 (   +85.21%)
Pages kswapd reclaimed           533191.07 (    +0.00%)  718577.80 (   +34.77%)
Pages direct scanned             400450.33 (    +0.00%)  355172.73 (   -11.31%)
Pages direct reclaimed            94441.73 (    +0.00%)   31162.80 (   -67.00%)
Pages total scanned             2561377.07 (    +0.00%) 4357359.07 (   +70.12%)
Pages total reclaimed            627632.80 (    +0.00%)  749740.60 (   +19.46%)
Swap out                          47959.53 (    +0.00%)  110084.33 (  +129.53%)
Swap in                            7276.00 (    +0.00%)   24457.00 (  +236.10%)
File refaults                    138043.00 (    +0.00%)  188226.93 (   +36.35%)

THP allocation latencies and %sys time are down dramatically.

THP allocation failures are down from nearly 50% to 8.5%.  And to recall
previous data points, the success rates are steady and reliable without
the cumulative deterioration of fragmentation events.

Compaction work is down overall.  Direct compaction work especially is
drastically reduced.  As an aside, its success rate of 4% indicates there
is room for improvement.  For now it's good to rely on it less.

Reclaim work is up overall, however direct reclaim work is down.  Part of
the increase can be attributed to a higher use of THPs, which due to
internal fragmentation increase the memory footprint.  This is not
necessarily an unexpected side-effect for users of THP.

However, taken both points together, there may well be some opportunities
for fine tuning in the reclaim/compaction coordination.

[hannes@cmpxchg.org: fix squawks from rebasing]
  Link: https://lkml.kernel.org/r/20250314210558.GD1316033@cmpxchg.org
Link: https://lkml.kernel.org/r/20250313210647.1314586-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a2ee82f723e..4337467eaf5a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -273,7 +273,7 @@ int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
 static int watermark_boost_factor __read_mostly = 15000;
 static int watermark_scale_factor = 10;
-static int defrag_mode;
+int defrag_mode;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -660,16 +660,20 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
 				      bool tail)
 {
 	struct free_area *area = &zone->free_area[order];
+	int nr_pages = 1 << order;
 
 	VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
 		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
-		     get_pageblock_migratetype(page), migratetype, 1 << order);
+		     get_pageblock_migratetype(page), migratetype, nr_pages);
 
 	if (tail)
 		list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
 	else
 		list_add(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
+
+	if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
 }
 
 /*
@@ -681,24 +685,34 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
 				     unsigned int order, int old_mt, int new_mt)
 {
 	struct free_area *area = &zone->free_area[order];
+	int nr_pages = 1 << order;
 
 	/* Free page moving can fail, so it happens before the type update */
 	VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
 		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
-		     get_pageblock_migratetype(page), old_mt, 1 << order);
+		     get_pageblock_migratetype(page), old_mt, nr_pages);
 
 	list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
 
-	account_freepages(zone, -(1 << order), old_mt);
-	account_freepages(zone, 1 << order, new_mt);
+	account_freepages(zone, -nr_pages, old_mt);
+	account_freepages(zone, nr_pages, new_mt);
+
+	if (order >= pageblock_order &&
+	    is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) {
+		if (!is_migrate_isolate(old_mt))
+			nr_pages = -nr_pages;
+		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
+	}
 }
 
 static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
 					     unsigned int order, int migratetype)
 {
+	int nr_pages = 1 << order;
+
         VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
 		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
-		     get_pageblock_migratetype(page), migratetype, 1 << order);
+		     get_pageblock_migratetype(page), migratetype, nr_pages);
 
 	/* clear reported state and update reported page count */
 	if (page_reported(page))
@@ -708,6 +722,9 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
 	__ClearPageBuddy(page);
 	set_page_private(page, 0);
 	zone->free_area[order].nr_free--;
+
+	if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
 }
 
 static inline void del_page_from_free_list(struct page *page, struct zone *zone,
-- 
cgit 


From 1506c25508acd740ced5e92c539ed3d12f622c5b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 14 Mar 2025 13:36:12 +0000
Subject: mm: simplify split_page_memcg()

The last argument to split_page_memcg() is now always 0, so remove it,
effectively reverting commit b8791381d7ed.

Link: https://lkml.kernel.org/r/20250314133617.138071-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4337467eaf5a..a6d060eea638 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2778,7 +2778,7 @@ void split_page(struct page *page, unsigned int order)
 		set_page_refcounted(page + i);
 	split_page_owner(page, order, 0);
 	pgalloc_tag_split(page_folio(page), order, 0);
-	split_page_memcg(page, order, 0);
+	split_page_memcg(page, order);
 }
 EXPORT_SYMBOL_GPL(split_page);
 
@@ -4992,7 +4992,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
 
 		split_page_owner(page, order, 0);
 		pgalloc_tag_split(page_folio(page), order, 0);
-		split_page_memcg(page, order, 0);
+		split_page_memcg(page, order);
 		while (page < --last)
 			set_page_refcounted(last);
 
-- 
cgit 


From 0a1e082b64ccce165e7307a7b49d22b2504f9d1f Mon Sep 17 00:00:00 2001
From: Liu Ye <liuye@kylinos.cn>
Date: Wed, 19 Mar 2025 17:17:26 +0800
Subject: mm/page_alloc: remove unnecessary __maybe_unused in order_to_pindex()

The `movable` variable is only used when `CONFIG_TRANSPARENT_HUGEPAGE`
is enabled, and is always used in that case, so the `__maybe_unused`
attribute is not necessary.  This patch removes the attribute and moves
the variable declaration into the `#ifdef` block for better clarity.

Link: https://lkml.kernel.org/r/20250319091726.401158-1-liuyerd@163.com
Signed-off-by: Liu Ye <liuye@kylinos.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6d060eea638..0c01998cb3a0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -509,9 +509,9 @@ out:
 
 static inline unsigned int order_to_pindex(int migratetype, int order)
 {
-	bool __maybe_unused movable;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	bool movable;
 	if (order > PAGE_ALLOC_COSTLY_ORDER) {
 		VM_BUG_ON(order != HPAGE_PMD_ORDER);
 
-- 
cgit