From 89ce924f0bd447eb52a5f224d879dbf8f09451db Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Fri, 24 Jan 2025 00:41:32 -0500
Subject: mm: memcontrol: move memsw charge callbacks to v1

The interweaving of two entirely different swap accounting strategies
has been one of the more confusing parts of the memcg code. Split out
the v1 code to clarify the implementation and a handful of callsites,
and to avoid building the v1 bits when !CONFIG_MEMCG_V1.

   text    data     bss     dec     hex filename
  39253    6446    4160   49859    c2c3 mm/memcontrol.o.old
  38877    6382    4160   49419    c10b mm/memcontrol.o

Link: https://lkml.kernel.org/r/20250124054132.45643-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Acked-by: Roman Gushchin
Acked-by: Michal Hocko
Acked-by: Balbir Singh
Acked-by: Shakeel Butt
Cc: Muchun Song
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c767d71c43d7..fc4951d23b97 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -769,7 +769,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
 		if (reclaimed && !mapping_exiting(mapping))
 			shadow = workingset_eviction(folio, target_memcg);
 		__delete_from_swap_cache(folio, swap, shadow);
-		mem_cgroup_swapout(folio, swap);
+		memcg1_swapout(folio, swap);
 		xa_unlock_irq(&mapping->i_pages);
 		put_swap_folio(folio, swap);
 	} else {
--
cgit

From 350dce38eb6e221cca16940f3bd8d9947364f1ca Mon Sep 17 00:00:00 2001
From: Hao Zhang
Date: Wed, 15 Jan 2025 09:58:29 +0800
Subject: mm/vmscan: extract calculated pressure balance as a function

Extract the pressure balance calculation into a function. This doesn't
change current behaviour.

[akpm@linux-foundation.org: 80-col wrapping]
Link: https://lkml.kernel.org/r/tencent_735DB36A2306C08B8568049E4C0B99716C07@qq.com
Signed-off-by: Hao Zhang
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 68 ++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 38 insertions(+), 30 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fc4951d23b97..bc1826020159 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2400,6 +2400,43 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
 	}
 }
 
+static inline void calculate_pressure_balance(struct scan_control *sc,
+			int swappiness, u64 *fraction, u64 *denominator)
+{
+	unsigned long anon_cost, file_cost, total_cost;
+	unsigned long ap, fp;
+
+	/*
+	 * Calculate the pressure balance between anon and file pages.
+	 *
+	 * The amount of pressure we put on each LRU is inversely
+	 * proportional to the cost of reclaiming each list, as
+	 * determined by the share of pages that are refaulting, times
+	 * the relative IO cost of bringing back a swapped out
+	 * anonymous page vs reloading a filesystem page (swappiness).
+	 *
+	 * Although we limit that influence to ensure no list gets
+	 * left behind completely: at least a third of the pressure is
+	 * applied, before swappiness.
+	 *
+	 * With swappiness at 100, anon and file have equal IO cost.
+	 */
+	total_cost = sc->anon_cost + sc->file_cost;
+	anon_cost = total_cost + sc->anon_cost;
+	file_cost = total_cost + sc->file_cost;
+	total_cost = anon_cost + file_cost;
+
+	ap = swappiness * (total_cost + 1);
+	ap /= anon_cost + 1;
+
+	fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
+	fp /= file_cost + 1;
+
+	fraction[WORKINGSET_ANON] = ap;
+	fraction[WORKINGSET_FILE] = fp;
+	*denominator = ap + fp;
+}
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned.
@@ -2412,12 +2449,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 {
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-	unsigned long anon_cost, file_cost, total_cost;
 	int swappiness = sc_swappiness(sc, memcg);
 	u64 fraction[ANON_AND_FILE];
 	u64 denominator = 0;	/* gcc */
 	enum scan_balance scan_balance;
-	unsigned long ap, fp;
 	enum lru_list lru;
 
 	/* If we have no swap space, do not bother scanning anon folios. */
@@ -2466,35 +2501,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	}
 
 	scan_balance = SCAN_FRACT;
-	/*
-	 * Calculate the pressure balance between anon and file pages.
-	 *
-	 * The amount of pressure we put on each LRU is inversely
-	 * proportional to the cost of reclaiming each list, as
-	 * determined by the share of pages that are refaulting, times
-	 * the relative IO cost of bringing back a swapped out
-	 * anonymous page vs reloading a filesystem page (swappiness).
-	 *
-	 * Although we limit that influence to ensure no list gets
-	 * left behind completely: at least a third of the pressure is
-	 * applied, before swappiness.
-	 *
-	 * With swappiness at 100, anon and file have equal IO cost.
-	 */
-	total_cost = sc->anon_cost + sc->file_cost;
-	anon_cost = total_cost + sc->anon_cost;
-	file_cost = total_cost + sc->file_cost;
-	total_cost = anon_cost + file_cost;
-
-	ap = swappiness * (total_cost + 1);
-	ap /= anon_cost + 1;
-
-	fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
-	fp /= file_cost + 1;
+	calculate_pressure_balance(sc, swappiness, fraction, &denominator);
 
-	fraction[0] = ap;
-	fraction[1] = fp;
-	denominator = ap + fp;
 out:
 	for_each_evictable_lru(lru) {
 		bool file = is_file_lru(lru);
--
cgit
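[editor's note: the arithmetic in calculate_pressure_balance() above is easy to
check by hand. Below is a small userspace sketch of it; the struct, main()
harness and sample costs are invented for illustration, while MAX_SWAPPINESS
mirrors the kernel's value of 200. Folding total_cost into each list's cost is
what guarantees the "at least a third of the pressure" floor before
swappiness is applied.]

#include <stdio.h>

#define MAX_SWAPPINESS 200

struct costs { unsigned long anon_cost, file_cost; };

static void pressure_balance(const struct costs *sc, int swappiness,
			     unsigned long long *ap, unsigned long long *fp)
{
	unsigned long anon_cost, file_cost, total_cost;

	/*
	 * Adding total_cost into each side's cost bounds either share
	 * between one third and two thirds before swappiness applies.
	 */
	total_cost = sc->anon_cost + sc->file_cost;
	anon_cost = total_cost + sc->anon_cost;
	file_cost = total_cost + sc->file_cost;
	total_cost = anon_cost + file_cost;

	/* pressure is inversely proportional to each list's cost */
	*ap = swappiness * (total_cost + 1ULL) / (anon_cost + 1);
	*fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1ULL) / (file_cost + 1);
}

int main(void)
{
	struct costs sc = { .anon_cost = 100, .file_cost = 300 };
	unsigned long long ap, fp;

	pressure_balance(&sc, 100, &ap, &fp);	/* swappiness 100: equal IO cost */
	printf("anon share: %.2f, file share: %.2f\n",
	       (double)ap / (ap + fp), (double)fp / (ap + fp));
	return 0;
}

[With these sample costs, anon refaults are cheaper than file refaults, so the
anon list receives the larger share of scan pressure, as expected from the
inverse proportionality described in the comment.]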
From 6e80c0aaad469e0a923ea0d7018fb1464e992018 Mon Sep 17 00:00:00 2001
From: Bertrand Wlodarczyk
Date: Mon, 10 Feb 2025 17:07:49 +0100
Subject: vmscan, cleanup: add for_each_managed_zone_pgdat macro

The macro is introduced to eliminate redundancy in the repeated
iteration over managed zones in the pgdat data structure, reducing the
potential for errors.

This change doesn't introduce any functional modifications.

Due to the concentration of the pattern in vmscan.c, the macro is placed
locally in that file.
Link: https://lkml.kernel.org/r/20250210160818.686-1-bertrand.wlodarczyk@intel.com
Signed-off-by: Bertrand Wlodarczyk
Reviewed-by: Tim Chen
Cc: Andy Whitcroft
Cc: Dave Hansen
Cc: Dwaipayan Ray
Cc: Joe Perches
Cc: Lukas Bulwahn
Cc: Michal Hocko
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 83 ++++++++++++++++++++++++------------------------------------
 1 file changed, 32 insertions(+), 51 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc1826020159..fcca38bc640f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -271,6 +271,25 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
 }
 #endif
 
+/*
+ * for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
+	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
+	    (idx) <= (highidx);					\
+	    (idx)++, (zone)++)					\
+		if (!managed_zone(zone))			\
+			continue;				\
+		else
+
 static void set_task_reclaim_state(struct task_struct *task,
 				   struct reclaim_state *rs)
 {
@@ -396,13 +415,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
 {
 	unsigned long size = 0;
 	int zid;
+	struct zone *zone;
 
-	for (zid = 0; zid <= zone_idx; zid++) {
-		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
-
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
 		if (!mem_cgroup_disabled())
 			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
 		else
@@ -495,7 +510,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
 {
 	int reclaimable = 0, write_pending = 0;
 	int i;
-
+	struct zone *zone;
 	/*
 	 * If kswapd is disabled, reschedule if necessary but do not
 	 * throttle as the system is likely near OOM.
 	 * throttle as throttling will occur when the folios cycle
 	 * towards the end of the LRU if still under writeback.
 	 */
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
 		reclaimable += zone_reclaimable_pages(zone);
 		write_pending += zone_page_state_snapshot(zone,
 						  NR_ZONE_WRITE_PENDING);
@@ -2372,17 +2382,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
 	unsigned long total_high_wmark = 0;
 	unsigned long free, anon;
 	int z;
+	struct zone *zone;
 
 	free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
 	file = node_page_state(pgdat, NR_ACTIVE_FILE) +
 	       node_page_state(pgdat, NR_INACTIVE_FILE);
 
-	for (z = 0; z < MAX_NR_ZONES; z++) {
-		struct zone *zone = &pgdat->node_zones[z];
-
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
 		total_high_wmark += high_wmark_pages(zone);
 	}
 
@@ -5851,6 +5857,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 	unsigned long pages_for_compaction;
 	unsigned long inactive_lru_pages;
 	int z;
+	struct zone *zone;
 
 	/* If not in reclaim/compaction mode, stop */
 	if (!in_reclaim_compaction(sc))
@@ -5870,11 +5877,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 		return false;
 
 	/* If compaction would go ahead or the allocation would succeed, stop */
-	for (z = 0; z <= sc->reclaim_idx; z++) {
-		struct zone *zone = &pgdat->node_zones[z];
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
 		/* Allocation can already succeed, nothing to do */
 		if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
 				      sc->reclaim_idx, 0))
@@ -6401,11 +6404,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
 		return true;
 
-	for (i = 0; i <= ZONE_NORMAL; i++) {
-		zone = &pgdat->node_zones[i];
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
 		if (!zone_reclaimable_pages(zone))
 			continue;
 
@@ -6710,12 +6709,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
 	 * Check watermarks bottom-up as lower zones are more likely to
 	 * meet watermarks.
 	 */
-	for (i = 0; i <= highest_zoneidx; i++) {
-		zone = pgdat->node_zones + i;
-
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
 		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
 			mark = promo_wmark_pages(zone);
 		else
@@ -6800,11 +6794,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
 
 	/* Reclaim a number of pages proportional to the number of zones */
 	sc->nr_to_reclaim = 0;
-	for (z = 0; z <= sc->reclaim_idx; z++) {
-		zone = pgdat->node_zones + z;
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
 		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
 	}
 
@@ -6835,12 +6825,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
 	int i;
 	struct zone *zone;
 
-	for (i = 0; i <= highest_zoneidx; i++) {
-		zone = pgdat->node_zones + i;
-
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
 		if (active)
 			set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
 		else
@@ -6901,11 +6886,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 	 * stall or direct reclaim until kswapd is finished.
 	 */
 	nr_boost_reclaim = 0;
-	for (i = 0; i <= highest_zoneidx; i++) {
-		zone = pgdat->node_zones + i;
-		if (!managed_zone(zone))
-			continue;
-
+	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
 		nr_boost_reclaim += zone->watermark_boost;
 		zone_boosts[i] = zone->watermark_boost;
 	}
--
cgit
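[editor's note: the for/if/else construction in the macro above is a standard
trick for writing a filtering iterator that still accepts a braced body:
non-matching elements hit "continue", matching ones fall through the trailing
"else" into the caller's body. The "else" also prevents an "else" written by
the caller from pairing with the macro's hidden "if". A userspace analogue,
with an int array and is_even() as invented stand-ins for node_zones and
managed_zone():]

#include <stdio.h>

static int is_even(const int *v) { return (*v % 2) == 0; }

#define for_each_even(p, arr, idx, highidx)		\
	for ((idx) = 0, (p) = (arr);			\
	     (idx) <= (highidx);			\
	     (idx)++, (p)++)				\
		if (!is_even(p))			\
			continue;			\
		else

int main(void)
{
	int vals[] = { 2, 3, 4, 5, 8 };
	int *p;
	int i;

	for_each_even(p, vals, i, 4)
		printf("%d\n", *p);	/* prints 2, 4, 8 */
	return 0;
}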
From b487a2da3575b6cdfb6d6559311830c8fea70bb9 Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Fri, 14 Mar 2025 00:59:35 +0800
Subject: mm, swap: simplify folio swap allocation

With slot cache gone, clean up the allocation helpers even more.
folio_alloc_swap will be the only entry for allocation and adding the
folio to swap cache (except suspend), making it the opposite of
folio_free_swap.

Link: https://lkml.kernel.org/r/20250313165935.63303-8-ryncsn@gmail.com
Signed-off-by: Kairui Song
Cc: Baolin Wang
Cc: Baoquan He
Cc: Barry Song
Cc: Chris Li
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Johannes Weiner
Cc: Kalesh Singh
Cc: Matthew Wilcox (Oracle)
Cc: Nhat Pham
Cc: Yosry Ahmed
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fcca38bc640f..be00af3763b5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1289,7 +1289,7 @@ retry:
 					    split_folio_to_list(folio, folio_list))
 						goto activate_locked;
 				}
-				if (!add_to_swap(folio)) {
+				if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
 					int __maybe_unused order = folio_order(folio);
 
 					if (!folio_test_large(folio))
@@ -1305,9 +1305,21 @@ retry:
 					}
 #endif
 					count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
-					if (!add_to_swap(folio))
+					if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
 						goto activate_locked_split;
 				}
+				/*
+				 * Normally the folio will be dirtied in unmap because its
+				 * pte should be dirty. A special case is MADV_FREE page. The
+				 * page's pte could have dirty bit cleared but the folio's
+				 * SwapBacked flag is still set because clearing the dirty bit
+				 * and SwapBacked flag has no lock protected. For such folio,
+				 * unmap will not set dirty bit for it, so folio reclaim will
+				 * not write the folio out. This can cause data corruption when
+				 * the folio is swapped in later. Always setting the dirty flag
+				 * for the folio solves the problem.
+				 */
+				folio_mark_dirty(folio);
 			}
 		}
--
cgit
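[editor's note: the fallback in the hunk above - try to get swap space for the
whole large folio, and only on failure split it and retry a single base page -
can be modeled in a few lines. Everything below is an invented stand-in
(reserve() for folio_alloc_swap(), swap_out() for the reclaim path); it
sketches the control flow only, not the kernel's data structures:]

#include <stdio.h>

static int slots_free = 3;	/* pretend swap device with 3 free slots */

static int reserve(int nr_pages)
{
	if (nr_pages > slots_free)
		return -1;	/* like folio_alloc_swap(): nonzero on failure */
	slots_free -= nr_pages;
	return 0;
}

static int swap_out(int nr_pages)
{
	if (reserve(nr_pages)) {
		if (nr_pages == 1)
			return -1;	/* order-0 failed: keep the page */
		printf("no room for %d pages: split, retry order-0\n", nr_pages);
		return swap_out(1);	/* fallback after the split */
	}
	printf("reserved %d page(s)\n", nr_pages);
	return 0;
}

int main(void)
{
	return swap_out(4);	/* too big for the device: splits, then succeeds */
}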
From c0ebbb3841e07c4493e6fe351698806b09a87a37 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers
Date: Wed, 12 Mar 2025 10:10:13 -0400
Subject: mm: add missing release barrier on PGDAT_RECLAIM_LOCKED unlock

The PGDAT_RECLAIM_LOCKED bit is used to provide mutual exclusion of
node reclaim for struct pglist_data using a single bit.

It is "locked" with a test_and_set_bit (similar to a trylock) which
provides full ordering with respect to loads and stores done within
__node_reclaim().

It is "unlocked" with clear_bit(), which does not provide any ordering
with respect to loads and stores done before clearing the bit.

The lack of clear_bit() memory ordering with respect to stores within
__node_reclaim() can cause a subsequent CPU to fail to observe stores
from a prior node reclaim. This is not an issue in practice on TSO
(e.g. x86), but it is an issue on weakly-ordered architectures
(e.g. arm64).

Fix this by using clear_bit_unlock rather than clear_bit to clear
PGDAT_RECLAIM_LOCKED with a release memory ordering semantic.

This provides stronger memory ordering (release rather than relaxed).

Link: https://lkml.kernel.org/r/20250312141014.129725-1-mathieu.desnoyers@efficios.com
Fixes: d773ed6b856a ("mm: test and set zone reclaim lock before starting reclaim")
Signed-off-by: Mathieu Desnoyers
Cc: Lorenzo Stoakes
Cc: Matthew Wilcox
Cc: Alan Stern
Cc: Andrea Parri
Cc: Will Deacon
Cc: Peter Zijlstra
Cc: Boqun Feng
Cc: Nicholas Piggin
Cc: David Howells
Cc: Jade Alglave
Cc: Luc Maranget
Cc: "Paul E. McKenney"
Cc:
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index be00af3763b5..bbd3913e3887 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7581,7 +7581,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 		return NODE_RECLAIM_NOSCAN;
 
 	ret = __node_reclaim(pgdat, gfp_mask, order);
-	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+	clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
 
 	if (ret)
 		count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
--
cgit

From ca868cd77063ee670ade6d5d1554e3f5f223afd7 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers
Date: Wed, 12 Mar 2025 10:10:14 -0400
Subject: mm: lock PGDAT_RECLAIM_LOCKED with acquire memory ordering

The PGDAT_RECLAIM_LOCKED bit is used to provide mutual exclusion of
node reclaim for struct pglist_data using a single bit.

Use test_and_set_bit_lock rather than test_and_set_bit to test-and-set
PGDAT_RECLAIM_LOCKED with an acquire memory ordering semantic.

This changes the "lock" acquisition from a full barrier to an acquire
memory ordering, which is weaker. The acquire semi-permeable barrier
paired with the release on unlock is sufficient for this mutual
exclusion use-case.

No behavior change intended other than to reduce overhead by using the
appropriate barrier.

Link: https://lkml.kernel.org/r/20250312141014.129725-2-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers
Cc: Lorenzo Stoakes
Cc: Matthew Wilcox
Cc: Alan Stern
Cc: Andrea Parri
Cc: Will Deacon
Cc: Peter Zijlstra
Cc: Boqun Feng
Cc: Nicholas Piggin
Cc: David Howells
Cc: Jade Alglave
Cc: Luc Maranget
Cc: "Paul E. McKenney"
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index bbd3913e3887..2bc740637a6c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7577,7 +7577,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 	if (node_state(pgdat->node_id, N_CPU) &&
 	    pgdat->node_id != numa_node_id())
 		return NODE_RECLAIM_NOSCAN;
 
-	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+	if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
 		return NODE_RECLAIM_NOSCAN;
 
 	ret = __node_reclaim(pgdat, gfp_mask, order);
--
cgit
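[editor's note: taken together, this patch and the previous one turn
PGDAT_RECLAIM_LOCKED into a conventional acquire/release bit lock. Below is a
userspace sketch of the resulting protocol using C11 atomics; the flag word,
bit and helpers are invented stand-ins modeling (not reproducing) the kernel's
test_and_set_bit_lock()/clear_bit_unlock() bitops:]

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_LOCKED	(1UL << 0)

static _Atomic unsigned long node_flags;
static unsigned long reclaim_state;	/* stores the bit must publish */

static bool trylock_reclaim(void)
{
	/* models test_and_set_bit_lock(): acquire semantics on success */
	return !(atomic_fetch_or_explicit(&node_flags, RECLAIM_LOCKED,
					  memory_order_acquire) & RECLAIM_LOCKED);
}

static void unlock_reclaim(void)
{
	/*
	 * models clear_bit_unlock(): release semantics, so stores made
	 * while holding the bit are visible to the next acquirer
	 */
	atomic_fetch_and_explicit(&node_flags, ~RECLAIM_LOCKED,
				  memory_order_release);
}

int main(void)
{
	if (trylock_reclaim()) {
		reclaim_state++;	/* work inside the critical section */
		unlock_reclaim();
	}
	printf("reclaim_state = %lu\n", reclaim_state);
	return 0;
}

[The acquire/release pair is exactly what mutual exclusion needs here; a full
barrier on the lock side, as before this patch, was stronger than required.]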
From 67914ac08604345f620566ccf5bac87b40d5881d Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 13 Mar 2025 17:05:32 -0400
Subject: mm: compaction: push watermark into compaction_suitable() callers

Patch series "mm: reliable huge page allocator".

This series makes changes to the allocator and reclaim/compaction code
to try harder to avoid fragmentation. As a result, this makes huge page
allocations cheaper, more reliable and more sustainable.

It's a subset of the huge page allocator RFC initially proposed here:

  https://lore.kernel.org/lkml/20230418191313.268131-1-hannes@cmpxchg.org/

The following results are from a kernel build test, with additional
concurrent bursts of THP allocations on a memory-constrained system.
Comparing before and after the changes over 15 runs:

                                          before                 after
Hugealloc Time mean              52739.45 ( +0.00%)   28904.00 ( -45.19%)
Hugealloc Time stddev            56541.26 ( +0.00%)   33464.37 ( -40.81%)
Kbuild Real time                   197.47 ( +0.00%)     196.59 (  -0.44%)
Kbuild User time                  1240.49 ( +0.00%)    1231.67 (  -0.71%)
Kbuild System time                  70.08 ( +0.00%)      59.10 ( -15.45%)
THP fault alloc                  46727.07 ( +0.00%)   63223.67 ( +35.30%)
THP fault fallback               21910.60 ( +0.00%)    5412.47 ( -75.29%)
Direct compact fail                195.80 ( +0.00%)      59.07 ( -69.48%)
Direct compact success               7.93 ( +0.00%)       2.80 ( -57.46%)
Direct compact success rate %        3.51 ( +0.00%)       3.99 ( +10.49%)
Compact daemon scanned migrate 3369601.27 ( +0.00%) 2267500.33 ( -32.71%)
Compact daemon scanned free    5075474.47 ( +0.00%) 2339773.00 ( -53.90%)
Compact direct scanned migrate  161787.27 ( +0.00%)   47659.93 ( -70.54%)
Compact direct scanned free     163467.53 ( +0.00%)   40729.67 ( -75.08%)
Compact total migrate scanned  3531388.53 ( +0.00%) 2315160.27 ( -34.44%)
Compact total free scanned     5238942.00 ( +0.00%) 2380502.67 ( -54.56%)
Alloc stall                       2371.07 ( +0.00%)     638.87 ( -73.02%)
Pages kswapd scanned           2160926.73 ( +0.00%) 4002186.33 ( +85.21%)
Pages kswapd reclaimed          533191.07 ( +0.00%)  718577.80 ( +34.77%)
Pages direct scanned            400450.33 ( +0.00%)  355172.73 ( -11.31%)
Pages direct reclaimed           94441.73 ( +0.00%)   31162.80 ( -67.00%)
Pages total scanned            2561377.07 ( +0.00%) 4357359.07 ( +70.12%)
Pages total reclaimed           627632.80 ( +0.00%)  749740.60 ( +19.46%)
Swap out                         47959.53 ( +0.00%)  110084.33 ( +129.53%)
Swap in                           7276.00 ( +0.00%)   24457.00 ( +236.10%)
File refaults                   138043.00 ( +0.00%)  188226.93 ( +36.35%)

THP latencies are cut in half, and failure rates are cut by 75%. These
metrics also hold up over time, while the vanilla kernel sees a steady
downward trend in success rates with each subsequent run, owed to the
cumulative effects of fragmentation.

A more detailed discussion of results is in the patch changelogs.

The patches first introduce a vm.defrag_mode sysctl, which enforces the
existing ALLOC_NOFRAGMENT alloc flag until after reclaim and compaction
have run. They then change kswapd and kcompactd to target pageblocks,
which boosts success in the ALLOC_NOFRAGMENT hotpaths.

Patches #1 and #2 are somewhat unrelated cleanups, but touch the same
code and so are included here to avoid conflicts from re-ordering.

This patch (of 5):

compaction_suitable() hardcodes the min watermark, with a boost to the
low watermark for costly orders. However, compaction_ready() requires
order-0 at the high watermark. It currently checks the marks twice.

Make the watermark a parameter to compaction_suitable() and have the
callers pass in what they require:

- compaction_zonelist_suitable() is used by the direct reclaim path,
  so use the min watermark.

- compaction_suit_allocation_order() has a watermark in context derived
  from cc->alloc_flags. The only quirk is that kcompactd doesn't
  initialize cc->alloc_flags explicitly. There is a direct check in
  kcompactd_do_work() that passes ALLOC_WMARK_MIN, but there is another
  check downstack in compact_zone() that ends up passing the unset
  alloc_flags. Since they default to 0, and that coincides with
  ALLOC_WMARK_MIN, it is correct. But it's subtle. Set cc->alloc_flags
  explicitly.

- should_continue_reclaim() is direct reclaim, use the min watermark.

- Finally, consolidate the two checks in compaction_ready() to a single
  compaction_suitable() call passing the high watermark.
There is a tiny change in behavior: before, compaction_suitable() would
check order-0 against min or low, depending on costly order. Then
there'd be another high watermark check.

Now, the high watermark is passed to compaction_suitable(), and the
costly order-boost (low - min) is added on top. This means
compaction_ready() sets a marginally higher target for free pages.

In a kernelbuild + THP pressure test, though, this didn't show any
measurable negative effects on memory pressure or reclaim rates. As the
comment above the check says, reclaim is usually stopped short on
should_continue_reclaim(), and this just defines the worst-case reclaim
cutoff in case compaction is not making any headway.

[hughd@google.com: stop oops on out-of-range highest_zoneidx]
  Link: https://lkml.kernel.org/r/005ace8b-07fa-01d4-b54b-394a3e029c07@google.com
Link: https://lkml.kernel.org/r/20250313210647.1314586-1-hannes@cmpxchg.org
Link: https://lkml.kernel.org/r/20250313210647.1314586-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Signed-off-by: Hugh Dickins
Acked-by: Zi Yan
Cc: Mel Gorman
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2bc740637a6c..3370bdca6868 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5890,12 +5890,15 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 
 	/* If compaction would go ahead or the allocation would succeed, stop */
 	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
+		unsigned long watermark = min_wmark_pages(zone);
+
 		/* Allocation can already succeed, nothing to do */
-		if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+		if (zone_watermark_ok(zone, sc->order, watermark,
 				      sc->reclaim_idx, 0))
 			return false;
 
-		if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+		if (compaction_suitable(zone, sc->order, watermark,
+					sc->reclaim_idx))
 			return false;
 	}
 
@@ -6122,22 +6125,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 				  sc->reclaim_idx, 0))
 		return true;
 
-	/* Compaction cannot yet proceed. Do reclaim. */
-	if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
-		return false;
-
 	/*
-	 * Compaction is already possible, but it takes time to run and there
-	 * are potentially other callers using the pages just freed. So proceed
-	 * with reclaim to make a buffer of free pages available to give
-	 * compaction a reasonable chance of completing and allocating the page.
+	 * Direct reclaim usually targets the min watermark, but compaction
+	 * takes time to run and there are potentially other callers using the
+	 * pages just freed. So target a higher buffer to give compaction a
+	 * reasonable chance of completing and allocating the pages.
+	 *
 	 * Note that we won't actually reclaim the whole buffer in one attempt
 	 * as the target watermark in should_continue_reclaim() is lower. But if
 	 * we are already above the high+gap watermark, don't reclaim at all.
 	 */
-	watermark = high_wmark_pages(zone) + compact_gap(sc->order);
+	watermark = high_wmark_pages(zone);
+	if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx))
+		return true;
 
-	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
+	return false;
 }
--
cgit
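[editor's note: the shape of the refactor, compressed to a sketch with the
watermark as a caller-supplied argument. The free-page comparison below is a
deliberate simplification of what compaction_suitable() really checks, and
compact_gap() assumes the kernel's 2UL << order definition; the zone values
and struct are made up:]

#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	unsigned long free, min_wmark, high_wmark;
};

/* headroom compaction needs to migrate into; 2UL << order in the kernel */
static unsigned long compact_gap(int order)
{
	return 2UL << order;
}

/* the watermark is now the caller's choice instead of being hardcoded */
static bool compaction_suitable_model(const struct zone_model *z, int order,
				      unsigned long watermark)
{
	return z->free >= watermark + compact_gap(order);
}

int main(void)
{
	struct zone_model z = { .free = 600, .min_wmark = 128, .high_wmark = 580 };

	/* should_continue_reclaim(): direct reclaim targets the min mark */
	printf("direct reclaim can stop: %d\n",
	       compaction_suitable_model(&z, 4, z.min_wmark));

	/* compaction_ready(): skip reclaim only above the high mark + gap */
	printf("skip reclaim entirely:  %d\n",
	       compaction_suitable_model(&z, 4, z.high_wmark));
	return 0;
}

[With these numbers the zone has room for compaction at the min mark but not
yet at the high mark, so direct reclaim stops early while compaction_ready()
still asks for a little more reclaim - the asymmetry the patch makes explicit.]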
From a211c6550efcc87aa2459ca347bda10721c7a46a Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 13 Mar 2025 17:05:36 -0400
Subject: mm: page_alloc: defrag_mode kswapd/kcompactd watermarks

The previous patch added pageblock_order reclaim to kswapd/kcompactd,
which helps, but produces only one block at a time. Allocation stalls
and THP failure rates are still higher than they could be.

To adequately reflect ALLOC_NOFRAGMENT demand for pageblocks, change
the watermarking for kswapd & kcompactd: instead of targeting the high
watermark in order-0 pages and checking for one suitable block, simply
require that the high watermark is entirely met in pageblocks.

To this end, track the number of free pages within contiguous
pageblocks, then change pgdat_balanced() and compact_finished() to
check watermarks against this new value.

This further reduces THP latencies and allocation stalls, and improves
THP success rates against the previous patch:

                              DEFRAGMODE-ASYNC    DEFRAGMODE-ASYNC-WMARKS
Hugealloc Time mean              34300.36 ( +0.00%)   28904.00 ( -15.73%)
Hugealloc Time stddev            36390.42 ( +0.00%)   33464.37 (  -8.04%)
Kbuild Real time                   196.13 ( +0.00%)     196.59 (  +0.23%)
Kbuild User time                  1234.74 ( +0.00%)    1231.67 (  -0.25%)
Kbuild System time                  62.62 ( +0.00%)      59.10 (  -5.54%)
THP fault alloc                  57054.53 ( +0.00%)   63223.67 ( +10.81%)
THP fault fallback               11581.40 ( +0.00%)    5412.47 ( -53.26%)
Direct compact fail                107.80 ( +0.00%)      59.07 ( -44.79%)
Direct compact success               4.53 ( +0.00%)       2.80 ( -31.33%)
Direct compact success rate %        3.20 ( +0.00%)       3.99 ( +18.66%)
Compact daemon scanned migrate 5461033.93 ( +0.00%) 2267500.33 ( -58.48%)
Compact daemon scanned free    5824897.93 ( +0.00%) 2339773.00 ( -59.83%)
Compact direct scanned migrate   58336.93 ( +0.00%)   47659.93 ( -18.30%)
Compact direct scanned free      32791.87 ( +0.00%)   40729.67 ( +24.21%)
Compact total migrate scanned  5519370.87 ( +0.00%) 2315160.27 ( -58.05%)
Compact total free scanned     5857689.80 ( +0.00%) 2380502.67 ( -59.36%)
Alloc stall                       2424.60 ( +0.00%)     638.87 ( -73.62%)
Pages kswapd scanned           2657018.33 ( +0.00%) 4002186.33 ( +50.63%)
Pages kswapd reclaimed          559583.07 ( +0.00%)  718577.80 ( +28.41%)
Pages direct scanned            722094.07 ( +0.00%)  355172.73 ( -50.81%)
Pages direct reclaimed          107257.80 ( +0.00%)   31162.80 ( -70.95%)
Pages total scanned            3379112.40 ( +0.00%) 4357359.07 ( +28.95%)
Pages total reclaimed           666840.87 ( +0.00%)  749740.60 ( +12.43%)
Swap out                         77238.20 ( +0.00%)  110084.33 ( +42.53%)
Swap in                          11712.80 ( +0.00%)   24457.00 ( +108.80%)
File refaults                   143438.80 ( +0.00%)  188226.93 ( +31.22%)

Also of note is that compaction work overall is reduced. The reason for
this is that when free pageblocks are more readily available,
allocations are also much more likely to get physically placed in LRU
order, instead of being forced to scavenge free space here and there.
This means that reclaim by itself has better chances of freeing up
whole blocks, and the system relies less on compaction.
Comparing all changes to the vanilla kernel:

                                      VANILLA    DEFRAGMODE-ASYNC-WMARKS
Hugealloc Time mean              52739.45 ( +0.00%)   28904.00 ( -45.19%)
Hugealloc Time stddev            56541.26 ( +0.00%)   33464.37 ( -40.81%)
Kbuild Real time                   197.47 ( +0.00%)     196.59 (  -0.44%)
Kbuild User time                  1240.49 ( +0.00%)    1231.67 (  -0.71%)
Kbuild System time                  70.08 ( +0.00%)      59.10 ( -15.45%)
THP fault alloc                  46727.07 ( +0.00%)   63223.67 ( +35.30%)
THP fault fallback               21910.60 ( +0.00%)    5412.47 ( -75.29%)
Direct compact fail                195.80 ( +0.00%)      59.07 ( -69.48%)
Direct compact success               7.93 ( +0.00%)       2.80 ( -57.46%)
Direct compact success rate %        3.51 ( +0.00%)       3.99 ( +10.49%)
Compact daemon scanned migrate 3369601.27 ( +0.00%) 2267500.33 ( -32.71%)
Compact daemon scanned free    5075474.47 ( +0.00%) 2339773.00 ( -53.90%)
Compact direct scanned migrate  161787.27 ( +0.00%)   47659.93 ( -70.54%)
Compact direct scanned free     163467.53 ( +0.00%)   40729.67 ( -75.08%)
Compact total migrate scanned  3531388.53 ( +0.00%) 2315160.27 ( -34.44%)
Compact total free scanned     5238942.00 ( +0.00%) 2380502.67 ( -54.56%)
Alloc stall                       2371.07 ( +0.00%)     638.87 ( -73.02%)
Pages kswapd scanned           2160926.73 ( +0.00%) 4002186.33 ( +85.21%)
Pages kswapd reclaimed          533191.07 ( +0.00%)  718577.80 ( +34.77%)
Pages direct scanned            400450.33 ( +0.00%)  355172.73 ( -11.31%)
Pages direct reclaimed           94441.73 ( +0.00%)   31162.80 ( -67.00%)
Pages total scanned            2561377.07 ( +0.00%) 4357359.07 ( +70.12%)
Pages total reclaimed           627632.80 ( +0.00%)  749740.60 ( +19.46%)
Swap out                         47959.53 ( +0.00%)  110084.33 ( +129.53%)
Swap in                           7276.00 ( +0.00%)   24457.00 ( +236.10%)
File refaults                   138043.00 ( +0.00%)  188226.93 ( +36.35%)

THP allocation latencies and %sys time are down dramatically. THP
allocation failures are down from nearly 50% to 8.5%. And to recall
previous data points, the success rates are steady and reliable without
the cumulative deterioration of fragmentation events.

Compaction work is down overall. Direct compaction work especially is
drastically reduced. As an aside, its success rate of 4% indicates
there is room for improvement. For now it's good to rely on it less.

Reclaim work is up overall; however, direct reclaim work is down. Part
of the increase can be attributed to a higher use of THPs, which due to
internal fragmentation increase the memory footprint. This is not
necessarily an unexpected side-effect for users of THP. However, taking
both points together, there may well be some opportunities for fine
tuning in the reclaim/compaction coordination.

[hannes@cmpxchg.org: fix squawks from rebasing]
  Link: https://lkml.kernel.org/r/20250314210558.GD1316033@cmpxchg.org
Link: https://lkml.kernel.org/r/20250313210647.1314586-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Cc: Mel Gorman
Cc: Vlastimil Babka
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3370bdca6868..b5c7dfc2b189 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6724,11 +6724,24 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
 	 * meet watermarks.
 	 */
 	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+		unsigned long free_pages;
+
 		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
 			mark = promo_wmark_pages(zone);
 		else
 			mark = high_wmark_pages(zone);
-		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
+
+		/*
+		 * In defrag_mode, watermarks must be met in whole
+		 * blocks to avoid polluting allocator fallbacks.
+		 */
+		if (defrag_mode)
+			free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+		else
+			free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
+		if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
+					0, free_pages))
 			return true;
 	}
--
cgit
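[editor's note: the kswapd-side effect can be sketched in isolation: in
defrag_mode the balance check counts only free pages sitting in whole
pageblocks, so a zone full of scattered free pages still reads as unbalanced
and kswapd keeps reclaiming. The type and numbers below are illustrative
stand-ins for the zone counters, not kernel code:]

#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	unsigned long nr_free_pages;		/* all free pages */
	unsigned long nr_free_pages_blocks;	/* free pages in whole pageblocks */
	unsigned long high_wmark;
};

static bool zone_balanced_model(const struct zone_model *z, bool defrag_mode)
{
	/* in defrag_mode, only unfragmented free memory counts */
	unsigned long free_pages = defrag_mode ? z->nr_free_pages_blocks
					       : z->nr_free_pages;

	return free_pages >= z->high_wmark;
}

int main(void)
{
	/* plenty of free pages, but scattered across fragmented blocks */
	struct zone_model z = {
		.nr_free_pages = 4096,
		.nr_free_pages_blocks = 512,
		.high_wmark = 1024,
	};

	printf("balanced (normal):      %d\n", zone_balanced_model(&z, false));
	printf("balanced (defrag_mode): %d\n", zone_balanced_model(&z, true));
	return 0;
}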
From e452872b40e3f1fb92adf0d573a0a6a7c9f6ce22 Mon Sep 17 00:00:00 2001
From: Hao Jia
Date: Tue, 18 Mar 2025 15:58:32 +0800
Subject: mm: vmscan: split proactive reclaim statistics from direct reclaim
 statistics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Adding Proactive Memory Reclaim Statistics".

These two patches are related to proactive memory reclaim.

Patch 1 splits proactive reclaim statistics from the direct reclaim
counters and introduces new counters: pgsteal_proactive,
pgdemote_proactive, and pgscan_proactive.

Patch 2 adds pswpin and pswpout items to the cgroup-v2 documentation.

This patch (of 2):

In proactive memory reclaim scenarios, it is necessary to accurately
track proactive reclaim statistics to dynamically adjust the frequency
and amount of memory being reclaimed proactively. Currently, proactive
reclaim is included in direct reclaim statistics, which can make these
direct reclaim statistics misleading.

Therefore, separate proactive reclaim memory from the direct reclaim
counters by introducing new counters: pgsteal_proactive,
pgdemote_proactive, and pgscan_proactive, to avoid confusion with
direct reclaim.

Link: https://lkml.kernel.org/r/20250318075833.90615-1-jiahao.kernel@gmail.com
Link: https://lkml.kernel.org/r/20250318075833.90615-2-jiahao.kernel@gmail.com
Signed-off-by: Hao Jia
Acked-by: Johannes Weiner
Cc: Jonathan Corbet
Cc: Michal Hocko
Cc: Michal Koutný
Cc: Muchun Song
Cc: Roman Gushchin
Cc: Shakeel Butt
Cc: Tejun Heo
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b5c7dfc2b189..98e6ac82e428 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -456,21 +456,26 @@ void drop_slab(void)
 	} while ((freed >> shift++) > 1);
 }
 
-static int reclaimer_offset(void)
+#define CHECK_RECLAIMER_OFFSET(type)					\
+	do {								\
+		BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD !=		\
+			     PGDEMOTE_##type - PGDEMOTE_KSWAPD);	\
+		BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD !=		\
+			     PGSCAN_##type - PGSCAN_KSWAPD);		\
+	} while (0)
+
+static int reclaimer_offset(struct scan_control *sc)
 {
-	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
-			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
-	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
-			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
-	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
-			PGSCAN_DIRECT - PGSCAN_KSWAPD);
-	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
-			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+	CHECK_RECLAIMER_OFFSET(DIRECT);
+	CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
+	CHECK_RECLAIMER_OFFSET(PROACTIVE);
 
 	if (current_is_kswapd())
 		return 0;
 	if (current_is_khugepaged())
 		return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+	if (sc->proactive)
+		return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
 	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
 }
 
@@ -2008,7 +2013,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 				     &nr_scanned, sc, lru);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
-	item = PGSCAN_KSWAPD + reclaimer_offset();
+	item = PGSCAN_KSWAPD + reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_scanned);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@@ -2024,10 +2029,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
 
-	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
 					stat.nr_demoted);
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-	item = PGSTEAL_KSWAPD + reclaimer_offset();
+	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_reclaimed);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
@@ -4571,7 +4576,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 		break;
 	}
 
-	item = PGSCAN_KSWAPD + reclaimer_offset();
+	item = PGSCAN_KSWAPD + reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc)) {
 		__count_vm_events(item, isolated);
 		__count_vm_events(PGREFILL, sorted);
@@ -4721,10 +4726,10 @@ retry:
 		reset_batch_size(walk);
 	}
 
-	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
 			   stat.nr_demoted);
 
-	item = PGSTEAL_KSWAPD + reclaimer_offset();
+	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, reclaimed);
 	__count_memcg_events(memcg, item, reclaimed);
--
cgit
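[editor's note: the CHECK_RECLAIMER_OFFSET() macro above relies on the counter
enums advancing in lockstep, so that a single offset indexes every counter
family. A userspace rendition of the same compile-time guard using C11
static_assert; the enum values are invented stand-ins for vm_event_item:]

#include <assert.h>
#include <stdio.h>

enum {
	PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, PGSTEAL_PROACTIVE,
	PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, PGSCAN_PROACTIVE,
};

/* compile-time twin of CHECK_RECLAIMER_OFFSET()'s BUILD_BUG_ONs */
#define CHECK_OFFSET(type)						\
	static_assert(PGSTEAL_##type - PGSTEAL_KSWAPD ==		\
		      PGSCAN_##type - PGSCAN_KSWAPD,			\
		      "PGSTEAL_* and PGSCAN_* out of lockstep")

CHECK_OFFSET(DIRECT);
CHECK_OFFSET(KHUGEPAGED);
CHECK_OFFSET(PROACTIVE);

int main(void)
{
	/* one offset, valid for every counter family */
	int offset = PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;

	printf("proactive scan counter index: %d\n", PGSCAN_KSWAPD + offset);
	return 0;
}

[If someone later reorders one enum family without the others, the build fails
instead of silently bumping the wrong counter - the point of the kernel macro.]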
From 1b0449544c6482179ac84530b61fc192a6527bfd Mon Sep 17 00:00:00 2001
From: Jinjiang Tu
Date: Tue, 18 Mar 2025 16:39:39 +0800
Subject: mm/vmscan: don't try to reclaim hwpoison folio

Syzkaller reports a bug as follows:

Injecting memory failure for pfn 0x18b00e at process virtual address 0x20ffd000
Memory failure: 0x18b00e: dirty swapcache page still referenced by 2 users
Memory failure: 0x18b00e: recovery action for dirty swapcache page: Failed
page: refcount:2 mapcount:0 mapping:0000000000000000 index:0x20ffd pfn:0x18b00e
memcg:ffff0000dd6d9000
anon flags: 0x5ffffe00482011(locked|dirty|arch_1|swapbacked|hwpoison|node=0|zone=2|lastcpupid=0xfffff)
raw: 005ffffe00482011 dead000000000100 dead000000000122 ffff0000e232a7c9
raw: 0000000000020ffd 0000000000000000 00000002ffffffff ffff0000dd6d9000
page dumped because: VM_BUG_ON_FOLIO(!folio_test_uptodate(folio))
------------[ cut here ]------------
kernel BUG at mm/swap_state.c:184!
Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
Modules linked in:
CPU: 0 PID: 60 Comm: kswapd0 Not tainted 6.6.0-gcb097e7de84e #3
Hardware name: linux,dummy-virt (DT)
pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : add_to_swap+0xbc/0x158
lr : add_to_swap+0xbc/0x158
sp : ffff800087f37340
x29: ffff800087f37340 x28: fffffc00052c0380 x27: ffff800087f37780
x26: ffff800087f37490 x25: ffff800087f37c78 x24: ffff800087f377a0
x23: ffff800087f37c50 x22: 0000000000000000 x21: fffffc00052c03b4
x20: 0000000000000000 x19: fffffc00052c0380 x18: 0000000000000000
x17: 296f696c6f662865 x16: 7461646f7470755f x15: 747365745f6f696c
x14: 6f6621284f494c4f x13: 0000000000000001 x12: ffff600036d8b97b
x11: 1fffe00036d8b97a x10: ffff600036d8b97a x9 : dfff800000000000
x8 : 00009fffc9274686 x7 : ffff0001b6c5cbd3 x6 : 0000000000000001
x5 : ffff0000c25896c0 x4 : 0000000000000000 x3 : 0000000000000000
x2 : 0000000000000000 x1 : ffff0000c25896c0 x0 : 0000000000000000
Call trace:
 add_to_swap+0xbc/0x158
 shrink_folio_list+0x12ac/0x2648
 shrink_inactive_list+0x318/0x948
 shrink_lruvec+0x450/0x720
 shrink_node_memcgs+0x280/0x4a8
 shrink_node+0x128/0x978
 balance_pgdat+0x4f0/0xb20
 kswapd+0x228/0x438
 kthread+0x214/0x230
 ret_from_fork+0x10/0x20

I can reproduce this issue with the following steps:

1) When a dirty swapcache page is isolated by the reclaim process and
   the page isn't locked, inject memory failure for the page.
   me_swapcache_dirty() clears the uptodate flag and tries to delete
   the page from the lru, but fails. The reclaim process will put the
   hwpoisoned page back on the lru.

2) The process that maps the hwpoisoned page exits and the page is
   deleted; the page will never be freed and will stay on the lru
   forever.

3) If we trigger a reclaim again and try to reclaim the page,
   add_to_swap() will trigger VM_BUG_ON_FOLIO because the uptodate
   flag is cleared.

To fix it, skip the hwpoisoned page in shrink_folio_list(). Besides,
the hwpoison folio may not have been unmapped by
hwpoison_user_mappings() yet; unmap it in shrink_folio_list(),
otherwise the folio will fail to be unmapped by
hwpoison_user_mappings() since the folio isn't on the lru list.

Link: https://lkml.kernel.org/r/20250318083939.987651-3-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu
Acked-by: Miaohe Lin
Cc: David Hildenbrand
Cc: Kefeng Wang
Cc: Nanyong Sun
Cc: Naoya Horiguchi
Cc:
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 98e6ac82e428..2b2ab386cab5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1127,6 +1127,13 @@ retry:
 		if (!folio_trylock(folio))
 			goto keep;
 
+		if (folio_contain_hwpoisoned_page(folio)) {
+			unmap_poisoned_folio(folio, folio_pfn(folio), false);
+			folio_unlock(folio);
+			folio_put(folio);
+			continue;
+		}
+
 		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
 
 		nr_pages = folio_nr_pages(folio);
--
cgit