summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-04-19 21:46:58 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-04-19 21:46:58 -0700
commit6fea5fabd3323cd27b2ab5143263f37ff29550cb (patch)
treedaf14fe856e36d8023a5ca15db5ea9e96b193fe8
parent119009db267415049182774196e3cce9e13b52ef (diff)
parentea21641b6a79f9cdd64f8339983c71c89949dcb5 (diff)
Merge tag 'mm-hotfixes-stable-2025-04-19-21-24' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc hotfixes from Andrew Morton: "16 hotfixes. 2 are cc:stable and the remainder address post-6.14 issues or aren't considered necessary for -stable kernels. All patches are basically for MM although five are alterations to MAINTAINERS" [ Basic counting skills are clearly not a strictly necessary requirement for kernel maintainers. - Linus ] * tag 'mm-hotfixes-stable-2025-04-19-21-24' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: MAINTAINERS: add section for locking of mm's and VMAs mm: vmscan: fix kswapd exit condition in defrag_mode mm: vmscan: restore high-cpu watermark safety in kswapd MAINTAINERS: add Pedro as reviewer to the MEMORY MAPPING section mm/memory: move sanity checks in do_wp_page() after mapcount vs. refcount stabilization mm, hugetlb: increment the number of pages to be reset on HVO writeback: fix false warning in inode_to_wb() docs: ABI: replace mcroce@microsoft.com with new Meta address mm/gup: fix wrongly calculated returned value in fault_in_safe_writeable() MAINTAINERS: add memory advice section MAINTAINERS: add mmap trace events to MEMORY MAPPING mm: memcontrol: fix swap counter leak from offline cgroup MAINTAINERS: add MM subsection for the page allocator MAINTAINERS: update SLAB ALLOCATOR maintainers fs/dax: fix folio splitting issue by resetting old folio order + _nr_pages mm/page_alloc: fix deadlock on cpu_hotplug_lock in __accept_page()
-rw-r--r--CREDITS4
-rw-r--r--Documentation/ABI/stable/sysfs-block2
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-reboot10
-rw-r--r--MAINTAINERS49
-rw-r--r--fs/dax.c1
-rw-r--r--include/linux/backing-dev.h1
-rw-r--r--include/linux/mm.h17
-rw-r--r--include/linux/mmzone.h5
-rw-r--r--mm/gup.c4
-rw-r--r--mm/hugetlb_vmemmap.c6
-rw-r--r--mm/internal.h1
-rw-r--r--mm/memcontrol-v1.c2
-rw-r--r--mm/memory.c4
-rw-r--r--mm/mm_init.c1
-rw-r--r--mm/page_alloc.c40
-rw-r--r--mm/vmscan.c29
16 files changed, 141 insertions, 35 deletions
diff --git a/CREDITS b/CREDITS
index 1b77fba6c27e..f74d230992d6 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2071,6 +2071,10 @@ S: 660 Harvard Ave. #7
S: Santa Clara, CA 95051
S: USA
+N: Joonsoo Kim
+E: iamjoonsoo.kim@lge.com
+D: Slab allocators
+
N: Kukjin Kim
E: kgene@kernel.org
D: Samsung S3C, S5P and Exynos ARM architectures
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 3879963f0f01..11545c9e2e93 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -77,7 +77,7 @@ Description:
What: /sys/block/<disk>/diskseq
Date: February 2021
-Contact: Matteo Croce <mcroce@microsoft.com>
+Contact: Matteo Croce <teknoraver@meta.com>
Description:
The /sys/block/<disk>/diskseq files reports the disk
sequence number, which is a monotonically increasing
diff --git a/Documentation/ABI/testing/sysfs-kernel-reboot b/Documentation/ABI/testing/sysfs-kernel-reboot
index e117aba46be0..52571fd5ddba 100644
--- a/Documentation/ABI/testing/sysfs-kernel-reboot
+++ b/Documentation/ABI/testing/sysfs-kernel-reboot
@@ -1,7 +1,7 @@
What: /sys/kernel/reboot
Date: November 2020
KernelVersion: 5.11
-Contact: Matteo Croce <mcroce@microsoft.com>
+Contact: Matteo Croce <teknoraver@meta.com>
Description: Interface to set the kernel reboot behavior, similarly to
what can be done via the reboot= cmdline option.
(see Documentation/admin-guide/kernel-parameters.txt)
@@ -9,25 +9,25 @@ Description: Interface to set the kernel reboot behavior, similarly to
What: /sys/kernel/reboot/mode
Date: November 2020
KernelVersion: 5.11
-Contact: Matteo Croce <mcroce@microsoft.com>
+Contact: Matteo Croce <teknoraver@meta.com>
Description: Reboot mode. Valid values are: cold warm hard soft gpio
What: /sys/kernel/reboot/type
Date: November 2020
KernelVersion: 5.11
-Contact: Matteo Croce <mcroce@microsoft.com>
+Contact: Matteo Croce <teknoraver@meta.com>
Description: Reboot type. Valid values are: bios acpi kbd triple efi pci
What: /sys/kernel/reboot/cpu
Date: November 2020
KernelVersion: 5.11
-Contact: Matteo Croce <mcroce@microsoft.com>
+Contact: Matteo Croce <teknoraver@meta.com>
Description: CPU number to use to reboot.
What: /sys/kernel/reboot/force
Date: November 2020
KernelVersion: 5.11
-Contact: Matteo Croce <mcroce@microsoft.com>
+Contact: Matteo Croce <teknoraver@meta.com>
Description: Don't wait for any other CPUs on reboot and
avoid anything that could hang.
diff --git a/MAINTAINERS b/MAINTAINERS
index c966e71ea60b..fa1e04e87d1d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15517,6 +15517,21 @@ F: mm/numa.c
F: mm/numa_emulation.c
F: mm/numa_memblks.c
+MEMORY MANAGEMENT - PAGE ALLOCATOR
+M: Andrew Morton <akpm@linux-foundation.org>
+R: Vlastimil Babka <vbabka@suse.cz>
+R: Suren Baghdasaryan <surenb@google.com>
+R: Michal Hocko <mhocko@suse.com>
+R: Brendan Jackman <jackmanb@google.com>
+R: Johannes Weiner <hannes@cmpxchg.org>
+R: Zi Yan <ziy@nvidia.com>
+L: linux-mm@kvack.org
+S: Maintained
+F: mm/compaction.c
+F: mm/page_alloc.c
+F: include/linux/gfp.h
+F: include/linux/compaction.h
+
MEMORY MANAGEMENT - SECRETMEM
M: Andrew Morton <akpm@linux-foundation.org>
M: Mike Rapoport <rppt@kernel.org>
@@ -15544,10 +15559,12 @@ M: Liam R. Howlett <Liam.Howlett@oracle.com>
M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
R: Vlastimil Babka <vbabka@suse.cz>
R: Jann Horn <jannh@google.com>
+R: Pedro Falcato <pfalcato@suse.de>
L: linux-mm@kvack.org
S: Maintained
W: http://www.linux-mm.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: include/trace/events/mmap.h
F: mm/mlock.c
F: mm/mmap.c
F: mm/mprotect.c
@@ -15558,6 +15575,36 @@ F: mm/vma.h
F: mm/vma_internal.h
F: tools/testing/vma/
+MEMORY MAPPING - LOCKING
+M: Andrew Morton <akpm@linux-foundation.org>
+M: Suren Baghdasaryan <surenb@google.com>
+M: Liam R. Howlett <Liam.Howlett@oracle.com>
+M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+R: Vlastimil Babka <vbabka@suse.cz>
+R: Shakeel Butt <shakeel.butt@linux.dev>
+L: linux-mm@kvack.org
+S: Maintained
+W: http://www.linux-mm.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: Documentation/mm/process_addrs.rst
+F: include/linux/mmap_lock.h
+F: include/trace/events/mmap_lock.h
+F: mm/mmap_lock.c
+
+MEMORY MAPPING - MADVISE (MEMORY ADVICE)
+M: Andrew Morton <akpm@linux-foundation.org>
+M: Liam R. Howlett <Liam.Howlett@oracle.com>
+M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+M: David Hildenbrand <david@redhat.com>
+R: Vlastimil Babka <vbabka@suse.cz>
+R: Jann Horn <jannh@google.com>
+L: linux-mm@kvack.org
+S: Maintained
+W: http://www.linux-mm.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: include/uapi/asm-generic/mman-common.h
+F: mm/madvise.c
+
MEMORY TECHNOLOGY DEVICES (MTD)
M: Miquel Raynal <miquel.raynal@bootlin.com>
M: Richard Weinberger <richard@nod.at>
@@ -22250,9 +22297,7 @@ F: drivers/nvmem/layouts/sl28vpd.c
SLAB ALLOCATOR
M: Christoph Lameter <cl@linux.com>
-M: Pekka Enberg <penberg@kernel.org>
M: David Rientjes <rientjes@google.com>
-M: Joonsoo Kim <iamjoonsoo.kim@lge.com>
M: Andrew Morton <akpm@linux-foundation.org>
M: Vlastimil Babka <vbabka@suse.cz>
R: Roman Gushchin <roman.gushchin@linux.dev>
diff --git a/fs/dax.c b/fs/dax.c
index af5045b0f476..676303419e9e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -396,6 +396,7 @@ static inline unsigned long dax_folio_put(struct folio *folio)
order = folio_order(folio);
if (!order)
return 0;
+ folio_reset_order(folio);
for (i = 0; i < (1UL << order); i++) {
struct dev_pagemap *pgmap = page_pgmap(&folio->page);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 8e7af9a03b41..e721148c95d0 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -249,6 +249,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
WARN_ON_ONCE(debug_locks &&
+ (inode->i_sb->s_iflags & SB_I_CGROUPWB) &&
(!lockdep_is_held(&inode->i_lock) &&
!lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) &&
!lockdep_is_held(&inode->i_wb->list_lock)));
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b7f13f087954..bf55206935c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1218,6 +1218,23 @@ static inline unsigned int folio_order(const struct folio *folio)
return folio_large_order(folio);
}
+/**
+ * folio_reset_order - Reset the folio order and derived _nr_pages
+ * @folio: The folio.
+ *
+ * Reset the order and derived _nr_pages to 0. Must only be used in the
+ * process of splitting large folios.
+ */
+static inline void folio_reset_order(struct folio *folio)
+{
+ if (WARN_ON_ONCE(!folio_test_large(folio)))
+ return;
+ folio->_flags_1 &= ~0xffUL;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 0;
+#endif
+}
+
#include <linux/huge_mm.h>
/*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 25e80b2ca7f4..6ccec1bf2896 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -967,6 +967,9 @@ struct zone {
#ifdef CONFIG_UNACCEPTED_MEMORY
/* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
struct list_head unaccepted_pages;
+
+ /* To be called once the last page in the zone is accepted */
+ struct work_struct unaccepted_cleanup;
#endif
/* zone flags, see below */
@@ -1499,8 +1502,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
bool zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags);
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx);
/*
* Memory initialization context, use to differentiate memory added by
* the platform statically or via memory hotplug interface.
diff --git a/mm/gup.c b/mm/gup.c
index 92351e2fa876..84461d384ae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2207,8 +2207,8 @@ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
} while (start != end);
mmap_read_unlock(mm);
- if (size > (unsigned long)uaddr - start)
- return size - ((unsigned long)uaddr - start);
+ if (size > start - (unsigned long)uaddr)
+ return size - (start - (unsigned long)uaddr);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 9a99dfa3c495..27245e86df25 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* struct page, the special metadata (e.g. page->flags or page->mapping)
* cannot copy to the tail struct page structs. The invalid value will be
* checked in the free_tail_page_prepare(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
+ * of "corrupted mapping in tail page". We need to reset at least 4 (one
+ * head struct page struct and three tail struct page structs) struct page
* structs.
*/
-#define NR_RESET_STRUCT_PAGE 3
+#define NR_RESET_STRUCT_PAGE 4
static inline void reset_struct_pages(struct page *start)
{
diff --git a/mm/internal.h b/mm/internal.h
index 50c2f590b2d0..e9695baa5922 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1595,6 +1595,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc);
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
+void unaccepted_cleanup_work(struct work_struct *work);
#else /* CONFIG_UNACCEPTED_MEMORY */
static inline void accept_page(struct page *page)
{
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 8660908850dc..4a9cf27a70af 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -620,7 +620,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
diff --git a/mm/memory.c b/mm/memory.c
index 44481fe7c629..ba3ea0a82f7f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3734,8 +3734,6 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
return false;
VM_WARN_ON_ONCE(folio_test_ksm(folio));
- VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio));
- VM_WARN_ON_ONCE(folio_entire_mapcount(folio));
if (unlikely(folio_test_swapcache(folio))) {
/*
@@ -3760,6 +3758,8 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
if (folio_large_mapcount(folio) != folio_ref_count(folio))
goto unlock;
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 84f14fa12d0d..9659689b8ace 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1441,6 +1441,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
#ifdef CONFIG_UNACCEPTED_MEMORY
INIT_LIST_HEAD(&zone->unaccepted_pages);
+ INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work);
#endif
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1715e34b91af..5669baf2a6fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3470,18 +3470,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
return false;
}
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx)
-{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
- if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
- free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
- free_pages);
-}
-
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
@@ -7191,6 +7179,11 @@ static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
static bool lazy_accept = true;
+void unaccepted_cleanup_work(struct work_struct *work)
+{
+ static_branch_dec(&zones_with_unaccepted_pages);
+}
+
static int __init accept_memory_parse(char *p)
{
if (!strcmp(p, "lazy")) {
@@ -7229,8 +7222,27 @@ static void __accept_page(struct zone *zone, unsigned long *flags,
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
+ if (last) {
+ /*
+ * There are two corner cases:
+ *
+ * - If allocation occurs during the CPU bring up,
+ * static_branch_dec() cannot be used directly as
+ * it causes a deadlock on cpu_hotplug_lock.
+ *
+ * Instead, use schedule_work() to prevent deadlock.
+ *
+ * - If allocation occurs before workqueues are initialized,
+ * static_branch_dec() should be called directly.
+ *
+ * Workqueues are initialized before CPU bring up, so this
+ * will not conflict with the first scenario.
+ */
+ if (system_wq)
+ schedule_work(&zone->unaccepted_cleanup);
+ else
+ unaccepted_cleanup_work(&zone->unaccepted_cleanup);
+ }
}
void accept_page(struct page *page)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b620d74b0f66..3783e45bfc92 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6736,6 +6736,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* meet watermarks.
*/
for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
@@ -6746,11 +6747,33 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
/*
* In defrag_mode, watermarks must be met in whole
* blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
*/
- if (defrag_mode)
- free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
else
- free_pages = zone_page_state(zone, NR_FREE_PAGES);
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
0, free_pages))