summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKiryl Shutsemau <kas@kernel.org>2025-09-15 14:52:53 +0100
committerAndrew Morton <akpm@linux-foundation.org>2025-09-23 14:14:15 -0700
commitf8a01513f5749180228d6460662188dd1e101a53 (patch)
tree80306affe8906781d663b58a234c9dceab070baa
parentf7a741c53b712542aedd9382f215fbe969f8a580 (diff)
mm/khugepaged: do not fail collapse_pte_mapped_thp() on SCAN_PMD_NULL
MADV_COLLAPSE on a file mapping behaves inconsistently depending on if PMD page table is installed or not. Consider following example: p = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); err = madvise(p, 2UL << 20, MADV_COLLAPSE); fd is a populated tmpfs file. The result depends on the address that the kernel returns on mmap(). If it is located in an existing PMD table, the madvise() will succeed. However, if the table does not exist, it will fail with -EINVAL. This occurs because find_pmd_or_thp_or_none() returns SCAN_PMD_NULL when a page table is missing, which causes collapse_pte_mapped_thp() to fail. SCAN_PMD_NULL and SCAN_PMD_NONE should be treated the same in collapse_pte_mapped_thp(): install the PMD leaf entry and allocate page tables as needed. Link: https://lkml.kernel.org/r/v5ivpub6z2n2uyemlnxgbilzs52ep4lrary7lm7o6axxoneb75@yfacfl5rkzeh Signed-off-by: Kiryl Shutsemau <kas@kernel.org> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Dev Jain <dev.jain@arm.com> Reviewed-by: Zi Yan <ziy@nvidia.com> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Reviewed-by: Zach O'Keefe <zokeefe@google.com> Cc: Barry Song <baohua@kernel.org> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Mariano Pache <npache@redhat.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-rw-r--r--mm/khugepaged.c20
1 files changed, 19 insertions, 1 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index af5f5c80fe4e..9ed1af2b5c38 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1460,15 +1460,32 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct folio *folio, struct page *page)
{
+ struct mm_struct *mm = vma->vm_mm;
struct vm_fault vmf = {
.vma = vma,
.address = addr,
.flags = 0,
- .pmd = pmdp,
};
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
mmap_assert_locked(vma->vm_mm);
+ if (!pmdp) {
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ return SCAN_FAIL;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ return SCAN_FAIL;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ return SCAN_FAIL;
+ }
+
+ vmf.pmd = pmdp;
if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
@@ -1544,6 +1561,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
switch (result) {
case SCAN_SUCCEED:
break;
+ case SCAN_PMD_NULL:
case SCAN_PMD_NONE:
/*
* All pte entries have been removed and pmd cleared.