path: root/mm/huge_memory.c
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  1208
1 file changed, 738 insertions, 470 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1192e62531cd..f7c565f11a98 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -37,11 +37,11 @@
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
+#include <linux/pgalloc.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>
#include <asm/tlb.h>
-#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"
@@ -1077,28 +1077,103 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
+static struct deferred_split *split_queue_node(int nid)
+{
+ struct pglist_data *pgdata = NODE_DATA(nid);
+
+ return &pgdata->deferred_split_queue;
+}
+
#ifdef CONFIG_MEMCG
static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+ struct deferred_split *queue)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
- struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+ if (mem_cgroup_disabled())
+ return NULL;
+ if (split_queue_node(folio_nid(folio)) == queue)
+ return NULL;
+ return container_of(queue, struct mem_cgroup, deferred_split_queue);
+}
- if (memcg)
- return &memcg->deferred_split_queue;
- else
- return &pgdat->deferred_split_queue;
+static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
+{
+ return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
}
#else
static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+ struct deferred_split *queue)
{
- struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+ return NULL;
+}
- return &pgdat->deferred_split_queue;
+static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
+{
+ return split_queue_node(nid);
}
#endif
+static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
+{
+ struct deferred_split *queue;
+
+retry:
+ queue = memcg_split_queue(nid, memcg);
+ spin_lock(&queue->split_queue_lock);
+ /*
+	 * There is a window between marking a memcg as dying and reparenting
+	 * its deferred split queue; during this window, THPs on the deferred
+	 * split queue are hidden from the shrinker.
+ */
+ if (unlikely(memcg_is_dying(memcg))) {
+ spin_unlock(&queue->split_queue_lock);
+ memcg = parent_mem_cgroup(memcg);
+ goto retry;
+ }
+
+ return queue;
+}
+
+static struct deferred_split *
+split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
+{
+ struct deferred_split *queue;
+
+retry:
+ queue = memcg_split_queue(nid, memcg);
+ spin_lock_irqsave(&queue->split_queue_lock, *flags);
+ if (unlikely(memcg_is_dying(memcg))) {
+ spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
+ memcg = parent_mem_cgroup(memcg);
+ goto retry;
+ }
+
+ return queue;
+}
+
+static struct deferred_split *folio_split_queue_lock(struct folio *folio)
+{
+ return split_queue_lock(folio_nid(folio), folio_memcg(folio));
+}
+
+static struct deferred_split *
+folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
+{
+ return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
+}
+
+static inline void split_queue_unlock(struct deferred_split *queue)
+{
+ spin_unlock(&queue->split_queue_lock);
+}
+
+static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
+ unsigned long flags)
+{
+ spin_unlock_irqrestore(&queue->split_queue_lock, flags);
+}
+
static inline bool is_transparent_hugepage(const struct folio *folio)
{
if (!folio_test_large(folio))
@@ -1127,7 +1202,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
if (len_pad < len || (off + len_pad) < off)
return 0;
- ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
+ ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad,
off >> PAGE_SHIFT, flags, vm_flags);
/*
@@ -1164,7 +1239,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
if (ret)
return ret;
- return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
+ return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags,
vm_flags);
}
@@ -1218,7 +1293,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
return folio;
}
-static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd,
struct vm_area_struct *vma, unsigned long haddr)
{
pmd_t entry;
@@ -1229,6 +1304,13 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
folio_add_lru_vma(folio, vma);
set_pmd_at(vma->vm_mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, haddr, pmd);
+ deferred_split_folio(folio, false);
+}
+
+static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd,
+ struct vm_area_struct *vma, unsigned long haddr)
+{
+ map_anon_folio_pmd_nopf(folio, pmd, vma, haddr);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
count_vm_event(THP_FAULT_ALLOC);
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
@@ -1271,9 +1353,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return ret;
}
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+ map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr);
mm_inc_nr_ptes(vma->vm_mm);
- deferred_split_folio(folio, false);
spin_unlock(vmf->ptl);
}
@@ -1288,6 +1369,44 @@ release:
}
+vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = 0;
+ spinlock_t *ptl;
+ softleaf_t entry;
+ struct page *page;
+ struct folio *folio;
+
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+
+ ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ entry = softleaf_from_pmd(vmf->orig_pmd);
+ page = softleaf_to_page(entry);
+ folio = page_folio(page);
+ vmf->page = page;
+ vmf->pte = NULL;
+ if (folio_trylock(folio)) {
+ folio_get(folio);
+ spin_unlock(ptl);
+ ret = page_pgmap(page)->ops->migrate_to_ram(vmf);
+ folio_unlock(folio);
+ folio_put(folio);
+ } else {
+ spin_unlock(ptl);
+ }
+
+ return ret;
+}
+
/*
* always: directly stall for all thp allocations
* defer: wake kswapd and fail if not immediately available
@@ -1668,6 +1787,62 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
return false;
}
+static void copy_huge_non_present_pmd(
+ struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pmd_t pmd, pgtable_t pgtable)
+{
+ softleaf_t entry = softleaf_from_pmd(pmd);
+ struct folio *src_folio;
+
+ VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd));
+
+ if (softleaf_is_migration_write(entry) ||
+ softleaf_is_migration_read_exclusive(entry)) {
+ entry = make_readable_migration_entry(swp_offset(entry));
+ pmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ } else if (softleaf_is_device_private(entry)) {
+ /*
+		 * For device private entries there are no read-exclusive
+		 * entries, so writable = !readable.
+ */
+ if (softleaf_is_device_private_write(entry)) {
+ entry = make_readable_device_private_entry(swp_offset(entry));
+ pmd = swp_entry_to_pmd(entry);
+
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
+
+ src_folio = softleaf_to_folio(entry);
+ VM_WARN_ON(!folio_test_large(src_folio));
+
+ folio_get(src_folio);
+ /*
+ * folio_try_dup_anon_rmap_pmd does not fail for
+ * device private entries.
+ */
+ folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
+ dst_vma, src_vma);
+ }
+
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm_inc_nr_ptes(dst_mm);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+}
+
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
@@ -1713,31 +1888,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- if (unlikely(is_swap_pmd(pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(pmd);
-
- VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (!is_readable_migration_entry(entry)) {
- entry = make_readable_migration_entry(
- swp_offset(entry));
- pmd = swp_entry_to_pmd(entry);
- if (pmd_swp_soft_dirty(*src_pmd))
- pmd = pmd_swp_mksoft_dirty(pmd);
- if (pmd_swp_uffd_wp(*src_pmd))
- pmd = pmd_swp_mkuffd_wp(pmd);
- set_pmd_at(src_mm, addr, src_pmd, pmd);
- }
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- mm_inc_nr_ptes(dst_mm);
- pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
- if (!userfaultfd_wp(dst_vma))
- pmd = pmd_swp_clear_uffd_wp(pmd);
- set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ if (unlikely(thp_migration_supported() &&
+ pmd_is_valid_softleaf(pmd))) {
+ copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
+ dst_vma, src_vma, pmd, pgtable);
ret = 0;
goto out_unlock;
}
-#endif
if (unlikely(!pmd_trans_huge(pmd))) {
pte_free(dst_mm, pgtable);
@@ -1887,7 +2044,7 @@ static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
if (ret)
goto release;
(void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
- map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+ map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr);
goto unlock;
release:
folio_put(folio);
@@ -2123,7 +2280,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
+ !pmd_is_migration_entry(orig_pmd));
goto out;
}
@@ -2221,15 +2378,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio_remove_rmap_pmd(folio, page, vma);
WARN_ON_ONCE(folio_mapcount(folio) < 0);
VM_BUG_ON_PAGE(!PageHead(page), page);
- } else if (thp_migration_supported()) {
- swp_entry_t entry;
+ } else if (pmd_is_valid_softleaf(orig_pmd)) {
+ const softleaf_t entry = softleaf_from_pmd(orig_pmd);
- VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
- entry = pmd_to_swp_entry(orig_pmd);
- folio = pfn_swap_entry_folio(entry);
+ folio = softleaf_to_folio(entry);
flush_needed = 0;
- } else
- WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+ if (!thp_migration_supported())
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+ }
if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
@@ -2249,6 +2406,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio_mark_accessed(folio);
}
+ if (folio_is_device_private(folio)) {
+ folio_remove_rmap_pmd(folio, &folio->page, vma);
+ WARN_ON_ONCE(folio_mapcount(folio) < 0);
+ folio_put(folio);
+ }
+
spin_unlock(ptl);
if (flush_needed)
tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2273,20 +2436,23 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
-#ifdef CONFIG_MEM_SOFT_DIRTY
- if (unlikely(is_pmd_migration_entry(pmd)))
- pmd = pmd_swp_mksoft_dirty(pmd);
- else if (pmd_present(pmd))
- pmd = pmd_mksoft_dirty(pmd);
-#endif
+ if (pgtable_supports_soft_dirty()) {
+ if (unlikely(pmd_is_migration_entry(pmd)))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ else if (pmd_present(pmd))
+ pmd = pmd_mksoft_dirty(pmd);
+ }
+
return pmd;
}
static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
{
+ if (pmd_none(pmd))
+ return pmd;
if (pmd_present(pmd))
pmd = pmd_clear_uffd_wp(pmd);
- else if (is_swap_pmd(pmd))
+ else
pmd = pmd_swp_clear_uffd_wp(pmd);
return pmd;
@@ -2343,6 +2509,42 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
return false;
}
+static void change_non_present_huge_pmd(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmd, bool uffd_wp,
+ bool uffd_wp_resolve)
+{
+ softleaf_t entry = softleaf_from_pmd(*pmd);
+ const struct folio *folio = softleaf_to_folio(entry);
+ pmd_t newpmd;
+
+ VM_WARN_ON(!pmd_is_valid_softleaf(*pmd));
+ if (softleaf_is_migration_write(entry)) {
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ if (folio_test_anon(folio))
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
+ newpmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*pmd))
+ newpmd = pmd_swp_mksoft_dirty(newpmd);
+ } else if (softleaf_is_device_private_write(entry)) {
+ entry = make_readable_device_private_entry(swp_offset(entry));
+ newpmd = swp_entry_to_pmd(entry);
+ } else {
+ newpmd = *pmd;
+ }
+
+ if (uffd_wp)
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
+ else if (uffd_wp_resolve)
+ newpmd = pmd_swp_clear_uffd_wp(newpmd);
+ if (!pmd_same(*pmd, newpmd))
+ set_pmd_at(mm, addr, pmd, newpmd);
+}
+
/*
* Returns
* - 0 if PMD could not be locked
@@ -2371,42 +2573,14 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (!ptl)
return 0;
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- if (is_swap_pmd(*pmd)) {
- swp_entry_t entry = pmd_to_swp_entry(*pmd);
- struct folio *folio = pfn_swap_entry_folio(entry);
- pmd_t newpmd;
-
- VM_BUG_ON(!is_pmd_migration_entry(*pmd));
- if (is_writable_migration_entry(entry)) {
- /*
- * A protection check is difficult so
- * just be safe and disable write
- */
- if (folio_test_anon(folio))
- entry = make_readable_exclusive_migration_entry(swp_offset(entry));
- else
- entry = make_readable_migration_entry(swp_offset(entry));
- newpmd = swp_entry_to_pmd(entry);
- if (pmd_swp_soft_dirty(*pmd))
- newpmd = pmd_swp_mksoft_dirty(newpmd);
- } else {
- newpmd = *pmd;
- }
-
- if (uffd_wp)
- newpmd = pmd_swp_mkuffd_wp(newpmd);
- else if (uffd_wp_resolve)
- newpmd = pmd_swp_clear_uffd_wp(newpmd);
- if (!pmd_same(*pmd, newpmd))
- set_pmd_at(mm, addr, pmd, newpmd);
+ if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
+ change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
+ uffd_wp_resolve);
goto unlock;
}
-#endif
if (prot_numa) {
- struct folio *folio;
- bool toptier;
+
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -2418,19 +2592,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_protnone(*pmd))
goto unlock;
- folio = pmd_folio(*pmd);
- toptier = node_is_toptier(folio_nid(folio));
- /*
- * Skip scanning top tier node if normal numa
- * balancing is disabled
- */
- if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- toptier)
+ if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
+ vma_is_single_threaded_private(vma)))
goto unlock;
-
- if (folio_use_access_time(folio))
- folio_xchg_access_time(folio,
- jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
@@ -2543,7 +2707,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
pmd_t _dst_pmd, src_pmdval;
struct page *src_page;
struct folio *src_folio;
- struct anon_vma *src_anon_vma;
spinlock_t *src_ptl, *dst_ptl;
pgtable_t src_pgtable;
struct mmu_notifier_range range;
@@ -2565,7 +2728,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
if (!pmd_trans_huge(src_pmdval)) {
spin_unlock(src_ptl);
- if (is_pmd_migration_entry(src_pmdval)) {
+ if (pmd_is_migration_entry(src_pmdval)) {
pmd_migration_entry_wait(mm, &src_pmdval);
return -EAGAIN;
}
@@ -2592,23 +2755,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
src_addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
- if (src_folio) {
+ if (src_folio)
folio_lock(src_folio);
- /*
- * split_huge_page walks the anon_vma chain without the page
- * lock. Serialize against it with the anon_vma lock, the page
- * lock is not enough.
- */
- src_anon_vma = folio_get_anon_vma(src_folio);
- if (!src_anon_vma) {
- err = -EAGAIN;
- goto unlock_folio;
- }
- anon_vma_lock_write(src_anon_vma);
- } else
- src_anon_vma = NULL;
-
dst_ptl = pmd_lockptr(mm, dst_pmd);
double_pt_lock(src_ptl, dst_ptl);
if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
@@ -2653,11 +2802,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
double_pt_unlock(src_ptl, dst_ptl);
- if (src_anon_vma) {
- anon_vma_unlock_write(src_anon_vma);
- put_anon_vma(src_anon_vma);
- }
-unlock_folio:
/* unblock rmap walks */
if (src_folio)
folio_unlock(src_folio);
@@ -2677,8 +2821,9 @@ unlock_folio:
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
spinlock_t *ptl;
+
ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
+ if (likely(pmd_is_huge(*pmd)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -2844,7 +2989,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
+ bool soft_dirty, uffd_wp = false, young = false, write = false;
bool anon_exclusive = false, dirty = false;
unsigned long addr;
pte_t *pte;
@@ -2853,7 +2998,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
+
+ VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -2867,11 +3013,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
zap_deposited_table(mm, pmd);
if (!vma_is_dax(vma) && vma_is_special_huge(vma))
return;
- if (unlikely(is_pmd_migration_entry(old_pmd))) {
- swp_entry_t entry;
+ if (unlikely(pmd_is_migration_entry(old_pmd))) {
+ const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
- entry = pmd_to_swp_entry(old_pmd);
- folio = pfn_swap_entry_folio(entry);
+ folio = softleaf_to_folio(old_entry);
} else if (is_huge_zero_pmd(old_pmd)) {
return;
} else {
@@ -2901,20 +3046,54 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- pmd_migration = is_pmd_migration_entry(*pmd);
- if (unlikely(pmd_migration)) {
- swp_entry_t entry;
+ if (pmd_is_migration_entry(*pmd)) {
+ softleaf_t entry;
old_pmd = *pmd;
- entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
- write = is_writable_migration_entry(entry);
+ entry = softleaf_from_pmd(old_pmd);
+ page = softleaf_to_page(entry);
+ folio = page_folio(page);
+
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = softleaf_is_migration_write(entry);
if (PageAnon(page))
- anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = is_migration_entry_young(entry);
- dirty = is_migration_entry_dirty(entry);
+ anon_exclusive = softleaf_is_migration_read_exclusive(entry);
+ young = softleaf_is_migration_young(entry);
+ dirty = softleaf_is_migration_dirty(entry);
+ } else if (pmd_is_device_private_entry(*pmd)) {
+ softleaf_t entry;
+
+ old_pmd = *pmd;
+ entry = softleaf_from_pmd(old_pmd);
+ page = softleaf_to_page(entry);
+ folio = page_folio(page);
+
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = softleaf_is_device_private_write(entry);
+ anon_exclusive = PageAnonExclusive(page);
+
+ /*
+ * Device private THP should be treated the same as regular
+		 * folios w.r.t. anon exclusive handling. See the comments for
+ * folio handling and anon_exclusive below.
+ */
+ if (freeze && anon_exclusive &&
+ folio_try_share_anon_rmap_pmd(folio, page))
+ freeze = false;
+ if (!freeze) {
+ rmap_t rmap_flags = RMAP_NONE;
+
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+ if (anon_exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+ vma, haddr, rmap_flags);
+ }
} else {
/*
* Up to this point the pmd is present and huge and userland has
@@ -2998,11 +3177,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* Note that NUMA hinting access restrictions are not transferred to
* avoid any possibility of altering permissions across VMAs.
*/
- if (freeze || pmd_migration) {
- for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry;
- swp_entry_t swp_entry;
+ if (freeze || pmd_is_migration_entry(old_pmd)) {
+ pte_t entry;
+ swp_entry_t swp_entry;
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i));
@@ -3021,7 +3200,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
+ }
+ } else if (pmd_is_device_private_entry(old_pmd)) {
+ pte_t entry;
+ swp_entry_t swp_entry;
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ /*
+ * anon_exclusive was already propagated to the relevant
+ * pages corresponding to the pte entries when freeze
+ * is false.
+ */
+ if (write)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page + i));
+ /*
+			 * Young and dirty bits are not propagated via swp_entry
+ */
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
VM_WARN_ON(!pte_none(ptep_get(pte + i)));
set_pte_at(mm, addr, pte + i, entry);
}
@@ -3048,7 +3253,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
pte_unmap(pte);
- if (!pmd_migration)
+ if (!pmd_is_migration_entry(*pmd))
folio_remove_rmap_pmd(folio, page, vma);
if (freeze)
put_page(page);
@@ -3061,7 +3266,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze)
{
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
+ if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
}
@@ -3240,6 +3445,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
+ if (folio_is_device_private(folio))
+ return;
+
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
@@ -3354,15 +3562,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
new_folio->mapping = folio->mapping;
new_folio->index = folio->index + i;
- /*
- * page->private should not be set in tail pages. Fix up and warn once
- * if private is unexpectedly set.
- */
- if (unlikely(new_folio->private)) {
- VM_WARN_ON_ONCE_PAGE(true, new_head);
- new_folio->private = NULL;
- }
-
if (folio_test_swapcache(folio))
new_folio->swap.val = folio->swap.val + i;
@@ -3398,8 +3597,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
ClearPageCompound(&folio->page);
}
-/*
- * It splits an unmapped @folio to lower order smaller folios in two ways.
+/**
+ * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in
+ * two ways: uniform split or non-uniform split.
* @folio: the to-be-split folio
* @new_order: the smallest order of the after split folios (since buddy
* allocator like split generates folios with orders from @folio's
@@ -3408,64 +3608,56 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* will be split until its order becomes @new_order.
* @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
* @mapping: @folio->mapping
- * @uniform_split: if the split is uniform or not (buddy allocator like split)
+ * @split_type: if the split is uniform or not (buddy allocator like split)
*
*
* 1. uniform split: the given @folio into multiple @new_order small folios,
* where all small folios have the same order. This is done when
- * uniform_split is true.
+ * split_type is SPLIT_TYPE_UNIFORM.
* 2. buddy allocator like (non-uniform) split: the given @folio is split into
* half and one of the half (containing the given page) is split into half
- * until the given @page's order becomes @new_order. This is done when
- * uniform_split is false.
+ * until the given @folio's order becomes @new_order. This is done when
+ * split_type is SPLIT_TYPE_NON_UNIFORM.
*
* The high level flow for these two methods are:
- * 1. uniform split: a single __split_folio_to_order() is called to split the
- * @folio into @new_order, then we traverse all the resulting folios one by
- * one in PFN ascending order and perform stats, unfreeze, adding to list,
- * and file mapping index operations.
- * 2. non-uniform split: in general, folio_order - @new_order calls to
- * __split_folio_to_order() are made in a for loop to split the @folio
- * to one lower order at a time. The resulting small folios are processed
- * like what is done during the traversal in 1, except the one containing
- * @page, which is split in next for loop.
+ *
+ * 1. uniform split: @xas is split with no expectation of failure and a single
+ * __split_folio_to_order() is called to split the @folio into @new_order
+ * along with stats update.
+ * 2. non-uniform split: folio_order - @new_order calls to
+ * __split_folio_to_order() are expected to be made in a for loop to split
+ * the @folio to one lower order at a time. The folio containing @split_at
+ * is split in each iteration. @xas is split into half in each iteration and
+ * can fail. A failed @xas split leaves split folios as is without merging
+ * them back.
*
* After splitting, the caller's folio reference will be transferred to the
- * folio containing @page. The caller needs to unlock and/or free after-split
- * folios if necessary.
+ * folio containing @split_at. The caller needs to unlock and/or free
+ * after-split folios if necessary.
*
- * For !uniform_split, when -ENOMEM is returned, the original folio might be
- * split. The caller needs to check the input folio.
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
*/
static int __split_unmapped_folio(struct folio *folio, int new_order,
struct page *split_at, struct xa_state *xas,
- struct address_space *mapping, bool uniform_split)
+ struct address_space *mapping, enum split_type split_type)
{
- int order = folio_order(folio);
- int start_order = uniform_split ? new_order : order - 1;
- bool stop_split = false;
- struct folio *next;
+ const bool is_anon = folio_test_anon(folio);
+ int old_order = folio_order(folio);
+ int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1;
int split_order;
- int ret = 0;
-
- if (folio_test_anon(folio))
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
/*
* split to new_order one order at a time. For uniform split,
* folio is split to new_order directly.
*/
for (split_order = start_order;
- split_order >= new_order && !stop_split;
+ split_order >= new_order;
split_order--) {
- struct folio *end_folio = folio_next(folio);
- int old_order = folio_order(folio);
- struct folio *new_folio;
+ int nr_new_folios = 1UL << (old_order - split_order);
/* order-1 anonymous folio is not supported */
- if (folio_test_anon(folio) && split_order == 1)
- continue;
- if (uniform_split && split_order != new_order)
+ if (is_anon && split_order == 1)
continue;
if (mapping) {
@@ -3474,58 +3666,39 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
* irq is disabled to allocate enough memory, whereas
* non-uniform split can handle ENOMEM.
*/
- if (uniform_split)
+ if (split_type == SPLIT_TYPE_UNIFORM)
xas_split(xas, folio, old_order);
else {
xas_set_order(xas, folio->index, split_order);
xas_try_split(xas, folio, old_order);
- if (xas_error(xas)) {
- ret = xas_error(xas);
- stop_split = true;
- }
+ if (xas_error(xas))
+ return xas_error(xas);
}
}
- if (!stop_split) {
- folio_split_memcg_refs(folio, old_order, split_order);
- split_page_owner(&folio->page, old_order, split_order);
- pgalloc_tag_split(folio, old_order, split_order);
+ folio_split_memcg_refs(folio, old_order, split_order);
+ split_page_owner(&folio->page, old_order, split_order);
+ pgalloc_tag_split(folio, old_order, split_order);
+ __split_folio_to_order(folio, old_order, split_order);
- __split_folio_to_order(folio, old_order, split_order);
+ if (is_anon) {
+ mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1);
+ mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios);
}
-
/*
- * Iterate through after-split folios and update folio stats.
- * But in buddy allocator like split, the folio
- * containing the specified page is skipped until its order
- * is new_order, since the folio will be worked on in next
- * iteration.
+ * If uniform split, the process is complete.
+ * If non-uniform, continue splitting the folio at @split_at
+ * as long as the next @split_order is >= @new_order.
*/
- for (new_folio = folio; new_folio != end_folio; new_folio = next) {
- next = folio_next(new_folio);
- /*
- * for buddy allocator like split, new_folio containing
- * @split_at page could be split again, thus do not
- * change stats yet. Wait until new_folio's order is
- * @new_order or stop_split is set to true by the above
- * xas_split() failure.
- */
- if (new_folio == page_folio(split_at)) {
- folio = new_folio;
- if (split_order != new_order && !stop_split)
- continue;
- }
- if (folio_test_anon(new_folio))
- mod_mthp_stat(folio_order(new_folio),
- MTHP_STAT_NR_ANON, 1);
- }
+ folio = page_folio(split_at);
+ old_order = split_order;
}
- return ret;
+ return 0;
}
-bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
- bool warns)
+bool folio_split_supported(struct folio *folio, unsigned int new_order,
+ enum split_type split_type, bool warns)
{
if (folio_test_anon(folio)) {
/* order-1 is not supported for anonymous THP. */
@@ -3533,21 +3706,41 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
"Cannot split to order-1 folio");
if (new_order == 1)
return false;
- } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- !mapping_large_folio_support(folio->mapping)) {
- /*
- * No split if the file system does not support large folio.
- * Note that we might still have THPs in such mappings due to
- * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
- * does not actually support large folios properly.
- */
- VM_WARN_ONCE(warns,
- "Cannot split file folio to non-0 order");
- return false;
+ } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) {
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ /*
+ * We can always split a folio down to a single page
+ * (new_order == 0) uniformly.
+ *
+ * For any other scenario
+ * a) uniform split targeting a large folio
+ * (new_order > 0)
+ * b) any non-uniform split
+ * we must confirm that the file system supports large
+ * folios.
+ *
+ * Note that we might still have THPs in such
+ * mappings, which is created from khugepaged when
+ * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that
+ * case, the mapping does not actually support large
+ * folios properly.
+ */
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
}
- /* Only swapping a whole PMD-mapped folio is supported */
- if (folio_test_swapcache(folio)) {
+ /*
+	 * A swapcache folio can only be split to order 0.
+ *
+ * non-uniform split creates after-split folios with orders from
+ * folio_order(folio) - 1 to new_order, making it not suitable for any
+ * swapcache folio split. Only uniform split to order-0 can be used
+ * here.
+ */
+ if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) {
VM_WARN_ONCE(warns,
"Cannot split swapcache folio to non-0 order");
return false;
@@ -3556,41 +3749,160 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
return true;
}
-/* See comments in non_uniform_split_supported() */
-bool uniform_split_supported(struct folio *folio, unsigned int new_order,
- bool warns)
+static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct xa_state *xas,
+ struct address_space *mapping, bool do_lru,
+ struct list_head *list, enum split_type split_type,
+ pgoff_t end, int *nr_shmem_dropped, int extra_pins)
{
- if (folio_test_anon(folio)) {
- VM_WARN_ONCE(warns && new_order == 1,
- "Cannot split to order-1 folio");
- if (new_order == 1)
- return false;
- } else if (new_order) {
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- !mapping_large_folio_support(folio->mapping)) {
- VM_WARN_ONCE(warns,
- "Cannot split file folio to non-0 order");
- return false;
+ struct folio *end_folio = folio_next(folio);
+ struct folio *new_folio, *next;
+ int old_order = folio_order(folio);
+ int ret = 0;
+ struct deferred_split *ds_queue;
+
+ VM_WARN_ON_ONCE(!mapping && end);
+ /* Prevent deferred_split_scan() touching ->_refcount */
+ ds_queue = folio_split_queue_lock(folio);
+ if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ struct swap_cluster_info *ci = NULL;
+ struct lruvec *lruvec;
+ int expected_refs;
+
+ if (old_order > 1) {
+ if (!list_empty(&folio->_deferred_list)) {
+ ds_queue->split_queue_len--;
+ /*
+ * Reinitialize page_deferred_list after removing the
+ * page from the split_queue, otherwise a subsequent
+ * split will see list corruption when checking the
+ * page_deferred_list.
+ */
+ list_del_init(&folio->_deferred_list);
+ }
+ if (folio_test_partially_mapped(folio)) {
+ folio_clear_partially_mapped(folio);
+ mod_mthp_stat(old_order,
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+ }
}
- }
+ split_queue_unlock(ds_queue);
+ if (mapping) {
+ int nr = folio_nr_pages(folio);
- if (new_order && folio_test_swapcache(folio)) {
- VM_WARN_ONCE(warns,
- "Cannot split swapcache folio to non-0 order");
- return false;
+ if (folio_test_pmd_mappable(folio) &&
+ new_order < HPAGE_PMD_ORDER) {
+ if (folio_test_swapbacked(folio)) {
+ lruvec_stat_mod_folio(folio,
+ NR_SHMEM_THPS, -nr);
+ } else {
+ lruvec_stat_mod_folio(folio,
+ NR_FILE_THPS, -nr);
+ filemap_nr_thps_dec(mapping);
+ }
+ }
+ }
+
+ if (folio_test_swapcache(folio)) {
+ if (mapping) {
+ VM_WARN_ON_ONCE_FOLIO(mapping, folio);
+ return -EINVAL;
+ }
+
+ ci = swap_cluster_get_and_lock(folio);
+ }
+
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ if (do_lru)
+ lruvec = folio_lruvec_lock(folio);
+
+ ret = __split_unmapped_folio(folio, new_order, split_at, xas,
+ mapping, split_type);
+
+ /*
+ * Unfreeze after-split folios and put them back to the right
+		 * list. @folio should be kept frozen until page cache
+ * entries are updated with all the other after-split folios
+ * to prevent others seeing stale page cache entries.
+ * As a result, new_folio starts from the next folio of
+ * @folio.
+ */
+ for (new_folio = folio_next(folio); new_folio != end_folio;
+ new_folio = next) {
+ unsigned long nr_pages = folio_nr_pages(new_folio);
+
+ next = folio_next(new_folio);
+
+ zone_device_private_split_cb(folio, new_folio);
+
+ expected_refs = folio_expected_ref_count(new_folio) + 1;
+ folio_ref_unfreeze(new_folio, expected_refs);
+
+ if (do_lru)
+ lru_add_split_folio(folio, new_folio, lruvec, list);
+
+ /*
+ * Anonymous folio with swap cache.
+ * NOTE: shmem in swap cache is not supported yet.
+ */
+ if (ci) {
+ __swap_cache_replace_folio(ci, folio, new_folio);
+ continue;
+ }
+
+ /* Anonymous folio without swap cache */
+ if (!mapping)
+ continue;
+
+ /* Add the new folio to the page cache. */
+ if (new_folio->index < end) {
+ __xa_store(&mapping->i_pages, new_folio->index,
+ new_folio, 0);
+ continue;
+ }
+
+ VM_WARN_ON_ONCE(!nr_shmem_dropped);
+ /* Drop folio beyond EOF: ->index >= end */
+ if (shmem_mapping(mapping) && nr_shmem_dropped)
+ *nr_shmem_dropped += nr_pages;
+ else if (folio_test_clear_dirty(new_folio))
+ folio_account_cleaned(
+ new_folio, inode_to_wb(mapping->host));
+ __filemap_remove_folio(new_folio, NULL);
+ folio_put_refs(new_folio, nr_pages);
+ }
+
+ zone_device_private_split_cb(folio, NULL);
+ /*
+ * Unfreeze @folio only after all page cache entries, which
+ * used to point to it, have been updated with new folios.
+ * Otherwise, a parallel folio_try_get() can grab @folio
+ * and its caller can see stale page cache entries.
+ */
+ expected_refs = folio_expected_ref_count(folio) + 1;
+ folio_ref_unfreeze(folio, expected_refs);
+
+ if (do_lru)
+ unlock_page_lruvec(lruvec);
+
+ if (ci)
+ swap_cluster_unlock(ci);
+ } else {
+ split_queue_unlock(ds_queue);
+ return -EAGAIN;
}
- return true;
+ return ret;
}
-/*
- * __folio_split: split a folio at @split_at to a @new_order folio
+/**
+ * __folio_split() - split a folio at @split_at to a @new_order folio
* @folio: folio to split
* @new_order: the order of the new folio
* @split_at: a page within the new folio
* @lock_at: a page within @folio to be left locked to caller
* @list: after-split folios will be put on it if non NULL
- * @uniform_split: perform uniform split or not (non-uniform split)
+ * @split_type: perform uniform split or not (non-uniform split)
*
* It calls __split_unmapped_folio() to perform uniform and non-uniform split.
* It is in charge of checking whether the split is supported or not and
@@ -3601,25 +3913,24 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
* 1. for uniform split, @lock_at points to one of @folio's subpages;
* 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
*
- * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
* split but not to @new_order, the caller needs to check)
*/
static int __folio_split(struct folio *folio, unsigned int new_order,
struct page *split_at, struct page *lock_at,
- struct list_head *list, bool uniform_split)
+ struct list_head *list, enum split_type split_type)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(folio);
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
struct folio *end_folio = folio_next(folio);
bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
struct anon_vma *anon_vma = NULL;
- int order = folio_order(folio);
+ int old_order = folio_order(folio);
struct folio *new_folio, *next;
int nr_shmem_dropped = 0;
int remap_flags = 0;
int extra_pins, ret;
- pgoff_t end;
+ pgoff_t end = 0;
bool is_hzp;
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
@@ -3638,14 +3949,10 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
if (!is_anon && !folio->mapping)
return -EBUSY;
- if (new_order >= folio_order(folio))
+ if (new_order >= old_order)
return -EINVAL;
- if (uniform_split && !uniform_split_supported(folio, new_order, true))
- return -EINVAL;
-
- if (!uniform_split &&
- !non_uniform_split_supported(folio, new_order, true))
+ if (!folio_split_supported(folio, new_order, split_type, /* warn = */ true))
return -EINVAL;
is_hzp = is_huge_zero_folio(folio);
@@ -3671,8 +3978,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
ret = -EBUSY;
goto out;
}
- mapping = NULL;
anon_vma_lock_write(anon_vma);
+ mapping = NULL;
} else {
unsigned int min_order;
gfp_t gfp;
@@ -3692,9 +3999,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
goto out;
}
- if (uniform_split) {
+ if (split_type == SPLIT_TYPE_UNIFORM) {
xas_set_order(&xas, folio->index, new_order);
- xas_split_alloc(&xas, folio, folio_order(folio), gfp);
+ xas_split_alloc(&xas, folio, old_order, gfp);
if (xas_error(&xas)) {
ret = xas_error(&xas);
goto out;
@@ -3742,127 +4049,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
}
}
- /* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock(&ds_queue->split_queue_lock);
- if (folio_ref_freeze(folio, 1 + extra_pins)) {
- struct swap_cluster_info *ci = NULL;
- struct lruvec *lruvec;
- int expected_refs;
-
- if (folio_order(folio) > 1 &&
- !list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
- if (folio_test_partially_mapped(folio)) {
- folio_clear_partially_mapped(folio);
- mod_mthp_stat(folio_order(folio),
- MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
- }
- /*
- * Reinitialize page_deferred_list after removing the
- * page from the split_queue, otherwise a subsequent
- * split will see list corruption when checking the
- * page_deferred_list.
- */
- list_del_init(&folio->_deferred_list);
- }
- spin_unlock(&ds_queue->split_queue_lock);
- if (mapping) {
- int nr = folio_nr_pages(folio);
-
- if (folio_test_pmd_mappable(folio) &&
- new_order < HPAGE_PMD_ORDER) {
- if (folio_test_swapbacked(folio)) {
- __lruvec_stat_mod_folio(folio,
- NR_SHMEM_THPS, -nr);
- } else {
- __lruvec_stat_mod_folio(folio,
- NR_FILE_THPS, -nr);
- filemap_nr_thps_dec(mapping);
- }
- }
- }
-
- if (folio_test_swapcache(folio)) {
- if (mapping) {
- VM_WARN_ON_ONCE_FOLIO(mapping, folio);
- ret = -EINVAL;
- goto fail;
- }
-
- ci = swap_cluster_get_and_lock(folio);
- }
-
- /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
- lruvec = folio_lruvec_lock(folio);
-
- ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
- mapping, uniform_split);
-
- /*
- * Unfreeze after-split folios and put them back to the right
- * list. @folio should be kept frozon until page cache
- * entries are updated with all the other after-split folios
- * to prevent others seeing stale page cache entries.
- * As a result, new_folio starts from the next folio of
- * @folio.
- */
- for (new_folio = folio_next(folio); new_folio != end_folio;
- new_folio = next) {
- unsigned long nr_pages = folio_nr_pages(new_folio);
-
- next = folio_next(new_folio);
-
- expected_refs = folio_expected_ref_count(new_folio) + 1;
- folio_ref_unfreeze(new_folio, expected_refs);
-
- lru_add_split_folio(folio, new_folio, lruvec, list);
-
- /*
- * Anonymous folio with swap cache.
- * NOTE: shmem in swap cache is not supported yet.
- */
- if (ci) {
- __swap_cache_replace_folio(ci, folio, new_folio);
- continue;
- }
-
- /* Anonymous folio without swap cache */
- if (!mapping)
- continue;
-
- /* Add the new folio to the page cache. */
- if (new_folio->index < end) {
- __xa_store(&mapping->i_pages, new_folio->index,
- new_folio, 0);
- continue;
- }
-
- /* Drop folio beyond EOF: ->index >= end */
- if (shmem_mapping(mapping))
- nr_shmem_dropped += nr_pages;
- else if (folio_test_clear_dirty(new_folio))
- folio_account_cleaned(
- new_folio, inode_to_wb(mapping->host));
- __filemap_remove_folio(new_folio, NULL);
- folio_put_refs(new_folio, nr_pages);
- }
- /*
- * Unfreeze @folio only after all page cache entries, which
- * used to point to it, have been updated with new folios.
- * Otherwise, a parallel folio_try_get() can grab @folio
- * and its caller can see stale page cache entries.
- */
- expected_refs = folio_expected_ref_count(folio) + 1;
- folio_ref_unfreeze(folio, expected_refs);
-
- unlock_page_lruvec(lruvec);
-
- if (ci)
- swap_cluster_unlock(ci);
- } else {
- spin_unlock(&ds_queue->split_queue_lock);
- ret = -EAGAIN;
- }
+ ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
+ true, list, split_type, end, &nr_shmem_dropped,
+ extra_pins);
fail:
if (mapping)
xas_unlock(&xas);
@@ -3872,9 +4061,10 @@ fail:
if (nr_shmem_dropped)
shmem_uncharge(mapping->host, nr_shmem_dropped);
- if (!ret && is_anon)
+ if (!ret && is_anon && !folio_is_device_private(folio))
remap_flags = RMP_USE_SHARED_ZEROPAGE;
- remap_page(folio, 1 << order, remap_flags);
+
+ remap_page(folio, 1 << old_order, remap_flags);
/*
* Unlock all after-split folios except the one containing
@@ -3905,9 +4095,51 @@ out_unlock:
i_mmap_unlock_read(mapping);
out:
xas_destroy(&xas);
- if (order == HPAGE_PMD_ORDER)
+ if (old_order == HPAGE_PMD_ORDER)
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
- count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
+ count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
+ return ret;
+}
+
+/**
+ * folio_split_unmapped() - split a large anon folio that is already unmapped
+ * @folio: folio to split
+ * @new_order: the order of folios after split
+ *
+ * This function is a helper for splitting folios that have already been
+ * unmapped. The use case is that the device or the CPU can refuse to migrate
+ * THP pages in the middle of migration, due to allocation issues on either
+ * side.
+ *
+ * anon_vma_lock is not required to be held; mmap_read_lock() or
+ * mmap_write_lock() should be held. @folio is expected to be locked by the
+ * caller. Device-private and non-device-private folios are supported, along
+ * with folios that are in the swapcache. @folio should also be unmapped and
+ * isolated from the LRU (if applicable).
+ *
+ * Upon return, the folio is not remapped, split folios are not added to LRU,
+ * free_folio_and_swap_cache() is not called, and new folios remain locked.
+ *
+ * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to
+ * insufficient reference count or extra pins).
+ */
+int folio_split_unmapped(struct folio *folio, unsigned int new_order)
+{
+ int extra_pins, ret = 0;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
+
+ if (!can_split_folio(folio, 1, &extra_pins))
+ return -EAGAIN;
+
+ local_irq_disable();
+ ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
+ NULL, false, NULL, SPLIT_TYPE_UNIFORM,
+ 0, NULL, extra_pins);
+ local_irq_enable();
return ret;
}
@@ -3958,22 +4190,22 @@ out:
* Returns -EINVAL when trying to split to an order that is incompatible
* with the folio. Splitting to order 0 is compatible with all folios.
*/
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order)
{
struct folio *folio = page_folio(page);
- return __folio_split(folio, new_order, &folio->page, page, list, true);
+ return __folio_split(folio, new_order, &folio->page, page, list,
+ SPLIT_TYPE_UNIFORM);
}
-/*
- * folio_split: split a folio at @split_at to a @new_order folio
+/**
+ * folio_split() - split a folio at @split_at to a @new_order folio
* @folio: folio to split
* @new_order: the order of the new folio
* @split_at: a page within the new folio
- *
- * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
- * split but not to @new_order, the caller needs to check)
+ * @list: after-split folios are added to @list if not null, otherwise to LRU
+ * list
*
* It has the same prerequisites and returns as
* split_huge_page_to_list_to_order().
@@ -3987,12 +4219,15 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
* [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
*
* After split, folio is left locked for caller.
+ *
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
*/
int folio_split(struct folio *folio, unsigned int new_order,
struct page *split_at, struct list_head *list)
{
return __folio_split(folio, new_order, split_at, &folio->page, list,
- false);
+ SPLIT_TYPE_NON_UNIFORM);
}
int min_order_for_split(struct folio *folio)
@@ -4034,10 +4269,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
bool unqueued = false;
WARN_ON_ONCE(folio_ref_count(folio));
- WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
- ds_queue = get_deferred_split_queue(folio);
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
if (folio_test_partially_mapped(folio)) {
@@ -4048,7 +4282,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
list_del_init(&folio->_deferred_list);
unqueued = true;
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ split_queue_unlock_irqrestore(ds_queue, flags);
return unqueued; /* useful for debug warnings */
}
@@ -4056,10 +4290,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(folio);
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = folio_memcg(folio);
-#endif
+ struct deferred_split *ds_queue;
unsigned long flags;
/*
@@ -4082,7 +4313,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
if (folio_test_swapcache(folio))
return;
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
if (partially_mapped) {
if (!folio_test_partially_mapped(folio)) {
folio_set_partially_mapped(folio);
@@ -4097,15 +4328,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
}
if (list_empty(&folio->_deferred_list)) {
+ struct mem_cgroup *memcg;
+
+ memcg = folio_split_queue_memcg(folio, ds_queue);
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++;
-#ifdef CONFIG_MEMCG
if (memcg)
set_shrinker_bit(memcg, folio_nid(folio),
- deferred_split_shrinker->id);
-#endif
+ shrinker_id(deferred_split_shrinker));
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ split_queue_unlock_irqrestore(ds_queue, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
@@ -4151,43 +4383,42 @@ static bool thp_underused(struct folio *folio)
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct pglist_data *pgdata = NODE_DATA(sc->nid);
- struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+ struct deferred_split *ds_queue;
unsigned long flags;
- LIST_HEAD(list);
- struct folio *folio, *next, *prev = NULL;
- int split = 0, removed = 0;
+ struct folio *folio, *next;
+ int split = 0, i;
+ struct folio_batch fbatch;
-#ifdef CONFIG_MEMCG
- if (sc->memcg)
- ds_queue = &sc->memcg->deferred_split_queue;
-#endif
+ folio_batch_init(&fbatch);
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+retry:
+ ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
/* Take pin on all head pages to avoid freeing them under us */
list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
_deferred_list) {
if (folio_try_get(folio)) {
- list_move(&folio->_deferred_list, &list);
- } else {
+ folio_batch_add(&fbatch, folio);
+ } else if (folio_test_partially_mapped(folio)) {
/* We lost race with folio_put() */
- if (folio_test_partially_mapped(folio)) {
- folio_clear_partially_mapped(folio);
- mod_mthp_stat(folio_order(folio),
- MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
- }
- list_del_init(&folio->_deferred_list);
- ds_queue->split_queue_len--;
+ folio_clear_partially_mapped(folio);
+ mod_mthp_stat(folio_order(folio),
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
+ list_del_init(&folio->_deferred_list);
+ ds_queue->split_queue_len--;
if (!--sc->nr_to_scan)
break;
+ if (!folio_batch_space(&fbatch))
+ break;
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ split_queue_unlock_irqrestore(ds_queue, flags);
- list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
bool did_split = false;
bool underused = false;
+ struct deferred_split *fqueue;
+ folio = fbatch.folios[i];
if (!folio_test_partially_mapped(folio)) {
/*
* See try_to_map_unused_to_zeropage(): we cannot
@@ -4210,38 +4441,27 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
}
folio_unlock(folio);
next:
+ if (did_split || !folio_test_partially_mapped(folio))
+ continue;
/*
- * split_folio() removes folio from list on success.
* Only add back to the queue if folio is partially mapped.
* If thp_underused returns false, or if split_folio fails
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
- if (did_split) {
- ; /* folio already removed from list */
- } else if (!folio_test_partially_mapped(folio)) {
- list_del_init(&folio->_deferred_list);
- removed++;
- } else {
- /*
- * That unlocked list_del_init() above would be unsafe,
- * unless its folio is separated from any earlier folios
- * left on the list (which may be concurrently unqueued)
- * by one safe folio with refcount still raised.
- */
- swap(folio, prev);
+ fqueue = folio_split_queue_lock_irqsave(folio, &flags);
+ if (list_empty(&folio->_deferred_list)) {
+ list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
+ fqueue->split_queue_len++;
}
- if (folio)
- folio_put(folio);
+ split_queue_unlock_irqrestore(fqueue, flags);
}
+ folios_put(&fbatch);
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- list_splice_tail(&list, &ds_queue->split_queue);
- ds_queue->split_queue_len -= removed;
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
-
- if (prev)
- folio_put(prev);
+ if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
+ cond_resched();
+ goto retry;
+ }
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
@@ -4252,6 +4472,33 @@ next:
return split;
}
+#ifdef CONFIG_MEMCG
+void reparent_deferred_split_queue(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct deferred_split *ds_queue = &memcg->deferred_split_queue;
+ struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
+ int nid;
+
+ spin_lock_irq(&ds_queue->split_queue_lock);
+ spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
+
+ if (!ds_queue->split_queue_len)
+ goto unlock;
+
+ list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
+ parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
+ ds_queue->split_queue_len = 0;
+
+ for_each_node(nid)
+ set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
+
+unlock:
+ spin_unlock(&parent_ds_queue->split_queue_lock);
+ spin_unlock_irq(&ds_queue->split_queue_lock);
+}
+#endif
+
#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
@@ -4613,7 +4860,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
return 0;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
- pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ if (unlikely(!pmd_present(*pvmw->pmd)))
+ pmdval = pmdp_huge_get_and_clear(vma->vm_mm, address, pvmw->pmd);
+ else
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
@@ -4655,30 +4905,48 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
unsigned long address = pvmw->address;
unsigned long haddr = address & HPAGE_PMD_MASK;
pmd_t pmde;
- swp_entry_t entry;
+ softleaf_t entry;
if (!(pvmw->pmd && !pvmw->pte))
return;
- entry = pmd_to_swp_entry(*pvmw->pmd);
+ entry = softleaf_from_pmd(*pvmw->pmd);
folio_get(folio);
pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
+
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
- if (is_writable_migration_entry(entry))
+ if (softleaf_is_migration_write(entry))
pmde = pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_mkuffd_wp(pmde);
- if (!is_migration_entry_young(entry))
+ if (!softleaf_is_migration_young(entry))
pmde = pmd_mkold(pmde);
/* NOTE: this may contain setting soft-dirty on some archs */
- if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+ if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
pmde = pmd_mkdirty(pmde);
+ if (folio_is_device_private(folio)) {
+ swp_entry_t entry;
+
+ if (pmd_write(pmde))
+ entry = make_writable_device_private_entry(
+ page_to_pfn(new));
+ else
+ entry = make_readable_device_private_entry(
+ page_to_pfn(new));
+ pmde = swp_entry_to_pmd(entry);
+
+ if (pmd_swp_soft_dirty(*pvmw->pmd))
+ pmde = pmd_swp_mksoft_dirty(pmde);
+ if (pmd_swp_uffd_wp(*pvmw->pmd))
+ pmde = pmd_swp_mkuffd_wp(pmde);
+ }
+
if (folio_test_anon(folio)) {
rmap_t rmap_flags = RMAP_NONE;
- if (!is_readable_migration_entry(entry))
+ if (!softleaf_is_migration_read(entry))
rmap_flags |= RMAP_EXCLUSIVE;
folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);