summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/mm/transhuge.rst8
-rw-r--r--include/linux/mm_types.h49
-rw-r--r--include/linux/page-flags.h4
-rw-r--r--include/linux/rmap.h165
-rw-r--r--kernel/fork.c36
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/internal.h5
-rw-r--r--mm/page_alloc.c10
8 files changed, 281 insertions, 0 deletions
diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst
index a2cd8800d527..baa17d718a76 100644
--- a/Documentation/mm/transhuge.rst
+++ b/Documentation/mm/transhuge.rst
@@ -120,11 +120,19 @@ pages:
and also increment/decrement folio->_nr_pages_mapped by ENTIRELY_MAPPED
when _entire_mapcount goes from -1 to 0 or 0 to -1.
+ We also maintain the two slots for tracking MM owners (MM ID and
+ corresponding mapcount), and the current status ("maybe mapped shared" vs.
+ "mapped exclusively").
+
- map/unmap of individual pages with PTE entry increment/decrement
page->_mapcount, increment/decrement folio->_large_mapcount and also
increment/decrement folio->_nr_pages_mapped when page->_mapcount goes
from -1 to 0 or 0 to -1 as this counts the number of pages mapped by PTE.
+ We also maintain the two slots for tracking MM owners (MM ID and
+ corresponding mapcount), and the current status ("maybe mapped shared" vs.
+ "mapped exclusively").
+
split_huge_page internally has to distribute the refcounts in the head
page to the tail pages before clearing all PG_head/tail bits from the page
structures. It can be done easily for refcounts taken by page table
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9499eb8e8e66..aac7c87b04e1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -292,6 +292,44 @@ typedef struct {
#define NR_PAGES_IN_LARGE_FOLIO
#endif
+/*
+ * On 32bit, we can cut the required metadata in half, because:
+ * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have,
+ * so we can limit MM IDs to 15 bit (32767).
+ * (b) We don't expect folios where even a single complete PTE mapping by
+ * one MM would exceed 15 bits (order-15).
+ */
+#ifdef CONFIG_64BIT
+typedef int mm_id_mapcount_t;
+#define MM_ID_MAPCOUNT_MAX INT_MAX
+typedef unsigned int mm_id_t;
+#else /* !CONFIG_64BIT */
+typedef short mm_id_mapcount_t;
+#define MM_ID_MAPCOUNT_MAX SHRT_MAX
+typedef unsigned short mm_id_t;
+#endif /* CONFIG_64BIT */
+
+/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */
+#define MM_ID_DUMMY 0
+#define MM_ID_MIN (MM_ID_DUMMY + 1)
+
+/*
+ * We leave the highest bit of each MM id unused, so we can store a flag
+ * in the highest bit of each folio->_mm_id[].
+ */
+#define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1)
+#define MM_ID_MASK ((1U << MM_ID_BITS) - 1)
+#define MM_ID_MAX MM_ID_MASK
+
+/*
+ * In order to use bit_spin_lock(), which requires an unsigned long, we
+ * operate on folio->_mm_ids when working on flags.
+ */
+#define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS
+#define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM)
+#define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1)
+#define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM)
+
/**
* struct folio - Represents a contiguous set of bytes.
* @flags: Identical to the page flags.
@@ -318,6 +356,9 @@ typedef struct {
* @_nr_pages_mapped: Do not use outside of rmap and debug code.
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
* @_nr_pages: Do not use directly, call folio_nr_pages().
+ * @_mm_id: Do not use outside of rmap code.
+ * @_mm_ids: Do not use outside of rmap code.
+ * @_mm_id_mapcount: Do not use outside of rmap code.
* @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
* @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h.
* @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
@@ -390,6 +431,11 @@ struct folio {
atomic_t _entire_mapcount;
atomic_t _pincount;
#endif /* CONFIG_64BIT */
+ mm_id_mapcount_t _mm_id_mapcount[2];
+ union {
+ mm_id_t _mm_id[2];
+ unsigned long _mm_ids;
+ };
/* private: the union with struct page is transitional */
};
unsigned long _usable_1[4];
@@ -1114,6 +1160,9 @@ struct mm_struct {
#endif
} lru_gen;
#endif /* CONFIG_LRU_GEN_WALKS_MMU */
+#ifdef CONFIG_MM_ID
+ mm_id_t mm_id;
+#endif /* CONFIG_MM_ID */
} __randomize_layout;
/*
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index cab382bd965e..f26ce54c7aa4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1185,6 +1185,10 @@ static inline int folio_has_private(const struct folio *folio)
return !!(folio->flags & PAGE_FLAGS_PRIVATE);
}
+static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio)
+{
+ return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
+}
#undef PF_ANY
#undef PF_HEAD
#undef PF_NO_TAIL
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d1e888cc97a5..c131b0efff0f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -13,6 +13,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
+#include <linux/bit_spinlock.h>
/*
* The anon_vma heads a list of private "related" vmas, to scan if
@@ -173,6 +174,169 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
struct anon_vma *folio_get_anon_vma(const struct folio *folio);
+#ifdef CONFIG_MM_ID
+static __always_inline void folio_lock_large_mapcount(struct folio *folio)
+{
+ bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
+}
+
+static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
+{
+ __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
+}
+
+static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
+{
+ VM_WARN_ON_ONCE(idx != 0 && idx != 1);
+ return folio->_mm_id[idx] & MM_ID_MASK;
+}
+
+static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
+{
+ VM_WARN_ON_ONCE(idx != 0 && idx != 1);
+ folio->_mm_id[idx] &= ~MM_ID_MASK;
+ folio->_mm_id[idx] |= id;
+}
+
+static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
+ int diff, mm_id_t mm_id)
+{
+ VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
+ VM_WARN_ON_ONCE(diff <= 0);
+ VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);
+
+ /*
+ * Make sure we can detect at least one complete PTE mapping of the
+ * folio in a single MM as "exclusively mapped". This is primarily
+ * a check on 32bit, where we currently reduce the size of the per-MM
+ * mapcount to a short.
+ */
+ VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
+ VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);
+
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[0] != -1);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[0] < 0);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[1] != -1);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[1] < 0);
+ VM_WARN_ON_ONCE(!folio_mapped(folio) &&
+ folio_test_large_maybe_mapped_shared(folio));
+}
+
+static __always_inline void folio_set_large_mapcount(struct folio *folio,
+ int mapcount, struct vm_area_struct *vma)
+{
+ __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);
+
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);
+
+ /* Note: mapcounts start at -1. */
+ atomic_set(&folio->_large_mapcount, mapcount - 1);
+ folio->_mm_id_mapcount[0] = mapcount - 1;
+ folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
+}
+
+static __always_inline void folio_add_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ const mm_id_t mm_id = vma->vm_mm->mm_id;
+ int new_mapcount_val;
+
+ folio_lock_large_mapcount(folio);
+ __folio_large_mapcount_sanity_checks(folio, diff, mm_id);
+
+ new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
+ atomic_set(&folio->_large_mapcount, new_mapcount_val);
+
+ /*
+ * If a folio is mapped more than once into an MM on 32bit, we
+ * can in theory overflow the per-MM mapcount (although only for
+ * fairly large folios), turning it negative. In that case, just
+ * free up the slot and mark the folio "mapped shared", otherwise
+ * we might be in trouble when unmapping pages later.
+ */
+ if (folio_mm_id(folio, 0) == mm_id) {
+ folio->_mm_id_mapcount[0] += diff;
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
+ folio->_mm_id_mapcount[0] = -1;
+ folio_set_mm_id(folio, 0, MM_ID_DUMMY);
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ } else if (folio_mm_id(folio, 1) == mm_id) {
+ folio->_mm_id_mapcount[1] += diff;
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
+ folio->_mm_id_mapcount[1] = -1;
+ folio_set_mm_id(folio, 1, MM_ID_DUMMY);
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
+ folio_set_mm_id(folio, 0, mm_id);
+ folio->_mm_id_mapcount[0] = diff - 1;
+ /* We might have other mappings already. */
+ if (new_mapcount_val != diff - 1)
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
+ folio_set_mm_id(folio, 1, mm_id);
+ folio->_mm_id_mapcount[1] = diff - 1;
+ /* Slot 0 certainly has mappings as well. */
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ folio_unlock_large_mapcount(folio);
+}
+
+static __always_inline void folio_sub_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ const mm_id_t mm_id = vma->vm_mm->mm_id;
+ int new_mapcount_val;
+
+ folio_lock_large_mapcount(folio);
+ __folio_large_mapcount_sanity_checks(folio, diff, mm_id);
+
+ new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
+ atomic_set(&folio->_large_mapcount, new_mapcount_val);
+
+ /*
+ * There are valid corner cases where we might underflow a per-MM
+ * mapcount (some mappings added when no slot was free, some mappings
+ * added once a slot was free), so we always set it to -1 once we go
+ * negative.
+ */
+ if (folio_mm_id(folio, 0) == mm_id) {
+ folio->_mm_id_mapcount[0] -= diff;
+ if (folio->_mm_id_mapcount[0] >= 0)
+ goto out;
+ folio->_mm_id_mapcount[0] = -1;
+ folio_set_mm_id(folio, 0, MM_ID_DUMMY);
+ } else if (folio_mm_id(folio, 1) == mm_id) {
+ folio->_mm_id_mapcount[1] -= diff;
+ if (folio->_mm_id_mapcount[1] >= 0)
+ goto out;
+ folio->_mm_id_mapcount[1] = -1;
+ folio_set_mm_id(folio, 1, MM_ID_DUMMY);
+ }
+
+ /*
+ * If one MM slot owns all mappings, the folio is mapped exclusively.
+ * Note that if the folio is now unmapped (new_mapcount_val == -1), both
+ * slots must be free (mapcount == -1), and we'll also mark it as
+ * exclusive.
+ */
+ if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
+ folio->_mm_id_mapcount[1] == new_mapcount_val)
+ folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
+out:
+ folio_unlock_large_mapcount(folio);
+}
+#else /* !CONFIG_MM_ID */
+/*
+ * See __folio_rmap_sanity_checks(), we might map large folios even without
+ * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
+ */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
struct vm_area_struct *vma)
{
@@ -191,6 +355,7 @@ static inline void folio_sub_large_mapcount(struct folio *folio,
{
atomic_sub(diff, &folio->_large_mapcount);
}
+#endif /* CONFIG_MM_ID */
#define folio_inc_large_mapcount(folio, vma) \
folio_add_large_mapcount(folio, 1, vma)
diff --git a/kernel/fork.c b/kernel/fork.c
index 364b2d4fd3ef..f9cf0f056eb6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -802,6 +802,36 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
+#ifdef CONFIG_MM_ID
+static DEFINE_IDA(mm_ida);
+
+static inline int mm_alloc_id(struct mm_struct *mm)
+{
+ int ret;
+
+ ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+ mm->mm_id = ret;
+ return 0;
+}
+
+static inline void mm_free_id(struct mm_struct *mm)
+{
+ const mm_id_t id = mm->mm_id;
+
+ mm->mm_id = MM_ID_DUMMY;
+ if (id == MM_ID_DUMMY)
+ return;
+ if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
+ return;
+ ida_free(&mm_ida, id);
+}
+#else /* !CONFIG_MM_ID */
+static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
+static inline void mm_free_id(struct mm_struct *mm) {}
+#endif /* CONFIG_MM_ID */
+
static void check_mm(struct mm_struct *mm)
{
int i;
@@ -905,6 +935,7 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
+ mm_free_id(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
@@ -1289,6 +1320,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (mm_alloc_pgd(mm))
goto fail_nopgd;
+ if (mm_alloc_id(mm))
+ goto fail_noid;
+
if (init_new_context(p, mm))
goto fail_nocontext;
@@ -1308,6 +1342,8 @@ fail_pcpu:
fail_cid:
destroy_context(mm);
fail_nocontext:
+ mm_free_id(mm);
+fail_noid:
mm_free_pgd(mm);
fail_nopgd:
free_mm(mm);
diff --git a/mm/Kconfig b/mm/Kconfig
index 4c1640a197a0..a25c5476b3ad 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -815,11 +815,15 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config MM_ID
+ def_bool n
+
menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
+ select MM_ID
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
diff --git a/mm/internal.h b/mm/internal.h
index fcf0aeae3934..04724971379c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -763,6 +763,11 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
folio_set_order(folio, order);
atomic_set(&folio->_large_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ folio->_mm_ids = 0;
+ folio->_mm_id_mapcount[0] = -1;
+ folio->_mm_id_mapcount[1] = -1;
+ }
if (IS_ENABLED(CONFIG_64BIT) || order > 1) {
atomic_set(&folio->_pincount, 0);
atomic_set(&folio->_entire_mapcount, -1);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e456a43811fd..c8daa3e64266 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -955,6 +955,16 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
bad_page(page, "nonzero nr_pages_mapped");
goto out;
}
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
+ bad_page(page, "nonzero mm mapcount 0");
+ goto out;
+ }
+ if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
+ bad_page(page, "nonzero mm mapcount 1");
+ goto out;
+ }
+ }
if (IS_ENABLED(CONFIG_64BIT)) {
if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
bad_page(page, "nonzero entire_mapcount");