Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 40
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/cma.c | 11
-rw-r--r--  mm/cma.h | 6
-rw-r--r--  mm/compaction.c | 24
-rw-r--r--  mm/damon/Kconfig | 4
-rw-r--r--  mm/damon/core.c | 49
-rw-r--r--  mm/damon/paddr.c | 1
-rw-r--r--  mm/damon/sysfs-schemes.c | 47
-rw-r--r--  mm/damon/tests/core-kunit.h | 70
-rw-r--r--  mm/debug.c | 6
-rw-r--r--  mm/debug_page_alloc.c | 2
-rw-r--r--  mm/debug_vm_pgtable.c | 18
-rw-r--r--  mm/execmem.c | 2
-rw-r--r--  mm/filemap.c | 45
-rw-r--r--  mm/gup.c | 147
-rw-r--r--  mm/hmm.c | 262
-rw-r--r--  mm/huge_memory.c | 63
-rw-r--r--  mm/hugetlb.c | 139
-rw-r--r--  mm/hugetlb_cma.c | 11
-rw-r--r--  mm/internal.h | 9
-rw-r--r--  mm/io-mapping.c | 9
-rw-r--r--  mm/khugepaged.c | 69
-rw-r--r--  mm/kmemleak.c | 9
-rw-r--r--  mm/kmsan/core.c | 12
-rw-r--r--  mm/kmsan/hooks.c | 6
-rw-r--r--  mm/kmsan/init.c | 3
-rw-r--r--  mm/kmsan/instrumentation.c | 4
-rw-r--r--  mm/kmsan/kmsan.h | 1
-rw-r--r--  mm/kmsan/report.c | 6
-rw-r--r--  mm/kmsan/shadow.c | 7
-rw-r--r--  mm/maccess.c | 2
-rw-r--r--  mm/madvise.c | 101
-rw-r--r--  mm/memblock.c | 317
-rw-r--r--  mm/memcontrol-v1.c | 9
-rw-r--r--  mm/memcontrol.c | 771
-rw-r--r--  mm/memfd.c | 1
-rw-r--r--  mm/memory.c | 436
-rw-r--r--  mm/memory_hotplug.c | 8
-rw-r--r--  mm/mempolicy.c | 558
-rw-r--r--  mm/memremap.c | 8
-rw-r--r--  mm/migrate.c | 23
-rw-r--r--  mm/mincore.c | 22
-rw-r--r--  mm/mm_init.c | 58
-rw-r--r--  mm/mmap.c | 306
-rw-r--r--  mm/mmap_lock.c | 273
-rw-r--r--  mm/mmu_gather.c | 1
-rw-r--r--  mm/mmu_notifier.c | 2
-rw-r--r--  mm/mprotect.c | 2
-rw-r--r--  mm/mremap.c | 4
-rw-r--r--  mm/nommu.c | 28
-rw-r--r--  mm/numa.c | 4
-rw-r--r--  mm/numa_memblks.c | 22
-rw-r--r--  mm/page-writeback.c | 9
-rw-r--r--  mm/page_alloc.c | 62
-rw-r--r--  mm/page_owner.c | 2
-rw-r--r--  mm/ptdump.c | 62
-rw-r--r--  mm/rmap.c | 30
-rw-r--r--  mm/secretmem.c | 14
-rw-r--r--  mm/shmem.c | 23
-rw-r--r--  mm/show_mem.c | 18
-rw-r--r--  mm/slab_common.c | 2
-rw-r--r--  mm/swap.c | 8
-rw-r--r--  mm/swap.h | 18
-rw-r--r--  mm/swap_state.c | 8
-rw-r--r--  mm/swapfile.c | 184
-rw-r--r--  mm/truncate.c | 2
-rw-r--r--  mm/userfaultfd.c | 2
-rw-r--r--  mm/vma.c | 204
-rw-r--r--  mm/vma.h | 15
-rw-r--r--  mm/vma_exec.c | 161
-rw-r--r--  mm/vma_init.c | 151
-rw-r--r--  mm/vmalloc.c | 208
-rw-r--r--  mm/vmscan.c | 119
-rw-r--r--  mm/vmstat.c | 4
-rw-r--r--  mm/workingset.c | 4
-rw-r--r--  mm/zpdesc.h | 7
-rw-r--r--  mm/zpool.c | 8
-rw-r--r--  mm/zsmalloc.c | 32
-rw-r--r--  mm/zswap.c | 2
80 files changed, 3613 insertions, 1788 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e113f713b493..f8bb8f070d0d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -469,6 +469,10 @@ config HAVE_GUP_FAST
depends on MMU
bool
+# Enable memblock support for scratch memory which is needed for kexec handover
+config MEMBLOCK_KHO_SCRATCH
+ bool
+
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -882,7 +886,7 @@ config THP_SWAP
config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGEPAGE && SHMEM
+ depends on TRANSPARENT_HUGEPAGE
help
Allow khugepaged to put read-only file-backed pages in THP.
@@ -989,6 +993,40 @@ config CMA_AREAS
If unsure, leave the default value "8" in UMA and "20" in NUMA.
+#
+# Select this config option from the architecture Kconfig, if available, to set
+# the max page order for physically contiguous allocations.
+#
+config ARCH_FORCE_MAX_ORDER
+ int
+
+#
+# When ARCH_FORCE_MAX_ORDER is not defined,
+# the default page block order is MAX_PAGE_ORDER (10) as per
+# include/linux/mmzone.h.
+#
+config PAGE_BLOCK_ORDER
+ int "Page Block Order"
+ range 1 10 if ARCH_FORCE_MAX_ORDER = 0
+ default 10 if ARCH_FORCE_MAX_ORDER = 0
+ range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
+ help
+ The page block order refers to the power of two number of pages that
+ are physically contiguous and can have a migrate type associated to
+ them. The maximum size of the page block order is limited by
+ ARCH_FORCE_MAX_ORDER.
+
+ This config allows overriding the default page block order when the
+ page block order is required to be smaller than ARCH_FORCE_MAX_ORDER
+ or MAX_PAGE_ORDER.
+
+ Reducing pageblock order can negatively impact THP generation
+ success rate. If your workload uses THP heavily, please use this
+ option with caution.
+
+ Don't change if unsure.
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
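
The PAGE_BLOCK_ORDER help text above is easier to reason about with concrete numbers: a pageblock of order N covers 2^N contiguous base pages, so with 4 KiB pages the default order of 10 gives 4 MiB pageblocks, while order 5 would shrink them to 128 KiB (and with them the granularity at which migrate types are tracked). The snippet below is only an illustrative user-space restatement of that arithmetic; the pageblock terminology comes from the kernel, the function name is invented here.

/* Illustrative only: size in bytes of a pageblock of the given order. */
static unsigned long pageblock_bytes(unsigned int page_block_order,
				     unsigned long page_size)
{
	return (1UL << page_block_order) * page_size;
}

/*
 * pageblock_bytes(10, 4096) == 4 MiB   (the MAX_PAGE_ORDER default)
 * pageblock_bytes(5, 4096)  == 128 KiB (a reduced PAGE_BLOCK_ORDER)
 */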
diff --git a/mm/Makefile b/mm/Makefile
index e7f6bbf8ae5f..1a7a11d4933d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o vma.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
diff --git a/mm/cma.c b/mm/cma.c
index c04be488b099..397567883a10 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -143,13 +143,14 @@ bool cma_validate_zones(struct cma *cma)
static void __init cma_activate_area(struct cma *cma)
{
- unsigned long pfn, end_pfn;
+ unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES];
int allocrange, r;
struct cma_memrange *cmr;
unsigned long bitmap_count, count;
for (allocrange = 0; allocrange < cma->nranges; allocrange++) {
cmr = &cma->ranges[allocrange];
+ early_pfn[allocrange] = cmr->early_pfn;
cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr),
GFP_KERNEL);
if (!cmr->bitmap)
@@ -161,13 +162,13 @@ static void __init cma_activate_area(struct cma *cma)
for (r = 0; r < cma->nranges; r++) {
cmr = &cma->ranges[r];
- if (cmr->early_pfn != cmr->base_pfn) {
- count = cmr->early_pfn - cmr->base_pfn;
+ if (early_pfn[r] != cmr->base_pfn) {
+ count = early_pfn[r] - cmr->base_pfn;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
bitmap_set(cmr->bitmap, 0, bitmap_count);
}
- for (pfn = cmr->early_pfn; pfn < cmr->base_pfn + cmr->count;
+ for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count;
pfn += pageblock_nr_pages)
init_cma_reserved_pageblock(pfn_to_page(pfn));
}
@@ -193,7 +194,7 @@ cleanup:
for (r = 0; r < allocrange; r++) {
cmr = &cma->ranges[r];
end_pfn = cmr->base_pfn + cmr->count;
- for (pfn = cmr->early_pfn; pfn < end_pfn; pfn++)
+ for (pfn = early_pfn[r]; pfn < end_pfn; pfn++)
free_reserved_page(pfn_to_page(pfn));
}
}
diff --git a/mm/cma.h b/mm/cma.h
index 41a3ab0ec3de..c70180c36559 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -25,9 +25,11 @@ struct cma_kobject {
*/
struct cma_memrange {
unsigned long base_pfn;
- unsigned long early_pfn;
unsigned long count;
- unsigned long *bitmap;
+ union {
+ unsigned long early_pfn;
+ unsigned long *bitmap;
+ };
#ifdef CONFIG_CMA_DEBUGFS
struct debugfs_u32_array dfs_bitmap;
#endif
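
The cma.c hunk above copies early_pfn into a local array before the bitmaps are allocated because, with the union introduced here, early_pfn and bitmap now share the same storage: storing the bitmap pointer clobbers the early PFN. A stripped-down user-space sketch of that ordering constraint follows, assuming nothing beyond the field layout above; the struct and function names are invented for the example.

#include <stdlib.h>

struct memrange_sketch {
	unsigned long base_pfn;
	unsigned long count;
	union {
		unsigned long early_pfn;	/* meaningful only before activation */
		unsigned long *bitmap;		/* meaningful only after activation  */
	};
};

/* Snapshot early_pfn before the union slot is reused for the bitmap. */
static unsigned long activate_sketch(struct memrange_sketch *cmr, size_t nbits)
{
	unsigned long early_pfn = cmr->early_pfn;

	cmr->bitmap = calloc((nbits + 63) / 64, sizeof(*cmr->bitmap));
	/* cmr->early_pfn is now gone; only the snapshot may be used. */
	return early_pfn;
}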
diff --git a/mm/compaction.c b/mm/compaction.c
index ca71fd3c3181..3925cb61dbb8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1001,10 +1001,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+ folio = page_folio(page);
+ ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages);
/*
- * Fail isolation in case isolate_or_dissolve_huge_page()
+ * Fail isolation in case isolate_or_dissolve_huge_folio()
* reports an error. In case of -ENOMEM, abort right away.
*/
if (ret < 0) {
@@ -1016,12 +1017,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_fail;
}
- if (PageHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
/*
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
- folio = page_folio(page);
low_pfn += folio_nr_pages(folio) - 1;
goto isolate_success_no_list;
}
@@ -2249,15 +2249,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
static unsigned int fragmentation_score_wmark(bool low)
{
- unsigned int wmark_low;
+ unsigned int wmark_low, leeway;
- /*
- * Cap the low watermark to avoid excessive compaction
- * activity in case a user sets the proactiveness tunable
- * close to 100 (maximum).
- */
- wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
- return low ? wmark_low : min(wmark_low + 10, 100U);
+ wmark_low = 100U - sysctl_compaction_proactiveness;
+ leeway = min(10U, wmark_low / 2);
+ return low ? wmark_low : min(wmark_low + leeway, 100U);
}
static bool should_proactive_compact_node(pg_data_t *pgdat)
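
To see what the new leeway buys: with sysctl_compaction_proactiveness = 95, the old code clamped wmark_low to max(5, 5) = 5 and always added 10, giving a 5..15 window; the new code keeps wmark_low = 5 but uses leeway = min(10, 5 / 2) = 2, giving 5..7, so proactive compaction backs off much sooner at high proactiveness. A self-contained user-space comparison of the two formulas follows; all names here are invented for the illustration.

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

/* Old formula: clamp the low watermark at 5, fixed +10 for the high one. */
static unsigned int wmark_old(unsigned int proactiveness, int low)
{
	unsigned int wmark_low = max_u(100U - proactiveness, 5U);

	return low ? wmark_low : min_u(wmark_low + 10, 100U);
}

/* New formula: no clamp, and the gap shrinks as proactiveness grows. */
static unsigned int wmark_new(unsigned int proactiveness, int low)
{
	unsigned int wmark_low = 100U - proactiveness;
	unsigned int leeway = min_u(10U, wmark_low / 2);

	return low ? wmark_low : min_u(wmark_low + leeway, 100U);
}

int main(void)
{
	/* prints "old: 5..15  new: 5..7" for proactiveness 95 */
	printf("old: %u..%u  new: %u..%u\n",
	       wmark_old(95, 1), wmark_old(95, 0),
	       wmark_new(95, 1), wmark_new(95, 0));
	return 0;
}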
@@ -2348,7 +2344,6 @@ static enum compact_result __compact_finished(struct compact_control *cc)
ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &cc->zone->free_area[order];
- bool claim_block;
/* Job done if page is free of the right migratetype */
if (!free_area_empty(area, migratetype))
@@ -2364,8 +2359,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* Job done if allocation would steal freepages from
* other migratetype buddy lists.
*/
- if (find_suitable_fallback(area, order, migratetype,
- true, &claim_block) != -1)
+ if (find_suitable_fallback(area, order, migratetype, true) >= 0)
/*
* Movable pages are OK in any pageblock. If we are
* stealing for a non-movable allocation, make sure
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index c213cf8b5638..551745df011b 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -4,6 +4,7 @@ menu "Data Access Monitoring"
config DAMON
bool "DAMON: Data Access Monitoring Framework"
+ default y
help
This builds a framework that allows kernel subsystems to monitor
access frequency of each memory region. The information can be useful
@@ -28,6 +29,7 @@ config DAMON_VADDR
bool "Data access monitoring operations for virtual address spaces"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that work for virtual address spaces.
@@ -36,6 +38,7 @@ config DAMON_PADDR
bool "Data access monitoring operations for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
+ default DAMON
help
This builds the default data access monitoring operations for DAMON
that works for the physical address space.
@@ -55,6 +58,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_SYSFS
bool "DAMON sysfs interface"
depends on DAMON && SYSFS
+ default DAMON
help
This builds the sysfs interface for DAMON. The user space can use
the interface for arbitrary data access monitoring.
diff --git a/mm/damon/core.c b/mm/damon/core.c
index f0c1676f0599..b217e0120e09 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1093,9 +1093,17 @@ static int damon_commit_targets(
if (err)
return err;
} else {
+ struct damos *s;
+
if (damon_target_has_pid(dst))
put_pid(dst_target->pid);
damon_destroy_target(dst_target);
+ damon_for_each_scheme(s, dst) {
+ if (s->quota.charge_target_from == dst_target) {
+ s->quota.charge_target_from = NULL;
+ s->quota.charge_addr_from = 0;
+ }
+ }
}
}
@@ -1392,6 +1400,19 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
}
/*
+ * Warn about and fix a corrupted ->nr_accesses[_bp] pair, to aid
+ * investigation and to keep the problem from propagating.
+ */
+static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r)
+{
+ if (r->nr_accesses_bp == r->nr_accesses * 10000)
+ return;
+ WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n",
+ r->nr_accesses_bp, r->nr_accesses);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
+}
+
+/*
* Reset the aggregated monitoring results ('nr_accesses' of each region).
*/
static void kdamond_reset_aggregated(struct damon_ctx *c)
@@ -1404,6 +1425,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
damon_for_each_region(r, t) {
trace_damon_aggregated(ti, r, damon_nr_regions(t));
+ damon_warn_fix_nr_accesses_corruption(r);
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
@@ -1889,6 +1911,29 @@ static inline u64 damos_get_some_mem_psi_total(void)
#endif /* CONFIG_PSI */
+#ifdef CONFIG_NUMA
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ struct sysinfo i;
+ __kernel_ulong_t numerator;
+
+ si_meminfo_node(&i, goal->nid);
+ if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
+ numerator = i.totalram - i.freeram;
+ else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */
+ numerator = i.freeram;
+ return numerator * 10000 / i.totalram;
+}
+#else
+static __kernel_ulong_t damos_get_node_mem_bp(
+ struct damos_quota_goal *goal)
+{
+ return 0;
+}
+#endif
+
+
static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
{
u64 now_psi_total;
@@ -1902,6 +1947,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
goal->current_value = now_psi_total - goal->last_psi_total;
goal->last_psi_total = now_psi_total;
break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->current_value = damos_get_node_mem_bp(goal);
+ break;
default:
break;
}
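
The two new quota goal metrics report per-node memory usage in basis points: used_bp = (totalram - freeram) * 10000 / totalram and free_bp = freeram * 10000 / totalram, with totalram and freeram taken from si_meminfo_node(). For example, a node with 1,000,000 pages of which 250,000 are free reports used_bp = 7500 and free_bp = 2500, so a DAMOS_QUOTA_NODE_MEM_USED_BP goal with target_value 7500 is exactly met. A minimal user-space restatement of the arithmetic follows; the function name and sample values are invented.

/* Basis points (1/10000) of a node's memory that is used or free. */
static unsigned long node_mem_bp(unsigned long totalram, unsigned long freeram,
				 int want_used)
{
	unsigned long numerator = want_used ? totalram - freeram : freeram;

	return numerator * 10000 / totalram;
}

/*
 * node_mem_bp(1000000, 250000, 1) == 7500  (75% used)
 * node_mem_bp(1000000, 250000, 0) == 2500  (25% free)
 */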
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 1b70d3f36046..e8464f7e0014 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -548,7 +548,6 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
unsigned long *sz_filter_passed)
{
unsigned long addr;
- LIST_HEAD(folio_list);
struct folio *folio;
if (!damon_pa_scheme_has_filter(s))
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 23b562df0839..0f6c9e1fec0b 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -465,7 +465,8 @@ static ssize_t memcg_path_store(struct kobject *kobj,
{
struct damon_sysfs_scheme_filter *filter = container_of(kobj,
struct damon_sysfs_scheme_filter, kobj);
- char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+ char *path = kmalloc_array(size_add(count, 1), sizeof(*path),
+ GFP_KERNEL);
if (!path)
return -ENOMEM;
@@ -936,12 +937,15 @@ struct damos_sysfs_quota_goal {
enum damos_quota_goal_metric metric;
unsigned long target_value;
unsigned long current_value;
+ int nid;
};
-/* This should match with enum damos_action */
+/* This should match with enum damos_quota_goal_metric */
static const char * const damos_sysfs_quota_goal_metric_strs[] = {
"user_input",
"some_mem_psi_us",
+ "node_mem_used_bp",
+ "node_mem_free_bp",
};
static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
@@ -1014,6 +1018,28 @@ static ssize_t current_value_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t nid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+
+ /* todo: return error if the goal is not using nid */
+
+ return sysfs_emit(buf, "%d\n", goal->nid);
+}
+
+static ssize_t nid_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
+ damos_sysfs_quota_goal, kobj);
+ int err = kstrtoint(buf, 0, &goal->nid);
+
+ /* feed callback should check existence of this file and read value */
+ return err ? err : count;
+}
+
static void damos_sysfs_quota_goal_release(struct kobject *kobj)
{
/* or, notify this release to the feed callback */
@@ -1029,10 +1055,14 @@ static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr =
static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr =
__ATTR_RW_MODE(current_value, 0600);
+static struct kobj_attribute damos_sysfs_quota_goal_nid_attr =
+ __ATTR_RW_MODE(nid, 0600);
+
static struct attribute *damos_sysfs_quota_goal_attrs[] = {
&damos_sysfs_quota_goal_target_metric_attr.attr,
&damos_sysfs_quota_goal_target_value_attr.attr,
&damos_sysfs_quota_goal_current_value_attr.attr,
+ &damos_sysfs_quota_goal_nid_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damos_sysfs_quota_goal);
@@ -2035,7 +2065,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
if (!memcg_path)
return -EINVAL;
- path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL);
if (!path)
return -ENOMEM;
@@ -2120,8 +2150,17 @@ static int damos_sysfs_add_quota_score(
sysfs_goal->target_value);
if (!goal)
return -ENOMEM;
- if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT)
+ switch (sysfs_goal->metric) {
+ case DAMOS_QUOTA_USER_INPUT:
goal->current_value = sysfs_goal->current_value;
+ break;
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ goal->nid = sysfs_goal->nid;
+ break;
+ default:
+ break;
+ }
damos_add_quota_goal(quota, goal);
}
return 0;
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index be0fea9ee5fc..298c67557fae 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -510,6 +510,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test)
damon_feed_loop_next_input(last_input, 2000));
}
+static void damon_test_set_filters_default_reject(struct kunit *test)
+{
+ struct damos scheme;
+ struct damos_filter *target_filter, *anon_filter;
+
+ INIT_LIST_HEAD(&scheme.filters);
+ INIT_LIST_HEAD(&scheme.ops_filters);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * No filter is installed. Allow by default on both core and ops layer
+ * filtering stages, since there are no filters at all.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true);
+ damos_add_filter(&scheme, target_filter);
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter is installed.
+ * Rejct by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ target_filter->allow = false;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter is installed.
+ * Allow by default on core layer filtering stage due to the last
+ * core-layer-filter's behavior.
+ * Allow by default on ops layer filtering stage due to the absence of
+ * ops layer filters.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
+
+ anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
+ damos_add_filter(&scheme, anon_filter);
+
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled reject-filter and ops-handled allow-filter are installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+
+ target_filter->allow = true;
+ damos_set_filters_default_reject(&scheme);
+ /*
+ * A core-handled allow-filter and ops-handled allow-filter are
+ * installed.
+ * Allow by default on core layer filtering stage due to the existence
+ * of the ops-handled filter.
+ * Reject by default on ops layer filtering stage due to the last
+ * ops-layer-filter's behavior.
+ */
+ KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
+ KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -527,6 +596,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damos_test_new_filter),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
+ KUNIT_CASE(damon_test_set_filters_default_reject),
{},
};
diff --git a/mm/debug.c b/mm/debug.c
index db83e381a8ae..907382257062 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -71,10 +71,12 @@ static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
- int mapcount = atomic_read(&page->_mapcount);
+ int mapcount = atomic_read(&page->_mapcount) + 1;
char *type = "";
- mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1;
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index d46acf989dde..6a26eca546c3 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -23,7 +23,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value: %s\n", buf);
return 0;
}
_debug_guardpage_minorder = res;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index bc748f700a9e..7731b238b534 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -910,26 +910,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
#ifdef CONFIG_HUGETLB_PAGE
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
{
- struct page *page;
pte_t pte;
pr_debug("Validating HugeTLB basic\n");
- /*
- * Accessing the page associated with the pfn is safe here,
- * as it was previously derived from a real kernel symbol.
- */
- page = pfn_to_page(args->fixed_pmd_pfn);
- pte = mk_huge_pte(page, args->page_prot);
+ pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
+ pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS);
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+ WARN_ON(!pte_huge(pte));
+#endif
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
-
-#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
- pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
-
- WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS)));
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
diff --git a/mm/execmem.c b/mm/execmem.c
index 6f7a2653b280..9720ac2dfa41 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -411,6 +411,8 @@ void *execmem_alloc(enum execmem_type type, size_t size)
pgprot_t pgprot = range->pgprot;
void *p;
+ size = PAGE_ALIGN(size);
+
if (use_cache)
p = execmem_cache_alloc(range, size);
else
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b90cbeb4a1a..bada249b9fb7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -142,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping,
xas_init_marks(&xas);
folio->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
+ /* Leave folio->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
@@ -949,7 +949,7 @@ unlock:
return 0;
error:
folio->mapping = NULL;
- /* Leave page->index set: truncation relies upon it */
+ /* Leave folio->index set: truncation relies upon it */
folio_put_refs(folio, nr);
return xas_error(&xas);
}
@@ -1589,13 +1589,30 @@ int folio_wait_private_2_killable(struct folio *folio)
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
/*
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and later completions should invalidate.
*/
-static void folio_end_dropbehind_write(struct folio *folio)
+static void filemap_end_dropbehind_write(struct folio *folio)
{
+ if (!folio_test_dropbehind(folio))
+ return;
+
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
* but can happen if normal writeback just happens to find dirty folios
@@ -1604,8 +1621,7 @@ static void folio_end_dropbehind_write(struct folio *folio)
* invalidation in that case.
*/
if (in_task() && folio_trylock(folio)) {
- if (folio->mapping)
- folio_unmap_invalidate(folio->mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -1620,8 +1636,6 @@ static void folio_end_dropbehind_write(struct folio *folio)
*/
void folio_end_writeback(struct folio *folio)
{
- bool folio_dropbehind = false;
-
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
/*
@@ -1643,14 +1657,11 @@ void folio_end_writeback(struct folio *folio)
* reused before the folio_wake_bit().
*/
folio_get(folio);
- if (!folio_test_dirty(folio))
- folio_dropbehind = folio_test_clear_dropbehind(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
- acct_reclaim_writeback(folio);
- if (folio_dropbehind)
- folio_end_dropbehind_write(folio);
+ filemap_end_dropbehind_write(folio);
+ acct_reclaim_writeback(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
@@ -2635,16 +2646,14 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
return (pos1 >> shift == pos2 >> shift);
}
-static void filemap_end_dropbehind_read(struct address_space *mapping,
- struct folio *folio)
+static void filemap_end_dropbehind_read(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return;
if (folio_trylock(folio)) {
- if (folio_test_clear_dropbehind(folio))
- folio_unmap_invalidate(mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -2765,7 +2774,7 @@ put_folios:
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- filemap_end_dropbehind_read(mapping, folio);
+ filemap_end_dropbehind_read(folio);
folio_put(folio);
}
folio_batch_init(&fbatch);
@@ -3533,7 +3542,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
struct page *page = folio_file_page(folio, start);
- vm_fault_t ret = do_set_pmd(vmf, page);
+ vm_fault_t ret = do_set_pmd(vmf, folio, page);
if (!ret) {
/* The page is mapped successfully, reference consumed. */
folio_unlock(folio);
diff --git a/mm/gup.c b/mm/gup.c
index 84461d384ae2..e065a49842a8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -26,6 +26,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#include "swap.h"
struct follow_page_context {
struct dev_pagemap *pgmap;
@@ -844,11 +845,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
pte_t *ptep, pte;
int ret;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return ERR_PTR(-EINVAL);
-
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
return no_page_table(vma, flags, address);
@@ -1106,10 +1102,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
/* user gate pages are read-only */
if (gup_flags & FOLL_WRITE)
return -EFAULT;
- if (address > TASK_SIZE)
- pgd = pgd_offset_k(address);
- else
- pgd = pgd_offset_gate(mm, address);
+ pgd = pgd_offset(mm, address);
if (pgd_none(*pgd))
return -EFAULT;
p4d = p4d_offset(pgd, address);
@@ -1432,7 +1425,11 @@ static long __get_user_pages(struct mm_struct *mm,
start = untagged_addr_remote(mm, start);
- VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+ VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET));
do {
struct page *page;
@@ -2114,28 +2111,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
*/
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
- char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
if (unlikely(size == 0))
return 0;
if (!user_write_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_put_user(0, uaddr, out);
- uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_put_user(0, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_put_user(0, (char __user *)cur, out);
out:
user_write_access_end();
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_writeable);
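
All three rewritten fault-in helpers now share the same walk: touch the possibly unaligned start address, advance to each following page boundary with PAGE_ALIGN_DOWN(cur + PAGE_SIZE), stop early if the address would wrap to zero, and report size - (cur - start) bytes as not faulted in. A small user-space illustration of the stepping follows; PAGE_SIZE and PAGE_ALIGN_DOWN stand in for the kernel macros and the addresses are made up.

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PAGE_ALIGN_DOWN(x)	((x) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long start = 0x1ff8;		/* unaligned start address */
	unsigned long end = start + 0x3000;	/* three pages' worth      */
	unsigned long cur;

	/*
	 * The first pass touches the unaligned start; every later pass lands
	 * on a page boundary. The "cur &&" test stops the walk if the
	 * address wraps past the top of the address space.
	 */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		printf("touch %#lx\n", cur);	/* 0x1ff8, 0x2000, 0x3000, 0x4000 */
	return 0;
}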
@@ -2189,26 +2180,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable);
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
- unsigned long start = (unsigned long)uaddr, end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
struct mm_struct *mm = current->mm;
bool unlocked = false;
if (unlikely(size == 0))
return 0;
- end = PAGE_ALIGN(start + size);
- if (end < start)
- end = 0;
mmap_read_lock(mm);
- do {
- if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
break;
- start = (start + PAGE_SIZE) & PAGE_MASK;
- } while (start != end);
mmap_read_unlock(mm);
- if (size > start - (unsigned long)uaddr)
- return size - (start - (unsigned long)uaddr);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
@@ -2223,30 +2212,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable);
*/
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
- const char __user *start = uaddr, *end;
+ const unsigned long start = (unsigned long)uaddr;
+ const unsigned long end = start + size;
+ unsigned long cur;
volatile char c;
if (unlikely(size == 0))
return 0;
if (!user_read_access_begin(uaddr, size))
return size;
- if (!PAGE_ALIGNED(uaddr)) {
- unsafe_get_user(c, uaddr, out);
- uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
- }
- end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
- if (unlikely(end < start))
- end = NULL;
- while (uaddr != end) {
- unsafe_get_user(c, uaddr, out);
- uaddr += PAGE_SIZE;
- }
+ /* Stop once we overflow to 0. */
+ for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
+ unsafe_get_user(c, (const char __user *)cur, out);
out:
user_read_access_end();
(void)c;
- if (size > uaddr - start)
- return size - (uaddr - start);
+ if (size > cur - start)
+ return size - (cur - start);
return 0;
}
EXPORT_SYMBOL(fault_in_readable);
@@ -3173,46 +3156,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
return 1;
}
-static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, unsigned int flags, struct page **pages,
- int *nr)
-{
- int refs;
- struct page *page;
- struct folio *folio;
-
- if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
- return 0;
-
- BUILD_BUG_ON(pgd_devmap(orig));
-
- page = pgd_page(orig);
- refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
-
- folio = try_grab_folio_fast(page, refs, flags);
- if (!folio)
- return 0;
-
- if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!gup_fast_folio_allowed(folio, flags)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- *nr += refs;
- folio_set_referenced(folio);
- return 1;
-}
-
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
@@ -3307,12 +3250,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
return;
- if (unlikely(pgd_leaf(pgd))) {
- if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
- pages, nr))
- return;
- } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
- pages, nr))
+ BUILD_BUG_ON(pgd_leaf(pgd));
+ if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
+ pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -3359,7 +3299,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
* include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock() here as we also want to block IPIs
- * that come from THPs splitting.
+ * that come from callers of tlb_remove_table_sync_one().
*/
local_irq_save(flags);
gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
@@ -3647,7 +3587,7 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
{
unsigned int flags, nr_folios, nr_found;
unsigned int i, pgshift = PAGE_SHIFT;
- pgoff_t start_idx, end_idx, next_idx;
+ pgoff_t start_idx, end_idx;
struct folio *folio = NULL;
struct folio_batch fbatch;
struct hstate *h;
@@ -3697,20 +3637,8 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
folio = NULL;
}
- next_idx = 0;
for (i = 0; i < nr_found; i++) {
- /*
- * As there can be multiple entries for a
- * given folio in the batch returned by
- * filemap_get_folios_contig(), the below
- * check is to ensure that we pin and return a
- * unique set of folios between start and end.
- */
- if (next_idx &&
- next_idx != folio_index(fbatch.folios[i]))
- continue;
-
- folio = page_folio(&fbatch.folios[i]->page);
+ folio = fbatch.folios[i];
if (try_grab_folio(folio, 1, FOLL_PIN)) {
folio_batch_release(&fbatch);
@@ -3722,7 +3650,6 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
*offset = offset_in_folio(folio, start);
folios[nr_folios] = folio;
- next_idx = folio_next_index(folio);
if (++nr_folios == max_folios)
break;
}
diff --git a/mm/hmm.c b/mm/hmm.c
index 082f7b7c0b9e..feac86196a65 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}
@@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
+ uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
- return 0;
+ goto out;
}
if (!pte_present(pte)) {
@@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
- return 0;
+ new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
- if (!required_fault) {
- *hmm_pfn = 0;
- return 0;
- }
+ if (!required_fault)
+ goto out;
if (!non_swap_entry(entry))
goto fault;
@@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
- return 0;
+ new_pfn_flags = HMM_PFN_ERROR;
+ goto out;
}
- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ new_pfn_flags = pte_pfn(pte) | cpu_flags;
+out:
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}
@@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }
spin_unlock(ptl);
return 0;
@@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, -ENOMEM on failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
+/**
+ * hmm_dma_map_free - Free HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and dma address arrays
+ * @p2pdma_state: PCI P2P state.
+ *
+ * Map the page referenced by the PFN at @idx in @map->pfn_list to a DMA
+ * address. Call this after hmm_range_fault() has populated the PFN list.
+ * When hmm_dma_map_alloc() obtained an IOVA range, the page is linked at
+ * offset @idx * @map->dma_entry_size within that range; otherwise it is
+ * mapped with dma_map_page(). Returns the DMA address on success or
+ * DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ unsigned long attrs = 0;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when page was already linked in prefetch call
+ * with READ flag and now we need to add WRITE flag
+ *
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+ * was a mapping. But !dma_need_unmap implies that the
+ * mapping is stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+ /* Continue to remapping */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ pfns[idx] |= HMM_PFN_P2PDMA;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+ return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (ret)
+ goto error;
+
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+ if (ret) {
+ dma_iova_unlink(dev, state, offset, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ goto error;
+ }
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ goto error;
+
+ dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, dma_addr))
+ goto error;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+error:
+ pfns[idx] &= ~HMM_PFN_P2PDMA;
+ return DMA_MAPPING_ERROR;
+
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ unsigned long attrs = 0;
+
+ if ((pfns[idx] & valid_dma) != valid_dma)
+ return false;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state)) {
+ if (pfns[idx] & HMM_PFN_P2PDMA)
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+ } else if (dma_need_unmap(dev))
+ dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+
+ pfns[idx] &=
+ ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
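
Taken together, the new exports give a driver a fault, map, unmap, free flow around hmm_range_fault(). The sketch below strings them together in that order; it is illustrative only, assuming the structures behave as in the code above. The mmap locking and mmu-interval-notifier retry loop a real driver needs around hmm_range_fault() are omitted, and example_map_range, dev, range and npages are names invented for the example.

/*
 * Illustrative driver-side flow for the hmm_dma_* helpers; not code that
 * exists anywhere. Needs <linux/hmm.h>, <linux/hmm-dma.h>,
 * <linux/dma-mapping.h> and <linux/pci-p2pdma.h>.
 */
static int example_map_range(struct device *dev, struct hmm_range *range,
			     size_t npages)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	struct hmm_dma_map map;
	size_t i;
	int ret;

	ret = hmm_dma_map_alloc(dev, &map, npages, PAGE_SIZE);
	if (ret)
		return ret;

	range->hmm_pfns = map.pfn_list;	/* hmm_range_fault() fills these */
	ret = hmm_range_fault(range);
	if (ret)
		goto out_free;

	for (i = 0; i < npages; i++) {
		dma_addr_t addr = hmm_dma_map_pfn(dev, &map, i, &p2pdma_state);

		if (dma_mapping_error(dev, addr)) {
			ret = -EIO;
			goto out_unmap;
		}
		/* program "addr" into the device's page tables here */
	}
	return 0;

out_unmap:
	while (i--)
		hmm_dma_unmap_pfn(dev, &map, i);
out_free:
	hmm_dma_map_free(dev, &map);
	return ret;
}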
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 47d76d03ce30..d3e66136e41a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1203,7 +1203,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
{
pmd_t entry;
- entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
@@ -1309,8 +1309,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
struct folio *zero_folio)
{
pmd_t entry;
- entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
+ entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1456,7 +1455,8 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
return VM_FAULT_OOM;
}
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
+
ptl = pmd_lock(vma->vm_mm, vmf->pmd);
error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write,
pgtable);
@@ -1578,7 +1578,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
ptl = pud_lock(vma->vm_mm, vmf->pud);
insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
@@ -1786,7 +1786,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false);
return -EAGAIN;
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -2008,7 +2008,7 @@ unlock_fallback:
folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -2260,6 +2260,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(folio),
-HPAGE_PMD_NR);
+
+ /*
+ * Use flush_needed to indicate whether the PMD entry
+ * is present, instead of checking pmd_present() again.
+ */
+ if (flush_needed && pmd_young(orig_pmd) &&
+ likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
}
spin_unlock(ptl);
@@ -2653,12 +2661,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
} else {
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+ _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
}
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
@@ -3073,33 +3081,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, bool freeze, struct folio *folio)
+ pmd_t *pmd, bool freeze)
{
- bool pmd_migration = is_pmd_migration_entry(*pmd);
-
- VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
- VM_BUG_ON(freeze && !folio);
-
- /*
- * When the caller requests to set up a migration entry, we
- * require a folio to check the PMD against. Otherwise, there
- * is a risk of replacing the wrong folio.
- */
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) {
- /*
- * Do not apply pmd_folio() to a migration entry; and folio lock
- * guarantees that it must be of the wrong folio anyway.
- */
- if (folio && (pmd_migration || folio != pmd_folio(*pmd)))
- return;
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+ is_pmd_migration_entry(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
- }
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct folio *folio)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mmu_notifier_range range;
@@ -3109,20 +3100,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
- split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
+ split_huge_pmd_locked(vma, range.start, pmd, freeze);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct folio *folio)
+ bool freeze)
{
pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
if (!pmd)
return;
- __split_huge_pmd(vma, pmd, address, freeze, folio);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
@@ -3134,7 +3125,7 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned
if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
ALIGN(address, HPAGE_PMD_SIZE)))
- split_huge_pmd_address(vma, address, false, NULL);
+ split_huge_pmd_address(vma, address, false);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3646,7 +3637,7 @@ after_split:
* requires taking the lru_lock so we do the put_page
* of the tail pages after the split is complete.
*/
- free_page_and_swap_cache(&new_folio->page);
+ free_folio_and_swap_cache(new_folio);
}
return ret;
}
@@ -4680,7 +4671,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
folio_get(folio);
- pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
+ pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6a3cf7935c14..f0b1d53079f9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -58,6 +58,7 @@ int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+__initdata nodemask_t hugetlb_bootmem_nodes;
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
@@ -1950,7 +1951,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
int order = huge_page_order(h);
struct folio *folio;
bool alloc_try_hard = true;
- bool retry = true;
/*
* By default we always try hard to allocate the folio with
@@ -1965,22 +1965,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
-retry:
- folio = __folio_alloc(gfp_mask, order, nid, nmask);
- /* Ensure hugetlb folio won't have large_rmappable flag set. */
- if (folio)
- folio_clear_large_rmappable(folio);
- if (folio && !folio_ref_freeze(folio, 1)) {
- folio_put(folio);
- if (retry) { /* retry once */
- retry = false;
- goto retry;
- }
- /* WOW! twice in a row. */
- pr_warn("HugeTLB unexpected inflated folio ref count\n");
- folio = NULL;
- }
+ folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a
@@ -2419,7 +2405,6 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long i;
long needed, allocated;
bool alloc_ok = true;
- int node;
nodemask_t *mbind_nodemask, alloc_nodemask;
mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
@@ -2443,21 +2428,12 @@ retry:
for (i = 0; i < needed; i++) {
folio = NULL;
- /* Prioritize current node */
- if (node_isset(numa_mem_id(), alloc_nodemask))
- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
- numa_mem_id(), NULL);
-
- if (!folio) {
- for_each_node_mask(node, alloc_nodemask) {
- if (node == numa_mem_id())
- continue;
- folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
- node, NULL);
- if (folio)
- break;
- }
- }
+ /*
+ * It is okay to use NUMA_NO_NODE because we use numa_mem_id()
+ * down the road to pick the current node if that is the case.
+ */
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, &alloc_nodemask);
if (!folio) {
alloc_ok = false;
break;
@@ -2896,10 +2872,9 @@ free_new:
return ret;
}
-int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
{
struct hstate *h;
- struct folio *folio = page_folio(page);
int ret = -EBUSY;
/*
@@ -3253,7 +3228,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
}
/* allocate from next node when distributing huge pages */
- for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node,
+ &hugetlb_bootmem_nodes) {
m = alloc_bootmem(h, node, false);
if (!m)
return 0;
@@ -3717,6 +3693,15 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h, *h2;
for_each_hstate(h) {
+ /*
+ * Always reset to first_memory_node here, even if
+ * next_nid_to_alloc was set before - we can't
+ * reference hugetlb_bootmem_nodes after init, and
+ * first_memory_node is right for all further allocations.
+ */
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
+
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -3756,7 +3741,7 @@ static void __init report_hugepages(void)
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
- buf, h->free_huge_pages);
+ buf, h->nr_huge_pages);
if (nrinvalid)
pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
buf, nrinvalid, nrinvalid > 1 ? "s" : "");
@@ -5029,6 +5014,20 @@ static int __init default_hugepagesz_setup(char *s)
}
hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
+void __init hugetlb_bootmem_set_nodes(void)
+{
+ int i, nid;
+ unsigned long start_pfn, end_pfn;
+
+ if (!nodes_empty(hugetlb_bootmem_nodes))
+ return;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ if (end_pfn > start_pfn)
+ node_set(nid, hugetlb_bootmem_nodes);
+ }
+}
+
static bool __hugetlb_bootmem_allocated __initdata;
bool __init hugetlb_bootmem_allocated(void)
@@ -5044,6 +5043,8 @@ void __init hugetlb_bootmem_alloc(void)
if (__hugetlb_bootmem_allocated)
return;
+ hugetlb_bootmem_set_nodes();
+
for (i = 0; i < MAX_NUMNODES; i++)
INIT_LIST_HEAD(&huge_boot_pages[i]);
@@ -5051,7 +5052,6 @@ void __init hugetlb_bootmem_alloc(void)
for_each_hstate(h) {
h->next_nid_to_alloc = first_online_node;
- h->next_nid_to_free = first_online_node;
if (hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -5480,18 +5480,16 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.pagesize = hugetlb_vm_op_pagesize,
};
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
bool try_mkwrite)
{
- pte_t entry;
+ pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
unsigned int shift = huge_page_shift(hstate_vma(vma));
if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
- entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
- vma->vm_page_prot)));
+ entry = pte_mkwrite_novma(pte_mkdirty(entry));
} else {
- entry = huge_pte_wrprotect(mk_huge_pte(page,
- vma->vm_page_prot));
+ entry = pte_wrprotect(entry);
}
entry = pte_mkyoung(entry);
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
@@ -5546,7 +5544,7 @@ static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct folio *new_folio, pte_t old, unsigned long sz)
{
- pte_t newpte = make_huge_pte(vma, &new_folio->page, true);
+ pte_t newpte = make_huge_pte(vma, new_folio, true);
__folio_mark_uptodate(new_folio);
hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5850,14 +5848,14 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct page *ref_page, zap_flags_t zap_flags)
+ struct folio *folio, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
+ const bool folio_provided = !!folio;
unsigned long address;
pte_t *ptep;
pte_t pte;
spinlock_t *ptl;
- struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
bool adjust_reservation = false;
@@ -5921,14 +5919,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
continue;
}
- page = pte_page(pte);
/*
- * If a reference page is supplied, it is because a specific
- * page is being unmapped, not a range. Ensure the page we
- * are about to unmap is the actual page of interest.
+ * If a folio is supplied, it is because a specific
+ * folio is being unmapped, not a range. Ensure the folio we
+ * are about to unmap is the actual folio of interest.
*/
- if (ref_page) {
- if (page != ref_page) {
+ if (folio_provided) {
+ if (folio != page_folio(pte_page(pte))) {
spin_unlock(ptl);
continue;
}
@@ -5938,12 +5935,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* looking like data was lost
*/
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ } else {
+ folio = page_folio(pte_page(pte));
}
pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
@@ -5951,7 +5950,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
make_pte_marker(PTE_MARKER_UFFD_WP),
sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- hugetlb_remove_rmap(page_folio(page));
+ hugetlb_remove_rmap(folio);
/*
* Restore the reservation for anonymous page, otherwise the
@@ -5960,8 +5959,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* reservation bit.
*/
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
- folio_test_anon(page_folio(page))) {
- folio_set_hugetlb_restore_reserve(page_folio(page));
+ folio_test_anon(folio)) {
+ folio_set_hugetlb_restore_reserve(folio);
/* Reservation to be adjusted after the spin lock */
adjust_reservation = true;
}
@@ -5985,16 +5984,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* count will not be incremented by free_huge_folio.
* Act as if we consumed the reservation.
*/
- folio_clear_hugetlb_restore_reserve(page_folio(page));
+ folio_clear_hugetlb_restore_reserve(folio);
else if (rc)
vma_add_reservation(h, vma, address);
}
- tlb_remove_page_size(tlb, page, huge_page_size(h));
+ tlb_remove_page_size(tlb, folio_page(folio, 0),
+ folio_size(folio));
/*
- * Bail out after unmapping reference page if supplied
+ * If we were instructed to unmap a specific folio, we're done.
*/
- if (ref_page)
+ if (folio_provided)
break;
}
tlb_end_vma(tlb, vma);
@@ -6056,7 +6056,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page,
+ unsigned long end, struct folio *folio,
zap_flags_t zap_flags)
{
struct mmu_notifier_range range;
@@ -6068,7 +6068,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
mmu_notifier_invalidate_range_start(&range);
tlb_gather_mmu(&tlb, vma->vm_mm);
- __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+ __unmap_hugepage_range(&tlb, vma, start, end,
+ folio, zap_flags);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
@@ -6081,7 +6082,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
* same region.
*/
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page *page, unsigned long address)
+ struct folio *folio, unsigned long address)
{
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
@@ -6125,7 +6126,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
- address + huge_page_size(h), page, 0);
+ address + huge_page_size(h),
+ folio, 0);
}
i_mmap_unlock_write(mapping);
}
@@ -6248,8 +6250,7 @@ retry_avoidcopy:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- unmap_ref_private(mm, vma, &old_folio->page,
- vmf->address);
+ unmap_ref_private(mm, vma, old_folio, vmf->address);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
@@ -6296,7 +6297,7 @@ retry_avoidcopy:
spin_lock(vmf->ptl);
vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
- pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+ pte_t newpte = make_huge_pte(vma, new_folio, !unshare);
/* Break COW or unshare */
huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
@@ -6576,7 +6577,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
- new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED);
+ new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED);
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
@@ -7061,7 +7062,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- _dst_pte = make_huge_pte(dst_vma, &folio->page,
+ _dst_pte = make_huge_pte(dst_vma, folio,
!wp_enabled && !(is_continue && !vm_shared));
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
index e0f2d5c3a84c..f58ef4969e7a 100644
--- a/mm/hugetlb_cma.c
+++ b/mm/hugetlb_cma.c
@@ -66,7 +66,7 @@ hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact)
if (node_exact)
return NULL;
- for_each_online_node(node) {
+ for_each_node_mask(node, hugetlb_bootmem_nodes) {
cma = hugetlb_cma[node];
if (!cma || node == *nid)
continue;
@@ -153,11 +153,13 @@ void __init hugetlb_cma_reserve(int order)
if (!hugetlb_cma_size)
return;
+ hugetlb_bootmem_set_nodes();
+
for (nid = 0; nid < MAX_NUMNODES; nid++) {
if (hugetlb_cma_size_in_node[nid] == 0)
continue;
- if (!node_online(nid)) {
+ if (!node_isset(nid, hugetlb_bootmem_nodes)) {
pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
hugetlb_cma_size_in_node[nid] = 0;
@@ -190,13 +192,14 @@ void __init hugetlb_cma_reserve(int order)
* If 3 GB area is requested on a machine with 4 numa nodes,
* let's allocate 1 GB on first three nodes and ignore the last one.
*/
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ per_node = DIV_ROUND_UP(hugetlb_cma_size,
+ nodes_weight(hugetlb_bootmem_nodes));
pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
}
reserved = 0;
- for_each_online_node(nid) {
+ for_each_node_mask(nid, hugetlb_bootmem_nodes) {
int res;
char name[CMA_MAX_NAME];
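Illustrative aside (not part of the patch): the hugetlb.c and hugetlb_cma.c hunks above switch from the online-node mask to hugetlb_bootmem_nodes, a nodemask derived from memblock, so boot-time huge page and CMA reservations only target nodes that actually have memory. A minimal sketch of that pattern, with names mirroring the ones used above:

	static nodemask_t memory_nodes __initdata;

	static void __init build_memory_nodes(void)
	{
		unsigned long start_pfn, end_pfn;
		int i, nid;

		if (!nodes_empty(memory_nodes))
			return;	/* already populated */

		/* Mark every node that owns at least one page of memory. */
		for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
			if (end_pfn > start_pfn)
				node_set(nid, memory_nodes);
	}

	/* Later users walk the mask instead of for_each_online_node(). */
	static void __init distribute_over_memory_nodes(void)
	{
		int nid;

		for_each_node_mask(nid, memory_nodes)
			pr_info("reserving on node %d\n", nid);
	}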
diff --git a/mm/internal.h b/mm/internal.h
index 5c7a2b43ad76..6b8ed2017743 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -430,6 +430,9 @@ void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, struct zap_details *details);
int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
gfp_t gfp);
@@ -910,7 +913,7 @@ static inline void init_cma_pageblock(struct page *page)
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool claim_only, bool *claim_block);
+ int migratetype, bool claimable);
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
@@ -1116,6 +1119,8 @@ DECLARE_STATIC_KEY_TRUE(deferred_pages);
bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void init_deferred_page(unsigned long pfn, int nid);
+
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
@@ -1619,5 +1624,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
}
#endif /* CONFIG_PT_RECLAIM */
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
index 01b362799930..d3586e95c12c 100644
--- a/mm/io-mapping.c
+++ b/mm/io-mapping.c
@@ -21,9 +21,10 @@ int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
return -EINVAL;
- /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
- return remap_pfn_range_notrack(vma, addr, pfn, size,
- __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
- (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
+ pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
+ (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK));
+
+ /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
+ return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot);
}
EXPORT_SYMBOL_GPL(io_mapping_map_user);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cc945c6ab3bd..15203ea7d007 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -548,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct folio *folio)
-{
- int expected_refcount = folio_mapcount(folio);
-
- if (!folio_test_anon(folio) || folio_test_swapcache(folio))
- expected_refcount += folio_nr_pages(folio);
-
- if (folio_test_private(folio))
- expected_refcount++;
-
- return folio_ref_count(folio) == expected_refcount;
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
@@ -652,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@@ -696,13 +683,13 @@ next:
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
referenced, writable, result);
return result;
}
@@ -746,7 +733,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
ptep_clear(vma->vm_mm, address, _pte);
folio_remove_rmap_pte(src, src_page, vma);
spin_unlock(ptl);
- free_page_and_swap_cache(src_page);
+ free_folio_and_swap_cache(src);
}
}
@@ -1239,7 +1226,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
@@ -1402,7 +1389,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1435,7 +1422,7 @@ out_unmap:
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
none_or_zero, result, unmapped);
return result;
}
@@ -1464,10 +1451,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
}
}
-#ifdef CONFIG_SHMEM
-/* hpage must be locked, and mmap_lock must be held */
+/* folio must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmdp, struct page *hpage)
+ pmd_t *pmdp, struct folio *folio, struct page *page)
{
struct vm_fault vmf = {
.vma = vma,
@@ -1476,13 +1462,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
.pmd = pmdp,
};
- VM_BUG_ON(!PageTransHuge(hpage));
mmap_assert_locked(vma->vm_mm);
- if (do_set_pmd(&vmf, hpage))
+ if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
- get_page(hpage);
+ folio_get(folio);
return SCAN_SUCCEED;
}
@@ -1689,7 +1674,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, &folio->page)
+ ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
: SCAN_SUCCEED;
goto drop_folio;
abort:
@@ -2295,6 +2280,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2305,23 +2301,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2333,6 +2333,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);
@@ -2354,14 +2355,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
-#else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
-{
- BUILD_BUG();
-}
-#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
@@ -2437,7 +2430,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2783,7 +2776,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
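As an aside (not part of the patch): the hpage_collapse_scan_file() hunks above take a speculative reference on each folio found during the XArray walk before inspecting it, which is also why the refcount comparison there allows for one extra reference. The lookup-side pattern, lifted from the hunks, is roughly:

	/* Inside the RCU/xas walk over the mapping, with 'folio' just loaded: */
	if (!folio_try_get(folio)) {
		/* The folio is being freed; forget it and rewalk. */
		xas_reset(&xas);
		continue;
	}
	if (unlikely(folio != xas_reload(&xas))) {
		/* Raced with truncation or replacement; drop and rewalk. */
		folio_put(folio);
		xas_reset(&xas);
		continue;
	}
	/*
	 * The reference pins the folio, so checks such as folio_test_lru()
	 * and the refcount comparison are now stable. Drop the reference
	 * once the folio has been examined.
	 */
	folio_put(folio);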
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c12cef3eeb32..da9cee34ee1b 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -210,13 +210,11 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled = 1;
+static int kmemleak_enabled __read_mostly = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled = 1;
+static int kmemleak_free_enabled __read_mostly = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_late_initialized;
-/* set if a kmemleak warning was issued */
-static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
static int kmemleak_error;
@@ -254,7 +252,6 @@ static void kmemleak_disable(void);
#define kmemleak_warn(x...) do { \
pr_warn(x); \
dump_stack(); \
- kmemleak_warning = 1; \
} while (0)
/*
@@ -325,8 +322,6 @@ static void hex_dump_object(struct seq_file *seq,
* sufficient references to it (count >= min_count)
* - black - ignore, it doesn't contain references (e.g. text section)
* (min_count == -1). No function defined for this color.
- * Newly created objects don't have any color assigned (object->count == -1)
- * before the next memory scan when they become white.
*/
static bool color_white(const struct kmemleak_object *object)
{
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index a495debf1436..1ea711786c52 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -159,8 +159,8 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
* Make sure we have enough spare bits in @id to hold the UAF bit and
* the chain depth.
*/
- BUILD_BUG_ON(
- (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1));
+ BUILD_BUG_ON((1 << STACK_DEPOT_EXTRA_BITS) <=
+ (KMSAN_MAX_ORIGIN_DEPTH << 1));
extra_bits = stack_depot_get_extra_bits(id);
depth = kmsan_depth_from_eb(extra_bits);
@@ -274,11 +274,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos - 1, user_addr,
reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -292,11 +290,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
* poisoned bytes before, report them.
*/
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = 0;
cur_off_start = -1;
@@ -312,11 +308,9 @@ void kmsan_internal_check_memory(void *addr, size_t size,
*/
if (cur_origin != new_origin) {
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size,
cur_off_start, pos + i - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
cur_origin = new_origin;
cur_off_start = pos + i;
@@ -326,10 +320,8 @@ void kmsan_internal_check_memory(void *addr, size_t size,
}
KMSAN_WARN_ON(pos != size);
if (cur_origin) {
- kmsan_enter_runtime();
kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1,
user_addr, reason);
- kmsan_leave_runtime();
}
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3df45c25c1f6..97de3d6194f0 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -114,9 +114,7 @@ void kmsan_kfree_large(const void *ptr)
kmsan_enter_runtime();
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
- kmsan_internal_poison_memory((void *)ptr,
- page_size(page),
- GFP_KERNEL,
+ kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -277,8 +275,10 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
* Don't check anything, just copy the shadow of the copied
* bytes.
*/
+ kmsan_enter_runtime();
kmsan_internal_memmove_metadata((void *)to, (void *)from,
to_copy - left);
+ kmsan_leave_runtime();
}
user_access_restore(ua_flags);
}
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 10f52c085e6c..b14ce3417e65 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -35,8 +35,7 @@ static void __init kmsan_record_future_shadow_range(void *start, void *end)
KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES);
KMSAN_WARN_ON((nstart >= nend) ||
/* Virtual address 0 is valid on s390. */
- (!IS_ENABLED(CONFIG_S390) && !nstart) ||
- !nend);
+ (!IS_ENABLED(CONFIG_S390) && !nstart) || !nend);
nstart = ALIGN_DOWN(nstart, PAGE_SIZE);
nend = ALIGN(nend, PAGE_SIZE);
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 02a405e55d6c..69f0a57a401c 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -312,13 +312,9 @@ EXPORT_SYMBOL(__msan_unpoison_alloca);
void __msan_warning(u32 origin);
void __msan_warning(u32 origin)
{
- if (!kmsan_enabled || kmsan_in_runtime())
- return;
- kmsan_enter_runtime();
kmsan_report(origin, /*address*/ NULL, /*size*/ 0,
/*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ NULL,
REASON_ANY);
- kmsan_leave_runtime();
}
EXPORT_SYMBOL(__msan_warning);
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
index 29555a8bc315..bc3d1810f352 100644
--- a/mm/kmsan/kmsan.h
+++ b/mm/kmsan/kmsan.h
@@ -121,7 +121,6 @@ static __always_inline void kmsan_leave_runtime(void)
KMSAN_WARN_ON(--ctx->kmsan_in_runtime);
}
-depot_stack_handle_t kmsan_save_stack(void);
depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
unsigned int extra_bits);
diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c
index 94a3303fb65e..d6853ce08954 100644
--- a/mm/kmsan/report.c
+++ b/mm/kmsan/report.c
@@ -157,14 +157,14 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
unsigned long ua_flags;
bool is_uaf;
- if (!kmsan_enabled)
+ if (!kmsan_enabled || kmsan_in_runtime())
return;
if (current->kmsan_ctx.depth)
return;
if (!origin)
return;
- kmsan_disable_current();
+ kmsan_enter_runtime();
ua_flags = user_access_save();
raw_spin_lock(&kmsan_report_lock);
pr_err("=====================================================\n");
@@ -217,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size,
if (panic_on_kmsan)
panic("kmsan.panic set ...\n");
user_access_restore(ua_flags);
- kmsan_enable_current();
+ kmsan_leave_runtime();
}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 1bb505a08415..54f3c3c962f0 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -207,8 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
if (!kmsan_enabled || kmsan_in_runtime())
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(page_address(page),
- page_size(page),
+ kmsan_internal_poison_memory(page_address(page), page_size(page),
GFP_KERNEL,
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
@@ -248,17 +247,19 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
s_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
+ kmsan_enter_runtime();
mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
o_pages, page_shift);
+ kmsan_leave_runtime();
if (mapped) {
err = mapped;
goto ret;
}
- kmsan_leave_runtime();
flush_tlb_kernel_range(shadow_start, shadow_end);
flush_tlb_kernel_range(origin_start, origin_end);
flush_cache_vmap(shadow_start, shadow_end);
diff --git a/mm/maccess.c b/mm/maccess.c
index 8f0906180a94..831b4dd7296c 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -196,7 +196,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
if (ret >= count) {
ret = count;
dst[ret - 1] = '\0';
- } else if (ret > 0) {
+ } else if (ret >= 0) {
ret++;
}
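A hedged illustration of the maccess.c fix above (user_ptr stands in for any readable userspace pointer and is not from the patch): strncpy_from_user_nofault() reports the string length including the trailing NUL, and the ">= 0" change makes that hold for empty strings as well.

	char buf[16];
	long n;

	/*
	 * For an empty user string the underlying copy returns 0, and the
	 * "else if (ret >= 0)" branch still counts the NUL terminator, so
	 * n ends up as 1 rather than 0. Faults keep returning a negative
	 * error code.
	 */
	n = strncpy_from_user_nofault(buf, user_ptr, sizeof(buf));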
diff --git a/mm/madvise.c b/mm/madvise.c
index b17f684322ad..8433ac9b27e0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -48,6 +48,11 @@ struct madvise_walk_private {
bool pageout;
};
+struct madvise_behavior {
+ int behavior;
+ struct mmu_gather *tlb;
+};
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_lock for writing. Others, which simply traverse vmas, need
@@ -794,12 +799,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static int madvise_free_single_vma(struct vm_area_struct *vma,
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
+ struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+ struct mmu_gather *tlb = madv_behavior->tlb;
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -815,17 +821,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- tlb_start_vma(&tlb, vma);
+ tlb_start_vma(tlb, vma);
walk_page_range(vma->vm_mm, range.start, range.end,
- &madvise_free_walk_ops, &tlb);
- tlb_end_vma(&tlb, vma);
+ &madvise_free_walk_ops, tlb);
+ tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-
return 0;
}
@@ -848,7 +851,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior,
+ struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct zap_details details = {
@@ -856,7 +860,8 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
.even_cows = true,
};
- zap_page_range_single(vma, start, end - start, &details);
+ zap_page_range_single_batched(
+ madv_behavior->tlb, vma, start, end - start, &details);
return 0;
}
@@ -893,8 +898,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
static long madvise_dontneed_free(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- int behavior)
+ struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
struct mm_struct *mm = vma->vm_mm;
*prev = vma;
@@ -946,9 +952,10 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(vma, start, end);
+ return madvise_dontneed_single_vma(
+ madv_behavior, vma, start, end);
else if (behavior == MADV_FREE)
- return madvise_free_single_vma(vma, start, end);
+ return madvise_free_single_vma(madv_behavior, vma, start, end);
else
return -EINVAL;
}
@@ -1249,8 +1256,10 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
static int madvise_vma_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long behavior)
+ void *behavior_arg)
{
+ struct madvise_behavior *arg = behavior_arg;
+ int behavior = arg->behavior;
int error;
struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
@@ -1270,7 +1279,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
+ return madvise_dontneed_free(vma, prev, start, end, arg);
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1487,10 +1496,10 @@ static bool process_madvise_remote_valid(int behavior)
*/
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long arg,
+ unsigned long end, void *arg,
int (*visit)(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long arg))
+ unsigned long end, void *arg))
{
struct vm_area_struct *vma;
struct vm_area_struct *prev;
@@ -1548,7 +1557,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long anon_name)
+ void *anon_name)
{
int error;
@@ -1557,7 +1566,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (struct anon_vma_name *)anon_name);
+ anon_name);
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1589,7 +1598,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;
- return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+ return madvise_walk_vmas(mm, start, end, anon_name,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
@@ -1619,6 +1628,31 @@ static void madvise_unlock(struct mm_struct *mm, int behavior)
mmap_read_unlock(mm);
}
+static bool madvise_batch_tlb_flush(int behavior)
+{
+ switch (behavior) {
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior,
+ struct mm_struct *mm)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_gather_mmu(madv_behavior->tlb, mm);
+}
+
+static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_finish_mmu(madv_behavior->tlb);
+}
+
static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
{
size_t len;
@@ -1677,8 +1711,10 @@ static bool is_madvise_populate(int behavior)
}
static int madvise_do_behavior(struct mm_struct *mm,
- unsigned long start, size_t len_in, int behavior)
+ unsigned long start, size_t len_in,
+ struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
struct blk_plug plug;
unsigned long end;
int error;
@@ -1692,7 +1728,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
if (is_madvise_populate(behavior))
error = madvise_populate(mm, start, end, behavior);
else
- error = madvise_walk_vmas(mm, start, end, behavior,
+ error = madvise_walk_vmas(mm, start, end, madv_behavior,
madvise_vma_behavior);
blk_finish_plug(&plug);
return error;
@@ -1773,13 +1809,20 @@ static int madvise_do_behavior(struct mm_struct *mm,
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
int error;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
if (madvise_should_skip(start, len_in, behavior, &error))
return error;
error = madvise_lock(mm, behavior);
if (error)
return error;
- error = madvise_do_behavior(mm, start, len_in, behavior);
+ madvise_init_tlb(&madv_behavior, mm);
+ error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
+ madvise_finish_tlb(&madv_behavior);
madvise_unlock(mm, behavior);
return error;
@@ -1796,12 +1839,18 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
{
ssize_t ret = 0;
size_t total_len;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
total_len = iov_iter_count(iter);
ret = madvise_lock(mm, behavior);
if (ret)
return ret;
+ madvise_init_tlb(&madv_behavior, mm);
while (iov_iter_count(iter)) {
unsigned long start = (unsigned long)iter_iov_addr(iter);
@@ -1811,7 +1860,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
if (madvise_should_skip(start, len_in, behavior, &error))
ret = error;
else
- ret = madvise_do_behavior(mm, start, len_in, behavior);
+ ret = madvise_do_behavior(mm, start, len_in,
+ &madv_behavior);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
@@ -1829,14 +1879,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
}
/* Drop and reacquire lock to unwind race. */
+ madvise_finish_tlb(&madv_behavior);
madvise_unlock(mm, behavior);
madvise_lock(mm, behavior);
+ madvise_init_tlb(&madv_behavior, mm);
continue;
}
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
}
+ madvise_finish_tlb(&madv_behavior);
madvise_unlock(mm, behavior);
ret = (total_len - iov_iter_count(iter)) ? : ret;
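Taken together, the madvise.c hunks above move mmu_gather setup out of madvise_free_single_vma() so that one TLB flush window covers every VMA (and, for vector_madvise(), every iovec) touched by MADV_DONTNEED, MADV_DONTNEED_LOCKED or MADV_FREE. Assembled from the hunks, the resulting do_madvise() call order is roughly the sketch below (a condensed restatement, not a verbatim copy of the patch):

	struct mmu_gather tlb;
	struct madvise_behavior madv_behavior = {
		.behavior = behavior,
		.tlb = &tlb,
	};

	error = madvise_lock(mm, behavior);
	if (error)
		return error;
	madvise_init_tlb(&madv_behavior, mm);	/* tlb_gather_mmu() for batched behaviors */
	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
	madvise_finish_tlb(&madv_behavior);	/* single tlb_finish_mmu() flush */
	madvise_unlock(mm, behavior);
	return error;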
diff --git a/mm/memblock.c b/mm/memblock.c
index 0e9ebb8aa7fe..154f1d73b61f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -18,6 +18,11 @@
#include <linux/memblock.h>
#include <linux/mutex.h>
+#ifdef CONFIG_KEXEC_HANDOVER
+#include <linux/libfdt.h>
+#include <linux/kexec_handover.h>
+#endif /* CONFIG_KEXEC_HANDOVER */
+
#include <asm/sections.h>
#include <linux/io.h>
@@ -107,6 +112,13 @@ unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */
+static bool kho_scratch_only;
+#else
+#define kho_scratch_only false
+#endif
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -166,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void)
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
+ /* skip non-scratch memory for kho early boot allocations */
+ if (kho_scratch_only)
+ return MEMBLOCK_KHO_SCRATCH;
+
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -499,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
* needn't do it
*/
if (!use_slab)
- BUG_ON(memblock_reserve(addr, new_alloc_size));
+ BUG_ON(memblock_reserve_kern(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
@@ -649,7 +665,7 @@ repeat:
#ifdef CONFIG_NUMA
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
- WARN_ON(flags != rgn->flags);
+ WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags);
nr_new++;
if (insert) {
if (start_rgn == -1)
@@ -909,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size,
+ int nid, enum memblock_flags flags)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
- &base, &end, (void *)_RET_IP_);
+ memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
+ &base, &end, nid, flags, (void *)_RET_IP_);
- return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -931,6 +948,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
}
#endif
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+__init void memblock_set_kho_scratch_only(void)
+{
+ kho_scratch_only = true;
+}
+
+__init void memblock_clear_kho_scratch_only(void)
+{
+ kho_scratch_only = false;
+}
+
+__init void memmap_init_kho_scratch_pages(void)
+{
+ phys_addr_t start, end;
+ unsigned long pfn;
+ int nid;
+ u64 i;
+
+ if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
+ return;
+
+ /*
+ * Initialize struct pages for free scratch memory.
+ * The struct pages for reserved scratch memory will be set up in
+ * reserve_bootmem_region()
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
+ for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++)
+ init_deferred_page(pfn, nid);
+ }
+}
+#endif
+
/**
* memblock_setclr_flag - set or clear flag for a memory region
* @type: memblock type to set/clear flag for
@@ -1056,6 +1107,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t
MEMBLOCK_RSRV_NOINIT);
}
+/**
+ * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered
+ * for allocations during early boot with kexec handover.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 1,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
+/**
+ * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a
+ * specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(&memblock.memory, base, size, 0,
+ MEMBLOCK_KHO_SCRATCH);
+}
+
static bool should_skip_region(struct memblock_type *type,
struct memblock_region *m,
int nid, int flags)
@@ -1087,6 +1168,13 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
return true;
+ /*
+ * In early alloc during kexec handover, we can only consider
+ * MEMBLOCK_KHO_SCRATCH regions for the allocations
+ */
+ if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m))
+ return true;
+
return false;
}
@@ -1467,14 +1555,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN))
goto done;
if (numa_valid_node(nid) && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !memblock_reserve_kern(found, size))
goto done;
}
@@ -1759,6 +1847,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
}
+phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid)
+{
+ struct memblock_region *r;
+ phys_addr_t total = 0;
+
+ for_each_reserved_mem_region(r) {
+ phys_addr_t size = r->size;
+
+ if (r->base > limit)
+ break;
+
+ if (r->base + r->size > limit)
+ size = limit - r->base;
+
+ if (nid == memblock_get_region_node(r) || !numa_valid_node(nid))
+ if (r->flags & MEMBLOCK_RSRV_KERN)
+ total += size;
+ }
+
+ return total;
+}
+
/**
* memblock_estimated_nr_free_pages - return estimated number of free pages
* from memblock point of view
@@ -2289,6 +2399,7 @@ void __init memblock_free_all(void)
free_unused_memmap();
reset_all_zones_managed_pages();
+ memblock_clear_kho_scratch_only();
pages = free_low_memory_core_early();
totalram_pages_add(pages);
}
@@ -2386,6 +2497,189 @@ int reserve_mem_release_by_name(const char *name)
return 1;
}
+#ifdef CONFIG_KEXEC_HANDOVER
+#define MEMBLOCK_KHO_FDT "memblock"
+#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1"
+#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1"
+static struct page *kho_fdt;
+
+static int reserve_mem_kho_finalize(struct kho_serialization *ser)
+{
+ int err = 0, i;
+
+ for (i = 0; i < reserved_mem_count; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= kho_preserve_phys(map->start, map->size);
+ }
+
+ err |= kho_preserve_folio(page_folio(kho_fdt));
+ err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt));
+
+ return notifier_from_errno(err);
+}
+
+static int reserve_mem_kho_notifier(struct notifier_block *self,
+ unsigned long cmd, void *v)
+{
+ switch (cmd) {
+ case KEXEC_KHO_FINALIZE:
+ return reserve_mem_kho_finalize((struct kho_serialization *)v);
+ case KEXEC_KHO_ABORT:
+ return NOTIFY_DONE;
+ default:
+ return NOTIFY_BAD;
+ }
+}
+
+static struct notifier_block reserve_mem_kho_nb = {
+ .notifier_call = reserve_mem_kho_notifier,
+};
+
+static int __init prepare_kho_fdt(void)
+{
+ int err = 0, i;
+ void *fdt;
+
+ kho_fdt = alloc_page(GFP_KERNEL);
+ if (!kho_fdt)
+ return -ENOMEM;
+
+ fdt = page_to_virt(kho_fdt);
+
+ err |= fdt_create(fdt, PAGE_SIZE);
+ err |= fdt_finish_reservemap(fdt);
+
+ err |= fdt_begin_node(fdt, "");
+ err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
+ for (i = 0; i < reserved_mem_count; i++) {
+ struct reserve_mem_table *map = &reserved_mem_table[i];
+
+ err |= fdt_begin_node(fdt, map->name);
+ err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));
+ err |= fdt_property(fdt, "size", &map->size, sizeof(map->size));
+ err |= fdt_end_node(fdt);
+ }
+ err |= fdt_end_node(fdt);
+
+ err |= fdt_finish(fdt);
+
+ if (err) {
+ pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+
+static int __init reserve_mem_init(void)
+{
+ int err;
+
+ if (!kho_is_enabled() || !reserved_mem_count)
+ return 0;
+
+ err = prepare_kho_fdt();
+ if (err)
+ return err;
+
+ err = register_kho_notifier(&reserve_mem_kho_nb);
+ if (err) {
+ put_page(kho_fdt);
+ kho_fdt = NULL;
+ }
+
+ return err;
+}
+late_initcall(reserve_mem_init);
+
+static void *__init reserve_mem_kho_retrieve_fdt(void)
+{
+ phys_addr_t fdt_phys;
+ static void *fdt;
+ int err;
+
+ if (fdt)
+ return fdt;
+
+ err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT)
+ pr_warn("failed to retrieve FDT '%s' from KHO: %d\n",
+ MEMBLOCK_KHO_FDT, err);
+ return NULL;
+ }
+
+ fdt = phys_to_virt(fdt_phys);
+
+ err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("FDT '%s' is incompatible with '%s': %d\n",
+ MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err);
+ fdt = NULL;
+ }
+
+ return fdt;
+}
+
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ int err, len_start, len_size, offset;
+ const phys_addr_t *p_start, *p_size;
+ const void *fdt;
+
+ fdt = reserve_mem_kho_retrieve_fdt();
+ if (!fdt)
+ return false;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0) {
+ pr_warn("FDT '%s' has no child '%s': %d\n",
+ MEMBLOCK_KHO_FDT, name, offset);
+ return false;
+ }
+ err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE);
+ if (err) {
+ pr_warn("Node '%s' is incompatible with '%s': %d\n",
+ name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err);
+ return false;
+ }
+
+ p_start = fdt_getprop(fdt, offset, "start", &len_start);
+ p_size = fdt_getprop(fdt, offset, "size", &len_size);
+ if (!p_start || len_start != sizeof(*p_start) || !p_size ||
+ len_size != sizeof(*p_size)) {
+ return false;
+ }
+
+ if (*p_start & (align - 1)) {
+ pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n",
+ name, (long)align, (long)*p_start);
+ return false;
+ }
+
+ if (*p_size != size) {
+ pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n",
+ name, (long)*p_size, (long)size);
+ return false;
+ }
+
+ reserved_mem_add(*p_start, size, name);
+ pr_info("Revived memory reservation '%s' from KHO\n", name);
+
+ return true;
+}
+#else
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+ phys_addr_t align)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER */
+
/*
* Parse reserve_mem=nn:align:name
*/
@@ -2441,6 +2735,11 @@ static int __init reserve_mem(char *p)
if (reserve_mem_find_by_name(name, &start, &tmp))
return -EBUSY;
+ /* Pick previous allocations up from KHO if available */
+ if (reserve_mem_kho_revive(name, size, align))
+ return 1;
+
+ /* TODO: Allocation must be outside of scratch region */
start = memblock_phys_alloc(size, align);
if (!start)
return -ENOMEM;
@@ -2458,6 +2757,8 @@ static const char * const flagname[] = {
[ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
[ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
[ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
+ [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN",
+ [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH",
};
static int memblock_debug_show(struct seq_file *m, void *private)
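The memblock changes above add a "scratch only" window for kexec handover: while kho_scratch_only is set, choose_memblock_flags() returns MEMBLOCK_KHO_SCRATCH and should_skip_region() rejects everything else, so early allocations cannot land in memory preserved by the previous kernel. A hedged sketch of how boot code might drive the window (only the memblock_* helpers are from the patch; scratch_base, scratch_size and the surrounding steps are illustrative):

	/* Early in the handover kernel, before regular allocations start: */
	memblock_mark_kho_scratch(scratch_base, scratch_size);
	memblock_set_kho_scratch_only();

	/* ... early boot memblock allocations now come from scratch ranges only ... */

	/*
	 * memblock_free_all() clears the restriction (see the hunk above)
	 * before handing free memory to the buddy allocator.
	 */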
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 4a9cf27a70af..4b94731305b9 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -512,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __count_memcg_events(memcg, PGPGIN, 1);
+ count_memcg_events(memcg, PGPGIN, 1);
else {
- __count_memcg_events(memcg, PGPGOUT, 1);
+ count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
@@ -689,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long flags;
local_irq_save(flags);
- __count_memcg_events(memcg, PGPGOUT, pgpgout);
+ count_memcg_events(memcg, PGPGOUT, pgpgout);
__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
memcg1_check_events(memcg, nid);
local_irq_restore(flags);
@@ -2198,8 +2198,7 @@ bool memcg1_alloc_events(struct mem_cgroup *memcg)
void memcg1_free_events(struct mem_cgroup *memcg)
{
- if (memcg->events_percpu)
- free_percpu(memcg->events_percpu);
+ free_percpu(memcg->events_percpu);
}
static int __init memcg1_init(void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec39e62b172e..902da8a9c643 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
@@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
+static struct kmem_cache *memcg_cachep;
+static struct kmem_cache *memcg_pn_cachep;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -129,8 +133,7 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
-static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages);
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -163,8 +166,16 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
- if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ memcg1_account_kmem(memcg, -nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
+ }
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -463,6 +474,8 @@ static const unsigned int memcg_vm_event_stat[] = {
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
+ NUMA_TASK_MIGRATE,
+ NUMA_TASK_SWAP,
#endif
};
@@ -492,8 +505,8 @@ struct memcg_vmstats_percpu {
unsigned int stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
- struct memcg_vmstats_percpu *parent;
- struct memcg_vmstats *vmstats;
+ struct memcg_vmstats_percpu __percpu *parent_pcpu;
+ struct memcg_vmstats *vmstats;
/* The above should fit a single cacheline for memcg_rstat_updated() */
@@ -520,7 +533,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic64_t stats_updates;
+ atomic_t stats_updates;
};
/*
@@ -544,60 +557,43 @@ static u64 flush_last_time;
#define FLUSH_TIME (2UL*HZ)
-/*
- * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
- * not rely on this as part of an acquired spinlock_t lock. These functions are
- * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
- * is sufficient.
- */
-static void memcg_stats_lock(void)
-{
- preempt_disable_nested();
- VM_WARN_ON_IRQS_ENABLED();
-}
-
-static void __memcg_stats_lock(void)
-{
- preempt_disable_nested();
-}
-
-static void memcg_stats_unlock(void)
-{
- preempt_enable_nested();
-}
-
-
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic64_read(&vmstats->stats_updates) >
+ return atomic_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
+ int cpu)
{
+ struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
- int cpu = smp_processor_id();
unsigned int stats_updates;
if (!val)
return;
- css_rstat_updated(&memcg->css, cpu);
- statc = this_cpu_ptr(memcg->vmstats_percpu);
- for (; statc; statc = statc->parent) {
- stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
- WRITE_ONCE(statc->stats_updates, stats_updates);
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (!in_nmi())
+ css_rstat_updated(&memcg->css, cpu);
+ statc_pcpu = memcg->vmstats_percpu;
+ for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
+ statc = this_cpu_ptr(statc_pcpu);
+ /*
+ * If @memcg is already flushable then all its ancestors are
+ * flushable as well and also there is no need to increase
+ * stats_updates.
+ */
+ if (memcg_vmstats_needs_flush(statc->vmstats))
+ break;
+
+ stats_updates = this_cpu_add_return(statc_pcpu->stats_updates,
+ abs(val));
if (stats_updates < MEMCG_CHARGE_BATCH)
continue;
- /*
- * If @memcg is already flush-able, increasing stats_updates is
- * redundant. Avoid the overhead of the atomic update.
- */
- if (!memcg_vmstats_needs_flush(statc->vmstats))
- atomic64_add(stats_updates,
- &statc->vmstats->stats_updates);
- WRITE_ONCE(statc->stats_updates, 0);
+ stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
+ atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}
@@ -605,7 +601,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -687,15 +683,16 @@ static int memcg_state_val_in_pages(int idx, int val)
}
/**
- * __mod_memcg_state - update cgroup memory statistics
+ * mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
-void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
int val)
{
int i = memcg_stats_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -703,10 +700,14 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_state(memcg, idx, val);
+
+ put_cpu();
}
#ifdef CONFIG_MEMCG_V1
@@ -728,13 +729,14 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
}
#endif
-static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
+ int cpu;
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
@@ -742,35 +744,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
- /*
- * The caller from rmap relies on disabled preemption because they never
- * update their counter from in-interrupt context. For these two
- * counters we check that the update is never performed from an
- * interrupt context while other caller need to have disabled interrupt.
- */
- __memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM)) {
- switch (idx) {
- case NR_ANON_MAPPED:
- case NR_FILE_MAPPED:
- case NR_ANON_THPS:
- WARN_ON_ONCE(!in_task());
- break;
- default:
- VM_WARN_ON_IRQS_ENABLED();
- }
- }
+ cpu = get_cpu();
/* Update memcg */
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
+ this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_lruvec_state(memcg, idx, val);
- memcg_stats_unlock();
+
+ put_cpu();
}
/**
@@ -791,7 +777,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
- __mod_memcg_lruvec_state(lruvec, idx, val);
+ mod_memcg_lruvec_state(lruvec, idx, val);
}
void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
@@ -841,15 +827,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
}
/**
- * __count_memcg_events - account VM events in a cgroup
+ * count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
* @count: the number of events that occurred
*/
-void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
int i = memcg_events_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -857,11 +844,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- memcg_stats_lock();
- __this_cpu_add(memcg->vmstats_percpu->events[i], count);
- memcg_rstat_updated(memcg, count);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->events[i], count);
+ memcg_rstat_updated(memcg, count, cpu);
trace_count_memcg_events(memcg, idx, count);
- memcg_stats_unlock();
+
+ put_cpu();
}
unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -1662,7 +1651,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = task_is_dying() || out_of_memory(&oc);
+ ret = out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -1756,155 +1745,234 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in future.
+ */
+#define NR_MEMCG_STOCK 7
+#define FLUSHING_CACHED_CHARGE 0
struct memcg_stock_pcp {
- local_trylock_t stock_lock;
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
+ local_trylock_t lock;
+ uint8_t nr_pages[NR_MEMCG_STOCK];
+ struct mem_cgroup *cached[NR_MEMCG_STOCK];
+
+ struct work_struct work;
+ unsigned long flags;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
+};
+struct obj_stock_pcp {
+ local_trylock_t lock;
+ unsigned int nr_bytes;
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
- unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
struct work_struct work;
unsigned long flags;
-#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCAL_TRYLOCK(stock_lock),
+
+static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
};
+
static DEFINE_MUTEX(percpu_charge_mutex);
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static void drain_obj_stock(struct obj_stock_pcp *stock);
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
- * @gfp_mask: allocation mask.
*
- * The charges will only happen if @memcg matches the current cpu's memcg
- * stock, and at least @nr_pages are available in that stock. Failure to
- * service an allocation will refill the stock.
+ * Consume the cached charge if enough nr_pages are present; otherwise return
+ * failure. Also return failure for a charge request larger than
+ * MEMCG_CHARGE_BATCH or if the local lock is already taken.
*
* returns true if successful, false otherwise.
*/
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
- gfp_t gfp_mask)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
- unsigned long flags;
+ uint8_t stock_pages;
bool ret = false;
+ int i;
- if (nr_pages > MEMCG_CHARGE_BATCH)
- return ret;
-
- if (gfpflags_allow_spinning(gfp_mask))
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- else if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags))
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock))
return ret;
stock = this_cpu_ptr(&memcg_stock);
- stock_pages = READ_ONCE(stock->nr_pages);
- if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
- ret = true;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
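+		/* A memcg occupies at most one slot, so stop at the first match. */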
+ if (memcg != READ_ONCE(stock->cached[i]))
+ continue;
+
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
+ if (stock_pages >= nr_pages) {
+ WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+ ret = true;
+ }
+ break;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&memcg_stock.lock);
return ret;
}
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
/*
* Returns stocks cached in percpu and reset cached information.
*/
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
{
- unsigned int stock_pages = READ_ONCE(stock->nr_pages);
- struct mem_cgroup *old = READ_ONCE(stock->cached);
+ struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+ uint8_t stock_pages;
if (!old)
return;
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
if (stock_pages) {
- page_counter_uncharge(&old->memory, stock_pages);
- if (do_memsw_account())
- page_counter_uncharge(&old->memsw, stock_pages);
-
- WRITE_ONCE(stock->nr_pages, 0);
+ memcg_uncharge(old, stock_pages);
+ WRITE_ONCE(stock->nr_pages[i], 0);
}
css_put(&old->css);
- WRITE_ONCE(stock->cached, NULL);
+ WRITE_ONCE(stock->cached[i], NULL);
}
-static void drain_local_stock(struct work_struct *dummy)
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i)
+ drain_stock(stock, i);
+}
+
+static void drain_local_memcg_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
- /*
- * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
- * drain_stock races is that we always operate on local CPU stock
- * here with IRQ disabled
- */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&memcg_stock.lock);
stock = this_cpu_ptr(&memcg_stock);
- old = drain_obj_stock(stock);
- drain_stock(stock);
+ drain_stock_fully(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ local_unlock(&memcg_stock.lock);
}
-/*
- * Cache charges(val) to local per_cpu area.
- * This will be consumed by consume_stock() function, later.
- */
-static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void drain_local_obj_stock(struct work_struct *dummy)
{
- struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
+ struct obj_stock_pcp *stock;
- stock = this_cpu_ptr(&memcg_stock);
- if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
- drain_stock(stock);
- css_get(&memcg->css);
- WRITE_ONCE(stock->cached, memcg);
- }
- stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
- WRITE_ONCE(stock->nr_pages, stock_pages);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&obj_stock.lock);
+
+ stock = this_cpu_ptr(&obj_stock);
+ drain_obj_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock);
+ local_unlock(&obj_stock.lock);
}
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
- unsigned long flags;
+ struct memcg_stock_pcp *stock;
+ struct mem_cgroup *cached;
+ uint8_t stock_pages;
+ bool success = false;
+ int empty_slot = -1;
+ int i;
+
+ /*
+	 * For now, limit MEMCG_CHARGE_BATCH to 127 or less. If we later decide
+	 * to increase it beyond 127, we will need more careful
+ * handling of nr_pages[] in struct memcg_stock_pcp.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
+
+ VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
- if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock)) {
/*
- * In case of unlikely failure to lock percpu stock_lock
- * uncharge memcg directly.
+		 * In case of a larger-than-batch refill or an unlikely failure to
+		 * lock the percpu memcg_stock.lock, uncharge the memcg directly.
*/
- if (mem_cgroup_is_root(memcg))
- return;
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ memcg_uncharge(memcg, nr_pages);
return;
}
- __refill_stock(memcg, nr_pages);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ cached = READ_ONCE(stock->cached[i]);
+ if (!cached && empty_slot == -1)
+ empty_slot = i;
+ if (memcg == READ_ONCE(stock->cached[i])) {
+ stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+ WRITE_ONCE(stock->nr_pages[i], stock_pages);
+ if (stock_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock, i);
+ success = true;
+ break;
+ }
+ }
+
+ if (!success) {
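+		/* No slot caches this memcg: take the first empty slot, or evict a random one. */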
+ i = empty_slot;
+ if (i == -1) {
+ i = get_random_u32_below(NR_MEMCG_STOCK);
+ drain_stock(stock, i);
+ }
+ css_get(&memcg->css);
+ WRITE_ONCE(stock->cached[i], memcg);
+ WRITE_ONCE(stock->nr_pages[i], nr_pages);
+ }
+
+ local_unlock(&memcg_stock.lock);
+}
+
+static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ memcg = READ_ONCE(stock->cached[i]);
+ if (!memcg)
+ continue;
+
+ if (READ_ONCE(stock->nr_pages[i]) &&
+ mem_cgroup_is_descendant(memcg, root_memcg)) {
+ flush = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return flush;
}
/*
@@ -1927,25 +1995,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *memcg;
- bool flush = false;
+ struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
+ struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
- rcu_read_lock();
- memcg = READ_ONCE(stock->cached);
- if (memcg && READ_ONCE(stock->nr_pages) &&
- mem_cgroup_is_descendant(memcg, root_memcg))
- flush = true;
- else if (obj_stock_flush_required(stock, root_memcg))
- flush = true;
- rcu_read_unlock();
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
+ is_memcg_drain_needed(memcg_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &memcg_st->flags)) {
+ if (cpu == curcpu)
+ drain_local_memcg_stock(&memcg_st->work);
+ else if (!cpu_is_isolated(cpu))
+ schedule_work_on(cpu, &memcg_st->work);
+ }
- if (flush &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
+ obj_stock_flush_required(obj_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &obj_st->flags)) {
if (cpu == curcpu)
- drain_local_stock(&stock->work);
+ drain_local_obj_stock(&obj_st->work);
else if (!cpu_is_isolated(cpu))
- schedule_work_on(cpu, &stock->work);
+ schedule_work_on(cpu, &obj_st->work);
}
}
migrate_enable();
@@ -1954,19 +2024,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old;
- unsigned long flags;
-
- stock = &per_cpu(memcg_stock, cpu);
-
- /* drain_obj_stock requires stock_lock */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- old = drain_obj_stock(stock);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
-
- drain_stock(stock);
- obj_cgroup_put(old);
+ /* no need for the local lock */
+ drain_obj_stock(&per_cpu(obj_stock, cpu));
+ drain_stock_fully(&per_cpu(memcg_stock, cpu));
return 0;
}
@@ -2256,7 +2316,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long pflags;
retry:
- if (consume_stock(memcg, nr_pages, gfp_mask))
+ if (consume_stock(memcg, nr_pages))
return 0;
if (!gfpflags_allow_spinning(gfp_mask))
@@ -2457,17 +2517,47 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
-static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ if (likely(!in_nmi())) {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ } else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
+
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (idx == NR_SLAB_RECLAIMABLE_B)
+ atomic_add(nr, &pn->slab_reclaimable);
+ else
+ atomic_add(nr, &pn->slab_unreclaimable);
+ }
+}
+#else
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+}
+#endif
+
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ account_slab_nmi_safe(memcg, pgdat, idx, nr);
rcu_read_unlock();
}
@@ -2592,6 +2682,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
+ return NULL;
+
if (in_task()) {
memcg = current->active_memcg;
if (unlikely(memcg))
@@ -2654,6 +2747,23 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ if (likely(!in_nmi())) {
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+ } else {
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ atomic_add(val, &memcg->kmem_stat);
+ }
+}
+#else
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+}
+#endif
+
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2666,7 +2776,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
refill_stock(memcg, nr_pages);
@@ -2694,7 +2804,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -2762,50 +2872,27 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
obj_cgroup_put(objcg);
}
-/* Replace the stock objcg with objcg, return the old objcg */
-static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock,
- struct obj_cgroup *objcg)
-{
- struct obj_cgroup *old = NULL;
-
- old = drain_obj_stock(stock);
- obj_cgroup_get(objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
- ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- WRITE_ONCE(stock->cached_objcg, objcg);
- return old;
-}
-
-static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static void __account_obj_stock(struct obj_cgroup *objcg,
+ struct obj_stock_pcp *stock, int nr,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
int *bytes;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- stock = this_cpu_ptr(&memcg_stock);
-
/*
* Save vmstat data in stock and skip vmstat array update unless
- * accumulating over a page of vmstat data or when pgdat or idx
- * changes.
+ * accumulating over a page of vmstat data or when pgdat changes.
*/
- if (READ_ONCE(stock->cached_objcg) != objcg) {
- old = replace_stock_objcg(stock, objcg);
- stock->cached_pgdat = pgdat;
- } else if (stock->cached_pgdat != pgdat) {
+ if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
struct pglist_data *oldpg = stock->cached_pgdat;
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
@@ -2831,37 +2918,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
}
}
if (nr)
- __mod_objcg_mlstate(objcg, pgdat, idx, nr);
-
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
}
-static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
bool ret = false;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock))
+ return ret;
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
+
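+		/* When a pgdat is supplied, account the vmstat delta under the same lock. */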
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&obj_stock.lock);
return ret;
}
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock_pcp *stock)
{
struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
- return NULL;
+ return;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
@@ -2874,7 +2962,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- __refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2898,13 +2987,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
@@ -2913,63 +3002,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
}
WRITE_ONCE(stock->cached_objcg, NULL);
- /*
- * The `old' objects needs to be released by the caller via
- * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
- */
- return old;
+ obj_cgroup_put(old);
}
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
if (objcg) {
memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
+ flush = true;
}
+ rcu_read_unlock();
- return false;
+ return flush;
}
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
- bool allow_uncharge)
+ bool allow_uncharge, int nr_acct, struct pglist_data *pgdat,
+ enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
unsigned int nr_pages = 0;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock)) {
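+		/* Could not take the per-CPU cache: account everything directly, bypassing the stock. */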
+ if (pgdat)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+ nr_bytes = nr_bytes & (PAGE_SIZE - 1);
+ atomic_add(nr_bytes, &objcg->nr_charged_bytes);
+ goto out;
+ }
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
- old = replace_stock_objcg(stock, objcg);
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ WRITE_ONCE(stock->cached_objcg, objcg);
+
allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_acct, pgdat, idx);
+
if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
nr_pages = stock->nr_bytes >> PAGE_SHIFT;
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
-
+ local_unlock(&obj_stock.lock);
+out:
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
}
-int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
unsigned int nr_pages, nr_bytes;
int ret;
- if (consume_obj_stock(objcg, size))
+ if (likely(consume_obj_stock(objcg, size, pgdat, idx)))
return 0;
/*
@@ -3002,15 +3104,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
nr_pages += 1;
ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
- if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
+ if (!ret && (nr_bytes || pgdat))
+ refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0,
+ false, size, pgdat, idx);
return ret;
}
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0);
+}
+
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size, true);
+ refill_obj_stock(objcg, size, true, 0, NULL, 0);
}
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -3062,23 +3170,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
return false;
}
- if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
- return false;
-
for (i = 0; i < size; i++) {
slab = virt_to_slab(p[i]);
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
- obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
+ /*
+		 * If we fail and size is 1, memcg_alloc_abort_single() will
+		 * just free the object, which is OK as we have not assigned
+		 * an objcg to its obj_ext yet.
+		 *
+		 * For larger sizes, kmem_cache_free_bulk() will uncharge
+		 * any objects that were already charged and had obj_ext assigned.
+		 *
+		 * TODO: we could batch this until slab_pgdat(slab) changes
+		 * between iterations, with a more complicated undo.
+ */
+ if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s),
+ slab_pgdat(slab), cache_vmstat_idx(s)))
+ return false;
+
off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
slab_obj_exts(slab)[off].objcg = objcg;
- mod_objcg_state(objcg, slab_pgdat(slab),
- cache_vmstat_idx(s), obj_full_size(s));
}
return true;
@@ -3087,6 +3204,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts)
{
+ size_t obj_size = obj_full_size(s);
+
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
unsigned int off;
@@ -3097,9 +3216,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
continue;
obj_exts[off].objcg = NULL;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
- -obj_full_size(s));
+ refill_obj_stock(objcg, obj_size, true, -obj_size,
+ slab_pgdat(slab), cache_vmstat_idx(s));
obj_cgroup_put(objcg);
}
}
@@ -3541,7 +3659,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
+ pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
+ node);
if (!pn)
return false;
@@ -3588,13 +3707,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
- struct memcg_vmstats_percpu *statc, *pstatc;
+ struct memcg_vmstats_percpu *statc;
+ struct memcg_vmstats_percpu __percpu *pstatc_pcpu;
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
long error;
- memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
+ memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
if (!memcg)
return ERR_PTR(-ENOMEM);
@@ -3619,9 +3739,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
for_each_possible_cpu(cpu) {
if (parent)
- pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
+ pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent = parent ? pstatc : NULL;
+ statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
statc->vmstats = memcg->vmstats;
}
@@ -3895,6 +4015,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{
+ int nid;
+
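+	/* Fold counters accumulated in NMI context into the regular rstat state. */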
+ if (atomic_read(&memcg->kmem_stat)) {
+ int kmem = atomic_xchg(&memcg->kmem_stat, 0);
+ int index = memcg_stats_index(MEMCG_KMEM);
+
+ memcg->vmstats->state[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
+
+ if (parent)
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
+
+ if (atomic_read(&pn->slab_reclaimable)) {
+ int slab = atomic_xchg(&pn->slab_reclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ if (atomic_read(&pn->slab_unreclaimable)) {
+ int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ }
+}
+#else
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{}
+#endif
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -3903,6 +4070,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
+ flush_nmi_stats(memcg, parent, cpu);
+
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
@@ -3952,8 +4121,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic64_read(&memcg->vmstats->stats_updates))
- atomic64_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_read(&memcg->vmstats->stats_updates))
+ atomic_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
@@ -4194,6 +4363,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
page_counter_set_high(&memcg->memory, high);
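+	/* With O_NONBLOCK, update the limit but skip the synchronous reclaim loop below. */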
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -4216,7 +4388,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (!reclaimed && !nr_retries--)
break;
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4243,6 +4415,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
xchg(&memcg->memory.max, max);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -4270,7 +4445,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
break;
cond_resched();
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4393,11 +4568,13 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
enum {
MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_SWAPPINESS_MAX,
MEMORY_RECLAIM_NULL,
};
static const match_table_t tokens = {
{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
{ MEMORY_RECLAIM_NULL, NULL },
};
@@ -4431,6 +4608,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
return -EINVAL;
break;
+ case MEMORY_RECLAIM_SWAPPINESS_MAX:
+ swappiness = SWAPPINESS_ANON_ONLY;
+ break;
default:
return -EINVAL;
}
@@ -4695,9 +4875,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
- if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ memcg_uncharge(ug->memcg, ug->nr_memory);
if (ug->nr_kmem) {
mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
@@ -4973,15 +5151,16 @@ static int __init cgroup_memory(char *s)
__setup("cgroup.memory=", cgroup_memory);
/*
- * subsys_initcall() for memory controller.
+ * Memory controller init, called before cgroup_init() initializes root_mem_cgroup.
*
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
* basically everything that doesn't depend on a specific mem_cgroup structure
* should be initialized from here.
*/
-static int __init mem_cgroup_init(void)
+int __init mem_cgroup_init(void)
{
+ unsigned int memcg_size;
int cpu;
/*
@@ -4995,13 +5174,22 @@ static int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_stock);
+ drain_local_memcg_stock);
+ INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
+ drain_local_obj_stock);
+ }
+
+ memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
+ memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
+
+ memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN);
return 0;
}
-subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_SWAP
/**
@@ -5455,3 +5643,8 @@ static int __init mem_cgroup_swap_init(void)
subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
+
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+ return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+}
diff --git a/mm/memfd.c b/mm/memfd.c
index c64df1343059..ab367e61553d 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -20,6 +20,7 @@
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
+#include "swap.h"
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
diff --git a/mm/memory.c b/mm/memory.c
index 49199410805c..8eba595056fe 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -278,8 +278,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
p4d_free_tlb(tlb, p4d, start);
}
-/*
- * This function frees user-level page tables of a process.
+/**
+ * free_pgd_range - Unmap and free page tables in the range
+ * @tlb: the mmu_gather containing pending TLB flush info
+ * @addr: virtual address start
+ * @end: virtual address end
+ * @floor: lowest address boundary
+ * @ceiling: highest address boundary
+ *
+ * This function tears down all user-level page tables in the
+ * specified virtual address range [@addr..@end). It is part of
+ * the memory unmap flow.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -349,6 +358,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
{
struct unlink_vma_file_batch vb;
+ tlb_free_vmas(tlb);
+
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -518,10 +529,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -929,7 +941,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
+ pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
@@ -1361,7 +1373,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long next, pfn = 0;
+ unsigned long next;
bool is_cow;
int ret;
@@ -1371,12 +1383,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (is_vm_hugetlb_page(src_vma))
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
- if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- ret = track_pfn_copy(dst_vma, src_vma, &pfn);
- if (ret)
- return ret;
- }
-
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
@@ -1418,8 +1424,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
- if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
- untrack_pfn_copy(dst_vma, pfn);
return ret;
}
@@ -1799,7 +1803,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
addr = next;
continue;
@@ -1914,9 +1918,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file)
uprobe_munmap(vma, start, end);
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn(vma, 0, 0, mm_wr_locked);
-
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
@@ -1990,35 +1991,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
}
/**
- * zap_page_range_single - remove user pages in a given range
+ * zap_page_range_single_batched - remove user pages in a given range
+ * @tlb: pointer to the caller's struct mmu_gather
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
- * @size: number of bytes to zap
+ * @address: starting address of pages to remove
+ * @size: number of bytes to remove
* @details: details of shared cache invalidation
*
- * The range must fit into one VMA.
+ * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for
+ * hugetlb, @tlb is flushed and re-initialized by this function.
*/
-void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
const unsigned long end = address + size;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+
+ VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
- tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
/*
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(&tlb, vma, address, end, details, false);
+ unmap_single_vma(tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
+ if (is_vm_hugetlb_page(vma)) {
+ /*
+ * flush tlb and free resources before hugetlb_zap_end(), to
+		 * Flush the TLB and free resources before hugetlb_zap_end() to
+		 * avoid allocation failures in concurrent page faults.
+ tlb_finish_mmu(tlb);
+ hugetlb_zap_end(vma, details);
+ tlb_gather_mmu(tlb, vma->vm_mm);
+ }
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ zap_page_range_single_batched(&tlb, vma, address, size, details);
tlb_finish_mmu(&tlb);
- hugetlb_zap_end(vma, details);
}
/**
@@ -2525,7 +2555,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
@@ -2588,7 +2618,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
return VM_FAULT_SIGBUS;
@@ -2833,6 +2863,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
return error;
}
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
+ unsigned long size, pgprot_t *prot)
+{
+ struct pfnmap_track_ctx *ctx;
+
+ if (pfnmap_track(pfn, size, prot))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (unlikely(!ctx)) {
+ pfnmap_untrack(pfn, size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ctx->pfn = pfn;
+ ctx->size = size;
+ kref_init(&ctx->kref);
+ return ctx;
+}
+
+void pfnmap_track_ctx_release(struct kref *ref)
+{
+ struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
+
+ pfnmap_untrack(ctx->pfn, ctx->size);
+ kfree(ctx);
+}
+#endif /* __HAVE_PFNMAP_TRACKING */
+
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
@@ -2845,20 +2905,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
*
* Return: %0 on success, negative error code otherwise.
*/
+#ifdef __HAVE_PFNMAP_TRACKING
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
+ struct pfnmap_track_ctx *ctx = NULL;
int err;
- err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
- if (err)
+ size = PAGE_ALIGN(size);
+
+ /*
+ * If we cover the full VMA, we'll perform actual tracking, and
+ * remember to untrack when the last reference to our tracking
+ * context from a VMA goes away. We'll keep tracking the whole pfn
+ * range even during VMA splits and partial unmapping.
+ *
+	 * If we only cover parts of the VMA, we'll only set up the cachemode
+ * in the pgprot for the pfn range.
+ */
+ if (addr == vma->vm_start && addr + size == vma->vm_end) {
+ if (vma->pfnmap_track_ctx)
+ return -EINVAL;
+ ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ } else if (pfnmap_setup_cachemode(pfn, size, &prot)) {
return -EINVAL;
+ }
err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
- if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
+ if (ctx) {
+ if (err)
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ else
+ vma->pfnmap_track_ctx = ctx;
+ }
return err;
}
+
+#else
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+}
+#endif
EXPORT_SYMBOL(remap_pfn_range);
/**
@@ -3523,7 +3614,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = mk_pte(&new_folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(new_folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (unlikely(unshare)) {
if (pte_soft_dirty(vmf->orig_pte))
@@ -3730,7 +3821,7 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
* If all folio references are from mappings, and all mappings are in
* the page tables of this MM, then this folio is exclusive to this MM.
*/
- if (folio_test_large_maybe_mapped_shared(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
return false;
VM_WARN_ON_ONCE(folio_test_ksm(folio));
@@ -3753,7 +3844,7 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
folio_lock_large_mapcount(folio);
VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
- if (folio_test_large_maybe_mapped_shared(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
goto unlock;
if (folio_large_mapcount(folio) != folio_ref_count(folio))
goto unlock;
@@ -4579,8 +4670,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/*
* KSM sometimes has to copy on read faults, for example, if
- * page->index of !PageKSM() pages would be nonlinear inside the
- * anon VMA -- PageKSM() is lost on actual swapout.
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
*/
folio = ksm_might_need_to_copy(folio, vma, vmf->address);
if (unlikely(!folio)) {
@@ -5013,7 +5104,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
*/
__folio_mark_uptodate(folio);
- entry = mk_pte(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry), vma);
@@ -5138,9 +5229,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
- struct folio *folio = page_folio(page);
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -5188,7 +5278,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
flush_icache_pages(vma, page, HPAGE_PMD_NR);
- entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -5213,7 +5303,7 @@ out:
return ret;
}
#else
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
return VM_FAULT_FALLBACK;
}
@@ -5245,6 +5335,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_write(entry) && folio_test_dirty(folio))
+ entry = pte_mkdirty(entry);
if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
@@ -5305,6 +5397,7 @@ fallback:
else
page = vmf->page;
+ folio = page_folio(page);
/*
* check even for read faults because we might have lost our CoWed
* page
@@ -5316,8 +5409,8 @@ fallback:
}
if (pmd_none(*vmf->pmd)) {
- if (PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
+ if (folio_test_pmd_mappable(folio)) {
+ ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
@@ -5328,7 +5421,6 @@ fallback:
return VM_FAULT_OOM;
}
- folio = page_folio(page);
nr_pages = folio_nr_pages(folio);
/*
@@ -5892,7 +5984,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
split:
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -6338,258 +6430,6 @@ out:
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
-#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
-#include <linux/extable.h>
-
-static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- if (likely(mmap_read_trylock(mm)))
- return true;
-
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
-
- return !mmap_read_lock_killable(mm);
-}
-
-static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
-{
- /*
- * We don't have this operation yet.
- *
- * It should be easy enough to do: it's basically a
- * atomic_long_try_cmpxchg_acquire()
- * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
- * it also needs the proper lockdep magic etc.
- */
- return false;
-}
-
-static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- mmap_read_unlock(mm);
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
- return !mmap_write_lock_killable(mm);
-}
-
-/*
- * Helper for page fault handling.
- *
- * This is kind of equivalent to "mmap_read_lock()" followed
- * by "find_extend_vma()", except it's a lot more careful about
- * the locking (and will drop the lock on failure).
- *
- * For example, if we have a kernel bug that causes a page
- * fault, we don't want to just use mmap_read_lock() to get
- * the mm lock, because that would deadlock if the bug were
- * to happen while we're holding the mm lock for writing.
- *
- * So this checks the exception tables on kernel faults in
- * order to only do this all for instructions that are actually
- * expected to fault.
- *
- * We can also actually take the mm lock for writing if we
- * need to extend the vma, which helps the VM layer a lot.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- if (!get_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (likely(vma && (vma->vm_start <= addr)))
- return vma;
-
- /*
- * Well, dang. We might still be successful, but only
- * if we can extend a vma to do so.
- */
- if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
- mmap_read_unlock(mm);
- return NULL;
- }
-
- /*
- * We can try to upgrade the mmap lock atomically,
- * in which case we can continue to use the vma
- * we already looked up.
- *
- * Otherwise we'll have to drop the mmap lock and
- * re-take it, and also look up the vma again,
- * re-checking it.
- */
- if (!mmap_upgrade_trylock(mm)) {
- if (!upgrade_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (!vma)
- goto fail;
- if (vma->vm_start <= addr)
- goto success;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto fail;
- }
-
- if (expand_stack_locked(vma, addr))
- goto fail;
-
-success:
- mmap_write_downgrade(mm);
- return vma;
-
-fail:
- mmap_write_unlock(mm);
- return NULL;
-}
-#endif
-
-#ifdef CONFIG_PER_VMA_LOCK
-static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
-{
- unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
-
- /* Additional refcnt if the vma is attached. */
- if (!detaching)
- tgt_refcnt++;
-
- /*
- * If vma is detached then only vma_mark_attached() can raise the
- * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
- */
- if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
- return false;
-
- rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
- rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
- refcount_read(&vma->vm_refcnt) == tgt_refcnt,
- TASK_UNINTERRUPTIBLE);
- lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
-
- return true;
-}
-
-static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
-{
- *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
-}
-
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
-{
- bool locked;
-
- /*
- * __vma_enter_locked() returns false immediately if the vma is not
- * attached, otherwise it waits until refcnt is indicating that vma
- * is attached with no readers.
- */
- locked = __vma_enter_locked(vma, false);
-
- /*
- * We should use WRITE_ONCE() here because we can have concurrent reads
- * from the early lockless pessimistic check in vma_start_read().
- * We don't really care about the correctness of that early check, but
- * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
- */
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-
- if (locked) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(detached); /* vma should remain attached */
- }
-}
-EXPORT_SYMBOL_GPL(__vma_start_write);
-
-void vma_mark_detached(struct vm_area_struct *vma)
-{
- vma_assert_write_locked(vma);
- vma_assert_attached(vma);
-
- /*
- * We are the only writer, so no need to use vma_refcount_put().
- * The condition below is unlikely because the vma has been already
- * write-locked and readers can increment vm_refcnt only temporarily
- * before they check vm_lock_seq, realize the vma is locked and drop
- * back the vm_refcnt. That is a narrow window for observing a raised
- * vm_refcnt.
- */
- if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
- /* Wait until vma is detached with no readers. */
- if (__vma_enter_locked(vma, true)) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(!detached);
- }
- }
-}
-
-/*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
- */
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
-{
- MA_STATE(mas, &mm->mm_mt, address, address);
- struct vm_area_struct *vma;
-
- rcu_read_lock();
-retry:
- vma = mas_walk(&mas);
- if (!vma)
- goto inval;
-
- vma = vma_start_read(mm, vma);
- if (IS_ERR_OR_NULL(vma)) {
- /* Check if the VMA got isolated after we found it */
- if (PTR_ERR(vma) == -EAGAIN) {
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- goto retry;
- }
-
- /* Failed to lock the VMA */
- goto inval;
- }
- /*
- * At this point, we have a stable reference to a VMA: The VMA is
- * locked and we know it hasn't already been isolated.
- * From here on, we can access the VMA without worrying about which
- * fields are accessible for RCU readers.
- */
-
- /* Check if the vma we locked is the right one. */
- if (unlikely(vma->vm_mm != mm ||
- address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
-
- rcu_read_unlock();
- return vma;
-
-inval_end_read:
- vma_end_read(vma);
-inval:
- rcu_read_unlock();
- count_vm_vma_lock_event(VMA_LOCK_ABORT);
- return NULL;
-}
-#endif /* CONFIG_PER_VMA_LOCK */
-
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8305483de38b..b1caedbade5b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1756,12 +1756,10 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
{
unsigned long pfn;
- for (pfn = start; pfn < end; pfn++) {
+ for_each_valid_pfn(pfn, start, end) {
struct page *page;
struct folio *folio;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
if (PageLRU(page))
goto found;
@@ -1805,11 +1803,9 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for_each_valid_pfn(pfn, start_pfn, end_pfn) {
struct page *page;
- if (!pfn_valid(pfn))
- continue;
page = pfn_to_page(pfn);
folio = page_folio(page);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b28a1e6ae096..72fd72e156b1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -109,10 +109,12 @@
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
+#include <linux/gcd.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
+#include <linux/memory.h>
#include "internal.h"
@@ -139,31 +141,138 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). A value of 32 has been
+ * found to be a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode="auto",
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
*/
-static u8 __rcu *iw_table;
-static DEFINE_MUTEX(iw_table_lock);
+struct weighted_interleave_state {
+ bool mode_auto;
+ u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
+ */
+static DEFINE_MUTEX(wi_state_lock);
static u8 get_il_weight(int node)
{
- u8 *table;
- u8 weight;
+ struct weighted_interleave_state *state;
+ u8 weight = 1;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- /* if no iw_table, use system default */
- weight = table ? table[node] : 1;
- /* if value in iw_table is 0, use system default */
- weight = weight ? weight : 1;
+ state = rcu_dereference(wi_state);
+ if (state)
+ weight = state->iw_table[node];
rcu_read_unlock();
return weight;
}
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock held.
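+ * e.g. bandwidths {100, 50} become 32 * bw / sum = {21, 10}; equal bandwidths reduce to all 1s via the GCD step.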
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+ u64 sum_bw = 0;
+ unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ sum_bw += bw[nid];
+
+ /* Scale bandwidths to whole numbers in the range [1, weightiness] */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Try not to perform 64-bit division.
+ * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+ * If sum_bw > scaling_factor, then round the weight up to 1.
+ */
+ scaling_factor = weightiness * bw[nid];
+ if (bw[nid] && sum_bw < scaling_factor) {
+ cast_sum_bw = (unsigned int)sum_bw;
+ new_iw[nid] = scaling_factor / cast_sum_bw;
+ } else {
+ new_iw[nid] = 1;
+ }
+ if (!iw_gcd)
+ iw_gcd = new_iw[nid];
+ iw_gcd = gcd(iw_gcd, new_iw[nid]);
+ }
+
+ /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+ for_each_node_state(nid, N_MEMORY)
+ new_iw[nid] /= iw_gcd;
+}
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *old_bw, *new_bw;
+ unsigned int bw_val;
+ int i;
+
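+	/* A node's effective bandwidth is the smaller of its read and write bandwidth. */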
+ bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+ new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ if (!new_bw)
+ return -ENOMEM;
+
+ new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state) {
+ kfree(new_bw);
+ return -ENOMEM;
+ }
+ new_wi_state->mode_auto = true;
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+
+ /*
+ * Update bandwidth info, even in manual mode. That way, when switching
+ * to auto mode in the future, iw_table can be overwritten using
+ * accurate bw data.
+ */
+ mutex_lock(&wi_state_lock);
+
+ old_bw = node_bw_table;
+ if (old_bw)
+ memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+ new_bw[node] = bw_val;
+ node_bw_table = new_bw;
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state && !old_wi_state->mode_auto) {
+ /* Manual mode; skip reducing weights and updating wi_state */
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ goto out;
+ }
+
+	/* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+ reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+ rcu_assign_pointer(wi_state, new_wi_state);
+
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+out:
+ kfree(old_bw);
+ return 0;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -566,6 +675,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct vm_area_struct *vma = walk->vma;
struct folio *folio;
struct queue_pages *qp = walk->private;
@@ -573,6 +683,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
+ int max_nr, nr;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -586,7 +697,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+ max_nr = (end - addr) >> PAGE_SHIFT;
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -598,6 +711,10 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_test_large(folio) && max_nr != 1)
+ nr = folio_pte_batch(folio, addr, pte, ptent,
+ max_nr, fpb_flags,
+ NULL, NULL, NULL);
/*
* vm_normal_folio() filters out zero pages, but there might
* still be reserved folios to skip, perhaps in a VDSO.
@@ -630,7 +747,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(vma) ||
!migrate_folio_add(folio, qp->pagelist, flags)) {
- qp->nr_failed++;
+ qp->nr_failed += nr;
if (strictly_unmovable(flags))
break;
}
@@ -2014,26 +2131,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
+ struct weighted_interleave_state *state;
nodemask_t nodemask;
unsigned int target, nr_nodes;
- u8 *table;
+ u8 *table = NULL;
unsigned int weight_total = 0;
u8 weight;
- int nid;
+ int nid = 0;
nr_nodes = read_once_policy_nodemask(pol, &nodemask);
if (!nr_nodes)
return numa_node_id();
rcu_read_lock();
- table = rcu_dereference(iw_table);
+
+ state = rcu_dereference(wi_state);
+ /* Uninitialized wi_state means we should assume all weights are 1 */
+ if (state)
+ table = state->iw_table;
+
/* calculate the total weight */
- for_each_node_mask(nid, nodemask) {
- /* detect system default usage */
- weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
- weight_total += weight;
- }
+ for_each_node_mask(nid, nodemask)
+ weight_total += table ? table[nid] : 1;
/* Calculate the node offset based on totals */
target = ilx % weight_total;
@@ -2041,7 +2160,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
while (target) {
/* detect system default usage */
weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
if (target < weight)
break;
target -= weight;
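
Once weights exist, the selection loop above walks the nodes subtracting weights from ilx % weight_total. Below is a hedged userspace sketch using the made-up 21:10 weights from the earlier example, with the nodemask walk reduced to a plain array; it only approximates the kernel loop.

#include <stdio.h>

int main(void)
{
	unsigned char weights[2] = { 21, 10 };	/* hypothetical iw_table */
	unsigned int weight_total = 31;
	unsigned long samples[] = { 0, 20, 21, 30, 31, 61 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned int target = samples[i] % weight_total;
		int nid = 0;

		/* skip whole nodes until the remainder falls inside one */
		while (target >= weights[nid]) {
			target -= weights[nid];
			nid++;
		}
		printf("ilx %2lu -> node%d\n", samples[i], nid);
	}
	/* indices 0..20 land on node0, 21..30 on node1, then it repeats */
	return 0;
}
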
@@ -2442,13 +2560,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
+ struct weighted_interleave_state *state;
struct task_struct *me = current;
unsigned int cpuset_mems_cookie;
unsigned long total_allocated = 0;
unsigned long nr_allocated = 0;
unsigned long rounds;
unsigned long node_pages, delta;
- u8 *table, *weights, weight;
+ u8 *weights, weight;
unsigned int weight_total = 0;
unsigned long rem_pages = nr_pages;
nodemask_t nodes;
@@ -2498,17 +2617,19 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
return total_allocated;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- if (table)
- memcpy(weights, table, nr_node_ids);
- rcu_read_unlock();
+ state = rcu_dereference(wi_state);
+ if (state) {
+ memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ for (i = 0; i < nr_node_ids; i++)
+ weights[i] = 1;
+ }
/* calculate total, detect system default usage */
- for_each_node_mask(node, nodes) {
- if (!weights[node])
- weights[node] = 1;
+ for_each_node_mask(node, nodes)
weight_total += weights[node];
- }
/*
* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
@@ -3419,6 +3540,14 @@ struct iw_node_attr {
int nid;
};
+struct sysfs_wi_group {
+ struct kobject wi_kobj;
+ struct mutex kobj_lock;
+ struct iw_node_attr *nattrs[];
+};
+
+static struct sysfs_wi_group *wi_group;
+
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -3433,177 +3562,318 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
struct iw_node_attr *node_attr;
- u8 *new;
- u8 *old;
u8 weight = 0;
+ int i;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
- if (count == 0 || sysfs_streq(buf, ""))
- weight = 0;
- else if (kstrtou8(buf, 0, &weight))
+ if (count == 0 || sysfs_streq(buf, "") ||
+ kstrtou8(buf, 0, &weight) || weight == 0)
return -EINVAL;
- new = kzalloc(nr_node_ids, GFP_KERNEL);
- if (!new)
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
return -ENOMEM;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- if (old)
- memcpy(new, old, nr_node_ids);
- new[node_attr->nid] = weight;
- rcu_assign_pointer(iw_table, new);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state) {
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ } else {
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+ }
+ new_wi_state->iw_table[node_attr->nid] = weight;
+ new_wi_state->mode_auto = false;
+
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
return count;
}
-static struct iw_node_attr **node_attrs;
-
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
- struct kobject *parent)
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- if (!node_attr)
- return;
- sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
+ struct weighted_interleave_state *state;
+ bool wi_auto = true;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state)
+ wi_auto = state->mode_auto;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
}
-static void sysfs_wi_release(struct kobject *wi_kobj)
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *bw;
+ bool input;
int i;
+ if (kstrtobool(buf, &input))
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
for (i = 0; i < nr_node_ids; i++)
- sysfs_wi_node_release(node_attrs[i], wi_kobj);
- kobject_put(wi_kobj);
+ new_wi_state->iw_table[i] = 1;
+
+ mutex_lock(&wi_state_lock);
+ if (!input) {
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state)
+ goto update_wi_state;
+ if (input == old_wi_state->mode_auto) {
+ mutex_unlock(&wi_state_lock);
+			kfree(new_wi_state);
+			return count;
+ }
+
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ goto update_wi_state;
+ }
+
+ bw = node_bw_table;
+ if (!bw) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return -ENODEV;
+ }
+
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
+
+static void sysfs_wi_node_delete(int nid)
+{
+ struct iw_node_attr *attr;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return;
+
+ mutex_lock(&wi_group->kobj_lock);
+ attr = wi_group->nattrs[nid];
+ if (!attr) {
+ mutex_unlock(&wi_group->kobj_lock);
+ return;
+ }
+
+ wi_group->nattrs[nid] = NULL;
+ mutex_unlock(&wi_group->kobj_lock);
+
+ sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+ kfree(attr->kobj_attr.attr.name);
+ kfree(attr);
+}
+
+static void sysfs_wi_node_delete_all(void)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++)
+ sysfs_wi_node_delete(nid);
+}
+
+static void wi_state_free(void)
+{
+ struct weighted_interleave_state *old_wi_state;
+
+ mutex_lock(&wi_state_lock);
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state) {
+ mutex_unlock(&wi_state_lock);
+		return;
+ }
+
+ rcu_assign_pointer(wi_state, NULL);
+ mutex_unlock(&wi_state_lock);
+ synchronize_rcu();
+ kfree(old_wi_state);
+}
+
+static struct kobj_attribute wi_auto_attr =
+ __ATTR(auto, 0664, weighted_interleave_auto_show,
+ weighted_interleave_auto_store);
+
+static void wi_cleanup(void)
+{
+ sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ sysfs_wi_node_delete_all();
+ wi_state_free();
+}
+
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+ kfree(wi_group);
}
static const struct kobj_type wi_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = sysfs_wi_release,
+ .release = wi_kobj_release,
};
-static int add_weight_node(int nid, struct kobject *wi_kobj)
+static int sysfs_wi_node_add(int nid)
{
- struct iw_node_attr *node_attr;
+ int ret;
char *name;
+ struct iw_node_attr *new_attr;
- node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
- if (!node_attr)
+ if (nid < 0 || nid >= nr_node_ids) {
+ pr_err("invalid node id: %d\n", nid);
+ return -EINVAL;
+ }
+
+ new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+ if (!new_attr)
return -ENOMEM;
name = kasprintf(GFP_KERNEL, "node%d", nid);
if (!name) {
- kfree(node_attr);
+ kfree(new_attr);
return -ENOMEM;
}
- sysfs_attr_init(&node_attr->kobj_attr.attr);
- node_attr->kobj_attr.attr.name = name;
- node_attr->kobj_attr.attr.mode = 0644;
- node_attr->kobj_attr.show = node_show;
- node_attr->kobj_attr.store = node_store;
- node_attr->nid = nid;
+ sysfs_attr_init(&new_attr->kobj_attr.attr);
+ new_attr->kobj_attr.attr.name = name;
+ new_attr->kobj_attr.attr.mode = 0644;
+ new_attr->kobj_attr.show = node_show;
+ new_attr->kobj_attr.store = node_store;
+ new_attr->nid = nid;
- if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
- pr_err("failed to add attribute to weighted_interleave\n");
- return -ENOMEM;
+ mutex_lock(&wi_group->kobj_lock);
+ if (wi_group->nattrs[nid]) {
+ mutex_unlock(&wi_group->kobj_lock);
+ ret = -EEXIST;
+ goto out;
}
- node_attrs[nid] = node_attr;
+ ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
+ if (ret) {
+ mutex_unlock(&wi_group->kobj_lock);
+ goto out;
+ }
+ wi_group->nattrs[nid] = new_attr;
+ mutex_unlock(&wi_group->kobj_lock);
return 0;
+
+out:
+ kfree(new_attr->kobj_attr.attr.name);
+ kfree(new_attr);
+ return ret;
}
-static int add_weighted_interleave_group(struct kobject *root_kobj)
+static int wi_node_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ int err;
+ struct memory_notify *arg = data;
+ int nid = arg->status_change_nid;
+
+ if (nid < 0)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case MEM_ONLINE:
+ err = sysfs_wi_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs for node%d during hotplug: %d\n",
+ nid, err);
+ break;
+ case MEM_OFFLINE:
+ sysfs_wi_node_delete(nid);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
- struct kobject *wi_kobj;
int nid, err;
- wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (!wi_kobj)
+ wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
+ GFP_KERNEL);
+ if (!wi_group)
return -ENOMEM;
+ mutex_init(&wi_group->kobj_lock);
- err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
"weighted_interleave");
- if (err) {
- kfree(wi_kobj);
- return err;
- }
+ if (err)
+ goto err_put_kobj;
- for_each_node_state(nid, N_POSSIBLE) {
- err = add_weight_node(nid, wi_kobj);
+ err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ if (err)
+ goto err_put_kobj;
+
+ for_each_online_node(nid) {
+ if (!node_state(nid, N_MEMORY))
+ continue;
+
+ err = sysfs_wi_node_add(nid);
if (err) {
- pr_err("failed to add sysfs [node%d]\n", nid);
- break;
+ pr_err("failed to add sysfs for node%d during init: %d\n",
+ nid, err);
+ goto err_cleanup_kobj;
}
}
- if (err)
- kobject_put(wi_kobj);
- return 0;
-}
-static void mempolicy_kobj_release(struct kobject *kobj)
-{
- u8 *old;
+ hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
+ return 0;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
- kfree(node_attrs);
- kfree(kobj);
+err_cleanup_kobj:
+ wi_cleanup();
+ kobject_del(&wi_group->wi_kobj);
+err_put_kobj:
+ kobject_put(&wi_group->wi_kobj);
+ return err;
}
-static const struct kobj_type mempolicy_ktype = {
- .release = mempolicy_kobj_release
-};
-
static int __init mempolicy_sysfs_init(void)
{
int err;
static struct kobject *mempolicy_kobj;
- mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
- if (!mempolicy_kobj) {
- err = -ENOMEM;
- goto err_out;
- }
-
- node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
- GFP_KERNEL);
- if (!node_attrs) {
- err = -ENOMEM;
- goto mempol_out;
- }
+ mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+ if (!mempolicy_kobj)
+ return -ENOMEM;
- err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
- "mempolicy");
+ err = add_weighted_interleave_group(mempolicy_kobj);
if (err)
- goto node_out;
+ goto err_kobj;
- err = add_weighted_interleave_group(mempolicy_kobj);
- if (err) {
- pr_err("mempolicy sysfs structure failed to initialize\n");
- kobject_put(mempolicy_kobj);
- return err;
- }
+ return 0;
- return err;
-node_out:
- kfree(node_attrs);
-mempol_out:
- kfree(mempolicy_kobj);
-err_out:
- pr_err("failed to add mempolicy kobject to the system\n");
+err_kobj:
+ kobject_del(mempolicy_kobj);
+ kobject_put(mempolicy_kobj);
return err;
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 2aebc1b192da..c417c843e9b1 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -130,7 +130,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
pgmap_array_delete(range);
}
@@ -211,8 +211,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
if (nid < 0)
nid = numa_mem_id();
- error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
- range_len(range));
+ error = pfnmap_track(PHYS_PFN(range->start), range_len(range),
+ &params->pgprot);
if (error)
goto err_pfn_remap;
@@ -277,7 +277,7 @@ err_add_memory:
if (!is_private)
kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
- untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true);
+ pfnmap_untrack(PHYS_PFN(range->start), range_len(range));
err_pfn_remap:
pgmap_array_delete(range);
return error;
diff --git a/mm/migrate.c b/mm/migrate.c
index c80591514e66..8cf0f9c9599d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -50,6 +50,7 @@
#include <trace/events/migrate.h>
#include "internal.h"
+#include "swap.h"
bool isolate_movable_page(struct page *page, isolate_mode_t mode)
{
@@ -445,20 +446,6 @@ unlock:
}
#endif
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
-
/*
* Replace the folio in the mapping.
*
@@ -601,7 +588,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count)
{
- int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+ int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
@@ -618,7 +605,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src)
{
XA_STATE(xas, &mapping->i_pages, folio_index(src));
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -749,7 +736,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, void *src_private,
enum migrate_mode mode)
{
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
/* Check whether src does not have extra refs before we do more work */
if (folio_ref_count(src) != expected_count)
@@ -837,7 +824,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = folio_expected_refs(mapping, src);
+ expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
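
The "+ 1" added at each call site above suggests that, unlike the removed folio_expected_refs(), the new helper does not count the reference held by the caller itself. A rough userspace sketch of that bookkeeping follows; it ignores page-table mappings (which the real folio_expected_ref_count() also counts) and the folio parameters are invented.

#include <stdio.h>
#include <stdbool.h>

static int old_expected_refs(bool in_pagecache, int nr_pages, bool has_private)
{
	int refs = 1;				/* caller's reference, counted here */

	if (!in_pagecache)
		return refs;
	refs += nr_pages;			/* one per page in the page cache */
	if (has_private)
		refs++;				/* PG_private buffers */
	return refs;
}

static int expected_ref_count(bool in_pagecache, int nr_pages, bool has_private)
{
	int refs = 0;				/* caller's reference not included */

	if (in_pagecache) {
		refs += nr_pages;
		if (has_private)
			refs++;
	}
	return refs;
}

int main(void)
{
	/* e.g. an order-2 (4-page) pagecache folio with buffers attached */
	printf("old helper %d, new helper + 1 = %d\n",
	       old_expected_refs(true, 4, true),
	       expected_ref_count(true, 4, true) + 1);	/* both print 6 */
	return 0;
}
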
diff --git a/mm/mincore.c b/mm/mincore.c
index 832f29f46767..42d6c9c8da86 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include "swap.h"
+#include "internal.h"
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -105,6 +106,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *ptep;
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
+ int step, i;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -118,16 +120,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ for (; addr != end; ptep += step, addr += step * PAGE_SIZE) {
pte_t pte = ptep_get(ptep);
+ step = 1;
/* We need to do cache lookup too for pte markers */
if (pte_none_mostly(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
- else if (pte_present(pte))
- *vec = 1;
- else { /* pte is a swap entry */
+ else if (pte_present(pte)) {
+ unsigned int batch = pte_batch_hint(ptep, pte);
+
+ if (batch > 1) {
+ unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+
+ step = min_t(unsigned int, batch, max_nr);
+ }
+
+ for (i = 0; i < step; i++)
+ vec[i] = 1;
+ } else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
@@ -146,7 +158,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
#endif
}
}
- vec++;
+ vec += step;
}
pte_unmap_unlock(ptep - 1, ptl);
out:
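
A small sketch of the batched stepping introduced above; the per-entry batch sizes are invented stand-ins for pte_batch_hint() and the PAGE_SIZE arithmetic is dropped.

#include <stdio.h>

int main(void)
{
	unsigned char vec[8] = { 0 };
	unsigned int hints[8] = { 4, 4, 4, 4, 2, 2, 2, 2 };	/* hypothetical */
	unsigned int nr = 8, i = 0, iterations = 0;

	while (i < nr) {
		unsigned int max_nr = nr - i;
		unsigned int step = hints[i] < max_nr ? hints[i] : max_nr;
		unsigned int j;

		for (j = 0; j < step; j++)
			vec[i + j] = 1;		/* whole batch reported resident */
		i += step;	/* mirrors ptep += step, addr += step * PAGE_SIZE */
		iterations++;
	}
	printf("%u entries filled in %u iterations\n", nr, iterations);
	/* 8 entries in 3 iterations rather than 8 single-PTE passes */
	return 0;
}
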
diff --git a/mm/mm_init.c b/mm/mm_init.c
index eedce9321e13..f2944748f526 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -30,6 +30,7 @@
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include <linux/vmstat.h>
+#include <linux/kexec_handover.h>
#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
@@ -743,7 +744,7 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_deferred_page(unsigned long pfn, int nid)
+static void __meminit __init_deferred_page(unsigned long pfn, int nid)
{
if (early_page_initialised(pfn, nid))
return;
@@ -763,11 +764,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_deferred_page(unsigned long pfn, int nid)
+static inline void __init_deferred_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void __meminit init_deferred_page(unsigned long pfn, int nid)
+{
+ __init_deferred_page(pfn, nid);
+}
+
/*
* Initialised pages do not have PageReserved set. This function is
* called for each range allocated by the bootmem allocator and
@@ -777,22 +783,19 @@ static inline void init_deferred_page(unsigned long pfn, int nid)
void __meminit reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid)
{
- unsigned long start_pfn = PFN_DOWN(start);
- unsigned long end_pfn = PFN_UP(end);
+ unsigned long pfn;
- for (; start_pfn < end_pfn; start_pfn++) {
- if (pfn_valid(start_pfn)) {
- struct page *page = pfn_to_page(start_pfn);
+ for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
+ struct page *page = pfn_to_page(pfn);
- init_deferred_page(start_pfn, nid);
+ __init_deferred_page(pfn, nid);
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
@@ -828,7 +831,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
- * - non-memory regions covered by the contigious flatmem mapping
+ * - non-memory regions covered by the contiguous flatmem mapping
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
@@ -848,11 +851,7 @@ static void __init init_unavailable_range(unsigned long spfn,
unsigned long pfn;
u64 pgcnt = 0;
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(pageblock_start_pfn(pfn))) {
- pfn = pageblock_end_pfn(pfn) - 1;
- continue;
- }
+ for_each_valid_pfn(pfn, spfn, epfn) {
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
@@ -1510,7 +1509,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order = MAX_PAGE_ORDER;
+ unsigned int order = PAGE_BLOCK_ORDER;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
@@ -1907,7 +1906,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
free_area_init_node(nid);
/*
- * No sysfs hierarcy will be created via register_one_node()
+ * No sysfs hierarchy will be created via register_one_node()
*for memory-less node because here it's not marked as N_MEMORY
*and won't be set online later. The benefit is userspace
*program won't be confused by sysfs files/directories of
@@ -2667,12 +2666,6 @@ static void __init report_meminit(void)
stack = "all(pattern)";
else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
stack = "all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user(zero)";
else
stack = "off";
@@ -2765,6 +2758,13 @@ void __init mm_core_init(void)
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
+
+ /*
+ * KHO memory setup must happen while memblock is still active, but
+ * as close as possible to buddy initialization
+ */
+ kho_memory_init();
+
memblock_free_all();
mem_init();
kmem_cache_init();
diff --git a/mm/mmap.c b/mm/mmap.c
index bd210aaf7ebd..09c563c95112 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -475,7 +475,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op->mmap)
+ if (!file_has_valid_mmap_hooks(file))
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
@@ -1321,48 +1321,6 @@ destroy:
vm_unacct_memory(nr_accounted);
}
-/* Insert vm structure into process list sorted by address
- * and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_rwsem is taken here.
- */
-int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- unsigned long charged = vma_pages(vma);
-
-
- if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
- return -ENOMEM;
-
- if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
-
- /*
- * The vm_pgoff of a purely anonymous vma should be irrelevant
- * until its first write fault, when page's anon_vma and index
- * are set. But now set the vm_pgoff it will almost certainly
- * end up with (unless mremap moves it elsewhere before that
- * first wfault), so /proc/pid/maps tells a consistent story.
- *
- * By setting it to reflect the virtual start address of the
- * vma, merges and splits can happen in a seamless way, just
- * using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap and in do_brk_flags.
- */
- if (vma_is_anonymous(vma)) {
- BUG_ON(vma->anon_vma);
- vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
- }
-
- if (vma_link(mm, vma)) {
- if (vma->vm_flags & VM_ACCOUNT)
- vm_unacct_memory(charged);
- return -ENOMEM;
- }
-
- return 0;
-}
-
/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
@@ -1596,7 +1554,7 @@ static const struct ctl_table mmap_table[] = {
#endif /* CONFIG_SYSCTL */
/*
- * initialise the percpu counter for VM
+ * initialise the percpu counter for VM, initialise VMA state.
*/
void __init mmap_init(void)
{
@@ -1607,6 +1565,7 @@ void __init mmap_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_init("vm", mmap_table);
#endif
+ vma_state_init();
}
/*
@@ -1718,90 +1677,6 @@ static int __meminit init_reserve_notifier(void)
subsys_initcall(init_reserve_notifier);
/*
- * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
- * this VMA and its relocated range, which will now reside at [vma->vm_start -
- * shift, vma->vm_end - shift).
- *
- * This function is almost certainly NOT what you want for anything other than
- * early executable temporary stack relocation.
- */
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
-{
- /*
- * The process proceeds as follows:
- *
- * 1) Use shift to calculate the new vma endpoints.
- * 2) Extend vma to cover both the old and new ranges. This ensures the
- * arguments passed to subsequent functions are consistent.
- * 3) Move vma's page tables to the new range.
- * 4) Free up any cleared pgd range.
- * 5) Shrink the vma to cover only the new range.
- */
-
- struct mm_struct *mm = vma->vm_mm;
- unsigned long old_start = vma->vm_start;
- unsigned long old_end = vma->vm_end;
- unsigned long length = old_end - old_start;
- unsigned long new_start = old_start - shift;
- unsigned long new_end = old_end - shift;
- VMA_ITERATOR(vmi, mm, new_start);
- VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
- struct vm_area_struct *next;
- struct mmu_gather tlb;
- PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
-
- BUG_ON(new_start > new_end);
-
- /*
- * ensure there are no vmas between where we want to go
- * and where we are
- */
- if (vma != vma_next(&vmi))
- return -EFAULT;
-
- vma_iter_prev_range(&vmi);
- /*
- * cover the whole range: [new_start, old_end)
- */
- vmg.middle = vma;
- if (vma_expand(&vmg))
- return -ENOMEM;
-
- /*
- * move the page tables downwards, on failure we rely on
- * process cleanup to remove whatever mess we made.
- */
- pmc.for_stack = true;
- if (length != move_page_tables(&pmc))
- return -ENOMEM;
-
- tlb_gather_mmu(&tlb, mm);
- next = vma_next(&vmi);
- if (new_end > old_start) {
- /*
- * when the old and new regions overlap clear from new_end.
- */
- free_pgd_range(&tlb, new_end, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- } else {
- /*
- * otherwise, clean from old_start; this is done to not touch
- * the address space in [new_end, old_start) some architectures
- * have constraints on va-space that make this illegal (IA64) -
- * for the others its just a little faster.
- */
- free_pgd_range(&tlb, old_start, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- }
- tlb_finish_mmu(&tlb);
-
- vma_prev(&vmi);
- /* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
-}
-
-#ifdef CONFIG_MMU
-/*
* Obtain a read lock on mm->mmap_lock, if the specified address is below the
* start of the VMA, the intent is to perform a write, and it is a
* downward-growing stack, then attempt to expand the stack to contain it.
@@ -1844,10 +1719,175 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
mmap_write_downgrade(mm);
return true;
}
-#else
-bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, bool write)
+
+__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- return false;
+ struct vm_area_struct *mpnt, *tmp;
+ int retval;
+ unsigned long charge = 0;
+ LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, 0);
+
+ if (mmap_write_lock_killable(oldmm))
+ return -EINTR;
+ flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
+ /*
+ * Not linked in yet - no deadlock potential:
+ */
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
+
+ /* No ordering required: file already has been exposed. */
+ dup_mm_exe_file(mm, oldmm);
+
+ mm->total_vm = oldmm->total_vm;
+ mm->data_vm = oldmm->data_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
+ goto out;
+
+ mt_clear_in_rcu(vmi.mas.tree);
+ for_each_vma(vmi, mpnt) {
+ struct file *file;
+
+ vma_start_write(mpnt);
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+ continue;
+ }
+ charge = 0;
+ /*
+ * Don't duplicate many vmas if we've been oom-killed (for
+ * example)
+ */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto loop_out;
+ }
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+ goto fail_nomem;
+ charge = len;
+ }
+
+ tmp = vm_area_dup(mpnt);
+ if (!tmp)
+ goto fail_nomem;
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
+ goto fail_nomem_policy;
+ tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
+ if (tmp->vm_flags & VM_WIPEONFORK) {
+ /*
+ * VM_WIPEONFORK gets a clean slate in the child.
+ * Don't prepare anon_vma until fault since we don't
+ * copy page for current vma.
+ */
+ tmp->anon_vma = NULL;
+ } else if (anon_vma_fork(tmp, mpnt))
+ goto fail_nomem_anon_vma_fork;
+ vm_flags_clear(tmp, VM_LOCKED_MASK);
+ /*
+ * Copy/update hugetlb private vma information.
+ */
+ if (is_vm_hugetlb_page(tmp))
+ hugetlb_dup_vma_private(tmp);
+
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
+
+ mm->map_count++;
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ file = tmp->vm_file;
+ if (file) {
+ struct address_space *mapping = file->f_mapping;
+
+ get_file(file);
+ i_mmap_lock_write(mapping);
+ if (vma_is_shared_maywrite(tmp))
+ mapping_allow_writable(mapping);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ if (!(tmp->vm_flags & VM_WIPEONFORK))
+ retval = copy_page_range(tmp, mpnt);
+
+ if (retval) {
+ mpnt = vma_next(&vmi);
+ goto loop_out;
+ }
+ }
+ /* a new mm has just been created */
+ retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ vma_iter_free(&vmi);
+ if (!retval) {
+ mt_set_in_rcu(vmi.mas.tree);
+ ksm_fork(mm, oldmm);
+ khugepaged_fork(mm, oldmm);
+ } else {
+
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ if (mpnt) {
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ /* Avoid OOM iterating a broken tree */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ }
+ /*
+ * The mm_struct is going to exit, but the locks will be dropped
+		 * first. Setting the mm_struct as unstable is advisable as it is
+ * not fully initialised.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+ }
+out:
+ mmap_write_unlock(mm);
+ flush_tlb_mm(oldmm);
+ mmap_write_unlock(oldmm);
+ if (!retval)
+ dup_userfaultfd_complete(&uf);
+ else
+ dup_userfaultfd_fail(&uf);
+ return retval;
+
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+ vm_area_free(tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto loop_out;
}
-#endif
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index e7dbaf96aa17..5f725cc67334 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -42,3 +42,276 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+ unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+ /* Additional refcnt if the vma is attached. */
+ if (!detaching)
+ tgt_refcnt++;
+
+ /*
+ * If vma is detached then only vma_mark_attached() can raise the
+ * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ */
+ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ return false;
+
+ rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+ TASK_UNINTERRUPTIBLE);
+ lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+ return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+ *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+{
+ bool locked;
+
+ /*
+ * __vma_enter_locked() returns false immediately if the vma is not
+ * attached, otherwise it waits until refcnt is indicating that vma
+ * is attached with no readers.
+ */
+ locked = __vma_enter_locked(vma, false);
+
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+ if (locked) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(detached); /* vma should remain attached */
+ }
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
+
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * We are the only writer, so no need to use vma_refcount_put().
+ * The condition below is unlikely because the vma has been already
+ * write-locked and readers can increment vm_refcnt only temporarily
+ * before they check vm_lock_seq, realize the vma is locked and drop
+ * back the vm_refcnt. That is a narrow window for observing a raised
+ * vm_refcnt.
+ */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /* Wait until vma is detached with no readers. */
+ if (__vma_enter_locked(vma, true)) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(!detached);
+ }
+ }
+}
+
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma;
+
+ rcu_read_lock();
+retry:
+ vma = mas_walk(&mas);
+ if (!vma)
+ goto inval;
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /* Check if the VMA got isolated after we found it */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
+
+ /* Failed to lock the VMA */
+ goto inval;
+ }
+ /*
+ * At this point, we have a stable reference to a VMA: The VMA is
+ * locked and we know it hasn't already been isolated.
+ * From here on, we can access the VMA without worrying about which
+ * fields are accessible for RCU readers.
+ */
+
+ /* Check if the vma we locked is the right one. */
+ if (unlikely(vma->vm_mm != mm ||
+ address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
+
+ rcu_read_unlock();
+ return vma;
+
+inval_end_read:
+ vma_end_read(vma);
+inval:
+ rcu_read_unlock();
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ if (likely(mmap_read_trylock(mm)))
+ return true;
+
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically a
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
+
+#else /* CONFIG_MMU */
+
+/*
+ * At least xtensa ends up having protection faults even with no
+ * MMU.. No stack expansion, at least.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+#endif /* CONFIG_MMU */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index db7ba4a725d6..b49cc6385f1f 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -424,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
+ tlb->vma_pfn = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index fc18fe274505..8e0125dc0522 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -4,7 +4,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <cl@linux.com>
+ * Christoph Lameter <cl@gentwo.org>
*/
#include <linux/rculist.h>
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 62c1f7945741..88608d0dc2c2 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -379,7 +379,7 @@ again:
if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
pgtable_split_needed(vma, cp_flags)) {
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
/*
* For file-backed, the pmd could have been
* cleared; make sure pmd populated if
diff --git a/mm/mremap.c b/mm/mremap.c
index 0d4948b720e2..83e359754961 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1190,10 +1190,6 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
fixup_hugetlb_reservations(vma);
- /* Tell pfnmap has moved from this vma */
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(vma);
-
*new_vma_ptr = new_vma;
return err;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 70f92f9a7fab..b624acec6d2e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -415,7 +415,8 @@ static const struct ctl_table nommu_table[] = {
};
/*
- * initialise the percpu counter for VM and region record slabs
+ * initialise the percpu counter for VM and region record slabs, initialise VMA
+ * state.
*/
void __init mmap_init(void)
{
@@ -425,6 +426,7 @@ void __init mmap_init(void)
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
register_sysctl_init("vm", nommu_table);
+ vma_state_init();
}
/*
@@ -643,22 +645,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
/*
- * At least xtensa ends up having protection faults even with no
- * MMU.. No stack expansion, at least.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- vma = vma_lookup(mm, addr);
- if (!vma)
- mmap_read_unlock(mm);
- return vma;
-}
-
-/*
* expand a stack to a given address
* - not supported under NOMMU conditions
*/
@@ -1906,3 +1892,11 @@ static int __meminit init_admin_reserve(void)
return 0;
}
subsys_initcall(init_admin_reserve);
+
+int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ mmap_write_lock(oldmm);
+ dup_mm_exe_file(mm, oldmm);
+ mmap_write_unlock(oldmm);
+ return 0;
+}
diff --git a/mm/numa.c b/mm/numa.c
index f1787d7713a6..7d5e06fe5bd4 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -13,7 +13,6 @@ void __init alloc_node_data(int nid)
{
const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
u64 nd_pa;
- void *nd;
int tnid;
/* Allocate node data. Try node-local memory and then any node. */
@@ -21,7 +20,6 @@ void __init alloc_node_data(int nid)
if (!nd_pa)
panic("Cannot allocate %zu bytes for node %d data\n",
nd_size, nid);
- nd = __va(nd_pa);
/* report and initialize */
pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
@@ -30,7 +28,7 @@ void __init alloc_node_data(int nid)
if (tnid != nid)
pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
- node_data[nid] = nd;
+ node_data[nid] = __va(nd_pa);
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
}
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index ff4054f4334d..541a99c4071a 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -201,6 +201,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
}
/**
+ * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the numa_reserved_meminfo.
+ *
+ * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances
+ * against memblock_type information and moves any that intersect reserved
+ * ranges to numa_reserved_meminfo. However, when that information is known
+ * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
+ * to numa_reserved_meminfo directly.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
+}
+
+/**
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 76200cd85fe7..b8eea5b3c064 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -41,6 +41,7 @@
#include <trace/events/writeback.h>
#include "internal.h"
+#include "swap.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
@@ -520,8 +521,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
@@ -2564,11 +2565,11 @@ struct folio *writeback_iter(struct address_space *mapping,
if (!folio) {
/*
* To avoid deadlocks between range_cyclic writeback and callers
- * that hold pages in PageWriteback to aggregate I/O until
+ * that hold folios in writeback to aggregate I/O until
* the writeback iteration finishes, we do not loop back to the
- * start of the file. Doing so causes a page lock/page
+ * start of the file. Doing so causes a folio lock/folio
* writeback access order inversion - we should only ever lock
- * multiple pages in ascending page->index order, and looping
+ * multiple folios in ascending folio->index order, and looping
* back to the start of the file violates that rule and causes
* deadlocks.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f29e393f6af..2ef3c07266b3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -930,19 +930,13 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
return bad_reason;
}
-static void free_page_is_bad_report(struct page *page)
-{
- bad_page(page,
- page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
-}
-
static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return false;
/* Something has gone sideways, find it */
- free_page_is_bad_report(page);
+ bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
return true;
}
@@ -2070,31 +2064,25 @@ static bool should_try_claim_block(unsigned int order, int start_mt)
/*
* Check whether there is a suitable fallback freepage with requested order.
- * Sets *claim_block to instruct the caller whether it should convert a whole
- * pageblock to the returned migratetype.
- * If only_claim is true, this function returns fallback_mt only if
+ * If claimable is true, this function returns fallback_mt only if
* we would do this whole-block claiming. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_claim, bool *claim_block)
+ int migratetype, bool claimable)
{
int i;
- int fallback_mt;
+
+ if (claimable && !should_try_claim_block(order, migratetype))
+ return -2;
if (area->nr_free == 0)
return -1;
- *claim_block = false;
for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
- fallback_mt = fallbacks[migratetype][i];
- if (free_area_empty(area, fallback_mt))
- continue;
+ int fallback_mt = fallbacks[migratetype][i];
- if (should_try_claim_block(order, migratetype))
- *claim_block = true;
-
- if (*claim_block || !only_claim)
+ if (!free_area_empty(area, fallback_mt))
return fallback_mt;
}
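
A hedged sketch of the return convention the reworked helper appears to use, and how the claiming caller in the next hunk reacts to it: -1 means no free block of this order, -2 means the remaining orders are too small to claim a whole block, and anything non-negative is a usable fallback migratetype. All numbers below are invented.

#include <stdio.h>

static int find_fallback(int nr_free, int claimable, int can_claim_order)
{
	if (claimable && !can_claim_order)
		return -2;		/* order too low to claim a whole block */
	if (nr_free == 0)
		return -1;		/* nothing free at this order */
	return 0;			/* hypothetical fallback migratetype */
}

int main(void)
{
	int order;

	for (order = 5; order >= 0; order--) {
		/* pretend order 3 is empty and orders below 2 are unclaimable */
		int mt = find_fallback(order == 3 ? 0 : 4, 1, order >= 2);

		if (mt == -1)
			continue;	/* no block in that order, try lower */
		if (mt == -2)
			break;		/* advanced into orders too low to claim */
		printf("order %d -> fallback migratetype %d\n", order, mt);
	}
	return 0;
}
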
@@ -2191,7 +2179,6 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
int min_order = order;
struct page *page;
int fallback_mt;
- bool claim_block;
/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2210,11 +2197,14 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &claim_block);
+ start_migratetype, true);
+
+ /* No block in that order */
if (fallback_mt == -1)
continue;
- if (!claim_block)
+ /* Advanced into orders too low to claim, abort */
+ if (fallback_mt == -2)
break;
page = get_page_from_free_area(area, fallback_mt);
@@ -2242,12 +2232,11 @@ __rmqueue_steal(struct zone *zone, int order, int start_migratetype)
int current_order;
struct page *page;
int fallback_mt;
- bool claim_block;
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &claim_block);
+ start_migratetype, false);
if (fallback_mt == -1)
continue;
@@ -2668,10 +2657,10 @@ static void free_frozen_page_commit(struct zone *zone,
* stops will be drained from vmstat refresh context.
*/
if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
- free_high = (pcp->free_count >= batch &&
+ free_high = (pcp->free_count >= (batch + pcp->high_min / 2) &&
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
- pcp->count >= READ_ONCE(batch)));
+ pcp->count >= batch));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
@@ -3550,7 +3539,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+ * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -4237,7 +4226,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
/*
* Ignore cpuset mems for non-blocking __GFP_HIGH (probably
* GFP_ATOMIC) rather than fail, see the comment for
- * cpuset_node_allowed().
+ * cpuset_current_node_allowed().
*/
if (alloc_flags & ALLOC_MIN_RESERVE)
alloc_flags &= ~ALLOC_CPUSET;
@@ -5085,7 +5074,7 @@ EXPORT_SYMBOL(__free_pages);
/*
* Can be called while holding raw_spin_lock or from IRQ and NMI for any
- * page type (not only those that came from try_alloc_pages)
+ * page type (not only those that came from alloc_pages_nolock)
*/
void free_pages_nolock(struct page *page, unsigned int order)
{
@@ -7326,20 +7315,21 @@ static bool __free_unaccepted(struct page *page)
#endif /* CONFIG_UNACCEPTED_MEMORY */
/**
- * try_alloc_pages - opportunistic reentrant allocation from any context
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
* @nid: node to allocate from
* @order: allocation order size
*
* Allocates pages of a given order from the given node. This is safe to
* call from any context (from atomic, NMI, and also reentrant
- * allocator -> tracepoint -> try_alloc_pages_noprof).
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
* Allocation is best effort and to be expected to fail easily so nobody should
* rely on the success. Failures are not reported via warn_alloc().
* See always fail conditions below.
*
- * Return: allocated page or NULL on failure.
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
*/
-struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
{
/*
* Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed.
@@ -7348,7 +7338,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
*
* These two are the conditions for gfpflags_allow_spinning() being true.
*
- * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
+ * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason
* to warn. Also warn would trigger printk() which is unsafe from
* various contexts. We cannot use printk_deferred_enter() to mitigate,
* since the running context is unknown.
@@ -7358,7 +7348,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
* BPF use cases.
*
* Though __GFP_NOMEMALLOC is not checked in the code path below,
- * specify it here to highlight that try_alloc_pages()
+ * specify it here to highlight that alloc_pages_nolock()
* doesn't want to deplete reserves.
*/
gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
diff --git a/mm/page_owner.c b/mm/page_owner.c
index cc4a6916eec6..9928c9ac8c31 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -302,7 +302,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
/*
* Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
* to prevent issues in stack_depot_save().
- * This is similar to try_alloc_pages() gfp flags, but only used
+ * This is similar to alloc_pages_nolock() gfp flags, but only used
* to signal stack_depot to avoid spin_locks.
*/
handle = save_stack(__GFP_NOWARN);
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 106e1d66e9f9..9374f29cdc6f 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -18,7 +18,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
{
struct ptdump_state *st = walk->private;
- st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+ st->note_page_pte(st, addr, kasan_early_shadow_pte[0]);
walk->action = ACTION_CONTINUE;
@@ -38,11 +38,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 0, pgd_val(val));
+ if (st->effective_prot_pgd)
+ st->effective_prot_pgd(st, val);
if (pgd_leaf(val)) {
- st->note_page(st, addr, 0, pgd_val(val));
+ st->note_page_pgd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -61,11 +61,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 1, p4d_val(val));
+ if (st->effective_prot_p4d)
+ st->effective_prot_p4d(st, val);
if (p4d_leaf(val)) {
- st->note_page(st, addr, 1, p4d_val(val));
+ st->note_page_p4d(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -84,11 +84,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 2, pud_val(val));
+ if (st->effective_prot_pud)
+ st->effective_prot_pud(st, val);
if (pud_leaf(val)) {
- st->note_page(st, addr, 2, pud_val(val));
+ st->note_page_pud(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -106,10 +106,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
- if (st->effective_prot)
- st->effective_prot(st, 3, pmd_val(val));
+ if (st->effective_prot_pmd)
+ st->effective_prot_pmd(st, val);
if (pmd_leaf(val)) {
- st->note_page(st, addr, 3, pmd_val(val));
+ st->note_page_pmd(st, addr, val);
walk->action = ACTION_CONTINUE;
}
@@ -122,10 +122,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
struct ptdump_state *st = walk->private;
pte_t val = ptep_get_lockless(pte);
- if (st->effective_prot)
- st->effective_prot(st, 4, pte_val(val));
+ if (st->effective_prot_pte)
+ st->effective_prot_pte(st, val);
- st->note_page(st, addr, 4, pte_val(val));
+ st->note_page_pte(st, addr, val);
return 0;
}
@@ -134,9 +134,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
int depth, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
-
- st->note_page(st, addr, depth, 0);
-
+ pte_t pte_zero = {0};
+ pmd_t pmd_zero = {0};
+ pud_t pud_zero = {0};
+ p4d_t p4d_zero = {0};
+ pgd_t pgd_zero = {0};
+
+ switch (depth) {
+ case 4:
+ st->note_page_pte(st, addr, pte_zero);
+ break;
+ case 3:
+ st->note_page_pmd(st, addr, pmd_zero);
+ break;
+ case 2:
+ st->note_page_pud(st, addr, pud_zero);
+ break;
+ case 1:
+ st->note_page_p4d(st, addr, p4d_zero);
+ break;
+ case 0:
+ st->note_page_pgd(st, addr, pgd_zero);
+ break;
+ default:
+ break;
+ }
return 0;
}
@@ -162,7 +184,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
mmap_write_unlock(mm);
/* Flush out the last page */
- st->note_page(st, 0, -1, 0);
+ st->note_page_flush(st);
}
static int check_wx_show(struct seq_file *m, void *v)
diff --git a/mm/rmap.c b/mm/rmap.c
index 67bb273dfb80..fb63d9256f09 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -774,7 +774,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
* @vma: The VMA we need to know the address in.
*
* Calculates the user virtual address of this page in the specified VMA.
- * It is the caller's responsibililty to check the page is actually
+ * It is the caller's responsibility to check the page is actually
* within the VMA. There may not currently be a PTE pointing at this
* page, but if a page fault occurs at this address, this is the page
* which will be accessed.
@@ -789,13 +789,13 @@ unsigned long page_address_in_vma(const struct folio *folio,
const struct page *page, const struct vm_area_struct *vma)
{
if (folio_test_anon(folio)) {
- struct anon_vma *page__anon_vma = folio_anon_vma(folio);
+ struct anon_vma *anon_vma = folio_anon_vma(folio);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
- if (!vma->anon_vma || !page__anon_vma ||
- vma->anon_vma->root != page__anon_vma->root)
+ if (!vma->anon_vma || !anon_vma ||
+ vma->anon_vma->root != anon_vma->root)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
@@ -803,7 +803,7 @@ unsigned long page_address_in_vma(const struct folio *folio,
return -EFAULT;
}
- /* KSM folios don't reach here because of the !page__anon_vma check */
+ /* KSM folios don't reach here because of the !anon_vma check */
return vma_address(vma, page_pgoff(folio, page), 1);
}
@@ -1944,7 +1944,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* restart so we can process the PTE-mapped THP.
*/
split_huge_pmd_locked(vma, pvmw.address,
- pvmw.pmd, false, folio);
+ pvmw.pmd, false);
flags &= ~TTU_SPLIT_HUGE_PMD;
page_vma_mapped_walk_restart(&pvmw);
continue;
@@ -2292,13 +2292,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pvmw.flags = PVMW_SYNC;
/*
- * unmap_page() in mm/huge_memory.c is the only user of migration with
- * TTU_SPLIT_HUGE_PMD and it wants to freeze.
- */
- if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, true, folio);
-
- /*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
@@ -2323,9 +2316,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_locked(vma, pvmw.address,
+ pvmw.pmd, true);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
subpage = folio_page(folio,
pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
@@ -2337,8 +2337,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
continue;
- }
#endif
+ }
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 1b0a214ee558..589b26c2d553 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -120,18 +120,18 @@ static int secretmem_release(struct inode *inode, struct file *file)
return 0;
}
-static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int secretmem_mmap_prepare(struct vm_area_desc *desc)
{
- unsigned long len = vma->vm_end - vma->vm_start;
+ const unsigned long len = desc->end - desc->start;
- if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len))
return -EAGAIN;
- vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
- vma->vm_ops = &secretmem_vm_ops;
+ desc->vm_flags |= VM_LOCKED | VM_DONTDUMP;
+ desc->vm_ops = &secretmem_vm_ops;
return 0;
}
@@ -143,7 +143,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma)
static const struct file_operations secretmem_fops = {
.release = secretmem_release,
- .mmap = secretmem_mmap,
+ .mmap_prepare = secretmem_mmap_prepare,
};
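
The secretmem conversion above illustrates the general pattern for the new ->mmap_prepare() hook: instead of mutating a live VMA, the hook edits a struct vm_area_desc before the VMA is created. A minimal, hypothetical driver-side sketch of the same conversion follows; the mydrv_* names and the MYDRV_MAX_LEN limit are made up for illustration, and only the desc fields and the .mmap_prepare member come from this series.

/* Hypothetical sketch only - not part of this patch. */
static int mydrv_mmap_prepare(struct vm_area_desc *desc)
{
	/* MYDRV_MAX_LEN and mydrv_vm_ops are assumed, driver-specific symbols. */
	if (desc->end - desc->start > MYDRV_MAX_LEN)
		return -EINVAL;

	desc->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	desc->vm_ops = &mydrv_vm_ops;
	return 0;
}

static const struct file_operations mydrv_fops = {
	.mmap_prepare	= mydrv_mmap_prepare,
};
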
static int secretmem_migrate_folio(struct address_space *mapping,
diff --git a/mm/shmem.c b/mm/shmem.c
index 858cee02ca49..0c5fb4ffa03a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1446,8 +1446,6 @@ static int shmem_unuse_swap_entries(struct inode *inode,
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(folio))
- continue;
error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
@@ -1505,6 +1503,7 @@ int shmem_unuse(unsigned int type)
return 0;
mutex_lock(&shmem_swaplist_mutex);
+start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
@@ -1523,13 +1522,15 @@ int shmem_unuse(unsigned int type)
cond_resched();
mutex_lock(&shmem_swaplist_mutex);
- next = list_next_entry(info, swaplist);
- if (!info->swapped)
- list_del_init(&info->swaplist);
if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction);
if (error)
break;
+ if (list_empty(&info->swaplist))
+ goto start_over;
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
}
mutex_unlock(&shmem_swaplist_mutex);
@@ -1643,8 +1644,8 @@ try_split:
BUG_ON(folio_mapped(folio));
return swap_writeout(folio, wbc);
}
-
- list_del_init(&info->swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
if (nr_pages > 1)
goto try_split;
@@ -2331,6 +2332,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
*/
split_order = shmem_split_large_entry(inode, index, swap, gfp);
if (split_order < 0) {
+ folio_put(folio);
+ folio = NULL;
error = split_order;
goto failed;
}
@@ -5805,12 +5808,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
- if (shmem_acct_size(flags, size))
- return ERR_PTR(-ENOMEM);
-
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
+ if (shmem_acct_size(flags, size))
+ return ERR_PTR(-ENOMEM);
+
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 5acb51a9fc49..0cf8bf5d832d 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
unsigned long free_highpages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
-
+ managed_pages += zone_managed_pages(zone);
if (is_highmem(zone)) {
managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
+
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
val->mem_unit = PAGE_SIZE;
}
#endif
@@ -311,6 +305,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" low:%lukB"
" high:%lukB"
" reserved_highatomic:%luKB"
+ " free_highatomic:%luKB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
@@ -332,6 +327,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone->nr_reserved_highatomic),
+ K(zone->nr_free_highatomic),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5be257e03c7c..bfe7c40eeee1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -2,7 +2,7 @@
/*
* Slab allocator functions that are independent of the allocator strategy
*
- * (C) 2012 Christoph Lameter <cl@linux.com>
+ * (C) 2012 Christoph Lameter <cl@gentwo.org>
*/
#include <linux/slab.h>
diff --git a/mm/swap.c b/mm/swap.c
index 77b2d5997873..4fc322f7111a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -309,7 +309,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio)
trace_mm_lru_activate(folio);
__count_vm_events(PGACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}
#ifdef CONFIG_SMP
@@ -581,7 +581,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
if (active) {
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
nr_pages);
}
}
@@ -599,7 +599,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}
static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
@@ -625,7 +625,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
lruvec_add_folio(lruvec, folio);
__count_vm_events(PGLAZYFREE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
+ count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}
/*
diff --git a/mm/swap.h b/mm/swap.h
index aa62463976d5..2269eb9df0af 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -201,4 +201,22 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
#endif /* CONFIG_SWAP */
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to. If you know
+ * the folio is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+ if (unlikely(folio_test_swapcache(folio)))
+ return swap_cache_index(folio->swap);
+ return folio->index;
+}
+
#endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ec2b1c9c9926..c354435a0923 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -231,13 +231,11 @@ void free_swap_cache(struct folio *folio)
}
/*
- * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page.
+ * Free a folio, also freeing any swap cache associated with
+ * this folio if it is the last user.
*/
-void free_page_and_swap_cache(struct page *page)
+void free_folio_and_swap_cache(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
free_swap_cache(folio);
if (!is_huge_zero_folio(folio))
folio_put(folio);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 86643b181098..68ce283e84be 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -52,9 +52,9 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entry_range_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages);
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
@@ -192,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si,
unsigned char *map_end = map + nr_pages;
unsigned char count = *map;
- if (swap_count(count) != 1)
+ if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM)
return false;
while (++map < map_end) {
@@ -1355,10 +1355,12 @@ out:
return NULL;
}
-static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
- unsigned long offset,
- unsigned char usage)
+static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry,
+ unsigned char usage)
{
+ unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
@@ -1390,7 +1392,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
if (usage)
WRITE_ONCE(si->swap_map[offset], usage);
else
- WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
+ swap_entries_free(si, ci, entry, 1);
return usage;
}
@@ -1461,71 +1463,104 @@ put_out:
return NULL;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *si,
- swp_entry_t entry)
+static void swap_entries_put_cache(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
- struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
- unsigned char usage;
+ struct swap_cluster_info *ci;
ci = lock_cluster(si, offset);
- usage = __swap_entry_free_locked(si, offset, 1);
- if (!usage)
- swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
+ if (swap_only_has_cache(si, offset, nr))
+ swap_entries_free(si, ci, entry, nr);
+ else {
+ for (int i = 0; i < nr; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ }
unlock_cluster(ci);
-
- return usage;
}
-static bool __swap_entries_free(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
+static bool swap_entries_put_map(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
unsigned long offset = swp_offset(entry);
- unsigned int type = swp_type(entry);
struct swap_cluster_info *ci;
bool has_cache = false;
unsigned char count;
int i;
- if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+ if (nr <= 1)
goto fallback;
- /* cross into another cluster */
- if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ count = swap_count(data_race(si->swap_map[offset]));
+ if (count != 1 && count != SWAP_MAP_SHMEM)
goto fallback;
ci = lock_cluster(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
- unlock_cluster(ci);
- goto fallback;
+ goto locked_fallback;
}
- for (i = 0; i < nr; i++)
- WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
if (!has_cache)
- swap_entry_range_free(si, ci, entry, nr);
+ swap_entries_free(si, ci, entry, nr);
+ else
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
unlock_cluster(ci);
return has_cache;
fallback:
- for (i = 0; i < nr; i++) {
- if (data_race(si->swap_map[offset + i])) {
- count = __swap_entry_free(si, swp_entry(type, offset + i));
- if (count == SWAP_HAS_CACHE)
- has_cache = true;
- } else {
- WARN_ON_ONCE(1);
- }
+ ci = lock_cluster(si, offset);
+locked_fallback:
+ for (i = 0; i < nr; i++, entry.val++) {
+ count = swap_entry_put_locked(si, ci, entry, 1);
+ if (count == SWAP_HAS_CACHE)
+ has_cache = true;
}
+ unlock_cluster(ci);
return has_cache;
}
/*
- * Drop the last HAS_CACHE flag of swap entries, caller have to
- * ensure all entries belong to the same cgroup.
+ * Only functions with a "_nr" suffix can free entries spanning
+ * multiple clusters, so ensure the range is within a single cluster
+ * when freeing entries with functions without the "_nr" suffix.
*/
-static void swap_entry_range_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages)
+static bool swap_entries_put_map_nr(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
+{
+ int cluster_nr, cluster_rest;
+ unsigned long offset = swp_offset(entry);
+ bool has_cache = false;
+
+ cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
+ while (nr) {
+ cluster_nr = min(nr, cluster_rest);
+ has_cache |= swap_entries_put_map(si, entry, cluster_nr);
+ cluster_rest = SWAPFILE_CLUSTER;
+ nr -= cluster_nr;
+ entry.val += cluster_nr;
+ }
+
+ return has_cache;
+}
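
To see how swap_entries_put_map_nr() chops a range at cluster boundaries, here is a small stand-alone user-space sketch of the same arithmetic (SWAPFILE_CLUSTER is assumed to be 512 here; the real value is configuration-dependent, and the stub put_chunk() merely prints what swap_entries_put_map() would be handed):

#include <stdio.h>

#define SWAPFILE_CLUSTER 512UL	/* assumed value; config-dependent in the kernel */

/* Stub standing in for swap_entries_put_map(): report each per-cluster chunk. */
static void put_chunk(unsigned long offset, int nr)
{
	printf("put offsets [%lu, %lu) in cluster %lu\n",
	       offset, offset + nr, offset / SWAPFILE_CLUSTER);
}

/* Same splitting arithmetic as swap_entries_put_map_nr(). */
static void put_range(unsigned long offset, int nr)
{
	int cluster_nr, cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;

	while (nr) {
		cluster_nr = nr < cluster_rest ? nr : cluster_rest;
		put_chunk(offset, cluster_nr);
		cluster_rest = SWAPFILE_CLUSTER;
		nr -= cluster_nr;
		offset += cluster_nr;
	}
}

int main(void)
{
	/* 1030 entries starting 10 entries before a cluster boundary are put
	 * as three per-cluster chunks of 10, 512 and 508 entries. */
	put_range(502, 1030);
	return 0;
}
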
+
+/*
+ * Check if it's the last ref of a swap entry in the freeing path.
+ * Qualifying values are 1, SWAP_HAS_CACHE and SWAP_MAP_SHMEM.
+ */
+static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
+{
+ return (count == SWAP_HAS_CACHE) || (count == 1) ||
+ (count == SWAP_MAP_SHMEM);
+}
+
+/*
+ * Drop the last ref of swap entries; the caller has to ensure all entries
+ * belong to the same cgroup and cluster.
+ */
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages)
{
unsigned long offset = swp_offset(entry);
unsigned char *map = si->swap_map + offset;
@@ -1538,7 +1573,7 @@ static void swap_entry_range_free(struct swap_info_struct *si,
ci->count -= nr_pages;
do {
- VM_BUG_ON(*map != SWAP_HAS_CACHE);
+ VM_BUG_ON(!swap_is_last_ref(*map));
*map = 0;
} while (++map < map_end);
@@ -1551,21 +1586,6 @@ static void swap_entry_range_free(struct swap_info_struct *si,
partial_free_cluster(si, ci);
}
-static void cluster_swap_free_nr(struct swap_info_struct *si,
- unsigned long offset, int nr_pages,
- unsigned char usage)
-{
- struct swap_cluster_info *ci;
- unsigned long end = offset + nr_pages;
-
- ci = lock_cluster(si, offset);
- do {
- if (!__swap_entry_free_locked(si, offset, usage))
- swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
- } while (++offset < end);
- unlock_cluster(ci);
-}
-
/*
* Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
@@ -1582,7 +1602,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
while (nr_pages) {
nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- cluster_swap_free_nr(sis, offset, nr, 1);
+ swap_entries_put_map(sis, swp_entry(sis->type, offset), nr);
offset += nr;
nr_pages -= nr;
}
@@ -1593,8 +1613,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
*/
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
- unsigned long offset = swp_offset(entry);
- struct swap_cluster_info *ci;
struct swap_info_struct *si;
int size = 1 << swap_entry_order(folio_order(folio));
@@ -1602,16 +1620,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
if (!si)
return;
- ci = lock_cluster(si, offset);
- if (swap_only_has_cache(si, offset, size))
- swap_entry_range_free(si, ci, entry, size);
- else {
- for (int i = 0; i < size; i++, entry.val++) {
- if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE))
- swap_entry_range_free(si, ci, entry, 1);
- }
- }
- unlock_cluster(ci);
+ swap_entries_put_cache(si, entry, size);
}
int __swap_count(swp_entry_t entry)
@@ -1806,7 +1815,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
/*
* First free all entries in the range.
*/
- any_only_cache = __swap_entries_free(si, entry, nr);
+ any_only_cache = swap_entries_put_map_nr(si, entry, nr);
/*
* Short-circuit the below loop if none of the entries had their
@@ -1816,13 +1825,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
goto out;
/*
- * Now go back over the range trying to reclaim the swap cache. This is
- * more efficient for large folios because we will only try to reclaim
- * the swap once per folio in the common case. If we do
- * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
- * latter will get a reference and lock the folio for every individual
- * page but will only succeed once the swap slot for every subpage is
- * zero.
+ * Now go back over the range trying to reclaim the swap cache.
*/
for (offset = start_offset; offset < end_offset; offset += nr) {
nr = 1;
@@ -3654,11 +3657,13 @@ int swapcache_prepare(swp_entry_t entry, int nr)
return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
}
+/*
+ * The caller should ensure the entries belong to the same folio so
+ * they won't span across a cluster boundary.
+ */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
- unsigned long offset = swp_offset(entry);
-
- cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
+ swap_entries_put_cache(si, entry, nr);
}
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
@@ -3667,21 +3672,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry)
}
/*
- * out-of-line methods to avoid include hell.
- */
-struct address_space *swapcache_mapping(struct folio *folio)
-{
- return swp_swap_info(folio->swap)->swap_file->f_mapping;
-}
-EXPORT_SYMBOL_GPL(swapcache_mapping);
-
-pgoff_t __folio_swap_cache_index(struct folio *folio)
-{
- return swap_cache_index(folio->swap);
-}
-EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
-
-/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
* page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -3798,7 +3788,7 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or caller of __swap_entry_free_locked()
+ * Called while __swap_duplicate() or caller of swap_entry_put_locked()
* holds cluster lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
diff --git a/mm/truncate.c b/mm/truncate.c
index f2aaf99f2990..91eb92a5ce4f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -425,7 +425,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
if (xa_is_value(folio))
continue;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e0db855c89b4..bc473ad21202 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1063,7 +1063,7 @@ static int move_present_pte(struct mm_struct *mm,
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
/* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
diff --git a/mm/vma.c b/mm/vma.c
index a468d4c29c0c..1c6595f282e5 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -17,9 +17,13 @@ struct mmap_state {
unsigned long pglen;
unsigned long flags;
struct file *file;
+ pgprot_t page_prot;
+
+ /* User-defined fields, perhaps updated by .mmap_prepare(). */
+ const struct vm_operations_struct *vm_ops;
+ void *vm_private_data;
unsigned long charged;
- bool retry_merge;
struct vm_area_struct *prev;
struct vm_area_struct *next;
@@ -40,6 +44,7 @@ struct mmap_state {
.pglen = PHYS_PFN(len_), \
.flags = flags_, \
.file = file_, \
+ .page_prot = vm_get_page_prot(flags_), \
}
#define VMG_MMAP_STATE(name, map_, vma_) \
@@ -57,6 +62,22 @@ struct mmap_state {
.state = VMA_MERGE_START, \
}
+/*
+ * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
+ * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
+ * would mean a wider range of folios sharing the root anon_vma lock, and thus
+ * potential lock contention, so we do not wish to encourage merging such
+ * that this scales into a problem.
+ */
+static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
+{
+ /*
+ * The list_is_singular() test is to avoid merging VMA cloned from
+ * parents. This can improve scalability caused by anon_vma lock.
+ */
+ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
+}
+
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
@@ -82,24 +103,28 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
return true;
}
-static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2, struct vm_area_struct *vma)
+static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
+ struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
+ struct vm_area_struct *src = vmg->middle; /* existing merge case. */
+ struct anon_vma *tgt_anon = tgt->anon_vma;
+ struct anon_vma *src_anon = vmg->anon_vma;
+
/*
- * The list_is_singular() test is to avoid merging VMA cloned from
- * parents. This can improve scalability caused by anon_vma lock.
+ * We _can_ have !src with vmg->anon_vma set, via copy_vma(). In this
+ * instance we will remove the existing VMA's anon_vmas, so there are no
+ * scalability concerns.
*/
- if ((!anon_vma1 || !anon_vma2) && (!vma ||
- list_is_singular(&vma->anon_vma_chain)))
- return true;
- return anon_vma1 == anon_vma2;
-}
+ VM_WARN_ON(src && src_anon != src->anon_vma);
-/* Are the anon_vma's belonging to each VMA compatible with one another? */
-static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
- struct vm_area_struct *vma2)
-{
- return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
+ /* Case 1 - we will dup_anon_vma() from src into tgt. */
+ if (!tgt_anon && src_anon)
+ return !vma_had_uncowed_parents(src);
+ /* Case 2 - we will simply use tgt's anon_vma. */
+ if (tgt_anon && !src_anon)
+ return !vma_had_uncowed_parents(tgt);
+ /* Case 3 - the anon_vma's are already shared. */
+ return src_anon == tgt_anon;
}
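
For illustration, here is a toy stand-alone model of the three cases above (plain user-space C, not kernel code; the struct and field names are invented): case 1 dups the source anon_vma into an unfaulted target, case 2 keeps the target's anon_vma, and case 3 requires the anon_vma to already be shared.

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for a VMA: whether it has an anon_vma, and whether it
 * still carries unCoW'd mappings from parents (non-singular anon_vma_chain). */
struct vma_model {
	int anon_vma;	/* 0 means "no anon_vma" */
	bool uncowed;
};

static bool mergeable(struct vma_model tgt, struct vma_model src)
{
	if (!tgt.anon_vma && src.anon_vma)	/* case 1: dup src into tgt */
		return !src.uncowed;
	if (tgt.anon_vma && !src.anon_vma)	/* case 2: reuse tgt's anon_vma */
		return !tgt.uncowed;
	return src.anon_vma == tgt.anon_vma;	/* case 3: must already be shared */
}

int main(void)
{
	struct vma_model faulted = { .anon_vma = 1 };
	struct vma_model forked  = { .anon_vma = 1, .uncowed = true };
	struct vma_model fresh   = { 0 };

	printf("%d %d %d\n",
	       mergeable(faulted, fresh),	/* 1: tgt keeps its anon_vma */
	       mergeable(fresh, forked),	/* 0: would dup a multi-parent tree */
	       mergeable(faulted, faulted));	/* 1: anon_vma already shared */
	return 0;
}
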
/*
@@ -164,7 +189,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
return true;
}
@@ -184,7 +209,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
return true;
}
@@ -400,8 +425,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg)
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
bool can_merge_left)
{
- if (!vmg->next || vmg->end != vmg->next->vm_start ||
- !can_vma_merge_before(vmg))
+ struct vm_area_struct *next = vmg->next;
+ struct vm_area_struct *prev;
+
+ if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
return false;
if (!can_merge_left)
@@ -414,7 +441,9 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
*
* We therefore check this in addition to mergeability to either side.
*/
- return are_anon_vmas_compatible(vmg->prev, vmg->next);
+ prev = vmg->prev;
+ return !prev->anon_vma || !next->anon_vma ||
+ prev->anon_vma == next->anon_vma;
}
/*
@@ -554,7 +583,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
- * dup_anon_vma() - Helper function to duplicate anon_vma
+ * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
+ * instance that the destination VMA has no anon_vma but the source does.
+ *
* @dst: The destination VMA
* @src: The source VMA
* @dup: Pointer to the destination VMA when successful.
@@ -565,9 +596,18 @@ static int dup_anon_vma(struct vm_area_struct *dst,
struct vm_area_struct *src, struct vm_area_struct **dup)
{
/*
- * Easily overlooked: when mprotect shifts the boundary, make sure the
- * expanding vma has anon_vma set if the shrinking vma had, to cover any
- * anon pages imported.
+ * There are three cases to consider for correctly propagating
+ * anon_vma's on merge.
+ *
+ * The first is trivial - neither VMA has anon_vma, we need not do
+ * anything.
+ *
+ * The second, where both have an anon_vma, is also a no-op, as they must
+ * then be the same, so there is simply nothing to copy.
+ *
+ * Here we cover the third - if the destination VMA has no anon_vma,
+ * that is, it is unfaulted, we need to ensure that the newly merged
+ * range is referenced by the anon_vmas of the source.
*/
if (src->anon_vma && !dst->anon_vma) {
int ret;
@@ -2351,6 +2391,10 @@ static int __mmap_new_file_vma(struct mmap_state *map,
int error;
vma->vm_file = get_file(map->file);
+
+ if (!map->file->f_op->mmap)
+ return 0;
+
error = mmap_file(vma->vm_file, vma);
if (error) {
fput(vma->vm_file);
@@ -2373,8 +2417,6 @@ static int __mmap_new_file_vma(struct mmap_state *map,
!(map->flags & VM_MAYWRITE) &&
(vma->vm_flags & VM_MAYWRITE));
- /* If the flags change (and are mergeable), let's retry later. */
- map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
map->flags = vma->vm_flags;
return 0;
@@ -2407,7 +2449,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
vma_iter_config(vmi, map->addr, map->end);
vma_set_range(vma, map->addr, map->end, map->pgoff);
vm_flags_init(vma, map->flags);
- vma->vm_page_prot = vm_get_page_prot(map->flags);
+ vma->vm_page_prot = map->page_prot;
if (vma_iter_prealloc(vmi, vma)) {
error = -ENOMEM;
@@ -2494,6 +2536,56 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
vma_set_page_prot(vma);
}
+/*
+ * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
+ * specifies it.
+ *
+ * This is called prior to any merge attempt, and updates the whitelisted
+ * fields that the .mmap_prepare() hook is permitted to change.
+ *
+ * All but user-defined fields will be pre-populated with original values.
+ *
+ * Returns 0 on success, or an error code otherwise.
+ */
+static int call_mmap_prepare(struct mmap_state *map)
+{
+ int err;
+ struct vm_area_desc desc = {
+ .mm = map->mm,
+ .start = map->addr,
+ .end = map->end,
+
+ .pgoff = map->pgoff,
+ .file = map->file,
+ .vm_flags = map->flags,
+ .page_prot = map->page_prot,
+ };
+
+ /* Invoke the hook. */
+ err = __call_mmap_prepare(map->file, &desc);
+ if (err)
+ return err;
+
+ /* Update fields permitted to be changed. */
+ map->pgoff = desc.pgoff;
+ map->file = desc.file;
+ map->flags = desc.vm_flags;
+ map->page_prot = desc.page_prot;
+ /* User-defined fields. */
+ map->vm_ops = desc.vm_ops;
+ map->vm_private_data = desc.private_data;
+
+ return 0;
+}
+
+static void set_vma_user_defined_fields(struct vm_area_struct *vma,
+ struct mmap_state *map)
+{
+ if (map->vm_ops)
+ vma->vm_ops = map->vm_ops;
+ vma->vm_private_data = map->vm_private_data;
+}
+
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
@@ -2501,10 +2593,13 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
int error;
+ bool have_mmap_prepare = file && file->f_op->mmap_prepare;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
error = __mmap_prepare(&map, uf);
+ if (!error && have_mmap_prepare)
+ error = call_mmap_prepare(&map);
if (error)
goto abort_munmap;
@@ -2522,16 +2617,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
goto unacct_error;
}
- /* If flags changed, we might be able to merge, so try again. */
- if (map.retry_merge) {
- struct vm_area_struct *merged;
- VMG_MMAP_STATE(vmg, &map, vma);
-
- vma_iter_config(map.vmi, map.addr, map.end);
- merged = vma_merge_existing_range(&vmg);
- if (merged)
- vma = merged;
- }
+ if (have_mmap_prepare)
+ set_vma_user_defined_fields(vma, &map);
__mmap_complete(&map, vma);
@@ -3018,3 +3105,46 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
+
+/* Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree. If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long charged = vma_pages(vma);
+
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
+ return -ENOMEM;
+
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+
+ /*
+ * The vm_pgoff of a purely anonymous vma should be irrelevant
+ * until its first write fault, when page's anon_vma and index
+ * are set. But now set the vm_pgoff it will almost certainly
+ * end up with (unless mremap moves it elsewhere before that
+ * first wfault), so /proc/pid/maps tells a consistent story.
+ *
+ * By setting it to reflect the virtual start address of the
+ * vma, merges and splits can happen in a seamless way, just
+ * using the existing file pgoff checks and manipulations.
+ * Similarly in do_mmap and in do_brk_flags.
+ */
+ if (vma_is_anonymous(vma)) {
+ BUG_ON(vma->anon_vma);
+ vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+ }
+
+ if (vma_link(mm, vma)) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
diff --git a/mm/vma.h b/mm/vma.h
index 149926e8a6d1..9a8af9be29a8 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -548,4 +548,19 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
int __vm_munmap(unsigned long start, size_t len, bool unlock);
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
+
+/* vma_init.h, shared between CONFIG_MMU and nommu. */
+void __init vma_state_init(void);
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
+void vm_area_free(struct vm_area_struct *vma);
+
+/* vma_exec.c */
+#ifdef CONFIG_MMU
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p);
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
+#endif
+
#endif /* __MM_VMA_H */
diff --git a/mm/vma_exec.c b/mm/vma_exec.c
new file mode 100644
index 000000000000..2dffb02ed6a2
--- /dev/null
+++ b/mm/vma_exec.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Functions implemented for exec functionality which are, however,
+ * explicitly VMA-only logic.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/*
+ * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
+ * this VMA and its relocated range, which will now reside at [vma->vm_start -
+ * shift, vma->vm_end - shift).
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable temporary stack relocation.
+ */
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+{
+ /*
+ * The process proceeds as follows:
+ *
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges. This ensures the
+ * arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
+ */
+
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_start = vma->vm_start;
+ unsigned long old_end = vma->vm_end;
+ unsigned long length = old_end - old_start;
+ unsigned long new_start = old_start - shift;
+ unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
+ struct vm_area_struct *next;
+ struct mmu_gather tlb;
+ PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
+
+ BUG_ON(new_start > new_end);
+
+ /*
+ * ensure there are no vmas between where we want to go
+ * and where we are
+ */
+ if (vma != vma_next(&vmi))
+ return -EFAULT;
+
+ vma_iter_prev_range(&vmi);
+ /*
+ * cover the whole range: [new_start, old_end)
+ */
+ vmg.middle = vma;
+ if (vma_expand(&vmg))
+ return -ENOMEM;
+
+ /*
+ * move the page tables downwards; on failure we rely on
+ * process cleanup to remove whatever mess we made.
+ */
+ pmc.for_stack = true;
+ if (length != move_page_tables(&pmc))
+ return -ENOMEM;
+
+ tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
+ if (new_end > old_start) {
+ /*
+ * when the old and new regions overlap, clear from new_end.
+ */
+ free_pgd_range(&tlb, new_end, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ } else {
+ /*
+ * otherwise, clean from old_start; this is done to not touch
+ * the address space in [new_end, old_start); some architectures
+ * have constraints on va-space that make this illegal (IA64) -
+ * for the others it's just a little faster.
+ */
+ free_pgd_range(&tlb, old_start, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ }
+ tlb_finish_mmu(&tlb);
+
+ vma_prev(&vmi);
+ /* Shrink the vma to just the new range */
+ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+}
+
+/*
+ * Establish the stack VMA in an execve'd process, located temporarily at the
+ * maximum stack address provided by the architecture.
+ *
+ * We later relocate this downwards in relocate_vma_down().
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable initialisation.
+ *
+ * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
+ * maximum addressable location in the stack (that is capable of storing a
+ * system word of data).
+ */
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p)
+{
+ int err;
+ struct vm_area_struct *vma = vm_area_alloc(mm);
+
+ if (!vma)
+ return -ENOMEM;
+
+ vma_set_anonymous(vma);
+
+ if (mmap_write_lock_killable(mm)) {
+ err = -EINTR;
+ goto err_free;
+ }
+
+ /*
+ * Needs to be called with the mmap write lock
+ * held, to avoid a race with ksmd.
+ */
+ err = ksm_execve(mm);
+ if (err)
+ goto err_ksm;
+
+ /*
+ * Place the stack at the largest stack address the architecture
+ * supports. Later, we'll move this to an appropriate place. We don't
+ * use STACK_TOP because that can depend on attributes which aren't
+ * configured yet.
+ */
+ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_end = STACK_TOP_MAX;
+ vma->vm_start = vma->vm_end - PAGE_SIZE;
+ vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ err = insert_vm_struct(mm, vma);
+ if (err)
+ goto err;
+
+ mm->stack_vm = mm->total_vm = 1;
+ mmap_write_unlock(mm);
+ *vmap = vma;
+ *top_mem_p = vma->vm_end - sizeof(void *);
+ return 0;
+
+err:
+ ksm_exit(mm);
+err_ksm:
+ mmap_write_unlock(mm);
+err_free:
+ *vmap = NULL;
+ vm_area_free(vma);
+ return err;
+}
diff --git a/mm/vma_init.c b/mm/vma_init.c
new file mode 100644
index 000000000000..8e53c7943561
--- /dev/null
+++ b/mm/vma_init.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
+ * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/* SLAB cache for vm_area_struct structures */
+static struct kmem_cache *vm_area_cachep;
+
+void __init vma_state_init(void)
+{
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ };
+
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT);
+}
+
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+
+ return vma;
+}
+
+static void vm_area_init_from(const struct vm_area_struct *src,
+ struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently when called from
+ * dup_mmap(), but the clone will reinitialize it.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+ sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+ dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+ sizeof(dest->swap_readahead_info));
+#endif
+#ifndef CONFIG_MMU
+ dest->vm_region = src->vm_region;
+#endif
+#ifdef CONFIG_NUMA
+ dest->vm_policy = src->vm_policy;
+#endif
+#ifdef __HAVE_PFNMAP_TRACKING
+ dest->pfnmap_track_ctx = NULL;
+#endif
+}
+
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return 0;
+
+ /*
+ * We don't expect to ever hit this. If ever required, we would have
+ * to duplicate the tracking.
+ */
+ if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX))
+ return -ENOMEM;
+ kref_get(&ctx->kref);
+ new->pfnmap_track_ctx = ctx;
+ return 0;
+}
+
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+ struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx;
+
+ if (likely(!ctx))
+ return;
+
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ vma->pfnmap_track_ctx = NULL;
+}
+#else
+static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
+ struct vm_area_struct *new)
+{
+ return 0;
+}
+static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
+{
+}
+#endif
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ vm_area_init_from(orig, new);
+
+ if (vma_pfnmap_track_ctx_dup(orig, new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
+ }
+ vma_lock_init(new, true);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
+ return new;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ /* The vma should be detached while being destroyed. */
+ vma_assert_detached(vma);
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ vma_pfnmap_track_ctx_release(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 340edee108c0..ab986dd09b6a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -932,6 +932,11 @@ static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;
+/* A simple iterator over all vmap-nodes. */
+#define for_each_vmap_node(vn) \
+ for ((vn) = &vmap_nodes[0]; \
+ (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
+
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
@@ -950,6 +955,19 @@ id_to_node(unsigned int id)
return &vmap_nodes[id % nr_vmap_nodes];
}
+static inline unsigned int
+node_to_id(struct vmap_node *node)
+{
+ /* Pointer arithmetic. */
+ unsigned int id = node - vmap_nodes;
+
+ if (likely(id < nr_vmap_nodes))
+ return id;
+
+ WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
+ return 0;
+}
+
/*
* We use the value 0 to represent "no node", that is why
* an encoded value will be the node-id incremented by 1.
@@ -1022,7 +1040,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
-static atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
+static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
unsigned long vmalloc_nr_pages(void)
{
@@ -1088,12 +1107,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
unsigned long va_start_lowest;
struct vmap_node *vn;
- int i;
repeat:
- for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ va_start_lowest = 0;
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
@@ -1730,7 +1748,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
- return -1;
+ return -ENOMEM;
}
/*
@@ -1744,7 +1762,7 @@ va_clip(struct rb_root *root, struct list_head *head,
*/
va->va_start = nva_start_addr + size;
} else {
- return -1;
+ return -EINVAL;
}
if (type != FL_FIT_TYPE) {
@@ -1773,19 +1791,19 @@ va_alloc(struct vmap_area *va,
/* Check the "vend" restriction. */
if (nva_start_addr + size > vend)
- return vend;
+ return -ERANGE;
/* Update the free vmap_area. */
ret = va_clip(root, head, va, nva_start_addr, size);
if (WARN_ON_ONCE(ret))
- return vend;
+ return ret;
return nva_start_addr;
}
/*
* Returns a start address of the newly allocated area, if success.
- * Otherwise a vend is returned that indicates failure.
+ * Otherwise an error value is returned that indicates failure.
*/
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
@@ -1810,14 +1828,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
- return vend;
+ return -ENOENT;
nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
- if (nva_start_addr == vend)
- return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(root, head, size, align);
+ if (!IS_ERR_VALUE(nva_start_addr))
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
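
The errno-based failure reporting above works because a small negative errno stored in an unsigned long falls into the top MAX_ERRNO (4095) values of the address space, which IS_ERR_VALUE() detects and which no valid vmap address is expected to occupy. A stand-alone sketch of that encoding (user-space, 64-bit, mirroring the definition in include/linux/err.h minus the unlikely() hint; the sample address is made up):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095
/* Same shape as the kernel's IS_ERR_VALUE(), without the unlikely() hint. */
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

int main(void)
{
	unsigned long ok   = 0xffffc90000001000UL;	/* made-up vmalloc-range address */
	unsigned long fail = (unsigned long)-ENOMEM;	/* what va_alloc() now returns */

	printf("%d %d\n", (int)IS_ERR_VALUE(ok), (int)IS_ERR_VALUE(fail)); /* 0 1 */
	return 0;
}
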
@@ -1947,7 +1964,7 @@ node_alloc(unsigned long size, unsigned long align,
struct vmap_area *va;
*vn_id = 0;
- *addr = vend;
+ *addr = -EINVAL;
/*
* Fallback to a global heap if not vmalloc or there
@@ -2027,20 +2044,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
}
retry:
- if (addr == vend) {
+ if (IS_ERR_VALUE(addr)) {
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
}
- trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
/*
- * If an allocation fails, the "vend" address is
+ * If an allocation fails, the error value is
* returned. Therefore trigger the overflow path.
*/
- if (unlikely(addr == vend))
+ if (IS_ERR_VALUE(addr))
goto overflow;
va->va_start = addr;
@@ -2132,8 +2149,6 @@ static unsigned long lazy_max_pages(void)
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
-
/*
* Serialize vmap purging. There is no actual critical section protected
* by this lock, but we want to avoid concurrent calls for performance
@@ -2143,7 +2158,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
-static cpumask_t purge_nodes;
static void
reclaim_list_global(struct list_head *head)
@@ -2166,7 +2180,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
LIST_HEAD(decay_list);
struct rb_root decay_root = RB_ROOT;
struct vmap_area *va, *nva;
- unsigned long n_decay;
+ unsigned long n_decay, pool_len;
int i;
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
@@ -2180,22 +2194,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
list_replace_init(&vn->pool[i].head, &tmp_list);
spin_unlock(&vn->pool_lock);
- if (full_decay)
- WRITE_ONCE(vn->pool[i].len, 0);
+ pool_len = n_decay = vn->pool[i].len;
+ WRITE_ONCE(vn->pool[i].len, 0);
/* Decay a pool by ~25% out of left objects. */
- n_decay = vn->pool[i].len >> 2;
+ if (!full_decay)
+ n_decay >>= 2;
+ pool_len -= n_decay;
list_for_each_entry_safe(va, nva, &tmp_list, list) {
+ if (!n_decay--)
+ break;
+
list_del_init(&va->list);
merge_or_add_vmap_area(va, &decay_root, &decay_list);
-
- if (!full_decay) {
- WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
-
- if (!--n_decay)
- break;
- }
}
/*
@@ -2204,9 +2216,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
* can populate the pool therefore a simple list replace
* operation takes place here.
*/
- if (!full_decay && !list_empty(&tmp_list)) {
+ if (!list_empty(&tmp_list)) {
spin_lock(&vn->pool_lock);
list_replace_init(&tmp_list, &vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, pool_len);
spin_unlock(&vn->pool_lock);
}
}
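
A quick stand-alone check of the reworked decay bookkeeping (made-up pool length, not kernel code): a partial decay frees roughly a quarter of the pool and writes back the remainder as the new length, while a full decay frees everything.

#include <stdio.h>

/* Sketch of the decay arithmetic only. */
int main(void)
{
	unsigned long pool_len, n_decay;
	int full_decay = 0;

	pool_len = n_decay = 100;	/* vn->pool[i].len before decay */
	if (!full_decay)
		n_decay >>= 2;		/* partial decay: drop ~25% */
	pool_len -= n_decay;

	printf("freed %lu, kept %lu\n", n_decay, pool_len); /* freed 25, kept 75 */
	return 0;
}
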
@@ -2276,6 +2289,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
{
unsigned long nr_purged_areas = 0;
unsigned int nr_purge_helpers;
+ static cpumask_t purge_nodes;
unsigned int nr_purge_nodes;
struct vmap_node *vn;
int i;
@@ -2287,9 +2301,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
*/
purge_nodes = CPU_MASK_NONE;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
-
+ for_each_vmap_node(vn) {
INIT_LIST_HEAD(&vn->purge_list);
vn->skip_populate = full_pool_decay;
decay_va_pool_node(vn, full_pool_decay);
@@ -2308,7 +2320,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
end = max(end, list_last_entry(&vn->purge_list,
struct vmap_area, list)->va_end);
- cpumask_set_cpu(i, &purge_nodes);
+ cpumask_set_cpu(node_to_id(vn), &purge_nodes);
}
nr_purge_nodes = cpumask_weight(&purge_nodes);
@@ -2387,7 +2399,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
if (WARN_ON_ONCE(!list_empty(&va->list)))
return;
- nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
+ nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT,
&vmap_lazy_nr);
/*
@@ -2453,7 +2465,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
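
The neighbour search now walks node ids downwards from the home node instead of upwards. A tiny stand-alone sketch of the resulting visit order (nr_vmap_nodes = 4 and a start node of 2 are arbitrary example values):

#include <stdio.h>

/* Visit order produced by the backwards scan in find_vmap_area(): start at
 * the address's home node, walk node ids downwards with wrap-around, and
 * stop once we are back at the starting node. */
int main(void)
{
	unsigned int nr_vmap_nodes = 4;
	unsigned int j = 2, i = j;

	do {
		printf("%u ", i);	/* prints: 2 1 0 3 */
	} while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
	printf("\n");
	return 0;
}
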
@@ -2479,7 +2491,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
if (va)
return va;
- } while ((i = (i + 1) % nr_vmap_nodes) != j);
+ } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
@@ -2948,10 +2960,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
*/
void vm_unmap_aliases(void)
{
- unsigned long start = ULONG_MAX, end = 0;
- int flush = 0;
-
- _vm_unmap_aliases(start, end, flush);
+ _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -3132,7 +3141,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
+ * Pair with smp_rmb() in vread_iter() and vmalloc_info_show().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
@@ -3403,12 +3412,13 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
+ /* All pages of vm should be charged to the same memcg, so use the first one. */
+ if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
for (i = 0; i < vm->nr_pages; i++) {
struct page *page = vm->pages[i];
BUG_ON(!page);
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
/*
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
@@ -3704,12 +3714,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
node, page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (gfp_mask & __GFP_ACCOUNT) {
- int i;
-
- for (i = 0; i < area->nr_pages; i++)
- mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
- }
+ /* All pages of vm should be charged to the same memcg, so use the first one. */
+ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
+ mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
+ area->nr_pages);
/*
* If not enough pages were obtained to accomplish an
@@ -4967,39 +4975,37 @@ bool vmalloc_dump_obj(void *object)
#endif
#ifdef CONFIG_PROC_FS
-static void show_numa_info(struct seq_file *m, struct vm_struct *v)
-{
- if (IS_ENABLED(CONFIG_NUMA)) {
- unsigned int nr, *counters = m->private;
- unsigned int step = 1U << vm_area_page_order(v);
- if (!counters)
- return;
+/*
+ * Print number of pages allocated on each memory node.
+ *
+ * This function can only be called if CONFIG_NUMA is enabled
+ * and the VM_UNINITIALIZED bit in v->flags is cleared.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v,
+ unsigned int *counters)
+{
+ unsigned int nr;
+ unsigned int step = 1U << vm_area_page_order(v);
- if (v->flags & VM_UNINITIALIZED)
- return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
+ if (!counters)
+ return;
- memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr += step)
- counters[page_to_nid(v->pages[nr])] += step;
- for_each_node_state(nr, N_HIGH_MEMORY)
- if (counters[nr])
- seq_printf(m, " N%u=%u", nr, counters[nr]);
- }
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
}
static void show_purge_info(struct seq_file *m)
{
struct vmap_node *vn;
struct vmap_area *va;
- int i;
-
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ for_each_vmap_node(vn) {
spin_lock(&vn->lazy.lock);
list_for_each_entry(va, &vn->lazy.head, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
@@ -5015,11 +5021,12 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *v;
- int i;
+ unsigned int *counters;
- for (i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ if (IS_ENABLED(CONFIG_NUMA))
+ counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
list_for_each_entry(va, &vn->busy.head, list) {
if (!va->vm) {
@@ -5032,6 +5039,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
}
v = va->vm;
+ if (v->flags & VM_UNINITIALIZED)
+ continue;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -5066,7 +5078,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
- show_numa_info(m, v);
+ if (IS_ENABLED(CONFIG_NUMA))
+ show_numa_info(m, v, counters);
+
seq_putc(m, '\n');
}
spin_unlock(&vn->busy.lock);
@@ -5076,19 +5090,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
* As a final step, dump "unpurged" areas.
*/
show_purge_info(m);
+ if (IS_ENABLED(CONFIG_NUMA))
+ kfree(counters);
return 0;
}
static int __init proc_vmalloc_init(void)
{
- void *priv_data = NULL;
-
- if (IS_ENABLED(CONFIG_NUMA))
- priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-
- proc_create_single_data("vmallocinfo",
- 0400, NULL, vmalloc_info_show, priv_data);
-
+ proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show);
return 0;
}
module_init(proc_vmalloc_init);
@@ -5140,7 +5149,7 @@ static void __init vmap_init_free_space(void)
static void vmap_init_nodes(void)
{
struct vmap_node *vn;
- int i, n;
+ int i;
#if BITS_PER_LONG == 64
/*
@@ -5157,7 +5166,7 @@ static void vmap_init_nodes(void)
* set of cores. Therefore a per-domain purging is supposed to
* be added as well as a per-domain balancing.
*/
- n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+ int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
@@ -5172,8 +5181,7 @@ static void vmap_init_nodes(void)
}
#endif
- for (n = 0; n < nr_vmap_nodes; n++) {
- vn = &vmap_nodes[n];
+ for_each_vmap_node(vn) {
vn->busy.root = RB_ROOT;
INIT_LIST_HEAD(&vn->busy.head);
spin_lock_init(&vn->busy.lock);
@@ -5194,15 +5202,13 @@ static void vmap_init_nodes(void)
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- unsigned long count;
+ unsigned long count = 0;
struct vmap_node *vn;
- int i, j;
-
- for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
- vn = &vmap_nodes[i];
+ int i;
- for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
- count += READ_ONCE(vn->pool[j].len);
+ for_each_vmap_node(vn) {
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++)
+ count += READ_ONCE(vn->pool[i].len);
}
return count ? count : SHRINK_EMPTY;
@@ -5211,10 +5217,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- int i;
+ struct vmap_node *vn;
- for (i = 0; i < nr_vmap_nodes; i++)
- decay_va_pool_node(&vmap_nodes[i], true);
+ for_each_vmap_node(vn)
+ decay_va_pool_node(vn, true);
return SHRINK_STOP;
}
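
Two things happen across the vmalloc.c hunks: the open-coded index loops over
vmap_nodes are replaced by a for_each_vmap_node() iterator, and the per-node
counters buffer moves from seq_file private data allocated at proc registration
time to a per-call allocation in vmalloc_info_show(), with show_numa_info()
now taking that buffer explicitly and the VM_UNINITIALIZED check hoisted into
the caller. As a rough sketch, assuming the iterator simply walks the global
vmap_nodes array by pointer (its real definition lives earlier in vmalloc.c and
is not shown in these hunks):

/* sketch only, not the exact upstream definition */
#define for_each_vmap_node(vn)					\
	for ((vn) = &vmap_nodes[0];				\
	     (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
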
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b6f4db6c240f..f8dfd2864bbf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -342,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+ struct mem_cgroup *memcg)
{
+ int demotion_nid;
+
if (!numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+ demotion_nid = next_demotion_node(nid);
+ if (demotion_nid == NUMA_NO_NODE)
return false;
- return true;
+ /* If demotion node isn't in the cgroup's mems_allowed, fall back */
+ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -376,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*
* Can it be reclaimed from this node via demotion?
*/
- return can_demote(nid, sc);
+ return can_demote(nid, sc, memcg);
}
/*
@@ -1099,7 +1105,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
*/
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
- struct reclaim_stat *stat, bool ignore_references)
+ struct reclaim_stat *stat, bool ignore_references,
+ struct mem_cgroup *memcg)
{
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
@@ -1112,7 +1119,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
- do_demote_pass = can_demote(pgdat->node_id, sc);
+ do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
retry:
while (!list_empty(folio_list)) {
@@ -1190,8 +1197,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1216,6 +1225,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1226,7 +1237,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
@@ -1661,7 +1674,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
- &stat, true);
+ &stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
@@ -1728,13 +1741,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0;
- unsigned long scan, total_scan, nr_pages;
+ unsigned long skipped = 0, total_scan = 0, scan = 0;
+ unsigned long nr_pages;
unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
- total_scan = 0;
- scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
struct folio *folio;
@@ -2026,7 +2037,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2034,7 +2045,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+ lruvec_memcg(lruvec));
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
@@ -2045,7 +2057,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2135,7 +2147,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2192,7 +2204,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2217,7 +2229,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2506,6 +2518,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
+ /* Proactive reclaim initiated by userspace for anonymous memory only */
+ if (swappiness == SWAPPINESS_ANON_ONLY) {
+ WARN_ON_ONCE(!sc->proactive);
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
@@ -2526,7 +2545,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
/*
* If there is enough inactive page cache, we do not reclaim
- * anything from the anonymous working set right now.
+ * anything from the anonymous working set right now to make sure
+ * a streaming file access pattern doesn't cause swapping.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
@@ -2649,7 +2669,7 @@ out:
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
@@ -2657,7 +2677,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return true;
/* Also valuable if anon pages can be demoted: */
- return can_demote(pgdat->node_id, sc);
+ return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+ lruvec_memcg(lruvec));
}
#ifdef CONFIG_LRU_GEN
@@ -2693,8 +2714,12 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+/* Get the min/max evictable type based on swappiness */
+#define min_type(swappiness) (!(swappiness))
+#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
+
#define evictable_min_seq(min_seq, swappiness) \
- min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS])
+ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
@@ -2702,7 +2727,7 @@ static bool should_clear_pmd_young(void)
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
#define for_each_evictable_type(type, swappiness) \
- for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++)
+ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
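
The min_type()/max_type() helpers make the evictable-type bounds explicit now
that swappiness can go one step beyond MAX_SWAPPINESS. A worked illustration,
assuming anon is LRU type 0, file is type 1, and SWAPPINESS_ANON_ONLY is
defined as MAX_SWAPPINESS + 1:

/*
 * swappiness == 0                     -> types 1..1  (file only)
 * 1 <= swappiness <= MAX_SWAPPINESS   -> types 0..1  (anon and file)
 * swappiness == SWAPPINESS_ANON_ONLY  -> types 0..0  (anon only)
 */
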
@@ -2735,7 +2760,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
if (!sc->may_swap)
return 0;
- if (!can_demote(pgdat->node_id, sc) &&
+ if (!can_demote(pgdat->node_id, sc, memcg) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3853,7 +3878,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type ? swappiness > MAX_SWAPPINESS : !swappiness)
+ /* For file type, skip the check if swappiness is anon only */
+ if (type && (swappiness == SWAPPINESS_ANON_ONLY))
+ goto done;
+
+ /* For anon type, skip the check if swappiness is zero (file only) */
+ if (!type && !swappiness)
goto done;
/* prevent cold/hot inversion if the type is evictable */
@@ -4591,8 +4621,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
- __count_memcg_events(memcg, item, isolated);
- __count_memcg_events(memcg, PGREFILL, sorted);
+ count_memcg_events(memcg, item, isolated);
+ count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
@@ -4698,7 +4728,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
retry:
- reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -4742,7 +4772,7 @@ retry:
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
- __count_memcg_events(memcg, item, reclaimed);
+ count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -5519,7 +5549,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS + 1)
+ else if (swappiness > SWAPPINESS_ANON_ONLY)
goto done;
switch (cmd) {
@@ -5576,24 +5606,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
while ((cur = strsep(&next, ",;\n"))) {
int n;
int end;
- char cmd;
+ char cmd, swap_string[5];
unsigned int memcg_id;
unsigned int nid;
unsigned long seq;
- unsigned int swappiness = -1;
+ unsigned int swappiness;
unsigned long opt = -1;
cur = skip_spaces(cur);
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
- &seq, &end, &swappiness, &end, &opt, &end);
+ n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
break;
}
+ if (n == 4) {
+ swappiness = -1;
+ } else if (!strcmp("max", swap_string)) {
+ /* set by userspace for anonymous memory only */
+ swappiness = SWAPPINESS_ANON_ONLY;
+ } else {
+ err = kstrtouint(swap_string, 0, &swappiness);
+ if (err)
+ break;
+ }
+
err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
if (err)
break;
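
With the new parsing, the optional swappiness field of the lru_gen debugfs
interface accepts either a number or the literal string "max" (read into a
4-character buffer via %4s). A hedged summary of the accepted forms, assuming
the command layout <cmd> <memcg_id> <node_id> <seq> [<swappiness>] [<opt>]
is otherwise unchanged:

/*
 * <cmd> <memcg_id> <node_id> <seq>         no swappiness given; run_cmd()
 *                                          falls back to get_swappiness()
 * <cmd> <memcg_id> <node_id> <seq> 200     numeric, parsed via kstrtouint()
 * <cmd> <memcg_id> <node_id> <seq> max     SWAPPINESS_ANON_ONLY, anon only
 */
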
@@ -5853,7 +5894,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -6684,10 +6725,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
return;
}
- if (!can_age_anon_pages(pgdat, sc))
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!can_age_anon_pages(lruvec, sc))
return;
- lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
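
Taken together, the vmscan.c changes make demotion cgroup-aware: can_demote()
now receives the owning memcg and refuses to demote when the target node fails
mem_cgroup_node_allowed() (that helper lives in memcontrol and is not part of
this diff), and shrink_folio_list() plus can_age_anon_pages() are reworked so
each caller can supply the memcg. As visible in the hunks above, the call
sites end up as follows; a NULL memcg is assumed to mean "no cgroup
restriction on the demotion target":

/*
 * shrink_inactive_list()           -> lruvec_memcg(lruvec)
 * evict_folios() (MGLRU)           -> memcg
 * reclaim_clean_pages_from_list()  -> NULL
 * reclaim_folio_list()             -> NULL
 */
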
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4c268ce39ff2..6f740f070b3d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,7 +7,7 @@
*
* zoned VM statistics
* Copyright (C) 2006 Silicon Graphics, Inc.,
- * Christoph Lameter <christoph@lameter.com>
+ * Christoph Lameter <cl@gentwo.org>
* Copyright (C) 2008-2014 Christoph Lameter
*/
#include <linux/fs.h>
@@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = {
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
+ "numa_task_migrated",
+ "numa_task_swapped",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
diff --git a/mm/workingset.c b/mm/workingset.c
index 4841ae8af411..6e7f4cb1b9a7 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -612,7 +612,6 @@ struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
- struct address_space *mapping;
struct page *page = virt_to_page(node);
/*
@@ -623,8 +622,7 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- mapping = container_of(node->array, struct address_space, i_pages);
- lockdep_assert_held(&mapping->i_pages.xa_lock);
+ lockdep_assert_held(&node->array->xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
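
The workingset.c hunk drops the container_of() round-trip because, for a
shadow node, node->array already points at the address_space's i_pages xarray,
so the two lock expressions name the same spinlock:

/*
 * &node->array->xa_lock
 *   == &container_of(node->array, struct address_space, i_pages)->i_pages.xa_lock
 *   == &mapping->i_pages.xa_lock
 */
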
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
index fa47fece2237..d3df316e5bb7 100644
--- a/mm/zpdesc.h
+++ b/mm/zpdesc.h
@@ -7,6 +7,9 @@
#ifndef __MM_ZPDESC_H__
#define __MM_ZPDESC_H__
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+
/*
* struct zpdesc - Memory descriptor for zpool memory.
* @flags: Page flags, mostly unused by zsmalloc.
@@ -51,8 +54,8 @@ struct zpdesc {
ZPDESC_MATCH(flags, flags);
ZPDESC_MATCH(lru, lru);
ZPDESC_MATCH(mapping, movable_ops);
-ZPDESC_MATCH(index, next);
-ZPDESC_MATCH(index, handle);
+ZPDESC_MATCH(__folio_index, next);
+ZPDESC_MATCH(__folio_index, handle);
ZPDESC_MATCH(private, zspage);
ZPDESC_MATCH(page_type, first_obj_offset);
ZPDESC_MATCH(_refcount, _refcount);
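
The ZPDESC_MATCH() lines follow the rename of the page/folio index member to
__folio_index; they exist to pin struct zpdesc's layout to struct page at
compile time. A sketch of what the macro is assumed to expand to (its
definition sits earlier in zpdesc.h and is not shown in this hunk):

/* assumed shape of the macro used above */
#define ZPDESC_MATCH(pg, zp)	\
	static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp))
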
diff --git a/mm/zpool.c b/mm/zpool.c
index 6d6d88930932..0a71d03369f1 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -226,20 +226,22 @@ const char *zpool_get_type(struct zpool *zpool)
* @size: The amount of memory to allocate.
* @gfp: The GFP flags to use when allocating memory.
* @handle: Pointer to the handle to set
+ * @nid: The preferred node id.
*
* This allocates the requested amount of memory from the pool.
* The gfp flags will be used when allocating memory, if the
* implementation supports it. The provided @handle will be
- * set to the allocated object handle.
+ * set to the allocated object handle. The allocation will
+ * prefer the NUMA node specified by @nid.
*
* Implementations must guarantee this to be thread-safe.
*
* Returns: 0 on success, negative value on error.
*/
int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
- unsigned long *handle)
+ unsigned long *handle, const int nid)
{
- return zpool->driver->malloc(zpool->pool, size, gfp, handle);
+ return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid);
}
/**
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d14a7e317ac8..999b513c7fdf 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -26,17 +26,10 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/cpumask.h>
-#include <linux/cpu.h>
-#include <linux/vmalloc.h>
-#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/sprintf.h>
#include <linux/shrinker.h>
@@ -44,11 +37,8 @@
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
-#include <linux/migrate.h>
-#include <linux/wait.h>
-#include <linux/pagemap.h>
#include <linux/fs.h>
-#include <linux/local_lock.h>
+#include <linux/workqueue.h>
#include "zpdesc.h"
#define ZSPAGE_MAGIC 0x58
@@ -243,9 +233,9 @@ static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
}
-static inline struct zpdesc *alloc_zpdesc(gfp_t gfp)
+static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid)
{
- struct page *page = alloc_page(gfp);
+ struct page *page = alloc_pages_node(nid, gfp, 0);
return page_zpdesc(page);
}
@@ -462,9 +452,9 @@ static void zs_zpool_destroy(void *pool)
}
static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
+ unsigned long *handle, const int nid)
{
- *handle = zs_malloc(pool, size, gfp);
+ *handle = zs_malloc(pool, size, gfp, nid);
if (IS_ERR_VALUE(*handle))
return PTR_ERR((void *)*handle);
@@ -1043,8 +1033,8 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
* Allocate a zspage for the given size class
*/
static struct zspage *alloc_zspage(struct zs_pool *pool,
- struct size_class *class,
- gfp_t gfp)
+ struct size_class *class,
+ gfp_t gfp, const int nid)
{
int i;
struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
@@ -1061,7 +1051,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
for (i = 0; i < class->pages_per_zspage; i++) {
struct zpdesc *zpdesc;
- zpdesc = alloc_zpdesc(gfp);
+ zpdesc = alloc_zpdesc(gfp, nid);
if (!zpdesc) {
while (--i >= 0) {
zpdesc_dec_zone_page_state(zpdescs[i]);
@@ -1336,12 +1326,14 @@ static unsigned long obj_malloc(struct zs_pool *pool,
* @pool: pool to allocate from
* @size: size of block to allocate
* @gfp: gfp flags when allocating object
+ * @nid: The preferred node id to allocate new zspage (if needed)
*
* On success, handle to the allocated object is returned,
* otherwise an ERR_PTR().
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
-unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
+ const int nid)
{
unsigned long handle;
struct size_class *class;
@@ -1376,7 +1368,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
spin_unlock(&class->lock);
- zspage = alloc_zspage(pool, class, gfp);
+ zspage = alloc_zspage(pool, class, gfp, nid);
if (!zspage) {
cache_free_handle(pool, handle);
return (unsigned long)ERR_PTR(-ENOMEM);
diff --git a/mm/zswap.c b/mm/zswap.c
index 204fb59da33c..455e9425c5f5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -981,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
zpool = pool->zpool;
gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
- alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
+ alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page));
if (alloc_ret)
goto unlock;
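
End-to-end, the zswap/zpool/zsmalloc hunks thread a preferred NUMA node from
the page being compressed down to the page allocator, so compressed objects
tend to land on the node of the original page. It is a preference rather than
a hard requirement, since no __GFP_THISNODE is added to the gfp mask:

/*
 * zswap_compress(page, ...)
 *   -> zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page))
 *      -> driver->malloc(), e.g. zs_zpool_malloc(..., nid)
 *         -> zs_malloc(pool, size, gfp, nid)
 *            -> alloc_zspage(pool, class, gfp, nid)   (only if a new
 *               zspage is needed)
 *               -> alloc_zpdesc(gfp, nid)
 *                  -> alloc_pages_node(nid, gfp, 0)
 */
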