Diffstat (limited to 'mm')
84 files changed, 3843 insertions, 2047 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e113f713b493..bd08e151fa1b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -469,6 +469,10 @@ config HAVE_GUP_FAST depends on MMU bool +# Enable memblock support for scratch memory which is needed for kexec handover +config MEMBLOCK_KHO_SCRATCH + bool + # Don't discard allocated memory used to track "memory" and "reserved" memblocks # after early boot, so it can still be used to test for validity of memory. # Also, memblocks are updated with memory hot(un)plug. @@ -882,7 +886,7 @@ config THP_SWAP config READ_ONLY_THP_FOR_FS bool "Read-only THP for filesystems (EXPERIMENTAL)" - depends on TRANSPARENT_HUGEPAGE && SHMEM + depends on TRANSPARENT_HUGEPAGE help Allow khugepaged to put read-only file-backed pages in THP. diff --git a/mm/Makefile b/mm/Makefile index e7f6bbf8ae5f..1a7a11d4933d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,7 +37,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ - pgtable-generic.o rmap.o vmalloc.o vma.o + pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o ifdef CONFIG_CROSS_MEMORY_ATTACH @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o show_mem.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o mmap_lock.o $(mmu-y) + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o @@ -143,13 +143,14 @@ bool cma_validate_zones(struct cma *cma) static void __init cma_activate_area(struct cma *cma) { - unsigned long pfn, end_pfn; + unsigned long pfn, end_pfn, early_pfn[CMA_MAX_RANGES]; int allocrange, r; struct cma_memrange *cmr; unsigned long bitmap_count, count; for (allocrange = 0; allocrange < cma->nranges; allocrange++) { cmr = &cma->ranges[allocrange]; + early_pfn[allocrange] = cmr->early_pfn; cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr), GFP_KERNEL); if (!cmr->bitmap) @@ -161,13 +162,13 @@ static void __init cma_activate_area(struct cma *cma) for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; - if (cmr->early_pfn != cmr->base_pfn) { - count = cmr->early_pfn - cmr->base_pfn; + if (early_pfn[r] != cmr->base_pfn) { + count = early_pfn[r] - cmr->base_pfn; bitmap_count = cma_bitmap_pages_to_bits(cma, count); bitmap_set(cmr->bitmap, 0, bitmap_count); } - for (pfn = cmr->early_pfn; pfn < cmr->base_pfn + cmr->count; + for (pfn = early_pfn[r]; pfn < cmr->base_pfn + cmr->count; pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); } @@ -193,7 +194,7 @@ cleanup: for (r = 0; r < allocrange; r++) { cmr = &cma->ranges[r]; end_pfn = cmr->base_pfn + cmr->count; - for (pfn = cmr->early_pfn; pfn < end_pfn; pfn++) + for (pfn = early_pfn[r]; pfn < end_pfn; pfn++) free_reserved_page(pfn_to_page(pfn)); } } @@ -608,7 +609,10 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, * complain. Find the boundary by adding one to the last valid * address. 
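The cma_activate_area() hunk above copies each range's early_pfn into a local array before bitmap_zalloc() runs; the struct cma_memrange change a few hunks below moves early_pfn into a union with the bitmap pointer, so storing the bitmap would otherwise overwrite the early PFN. A minimal stand-alone sketch of that hazard, with simplified types and invented values (not the kernel structures themselves):

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct cma_memrange after this change: early_pfn
 * is only meaningful until activation, so it can share storage with the
 * bitmap pointer. */
struct memrange {
	unsigned long base_pfn;
	unsigned long count;
	union {
		unsigned long early_pfn;	/* valid before the bitmap exists */
		unsigned long *bitmap;		/* valid after activation */
	};
};

int main(void)
{
	struct memrange r = { .base_pfn = 0x1000, .count = 512, .early_pfn = 0x1100 };
	unsigned long early_pfn;

	/* Stash early_pfn first: storing the bitmap pointer reuses its bytes. */
	early_pfn = r.early_pfn;
	r.bitmap = calloc((r.count + 7) / 8, 1);

	printf("pages handed out early: %lu\n", early_pfn - r.base_pfn);
	free(r.bitmap);
	return 0;
}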
*/ - highmem_start = __pa(high_memory - 1) + 1; + if (IS_ENABLED(CONFIG_HIGHMEM)) + highmem_start = __pa(high_memory - 1) + 1; + else + highmem_start = memblock_end_of_DRAM(); pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", __func__, &size, &base, &limit, &alignment); @@ -25,9 +25,11 @@ struct cma_kobject { */ struct cma_memrange { unsigned long base_pfn; - unsigned long early_pfn; unsigned long count; - unsigned long *bitmap; + union { + unsigned long early_pfn; + unsigned long *bitmap; + }; #ifdef CONFIG_CMA_DEBUGFS struct debugfs_u32_array dfs_bitmap; #endif diff --git a/mm/compaction.c b/mm/compaction.c index ca71fd3c3181..3925cb61dbb8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1001,10 +1001,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); + folio = page_folio(page); + ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages); /* - * Fail isolation in case isolate_or_dissolve_huge_page() + * Fail isolation in case isolate_or_dissolve_huge_folio() * reports an error. In case of -ENOMEM, abort right away. */ if (ret < 0) { @@ -1016,12 +1017,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail; } - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { /* * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - folio = page_folio(page); low_pfn += folio_nr_pages(folio) - 1; goto isolate_success_no_list; } @@ -2249,15 +2249,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) static unsigned int fragmentation_score_wmark(bool low) { - unsigned int wmark_low; + unsigned int wmark_low, leeway; - /* - * Cap the low watermark to avoid excessive compaction - * activity in case a user sets the proactiveness tunable - * close to 100 (maximum). - */ - wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); - return low ? wmark_low : min(wmark_low + 10, 100U); + wmark_low = 100U - sysctl_compaction_proactiveness; + leeway = min(10U, wmark_low / 2); + return low ? wmark_low : min(wmark_low + leeway, 100U); } static bool should_proactive_compact_node(pg_data_t *pgdat) @@ -2348,7 +2344,6 @@ static enum compact_result __compact_finished(struct compact_control *cc) ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; - bool claim_block; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) @@ -2364,8 +2359,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * Job done if allocation would steal freepages from * other migratetype buddy lists. */ - if (find_suitable_fallback(area, order, migratetype, - true, &claim_block) != -1) + if (find_suitable_fallback(area, order, migratetype, true) >= 0) /* * Movable pages are OK in any pageblock. If we are * stealing for a non-movable allocation, make sure diff --git a/mm/damon/core.c b/mm/damon/core.c index f0c1676f0599..0bb71e2ab713 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1392,6 +1392,19 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) } /* + * Warn and fix corrupted ->nr_accesses[_bp] for investigations and preventing + * the problem being propagated. 
+ */ +static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r) +{ + if (r->nr_accesses_bp == r->nr_accesses * 10000) + return; + WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n", + r->nr_accesses_bp, r->nr_accesses); + r->nr_accesses_bp = r->nr_accesses * 10000; +} + +/* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ static void kdamond_reset_aggregated(struct damon_ctx *c) @@ -1404,6 +1417,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_region(r, t) { trace_damon_aggregated(ti, r, damon_nr_regions(t)); + damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } @@ -1889,6 +1903,29 @@ static inline u64 damos_get_some_mem_psi_total(void) #endif /* CONFIG_PSI */ +#ifdef CONFIG_NUMA +static __kernel_ulong_t damos_get_node_mem_bp( + struct damos_quota_goal *goal) +{ + struct sysinfo i; + __kernel_ulong_t numerator; + + si_meminfo_node(&i, goal->nid); + if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP) + numerator = i.totalram - i.freeram; + else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */ + numerator = i.freeram; + return numerator * 10000 / i.totalram; +} +#else +static __kernel_ulong_t damos_get_node_mem_bp( + struct damos_quota_goal *goal) +{ + return 0; +} +#endif + + static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) { u64 now_psi_total; @@ -1902,6 +1939,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = now_psi_total - goal->last_psi_total; goal->last_psi_total = now_psi_total; break; + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + goal->current_value = damos_get_node_mem_bp(goal); + break; default: break; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 1b70d3f36046..e8464f7e0014 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -548,7 +548,6 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, unsigned long *sz_filter_passed) { unsigned long addr; - LIST_HEAD(folio_list); struct folio *folio; if (!damon_pa_scheme_has_filter(s)) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 23b562df0839..0f6c9e1fec0b 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -465,7 +465,8 @@ static ssize_t memcg_path_store(struct kobject *kobj, { struct damon_sysfs_scheme_filter *filter = container_of(kobj, struct damon_sysfs_scheme_filter, kobj); - char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL); + char *path = kmalloc_array(size_add(count, 1), sizeof(*path), + GFP_KERNEL); if (!path) return -ENOMEM; @@ -936,12 +937,15 @@ struct damos_sysfs_quota_goal { enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; + int nid; }; -/* This should match with enum damos_action */ +/* This should match with enum damos_quota_goal_metric */ static const char * const damos_sysfs_quota_goal_metric_strs[] = { "user_input", "some_mem_psi_us", + "node_mem_used_bp", + "node_mem_free_bp", }; static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) @@ -1014,6 +1018,28 @@ static ssize_t current_value_store(struct kobject *kobj, return err ? 
err : count; } +static ssize_t nid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + + /* todo: return error if the goal is not using nid */ + + return sysfs_emit(buf, "%d\n", goal->nid); +} + +static ssize_t nid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, struct + damos_sysfs_quota_goal, kobj); + int err = kstrtoint(buf, 0, &goal->nid); + + /* feed callback should check existence of this file and read value */ + return err ? err : count; +} + static void damos_sysfs_quota_goal_release(struct kobject *kobj) { /* or, notify this release to the feed callback */ @@ -1029,10 +1055,14 @@ static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr = static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr = __ATTR_RW_MODE(current_value, 0600); +static struct kobj_attribute damos_sysfs_quota_goal_nid_attr = + __ATTR_RW_MODE(nid, 0600); + static struct attribute *damos_sysfs_quota_goal_attrs[] = { &damos_sysfs_quota_goal_target_metric_attr.attr, &damos_sysfs_quota_goal_target_value_attr.attr, &damos_sysfs_quota_goal_current_value_attr.attr, + &damos_sysfs_quota_goal_nid_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damos_sysfs_quota_goal); @@ -2035,7 +2065,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) if (!memcg_path) return -EINVAL; - path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL); + path = kmalloc_array(PATH_MAX, sizeof(*path), GFP_KERNEL); if (!path) return -ENOMEM; @@ -2120,8 +2150,17 @@ static int damos_sysfs_add_quota_score( sysfs_goal->target_value); if (!goal) return -ENOMEM; - if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT) + switch (sysfs_goal->metric) { + case DAMOS_QUOTA_USER_INPUT: goal->current_value = sysfs_goal->current_value; + break; + case DAMOS_QUOTA_NODE_MEM_USED_BP: + case DAMOS_QUOTA_NODE_MEM_FREE_BP: + goal->nid = sysfs_goal->nid; + break; + default: + break; + } damos_add_quota_goal(quota, goal); } return 0; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index be0fea9ee5fc..298c67557fae 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -510,6 +510,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test) damon_feed_loop_next_input(last_input, 2000)); } +static void damon_test_set_filters_default_reject(struct kunit *test) +{ + struct damos scheme; + struct damos_filter *target_filter, *anon_filter; + + INIT_LIST_HEAD(&scheme.filters); + INIT_LIST_HEAD(&scheme.ops_filters); + + damos_set_filters_default_reject(&scheme); + /* + * No filter is installed. Allow by default on both core and ops layer + * filtering stages, since there are no filters at all. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true); + damos_add_filter(&scheme, target_filter); + damos_set_filters_default_reject(&scheme); + /* + * A core-handled allow-filter is installed. + * Rejct by default on core layer filtering stage due to the last + * core-layer-filter's behavior. + * Allow by default on ops layer filtering stage due to the absence of + * ops layer filters. 
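The new DAMOS_QUOTA_NODE_MEM_USED_BP and DAMOS_QUOTA_NODE_MEM_FREE_BP goal metrics above (and the nid sysfs file that feeds them) express per-node memory usage in basis points: 10000 equals 100% of the node's memory as reported by si_meminfo_node(). A stand-alone illustration of the arithmetic with invented page counts (not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long totalram = 4UL << 20;	/* pretend the node has 4M pages */
	unsigned long freeram = 1UL << 20;	/* 1M of them are free */
	unsigned long used_bp = (totalram - freeram) * 10000 / totalram;
	unsigned long free_bp = freeram * 10000 / totalram;

	/* 7500 bp used, 2500 bp free; the two always sum to 10000. */
	printf("used %lu bp, free %lu bp\n", used_bp, free_bp);
	return 0;
}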
+ */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + target_filter->allow = false; + damos_set_filters_default_reject(&scheme); + /* + * A core-handled reject-filter is installed. + * Allow by default on core layer filtering stage due to the last + * core-layer-filter's behavior. + * Allow by default on ops layer filtering stage due to the absence of + * ops layer filters. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true); + damos_add_filter(&scheme, anon_filter); + + damos_set_filters_default_reject(&scheme); + /* + * A core-handled reject-filter and ops-handled allow-filter are installed. + * Allow by default on core layer filtering stage due to the existence + * of the ops-handled filter. + * Reject by default on ops layer filtering stage due to the last + * ops-layer-filter's behavior. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); + + target_filter->allow = true; + damos_set_filters_default_reject(&scheme); + /* + * A core-handled allow-filter and ops-handled allow-filter are + * installed. + * Allow by default on core layer filtering stage due to the existence + * of the ops-handled filter. + * Reject by default on ops layer filtering stage due to the last + * ops-layer-filter's behavior. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -527,6 +596,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), + KUNIT_CASE(damon_test_set_filters_default_reject), {}, }; diff --git a/mm/debug.c b/mm/debug.c index db83e381a8ae..907382257062 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -71,10 +71,12 @@ static void __dump_folio(struct folio *folio, struct page *page, unsigned long pfn, unsigned long idx) { struct address_space *mapping = folio_mapping(folio); - int mapcount = atomic_read(&page->_mapcount); + int mapcount = atomic_read(&page->_mapcount) + 1; char *type = ""; - mapcount = page_mapcount_is_type(mapcount) ? 
0 : mapcount + 1; + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index d46acf989dde..6a26eca546c3 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -23,7 +23,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) unsigned long res; if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) { - pr_err("Bad debug_guardpage_minorder value\n"); + pr_err("Bad debug_guardpage_minorder value: %s\n", buf); return 0; } _debug_guardpage_minorder = res; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bc748f700a9e..7731b238b534 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -910,26 +910,18 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) #ifdef CONFIG_HUGETLB_PAGE static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { - struct page *page; pte_t pte; pr_debug("Validating HugeTLB basic\n"); - /* - * Accessing the page associated with the pfn is safe here, - * as it was previously derived from a real kernel symbol. - */ - page = pfn_to_page(args->fixed_pmd_pfn); - pte = mk_huge_pte(page, args->page_prot); + pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); + pte = arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS); +#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB + WARN_ON(!pte_huge(pte)); +#endif WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte))); WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte)))); WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte)))); - -#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB - pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); - - WARN_ON(!pte_huge(arch_make_huge_pte(pte, PMD_SHIFT, VM_ACCESS_FLAGS))); -#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } #else /* !CONFIG_HUGETLB_PAGE */ static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } diff --git a/mm/dmapool.c b/mm/dmapool.c index f0bfc6c490f4..5be8cc1c6529 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -56,6 +56,7 @@ struct dma_pool { /* the pool */ unsigned int size; unsigned int allocation; unsigned int boundary; + int node; char name[32]; struct list_head pools; }; @@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, /** - * dma_pool_create - Creates a pool of consistent memory blocks, for dma. + * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma. * @name: name of pool, for diagnostics * @dev: device that will be doing the DMA * @size: size of the blocks in this pool. * @align: alignment requirement for blocks; must be a power of two * @boundary: returned blocks won't cross this power of two boundary + * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on * Context: not in_interrupt() * * Given one of these pools, dma_pool_alloc() @@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block, * Return: a dma allocation pool with the requested characteristics, or * %NULL if one can't be created. 
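dma_pool_create() grows into dma_pool_create_node() with an extra NUMA node argument, and the pool/page bookkeeping is then allocated with kzalloc_node()/kmalloc_node() on that node. A hedged usage sketch for a driver; the pool name, sizes and the dev_to_node() choice are illustrative, and the dma_pool_create() compatibility wrapper (presumably passing NUMA_NO_NODE) is not part of this hunk:

#include <linux/device.h>
#include <linux/dmapool.h>

/* Illustrative only: keep the pool's metadata on the device's NUMA node. */
static struct dma_pool *example_create_pool(struct device *dev)
{
	return dma_pool_create_node("example-pool", dev,
				    256,	/* block size */
				    64,		/* alignment */
				    0,		/* no boundary crossing limit */
				    dev_to_node(dev));
}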
*/ -struct dma_pool *dma_pool_create(const char *name, struct device *dev, - size_t size, size_t align, size_t boundary) +struct dma_pool *dma_pool_create_node(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary, int node) { struct dma_pool *retval; size_t allocation; @@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, boundary = min(boundary, allocation); - retval = kzalloc(sizeof(*retval), GFP_KERNEL); + retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node); if (!retval) return retval; @@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; + retval->node = node; INIT_LIST_HEAD(&retval->pools); /* @@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, mutex_unlock(&pools_reg_lock); return retval; } -EXPORT_SYMBOL(dma_pool_create); +EXPORT_SYMBOL(dma_pool_create_node); static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) { @@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) { struct dma_page *page; - page = kmalloc(sizeof(*page), mem_flags); + page = kmalloc_node(sizeof(*page), mem_flags, pool->node); if (!page) return NULL; diff --git a/mm/execmem.c b/mm/execmem.c index e6c4f5076ca8..9720ac2dfa41 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -254,6 +254,34 @@ out_unlock: return ptr; } +static bool execmem_cache_rox = false; + +void execmem_cache_make_ro(void) +{ + struct maple_tree *free_areas = &execmem_cache.free_areas; + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + MA_STATE(mas_free, free_areas, 0, ULONG_MAX); + MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); + struct mutex *mutex = &execmem_cache.mutex; + void *area; + + execmem_cache_rox = true; + + mutex_lock(mutex); + + mas_for_each(&mas_free, area, ULONG_MAX) { + unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT; + set_memory_ro(mas_free.index, pages); + } + + mas_for_each(&mas_busy, area, ULONG_MAX) { + unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT; + set_memory_ro(mas_busy.index, pages); + } + + mutex_unlock(mutex); +} + static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; @@ -274,9 +302,15 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - err = set_memory_rox((unsigned long)p, vm->nr_pages); - if (err) - goto err_free_mem; + if (execmem_cache_rox) { + err = set_memory_rox((unsigned long)p, vm->nr_pages); + if (err) + goto err_free_mem; + } else { + err = set_memory_x((unsigned long)p, vm->nr_pages); + if (err) + goto err_free_mem; + } err = execmem_cache_add(p, alloc_size); if (err) @@ -377,6 +411,8 @@ void *execmem_alloc(enum execmem_type type, size_t size) pgprot_t pgprot = range->pgprot; void *p; + size = PAGE_ALIGN(size); + if (use_cache) p = execmem_cache_alloc(range, size); else diff --git a/mm/filemap.c b/mm/filemap.c index 7b90cbeb4a1a..09d005848f0d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3533,7 +3533,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); - vm_fault_t ret = do_set_pmd(vmf, page); + vm_fault_t ret = 
do_set_pmd(vmf, folio, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); @@ -26,6 +26,7 @@ #include <asm/tlbflush.h> #include "internal.h" +#include "swap.h" struct follow_page_context { struct dev_pagemap *pgmap; @@ -844,11 +845,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, pte_t *ptep, pte; int ret; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return ERR_PTR(-EINVAL); - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags, address); @@ -1106,10 +1102,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; - if (address > TASK_SIZE) - pgd = pgd_offset_k(address); - else - pgd = pgd_offset_gate(mm, address); + pgd = pgd_offset(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); @@ -1432,7 +1425,11 @@ static long __get_user_pages(struct mm_struct *mm, start = untagged_addr_remote(mm, start); - VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET)); do { struct page *page; @@ -2114,28 +2111,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, */ size_t fault_in_writeable(char __user *uaddr, size_t size) { - char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; if (unlikely(size == 0)) return 0; if (!user_write_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_put_user(0, uaddr, out); - uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_put_user(0, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_put_user(0, (char __user *)cur, out); out: user_write_access_end(); - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); @@ -2189,26 +2180,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable); */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { - unsigned long start = (unsigned long)uaddr, end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; - end = PAGE_ALIGN(start + size); - if (end < start) - end = 0; mmap_read_lock(mm); - do { - if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) + /* Stop once we overflow to 0. 
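The rewritten fault_in_writeable()/fault_in_safe_writeable()/fault_in_readable() helpers all share one loop shape: touch the first (possibly unaligned) address as-is, then step with PAGE_ALIGN_DOWN(cur + PAGE_SIZE) so every later probe lands on a page boundary, and stop either at end or when cur wraps to 0 at the top of the address space. A stand-alone sketch of the resulting address sequence, assuming a 4 KiB page size (user-space C, not the kernel helpers):

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PAGE_ALIGN_DOWN(x)	((x) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long start = 0x12345678UL;		/* deliberately unaligned */
	unsigned long end = start + 3 * PAGE_SIZE;	/* size of three pages */
	unsigned long cur;

	/* Mirrors the loop shape: one probe per page, stop on end or on wrap. */
	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
		printf("probe %#lx\n", cur);	/* ...5678, ...6000, ...7000, ...8000 */
	return 0;
}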
*/ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked)) break; - start = (start + PAGE_SIZE) & PAGE_MASK; - } while (start != end); mmap_read_unlock(mm); - if (size > start - (unsigned long)uaddr) - return size - (start - (unsigned long)uaddr); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); @@ -2223,30 +2212,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable); */ size_t fault_in_readable(const char __user *uaddr, size_t size) { - const char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_get_user(c, uaddr, out); - uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_get_user(c, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_get_user(c, (const char __user *)cur, out); out: user_read_access_end(); (void)c; - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_readable); @@ -3173,46 +3156,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, return 1; } -static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - int refs; - struct page *page; - struct folio *folio; - - if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) - return 0; - - BUILD_BUG_ON(pgd_devmap(orig)); - - page = pgd_page(orig); - refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); - - folio = try_grab_folio_fast(page, refs, flags); - if (!folio) - return 0; - - if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!gup_fast_folio_allowed(folio, flags)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - *nr += refs; - folio_set_referenced(folio); - return 1; -} - static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) @@ -3307,12 +3250,9 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; - if (unlikely(pgd_leaf(pgd))) { - if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags, - pages, nr)) - return; - } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, - pages, nr)) + BUILD_BUG_ON(pgd_leaf(pgd)); + if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, + pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -3647,7 +3587,7 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, { unsigned int flags, nr_folios, nr_found; unsigned int i, pgshift = PAGE_SHIFT; - pgoff_t start_idx, end_idx, next_idx; + pgoff_t start_idx, end_idx; struct folio *folio = NULL; struct folio_batch fbatch; struct hstate *h; @@ -3697,20 +3637,8 @@ long memfd_pin_folios(struct file *memfd, 
loff_t start, loff_t end, folio = NULL; } - next_idx = 0; for (i = 0; i < nr_found; i++) { - /* - * As there can be multiple entries for a - * given folio in the batch returned by - * filemap_get_folios_contig(), the below - * check is to ensure that we pin and return a - * unique set of folios between start and end. - */ - if (next_idx && - next_idx != folio_index(fbatch.folios[i])) - continue; - - folio = page_folio(&fbatch.folios[i]->page); + folio = fbatch.folios[i]; if (try_grab_folio(folio, 1, FOLL_PIN)) { folio_batch_release(&fbatch); @@ -3722,7 +3650,6 @@ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, *offset = offset_in_folio(folio, start); folios[nr_folios] = folio; - next_idx = folio_next_index(folio); if (++nr_folios == max_folios) break; } @@ -10,6 +10,7 @@ */ #include <linux/pagewalk.h> #include <linux/hmm.h> +#include <linux/hmm-dma.h> #include <linux/init.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -23,6 +24,7 @@ #include <linux/sched/mm.h> #include <linux/jump_label.h> #include <linux/dma-mapping.h> +#include <linux/pci-p2pdma.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> @@ -39,13 +41,21 @@ enum { HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, }; +enum { + /* These flags are carried from input-to-output */ + HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | + HMM_PFN_P2PDMA_BUS, +}; + static int hmm_pfns_fill(unsigned long addr, unsigned long end, struct hmm_range *range, unsigned long cpu_flags) { unsigned long i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) - range->hmm_pfns[i] = cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= cpu_flags; + } return 0; } @@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long cpu_flags; pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; + uint64_t new_pfn_flags = 0; if (pte_none_mostly(pte)) { required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *hmm_pfn = 0; - return 0; + goto out; } if (!pte_present(pte)) { @@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; - return 0; + new_pfn_flags = swp_offset_pfn(entry) | cpu_flags; + goto out; } required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); - if (!required_fault) { - *hmm_pfn = 0; - return 0; - } + if (!required_fault) + goto out; if (!non_swap_entry(entry)) goto fault; @@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_unmap(ptep); return -EFAULT; } - *hmm_pfn = HMM_PFN_ERROR; - return 0; + new_pfn_flags = HMM_PFN_ERROR; + goto out; } - *hmm_pfn = pte_pfn(pte) | cpu_flags; + new_pfn_flags = pte_pfn(pte) | cpu_flags; +out: + *hmm_pfn = (*hmm_pfn & 
HMM_PFN_INOUT_FLAGS) | new_pfn_flags; return 0; fault: @@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; i < npages; ++i, ++pfn) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } goto out_unlock; } @@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->hmm_pfns[i] = pfn | cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) { + range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + range->hmm_pfns[i] |= pfn | cpu_flags; + } spin_unlock(ptl); return 0; @@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range) return ret; } EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_dma_map_alloc - Allocate HMM map structure + * @dev: device to allocate structure for + * @map: HMM map to allocate + * @nr_entries: number of entries in the map + * @dma_entry_size: size of the DMA entry in the map + * + * Allocate the HMM map structure and all the lists it contains. + * Return 0 on success, -ENOMEM on failure. + */ +int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, + size_t nr_entries, size_t dma_entry_size) +{ + bool dma_need_sync = false; + bool use_iova; + + WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size)); + + /* + * The HMM API violates our normal DMA buffer ownership rules and can't + * transfer buffer ownership. The dma_addressing_limited() check is a + * best approximation to ensure no swiotlb buffering happens. + */ +#ifdef CONFIG_DMA_NEED_SYNC + dma_need_sync = !dev->dma_skip_sync; +#endif /* CONFIG_DMA_NEED_SYNC */ + if (dma_need_sync || dma_addressing_limited(dev)) + return -EOPNOTSUPP; + + map->dma_entry_size = dma_entry_size; + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) + return -ENOMEM; + + use_iova = dma_iova_try_alloc(dev, &map->state, 0, + nr_entries * PAGE_SIZE); + if (!use_iova && dma_need_unmap(dev)) { + map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->dma_list) + goto err_dma; + } + return 0; + +err_dma: + kvfree(map->pfn_list); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(hmm_dma_map_alloc); + +/** + * hmm_dma_map_free - iFree HMM map structure + * @dev: device to free structure from + * @map: HMM map containing the various lists and state + * + * Free the HMM map structure and all the lists it contains. + */ +void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map) +{ + if (dma_use_iova(&map->state)) + dma_iova_free(dev, &map->state); + kvfree(map->pfn_list); + kvfree(map->dma_list); +} +EXPORT_SYMBOL_GPL(hmm_dma_map_free); + +/** + * hmm_dma_map_pfn - Map a physical HMM page to DMA address + * @dev: Device to map the page for + * @map: HMM map + * @idx: Index into the PFN and dma address arrays + * @p2pdma_state: PCI P2P state. + * + * dma_alloc_iova() allocates IOVA based on the size specified by their use in + * iova->size. Call this function after IOVA allocation to link whole @page + * to get the DMA address. Note that very first call to this function + * will have @offset set to 0 in the IOVA space allocated from + * dma_alloc_iova(). 
For subsequent calls to this function on same @iova, + * @offset needs to be advanced by the caller with the size of previous + * page that was linked + DMA address returned for the previous page that was + * linked by this function. + */ +dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, + size_t idx, + struct pci_p2pdma_map_state *p2pdma_state) +{ + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + struct page *page = hmm_pfn_to_page(pfns[idx]); + phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]); + size_t offset = idx * map->dma_entry_size; + unsigned long attrs = 0; + dma_addr_t dma_addr; + int ret; + + if ((pfns[idx] & HMM_PFN_DMA_MAPPED) && + !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) { + /* + * We are in this flow when there is a need to resync flags, + * for example when page was already linked in prefetch call + * with READ flag and now we need to add WRITE flag + * + * This page was already programmed to HW and we don't want/need + * to unlink and link it again just to resync flags. + */ + if (dma_use_iova(state)) + return state->addr + offset; + + /* + * Without dma_need_unmap, the dma_addrs array is NULL, thus we + * need to regenerate the address below even if there already + * was a mapping. But !dma_need_unmap implies that the + * mapping stateless, so this is fine. + */ + if (dma_need_unmap(dev)) + return dma_addrs[idx]; + + /* Continue to remapping */ + } + + switch (pci_p2pdma_state(p2pdma_state, dev, page)) { + case PCI_P2PDMA_MAP_NONE: + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + pfns[idx] |= HMM_PFN_P2PDMA; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; + return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + default: + return DMA_MAPPING_ERROR; + } + + if (dma_use_iova(state)) { + ret = dma_iova_link(dev, state, paddr, offset, + map->dma_entry_size, DMA_BIDIRECTIONAL, + attrs); + if (ret) + goto error; + + ret = dma_iova_sync(dev, state, offset, map->dma_entry_size); + if (ret) { + dma_iova_unlink(dev, state, offset, map->dma_entry_size, + DMA_BIDIRECTIONAL, attrs); + goto error; + } + + dma_addr = state->addr + offset; + } else { + if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs)) + goto error; + + dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, dma_addr)) + goto error; + + if (dma_need_unmap(dev)) + dma_addrs[idx] = dma_addr; + } + pfns[idx] |= HMM_PFN_DMA_MAPPED; + return dma_addr; +error: + pfns[idx] &= ~HMM_PFN_P2PDMA; + return DMA_MAPPING_ERROR; + +} +EXPORT_SYMBOL_GPL(hmm_dma_map_pfn); + +/** + * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address + * @dev: Device to unmap the page from + * @map: HMM map + * @idx: Index of the PFN to unmap + * + * Returns true if the PFN was mapped and has been unmapped, false otherwise. 
+ */ +bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx) +{ + const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED; + struct dma_iova_state *state = &map->state; + dma_addr_t *dma_addrs = map->dma_list; + unsigned long *pfns = map->pfn_list; + unsigned long attrs = 0; + + if ((pfns[idx] & valid_dma) != valid_dma) + return false; + + if (pfns[idx] & HMM_PFN_P2PDMA_BUS) + ; /* no need to unmap bus address P2P mappings */ + else if (dma_use_iova(state)) { + if (pfns[idx] & HMM_PFN_P2PDMA) + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + dma_iova_unlink(dev, state, idx * map->dma_entry_size, + map->dma_entry_size, DMA_BIDIRECTIONAL, attrs); + } else if (dma_need_unmap(dev)) + dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size, + DMA_BIDIRECTIONAL); + + pfns[idx] &= + ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS); + return true; +} +EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2a47682d1ab7..d3e66136e41a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1203,7 +1203,7 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, { pmd_t entry; - entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); @@ -1309,8 +1309,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, struct folio *zero_folio) { pmd_t entry; - entry = mk_pmd(&zero_folio->page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); + entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); @@ -1456,7 +1455,8 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) return VM_FAULT_OOM; } - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); + ptl = pmd_lock(vma->vm_mm, vmf->pmd); error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); @@ -1578,7 +1578,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); ptl = pud_lock(vma->vm_mm, vmf->pud); insert_pfn_pud(vma, addr, vmf->pud, pfn, write); @@ -1786,7 +1786,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); - __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); + __split_huge_pmd(src_vma, src_pmd, addr, false); return -EAGAIN; } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -2008,7 +2008,7 @@ unlock_fallback: folio_unlock(folio); spin_unlock(vmf->ptl); fallback: - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -2260,6 +2260,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PMD_NR); + + /* + * Use flush_needed to indicate whether the PMD entry + * is present, instead of checking pmd_present() again. 
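Taken together, the hmm_dma_* helpers above suggest a common driver flow: allocate the map, run hmm_range_fault() over map->pfn_list, DMA-map each entry, and tear everything down in reverse. The sketch below is a hedged outline of that flow rather than code from this patch; pointing range->hmm_pfns at map->pfn_list is an assumption about intended usage, and the mmap locking plus mmu_notifier retry loop around hmm_range_fault() are omitted for brevity:

#include <linux/dma-mapping.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>

/* Illustrative driver-side flow for nr_pages pages described by @range. */
static int example_map_range(struct device *dev, struct hmm_range *range,
			     struct hmm_dma_map *map, size_t nr_pages)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	size_t i;
	int ret;

	ret = hmm_dma_map_alloc(dev, map, nr_pages, PAGE_SIZE);
	if (ret)
		return ret;

	range->hmm_pfns = map->pfn_list;	/* assumed: map's pfn_list backs the range */
	ret = hmm_range_fault(range);		/* locking/retry omitted for brevity */
	if (ret)
		goto err_free;

	for (i = 0; i < nr_pages; i++) {
		dma_addr_t addr = hmm_dma_map_pfn(dev, map, i, &p2pdma_state);

		if (addr == DMA_MAPPING_ERROR) {
			ret = -EIO;
			goto err_unmap;
		}
		/* ... program addr into the device's page tables ... */
	}
	return 0;

err_unmap:
	while (i--)
		hmm_dma_unmap_pfn(dev, map, i);
err_free:
	hmm_dma_map_free(dev, map);
	return ret;
}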
+ */ + if (flush_needed && pmd_young(orig_pmd) && + likely(vma_has_recency(vma))) + folio_mark_accessed(folio); } spin_unlock(ptl); @@ -2653,12 +2661,12 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); - _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); + _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); } else { src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); - _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); + _dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot); } set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); @@ -3073,28 +3081,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, bool freeze, struct folio *folio) + pmd_t *pmd, bool freeze) { - VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio)); VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); - VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); - VM_BUG_ON(freeze && !folio); - - /* - * When the caller requests to set up a migration entry, we - * require a folio to check the PMD against. Otherwise, there - * is a risk of replacing the wrong folio. - */ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || - is_pmd_migration_entry(*pmd)) { - if (folio && folio != pmd_folio(*pmd)) - return; + is_pmd_migration_entry(*pmd)) __split_huge_pmd_locked(vma, pmd, address, freeze); - } } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct folio *folio) + unsigned long address, bool freeze) { spinlock_t *ptl; struct mmu_notifier_range range; @@ -3104,20 +3100,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); - split_huge_pmd_locked(vma, range.start, pmd, freeze, folio); + split_huge_pmd_locked(vma, range.start, pmd, freeze); spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, - bool freeze, struct folio *folio) + bool freeze) { pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); if (!pmd) return; - __split_huge_pmd(vma, pmd, address, freeze, folio); + __split_huge_pmd(vma, pmd, address, freeze); } static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) @@ -3129,7 +3125,7 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), ALIGN(address, HPAGE_PMD_SIZE))) - split_huge_pmd_address(vma, address, false, NULL); + split_huge_pmd_address(vma, address, false); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3641,7 +3637,7 @@ after_split: * requires taking the lru_lock so we do the put_page * of the tail pages after the split is complete. 
*/ - free_page_and_swap_cache(&new_folio->page); + free_folio_and_swap_cache(new_folio); } return ret; } @@ -4675,7 +4671,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); folio_get(folio); - pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); + pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e3e6ac991b9c..32ab14aa4074 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -58,6 +58,7 @@ int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +__initdata nodemask_t hugetlb_bootmem_nodes; __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; @@ -1250,7 +1251,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. - * This function should be only used by move_vma() and operate on + * This function should be only used by mremap and operate on * same sized vma. It should never come here with last ref on the * reservation. */ @@ -1950,7 +1951,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; - bool retry = true; /* * By default we always try hard to allocate the folio with @@ -1965,22 +1965,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); -retry: - folio = __folio_alloc(gfp_mask, order, nid, nmask); - /* Ensure hugetlb folio won't have large_rmappable flag set. */ - if (folio) - folio_clear_large_rmappable(folio); - if (folio && !folio_ref_freeze(folio, 1)) { - folio_put(folio); - if (retry) { /* retry once */ - retry = false; - goto retry; - } - /* WOW! twice in a row. */ - pr_warn("HugeTLB unexpected inflated folio ref count\n"); - folio = NULL; - } + folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a @@ -2419,7 +2405,6 @@ static int gather_surplus_pages(struct hstate *h, long delta) long i; long needed, allocated; bool alloc_ok = true; - int node; nodemask_t *mbind_nodemask, alloc_nodemask; mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); @@ -2443,21 +2428,12 @@ retry: for (i = 0; i < needed; i++) { folio = NULL; - /* Prioritize current node */ - if (node_isset(numa_mem_id(), alloc_nodemask)) - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - numa_mem_id(), NULL); - - if (!folio) { - for_each_node_mask(node, alloc_nodemask) { - if (node == numa_mem_id()) - continue; - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - node, NULL); - if (folio) - break; - } - } + /* + * It is okay to use NUMA_NO_NODE because we use numa_mem_id() + * down the road to pick the current node if that is the case. 
+ */ + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + NUMA_NO_NODE, &alloc_nodemask); if (!folio) { alloc_ok = false; break; @@ -2896,10 +2872,9 @@ free_new: return ret; } -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { struct hstate *h; - struct folio *folio = page_folio(page); int ret = -EBUSY; /* @@ -2949,12 +2924,20 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) while (start_pfn < end_pfn) { folio = pfn_folio(start_pfn); + + /* + * The folio might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + */ + spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { h = folio_hstate(folio); } else { + spin_unlock_irq(&hugetlb_lock); start_pfn++; continue; } + spin_unlock_irq(&hugetlb_lock); if (!folio_ref_count(folio)) { ret = alloc_and_dissolve_hugetlb_folio(h, folio, @@ -3010,7 +2993,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long retval, gbl_chg; + long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; @@ -3163,8 +3146,16 @@ out_uncharge_cgroup_reservation: hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg) - hugepage_subpool_put_pages(spool, 1); + /* + * put page to subpool iff the quota of subpool's rsv_hpages is used + * during hugepage_subpool_get_pages. + */ + if (map_chg && !gbl_chg) { + gbl_reserve = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -gbl_reserve); + } + + out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); @@ -3237,7 +3228,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, + &hugetlb_bootmem_nodes) { m = alloc_bootmem(h, node, false); if (!m) return 0; @@ -3701,6 +3693,15 @@ static void __init hugetlb_init_hstates(void) struct hstate *h, *h2; for_each_hstate(h) { + /* + * Always reset to first_memory_node here, even if + * next_nid_to_alloc was set before - we can't + * reference hugetlb_bootmem_nodes after init, and + * first_memory_node is right for all further allocations. 
+ */ + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; + /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -4034,10 +4035,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, list_for_each_entry_safe(folio, next, src_list, lru) { int i; + bool cma; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; + cma = folio_test_hugetlb_cma(folio); + list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); @@ -4053,6 +4057,9 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, new_folio->mapping = NULL; init_new_hugetlb_folio(dst, new_folio); + /* Copy the CMA flag so that it is freed correctly */ + if (cma) + folio_set_hugetlb_cma(new_folio); list_add(&new_folio->lru, &dst_list); } } @@ -5007,6 +5014,20 @@ static int __init default_hugepagesz_setup(char *s) } hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); +void __init hugetlb_bootmem_set_nodes(void) +{ + int i, nid; + unsigned long start_pfn, end_pfn; + + if (!nodes_empty(hugetlb_bootmem_nodes)) + return; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + if (end_pfn > start_pfn) + node_set(nid, hugetlb_bootmem_nodes); + } +} + static bool __hugetlb_bootmem_allocated __initdata; bool __init hugetlb_bootmem_allocated(void) @@ -5022,6 +5043,8 @@ void __init hugetlb_bootmem_alloc(void) if (__hugetlb_bootmem_allocated) return; + hugetlb_bootmem_set_nodes(); + for (i = 0; i < MAX_NUMNODES; i++) INIT_LIST_HEAD(&huge_boot_pages[i]); @@ -5029,7 +5052,6 @@ void __init hugetlb_bootmem_alloc(void) for_each_hstate(h) { h->next_nid_to_alloc = first_online_node; - h->next_nid_to_free = first_online_node; if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -5458,18 +5480,16 @@ const struct vm_operations_struct hugetlb_vm_ops = { .pagesize = hugetlb_vm_op_pagesize, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, +static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, bool try_mkwrite) { - pte_t entry; + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); unsigned int shift = huge_page_shift(hstate_vma(vma)); if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { - entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, - vma->vm_page_prot))); + entry = pte_mkwrite_novma(pte_mkdirty(entry)); } else { - entry = huge_pte_wrprotect(mk_huge_pte(page, - vma->vm_page_prot)); + entry = pte_wrprotect(entry); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); @@ -5524,7 +5544,7 @@ static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, true); + pte_t newpte = make_huge_pte(vma, new_folio, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); @@ -5828,14 +5848,14 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page, zap_flags_t zap_flags) + struct folio *folio, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; + const bool folio_provided = !!folio; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; - struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = 
huge_page_size(h); bool adjust_reservation = false; @@ -5899,14 +5919,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, continue; } - page = pte_page(pte); /* - * If a reference page is supplied, it is because a specific - * page is being unmapped, not a range. Ensure the page we - * are about to unmap is the actual page of interest. + * If a folio is supplied, it is because a specific + * folio is being unmapped, not a range. Ensure the folio we + * are about to unmap is the actual folio of interest. */ - if (ref_page) { - if (page != ref_page) { + if (folio_provided) { + if (folio != page_folio(pte_page(pte))) { spin_unlock(ptl); continue; } @@ -5916,12 +5935,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * looking like data was lost */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } else { + folio = page_folio(pte_page(pte)); } pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) - set_page_dirty(page); + folio_mark_dirty(folio); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) @@ -5929,7 +5950,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); - hugetlb_remove_rmap(page_folio(page)); + hugetlb_remove_rmap(folio); /* * Restore the reservation for anonymous page, otherwise the @@ -5938,8 +5959,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * reservation bit. */ if (!h->surplus_huge_pages && __vma_private_lock(vma) && - folio_test_anon(page_folio(page))) { - folio_set_hugetlb_restore_reserve(page_folio(page)); + folio_test_anon(folio)) { + folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } @@ -5963,16 +5984,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * count will not be incremented by free_huge_folio. * Act as if we consumed the reservation. */ - folio_clear_hugetlb_restore_reserve(page_folio(page)); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) vma_add_reservation(h, vma, address); } - tlb_remove_page_size(tlb, page, huge_page_size(h)); + tlb_remove_page_size(tlb, folio_page(folio, 0), + folio_size(folio)); /* - * Bail out after unmapping reference page if supplied + * If we were instructed to unmap a specific folio, we're done. */ - if (ref_page) + if (folio_provided) break; } tlb_end_vma(tlb, vma); @@ -6034,7 +6056,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page, + unsigned long end, struct folio *folio, zap_flags_t zap_flags) { struct mmu_notifier_range range; @@ -6046,7 +6068,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); - __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + __unmap_hugepage_range(&tlb, vma, start, end, + folio, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); @@ -6059,7 +6082,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * same region. 
*/ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) + struct folio *folio, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -6103,7 +6126,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, - address + huge_page_size(h), page, 0); + address + huge_page_size(h), + folio, 0); } i_mmap_unlock_write(mapping); } @@ -6226,8 +6250,7 @@ retry_avoidcopy: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, - vmf->address); + unmap_ref_private(mm, vma, old_folio, vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); @@ -6274,7 +6297,7 @@ retry_avoidcopy: spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); + pte_t newpte = make_huge_pte(vma, new_folio, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); @@ -6554,7 +6577,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); + new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. @@ -7039,7 +7062,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - _dst_pte = make_huge_pte(dst_vma, &folio->page, + _dst_pte = make_huge_pte(dst_vma, folio, !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be @@ -7233,7 +7256,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg = -1, add = -1; + long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; @@ -7368,8 +7391,16 @@ bool hugetlb_reserve_pages(struct inode *inode, return true; out_put_pages: - /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + spool_resv = chg - gbl_reserve; + if (spool_resv) { + /* put sub pool's reservation back, chg - gbl_reserve */ + gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); + /* + * subpool's reserved pages can not be put back due to race, + * return to hstate. + */ + hugetlb_acct_memory(h, -gbl_resv); + } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); @@ -7909,3 +7940,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ALIGN_DOWN(vma->vm_end, PUD_SIZE)); } + +/* + * For hugetlb, mremap() is an odd edge case - while the VMA copying is + * performed, we permit both the old and new VMAs to reference the same + * reservation. + * + * We fix this up after the operation succeeds, or if a newly allocated VMA + * is closed as a result of a failure to allocate memory. 
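Stepping back to the hugetlb_reserve_pages() error path earlier in this hunk: instead of returning the full charge to the subpool, only chg - gbl_reserve is handed back, and whatever the subpool cannot re-absorb is uncharged from the hstate. The toy model below only illustrates that split of the undo between a sub-pool and a global counter; the real accounting (min_hpages, surplus pages, races) is more involved, and every name in the sketch is invented.

#include <stdio.h>

/* Toy subpool: keeps up to min_resv pages of its own reservation. */
struct subpool { long min_resv, held; };

/*
 * Return 'delta' pages to the subpool. It keeps what it can (up to
 * min_resv) and reports how many pages the global pool must absorb,
 * loosely mirroring hugepage_subpool_put_pages().
 */
static long subpool_put_pages(struct subpool *spool, long delta)
{
        long keep = spool->min_resv - spool->held;

        if (keep > delta)
                keep = delta;
        spool->held += keep;
        return delta - keep;    /* remainder goes to the global counter */
}

int main(void)
{
        struct subpool spool = { .min_resv = 4, .held = 1 };
        long chg = 10, gbl_reserve = 6;         /* example values */
        long global_resv = gbl_reserve;         /* already charged globally */

        /* Error path: undo the reservation. */
        long spool_resv = chg - gbl_reserve;
        if (spool_resv) {
                long gbl_resv = subpool_put_pages(&spool, spool_resv);

                /* analogous to hugetlb_acct_memory(h, -gbl_resv) */
                global_resv -= gbl_resv;
        }
        printf("subpool now holds %ld, global reservation now %ld\n",
               spool.held, global_resv);
        return 0;
}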
+ */ +void fixup_hugetlb_reservations(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + clear_vma_resv_huge_pages(vma); +} diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index e0f2d5c3a84c..f58ef4969e7a 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -66,7 +66,7 @@ hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact) if (node_exact) return NULL; - for_each_online_node(node) { + for_each_node_mask(node, hugetlb_bootmem_nodes) { cma = hugetlb_cma[node]; if (!cma || node == *nid) continue; @@ -153,11 +153,13 @@ void __init hugetlb_cma_reserve(int order) if (!hugetlb_cma_size) return; + hugetlb_bootmem_set_nodes(); + for (nid = 0; nid < MAX_NUMNODES; nid++) { if (hugetlb_cma_size_in_node[nid] == 0) continue; - if (!node_online(nid)) { + if (!node_isset(nid, hugetlb_bootmem_nodes)) { pr_warn("hugetlb_cma: invalid node %d specified\n", nid); hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; hugetlb_cma_size_in_node[nid] = 0; @@ -190,13 +192,14 @@ void __init hugetlb_cma_reserve(int order) * If 3 GB area is requested on a machine with 4 numa nodes, * let's allocate 1 GB on first three nodes and ignore the last one. */ - per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); + per_node = DIV_ROUND_UP(hugetlb_cma_size, + nodes_weight(hugetlb_bootmem_nodes)); pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", hugetlb_cma_size / SZ_1M, per_node / SZ_1M); } reserved = 0; - for_each_online_node(nid) { + for_each_node_mask(nid, hugetlb_bootmem_nodes) { int res; char name[CMA_MAX_NAME]; diff --git a/mm/internal.h b/mm/internal.h index e9695baa5922..6b8ed2017743 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -248,11 +248,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { - unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); - const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; bool writable, young, dirty; - int nr; + int nr, cur_nr; if (any_writable) *any_writable = false; @@ -265,11 +263,15 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */ + max_nr = min_t(unsigned long, max_nr, + folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); + nr = pte_batch_hint(start_ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); ptep = start_ptep + nr; - while (ptep < end_ptep) { + while (nr < max_nr) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); @@ -282,14 +284,6 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (!pte_same(pte, expected_pte)) break; - /* - * Stop immediately once we reached the end of the folio. In - * corner cases the next PFN might fall into a different - * folio. 
- */ - if (pte_pfn(pte) >= folio_end_pfn) - break; - if (any_writable) *any_writable |= writable; if (any_young) @@ -297,12 +291,13 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (any_dirty) *any_dirty |= dirty; - nr = pte_batch_hint(ptep, pte); - expected_pte = pte_advance_pfn(expected_pte, nr); - ptep += nr; + cur_nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, cur_nr); + ptep += cur_nr; + nr += cur_nr; } - return min(ptep - start_ptep, max_nr); + return min(nr, max_nr); } /** @@ -435,6 +430,9 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long size, struct zap_details *details); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); @@ -915,7 +913,7 @@ static inline void init_cma_pageblock(struct page *page) int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool claim_only, bool *claim_block); + int migratetype, bool claimable); static inline bool free_area_empty(struct free_area *area, int migratetype) { @@ -1121,6 +1119,8 @@ DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void init_deferred_page(unsigned long pfn, int nid); + enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, @@ -1595,7 +1595,6 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); -void unaccepted_cleanup_work(struct work_struct *work); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { @@ -1625,5 +1624,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, } #endif /* CONFIG_PT_RECLAIM */ +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); #endif /* __MM_INTERNAL_H */ diff --git a/mm/io-mapping.c b/mm/io-mapping.c index 01b362799930..d3586e95c12c 100644 --- a/mm/io-mapping.c +++ b/mm/io-mapping.c @@ -21,9 +21,10 @@ int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) return -EINVAL; - /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ - return remap_pfn_range_notrack(vma, addr, pfn, size, - __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK))); + pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | + (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); + + /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. 
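The folio_pte_batch() rework above clamps max_nr to the PFNs left in the folio before entering the loop and then accumulates the batch count, rather than re-checking the folio end on every iteration. A standalone sketch of the same batching idea over a plain PFN array; pte_batch and its parameters are illustrative, not kernel API.

#include <stdio.h>

/*
 * Count how many consecutive entries, starting at index 0, map
 * consecutive PFNs -- but never walk past the end of the "folio",
 * i.e. past folio_start_pfn + folio_nr_pages.
 */
static int pte_batch(const unsigned long *pfns, int max_nr,
                     unsigned long folio_start_pfn,
                     unsigned long folio_nr_pages)
{
        unsigned long expected = pfns[0];
        unsigned long remaining = folio_start_pfn + folio_nr_pages - pfns[0];
        int nr = 0;

        /* Clamp up front: the batch can never exceed the folio. */
        if ((unsigned long)max_nr > remaining)
                max_nr = (int)remaining;

        while (nr < max_nr && pfns[nr] == expected) {
                nr++;
                expected++;
        }
        return nr;
}

int main(void)
{
        /* PFNs 100..103 belong to a 4-page folio starting at PFN 100. */
        unsigned long pfns[] = { 100, 101, 102, 103, 104, 105 };

        /* Even though 6 entries are contiguous, the batch stops at 4. */
        printf("batched %d entries\n", pte_batch(pfns, 6, 100, 4));
        return 0;
}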
*/ + return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot); } EXPORT_SYMBOL_GPL(io_mapping_map_user); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 88d1c9dcb507..d2c70cd2afb1 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start, { } +struct vmalloc_populate_data { + unsigned long start; + struct page **pages; +}; + static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, - void *unused) + void *_data) { - unsigned long page; + struct vmalloc_populate_data *data = _data; + struct page *page; pte_t pte; + int index; if (likely(!pte_none(ptep_get(ptep)))) return 0; - page = __get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); - pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); + index = PFN_DOWN(addr - data->start); + page = data->pages[index]; + __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE); + pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); if (likely(pte_none(ptep_get(ptep)))) { set_pte_at(&init_mm, addr, ptep, pte); - page = 0; + data->pages[index] = NULL; } spin_unlock(&init_mm.page_table_lock); - if (page) - free_page(page); + + return 0; +} + +static void ___free_pages_bulk(struct page **pages, int nr_pages) +{ + int i; + + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + __free_pages(pages[i], 0); + pages[i] = NULL; + } + } +} + +static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +{ + unsigned long nr_populated, nr_total = nr_pages; + struct page **page_array = pages; + + while (nr_pages) { + nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + if (!nr_populated) { + ___free_pages_bulk(page_array, nr_total - nr_pages); + return -ENOMEM; + } + pages += nr_populated; + nr_pages -= nr_populated; + } + return 0; } +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +{ + unsigned long nr_pages, nr_total = PFN_UP(end - start); + struct vmalloc_populate_data data; + int ret = 0; + + data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!data.pages) + return -ENOMEM; + + while (nr_total) { + nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0])); + ret = ___alloc_pages_bulk(data.pages, nr_pages); + if (ret) + break; + + data.start = start; + ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, + kasan_populate_vmalloc_pte, &data); + ___free_pages_bulk(data.pages, nr_pages); + if (ret) + break; + + start += nr_pages * PAGE_SIZE; + nr_total -= nr_pages; + } + + free_page((unsigned long)data.pages); + + return ret; +} + int kasan_populate_vmalloc(unsigned long addr, unsigned long size) { unsigned long shadow_start, shadow_end; @@ -348,9 +414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = apply_to_page_range(&init_mm, shadow_start, - shadow_end - shadow_start, - kasan_populate_vmalloc_pte, NULL); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end); if (ret) return ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cc945c6ab3bd..cdf5a581368b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -696,13 +696,13 @@ next: result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, writable, 
result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); - trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, referenced, writable, result); return result; } @@ -746,7 +746,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, ptep_clear(vma->vm_mm, address, _pte); folio_remove_rmap_pte(src, src_page, vma); spin_unlock(ptl); - free_page_and_swap_cache(src_page); + free_folio_and_swap_cache(src); } } @@ -1239,7 +1239,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); + _pmd = folio_mk_pmd(folio, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); @@ -1435,7 +1435,7 @@ out_unmap: *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced, none_or_zero, result, unmapped); return result; } @@ -1464,10 +1464,9 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) } } -#ifdef CONFIG_SHMEM -/* hpage must be locked, and mmap_lock must be held */ +/* folio must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmdp, struct page *hpage) + pmd_t *pmdp, struct folio *folio, struct page *page) { struct vm_fault vmf = { .vma = vma, @@ -1476,13 +1475,12 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, .pmd = pmdp, }; - VM_BUG_ON(!PageTransHuge(hpage)); mmap_assert_locked(vma->vm_mm); - if (do_set_pmd(&vmf, hpage)) + if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; - get_page(hpage); + folio_get(folio); return SCAN_SUCCEED; } @@ -1689,7 +1687,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd - ? set_huge_pmd(vma, haddr, pmd, &folio->page) + ? 
set_huge_pmd(vma, haddr, pmd, folio, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: @@ -2354,14 +2352,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } -#else -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) -{ - BUILD_BUG(); -} -#endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) @@ -2437,7 +2427,7 @@ skip: VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); @@ -2783,7 +2773,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c12cef3eeb32..da9cee34ee1b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -210,13 +210,11 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static int kmemleak_enabled = 1; +static int kmemleak_enabled __read_mostly = 1; /* same as above but only for the kmemleak_free() callback */ -static int kmemleak_free_enabled = 1; +static int kmemleak_free_enabled __read_mostly = 1; /* set in the late_initcall if there were no errors */ static int kmemleak_late_initialized; -/* set if a kmemleak warning was issued */ -static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ static int kmemleak_error; @@ -254,7 +252,6 @@ static void kmemleak_disable(void); #define kmemleak_warn(x...) do { \ pr_warn(x); \ dump_stack(); \ - kmemleak_warning = 1; \ } while (0) /* @@ -325,8 +322,6 @@ static void hex_dump_object(struct seq_file *seq, * sufficient references to it (count >= min_count) * - black - ignore, it doesn't contain references (e.g. text section) * (min_count == -1). No function defined for this color. - * Newly created objects don't have any color assigned (object->count == -1) - * before the next memory scan when they become white. */ static bool color_white(const struct kmemleak_object *object) { diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index a495debf1436..1ea711786c52 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -159,8 +159,8 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) * Make sure we have enough spare bits in @id to hold the UAF bit and * the chain depth. */ - BUILD_BUG_ON( - (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1)); + BUILD_BUG_ON((1 << STACK_DEPOT_EXTRA_BITS) <= + (KMSAN_MAX_ORIGIN_DEPTH << 1)); extra_bits = stack_depot_get_extra_bits(id); depth = kmsan_depth_from_eb(extra_bits); @@ -274,11 +274,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, * bytes before, report them. 
*/ if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = 0; cur_off_start = -1; @@ -292,11 +290,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, * poisoned bytes before, report them. */ if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos + i - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = 0; cur_off_start = -1; @@ -312,11 +308,9 @@ void kmsan_internal_check_memory(void *addr, size_t size, */ if (cur_origin != new_origin) { if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos + i - 1, user_addr, reason); - kmsan_leave_runtime(); } cur_origin = new_origin; cur_off_start = pos + i; @@ -326,10 +320,8 @@ void kmsan_internal_check_memory(void *addr, size_t size, } KMSAN_WARN_ON(pos != size); if (cur_origin) { - kmsan_enter_runtime(); kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, user_addr, reason); - kmsan_leave_runtime(); } } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3df45c25c1f6..97de3d6194f0 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -114,9 +114,7 @@ void kmsan_kfree_large(const void *ptr) kmsan_enter_runtime(); page = virt_to_head_page((void *)ptr); KMSAN_WARN_ON(ptr != page_address(page)); - kmsan_internal_poison_memory((void *)ptr, - page_size(page), - GFP_KERNEL, + kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } @@ -277,8 +275,10 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, * Don't check anything, just copy the shadow of the copied * bytes. */ + kmsan_enter_runtime(); kmsan_internal_memmove_metadata((void *)to, (void *)from, to_copy - left); + kmsan_leave_runtime(); } user_access_restore(ua_flags); } diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index 10f52c085e6c..b14ce3417e65 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -35,8 +35,7 @@ static void __init kmsan_record_future_shadow_range(void *start, void *end) KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES); KMSAN_WARN_ON((nstart >= nend) || /* Virtual address 0 is valid on s390. 
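For context on the kmsan_internal_check_memory() hunks above, which drop the explicit kmsan_enter/leave_runtime() pairs now that kmsan_report() enters the runtime itself: the surrounding loop coalesces consecutive poisoned bytes that share an origin into a single report. A standalone sketch of that coalescing pattern, with origins modelled as plain integers and report()/check_memory() invented for the example.

#include <stdio.h>
#include <stddef.h>

/* 0 means "initialized"; any other value is a poison origin id. */
static void report(int origin, size_t first, size_t last)
{
        printf("origin %d: bytes %zu..%zu are uninitialized\n",
               origin, first, last);
}

/* Emit one report per maximal run of equally-originated poisoned bytes. */
static void check_memory(const int *origin, size_t size)
{
        int cur_origin = 0;
        size_t cur_start = 0;

        for (size_t i = 0; i < size; i++) {
                if (origin[i] != cur_origin) {
                        if (cur_origin)
                                report(cur_origin, cur_start, i - 1);
                        cur_origin = origin[i];
                        cur_start = i;
                }
        }
        if (cur_origin)
                report(cur_origin, cur_start, size - 1);
}

int main(void)
{
        int origins[] = { 0, 7, 7, 7, 0, 0, 9, 9 };

        check_memory(origins, sizeof(origins) / sizeof(origins[0]));
        return 0;
}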
*/ - (!IS_ENABLED(CONFIG_S390) && !nstart) || - !nend); + (!IS_ENABLED(CONFIG_S390) && !nstart) || !nend); nstart = ALIGN_DOWN(nstart, PAGE_SIZE); nend = ALIGN(nend, PAGE_SIZE); diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 02a405e55d6c..69f0a57a401c 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -312,13 +312,9 @@ EXPORT_SYMBOL(__msan_unpoison_alloca); void __msan_warning(u32 origin); void __msan_warning(u32 origin) { - if (!kmsan_enabled || kmsan_in_runtime()) - return; - kmsan_enter_runtime(); kmsan_report(origin, /*address*/ NULL, /*size*/ 0, /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ NULL, REASON_ANY); - kmsan_leave_runtime(); } EXPORT_SYMBOL(__msan_warning); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 29555a8bc315..bc3d1810f352 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -121,7 +121,6 @@ static __always_inline void kmsan_leave_runtime(void) KMSAN_WARN_ON(--ctx->kmsan_in_runtime); } -depot_stack_handle_t kmsan_save_stack(void); depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, unsigned int extra_bits); diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c index 94a3303fb65e..d6853ce08954 100644 --- a/mm/kmsan/report.c +++ b/mm/kmsan/report.c @@ -157,14 +157,14 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, unsigned long ua_flags; bool is_uaf; - if (!kmsan_enabled) + if (!kmsan_enabled || kmsan_in_runtime()) return; if (current->kmsan_ctx.depth) return; if (!origin) return; - kmsan_disable_current(); + kmsan_enter_runtime(); ua_flags = user_access_save(); raw_spin_lock(&kmsan_report_lock); pr_err("=====================================================\n"); @@ -217,5 +217,5 @@ void kmsan_report(depot_stack_handle_t origin, void *address, int size, if (panic_on_kmsan) panic("kmsan.panic set ...\n"); user_access_restore(ua_flags); - kmsan_enable_current(); + kmsan_leave_runtime(); } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 1bb505a08415..54f3c3c962f0 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -207,8 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order) if (!kmsan_enabled || kmsan_in_runtime()) return; kmsan_enter_runtime(); - kmsan_internal_poison_memory(page_address(page), - page_size(page), + kmsan_internal_poison_memory(page_address(page), page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); @@ -248,17 +247,19 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, s_pages, page_shift); + kmsan_leave_runtime(); if (mapped) { err = mapped; goto ret; } + kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, o_pages, page_shift); + kmsan_leave_runtime(); if (mapped) { err = mapped; goto ret; } - kmsan_leave_runtime(); flush_tlb_kernel_range(shadow_start, shadow_end); flush_tlb_kernel_range(origin_start, origin_end); flush_cache_vmap(shadow_start, shadow_end); diff --git a/mm/maccess.c b/mm/maccess.c index 8f0906180a94..831b4dd7296c 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -196,7 +196,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, if (ret >= count) { ret = count; dst[ret - 1] = '\0'; - } else if (ret > 0) { + } else if (ret >= 0) { ret++; } diff --git a/mm/madvise.c b/mm/madvise.c index b17f684322ad..8433ac9b27e0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -48,6 +48,11 @@ struct 
madvise_walk_private { bool pageout; }; +struct madvise_behavior { + int behavior; + struct mmu_gather *tlb; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need @@ -794,12 +799,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static int madvise_free_single_vma(struct vm_area_struct *vma, +static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; - struct mmu_gather tlb; + struct mmu_gather *tlb = madv_behavior->tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) @@ -815,17 +821,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); - tlb_start_vma(&tlb, vma); + tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, - &madvise_free_walk_ops, &tlb); - tlb_end_vma(&tlb, vma); + &madvise_free_walk_ops, tlb); + tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); - return 0; } @@ -848,7 +851,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ -static long madvise_dontneed_single_vma(struct vm_area_struct *vma, +static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct zap_details details = { @@ -856,7 +860,8 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, .even_cows = true, }; - zap_page_range_single(vma, start, end - start, &details); + zap_page_range_single_batched( + madv_behavior->tlb, vma, start, end - start, &details); return 0; } @@ -893,8 +898,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - int behavior) + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct mm_struct *mm = vma->vm_mm; *prev = vma; @@ -946,9 +952,10 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) - return madvise_dontneed_single_vma(vma, start, end); + return madvise_dontneed_single_vma( + madv_behavior, vma, start, end); else if (behavior == MADV_FREE) - return madvise_free_single_vma(vma, start, end); + return madvise_free_single_vma(madv_behavior, vma, start, end); else return -EINVAL; } @@ -1249,8 +1256,10 @@ static long madvise_guard_remove(struct vm_area_struct *vma, static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long behavior) + void *behavior_arg) { + struct madvise_behavior *arg = behavior_arg; + int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; @@ -1270,7 +1279,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, behavior); + 
return madvise_dontneed_free(vma, prev, start, end, arg); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1487,10 +1496,10 @@ static bool process_madvise_remote_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, + unsigned long end, void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) + unsigned long end, void *arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; @@ -1548,7 +1557,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long anon_name) + void *anon_name) { int error; @@ -1557,7 +1566,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (struct anon_vma_name *)anon_name); + anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1589,7 +1598,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + return madvise_walk_vmas(mm, start, end, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ @@ -1619,6 +1628,31 @@ static void madvise_unlock(struct mm_struct *mm, int behavior) mmap_read_unlock(mm); } +static bool madvise_batch_tlb_flush(int behavior) +{ + switch (behavior) { + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_FREE: + return true; + default: + return false; + } +} + +static void madvise_init_tlb(struct madvise_behavior *madv_behavior, + struct mm_struct *mm) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_gather_mmu(madv_behavior->tlb, mm); +} + +static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_finish_mmu(madv_behavior->tlb); +} + static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) { size_t len; @@ -1677,8 +1711,10 @@ static bool is_madvise_populate(int behavior) } static int madvise_do_behavior(struct mm_struct *mm, - unsigned long start, size_t len_in, int behavior) + unsigned long start, size_t len_in, + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct blk_plug plug; unsigned long end; int error; @@ -1692,7 +1728,7 @@ static int madvise_do_behavior(struct mm_struct *mm, if (is_madvise_populate(behavior)) error = madvise_populate(mm, start, end, behavior); else - error = madvise_walk_vmas(mm, start, end, behavior, + error = madvise_walk_vmas(mm, start, end, madv_behavior, madvise_vma_behavior); blk_finish_plug(&plug); return error; @@ -1773,13 +1809,20 @@ static int madvise_do_behavior(struct mm_struct *mm, int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { int error; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; if (madvise_should_skip(start, len_in, behavior, &error)) return error; error = madvise_lock(mm, behavior); if (error) return error; - error = madvise_do_behavior(mm, start, len_in, behavior); + madvise_init_tlb(&madv_behavior, mm); + error = madvise_do_behavior(mm, start, len_in, &madv_behavior); + madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, 
behavior); return error; @@ -1796,12 +1839,18 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, { ssize_t ret = 0; size_t total_len; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; total_len = iov_iter_count(iter); ret = madvise_lock(mm, behavior); if (ret) return ret; + madvise_init_tlb(&madv_behavior, mm); while (iov_iter_count(iter)) { unsigned long start = (unsigned long)iter_iov_addr(iter); @@ -1811,7 +1860,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, if (madvise_should_skip(start, len_in, behavior, &error)) ret = error; else - ret = madvise_do_behavior(mm, start, len_in, behavior); + ret = madvise_do_behavior(mm, start, len_in, + &madv_behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat @@ -1829,14 +1879,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, } /* Drop and reacquire lock to unwind race. */ + madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); madvise_lock(mm, behavior); + madvise_init_tlb(&madv_behavior, mm); continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } + madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); ret = (total_len - iov_iter_count(iter)) ? : ret; diff --git a/mm/memblock.c b/mm/memblock.c index 0a53db4d9f7b..154f1d73b61f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -18,6 +18,11 @@ #include <linux/memblock.h> #include <linux/mutex.h> +#ifdef CONFIG_KEXEC_HANDOVER +#include <linux/libfdt.h> +#include <linux/kexec_handover.h> +#endif /* CONFIG_KEXEC_HANDOVER */ + #include <asm/sections.h> #include <linux/io.h> @@ -107,6 +112,13 @@ unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */ +static bool kho_scratch_only; +#else +#define kho_scratch_only false +#endif + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -166,6 +178,10 @@ bool __init_memblock memblock_has_mirror(void) static enum memblock_flags __init_memblock choose_memblock_flags(void) { + /* skip non-scratch memory for kho early boot allocations */ + if (kho_scratch_only) + return MEMBLOCK_KHO_SCRATCH; + return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -457,7 +473,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, min(new_area_start, memblock.current_limit), new_alloc_size, PAGE_SIZE); - new_array = addr ? __va(addr) : NULL; + if (addr) { + /* The memory may not have been accepted, yet. 
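Returning to the madvise changes above: a single mmu_gather is now threaded through madvise_init_tlb()/madvise_finish_tlb(), so MADV_DONTNEED/MADV_FREE work across a whole request (or iovec) flushes the TLB once rather than per VMA. A self-contained sketch of that batch-then-flush pattern; struct gather here is just a counter, not the kernel's mmu_gather, and all helper names are invented.

#include <stdio.h>

/* Toy stand-in for mmu_gather: just counts pending invalidations. */
struct gather { unsigned long pending; int flushes; };

static void gather_init(struct gather *g)   { g->pending = 0; g->flushes = 0; }
static void gather_add(struct gather *g, unsigned long pages) { g->pending += pages; }

static void gather_finish(struct gather *g)
{
        if (g->pending) {
                g->flushes++;           /* one TLB flush for the whole batch */
                g->pending = 0;
        }
}

/* Per-VMA work only records what must be invalidated. */
static void zap_vma(struct gather *g, unsigned long pages)
{
        gather_add(g, pages);
}

int main(void)
{
        struct gather g;
        unsigned long vmas[] = { 16, 4, 32 };

        gather_init(&g);                /* like madvise_init_tlb() */
        for (int i = 0; i < 3; i++)
                zap_vma(&g, vmas[i]);   /* like madvise_dontneed_single_vma() */
        gather_finish(&g);              /* like madvise_finish_tlb() */

        printf("3 VMAs zapped with %d TLB flush(es)\n", g.flushes);
        return 0;
}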
*/ + accept_memory(addr, new_alloc_size); + + new_array = __va(addr); + } else { + new_array = NULL; + } } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", @@ -492,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, * needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_alloc_size)); + BUG_ON(memblock_reserve_kern(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; @@ -642,7 +665,7 @@ repeat: #ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif - WARN_ON(flags != rgn->flags); + WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags); nr_new++; if (insert) { if (start_rgn == -1) @@ -902,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size, + int nid, enum memblock_flags flags) { phys_addr_t end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, - &base, &end, (void *)_RET_IP_); + memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, + &base, &end, nid, flags, (void *)_RET_IP_); - return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -924,6 +948,40 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) } #endif +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +__init void memblock_set_kho_scratch_only(void) +{ + kho_scratch_only = true; +} + +__init void memblock_clear_kho_scratch_only(void) +{ + kho_scratch_only = false; +} + +__init void memmap_init_kho_scratch_pages(void) +{ + phys_addr_t start, end; + unsigned long pfn; + int nid; + u64 i; + + if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) + return; + + /* + * Initialize struct pages for free scratch memory. + * The struct pages for reserved scratch memory will be set up in + * reserve_bootmem_region() + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { + for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++) + init_deferred_page(pfn, nid); + } +} +#endif + /** * memblock_setclr_flag - set or clear flag for a memory region * @type: memblock type to set/clear flag for @@ -1049,6 +1107,36 @@ int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t MEMBLOCK_RSRV_NOINIT); } +/** + * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered + * for allocations during early boot with kexec handover. + * + * Return: 0 on success, -errno on failure. + */ +__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 1, + MEMBLOCK_KHO_SCRATCH); +} + +/** + * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a + * specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. 
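memblock_set_kho_scratch_only() above flips a global flag that makes choose_memblock_flags() request MEMBLOCK_KHO_SCRATCH, and the allocator then skips any region without that flag (see should_skip_region() further below). A minimal sketch of that allocate-only-from-flagged-regions idea, using invented region structures and names.

#include <stdbool.h>
#include <stdio.h>

#define FLAG_SCRATCH 0x1        /* stands in for MEMBLOCK_KHO_SCRATCH */

struct region { unsigned long base, size; unsigned flags; };

static bool scratch_only;       /* stands in for kho_scratch_only */

/* Pick the first region large enough, honouring the scratch-only mode. */
static const struct region *find_region(const struct region *r, int n,
                                        unsigned long size)
{
        for (int i = 0; i < n; i++) {
                if (scratch_only && !(r[i].flags & FLAG_SCRATCH))
                        continue;       /* skip non-scratch memory */
                if (r[i].size >= size)
                        return &r[i];
        }
        return NULL;
}

int main(void)
{
        const struct region mem[] = {
                { 0x1000000, 0x800000, 0 },
                { 0x4000000, 0x400000, FLAG_SCRATCH },
        };
        const struct region *r;

        scratch_only = true;    /* early boot with kexec handover */
        r = find_region(mem, 2, 0x100000);
        printf("scratch-only alloc from base 0x%lx\n", r ? r->base : 0UL);

        scratch_only = false;   /* after memblock_clear_kho_scratch_only() */
        r = find_region(mem, 2, 0x100000);
        printf("normal alloc from base 0x%lx\n", r ? r->base : 0UL);
        return 0;
}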
+ */ +__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 0, + MEMBLOCK_KHO_SCRATCH); +} + static bool should_skip_region(struct memblock_type *type, struct memblock_region *m, int nid, int flags) @@ -1080,6 +1168,13 @@ static bool should_skip_region(struct memblock_type *type, if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) return true; + /* + * In early alloc during kexec handover, we can only consider + * MEMBLOCK_KHO_SCRATCH regions for the allocations + */ + if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m)) + return true; + return false; } @@ -1460,14 +1555,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); - if (found && !memblock_reserve(found, size)) + if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN)) goto done; if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); - if (found && !memblock_reserve(found, size)) + if (found && !memblock_reserve_kern(found, size)) goto done; } @@ -1752,6 +1847,28 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } +phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid) +{ + struct memblock_region *r; + phys_addr_t total = 0; + + for_each_reserved_mem_region(r) { + phys_addr_t size = r->size; + + if (r->base > limit) + break; + + if (r->base + r->size > limit) + size = limit - r->base; + + if (nid == memblock_get_region_node(r) || !numa_valid_node(nid)) + if (r->flags & MEMBLOCK_RSRV_KERN) + total += size; + } + + return total; +} + /** * memblock_estimated_nr_free_pages - return estimated number of free pages * from memblock point of view @@ -2183,11 +2300,14 @@ static void __init memmap_init_reserved_pages(void) struct memblock_region *region; phys_addr_t start, end; int nid; + unsigned long max_reserved; /* * set nid on all reserved pages and also treat struct * pages for the NOMAP regions as PageReserved */ +repeat: + max_reserved = memblock.reserved.max; for_each_mem_region(region) { nid = memblock_get_region_node(region); start = region->base; @@ -2196,8 +2316,15 @@ static void __init memmap_init_reserved_pages(void) if (memblock_is_nomap(region)) reserve_bootmem_region(start, end, nid); - memblock_set_node(start, end, &memblock.reserved, nid); + memblock_set_node(start, region->size, &memblock.reserved, nid); } + /* + * 'max' is changed means memblock.reserved has been doubled its + * array, which may result a new reserved region before current + * 'start'. Now we should repeat the procedure to set its node id. 
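Also in this memblock hunk, memblock_reserved_kern_size() walks the reserved regions, clips each one at a physical limit and sums only those carrying MEMBLOCK_RSRV_KERN for the requested node. A standalone sketch of that clipped, flag- and node-filtered summation with a toy region array; none of these types are the real memblock ones.

#include <stdio.h>

#define RSRV_KERN 0x1           /* stands in for MEMBLOCK_RSRV_KERN */
#define NID_ANY   (-1)          /* stands in for NUMA_NO_NODE */

struct rsv { unsigned long base, size; int nid; unsigned flags; };

static unsigned long reserved_kern_size(const struct rsv *r, int n,
                                        unsigned long limit, int nid)
{
        unsigned long total = 0;

        for (int i = 0; i < n; i++) {
                unsigned long size = r[i].size;

                if (r[i].base > limit)
                        break;                  /* regions are sorted by base */
                if (r[i].base + r[i].size > limit)
                        size = limit - r[i].base;       /* clip at the limit */

                if ((nid == r[i].nid || nid == NID_ANY) &&
                    (r[i].flags & RSRV_KERN))
                        total += size;
        }
        return total;
}

int main(void)
{
        const struct rsv regions[] = {
                { 0x100000, 0x100000, 0, RSRV_KERN },
                { 0x300000, 0x200000, 1, RSRV_KERN },
                { 0x600000, 0x100000, 0, 0 },   /* not kernel-reserved */
        };

        printf("node 0 kernel reservations below 4M: 0x%lx bytes\n",
               reserved_kern_size(regions, 3, 0x400000, 0));
        printf("all nodes, no limit clip:           0x%lx bytes\n",
               reserved_kern_size(regions, 3, ~0UL, NID_ANY));
        return 0;
}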
+ */ + if (max_reserved != memblock.reserved.max) + goto repeat; /* * initialize struct pages for reserved regions that don't have @@ -2272,6 +2399,7 @@ void __init memblock_free_all(void) free_unused_memmap(); reset_all_zones_managed_pages(); + memblock_clear_kho_scratch_only(); pages = free_low_memory_core_early(); totalram_pages_add(pages); } @@ -2369,6 +2497,189 @@ int reserve_mem_release_by_name(const char *name) return 1; } +#ifdef CONFIG_KEXEC_HANDOVER +#define MEMBLOCK_KHO_FDT "memblock" +#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" +#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" +static struct page *kho_fdt; + +static int reserve_mem_kho_finalize(struct kho_serialization *ser) +{ + int err = 0, i; + + for (i = 0; i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= kho_preserve_phys(map->start, map->size); + } + + err |= kho_preserve_folio(page_folio(kho_fdt)); + err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt)); + + return notifier_from_errno(err); +} + +static int reserve_mem_kho_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + switch (cmd) { + case KEXEC_KHO_FINALIZE: + return reserve_mem_kho_finalize((struct kho_serialization *)v); + case KEXEC_KHO_ABORT: + return NOTIFY_DONE; + default: + return NOTIFY_BAD; + } +} + +static struct notifier_block reserve_mem_kho_nb = { + .notifier_call = reserve_mem_kho_notifier, +}; + +static int __init prepare_kho_fdt(void) +{ + int err = 0, i; + void *fdt; + + kho_fdt = alloc_page(GFP_KERNEL); + if (!kho_fdt) + return -ENOMEM; + + fdt = page_to_virt(kho_fdt); + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); + for (i = 0; i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= fdt_begin_node(fdt, map->name); + err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); + err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); + err |= fdt_property(fdt, "size", &map->size, sizeof(map->size)); + err |= fdt_end_node(fdt); + } + err |= fdt_end_node(fdt); + + err |= fdt_finish(fdt); + + if (err) { + pr_err("failed to prepare memblock FDT for KHO: %d\n", err); + put_page(kho_fdt); + kho_fdt = NULL; + } + + return err; +} + +static int __init reserve_mem_init(void) +{ + int err; + + if (!kho_is_enabled() || !reserved_mem_count) + return 0; + + err = prepare_kho_fdt(); + if (err) + return err; + + err = register_kho_notifier(&reserve_mem_kho_nb); + if (err) { + put_page(kho_fdt); + kho_fdt = NULL; + } + + return err; +} +late_initcall(reserve_mem_init); + +static void *__init reserve_mem_kho_retrieve_fdt(void) +{ + phys_addr_t fdt_phys; + static void *fdt; + int err; + + if (fdt) + return fdt; + + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + if (err) { + if (err != -ENOENT) + pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", + MEMBLOCK_KHO_FDT, err); + return NULL; + } + + fdt = phys_to_virt(fdt_phys); + + err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("FDT '%s' is incompatible with '%s': %d\n", + MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err); + fdt = NULL; + } + + return fdt; +} + +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + int err, len_start, len_size, offset; + const phys_addr_t *p_start, *p_size; 
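prepare_kho_fdt() above serializes the reserve_mem table into a one-page flattened device tree via libfdt's sequential-write API, and reserve_mem_kho_revive() reads it back with the read-only API. A userspace sketch of the same round trip, assuming libfdt is installed (build with -lfdt); the node name and property values below are made up, while the property layout mirrors the hunk.

/* cc fdt_demo.c -lfdt */
#include <libfdt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SIZE 4096

int main(void)
{
        void *fdt = malloc(BUF_SIZE);
        uint64_t start = 0x80000000ULL, size = 0x200000ULL;
        int err = 0;

        if (!fdt)
                return 1;

        /* Sequential-write phase, as in prepare_kho_fdt(). */
        err |= fdt_create(fdt, BUF_SIZE);
        err |= fdt_finish_reservemap(fdt);
        err |= fdt_begin_node(fdt, "");
        err |= fdt_property_string(fdt, "compatible", "memblock-v1");
        err |= fdt_begin_node(fdt, "trace");    /* one reserve_mem entry */
        err |= fdt_property_string(fdt, "compatible", "reserve-mem-v1");
        err |= fdt_property(fdt, "start", &start, sizeof(start));
        err |= fdt_property(fdt, "size", &size, sizeof(size));
        err |= fdt_end_node(fdt);
        err |= fdt_end_node(fdt);
        err |= fdt_finish(fdt);
        if (err) {
                fprintf(stderr, "building FDT failed: %d\n", err);
                return 1;
        }

        /* Read-back phase, as in reserve_mem_kho_revive(). */
        int offset = fdt_subnode_offset(fdt, 0, "trace");
        if (offset < 0) {
                fprintf(stderr, "node lookup failed: %d\n", offset);
                return 1;
        }

        int len;
        const uint64_t *p_start = fdt_getprop(fdt, offset, "start", &len);
        if (!p_start || len != (int)sizeof(*p_start)) {
                fprintf(stderr, "property lookup failed\n");
                return 1;
        }
        printf("revived 'trace': start=0x%llx\n",
               (unsigned long long)*p_start);
        free(fdt);
        return 0;
}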
+ const void *fdt; + + fdt = reserve_mem_kho_retrieve_fdt(); + if (!fdt) + return false; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) { + pr_warn("FDT '%s' has no child '%s': %d\n", + MEMBLOCK_KHO_FDT, name, offset); + return false; + } + err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("Node '%s' is incompatible with '%s': %d\n", + name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err); + return false; + } + + p_start = fdt_getprop(fdt, offset, "start", &len_start); + p_size = fdt_getprop(fdt, offset, "size", &len_size); + if (!p_start || len_start != sizeof(*p_start) || !p_size || + len_size != sizeof(*p_size)) { + return false; + } + + if (*p_start & (align - 1)) { + pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n", + name, (long)align, (long)*p_start); + return false; + } + + if (*p_size != size) { + pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n", + name, (long)*p_size, (long)size); + return false; + } + + reserved_mem_add(*p_start, size, name); + pr_info("Revived memory reservation '%s' from KHO\n", name); + + return true; +} +#else +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER */ + /* * Parse reserve_mem=nn:align:name */ @@ -2424,6 +2735,11 @@ static int __init reserve_mem(char *p) if (reserve_mem_find_by_name(name, &start, &tmp)) return -EBUSY; + /* Pick previous allocations up from KHO if available */ + if (reserve_mem_kho_revive(name, size, align)) + return 1; + + /* TODO: Allocation must be outside of scratch region */ start = memblock_phys_alloc(size, align); if (!start) return -ENOMEM; @@ -2441,6 +2757,8 @@ static const char * const flagname[] = { [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", + [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN", + [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH", }; static int memblock_debug_show(struct seq_file *m, void *private) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 4a9cf27a70af..4b94731305b9 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -512,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) { /* pagein of a big page is an event. 
So, ignore page size */ if (nr_pages > 0) - __count_memcg_events(memcg, PGPGIN, 1); + count_memcg_events(memcg, PGPGIN, 1); else { - __count_memcg_events(memcg, PGPGOUT, 1); + count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } @@ -689,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long flags; local_irq_save(flags); - __count_memcg_events(memcg, PGPGOUT, pgpgout); + count_memcg_events(memcg, PGPGOUT, pgpgout); __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); memcg1_check_events(memcg, nid); local_irq_restore(flags); @@ -2198,8 +2198,7 @@ bool memcg1_alloc_events(struct mem_cgroup *memcg) void memcg1_free_events(struct mem_cgroup *memcg) { - if (memcg->events_percpu) - free_percpu(memcg->events_percpu); + free_percpu(memcg->events_percpu); } static int __init memcg1_init(void) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c96c1f2b9cf5..b90aa3075950 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -29,6 +29,7 @@ #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> +#include <linux/cpuset.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> #include <linux/hugetlb.h> @@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init; /* BPF memory accounting disabled? */ static bool cgroup_memory_nobpf __ro_after_init; +static struct kmem_cache *memcg_cachep; +static struct kmem_cache *memcg_pn_cachep; + #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -129,8 +133,7 @@ bool mem_cgroup_kmem_disabled(void) return cgroup_memory_nokmem; } -static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, - unsigned int nr_pages); +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -163,8 +166,16 @@ static void obj_cgroup_release(struct percpu_ref *ref) WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); nr_pages = nr_bytes >> PAGE_SHIFT; - if (nr_pages) - obj_cgroup_uncharge_pages(objcg, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + memcg1_account_kmem(memcg, -nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); + mem_cgroup_put(memcg); + } spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); @@ -492,8 +503,8 @@ struct memcg_vmstats_percpu { unsigned int stats_updates; /* Cached pointers for fast iteration in memcg_rstat_updated() */ - struct memcg_vmstats_percpu *parent; - struct memcg_vmstats *vmstats; + struct memcg_vmstats_percpu __percpu *parent_pcpu; + struct memcg_vmstats *vmstats; /* The above should fit a single cacheline for memcg_rstat_updated() */ @@ -544,60 +555,41 @@ static u64 flush_last_time; #define FLUSH_TIME (2UL*HZ) -/* - * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can - * not rely on this as part of an acquired spinlock_t lock. These functions are - * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion - * is sufficient. 
- */ -static void memcg_stats_lock(void) -{ - preempt_disable_nested(); - VM_WARN_ON_IRQS_ENABLED(); -} - -static void __memcg_stats_lock(void) -{ - preempt_disable_nested(); -} - -static void memcg_stats_unlock(void) -{ - preempt_enable_nested(); -} - - static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { return atomic64_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, + int cpu) { + struct memcg_vmstats_percpu __percpu *statc_pcpu; struct memcg_vmstats_percpu *statc; - int cpu = smp_processor_id(); unsigned int stats_updates; if (!val) return; - cgroup_rstat_updated(memcg->css.cgroup, cpu); - statc = this_cpu_ptr(memcg->vmstats_percpu); - for (; statc; statc = statc->parent) { - stats_updates = READ_ONCE(statc->stats_updates) + abs(val); - WRITE_ONCE(statc->stats_updates, stats_updates); + css_rstat_updated(&memcg->css, cpu); + statc_pcpu = memcg->vmstats_percpu; + for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { + statc = this_cpu_ptr(statc_pcpu); + /* + * If @memcg is already flushable then all its ancestors are + * flushable as well and also there is no need to increase + * stats_updates. + */ + if (memcg_vmstats_needs_flush(statc->vmstats)) + break; + + stats_updates = this_cpu_add_return(statc_pcpu->stats_updates, + abs(val)); if (stats_updates < MEMCG_CHARGE_BATCH) continue; - /* - * If @memcg is already flush-able, increasing stats_updates is - * redundant. Avoid the overhead of the atomic update. - */ - if (!memcg_vmstats_needs_flush(statc->vmstats)) - atomic64_add(stats_updates, - &statc->vmstats->stats_updates); - WRITE_ONCE(statc->stats_updates, 0); + stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); + atomic64_add(stats_updates, &statc->vmstats->stats_updates); } } @@ -614,7 +606,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) if (mem_cgroup_is_root(memcg)) WRITE_ONCE(flush_last_time, jiffies_64); - cgroup_rstat_flush(memcg->css.cgroup); + css_rstat_flush(&memcg->css); } /* @@ -687,15 +679,16 @@ static int memcg_state_val_in_pages(int idx, int val) } /** - * __mod_memcg_state - update cgroup memory statistics + * mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item * @val: delta to add to the counter, can be negative */ -void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, +void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { int i = memcg_stats_index(idx); + int cpu; if (mem_cgroup_disabled()) return; @@ -703,10 +696,14 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - __this_cpu_add(memcg->vmstats_percpu->state[i], val); + cpu = get_cpu(); + + this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_state(memcg, idx, val); + + put_cpu(); } #ifdef CONFIG_MEMCG_V1 @@ -728,13 +725,14 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) } #endif -static void __mod_memcg_lruvec_state(struct lruvec *lruvec, +static void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; 
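The memcg_rstat_updated() rework above walks from the updated memcg to the root via per-cpu parent pointers, accumulating |val| in a per-cpu counter, folding it into the shared atomic counter only once a batch threshold is reached, and bailing out early when an ancestor is already due for a flush. A single-threaded sketch of that accumulate-then-batch propagation, with one pending counter per node standing in for the per-cpu data and both thresholds chosen arbitrarily.

#include <stdio.h>
#include <stdlib.h>

#define BATCH 64                        /* MEMCG_CHARGE_BATCH stand-in */
#define FLUSH_THRESHOLD 1024            /* "needs flush" cut-off */

struct node {
        struct node *parent;
        long pending;   /* per-cpu stats_updates stand-in */
        long shared;    /* atomic stats_updates stand-in */
};

static void stat_updated(struct node *n, long val)
{
        for (; n; n = n->parent) {
                if (n->shared > FLUSH_THRESHOLD)
                        break;          /* ancestor already flushable */

                n->pending += labs(val);
                if (n->pending < BATCH)
                        continue;       /* not worth touching shared state */

                n->shared += n->pending;        /* fold the batch in */
                n->pending = 0;
        }
}

int main(void)
{
        struct node root = { .parent = NULL }, child = { .parent = &root };

        for (int i = 0; i < 200; i++)
                stat_updated(&child, 1);

        printf("child: pending=%ld shared=%ld\n", child.pending, child.shared);
        printf("root:  pending=%ld shared=%ld\n", root.pending, root.shared);
        return 0;
}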
struct mem_cgroup *memcg; int i = memcg_stats_index(idx); + int cpu; if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; @@ -742,35 +740,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; - /* - * The caller from rmap relies on disabled preemption because they never - * update their counter from in-interrupt context. For these two - * counters we check that the update is never performed from an - * interrupt context while other caller need to have disabled interrupt. - */ - __memcg_stats_lock(); - if (IS_ENABLED(CONFIG_DEBUG_VM)) { - switch (idx) { - case NR_ANON_MAPPED: - case NR_FILE_MAPPED: - case NR_ANON_THPS: - WARN_ON_ONCE(!in_task()); - break; - default: - VM_WARN_ON_IRQS_ENABLED(); - } - } + cpu = get_cpu(); /* Update memcg */ - __this_cpu_add(memcg->vmstats_percpu->state[i], val); + this_cpu_add(memcg->vmstats_percpu->state[i], val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); + this_cpu_add(pn->lruvec_stats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_lruvec_state(memcg, idx, val); - memcg_stats_unlock(); + + put_cpu(); } /** @@ -791,7 +773,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update memcg and lruvec */ if (!mem_cgroup_disabled()) - __mod_memcg_lruvec_state(lruvec, idx, val); + mod_memcg_lruvec_state(lruvec, idx, val); } void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, @@ -841,15 +823,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) } /** - * __count_memcg_events - account VM events in a cgroup + * count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup * @idx: the event item * @count: the number of events that occurred */ -void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, +void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { int i = memcg_events_index(idx); + int cpu; if (mem_cgroup_disabled()) return; @@ -857,11 +840,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[i], count); - memcg_rstat_updated(memcg, count); + cpu = get_cpu(); + + this_cpu_add(memcg->vmstats_percpu->events[i], count); + memcg_rstat_updated(memcg, count, cpu); trace_count_memcg_events(memcg, idx, count); - memcg_stats_unlock(); + + put_cpu(); } unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -1168,7 +1153,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { struct mem_cgroup *iter; int ret = 0; - int i = 0; BUG_ON(mem_cgroup_is_root(memcg)); @@ -1178,10 +1162,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); while (!ret && (task = css_task_iter_next(&it))) { - /* Avoid potential softlockup warning */ - if ((++i & 1023) == 0) - cond_resched(); ret = fn(task, arg); + /* Avoid potential softlockup warning */ + cond_resched(); } css_task_iter_end(&it); if (ret) { @@ -1664,7 +1647,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. 
Therefore, check again after holding oom_lock. */ - ret = task_is_dying() || out_of_memory(&oc); + ret = out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -1758,155 +1741,234 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) pr_cont(" are going to be killed due to memory.oom.group set\n"); } +/* + * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their + * nr_pages in a single cacheline. This may change in future. + */ +#define NR_MEMCG_STOCK 7 +#define FLUSHING_CACHED_CHARGE 0 struct memcg_stock_pcp { - local_trylock_t stock_lock; - struct mem_cgroup *cached; /* this never be root cgroup */ - unsigned int nr_pages; + local_trylock_t lock; + uint8_t nr_pages[NR_MEMCG_STOCK]; + struct mem_cgroup *cached[NR_MEMCG_STOCK]; + + struct work_struct work; + unsigned long flags; +}; + +static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { + .lock = INIT_LOCAL_TRYLOCK(lock), +}; +struct obj_stock_pcp { + local_trylock_t lock; + unsigned int nr_bytes; struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; - unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; struct work_struct work; unsigned long flags; -#define FLUSHING_CACHED_CHARGE 0 }; -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { - .stock_lock = INIT_LOCAL_TRYLOCK(stock_lock), + +static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = { + .lock = INIT_LOCAL_TRYLOCK(lock), }; + static DEFINE_MUTEX(percpu_charge_mutex); -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); -static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +static void drain_obj_stock(struct obj_stock_pcp *stock); +static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg); /** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. * @nr_pages: how many pages to charge. - * @gfp_mask: allocation mask. * - * The charges will only happen if @memcg matches the current cpu's memcg - * stock, and at least @nr_pages are available in that stock. Failure to - * service an allocation will refill the stock. + * Consume the cached charge if enough nr_pages are present otherwise return + * failure. Also return failure for charge request larger than + * MEMCG_CHARGE_BATCH or if the local lock is already taken. * * returns true if successful, false otherwise. 
*/ -static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - unsigned int stock_pages; - unsigned long flags; + uint8_t stock_pages; bool ret = false; + int i; - if (nr_pages > MEMCG_CHARGE_BATCH) - return ret; - - if (gfpflags_allow_spinning(gfp_mask)) - local_lock_irqsave(&memcg_stock.stock_lock, flags); - else if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) + if (nr_pages > MEMCG_CHARGE_BATCH || + !local_trylock(&memcg_stock.lock)) return ret; stock = this_cpu_ptr(&memcg_stock); - stock_pages = READ_ONCE(stock->nr_pages); - if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) { - WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages); - ret = true; + + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + if (memcg != READ_ONCE(stock->cached[i])) + continue; + + stock_pages = READ_ONCE(stock->nr_pages[i]); + if (stock_pages >= nr_pages) { + WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages); + ret = true; + } + break; } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock(&memcg_stock.lock); return ret; } +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} + /* * Returns stocks cached in percpu and reset cached information. */ -static void drain_stock(struct memcg_stock_pcp *stock) +static void drain_stock(struct memcg_stock_pcp *stock, int i) { - unsigned int stock_pages = READ_ONCE(stock->nr_pages); - struct mem_cgroup *old = READ_ONCE(stock->cached); + struct mem_cgroup *old = READ_ONCE(stock->cached[i]); + uint8_t stock_pages; if (!old) return; + stock_pages = READ_ONCE(stock->nr_pages[i]); if (stock_pages) { - page_counter_uncharge(&old->memory, stock_pages); - if (do_memsw_account()) - page_counter_uncharge(&old->memsw, stock_pages); - - WRITE_ONCE(stock->nr_pages, 0); + memcg_uncharge(old, stock_pages); + WRITE_ONCE(stock->nr_pages[i], 0); } css_put(&old->css); - WRITE_ONCE(stock->cached, NULL); + WRITE_ONCE(stock->cached[i], NULL); } -static void drain_local_stock(struct work_struct *dummy) +static void drain_stock_fully(struct memcg_stock_pcp *stock) +{ + int i; + + for (i = 0; i < NR_MEMCG_STOCK; ++i) + drain_stock(stock, i); +} + +static void drain_local_memcg_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; - /* - * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. - * drain_stock races is that we always operate on local CPU stock - * here with IRQ disabled - */ - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (WARN_ONCE(!in_task(), "drain in non-task context")) + return; + + local_lock(&memcg_stock.lock); stock = this_cpu_ptr(&memcg_stock); - old = drain_obj_stock(stock); - drain_stock(stock); + drain_stock_fully(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); + local_unlock(&memcg_stock.lock); } -/* - * Cache charges(val) to local per_cpu area. - * This will be consumed by consume_stock() function, later. 
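A minimal userspace model of the slot scan in the new consume_stock() above: find the slot caching this memcg and consume only if it holds enough pre-charged pages; oversized requests and misses simply fail, so the caller falls back to charging the page counters directly. Types and names below are simplified stand-ins, and the local_trylock and per-CPU plumbing are left out:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_SLOTS        7
#define CHARGE_BATCH    64      /* stand-in for MEMCG_CHARGE_BATCH */

struct stock {
        uint8_t nr_pages[NR_SLOTS];
        const void *cached[NR_SLOTS];   /* stand-in for struct mem_cgroup * */
};

/* Consume nr_pages from the slot caching @memcg, if present and sufficient. */
static bool consume(struct stock *s, const void *memcg, unsigned int nr_pages)
{
        if (nr_pages > CHARGE_BATCH)
                return false;
        for (int i = 0; i < NR_SLOTS; i++) {
                if (s->cached[i] != memcg)
                        continue;
                if (s->nr_pages[i] >= nr_pages) {
                        s->nr_pages[i] -= nr_pages;
                        return true;
                }
                return false;   /* right slot, but not enough cached */
        }
        return false;           /* this memcg is not cached on this CPU */
}

int main(void)
{
        struct stock s = { .nr_pages = { 5 }, .cached = { "memcg-A" } };
        const void *a = s.cached[0];

        printf("consume 3: %d\n", consume(&s, a, 3));   /* 1, slot drops to 2 */
        printf("consume 3: %d\n", consume(&s, a, 3));   /* 0, only 2 left     */
        return 0;
}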
- */ -static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void drain_local_obj_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock; - unsigned int stock_pages; + struct obj_stock_pcp *stock; - stock = this_cpu_ptr(&memcg_stock); - if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ - drain_stock(stock); - css_get(&memcg->css); - WRITE_ONCE(stock->cached, memcg); - } - stock_pages = READ_ONCE(stock->nr_pages) + nr_pages; - WRITE_ONCE(stock->nr_pages, stock_pages); + if (WARN_ONCE(!in_task(), "drain in non-task context")) + return; + + local_lock(&obj_stock.lock); - if (stock_pages > MEMCG_CHARGE_BATCH) - drain_stock(stock); + stock = this_cpu_ptr(&obj_stock); + drain_obj_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + + local_unlock(&obj_stock.lock); } static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { - unsigned long flags; + struct memcg_stock_pcp *stock; + struct mem_cgroup *cached; + uint8_t stock_pages; + bool success = false; + int empty_slot = -1; + int i; - if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + /* + * For now limit MEMCG_CHARGE_BATCH to 127 and less. In future if we + * decide to increase it more than 127 then we will need more careful + * handling of nr_pages[] in struct memcg_stock_pcp. + */ + BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX); + + VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); + + if (nr_pages > MEMCG_CHARGE_BATCH || + !local_trylock(&memcg_stock.lock)) { /* - * In case of unlikely failure to lock percpu stock_lock - * uncharge memcg directly. + * In case of larger than batch refill or unlikely failure to + * lock the percpu memcg_stock.lock, uncharge memcg directly. */ - if (mem_cgroup_is_root(memcg)) - return; - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); + memcg_uncharge(memcg, nr_pages); return; } - __refill_stock(memcg, nr_pages); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + cached = READ_ONCE(stock->cached[i]); + if (!cached && empty_slot == -1) + empty_slot = i; + if (memcg == READ_ONCE(stock->cached[i])) { + stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages; + WRITE_ONCE(stock->nr_pages[i], stock_pages); + if (stock_pages > MEMCG_CHARGE_BATCH) + drain_stock(stock, i); + success = true; + break; + } + } + + if (!success) { + i = empty_slot; + if (i == -1) { + i = get_random_u32_below(NR_MEMCG_STOCK); + drain_stock(stock, i); + } + css_get(&memcg->css); + WRITE_ONCE(stock->cached[i], memcg); + WRITE_ONCE(stock->nr_pages[i], nr_pages); + } + + local_unlock(&memcg_stock.lock); +} + +static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) +{ + struct mem_cgroup *memcg; + bool flush = false; + int i; + + rcu_read_lock(); + for (i = 0; i < NR_MEMCG_STOCK; ++i) { + memcg = READ_ONCE(stock->cached[i]); + if (!memcg) + continue; + + if (READ_ONCE(stock->nr_pages[i]) && + mem_cgroup_is_descendant(memcg, root_memcg)) { + flush = true; + break; + } + } + rcu_read_unlock(); + return flush; } /* @@ -1929,25 +1991,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg) migrate_disable(); curcpu = smp_processor_id(); for_each_online_cpu(cpu) { - struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - struct mem_cgroup *memcg; - bool flush = false; + struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu); + struct 
obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu); - rcu_read_lock(); - memcg = READ_ONCE(stock->cached); - if (memcg && READ_ONCE(stock->nr_pages) && - mem_cgroup_is_descendant(memcg, root_memcg)) - flush = true; - else if (obj_stock_flush_required(stock, root_memcg)) - flush = true; - rcu_read_unlock(); + if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) && + is_memcg_drain_needed(memcg_st, root_memcg) && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, + &memcg_st->flags)) { + if (cpu == curcpu) + drain_local_memcg_stock(&memcg_st->work); + else if (!cpu_is_isolated(cpu)) + schedule_work_on(cpu, &memcg_st->work); + } - if (flush && - !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) && + obj_stock_flush_required(obj_st, root_memcg) && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, + &obj_st->flags)) { if (cpu == curcpu) - drain_local_stock(&stock->work); + drain_local_obj_stock(&obj_st->work); else if (!cpu_is_isolated(cpu)) - schedule_work_on(cpu, &stock->work); + schedule_work_on(cpu, &obj_st->work); } } migrate_enable(); @@ -1956,19 +2020,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old; - unsigned long flags; - - stock = &per_cpu(memcg_stock, cpu); - - /* drain_obj_stock requires stock_lock */ - local_lock_irqsave(&memcg_stock.stock_lock, flags); - old = drain_obj_stock(stock); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - - drain_stock(stock); - obj_cgroup_put(old); + /* no need for the local lock */ + drain_obj_stock(&per_cpu(obj_stock, cpu)); + drain_stock_fully(&per_cpu(memcg_stock, cpu)); return 0; } @@ -2258,7 +2312,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long pflags; retry: - if (consume_stock(memcg, nr_pages, gfp_mask)) + if (consume_stock(memcg, nr_pages)) return 0; if (!gfpflags_allow_spinning(gfp_mask)) @@ -2459,7 +2513,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { @@ -2469,7 +2523,7 @@ static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_memcg_lruvec_state(lruvec, idx, nr); + mod_memcg_lruvec_state(lruvec, idx, nr); rcu_read_unlock(); } @@ -2764,50 +2818,27 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } -/* Replace the stock objcg with objcg, return the old objcg */ -static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock, - struct obj_cgroup *objcg) -{ - struct obj_cgroup *old = NULL; - - old = drain_obj_stock(stock); - obj_cgroup_get(objcg); - stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) - ? 
atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - WRITE_ONCE(stock->cached_objcg, objcg); - return old; -} - -static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static void __account_obj_stock(struct obj_cgroup *objcg, + struct obj_stock_pcp *stock, int nr, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; int *bytes; - local_lock_irqsave(&memcg_stock.stock_lock, flags); - stock = this_cpu_ptr(&memcg_stock); - /* * Save vmstat data in stock and skip vmstat array update unless - * accumulating over a page of vmstat data or when pgdat or idx - * changes. + * accumulating over a page of vmstat data or when pgdat changes. */ - if (READ_ONCE(stock->cached_objcg) != objcg) { - old = replace_stock_objcg(stock, objcg); - stock->cached_pgdat = pgdat; - } else if (stock->cached_pgdat != pgdat) { + if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ struct pglist_data *oldpg = stock->cached_pgdat; if (stock->nr_slab_reclaimable_b) { - __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } @@ -2833,37 +2864,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, } } if (nr) - __mod_objcg_mlstate(objcg, pgdat, idx, nr); - - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); + mod_objcg_mlstate(objcg, pgdat, idx, nr); } -static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - unsigned long flags; + struct obj_stock_pcp *stock; bool ret = false; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!local_trylock(&obj_stock.lock)) + return ret; - stock = this_cpu_ptr(&memcg_stock); + stock = this_cpu_ptr(&obj_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; + + if (pgdat) + __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + local_unlock(&obj_stock.lock); return ret; } -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) +static void drain_obj_stock(struct obj_stock_pcp *stock) { struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) - return NULL; + return; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; @@ -2876,7 +2908,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); - __refill_stock(memcg, nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); css_put(&memcg->css); } @@ -2900,13 +2933,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) */ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { if (stock->nr_slab_reclaimable_b) { - __mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, stock->cached_pgdat, 
NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - __mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; @@ -2915,63 +2948,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) } WRITE_ONCE(stock->cached_objcg, NULL); - /* - * The `old' objects needs to be released by the caller via - * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. - */ - return old; + obj_cgroup_put(old); } -static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +static bool obj_stock_flush_required(struct obj_stock_pcp *stock, struct mem_cgroup *root_memcg) { struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); struct mem_cgroup *memcg; + bool flush = false; + rcu_read_lock(); if (objcg) { memcg = obj_cgroup_memcg(objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) - return true; + flush = true; } + rcu_read_unlock(); - return false; + return flush; } static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - bool allow_uncharge) + bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, + enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; - unsigned long flags; + struct obj_stock_pcp *stock; unsigned int nr_pages = 0; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!local_trylock(&obj_stock.lock)) { + if (pgdat) + mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes); + nr_pages = nr_bytes >> PAGE_SHIFT; + nr_bytes = nr_bytes & (PAGE_SIZE - 1); + atomic_add(nr_bytes, &objcg->nr_charged_bytes); + goto out; + } - stock = this_cpu_ptr(&memcg_stock); + stock = this_cpu_ptr(&obj_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - old = replace_stock_objcg(stock, objcg); + drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + WRITE_ONCE(stock->cached_objcg, objcg); + allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; + if (pgdat) + __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); + if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { nr_pages = stock->nr_bytes >> PAGE_SHIFT; stock->nr_bytes &= (PAGE_SIZE - 1); } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); - + local_unlock(&obj_stock.lock); +out: if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); } -int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, + struct pglist_data *pgdat, enum node_stat_item idx) { unsigned int nr_pages, nr_bytes; int ret; - if (consume_obj_stock(objcg, size)) + if (likely(consume_obj_stock(objcg, size, pgdat, idx))) return 0; /* @@ -3004,15 +3050,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) nr_pages += 1; ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); - if (!ret && nr_bytes) - refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); + if (!ret && (nr_bytes || pgdat)) + refill_obj_stock(objcg, nr_bytes ? 
PAGE_SIZE - nr_bytes : 0, + false, size, pgdat, idx); return ret; } +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +{ + return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); +} + void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size, true); + refill_obj_stock(objcg, size, true, 0, NULL, 0); } static inline size_t obj_full_size(struct kmem_cache *s) @@ -3064,23 +3116,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, return false; } - if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) - return false; - for (i = 0; i < size; i++) { slab = virt_to_slab(p[i]); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } + /* + * if we fail and size is 1, memcg_alloc_abort_single() will + * just free the object, which is ok as we have not assigned + * objcg to its obj_ext yet + * + * for larger sizes, kmem_cache_free_bulk() will uncharge + * any objects that were already charged and obj_ext assigned + * + * TODO: we could batch this until slab_pgdat(slab) changes + * between iterations, with a more complicated undo + */ + if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s), + slab_pgdat(slab), cache_vmstat_idx(s))) + return false; + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); slab_obj_exts(slab)[off].objcg = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); } return true; @@ -3089,6 +3150,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts) { + size_t obj_size = obj_full_size(s); + for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; @@ -3099,9 +3162,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, continue; obj_exts[off].objcg = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); + refill_obj_stock(objcg, obj_size, true, -obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); obj_cgroup_put(objcg); } } @@ -3543,7 +3605,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); + pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO, + node); if (!pn) return false; @@ -3590,13 +3653,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { - struct memcg_vmstats_percpu *statc, *pstatc; + struct memcg_vmstats_percpu *statc; + struct memcg_vmstats_percpu __percpu *pstatc_pcpu; struct mem_cgroup *memcg; int node, cpu; int __maybe_unused i; long error; - memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); + memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL); if (!memcg) return ERR_PTR(-ENOMEM); @@ -3621,9 +3685,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) for_each_possible_cpu(cpu) { if (parent) - pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); + pstatc_pcpu = parent->vmstats_percpu; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - statc->parent = parent ? pstatc : NULL; + statc->parent_pcpu = parent ? 
pstatc_pcpu : NULL; statc->vmstats = memcg->vmstats; } @@ -4196,6 +4260,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, page_counter_set_high(&memcg->memory, high); + if (of->file->f_flags & O_NONBLOCK) + goto out; + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); unsigned long reclaimed; @@ -4218,7 +4285,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (!reclaimed && !nr_retries--) break; } - +out: memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -4245,6 +4312,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, xchg(&memcg->memory.max, max); + if (of->file->f_flags & O_NONBLOCK) + goto out; + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -4272,7 +4342,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, break; cond_resched(); } - +out: memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -4395,11 +4465,13 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, enum { MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_SWAPPINESS_MAX, MEMORY_RECLAIM_NULL, }; static const match_table_t tokens = { { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, { MEMORY_RECLAIM_NULL, NULL }, }; @@ -4433,6 +4505,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) return -EINVAL; break; + case MEMORY_RECLAIM_SWAPPINESS_MAX: + swappiness = SWAPPINESS_ANON_ONLY; + break; default: return -EINVAL; } @@ -4697,9 +4772,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { if (ug->nr_memory) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); - if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); + memcg_uncharge(ug->memcg, ug->nr_memory); if (ug->nr_kmem) { mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); memcg1_account_kmem(ug->memcg, -ug->nr_kmem); @@ -4975,15 +5048,16 @@ static int __init cgroup_memory(char *s) __setup("cgroup.memory=", cgroup_memory); /* - * subsys_initcall() for memory controller. + * Memory controller init before cgroup_init() initialize root_mem_cgroup. * * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this * context because of lock dependencies (cgroup_lock -> cpu hotplug) but * basically everything that doesn't depend on a specific mem_cgroup structure * should be initialized from here. 
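Two of the interface changes in the hunks above are easiest to see from userspace: writes to memory.max (and memory.high) skip the synchronous reclaim loop when the file was opened with O_NONBLOCK, and memory.reclaim accepts swappiness=max to reclaim anonymous memory only. A hedged usage sketch; the cgroup path /sys/fs/cgroup/test is hypothetical:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_file(const char *path, const char *buf, int flags)
{
        int fd = open(path, O_WRONLY | flags);
        ssize_t ret;

        if (fd < 0)
                return -1;
        ret = write(fd, buf, strlen(buf));
        close(fd);
        return ret < 0 ? -1 : 0;
}

int main(void)
{
        /* Lower the limit without waiting for reclaim to bring usage under it. */
        if (write_file("/sys/fs/cgroup/test/memory.max", "104857600\n",
                       O_NONBLOCK))
                perror("memory.max");

        /* Proactive reclaim that targets anonymous memory only. */
        if (write_file("/sys/fs/cgroup/test/memory.reclaim",
                       "16M swappiness=max\n", 0))
                perror("memory.reclaim");
        return 0;
}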
*/ -static int __init mem_cgroup_init(void) +int __init mem_cgroup_init(void) { + unsigned int memcg_size; int cpu; /* @@ -4997,13 +5071,22 @@ static int __init mem_cgroup_init(void) cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, - drain_local_stock); + drain_local_memcg_stock); + INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work, + drain_local_obj_stock); + } + + memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids); + memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0, + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); + + memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node, + SLAB_PANIC | SLAB_HWCACHE_ALIGN); return 0; } -subsys_initcall(mem_cgroup_init); #ifdef CONFIG_SWAP /** @@ -5457,3 +5540,8 @@ static int __init mem_cgroup_swap_init(void) subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_SWAP */ + +bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) +{ + return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; +} diff --git a/mm/memfd.c b/mm/memfd.c index c64df1343059..ab367e61553d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -20,6 +20,7 @@ #include <linux/memfd.h> #include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> +#include "swap.h" /* * We need a tag: a new tag would expand every xa_node by 8 bytes, diff --git a/mm/memory.c b/mm/memory.c index ba3ea0a82f7f..5cb48f262ab0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -278,8 +278,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. +/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -518,10 +527,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", + pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, mapping ? 
mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -929,7 +939,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); + pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -1361,7 +1371,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; - unsigned long next, pfn = 0; + unsigned long next; bool is_cow; int ret; @@ -1371,12 +1381,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); - if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - ret = track_pfn_copy(dst_vma, src_vma, &pfn); - if (ret) - return ret; - } - /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the @@ -1418,8 +1422,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } - if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) - untrack_pfn_copy(dst_vma, pfn); return ret; } @@ -1799,7 +1801,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; @@ -1914,9 +1916,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0, mm_wr_locked); - if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* @@ -1990,35 +1989,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, } /** - * zap_page_range_single - remove user pages in a given range + * zap_page_range_single_batched - remove user pages in a given range + * @tlb: pointer to the caller's struct mmu_gather * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap + * @address: starting address of pages to remove + * @size: number of bytes to remove * @details: details of shared cache invalidation * - * The range must fit into one VMA. + * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for + * hugetlb, @tlb is flushed and re-initialized by this function. 
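The batched variant documented above exists so a caller can supply its own mmu_gather and let several per-VMA zaps share a single flush at tlb_finish_mmu() time, instead of paying for a gather/finish cycle per call. A toy userspace model of that batching idea only (the real mmu_gather also batches page freeing and is far more involved):

#include <stdio.h>

/* Toy stand-in for struct mmu_gather: tracks one deferred flush range. */
struct gather {
        unsigned long start, end;
        int pending;
};

static void gather_begin(struct gather *g)
{
        g->pending = 0;
}

/* Model of one zap_page_range_single_batched() call. */
static void zap_range(struct gather *g, unsigned long start, unsigned long end)
{
        if (!g->pending) {
                g->start = start;
                g->end = end;
                g->pending = 1;
                return;
        }
        if (start < g->start)
                g->start = start;
        if (end > g->end)
                g->end = end;
}

/* Model of tlb_finish_mmu(): one flush covering everything zapped so far. */
static void gather_finish(struct gather *g)
{
        if (g->pending)
                printf("flush [%#lx, %#lx)\n", g->start, g->end);
}

int main(void)
{
        struct gather g;

        gather_begin(&g);
        zap_range(&g, 0x1000, 0x3000);
        zap_range(&g, 0x8000, 0x9000);
        gather_finish(&g);      /* one flush instead of two */
        return 0;
}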
*/ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; - struct mmu_gather tlb; + + VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); - tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); + if (is_vm_hugetlb_page(vma)) { + /* + * flush tlb and free resources before hugetlb_zap_end(), to + * avoid concurrent page faults' allocation failure. + */ + tlb_finish_mmu(tlb); + hugetlb_zap_end(vma, details); + tlb_gather_mmu(tlb, vma->vm_mm); + } +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA. + */ +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, address, size, details); tlb_finish_mmu(&tlb); - hugetlb_zap_end(vma, details); } /** @@ -2525,7 +2553,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); @@ -2588,7 +2616,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) return VM_FAULT_SIGBUS; @@ -2833,6 +2861,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, return error; } +#ifdef __HAVE_PFNMAP_TRACKING +static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn, + unsigned long size, pgprot_t *prot) +{ + struct pfnmap_track_ctx *ctx; + + if (pfnmap_track(pfn, size, prot)) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (unlikely(!ctx)) { + pfnmap_untrack(pfn, size); + return ERR_PTR(-ENOMEM); + } + + ctx->pfn = pfn; + ctx->size = size; + kref_init(&ctx->kref); + return ctx; +} + +void pfnmap_track_ctx_release(struct kref *ref) +{ + struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref); + + pfnmap_untrack(ctx->pfn, ctx->size); + kfree(ctx); +} +#endif /* __HAVE_PFNMAP_TRACKING */ + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to @@ -2845,20 +2903,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, * * Return: %0 on success, negative error code otherwise. 
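pfnmap_track_ctx_alloc() above registers the pfn range once and wraps it in a kref-counted context; pfnmap_track_ctx_release() untracks and frees it when the last VMA reference goes away, which is how tracking survives VMA splits and partial unmaps. A simplified userspace model of that lifetime (a plain int refcount instead of struct kref, no locking):

#include <stdio.h>
#include <stdlib.h>

struct track_ctx {
        unsigned long pfn, size;
        int refs;                       /* stand-in for struct kref */
};

static struct track_ctx *ctx_alloc(unsigned long pfn, unsigned long size)
{
        struct track_ctx *ctx = malloc(sizeof(*ctx));

        if (!ctx)
                return NULL;
        printf("track   pfn=%#lx size=%#lx\n", pfn, size);  /* pfnmap_track() */
        ctx->pfn = pfn;
        ctx->size = size;
        ctx->refs = 1;
        return ctx;
}

static void ctx_get(struct track_ctx *ctx)      /* e.g. the VMA was split */
{
        ctx->refs++;
}

static void ctx_put(struct track_ctx *ctx)      /* e.g. one VMA torn down */
{
        if (--ctx->refs)
                return;
        printf("untrack pfn=%#lx size=%#lx\n", ctx->pfn, ctx->size);
        free(ctx);
}

int main(void)
{
        struct track_ctx *ctx = ctx_alloc(0x100000, 0x200000);

        ctx_get(ctx);   /* mapping split into two VMAs                  */
        ctx_put(ctx);   /* first half unmapped: range stays tracked     */
        ctx_put(ctx);   /* second half unmapped: untrack and free       */
        return 0;
}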
*/ +#ifdef __HAVE_PFNMAP_TRACKING int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { + struct pfnmap_track_ctx *ctx = NULL; int err; - err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); - if (err) + size = PAGE_ALIGN(size); + + /* + * If we cover the full VMA, we'll perform actual tracking, and + * remember to untrack when the last reference to our tracking + * context from a VMA goes away. We'll keep tracking the whole pfn + * range even during VMA splits and partial unmapping. + * + * If we only cover parts of the VMA, we'll only setup the cachemode + * in the pgprot for the pfn range. + */ + if (addr == vma->vm_start && addr + size == vma->vm_end) { + if (vma->pfnmap_track_ctx) + return -EINVAL; + ctx = pfnmap_track_ctx_alloc(pfn, size, &prot); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (pfnmap_setup_cachemode(pfn, size, &prot)) { return -EINVAL; + } err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); - if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); + if (ctx) { + if (err) + kref_put(&ctx->kref, pfnmap_track_ctx_release); + else + vma->pfnmap_track_ctx = ctx; + } return err; } + +#else +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_notrack(vma, addr, pfn, size, prot); +} +#endif EXPORT_SYMBOL(remap_pfn_range); /** @@ -3523,7 +3612,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(&new_folio->page, vma->vm_page_prot); + entry = folio_mk_pte(new_folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3730,7 +3819,7 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, * If all folio references are from mappings, and all mappings are in * the page tables of this MM, then this folio is exclusive to this MM. */ - if (folio_test_large_maybe_mapped_shared(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) return false; VM_WARN_ON_ONCE(folio_test_ksm(folio)); @@ -3751,9 +3840,9 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, /* Stabilize the mapcount vs. refcount and recheck. 
*/ folio_lock_large_mapcount(folio); - VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio)); + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio); - if (folio_test_large_maybe_mapped_shared(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) goto unlock; if (folio_large_mapcount(folio) != folio_ref_count(folio)) goto unlock; @@ -5013,7 +5102,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = mk_pte(&folio->page, vma->vm_page_prot); + entry = folio_mk_pte(folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); @@ -5138,9 +5227,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { - struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -5188,7 +5276,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) flush_icache_pages(vma, page, HPAGE_PMD_NR); - entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -5213,7 +5301,7 @@ out: return ret; } #else -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { return VM_FAULT_FALLBACK; } @@ -5245,6 +5333,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_write(entry) && folio_test_dirty(folio)) + entry = pte_mkdirty(entry); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ @@ -5305,6 +5395,7 @@ fallback: else page = vmf->page; + folio = page_folio(page); /* * check even for read faults because we might have lost our CoWed * page @@ -5316,8 +5407,8 @@ fallback: } if (pmd_none(*vmf->pmd)) { - if (PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); + if (folio_test_pmd_mappable(folio)) { + ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; } @@ -5328,7 +5419,6 @@ fallback: return VM_FAULT_OOM; } - folio = page_folio(page); nr_pages = folio_nr_pages(folio); /* @@ -5892,7 +5982,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) split: /* COW or write-notify handled on pte level: split pmd. */ - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -6338,258 +6428,6 @@ out: } EXPORT_SYMBOL_GPL(handle_mm_fault); -#ifdef CONFIG_LOCK_MM_AND_FIND_VMA -#include <linux/extable.h> - -static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - if (likely(mmap_read_trylock(mm))) - return true; - - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - - return !mmap_read_lock_killable(mm); -} - -static inline bool mmap_upgrade_trylock(struct mm_struct *mm) -{ - /* - * We don't have this operation yet. 
- * - * It should be easy enough to do: it's basically a - * atomic_long_try_cmpxchg_acquire() - * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but - * it also needs the proper lockdep magic etc. - */ - return false; -} - -static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - mmap_read_unlock(mm); - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - return !mmap_write_lock_killable(mm); -} - -/* - * Helper for page fault handling. - * - * This is kind of equivalent to "mmap_read_lock()" followed - * by "find_extend_vma()", except it's a lot more careful about - * the locking (and will drop the lock on failure). - * - * For example, if we have a kernel bug that causes a page - * fault, we don't want to just use mmap_read_lock() to get - * the mm lock, because that would deadlock if the bug were - * to happen while we're holding the mm lock for writing. - * - * So this checks the exception tables on kernel faults in - * order to only do this all for instructions that are actually - * expected to fault. - * - * We can also actually take the mm lock for writing if we - * need to extend the vma, which helps the VM layer a lot. - */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - if (!get_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (likely(vma && (vma->vm_start <= addr))) - return vma; - - /* - * Well, dang. We might still be successful, but only - * if we can extend a vma to do so. - */ - if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { - mmap_read_unlock(mm); - return NULL; - } - - /* - * We can try to upgrade the mmap lock atomically, - * in which case we can continue to use the vma - * we already looked up. - * - * Otherwise we'll have to drop the mmap lock and - * re-take it, and also look up the vma again, - * re-checking it. - */ - if (!mmap_upgrade_trylock(mm)) { - if (!upgrade_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (!vma) - goto fail; - if (vma->vm_start <= addr) - goto success; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto fail; - } - - if (expand_stack_locked(vma, addr)) - goto fail; - -success: - mmap_write_downgrade(mm); - return vma; - -fail: - mmap_write_unlock(mm); - return NULL; -} -#endif - -#ifdef CONFIG_PER_VMA_LOCK -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) -{ - unsigned int tgt_refcnt = VMA_LOCK_OFFSET; - - /* Additional refcnt if the vma is attached. */ - if (!detaching) - tgt_refcnt++; - - /* - * If vma is detached then only vma_mark_attached() can raise the - * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). 
- */ - if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) - return false; - - rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); - rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, - refcount_read(&vma->vm_refcnt) == tgt_refcnt, - TASK_UNINTERRUPTIBLE); - lock_acquired(&vma->vmlock_dep_map, _RET_IP_); - - return true; -} - -static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) -{ - *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); -} - -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) -{ - bool locked; - - /* - * __vma_enter_locked() returns false immediately if the vma is not - * attached, otherwise it waits until refcnt is indicating that vma - * is attached with no readers. - */ - locked = __vma_enter_locked(vma, false); - - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - - if (locked) { - bool detached; - - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(detached); /* vma should remain attached */ - } -} -EXPORT_SYMBOL_GPL(__vma_start_write); - -void vma_mark_detached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_attached(vma); - - /* - * We are the only writer, so no need to use vma_refcount_put(). - * The condition below is unlikely because the vma has been already - * write-locked and readers can increment vm_refcnt only temporarily - * before they check vm_lock_seq, realize the vma is locked and drop - * back the vm_refcnt. That is a narrow window for observing a raised - * vm_refcnt. - */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { - /* Wait until vma is detached with no readers. */ - if (__vma_enter_locked(vma, true)) { - bool detached; - - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(!detached); - } - } -} - -/* - * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be - * stable and not isolated. If the VMA is not found or is being modified the - * function returns NULL. - */ -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address) -{ - MA_STATE(mas, &mm->mm_mt, address, address); - struct vm_area_struct *vma; - - rcu_read_lock(); -retry: - vma = mas_walk(&mas); - if (!vma) - goto inval; - - vma = vma_start_read(mm, vma); - if (IS_ERR_OR_NULL(vma)) { - /* Check if the VMA got isolated after we found it */ - if (PTR_ERR(vma) == -EAGAIN) { - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; - } - - /* Failed to lock the VMA */ - goto inval; - } - /* - * At this point, we have a stable reference to a VMA: The VMA is - * locked and we know it hasn't already been isolated. - * From here on, we can access the VMA without worrying about which - * fields are accessible for RCU readers. - */ - - /* Check if the vma we locked is the right one. 
*/ - if (unlikely(vma->vm_mm != mm || - address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; - - rcu_read_unlock(); - return vma; - -inval_end_read: - vma_end_read(vma); -inval: - rcu_read_unlock(); - count_vm_vma_lock_event(VMA_LOCK_ABORT); - return NULL; -} -#endif /* CONFIG_PER_VMA_LOCK */ - #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8305483de38b..b1caedbade5b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1756,12 +1756,10 @@ static int scan_movable_pages(unsigned long start, unsigned long end, { unsigned long pfn; - for (pfn = start; pfn < end; pfn++) { + for_each_valid_pfn(pfn, start, end) { struct page *page; struct folio *folio; - if (!pfn_valid(pfn)) - continue; page = pfn_to_page(pfn); if (PageLRU(page)) goto found; @@ -1805,11 +1803,9 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for_each_valid_pfn(pfn, start_pfn, end_pfn) { struct page *page; - if (!pfn_valid(pfn)) - continue; page = pfn_to_page(pfn); folio = page_folio(page); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b28a1e6ae096..72fd72e156b1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -109,10 +109,12 @@ #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> +#include <linux/gcd.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> +#include <linux/memory.h> #include "internal.h" @@ -139,31 +141,138 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /* - * iw_table is the sysfs-set interleave weight table, a value of 0 denotes - * system-default value should be used. A NULL iw_table also denotes that - * system-default values should be used. Until the system-default table - * is implemented, the system-default is always 1. - * - * iw_table is RCU protected + * weightiness balances the tradeoff between small weights (cycles through nodes + * faster, more fair/even distribution) and large weights (smaller errors + * between actual bandwidth ratios and weight ratios). 32 is a number that has + * been found to perform at a reasonable compromise between the two goals. + */ +static const int weightiness = 32; + +/* + * A null weighted_interleave_state is interpreted as having .mode="auto", + * and .iw_table is interpreted as an array of 1s with length nr_node_ids. */ -static u8 __rcu *iw_table; -static DEFINE_MUTEX(iw_table_lock); +struct weighted_interleave_state { + bool mode_auto; + u8 iw_table[]; +}; +static struct weighted_interleave_state __rcu *wi_state; +static unsigned int *node_bw_table; + +/* + * wi_state_lock protects both wi_state and node_bw_table. + * node_bw_table is only used by writers to update wi_state. + */ +static DEFINE_MUTEX(wi_state_lock); static u8 get_il_weight(int node) { - u8 *table; - u8 weight; + struct weighted_interleave_state *state; + u8 weight = 1; rcu_read_lock(); - table = rcu_dereference(iw_table); - /* if no iw_table, use system default */ - weight = table ? table[node] : 1; - /* if value in iw_table is 0, use system default */ - weight = weight ? weight : 1; + state = rcu_dereference(wi_state); + if (state) + weight = state->iw_table[node]; rcu_read_unlock(); return weight; } +/* + * Convert bandwidth values into weighted interleave weights. + * Call with wi_state_lock. 
+ */ +static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw) +{ + u64 sum_bw = 0; + unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0; + int nid; + + for_each_node_state(nid, N_MEMORY) + sum_bw += bw[nid]; + + /* Scale bandwidths to whole numbers in the range [1, weightiness] */ + for_each_node_state(nid, N_MEMORY) { + /* + * Try not to perform 64-bit division. + * If sum_bw < scaling_factor, then sum_bw < U32_MAX. + * If sum_bw > scaling_factor, then round the weight up to 1. + */ + scaling_factor = weightiness * bw[nid]; + if (bw[nid] && sum_bw < scaling_factor) { + cast_sum_bw = (unsigned int)sum_bw; + new_iw[nid] = scaling_factor / cast_sum_bw; + } else { + new_iw[nid] = 1; + } + if (!iw_gcd) + iw_gcd = new_iw[nid]; + iw_gcd = gcd(iw_gcd, new_iw[nid]); + } + + /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */ + for_each_node_state(nid, N_MEMORY) + new_iw[nid] /= iw_gcd; +} + +int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords) +{ + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *old_bw, *new_bw; + unsigned int bw_val; + int i; + + bw_val = min(coords->read_bandwidth, coords->write_bandwidth); + new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); + if (!new_bw) + return -ENOMEM; + + new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) { + kfree(new_bw); + return -ENOMEM; + } + new_wi_state->mode_auto = true; + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + + /* + * Update bandwidth info, even in manual mode. That way, when switching + * to auto mode in the future, iw_table can be overwritten using + * accurate bw data. + */ + mutex_lock(&wi_state_lock); + + old_bw = node_bw_table; + if (old_bw) + memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw)); + new_bw[node] = bw_val; + node_bw_table = new_bw; + + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state && !old_wi_state->mode_auto) { + /* Manual mode; skip reducing weights and updating wi_state */ + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + goto out; + } + + /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/ + reduce_interleave_weights(new_bw, new_wi_state->iw_table); + rcu_assign_pointer(wi_state, new_wi_state); + + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } +out: + kfree(old_bw); + return 0; +} + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -566,6 +675,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; @@ -573,6 +683,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; + int max_nr, nr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -586,7 +697,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } - for (; addr != end; pte++, addr += PAGE_SIZE) { + for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { + max_nr = (end - addr) >> PAGE_SHIFT; + nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -598,6 +711,10 @@ 
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; + if (folio_test_large(folio) && max_nr != 1) + nr = folio_pte_batch(folio, addr, pte, ptent, + max_nr, fpb_flags, + NULL, NULL, NULL); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. @@ -630,7 +747,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { - qp->nr_failed++; + qp->nr_failed += nr; if (strictly_unmovable(flags)) break; } @@ -2014,26 +2131,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol, static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) { + struct weighted_interleave_state *state; nodemask_t nodemask; unsigned int target, nr_nodes; - u8 *table; + u8 *table = NULL; unsigned int weight_total = 0; u8 weight; - int nid; + int nid = 0; nr_nodes = read_once_policy_nodemask(pol, &nodemask); if (!nr_nodes) return numa_node_id(); rcu_read_lock(); - table = rcu_dereference(iw_table); + + state = rcu_dereference(wi_state); + /* Uninitialized wi_state means we should assume all weights are 1 */ + if (state) + table = state->iw_table; + /* calculate the total weight */ - for_each_node_mask(nid, nodemask) { - /* detect system default usage */ - weight = table ? table[nid] : 1; - weight = weight ? weight : 1; - weight_total += weight; - } + for_each_node_mask(nid, nodemask) + weight_total += table ? table[nid] : 1; /* Calculate the node offset based on totals */ target = ilx % weight_total; @@ -2041,7 +2160,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) while (target) { /* detect system default usage */ weight = table ? table[nid] : 1; - weight = weight ? weight : 1; if (target < weight) break; target -= weight; @@ -2442,13 +2560,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { + struct weighted_interleave_state *state; struct task_struct *me = current; unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; unsigned long node_pages, delta; - u8 *table, *weights, weight; + u8 *weights, weight; unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; @@ -2498,17 +2617,19 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, return total_allocated; rcu_read_lock(); - table = rcu_dereference(iw_table); - if (table) - memcpy(weights, table, nr_node_ids); - rcu_read_unlock(); + state = rcu_dereference(wi_state); + if (state) { + memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + for (i = 0; i < nr_node_ids; i++) + weights[i] = 1; + } /* calculate total, detect system default usage */ - for_each_node_mask(node, nodes) { - if (!weights[node]) - weights[node] = 1; + for_each_node_mask(node, nodes) weight_total += weights[node]; - } /* * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 
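The weighted-interleave math in the preceding hunks is easy to sanity-check numerically: reduce_interleave_weights() scales per-node bandwidth into [1, weightiness = 32] and divides by the common GCD, and weighted_interleave_nid() then subtracts weights from ilx % weight_total until it lands on a node. A self-contained userspace rendition with made-up bandwidth figures (two nodes, node0 with roughly 3x the bandwidth of node1):

#include <stdint.h>
#include <stdio.h>

#define NR_NODES        2
#define WEIGHTINESS     32

static unsigned int gcd(unsigned int a, unsigned int b)
{
        while (b) {
                unsigned int t = a % b;

                a = b;
                b = t;
        }
        return a;
}

/* Mirror of reduce_interleave_weights() for a dense node array. */
static void reduce_weights(const unsigned int *bw, uint8_t *iw)
{
        uint64_t sum_bw = 0;
        unsigned int iw_gcd = 0;

        for (int n = 0; n < NR_NODES; n++)
                sum_bw += bw[n];
        for (int n = 0; n < NR_NODES; n++) {
                uint64_t scale = (uint64_t)WEIGHTINESS * bw[n];

                iw[n] = (bw[n] && sum_bw < scale) ? scale / sum_bw : 1;
                iw_gcd = iw_gcd ? gcd(iw_gcd, iw[n]) : iw[n];
        }
        for (int n = 0; n < NR_NODES; n++)
                iw[n] /= iw_gcd;
}

/* Mirror of weighted_interleave_nid(): pick a node for interleave index ilx. */
static int pick_node(const uint8_t *iw, unsigned long ilx)
{
        unsigned int total = 0, target;
        int n = 0;

        for (int i = 0; i < NR_NODES; i++)
                total += iw[i];
        target = ilx % total;
        while (target >= iw[n]) {
                target -= iw[n];
                n++;
        }
        return n;
}

int main(void)
{
        /* Hypothetical bandwidths: node0 = 300, node1 = 100 (say DRAM vs CXL). */
        unsigned int bw[NR_NODES] = { 300, 100 };
        uint8_t iw[NR_NODES];

        reduce_weights(bw, iw);
        printf("weights: node0=%d node1=%d\n", iw[0], iw[1]);   /* 3 and 1 */
        for (unsigned long ilx = 0; ilx < 8; ilx++)
                printf("ilx=%lu -> node%d\n", ilx, pick_node(iw, ilx));
        return 0;
}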
@@ -3419,6 +3540,14 @@ struct iw_node_attr { int nid; }; +struct sysfs_wi_group { + struct kobject wi_kobj; + struct mutex kobj_lock; + struct iw_node_attr *nattrs[]; +}; + +static struct sysfs_wi_group *wi_group; + static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3433,177 +3562,318 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; struct iw_node_attr *node_attr; - u8 *new; - u8 *old; u8 weight = 0; + int i; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); - if (count == 0 || sysfs_streq(buf, "")) - weight = 0; - else if (kstrtou8(buf, 0, &weight)) + if (count == 0 || sysfs_streq(buf, "") || + kstrtou8(buf, 0, &weight) || weight == 0) return -EINVAL; - new = kzalloc(nr_node_ids, GFP_KERNEL); - if (!new) + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) return -ENOMEM; - mutex_lock(&iw_table_lock); - old = rcu_dereference_protected(iw_table, - lockdep_is_held(&iw_table_lock)); - if (old) - memcpy(new, old, nr_node_ids); - new[node_attr->nid] = weight; - rcu_assign_pointer(iw_table, new); - mutex_unlock(&iw_table_lock); - synchronize_rcu(); - kfree(old); + mutex_lock(&wi_state_lock); + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state) { + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + } else { + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + } + new_wi_state->iw_table[node_attr->nid] = weight; + new_wi_state->mode_auto = false; + + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } return count; } -static struct iw_node_attr **node_attrs; - -static void sysfs_wi_node_release(struct iw_node_attr *node_attr, - struct kobject *parent) +static ssize_t weighted_interleave_auto_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - if (!node_attr) - return; - sysfs_remove_file(parent, &node_attr->kobj_attr.attr); - kfree(node_attr->kobj_attr.attr.name); - kfree(node_attr); + struct weighted_interleave_state *state; + bool wi_auto = true; + + rcu_read_lock(); + state = rcu_dereference(wi_state); + if (state) + wi_auto = state->mode_auto; + rcu_read_unlock(); + + return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); } -static void sysfs_wi_release(struct kobject *wi_kobj) +static ssize_t weighted_interleave_auto_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) { + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *bw; + bool input; int i; + if (kstrtobool(buf, &input)) + return -EINVAL; + + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) + return -ENOMEM; for (i = 0; i < nr_node_ids; i++) - sysfs_wi_node_release(node_attrs[i], wi_kobj); - kobject_put(wi_kobj); + new_wi_state->iw_table[i] = 1; + + mutex_lock(&wi_state_lock); + if (!input) { + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (!old_wi_state) + goto update_wi_state; + if (input == old_wi_state->mode_auto) { + mutex_unlock(&wi_state_lock); + return count; + } + + memcpy(new_wi_state->iw_table, 
old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + goto update_wi_state; + } + + bw = node_bw_table; + if (!bw) { + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + return -ENODEV; + } + + new_wi_state->mode_auto = true; + reduce_interleave_weights(bw, new_wi_state->iw_table); + +update_wi_state: + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } + return count; +} + +static void sysfs_wi_node_delete(int nid) +{ + struct iw_node_attr *attr; + + if (nid < 0 || nid >= nr_node_ids) + return; + + mutex_lock(&wi_group->kobj_lock); + attr = wi_group->nattrs[nid]; + if (!attr) { + mutex_unlock(&wi_group->kobj_lock); + return; + } + + wi_group->nattrs[nid] = NULL; + mutex_unlock(&wi_group->kobj_lock); + + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); + kfree(attr->kobj_attr.attr.name); + kfree(attr); +} + +static void sysfs_wi_node_delete_all(void) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) + sysfs_wi_node_delete(nid); +} + +static void wi_state_free(void) +{ + struct weighted_interleave_state *old_wi_state; + + mutex_lock(&wi_state_lock); + + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (!old_wi_state) { + mutex_unlock(&wi_state_lock); + goto out; + } + + rcu_assign_pointer(wi_state, NULL); + mutex_unlock(&wi_state_lock); + synchronize_rcu(); + kfree(old_wi_state); +out: + kfree(&wi_group->wi_kobj); +} + +static struct kobj_attribute wi_auto_attr = + __ATTR(auto, 0664, weighted_interleave_auto_show, + weighted_interleave_auto_store); + +static void wi_cleanup(void) { + sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + sysfs_wi_node_delete_all(); + wi_state_free(); +} + +static void wi_kobj_release(struct kobject *wi_kobj) +{ + kfree(wi_group); } static const struct kobj_type wi_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .release = sysfs_wi_release, + .release = wi_kobj_release, }; -static int add_weight_node(int nid, struct kobject *wi_kobj) +static int sysfs_wi_node_add(int nid) { - struct iw_node_attr *node_attr; + int ret; char *name; + struct iw_node_attr *new_attr; - node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); - if (!node_attr) + if (nid < 0 || nid >= nr_node_ids) { + pr_err("invalid node id: %d\n", nid); + return -EINVAL; + } + + new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); + if (!new_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { - kfree(node_attr); + kfree(new_attr); return -ENOMEM; } - sysfs_attr_init(&node_attr->kobj_attr.attr); - node_attr->kobj_attr.attr.name = name; - node_attr->kobj_attr.attr.mode = 0644; - node_attr->kobj_attr.show = node_show; - node_attr->kobj_attr.store = node_store; - node_attr->nid = nid; + sysfs_attr_init(&new_attr->kobj_attr.attr); + new_attr->kobj_attr.attr.name = name; + new_attr->kobj_attr.attr.mode = 0644; + new_attr->kobj_attr.show = node_show; + new_attr->kobj_attr.store = node_store; + new_attr->nid = nid; - if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { - kfree(node_attr->kobj_attr.attr.name); - kfree(node_attr); - pr_err("failed to add attribute to weighted_interleave\n"); - return -ENOMEM; + mutex_lock(&wi_group->kobj_lock); + if (wi_group->nattrs[nid]) { + mutex_unlock(&wi_group->kobj_lock); + ret = -EEXIST; + goto out; } - node_attrs[nid] = node_attr; + ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); + if (ret) { + mutex_unlock(&wi_group->kobj_lock); + goto out; + } + 
wi_group->nattrs[nid] = new_attr; + mutex_unlock(&wi_group->kobj_lock); return 0; + +out: + kfree(new_attr->kobj_attr.attr.name); + kfree(new_attr); + return ret; } -static int add_weighted_interleave_group(struct kobject *root_kobj) +static int wi_node_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + int err; + struct memory_notify *arg = data; + int nid = arg->status_change_nid; + + if (nid < 0) + return NOTIFY_OK; + + switch (action) { + case MEM_ONLINE: + err = sysfs_wi_node_add(nid); + if (err) + pr_err("failed to add sysfs for node%d during hotplug: %d\n", + nid, err); + break; + case MEM_OFFLINE: + sysfs_wi_node_delete(nid); + break; + } + + return NOTIFY_OK; +} + +static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) { - struct kobject *wi_kobj; int nid, err; - wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); - if (!wi_kobj) + wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), + GFP_KERNEL); + if (!wi_group) return -ENOMEM; + mutex_init(&wi_group->kobj_lock); - err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, + err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, "weighted_interleave"); - if (err) { - kfree(wi_kobj); - return err; - } + if (err) + goto err_put_kobj; - for_each_node_state(nid, N_POSSIBLE) { - err = add_weight_node(nid, wi_kobj); + err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + if (err) + goto err_put_kobj; + + for_each_online_node(nid) { + if (!node_state(nid, N_MEMORY)) + continue; + + err = sysfs_wi_node_add(nid); if (err) { - pr_err("failed to add sysfs [node%d]\n", nid); - break; + pr_err("failed to add sysfs for node%d during init: %d\n", + nid, err); + goto err_cleanup_kobj; } } - if (err) - kobject_put(wi_kobj); - return 0; -} -static void mempolicy_kobj_release(struct kobject *kobj) -{ - u8 *old; + hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); + return 0; - mutex_lock(&iw_table_lock); - old = rcu_dereference_protected(iw_table, - lockdep_is_held(&iw_table_lock)); - rcu_assign_pointer(iw_table, NULL); - mutex_unlock(&iw_table_lock); - synchronize_rcu(); - kfree(old); - kfree(node_attrs); - kfree(kobj); +err_cleanup_kobj: + wi_cleanup(); + kobject_del(&wi_group->wi_kobj); +err_put_kobj: + kobject_put(&wi_group->wi_kobj); + return err; } -static const struct kobj_type mempolicy_ktype = { - .release = mempolicy_kobj_release -}; - static int __init mempolicy_sysfs_init(void) { int err; static struct kobject *mempolicy_kobj; - mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); - if (!mempolicy_kobj) { - err = -ENOMEM; - goto err_out; - } - - node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), - GFP_KERNEL); - if (!node_attrs) { - err = -ENOMEM; - goto mempol_out; - } + mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); + if (!mempolicy_kobj) + return -ENOMEM; - err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, - "mempolicy"); + err = add_weighted_interleave_group(mempolicy_kobj); if (err) - goto node_out; + goto err_kobj; - err = add_weighted_interleave_group(mempolicy_kobj); - if (err) { - pr_err("mempolicy sysfs structure failed to initialize\n"); - kobject_put(mempolicy_kobj); - return err; - } + return 0; - return err; -node_out: - kfree(node_attrs); -mempol_out: - kfree(mempolicy_kobj); -err_out: - pr_err("failed to add mempolicy kobject to the system\n"); +err_kobj: + kobject_del(mempolicy_kobj); + kobject_put(mempolicy_kobj); return err; } diff --git 
a/mm/memremap.c b/mm/memremap.c index 2aebc1b192da..c417c843e9b1 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -130,7 +130,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); + pfnmap_untrack(PHYS_PFN(range->start), range_len(range)); pgmap_array_delete(range); } @@ -211,8 +211,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0, - range_len(range)); + error = pfnmap_track(PHYS_PFN(range->start), range_len(range), + &params->pgprot); if (error) goto err_pfn_remap; @@ -277,7 +277,7 @@ err_add_memory: if (!is_private) kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); + pfnmap_untrack(PHYS_PFN(range->start), range_len(range)); err_pfn_remap: pgmap_array_delete(range); return error; diff --git a/mm/migrate.c b/mm/migrate.c index 676d9cfc7059..8cf0f9c9599d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@ #include <trace/events/migrate.h> #include "internal.h" +#include "swap.h" bool isolate_movable_page(struct page *page, isolate_mode_t mode) { @@ -445,20 +446,6 @@ unlock: } #endif -static int folio_expected_refs(struct address_space *mapping, - struct folio *folio) -{ - int refs = 1; - if (!mapping) - return refs; - - refs += folio_nr_pages(folio); - if (folio_test_private(folio)) - refs++; - - return refs; -} - /* * Replace the folio in the mapping. * @@ -601,7 +588,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count) { - int expected_count = folio_expected_refs(mapping, folio) + extra_count; + int expected_count = folio_expected_ref_count(folio) + extra_count + 1; if (folio_ref_count(folio) != expected_count) return -EAGAIN; @@ -618,7 +605,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { XA_STATE(xas, &mapping->i_pages, folio_index(src)); - int rc, expected_count = folio_expected_refs(mapping, src); + int rc, expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN; @@ -749,7 +736,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, void *src_private, enum migrate_mode mode) { - int rc, expected_count = folio_expected_refs(mapping, src); + int rc, expected_count = folio_expected_ref_count(src) + 1; /* Check whether src does not have extra refs before we do more work */ if (folio_ref_count(src) != expected_count) @@ -837,7 +824,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, return migrate_folio(mapping, dst, src, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = folio_expected_refs(mapping, src); + expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN; @@ -947,66 +934,20 @@ int filemap_migrate_folio(struct address_space *mapping, EXPORT_SYMBOL_GPL(filemap_migrate_folio); /* - * Writeback a folio to clean the dirty state - */ -static int writeout(struct address_space *mapping, struct folio *folio) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = 1, - .range_start = 0, - .range_end =
LLONG_MAX, - .for_reclaim = 1 - }; - int rc; - - if (!mapping->a_ops->writepage) - /* No write method for the address space */ - return -EINVAL; - - if (!folio_clear_dirty_for_io(folio)) - /* Someone else already triggered a write */ - return -EAGAIN; - - /* - * A dirty folio may imply that the underlying filesystem has - * the folio on some queue. So the folio must be clean for - * migration. Writeout may mean we lose the lock and the - * folio state is no longer what we checked for earlier. - * At this point we know that the migration attempt cannot - * be successful. - */ - remove_migration_ptes(folio, folio, 0); - - rc = mapping->a_ops->writepage(&folio->page, &wbc); - - if (rc != AOP_WRITEPAGE_ACTIVATE) - /* unlocked. Relock */ - folio_lock(folio); - - return (rc < 0) ? -EIO : -EAGAIN; -} - -/* * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { - if (folio_test_dirty(src)) { - /* Only writeback folios in full synchronous migration */ - switch (mode) { - case MIGRATE_SYNC: - break; - default: - return -EBUSY; - } - return writeout(mapping, src); - } + WARN_ONCE(mapping->a_ops->writepages, + "%ps does not implement migrate_folio\n", + mapping->a_ops); + if (folio_test_dirty(src)) + return -EBUSY; /* - * Buffers may be managed in a filesystem specific way. - * We must have no buffers or drop them. + * Filesystem may have private data at folio->private that we + * can't migrate automatically. */ if (!filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; diff --git a/mm/mincore.c b/mm/mincore.c index 832f29f46767..42d6c9c8da86 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -21,6 +21,7 @@ #include <linux/uaccess.h> #include "swap.h" +#include "internal.h" static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -105,6 +106,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *ptep; unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; + int step, i; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -118,16 +120,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, walk->action = ACTION_AGAIN; return 0; } - for (; addr != end; ptep++, addr += PAGE_SIZE) { + for (; addr != end; ptep += step, addr += step * PAGE_SIZE) { pte_t pte = ptep_get(ptep); + step = 1; /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); - else if (pte_present(pte)) - *vec = 1; - else { /* pte is a swap entry */ + else if (pte_present(pte)) { + unsigned int batch = pte_batch_hint(ptep, pte); + + if (batch > 1) { + unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + step = min_t(unsigned int, batch, max_nr); + } + + for (i = 0; i < step; i++) + vec[i] = 1; + } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -146,7 +158,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, #endif } } - vec++; + vec += step; } pte_unmap_unlock(ptep - 1, ptl); out: diff --git a/mm/mm_init.c b/mm/mm_init.c index 9659689b8ace..f0bd0830daad 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,6 +30,7 @@ #include <linux/crash_dump.h> #include <linux/execmem.h> #include <linux/vmstat.h> +#include <linux/kexec_handover.h> #include 
<linux/hugetlb.h> #include "internal.h" #include "slab.h" @@ -743,7 +744,7 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static void __meminit init_deferred_page(unsigned long pfn, int nid) +static void __meminit __init_deferred_page(unsigned long pfn, int nid) { if (early_page_initialised(pfn, nid)) return; @@ -763,11 +764,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static inline void init_deferred_page(unsigned long pfn, int nid) +static inline void __init_deferred_page(unsigned long pfn, int nid) { } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void __meminit init_deferred_page(unsigned long pfn, int nid) +{ + __init_deferred_page(pfn, nid); +} + /* * Initialised pages do not have PageReserved set. This function is * called for each range allocated by the bootmem allocator and @@ -777,22 +783,19 @@ static inline void init_deferred_page(unsigned long pfn, int nid) void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid) { - unsigned long start_pfn = PFN_DOWN(start); - unsigned long end_pfn = PFN_UP(end); + unsigned long pfn; - for (; start_pfn < end_pfn; start_pfn++) { - if (pfn_valid(start_pfn)) { - struct page *page = pfn_to_page(start_pfn); + for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { + struct page *page = pfn_to_page(pfn); - init_deferred_page(start_pfn, nid); + __init_deferred_page(pfn, nid); - /* - * no need for atomic set_bit because the struct - * page is not visible yet so nobody should - * access it yet. - */ - __SetPageReserved(page); - } + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); } } @@ -828,7 +831,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * - physical memory bank size is not necessarily the exact multiple of the * arbitrary section size * - early reserved memory may not be listed in memblock.memory - * - non-memory regions covered by the contigious flatmem mapping + * - non-memory regions covered by the contiguous flatmem mapping * - memory layouts defined with memmap= kernel parameter may not align * nicely with memmap sections * @@ -848,11 +851,7 @@ static void __init init_unavailable_range(unsigned long spfn, unsigned long pfn; u64 pgcnt = 0; - for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(pageblock_start_pfn(pfn))) { - pfn = pageblock_end_pfn(pfn) - 1; - continue; - } + for_each_valid_pfn(pfn, spfn, epfn) { __init_single_page(pfn_to_page(pfn), pfn, zone, node); __SetPageReserved(pfn_to_page(pfn)); pgcnt++; @@ -1441,7 +1440,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) #ifdef CONFIG_UNACCEPTED_MEMORY INIT_LIST_HEAD(&zone->unaccepted_pages); - INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work); #endif } @@ -1786,7 +1784,7 @@ static bool arch_has_descending_max_zone_pfns(void) return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40); } -static void set_high_memory(void) +static void __init set_high_memory(void) { phys_addr_t highmem = memblock_end_of_DRAM(); @@ -1908,7 +1906,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid); /* - * No sysfs hierarcy will be created via register_one_node() + * No sysfs hierarchy will be created via register_one_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. 
The benefit is userspace *program won't be confused by sysfs files/directories of @@ -2668,12 +2666,6 @@ static void __init report_meminit(void) stack = "all(pattern)"; else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO)) stack = "all(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL)) - stack = "byref_all(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF)) - stack = "byref(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER)) - stack = "__user(zero)"; else stack = "off"; @@ -2766,6 +2758,13 @@ void __init mm_core_init(void) report_meminit(); kmsan_init_shadow(); stack_depot_early_init(); + + /* + * KHO memory setup must happen while memblock is still active, but + * as close as possible to buddy initialization + */ + kho_memory_init(); + memblock_free_all(); mem_init(); kmem_cache_init(); diff --git a/mm/mmap.c b/mm/mmap.c index bd210aaf7ebd..09c563c95112 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -475,7 +475,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags &= ~VM_MAYEXEC; } - if (!file->f_op->mmap) + if (!file_has_valid_mmap_hooks(file)) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; @@ -1321,48 +1321,6 @@ destroy: vm_unacct_memory(nr_accounted); } -/* Insert vm structure into process list sorted by address - * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_rwsem is taken here. - */ -int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) -{ - unsigned long charged = vma_pages(vma); - - - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) - return -ENOMEM; - - if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - - /* - * The vm_pgoff of a purely anonymous vma should be irrelevant - * until its first write fault, when page's anon_vma and index - * are set. But now set the vm_pgoff it will almost certainly - * end up with (unless mremap moves it elsewhere before that - * first wfault), so /proc/pid/maps tells a consistent story. - * - * By setting it to reflect the virtual start address of the - * vma, merges and splits can happen in a seamless way, just - * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap and in do_brk_flags. - */ - if (vma_is_anonymous(vma)) { - BUG_ON(vma->anon_vma); - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; - } - - if (vma_link(mm, vma)) { - if (vma->vm_flags & VM_ACCOUNT) - vm_unacct_memory(charged); - return -ENOMEM; - } - - return 0; -} - /* * Return true if the calling process may expand its vm space by the passed * number of pages @@ -1596,7 +1554,7 @@ static const struct ctl_table mmap_table[] = { #endif /* CONFIG_SYSCTL */ /* - * initialise the percpu counter for VM + * initialise the percpu counter for VM, initialise VMA state. */ void __init mmap_init(void) { @@ -1607,6 +1565,7 @@ void __init mmap_init(void) #ifdef CONFIG_SYSCTL register_sysctl_init("vm", mmap_table); #endif + vma_state_init(); } /* @@ -1718,90 +1677,6 @@ static int __meminit init_reserve_notifier(void) subsys_initcall(init_reserve_notifier); /* - * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between - * this VMA and its relocated range, which will now reside at [vma->vm_start - - * shift, vma->vm_end - shift). - * - * This function is almost certainly NOT what you want for anything other than - * early executable temporary stack relocation. 
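A few hunks earlier, mincore_pte_range() learned to advance over present PTEs in pte_batch_hint()-sized steps instead of one page at a time; the user-visible mincore(2) contract, one vector byte per page with bit 0 set for resident pages, is unchanged. The short program below only demonstrates that contract from userspace and is not tied to the batching itself.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* One vector byte per page; bit 0 is set when the page is resident. */
int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        size_t len = 4 * psz;
        unsigned char vec[4], *buf;

        buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;

        buf[0] = 1;             /* fault in page 0 */
        buf[2 * psz] = 1;       /* fault in page 2 */

        if (mincore(buf, len, vec))
                return 1;

        for (int i = 0; i < 4; i++)
                printf("page %d resident: %d\n", i, vec[i] & 1);

        munmap(buf, len);
        return 0;
}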
- */ -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) -{ - /* - * The process proceeds as follows: - * - * 1) Use shift to calculate the new vma endpoints. - * 2) Extend vma to cover both the old and new ranges. This ensures the - * arguments passed to subsequent functions are consistent. - * 3) Move vma's page tables to the new range. - * 4) Free up any cleared pgd range. - * 5) Shrink the vma to cover only the new range. - */ - - struct mm_struct *mm = vma->vm_mm; - unsigned long old_start = vma->vm_start; - unsigned long old_end = vma->vm_end; - unsigned long length = old_end - old_start; - unsigned long new_start = old_start - shift; - unsigned long new_end = old_end - shift; - VMA_ITERATOR(vmi, mm, new_start); - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); - struct vm_area_struct *next; - struct mmu_gather tlb; - PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); - - BUG_ON(new_start > new_end); - - /* - * ensure there are no vmas between where we want to go - * and where we are - */ - if (vma != vma_next(&vmi)) - return -EFAULT; - - vma_iter_prev_range(&vmi); - /* - * cover the whole range: [new_start, old_end) - */ - vmg.middle = vma; - if (vma_expand(&vmg)) - return -ENOMEM; - - /* - * move the page tables downwards, on failure we rely on - * process cleanup to remove whatever mess we made. - */ - pmc.for_stack = true; - if (length != move_page_tables(&pmc)) - return -ENOMEM; - - tlb_gather_mmu(&tlb, mm); - next = vma_next(&vmi); - if (new_end > old_start) { - /* - * when the old and new regions overlap clear from new_end. - */ - free_pgd_range(&tlb, new_end, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); - } else { - /* - * otherwise, clean from old_start; this is done to not touch - * the address space in [new_end, old_start) some architectures - * have constraints on va-space that make this illegal (IA64) - - * for the others its just a little faster. - */ - free_pgd_range(&tlb, old_start, old_end, new_end, - next ? next->vm_start : USER_PGTABLES_CEILING); - } - tlb_finish_mmu(&tlb); - - vma_prev(&vmi); - /* Shrink the vma to just the new range */ - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); -} - -#ifdef CONFIG_MMU -/* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a * downward-growing stack, then attempt to expand the stack to contain it. @@ -1844,10 +1719,175 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, mmap_write_downgrade(mm); return true; } -#else -bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, bool write) + +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - return false; + struct vm_area_struct *mpnt, *tmp; + int retval; + unsigned long charge = 0; + LIST_HEAD(uf); + VMA_ITERATOR(vmi, mm, 0); + + if (mmap_write_lock_killable(oldmm)) + return -EINTR; + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + dup_mm_exe_file(mm, oldmm); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) + goto out; + + mt_clear_in_rcu(vmi.mas.tree); + for_each_vma(vmi, mpnt) { + struct file *file; + + vma_start_write(mpnt); + if (mpnt->vm_flags & VM_DONTCOPY) { + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, + mpnt->vm_end, GFP_KERNEL); + if (retval) + goto loop_out; + + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + /* + * Don't duplicate many vmas if we've been oom-killed (for + * example) + */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto loop_out; + } + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + + tmp = vm_area_dup(mpnt); + if (!tmp) + goto fail_nomem; + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* + * VM_WIPEONFORK gets a clean slate in the child. + * Don't prepare anon_vma until fault since we don't + * copy page for current vma. + */ + tmp->anon_vma = NULL; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + vm_flags_clear(tmp, VM_LOCKED_MASK); + /* + * Copy/update hugetlb private vma information. + */ + if (is_vm_hugetlb_page(tmp)) + hugetlb_dup_vma_private(tmp); + + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); + + mm->map_count++; + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + file = tmp->vm_file; + if (file) { + struct address_space *mapping = file->f_mapping; + + get_file(file); + i_mmap_lock_write(mapping); + if (vma_is_shared_maywrite(tmp)) + mapping_allow_writable(mapping); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(tmp, mpnt); + + if (retval) { + mpnt = vma_next(&vmi); + goto loop_out; + } + } + /* a new mm has just been created */ + retval = arch_dup_mmap(oldmm, mm); +loop_out: + vma_iter_free(&vmi); + if (!retval) { + mt_set_in_rcu(vmi.mas.tree); + ksm_fork(mm, oldmm); + khugepaged_fork(mm, oldmm); + } else { + + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + if (mpnt) { + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + /* Avoid OOM iterating a broken tree */ + set_bit(MMF_OOM_SKIP, &mm->flags); + } + /* + * The mm_struct is going to exit, but the locks will be dropped + * first. Set the mm_struct as unstable is advisable as it is + * not fully initialised. 
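The dup_mmap() loop above special-cases two flags: VM_DONTCOPY mappings are simply not duplicated into the child, and VM_WIPEONFORK mappings get a clean slate in the child (no anon_vma, no copied pages). From userspace those correspond to madvise(MADV_DONTFORK) and madvise(MADV_WIPEONFORK); the sketch below shows the wipe-on-fork behaviour, assuming a libc that defines MADV_WIPEONFORK (Linux 4.14 or later).

#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        unsigned char *buf = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;

        buf[0] = 42;
        if (madvise(buf, psz, MADV_WIPEONFORK)) {
                perror("madvise");
                return 1;
        }

        pid_t pid = fork();
        if (pid == 0) {
                /* child: the VM_WIPEONFORK range comes up zero-filled */
                printf("child sees %d\n", buf[0]);
                _exit(0);
        }
        waitpid(pid, NULL, 0);
        printf("parent still sees %d\n", buf[0]);       /* 42 */
        return 0;
}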
+ */ + set_bit(MMF_UNSTABLE, &mm->flags); + } +out: + mmap_write_unlock(mm); + flush_tlb_mm(oldmm); + mmap_write_unlock(oldmm); + if (!retval) + dup_userfaultfd_complete(&uf); + else + dup_userfaultfd_fail(&uf); + return retval; + +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + vm_area_free(tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto loop_out; } -#endif diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index e7dbaf96aa17..5f725cc67334 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -42,3 +42,276 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_MMU +#ifdef CONFIG_PER_VMA_LOCK +static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +{ + unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + + /* Additional refcnt if the vma is attached. */ + if (!detaching) + tgt_refcnt++; + + /* + * If vma is detached then only vma_mark_attached() can raise the + * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). + */ + if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + return false; + + rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + refcount_read(&vma->vm_refcnt) == tgt_refcnt, + TASK_UNINTERRUPTIBLE); + lock_acquired(&vma->vmlock_dep_map, _RET_IP_); + + return true; +} + +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +{ + *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); +} + +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +{ + bool locked; + + /* + * __vma_enter_locked() returns false immediately if the vma is not + * attached, otherwise it waits until refcnt is indicating that vma + * is attached with no readers. + */ + locked = __vma_enter_locked(vma, false); + + /* + * We should use WRITE_ONCE() here because we can have concurrent reads + * from the early lockless pessimistic check in vma_start_read(). + * We don't really care about the correctness of that early check, but + * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. + */ + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); + + if (locked) { + bool detached; + + __vma_exit_locked(vma, &detached); + WARN_ON_ONCE(detached); /* vma should remain attached */ + } +} +EXPORT_SYMBOL_GPL(__vma_start_write); + +void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * We are the only writer, so no need to use vma_refcount_put(). + * The condition below is unlikely because the vma has been already + * write-locked and readers can increment vm_refcnt only temporarily + * before they check vm_lock_seq, realize the vma is locked and drop + * back the vm_refcnt. That is a narrow window for observing a raised + * vm_refcnt. + */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* Wait until vma is detached with no readers. */ + if (__vma_enter_locked(vma, true)) { + bool detached; + + __vma_exit_locked(vma, &detached); + WARN_ON_ONCE(!detached); + } + } +} + +/* + * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be + * stable and not isolated. If the VMA is not found or is being modified the + * function returns NULL. 
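A toy model of the vm_refcnt arithmetic that __vma_enter_locked() uses above: an attached VMA holds one reference, each reader holds one more, and a writer adds a large VMA_LOCK_OFFSET bias and then waits until the count is exactly the bias, plus one if the VMA stays attached. The bias value below is arbitrary and the model is single-threaded; only the target computation is meant to mirror the code.

#include <stdio.h>
#include <stdbool.h>

#define VMA_LOCK_OFFSET 0x40000000u     /* illustrative value only */

int main(void)
{
        bool detaching = false;
        unsigned int refcnt = 1;        /* attached, no readers */
        unsigned int target = VMA_LOCK_OFFSET + (detaching ? 0 : 1);

        refcnt += 1;                    /* a reader takes a reference */
        refcnt += VMA_LOCK_OFFSET;      /* writer announces itself */
        printf("refcnt 0x%x, writer waits for 0x%x\n", refcnt, target);

        refcnt -= 1;                    /* reader sees vm_lock_seq, backs off */
        printf("refcnt 0x%x, writer proceeds: %s\n", refcnt,
               refcnt == target ? "yes" : "no");
        return 0;
}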
+ */ +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + MA_STATE(mas, &mm->mm_mt, address, address); + struct vm_area_struct *vma; + + rcu_read_lock(); +retry: + vma = mas_walk(&mas); + if (!vma) + goto inval; + + vma = vma_start_read(mm, vma); + if (IS_ERR_OR_NULL(vma)) { + /* Check if the VMA got isolated after we found it */ + if (PTR_ERR(vma) == -EAGAIN) { + count_vm_vma_lock_event(VMA_LOCK_MISS); + /* The area was replaced with another one */ + goto retry; + } + + /* Failed to lock the VMA */ + goto inval; + } + /* + * At this point, we have a stable reference to a VMA: The VMA is + * locked and we know it hasn't already been isolated. + * From here on, we can access the VMA without worrying about which + * fields are accessible for RCU readers. + */ + + /* Check if the vma we locked is the right one. */ + if (unlikely(vma->vm_mm != mm || + address < vma->vm_start || address >= vma->vm_end)) + goto inval_end_read; + + rcu_read_unlock(); + return vma; + +inval_end_read: + vma_end_read(vma); +inval: + rcu_read_unlock(); + count_vm_vma_lock_event(VMA_LOCK_ABORT); + return NULL; +} +#endif /* CONFIG_PER_VMA_LOCK */ + +#ifdef CONFIG_LOCK_MM_AND_FIND_VMA +#include <linux/extable.h> + +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + if (likely(mmap_read_trylock(mm))) + return true; + + if (regs && !user_mode(regs)) { + unsigned long ip = exception_ip(regs); + if (!search_exception_tables(ip)) + return false; + } + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + mmap_read_unlock(mm); + if (regs && !user_mode(regs)) { + unsigned long ip = exception_ip(regs); + if (!search_exception_tables(ip)) + return false; + } + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalent to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. 
+ */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. + */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} +#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */ + +#else /* CONFIG_MMU */ + +/* + * At least xtensa ends up having protection faults even with no + * MMU.. No stack expansion, at least. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + mmap_read_lock(mm); + vma = vma_lookup(mm, addr); + if (!vma) + mmap_read_unlock(mm); + return vma; +} + +#endif /* CONFIG_MMU */ diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index fc18fe274505..8e0125dc0522 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -4,7 +4,7 @@ * * Copyright (C) 2008 Qumranet, Inc. * Copyright (C) 2008 SGI - * Christoph Lameter <cl@linux.com> + * Christoph Lameter <cl@gentwo.org> */ #include <linux/rculist.h> diff --git a/mm/mprotect.c b/mm/mprotect.c index 62c1f7945741..88608d0dc2c2 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -379,7 +379,7 @@ again: if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || pgtable_split_needed(vma, cp_flags)) { - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); /* * For file-backed, the pmd could have been * cleared; make sure pmd populated if diff --git a/mm/mremap.c b/mm/mremap.c index 7db9da609c84..83e359754961 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1188,12 +1188,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm, mremap_userfaultfd_prep(new_vma, vrm->uf); } - if (is_vm_hugetlb_page(vma)) - clear_vma_resv_huge_pages(vma); - - /* Tell pfnmap has moved from this vma */ - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(vma); + fixup_hugetlb_reservations(vma); *new_vma_ptr = new_vma; return err; diff --git a/mm/nommu.c b/mm/nommu.c index 617e7ba8022f..b624acec6d2e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -200,7 +200,23 @@ void *vmalloc_noprof(unsigned long size) } EXPORT_SYMBOL(vmalloc_noprof); -void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof); +/* + * vmalloc_huge_node - allocate virtually contiguous memory, on a node + * + * @size: allocation size + * @gfp_mask: flags for the page level allocator + * @node: node to use for allocation or NUMA_NO_NODE + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * Due to NOMMU implications the node argument and HUGE page attribute is + * ignored. 
+ */ +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) +{ + return __vmalloc_noprof(size, gfp_mask); +} /* * vzalloc - allocate virtually contiguous memory with zero fill @@ -399,7 +415,8 @@ static const struct ctl_table nommu_table[] = { }; /* - * initialise the percpu counter for VM and region record slabs + * initialise the percpu counter for VM and region record slabs, initialise VMA + * state. */ void __init mmap_init(void) { @@ -409,6 +426,7 @@ void __init mmap_init(void) VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); register_sysctl_init("vm", nommu_table); + vma_state_init(); } /* @@ -627,22 +645,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); /* - * At least xtensa ends up having protection faults even with no - * MMU.. No stack expansion, at least. - */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - mmap_read_lock(mm); - vma = vma_lookup(mm, addr); - if (!vma) - mmap_read_unlock(mm); - return vma; -} - -/* * expand a stack to a given address * - not supported under NOMMU conditions */ @@ -1890,3 +1892,11 @@ static int __meminit init_admin_reserve(void) return 0; } subsys_initcall(init_admin_reserve); + +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + mmap_write_lock(oldmm); + dup_mm_exe_file(mm, oldmm); + mmap_write_unlock(oldmm); + return 0; +} diff --git a/mm/numa.c b/mm/numa.c index f1787d7713a6..7d5e06fe5bd4 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -13,7 +13,6 @@ void __init alloc_node_data(int nid) { const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); u64 nd_pa; - void *nd; int tnid; /* Allocate node data. Try node-local memory and then any node. */ @@ -21,7 +20,6 @@ void __init alloc_node_data(int nid) if (!nd_pa) panic("Cannot allocate %zu bytes for node %d data\n", nd_size, nid); - nd = __va(nd_pa); /* report and initialize */ pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, @@ -30,7 +28,7 @@ void __init alloc_node_data(int nid) if (tnid != nid) pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); - node_data[nid] = nd; + node_data[nid] = __va(nd_pa); memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); } diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index ff4054f4334d..541a99c4071a 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -201,6 +201,28 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) } /** + * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the numa_reserved_meminfo. + * + * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances + * against memblock_type information and moves any that intersect reserved + * ranges to numa_reserved_meminfo. However, when that information is known + * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk + * to numa_reserved_meminfo directly. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int __init numa_add_reserved_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo); +} + +/** * numa_cleanup_meminfo - Cleanup a numa_meminfo * @mi: numa_meminfo to clean up * diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c81624bc3969..b603a59cf8f7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -41,6 +41,7 @@ #include <trace/events/writeback.h> #include "internal.h" +#include "swap.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). @@ -520,8 +521,8 @@ static int dirty_ratio_handler(const struct ctl_table *table, int write, void *b ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - writeback_set_ratelimit(); vm_dirty_bytes = 0; + writeback_set_ratelimit(); } return ret; } @@ -2621,27 +2622,6 @@ int write_cache_pages(struct address_space *mapping, } EXPORT_SYMBOL(write_cache_pages); -static int writeback_use_writepage(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct folio *folio = NULL; - struct blk_plug plug; - int err; - - blk_start_plug(&plug); - while ((folio = writeback_iter(mapping, wbc, folio, &err))) { - err = mapping->a_ops->writepage(&folio->page, wbc); - if (err == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - err = 0; - } - mapping_set_error(mapping, err); - } - blk_finish_plug(&plug); - - return err; -} - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -2652,14 +2632,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { - if (mapping->a_ops->writepages) { + if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); - } else if (mapping->a_ops->writepage) { - ret = writeback_use_writepage(mapping, wbc); - } else { + else /* deal with chardevs and other special files */ ret = 0; - } if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5669baf2a6fe..2ef3c07266b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -290,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static bool cond_accept_memory(struct zone *zone, unsigned int order); +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags); static bool __free_unaccepted(struct page *page); int page_group_by_mobility_disabled __read_mostly; @@ -897,9 +898,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif -#ifdef CONFIG_PAGE_POOL - ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | -#endif + page_pool_page_is_pp(page) | (page->flags & check_flags))) return false; @@ -926,26 +925,18 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif -#ifdef CONFIG_PAGE_POOL - if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + if (unlikely(page_pool_page_is_pp(page))) bad_reason = "page_pool leak"; -#endif return bad_reason; } -static void free_page_is_bad_report(struct page *page) -{ - bad_page(page, - page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); -} - static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return false; /* Something has gone sideways, find 
it */ - free_page_is_bad_report(page); + bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); return true; } @@ -1151,14 +1142,9 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) __pgalloc_tag_sub(page, nr); } -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) +/* When tag is not NULL, assuming mem_alloc_profiling_enabled */ +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) { - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = __pgalloc_tag_get(page); if (tag) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } @@ -1168,7 +1154,7 @@ static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ @@ -2078,31 +2064,25 @@ static bool should_try_claim_block(unsigned int order, int start_mt) /* * Check whether there is a suitable fallback freepage with requested order. - * Sets *claim_block to instruct the caller whether it should convert a whole - * pageblock to the returned migratetype. - * If only_claim is true, this function returns fallback_mt only if + * If claimable is true, this function returns fallback_mt only if * we would do this whole-block claiming. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. */ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_claim, bool *claim_block) + int migratetype, bool claimable) { int i; - int fallback_mt; + + if (claimable && !should_try_claim_block(order, migratetype)) + return -2; if (area->nr_free == 0) return -1; - *claim_block = false; for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { - fallback_mt = fallbacks[migratetype][i]; - if (free_area_empty(area, fallback_mt)) - continue; + int fallback_mt = fallbacks[migratetype][i]; - if (should_try_claim_block(order, migratetype)) - *claim_block = true; - - if (*claim_block || !only_claim) + if (!free_area_empty(area, fallback_mt)) return fallback_mt; } @@ -2199,7 +2179,6 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype, int min_order = order; struct page *page; int fallback_mt; - bool claim_block; /* * Do not steal pages from freelists belonging to other pageblocks @@ -2218,11 +2197,14 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype, --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &claim_block); + start_migratetype, true); + + /* No block in that order */ if (fallback_mt == -1) continue; - if (!claim_block) + /* Advanced into orders too low to claim, abort */ + if (fallback_mt == -2) break; page = get_page_from_free_area(area, fallback_mt); @@ -2250,12 +2232,11 @@ __rmqueue_steal(struct zone *zone, int order, int start_migratetype) int current_order; struct page *page; int fallback_mt; - bool claim_block; for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &claim_block); + start_migratetype, false); if 
(fallback_mt == -1) continue; @@ -2676,10 +2657,10 @@ static void free_frozen_page_commit(struct zone *zone, * stops will be drained from vmstat refresh context. */ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { - free_high = (pcp->free_count >= batch && + free_high = (pcp->free_count >= (batch + pcp->high_min / 2) && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || - pcp->count >= READ_ONCE(batch))); + pcp->count >= batch)); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; @@ -3558,7 +3539,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, retry: /* * Scan zonelist, looking for a zone with enough free. - * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. + * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c. */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; @@ -3616,7 +3597,7 @@ retry: } } - cond_accept_memory(zone, order); + cond_accept_memory(zone, order, alloc_flags); /* * Detect whether the number of free pages is below high @@ -3643,7 +3624,7 @@ check_alloc_wmark: gfp_mask)) { int ret; - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* @@ -3696,7 +3677,7 @@ try_this_zone: return page; } else { - if (cond_accept_memory(zone, order)) + if (cond_accept_memory(zone, order, alloc_flags)) goto try_this_zone; /* Try again if zone has deferred pages */ @@ -4245,7 +4226,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) /* * Ignore cpuset mems for non-blocking __GFP_HIGH (probably * GFP_ATOMIC) rather than fail, see the comment for - * cpuset_node_allowed(). + * cpuset_current_node_allowed(). */ if (alloc_flags & ALLOC_MIN_RESERVE) alloc_flags &= ~ALLOC_CPUSET; @@ -4566,6 +4547,14 @@ restart: } retry: + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * infinite retries. 
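The find_suitable_fallback() rework above replaces the claim_block output parameter with a richer return value, and the two callers react differently: __rmqueue_claim() keeps scanning lower orders on -1 but aborts on -2, while __rmqueue_steal() only cares whether any fallback type was found. The snippet below is a toy model of that contract with invented inputs, not kernel code.

#include <stdio.h>

/*
 * Model of the return convention:
 *   >= 0  a fallback migratetype with free pages of this order exists
 *   -1    nothing free at this order, try the next one
 *   -2    whole-block claiming is no longer worthwhile, stop claiming
 */
static int model_find_suitable_fallback(int nr_free, int claimable,
                                        int should_claim_block,
                                        int fallback_mt)
{
        if (claimable && !should_claim_block)
                return -2;
        if (nr_free == 0)
                return -1;
        return fallback_mt;
}

int main(void)
{
        /* claim path: stops once claiming a whole block makes no sense */
        printf("%d\n", model_find_suitable_fallback(4, 1, 0, 1));      /* -2 */
        /* steal path: only cares whether any fallback page exists */
        printf("%d\n", model_find_suitable_fallback(0, 0, 1, 1));      /* -1 */
        printf("%d\n", model_find_suitable_fallback(4, 0, 1, 1));      /*  1 */
        return 0;
}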
+ */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); @@ -4849,7 +4838,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto failed; } - cond_accept_memory(zone, 0); + cond_accept_memory(zone, 0, alloc_flags); retry_this_zone: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; if (zone_watermark_fast(zone, 0, mark, @@ -4858,7 +4847,7 @@ retry_this_zone: break; } - if (cond_accept_memory(zone, 0)) + if (cond_accept_memory(zone, 0, alloc_flags)) goto retry_this_zone; /* Try again if zone has deferred pages */ @@ -5065,11 +5054,13 @@ static void ___free_pages(struct page *page, unsigned int order, { /* get PageHead before we drop reference */ int head = PageHead(page); + /* get alloc tag in case the page is released by others */ + struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) __free_frozen_pages(page, order, fpi_flags); else if (!head) { - pgalloc_tag_sub_pages(page, (1 << order) - 1); + pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) __free_frozen_pages(page + (1 << order), order, fpi_flags); @@ -5083,7 +5074,7 @@ EXPORT_SYMBOL(__free_pages); /* * Can be called while holding raw_spin_lock or from IRQ and NMI for any - * page type (not only those that came from try_alloc_pages) + * page type (not only those that came from alloc_pages_nolock) */ void free_pages_nolock(struct page *page, unsigned int order) { @@ -7174,16 +7165,8 @@ bool has_managed_dma(void) #ifdef CONFIG_UNACCEPTED_MEMORY -/* Counts number of zones with unaccepted pages. */ -static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); - static bool lazy_accept = true; -void unaccepted_cleanup_work(struct work_struct *work) -{ - static_branch_dec(&zones_with_unaccepted_pages); -} - static int __init accept_memory_parse(char *p) { if (!strcmp(p, "lazy")) { @@ -7208,11 +7191,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) static void __accept_page(struct zone *zone, unsigned long *flags, struct page *page) { - bool last; - list_del(&page->lru); - last = list_empty(&zone->unaccepted_pages); - account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); __ClearPageUnaccepted(page); @@ -7221,28 +7200,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags, accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - - if (last) { - /* - * There are two corner cases: - * - * - If allocation occurs during the CPU bring up, - * static_branch_dec() cannot be used directly as - * it causes a deadlock on cpu_hotplug_lock. - * - * Instead, use schedule_work() to prevent deadlock. - * - * - If allocation occurs before workqueues are initialized, - * static_branch_dec() should be called directly. - * - * Workqueues are initialized before CPU bring up, so this - * will not conflict with the first scenario. 
- */ - if (system_wq) - schedule_work(&zone->unaccepted_cleanup); - else - unaccepted_cleanup_work(&zone->unaccepted_cleanup); - } } void accept_page(struct page *page) @@ -7279,20 +7236,17 @@ static bool try_to_accept_memory_one(struct zone *zone) return true; } -static inline bool has_unaccepted_memory(void) -{ - return static_branch_unlikely(&zones_with_unaccepted_pages); -} - -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { long to_accept, wmark; bool ret = false; - if (!has_unaccepted_memory()) + if (list_empty(&zone->unaccepted_pages)) return false; - if (list_empty(&zone->unaccepted_pages)) + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ + if (alloc_flags & ALLOC_TRYLOCK) return false; wmark = promo_wmark_pages(zone); @@ -7325,22 +7279,17 @@ static bool __free_unaccepted(struct page *page) { struct zone *zone = page_zone(page); unsigned long flags; - bool first = false; if (!lazy_accept) return false; spin_lock_irqsave(&zone->lock, flags); - first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); __SetPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); - if (first) - static_branch_inc(&zones_with_unaccepted_pages); - return true; } @@ -7351,7 +7300,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static bool cond_accept_memory(struct zone *zone, unsigned int order) +static bool cond_accept_memory(struct zone *zone, unsigned int order, + int alloc_flags) { return false; } @@ -7365,20 +7315,21 @@ static bool __free_unaccepted(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ /** - * try_alloc_pages - opportunistic reentrant allocation from any context + * alloc_pages_nolock - opportunistic reentrant allocation from any context * @nid: node to allocate from * @order: allocation order size * * Allocates pages of a given order from the given node. This is safe to * call from any context (from atomic, NMI, and also reentrant - * allocator -> tracepoint -> try_alloc_pages_noprof). + * allocator -> tracepoint -> alloc_pages_nolock_noprof). * Allocation is best effort and to be expected to fail easily so nobody should * rely on the success. Failures are not reported via warn_alloc(). * See always fail conditions below. * - * Return: allocated page or NULL on failure. + * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. + * It means ENOMEM. There is no reason to call it again and expect !NULL. */ -struct page *try_alloc_pages_noprof(int nid, unsigned int order) +struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) { /* * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. @@ -7387,7 +7338,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) * * These two are the conditions for gfpflags_allow_spinning() being true. * - * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason + * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason * to warn. Also warn would trigger printk() which is unsafe from * various contexts. We cannot use printk_deferred_enter() to mitigate, * since the running context is unknown. @@ -7397,7 +7348,7 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) * BPF use cases. 
* * Though __GFP_NOMEMALLOC is not checked in the code path below, - * specify it here to highlight that try_alloc_pages() + * specify it here to highlight that alloc_pages_nolock() * doesn't want to deplete reserves. */ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC @@ -7422,11 +7373,6 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) if (!pcp_allowed_order(order)) return NULL; -#ifdef CONFIG_UNACCEPTED_MEMORY - /* Bailout, since try_to_accept_memory_one() needs to take a lock */ - if (has_unaccepted_memory()) - return NULL; -#endif /* Bailout, since _deferred_grow_zone() needs to take a lock */ if (deferred_pages_enabled()) return NULL; diff --git a/mm/page_io.c b/mm/page_io.c index 4bce19df557b..f7716b6569fa 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -237,9 +237,8 @@ static void swap_zeromap_folio_clear(struct folio *folio) * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ -int swap_writepage(struct page *page, struct writeback_control *wbc) +int swap_writeout(struct folio *folio, struct writeback_control *wbc) { - struct folio *folio = page_folio(page); int ret; if (folio_free_swap(folio)) { diff --git a/mm/page_owner.c b/mm/page_owner.c index cc4a6916eec6..9928c9ac8c31 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -302,7 +302,7 @@ void __reset_page_owner(struct page *page, unsigned short order) /* * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false * to prevent issues in stack_depot_save(). - * This is similar to try_alloc_pages() gfp flags, but only used + * This is similar to alloc_pages_nolock() gfp flags, but only used * to signal stack_depot to avoid spin_locks. */ handle = save_stack(__GFP_NOWARN); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 68109ee93841..4eeca782b888 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -218,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd) WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); } -void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) +void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, + unsigned int nr) { + unsigned long stride = PMD_SIZE >> PAGE_SHIFT; + unsigned int i; + if (&init_mm == mm) return; page_table_check_pmd_flags(pmd); - __page_table_check_pmd_clear(mm, *pmdp); - if (pmd_user_accessible_page(pmd)) { - page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, - pmd_write(pmd)); - } + for (i = 0; i < nr; i++) + __page_table_check_pmd_clear(mm, *(pmdp + i)); + if (pmd_user_accessible_page(pmd)) + page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } -EXPORT_SYMBOL(__page_table_check_pmd_set); +EXPORT_SYMBOL(__page_table_check_pmds_set); -void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) +void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, + unsigned int nr) { + unsigned long stride = PUD_SIZE >> PAGE_SHIFT; + unsigned int i; + if (&init_mm == mm) return; - __page_table_check_pud_clear(mm, *pudp); - if (pud_user_accessible_page(pud)) { - page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, - pud_write(pud)); - } + for (i = 0; i < nr; i++) + __page_table_check_pud_clear(mm, *(pudp + i)); + if (pud_user_accessible_page(pud)) + page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } -EXPORT_SYMBOL(__page_table_check_pud_set); +EXPORT_SYMBOL(__page_table_check_puds_set); void 
__page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, diff --git a/mm/ptdump.c b/mm/ptdump.c index 106e1d66e9f9..9374f29cdc6f 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -18,7 +18,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk, { struct ptdump_state *st = walk->private; - st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0])); + st->note_page_pte(st, addr, kasan_early_shadow_pte[0]); walk->action = ACTION_CONTINUE; @@ -38,11 +38,11 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 0, pgd_val(val)); + if (st->effective_prot_pgd) + st->effective_prot_pgd(st, val); if (pgd_leaf(val)) { - st->note_page(st, addr, 0, pgd_val(val)); + st->note_page_pgd(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -61,11 +61,11 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 1, p4d_val(val)); + if (st->effective_prot_p4d) + st->effective_prot_p4d(st, val); if (p4d_leaf(val)) { - st->note_page(st, addr, 1, p4d_val(val)); + st->note_page_p4d(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -84,11 +84,11 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 2, pud_val(val)); + if (st->effective_prot_pud) + st->effective_prot_pud(st, val); if (pud_leaf(val)) { - st->note_page(st, addr, 2, pud_val(val)); + st->note_page_pud(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -106,10 +106,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif - if (st->effective_prot) - st->effective_prot(st, 3, pmd_val(val)); + if (st->effective_prot_pmd) + st->effective_prot_pmd(st, val); if (pmd_leaf(val)) { - st->note_page(st, addr, 3, pmd_val(val)); + st->note_page_pmd(st, addr, val); walk->action = ACTION_CONTINUE; } @@ -122,10 +122,10 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, struct ptdump_state *st = walk->private; pte_t val = ptep_get_lockless(pte); - if (st->effective_prot) - st->effective_prot(st, 4, pte_val(val)); + if (st->effective_prot_pte) + st->effective_prot_pte(st, val); - st->note_page(st, addr, 4, pte_val(val)); + st->note_page_pte(st, addr, val); return 0; } @@ -134,9 +134,31 @@ static int ptdump_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - - st->note_page(st, addr, depth, 0); - + pte_t pte_zero = {0}; + pmd_t pmd_zero = {0}; + pud_t pud_zero = {0}; + p4d_t p4d_zero = {0}; + pgd_t pgd_zero = {0}; + + switch (depth) { + case 4: + st->note_page_pte(st, addr, pte_zero); + break; + case 3: + st->note_page_pmd(st, addr, pmd_zero); + break; + case 2: + st->note_page_pud(st, addr, pud_zero); + break; + case 1: + st->note_page_p4d(st, addr, p4d_zero); + break; + case 0: + st->note_page_pgd(st, addr, pgd_zero); + break; + default: + break; + } return 0; } @@ -162,7 +184,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) mmap_write_unlock(mm); /* Flush out the last page */ - st->note_page(st, 0, -1, 0); + st->note_page_flush(st); } static int check_wx_show(struct seq_file *m, void *v) diff --git a/mm/readahead.c b/mm/readahead.c index 6a4e96b69702..20d36d6b055e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -690,9 +690,15 @@ 
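To illustrate the reworked ptdump interface, here is a hedged sketch of how a dumper might wire up the new per-level callbacks. The member names and argument types are inferred from the call sites in the hunks above (note_page_pte()/_pmd()/_pud()/_p4d()/_pgd(), effective_prot_*() and note_page_flush()); the demo_ names and what the callbacks do are illustrative only.

static void demo_note_pte(struct ptdump_state *st, unsigned long addr, pte_t pte)
{
	/* Typically records or prints the mapping at 'addr'. */
}

static void demo_note_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd)
{
	/* Same idea, one callback per page-table level. */
}

static void demo_note_flush(struct ptdump_state *st)
{
	/* Called once after the walk to flush the last pending range. */
}

static struct ptdump_state demo_st = {
	.note_page_pte		= demo_note_pte,
	.note_page_pmd		= demo_note_pmd,
	/* .note_page_pud/_p4d/_pgd and .effective_prot_* wired up likewise */
	.note_page_flush	= demo_note_flush,
};

/* The walk itself is unchanged: ptdump_walk_pgd(&demo_st, &init_mm, NULL); */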
EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { + struct file *file; + const struct inode *inode; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; - if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) + file = fd_file(f); + if (!(file->f_mode & FMODE_READ)) return -EBADF; /* @@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ - if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops || - (!S_ISREG(file_inode(fd_file(f))->i_mode) && - !S_ISBLK(file_inode(fd_file(f))->i_mode))) + if (!file->f_mapping) + return -EINVAL; + if (!file->f_mapping->a_ops) + return -EINVAL; + + inode = file_inode(file); + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) + return -EINVAL; + if (IS_ANON_FILE(inode)) return -EINVAL; return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); diff --git a/mm/rmap.c b/mm/rmap.c index 67bb273dfb80..fb63d9256f09 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -774,7 +774,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) * @vma: The VMA we need to know the address in. * * Calculates the user virtual address of this page in the specified VMA. - * It is the caller's responsibililty to check the page is actually + * It is the caller's responsibility to check the page is actually * within the VMA. There may not currently be a PTE pointing at this * page, but if a page fault occurs at this address, this is the page * which will be accessed. @@ -789,13 +789,13 @@ unsigned long page_address_in_vma(const struct folio *folio, const struct page *page, const struct vm_area_struct *vma) { if (folio_test_anon(folio)) { - struct anon_vma *page__anon_vma = folio_anon_vma(folio); + struct anon_vma *anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. */ - if (!vma->anon_vma || !page__anon_vma || - vma->anon_vma->root != page__anon_vma->root) + if (!vma->anon_vma || !anon_vma || + vma->anon_vma->root != anon_vma->root) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; @@ -803,7 +803,7 @@ unsigned long page_address_in_vma(const struct folio *folio, return -EFAULT; } - /* KSM folios don't reach here because of the !page__anon_vma check */ + /* KSM folios don't reach here because of the !anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } @@ -1944,7 +1944,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * restart so we can process the PTE-mapped THP. */ split_huge_pmd_locked(vma, pvmw.address, - pvmw.pmd, false, folio); + pvmw.pmd, false); flags &= ~TTU_SPLIT_HUGE_PMD; page_vma_mapped_walk_restart(&pvmw); continue; @@ -2292,13 +2292,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pvmw.flags = PVMW_SYNC; /* - * unmap_page() in mm/huge_memory.c is the only user of migration with - * TTU_SPLIT_HUGE_PMD and it wants to freeze. - */ - if (flags & TTU_SPLIT_HUGE_PMD) - split_huge_pmd_address(vma, address, true, folio); - - /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. 
@@ -2323,9 +2316,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { + if (flags & TTU_SPLIT_HUGE_PMD) { + split_huge_pmd_locked(vma, pvmw.address, + pvmw.pmd, true); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION subpage = folio_page(folio, pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || @@ -2337,8 +2337,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } continue; - } #endif + } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); diff --git a/mm/secretmem.c b/mm/secretmem.c index 1b0a214ee558..589b26c2d553 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -120,18 +120,18 @@ static int secretmem_release(struct inode *inode, struct file *file) return 0; } -static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) +static int secretmem_mmap_prepare(struct vm_area_desc *desc) { - unsigned long len = vma->vm_end - vma->vm_start; + const unsigned long len = desc->end - desc->start; - if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; - if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len)) return -EAGAIN; - vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); - vma->vm_ops = &secretmem_vm_ops; + desc->vm_flags |= VM_LOCKED | VM_DONTDUMP; + desc->vm_ops = &secretmem_vm_ops; return 0; } @@ -143,7 +143,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma) static const struct file_operations secretmem_fops = { .release = secretmem_release, - .mmap = secretmem_mmap, + .mmap_prepare = secretmem_mmap_prepare, }; static int secretmem_migrate_folio(struct address_space *mapping, diff --git a/mm/shmem.c b/mm/shmem.c index 99327c30507c..858cee02ca49 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ @@ -107,7 +107,7 @@ struct shmem_falloc { pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ - pgoff_t nr_unswapped; /* how often writepage refused to swap out */ + pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { @@ -446,7 +446,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), - * shmem_writepage() has to raise swapped before nrpages is lowered - + * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. 
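The secretmem conversion above is the pattern other drivers would follow when moving from ->mmap to ->mmap_prepare: the hook receives a struct vm_area_desc and only adjusts the fields that the core later copies back (pgoff, file, vm_flags, page_prot, vm_ops, private_data; see the call_mmap_prepare() hunk in mm/vma.c further down). A hedged sketch with hypothetical mydrv_ names:

static int mydrv_mmap_prepare(struct vm_area_desc *desc)
{
	const unsigned long len = desc->end - desc->start;

	if (len > MYDRV_MAX_MAP_SIZE)		/* hypothetical size limit */
		return -EINVAL;

	/* Request flags and ops up front; no vm_area_struct exists yet. */
	desc->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	desc->vm_ops = &mydrv_vm_ops;		/* hypothetical vm_operations_struct */
	desc->private_data = desc->file->private_data;
	return 0;
}

static const struct file_operations mydrv_fops = {
	.mmap_prepare	= mydrv_mmap_prepare,
	/* other methods unchanged */
};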
*/ @@ -1536,12 +1536,15 @@ int shmem_unuse(unsigned int type) return error; } -/* - * Move the page from the page cache to the swap cache. +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @wbc: How writeback is to be done + * + * Move the folio from the page cache to the swap cache. */ -static int shmem_writepage(struct page *page, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct writeback_control *wbc) { - struct folio *folio = page_folio(page); struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1550,13 +1553,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) int nr_pages; bool split = false; - /* - * Our capabilities prevent regular writeback or sync from ever calling - * shmem_writepage; but a stacking filesystem might use ->writepage of - * its underlying filesystem, in which case tmpfs should write out to - * swap only in response to memory pressure, and not for the writeback - * threads or sync. - */ if (WARN_ON_ONCE(!wbc->for_reclaim)) goto redirty; @@ -1586,9 +1582,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page_to_list_to_order(page, wbc->list, 0)) + if (split_folio_to_list(folio, wbc->list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } @@ -1646,7 +1641,7 @@ try_split: mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - return swap_writepage(&folio->page, wbc); + return swap_writeout(folio, wbc); } list_del_init(&info->swaplist); @@ -1660,6 +1655,7 @@ redirty: folio_unlock(folio); return 0; } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) @@ -3768,7 +3764,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, index--; /* - * Inform shmem_writepage() how far we have reached. + * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. 
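Since shmem_writeout() is now exported, a caller outside shmem would invoke it roughly as below. This is a hedged sketch: the only hard requirement visible in the hunk is wbc->for_reclaim, and the assumption that the folio is locked by the caller and the lock is consumed follows the usual ->writepage contract rather than anything shown here.

struct writeback_control wbc = {
	.sync_mode	= WB_SYNC_NONE,
	.nr_to_write	= SWAP_CLUSTER_MAX,
	.for_reclaim	= 1,	/* shmem_writeout() warns and redirties without this */
};
int err;

folio_lock(folio);			/* assumed caller contract */
err = shmem_writeout(folio, &wbc);	/* assumed: folio lock consumed, as with ->writepage */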
*/ if (!folio_test_uptodate(folio)) @@ -5191,7 +5187,6 @@ static int shmem_error_remove_folio(struct address_space *mapping, } static const struct address_space_operations shmem_aops = { - .writepage = shmem_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, diff --git a/mm/show_mem.c b/mm/show_mem.c index 6af13bcd2ab3..0cf8bf5d832d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - val->totalram = managed_pages; - val->sharedram = node_page_state(pgdat, NR_SHMEM); - val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); -#ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - + managed_pages += zone_managed_pages(zone); if (is_highmem(zone)) { managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } + + val->totalram = managed_pages; + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); val->totalhigh = managed_highpages; val->freehigh = free_highpages; -#else - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#endif val->mem_unit = PAGE_SIZE; } #endif @@ -223,7 +217,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_node_page_state(NR_SECONDARY_PAGETABLE), - global_zone_page_state(NR_BOUNCE), + 0UL, global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -311,6 +305,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " low:%lukB" " high:%lukB" " reserved_highatomic:%luKB" + " free_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" @@ -332,6 +327,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone->nr_reserved_highatomic), + K(zone->nr_free_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), @@ -341,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_BOUNCE)), + 0UL, K(free_pcp), K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); diff --git a/mm/slab_common.c b/mm/slab_common.c index 5be257e03c7c..bfe7c40eeee1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -2,7 +2,7 @@ /* * Slab allocator functions that are independent of the allocator strategy * - * (C) 2012 Christoph Lameter <cl@linux.com> + * (C) 2012 Christoph Lameter <cl@gentwo.org> */ #include <linux/slab.h> diff --git a/mm/slub.c b/mm/slub.c index dc9e729e1d26..be8b09e09d30 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2028,8 +2028,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, return 0; } -/* Should be called only if mem_alloc_profiling_enabled() */ -static noinline void free_slab_obj_exts(struct slab *slab) +static inline void free_slab_obj_exts(struct slab 
*slab) { struct slabobj_ext *obj_exts; @@ -2049,18 +2048,6 @@ static noinline void free_slab_obj_exts(struct slab *slab) slab->obj_exts = 0; } -static inline bool need_slab_obj_ext(void) -{ - if (mem_alloc_profiling_enabled()) - return true; - - /* - * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally - * inside memcg_slab_post_alloc_hook. No other users for now. - */ - return false; -} - #else /* CONFIG_SLAB_OBJ_EXT */ static inline void init_slab_obj_exts(struct slab *slab) @@ -2077,11 +2064,6 @@ static inline void free_slab_obj_exts(struct slab *slab) { } -static inline bool need_slab_obj_ext(void) -{ - return false; -} - #endif /* CONFIG_SLAB_OBJ_EXT */ #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -2129,7 +2111,7 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) static inline void alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { - if (need_slab_obj_ext()) + if (mem_alloc_profiling_enabled()) __alloc_tagging_slab_alloc_hook(s, object, flags); } @@ -2601,8 +2583,12 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_online() || need_slab_obj_ext()) - free_slab_obj_exts(slab); + /* + * The slab object extensions should now be freed regardless of + * whether mem_alloc_profiling_enabled() or not because profiling + * might have been disabled after slab->obj_exts got allocated. + */ + free_slab_obj_exts(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); diff --git a/mm/swap.c b/mm/swap.c index 77b2d5997873..4fc322f7111a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -309,7 +309,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio) trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP @@ -581,7 +581,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } @@ -599,7 +599,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) @@ -625,7 +625,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* diff --git a/mm/swap.h b/mm/swap.h index 6f4a3f927edb..2269eb9df0af 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -20,7 +20,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) __swap_read_unplug(plug); } void swap_write_unplug(struct swap_iocb *sio); -int swap_writepage(struct page *page, struct writeback_control *wbc); +int swap_writeout(struct folio *folio, struct writeback_control *wbc); void __swap_writepage(struct folio *folio, struct writeback_control *wbc); /* linux/mm/swap_state.c */ @@ -141,7 +141,7 @@ static inline struct 
folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } -static inline int swap_writepage(struct page *p, struct writeback_control *wbc) +static inline int swap_writeout(struct folio *f, struct writeback_control *wbc) { return 0; } @@ -201,4 +201,22 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, #endif /* CONFIG_SWAP */ +/** + * folio_index - File index of a folio. + * @folio: The folio. + * + * For a folio which is either in the page cache or the swap cache, + * return its index within the address_space it belongs to. If you know + * the folio is definitely in the page cache, you can look at the folio's + * index directly. + * + * Return: The index (offset in units of pages) of a folio in its file. + */ +static inline pgoff_t folio_index(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swap_cache_index(folio->swap); + return folio->index; +} + #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 68fd981b514f..c354435a0923 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -30,7 +30,6 @@ * vmscan's shrink_folio_list. */ static const struct address_space_operations swap_aops = { - .writepage = swap_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, @@ -232,13 +231,11 @@ void free_swap_cache(struct folio *folio) } /* - * Perform a free_page(), also freeing any swap cache associated with - * this page if it is the last user of the page. + * Freeing a folio and also freeing any swap cache associated with + * this folio if it is the last user. */ -void free_page_and_swap_cache(struct page *page) +void free_folio_and_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - free_swap_cache(folio); if (!is_huge_zero_folio(folio)) folio_put(folio); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2eff8b51a945..68ce283e84be 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -52,9 +52,9 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); @@ -192,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; unsigned char count = *map; - if (swap_count(count) != 1) + if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) return false; while (++map < map_end) { @@ -1272,13 +1272,22 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); - /* - * Should not even be attempting large allocations when huge - * page swap is disabled. Warn and fail the allocation. - */ - if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) { - VM_WARN_ON_ONCE(1); - return -EINVAL; + if (order) { + /* + * Reject large allocation when THP_SWAP is disabled, + * the caller should split the folio and try again. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP)) + return -EAGAIN; + + /* + * Allocation size should never exceed cluster size + * (HPAGE_PMD_SIZE). 
+ */ + if (size > SWAPFILE_CLUSTER) { + VM_WARN_ON_ONCE(1); + return -EINVAL; + } } local_lock(&percpu_swap_cluster.lock); @@ -1346,10 +1355,12 @@ out: return NULL; } -static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, - unsigned long offset, - unsigned char usage) +static unsigned char swap_entry_put_locked(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, + unsigned char usage) { + unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; @@ -1381,7 +1392,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, if (usage) WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); + swap_entries_free(si, ci, entry, 1); return usage; } @@ -1452,71 +1463,104 @@ put_out: return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *si, - swp_entry_t entry) +static void swap_entries_put_cache(struct swap_info_struct *si, + swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char usage; + struct swap_cluster_info *ci; ci = lock_cluster(si, offset); - usage = __swap_entry_free_locked(si, offset, 1); - if (!usage) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + if (swap_only_has_cache(si, offset, nr)) + swap_entries_free(si, ci, entry, nr); + else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } unlock_cluster(ci); - - return usage; } -static bool __swap_entries_free(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); - unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; int i; - if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + if (nr <= 1) goto fallback; - /* cross into another cluster */ - if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) + count = swap_count(data_race(si->swap_map[offset])); + if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster(ci); - goto fallback; + goto locked_fallback; } - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) - swap_entry_range_free(si, ci, entry, nr); + swap_entries_free(si, ci, entry, nr); + else + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); unlock_cluster(ci); return has_cache; fallback: - for (i = 0; i < nr; i++) { - if (data_race(si->swap_map[offset + i])) { - count = __swap_entry_free(si, swp_entry(type, offset + i)); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } else { - WARN_ON_ONCE(1); - } + ci = lock_cluster(si, offset); +locked_fallback: + for (i = 0; i < nr; i++, entry.val++) { + count = swap_entry_put_locked(si, ci, entry, 1); + if (count == SWAP_HAS_CACHE) + has_cache = true; } + unlock_cluster(ci); return has_cache; + } /* - * Drop the last HAS_CACHE flag of swap entries, caller have to - * ensure all entries belong to the same cgroup. + * Only functions with "_nr" suffix are able to free entries spanning + * cross multi clusters, so ensure the range is within a single cluster + * when freeing entries with functions without "_nr" suffix. 
*/ -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages) +static bool swap_entries_put_map_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + int cluster_nr, cluster_rest; + unsigned long offset = swp_offset(entry); + bool has_cache = false; + + cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; + while (nr) { + cluster_nr = min(nr, cluster_rest); + has_cache |= swap_entries_put_map(si, entry, cluster_nr); + cluster_rest = SWAPFILE_CLUSTER; + nr -= cluster_nr; + entry.val += cluster_nr; + } + + return has_cache; +} + +/* + * Check if it's the last ref of swap entry in the freeing path. + * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. + */ +static inline bool __maybe_unused swap_is_last_ref(unsigned char count) +{ + return (count == SWAP_HAS_CACHE) || (count == 1) || + (count == SWAP_MAP_SHMEM); +} + +/* + * Drop the last ref of swap entries, caller have to ensure all entries + * belong to the same cgroup and cluster. + */ +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; @@ -1529,7 +1573,7 @@ static void swap_entry_range_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(*map != SWAP_HAS_CACHE); + VM_BUG_ON(!swap_is_last_ref(*map)); *map = 0; } while (++map < map_end); @@ -1542,21 +1586,6 @@ static void swap_entry_range_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -static void cluster_swap_free_nr(struct swap_info_struct *si, - unsigned long offset, int nr_pages, - unsigned char usage) -{ - struct swap_cluster_info *ci; - unsigned long end = offset + nr_pages; - - ci = lock_cluster(si, offset); - do { - if (!__swap_entry_free_locked(si, offset, usage)) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); - } while (++offset < end); - unlock_cluster(ci); -} - /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. @@ -1573,7 +1602,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr, 1); + swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); offset += nr; nr_pages -= nr; } @@ -1584,8 +1613,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); @@ -1593,16 +1620,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, size)) - swap_entry_range_free(si, ci, entry, size); - else { - for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entry_range_free(si, ci, entry, 1); - } - } - unlock_cluster(ci); + swap_entries_put_cache(si, entry, size); } int __swap_count(swp_entry_t entry) @@ -1797,7 +1815,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. 
*/ - any_only_cache = __swap_entries_free(si, entry, nr); + any_only_cache = swap_entries_put_map_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1807,13 +1825,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) goto out; /* - * Now go back over the range trying to reclaim the swap cache. This is - * more efficient for large folios because we will only try to reclaim - * the swap once per folio in the common case. If we do - * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the - * latter will get a reference and lock the folio for every individual - * page but will only succeed once the swap slot for every subpage is - * zero. + * Now go back over the range trying to reclaim the swap cache. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; @@ -2359,7 +2371,7 @@ retry: * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; - * and even shmem_writepage() could have been preempted after + * and even shmem_writeout() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ @@ -3323,6 +3335,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* + * The swap subsystem needs a major overhaul to support this. + * It doesn't work yet so just disable it for now. + */ + if (mapping_min_folio_order(mapping) > 0) { + error = -EINVAL; + goto bad_swap_unlock_inode; + } + + /* * Read the swap header. */ if (!mapping->a_ops->read_folio) { @@ -3636,11 +3657,13 @@ int swapcache_prepare(swp_entry_t entry, int nr) return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } +/* + * Caller should ensure entries belong to the same folio so + * the entries won't span cross cluster boundary. + */ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - unsigned long offset = swp_offset(entry); - - cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); + swap_entries_put_cache(si, entry, nr); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) @@ -3649,21 +3672,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) } /* - * out-of-line methods to avoid include hell. - */ -struct address_space *swapcache_mapping(struct folio *folio) -{ - return swp_swap_info(folio->swap)->swap_file->f_mapping; -} -EXPORT_SYMBOL_GPL(swapcache_mapping); - -pgoff_t __folio_swap_cache_index(struct folio *folio) -{ - return swap_cache_index(folio->swap); -} -EXPORT_SYMBOL_GPL(__folio_swap_cache_index); - -/* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count @@ -3780,7 +3788,7 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of __swap_entry_free_locked() + * Called while __swap_duplicate() or caller of swap_entry_put_locked() * holds cluster lock. 
*/ static bool swap_count_continued(struct swap_info_struct *si, diff --git a/mm/truncate.c b/mm/truncate.c index 5d98054094d1..f2aaf99f2990 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -191,6 +191,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio) bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { loff_t pos = folio_pos(folio); + size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; @@ -198,14 +199,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) offset = start - pos; else offset = 0; - length = folio_size(folio); - if (pos + length <= (u64)end) - length = length - offset; + if (pos + size <= (u64)end) + length = size - offset; else length = end + 1 - pos - offset; folio_wait_writeback(folio); - if (length == folio_size(folio)) { + if (length == size) { truncate_inode_folio(folio->mapping, folio); return true; } @@ -224,16 +224,20 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) return true; split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); - split_at2 = folio_page(folio, - PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE); - if (!try_folio_split(folio, split_at, NULL)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste * for shmem truncate */ - struct folio *folio2 = page_folio(split_at2); + struct folio *folio2; + + if (offset + length == size) + goto no_split; + + split_at2 = folio_page(folio, + PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE); + folio2 = page_folio(split_at2); if (!folio_try_get(folio2)) goto no_split; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7d5d709cc838..bc473ad21202 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1063,9 +1063,14 @@ static int move_present_pte(struct mm_struct *mm, folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); - orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); - /* Follow mremap() behavior and treat the entry dirty after the move */ - orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); + /* Set soft dirty bit so userspace can notice the pte was moved */ +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); +#endif + if (pte_dirty(orig_src_pte)) + orig_dst_pte = pte_mkdirty(orig_dst_pte); + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); out: @@ -1100,6 +1105,9 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); +#endif set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); @@ -17,9 +17,13 @@ struct mmap_state { unsigned long pglen; unsigned long flags; struct file *file; + pgprot_t page_prot; + + /* User-defined fields, perhaps updated by .mmap_prepare(). 
*/ + const struct vm_operations_struct *vm_ops; + void *vm_private_data; unsigned long charged; - bool retry_merge; struct vm_area_struct *prev; struct vm_area_struct *next; @@ -40,6 +44,7 @@ struct mmap_state { .pglen = PHYS_PFN(len_), \ .flags = flags_, \ .file = file_, \ + .page_prot = vm_get_page_prot(flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -57,6 +62,22 @@ struct mmap_state { .state = VMA_MERGE_START, \ } +/* + * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain + * more than one anon_vma_chain connecting it to more than one anon_vma. A merge + * would mean a wider range of folios sharing the root anon_vma lock, and thus + * potential lock contention, we do not wish to encourage merging such that this + * scales to a problem. + */ +static bool vma_had_uncowed_parents(struct vm_area_struct *vma) +{ + /* + * The list_is_singular() test is to avoid merging VMA cloned from + * parents. This can improve scalability caused by anon_vma lock. + */ + return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); +} + static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; @@ -82,24 +103,28 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex return true; } -static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2, struct vm_area_struct *vma) +static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { + struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; + struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */ + struct anon_vma *tgt_anon = tgt->anon_vma; + struct anon_vma *src_anon = vmg->anon_vma; + /* - * The list_is_singular() test is to avoid merging VMA cloned from - * parents. This can improve scalability caused by anon_vma lock. + * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we + * will remove the existing VMA's anon_vma's so there's no scalability + * concerns. */ - if ((!anon_vma1 || !anon_vma2) && (!vma || - list_is_singular(&vma->anon_vma_chain))) - return true; - return anon_vma1 == anon_vma2; -} + VM_WARN_ON(src && src_anon != src->anon_vma); -/* Are the anon_vma's belonging to each VMA compatible with one another? */ -static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, - struct vm_area_struct *vma2) -{ - return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); + /* Case 1 - we will dup_anon_vma() from src into tgt. */ + if (!tgt_anon && src_anon) + return !vma_had_uncowed_parents(src); + /* Case 2 - we will simply use tgt's anon_vma. */ + if (tgt_anon && !src_anon) + return !vma_had_uncowed_parents(tgt); + /* Case 3 - the anon_vma's are already shared. 
*/ + return src_anon == tgt_anon; } /* @@ -164,7 +189,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && - is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { + is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } @@ -184,7 +209,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && - is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { + is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } @@ -400,8 +425,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg) static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { - if (!vmg->next || vmg->end != vmg->next->vm_start || - !can_vma_merge_before(vmg)) + struct vm_area_struct *next = vmg->next; + struct vm_area_struct *prev; + + if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) @@ -414,7 +441,9 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, * * We therefore check this in addition to mergeability to either side. */ - return are_anon_vmas_compatible(vmg->prev, vmg->next); + prev = vmg->prev; + return !prev->anon_vma || !next->anon_vma || + prev->anon_vma == next->anon_vma; } /* @@ -554,7 +583,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, } /* - * dup_anon_vma() - Helper function to duplicate anon_vma + * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the + * instance that the destination VMA has no anon_vma but the source does. + * * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. @@ -565,9 +596,18 @@ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* - * Easily overlooked: when mprotect shifts the boundary, make sure the - * expanding vma has anon_vma set if the shrinking vma had, to cover any - * anon pages imported. + * There are three cases to consider for correctly propagating + * anon_vma's on merge. + * + * The first is trivial - neither VMA has anon_vma, we need not do + * anything. + * + * The second where both have anon_vma is also a no-op, as they must + * then be the same, so there is simply nothing to copy. + * + * Here we cover the third - if the destination VMA has no anon_vma, + * that is it is unfaulted, we need to ensure that the newly merged + * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; @@ -1834,6 +1874,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_vma_link: + fixup_hugetlb_reservations(new_vma); vma_close(new_vma); if (new_vma->vm_file) @@ -2350,6 +2391,10 @@ static int __mmap_new_file_vma(struct mmap_state *map, int error; vma->vm_file = get_file(map->file); + + if (!map->file->f_op->mmap) + return 0; + error = mmap_file(vma->vm_file, vma); if (error) { fput(vma->vm_file); @@ -2372,8 +2417,6 @@ static int __mmap_new_file_vma(struct mmap_state *map, !(map->flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); - /* If the flags change (and are mergeable), let's retry later. 
*/ - map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL); map->flags = vma->vm_flags; return 0; @@ -2406,7 +2449,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); vm_flags_init(vma, map->flags); - vma->vm_page_prot = vm_get_page_prot(map->flags); + vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; @@ -2493,6 +2536,56 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } +/* + * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that + * specifies it. + * + * This is called prior to any merge attempt, and updates whitelisted fields + * that are permitted to be updated by the caller. + * + * All but user-defined fields will be pre-populated with original values. + * + * Returns 0 on success, or an error code otherwise. + */ +static int call_mmap_prepare(struct mmap_state *map) +{ + int err; + struct vm_area_desc desc = { + .mm = map->mm, + .start = map->addr, + .end = map->end, + + .pgoff = map->pgoff, + .file = map->file, + .vm_flags = map->flags, + .page_prot = map->page_prot, + }; + + /* Invoke the hook. */ + err = __call_mmap_prepare(map->file, &desc); + if (err) + return err; + + /* Update fields permitted to be changed. */ + map->pgoff = desc.pgoff; + map->file = desc.file; + map->flags = desc.vm_flags; + map->page_prot = desc.page_prot; + /* User-defined fields. */ + map->vm_ops = desc.vm_ops; + map->vm_private_data = desc.private_data; + + return 0; +} + +static void set_vma_user_defined_fields(struct vm_area_struct *vma, + struct mmap_state *map) +{ + if (map->vm_ops) + vma->vm_ops = map->vm_ops; + vma->vm_private_data = map->vm_private_data; +} + static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) @@ -2500,10 +2593,13 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; int error; + bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); error = __mmap_prepare(&map, uf); + if (!error && have_mmap_prepare) + error = call_mmap_prepare(&map); if (error) goto abort_munmap; @@ -2521,16 +2617,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, goto unacct_error; } - /* If flags changed, we might be able to merge, so try again. */ - if (map.retry_merge) { - struct vm_area_struct *merged; - VMG_MMAP_STATE(vmg, &map, vma); - - vma_iter_config(map.vmi, map.addr, map.end); - merged = vma_merge_existing_range(&vmg); - if (merged) - vma = merged; - } + if (have_mmap_prepare) + set_vma_user_defined_fields(vma, &map); __mmap_complete(&map, vma); @@ -3017,3 +3105,46 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock) userfaultfd_unmap_complete(mm, &uf); return ret; } + + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap tree. If vm_file is non-NULL + * then i_mmap_rwsem is taken here. 
+ */ +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) +{ + unsigned long charged = vma_pages(vma); + + + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) + return -ENOMEM; + + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index + * are set. But now set the vm_pgoff it will almost certainly + * end up with (unless mremap moves it elsewhere before that + * first wfault), so /proc/pid/maps tells a consistent story. + * + * By setting it to reflect the virtual start address of the + * vma, merges and splits can happen in a seamless way, just + * using the existing file pgoff checks and manipulations. + * Similarly in do_mmap and in do_brk_flags. + */ + if (vma_is_anonymous(vma)) { + BUG_ON(vma->anon_vma); + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + } + + if (vma_link(mm, vma)) { + if (vma->vm_flags & VM_ACCOUNT) + vm_unacct_memory(charged); + return -ENOMEM; + } + + return 0; +} @@ -548,4 +548,19 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address); int __vm_munmap(unsigned long start, size_t len, bool unlock); +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); + +/* vma_init.h, shared between CONFIG_MMU and nommu. */ +void __init vma_state_init(void); +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); +void vm_area_free(struct vm_area_struct *vma); + +/* vma_exec.c */ +#ifdef CONFIG_MMU +int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, + unsigned long *top_mem_p); +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +#endif + #endif /* __MM_VMA_H */ diff --git a/mm/vma_exec.c b/mm/vma_exec.c new file mode 100644 index 000000000000..2dffb02ed6a2 --- /dev/null +++ b/mm/vma_exec.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Functions explicitly implemented for exec functionality which however are + * explicitly VMA-only logic. + */ + +#include "vma_internal.h" +#include "vma.h" + +/* + * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between + * this VMA and its relocated range, which will now reside at [vma->vm_start - + * shift, vma->vm_end - shift). + * + * This function is almost certainly NOT what you want for anything other than + * early executable temporary stack relocation. + */ +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) +{ + /* + * The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. 
+ */ + + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + struct vm_area_struct *next; + struct mmu_gather tlb; + PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != vma_next(&vmi)) + return -EFAULT; + + vma_iter_prev_range(&vmi); + /* + * cover the whole range: [new_start, old_end) + */ + vmg.middle = vma; + if (vma_expand(&vmg)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + pmc.for_stack = true; + if (length != move_page_tables(&pmc)) + return -ENOMEM; + + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } + tlb_finish_mmu(&tlb); + + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); +} + +/* + * Establish the stack VMA in an execve'd process, located temporarily at the + * maximum stack address provided by the architecture. + * + * We later relocate this downwards in relocate_vma_down(). + * + * This function is almost certainly NOT what you want for anything other than + * early executable initialisation. + * + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the + * maximum addressable location in the stack (that is capable of storing a + * system word of data). + */ +int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, + unsigned long *top_mem_p) +{ + int err; + struct vm_area_struct *vma = vm_area_alloc(mm); + + if (!vma) + return -ENOMEM; + + vma_set_anonymous(vma); + + if (mmap_write_lock_killable(mm)) { + err = -EINTR; + goto err_free; + } + + /* + * Need to be called with mmap write lock + * held, to avoid race with ksmd. + */ + err = ksm_execve(mm); + if (err) + goto err_ksm; + + /* + * Place the stack at the largest stack address the architecture + * supports. Later, we'll move this to an appropriate place. We don't + * use STACK_TOP because that can depend on attributes which aren't + * configured yet. 
+ */ + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + + err = insert_vm_struct(mm, vma); + if (err) + goto err; + + mm->stack_vm = mm->total_vm = 1; + mmap_write_unlock(mm); + *vmap = vma; + *top_mem_p = vma->vm_end - sizeof(void *); + return 0; + +err: + ksm_exit(mm); +err_ksm: + mmap_write_unlock(mm); +err_free: + *vmap = NULL; + vm_area_free(vma); + return err; +} diff --git a/mm/vma_init.c b/mm/vma_init.c new file mode 100644 index 000000000000..8e53c7943561 --- /dev/null +++ b/mm/vma_init.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. + */ + +#include "vma_internal.h" +#include "vma.h" + +/* SLAB cache for vm_area_struct structures */ +static struct kmem_cache *vm_area_cachep; + +void __init vma_state_init(void) +{ + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| + SLAB_ACCOUNT); +} + +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + return NULL; + + vma_init(vma, mm); + + return vma; +} + +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +#ifdef __HAVE_PFNMAP_TRACKING + dest->pfnmap_track_ctx = NULL; +#endif +} + +#ifdef __HAVE_PFNMAP_TRACKING +static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, + struct vm_area_struct *new) +{ + struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx; + + if (likely(!ctx)) + return 0; + + /* + * We don't expect to ever hit this. If ever required, we would have + * to duplicate the tracking. 
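The intended call pattern for the two vma_exec.c helpers, as described by their comments, is roughly the following sketch. Here 'mm' is the new image's mm being set up and 'shift' is the old stack start minus its final start; both are placeholders.

struct vm_area_struct *stack_vma;
unsigned long stack_top;	/* highest word-sized slot in the stack */
int err;

err = create_init_stack_vma(mm, &stack_vma, &stack_top);
if (err)
	return err;

/* ... populate the temporary stack downwards from stack_top ... */

/* Once the final stack address is known, move the VMA down in one go. */
err = relocate_vma_down(stack_vma, shift);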
+ */ + if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX)) + return -ENOMEM; + kref_get(&ctx->kref); + new->pfnmap_track_ctx = ctx; + return 0; +} + +static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) +{ + struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx; + + if (likely(!ctx)) + return; + + kref_put(&ctx->kref, pfnmap_track_ctx_release); + vma->pfnmap_track_ctx = NULL; +} +#else +static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, + struct vm_area_struct *new) +{ + return 0; +} +static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) +{ +} +#endif + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + + if (!new) + return NULL; + + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + vm_area_init_from(orig, new); + + if (vma_pfnmap_track_ctx_dup(orig, new)) { + kmem_cache_free(vm_area_cachep, new); + return NULL; + } + vma_lock_init(new, true); + INIT_LIST_HEAD(&new->anon_vma_chain); + vma_numab_state_init(new); + dup_anon_vma_name(orig, new); + + return new; +} + +void vm_area_free(struct vm_area_struct *vma) +{ + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); + vma_numab_state_free(vma); + free_anon_vma_name(vma); + vma_pfnmap_track_ctx_release(vma); + kmem_cache_free(vm_area_cachep, vma); +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3ed720a787ec..ab986dd09b6a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { @@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; + pte_t ptent; + unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); + arch_enter_lazy_mmu_mode(); + do { - pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); +#ifdef CONFIG_HUGETLB_PAGE + size = arch_vmap_pte_range_unmap_size(addr, pte); + if (size != PAGE_SIZE) { + if (WARN_ON(!IS_ALIGNED(addr, size))) { + addr = ALIGN_DOWN(addr, size); + pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT)); + } + ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size); + if (WARN_ON(end - addr < size)) + size = end - addr; + } else +#endif + ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; } @@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; - if (cleared) + if (cleared) { + WARN_ON(next - addr < PMD_SIZE); continue; + } if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); @@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (cleared || 
pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; - if (cleared) + if (cleared) { + WARN_ON(next - addr < PUD_SIZE); continue; + } if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); @@ -497,6 +524,9 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; + + arch_enter_lazy_mmu_mode(); + do { struct page *page = pages[*nr]; @@ -510,6 +540,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -900,6 +932,11 @@ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; +/* A simple iterator over all vmap-nodes. */ +#define for_each_vmap_node(vn) \ + for ((vn) = &vmap_nodes[0]; \ + (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++) + static inline unsigned int addr_to_node_id(unsigned long addr) { @@ -918,6 +955,19 @@ id_to_node(unsigned int id) return &vmap_nodes[id % nr_vmap_nodes]; } +static inline unsigned int +node_to_id(struct vmap_node *node) +{ + /* Pointer arithmetic. */ + unsigned int id = node - vmap_nodes; + + if (likely(id < nr_vmap_nodes)) + return id; + + WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node); + return 0; +} + /* * We use the value 0 to represent "no node", that is why * an encoded value will be the node-id incremented by 1. @@ -990,7 +1040,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); -static atomic_long_t nr_vmalloc_pages; +static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages; +static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr; unsigned long vmalloc_nr_pages(void) { @@ -1056,12 +1107,11 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { unsigned long va_start_lowest; struct vmap_node *vn; - int i; repeat: - for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + va_start_lowest = 0; + for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); @@ -1698,7 +1748,7 @@ va_clip(struct rb_root *root, struct list_head *head, */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) - return -1; + return -ENOMEM; } /* @@ -1712,7 +1762,7 @@ va_clip(struct rb_root *root, struct list_head *head, */ va->va_start = nva_start_addr + size; } else { - return -1; + return -EINVAL; } if (type != FL_FIT_TYPE) { @@ -1741,19 +1791,19 @@ va_alloc(struct vmap_area *va, /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) - return vend; + return -ERANGE; /* Update the free vmap_area. */ ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) - return vend; + return ret; return nva_start_addr; } /* * Returns a start address of the newly allocated area, if success. - * Otherwise a vend is returned that indicates failure. + * Otherwise an error value is returned that indicates failure. 
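 *
 * A minimal illustration (not a line of this patch) of the new convention,
 * using names already present in this file: callers test the result with
 * IS_ERR_VALUE() instead of comparing against vend, since the possible
 * failures are now -ENOENT, -ENOMEM, -ERANGE or -EINVAL:
 *
 *     addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
 *                              size, align, vstart, vend);
 *     if (IS_ERR_VALUE(addr))
 *             goto overflow;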
*/ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, @@ -1778,14 +1828,13 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) - return vend; + return -ENOENT; nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); - if (nva_start_addr == vend) - return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(root, head, size, align); + if (!IS_ERR_VALUE(nva_start_addr)) + find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; @@ -1915,7 +1964,7 @@ node_alloc(unsigned long size, unsigned long align, struct vmap_area *va; *vn_id = 0; - *addr = vend; + *addr = -EINVAL; /* * Fallback to a global heap if not vmalloc or there @@ -1940,7 +1989,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm, { vm->flags = flags; vm->addr = (void *)va->va_start; - vm->size = va_size(va); + vm->size = vm->requested_size = va_size(va); vm->caller = caller; va->vm = vm; } @@ -1995,20 +2044,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, } retry: - if (addr == vend) { + if (IS_ERR_VALUE(addr)) { preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); } - trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); + trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); /* - * If an allocation fails, the "vend" address is + * If an allocation fails, the error value is * returned. Therefore trigger the overflow path. */ - if (unlikely(addr == vend)) + if (IS_ERR_VALUE(addr)) goto overflow; va->va_start = addr; @@ -2100,8 +2149,6 @@ static unsigned long lazy_max_pages(void) return log * (32UL * 1024 * 1024 / PAGE_SIZE); } -static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); - /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance @@ -2111,7 +2158,6 @@ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); -static cpumask_t purge_nodes; static void reclaim_list_global(struct list_head *head) @@ -2134,7 +2180,7 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) LIST_HEAD(decay_list); struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; - unsigned long n_decay; + unsigned long n_decay, pool_len; int i; for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { @@ -2148,22 +2194,20 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) list_replace_init(&vn->pool[i].head, &tmp_list); spin_unlock(&vn->pool_lock); - if (full_decay) - WRITE_ONCE(vn->pool[i].len, 0); + pool_len = n_decay = vn->pool[i].len; + WRITE_ONCE(vn->pool[i].len, 0); /* Decay a pool by ~25% out of left objects. */ - n_decay = vn->pool[i].len >> 2; + if (!full_decay) + n_decay >>= 2; + pool_len -= n_decay; list_for_each_entry_safe(va, nva, &tmp_list, list) { + if (!n_decay--) + break; + list_del_init(&va->list); merge_or_add_vmap_area(va, &decay_root, &decay_list); - - if (!full_decay) { - WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); - - if (!--n_decay) - break; - } } /* @@ -2172,9 +2216,10 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) * can populate the pool therefore a simple list replace * operation takes place here. 
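 *
 * A worked example of the decay above, assuming a pool holding 64 objects:
 * for a partial decay n_decay is 64 >> 2 = 16, so 16 areas are merged back
 * into the decay tree and the remaining 48 are re-attached here with the
 * pool length restored to 48 under pool_lock; for a full decay all 64
 * areas are drained and the length stays at the 0 written earlier.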
*/ - if (!full_decay && !list_empty(&tmp_list)) { + if (!list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); + WRITE_ONCE(vn->pool[i].len, pool_len); spin_unlock(&vn->pool_lock); } } @@ -2244,6 +2289,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, { unsigned long nr_purged_areas = 0; unsigned int nr_purge_helpers; + static cpumask_t purge_nodes; unsigned int nr_purge_nodes; struct vmap_node *vn; int i; @@ -2255,9 +2301,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, */ purge_nodes = CPU_MASK_NONE; - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; - + for_each_vmap_node(vn) { INIT_LIST_HEAD(&vn->purge_list); vn->skip_populate = full_pool_decay; decay_va_pool_node(vn, full_pool_decay); @@ -2276,7 +2320,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, end = max(end, list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end); - cpumask_set_cpu(i, &purge_nodes); + cpumask_set_cpu(node_to_id(vn), &purge_nodes); } nr_purge_nodes = cpumask_weight(&purge_nodes); @@ -2355,7 +2399,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) if (WARN_ON_ONCE(!list_empty(&va->list))) return; - nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, + nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT, &vmap_lazy_nr); /* @@ -2421,7 +2465,7 @@ struct vmap_area *find_vmap_area(unsigned long addr) if (va) return va; - } while ((i = (i + 1) % nr_vmap_nodes) != j); + } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); return NULL; } @@ -2447,7 +2491,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) if (va) return va; - } while ((i = (i + 1) % nr_vmap_nodes) != j); + } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); return NULL; } @@ -2916,10 +2960,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) */ void vm_unmap_aliases(void) { - unsigned long start = ULONG_MAX, end = 0; - int flush = 0; - - _vm_unmap_aliases(start, end, flush); + _vm_unmap_aliases(ULONG_MAX, 0, 0); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); @@ -3100,7 +3141,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. - * Pair with smp_rmb() in show_numa_info(). + * Pair with smp_rmb() in vread_iter() and vmalloc_info_show(). */ smp_wmb(); vm->flags &= ~VM_UNINITIALIZED; @@ -3133,6 +3174,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, area->flags = flags; area->caller = caller; + area->requested_size = requested_size; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { @@ -3370,12 +3412,13 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); + /* All pages of vm should be charged to same memcg, so use first one. 
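 *
 * Sketch of the equivalence (illustrative, not patch text): because every
 * page of the area was accounted to the same memcg, the old per-page loop
 *
 *     for (i = 0; i < vm->nr_pages; i++)
 *             mod_memcg_page_state(vm->pages[i], MEMCG_VMALLOC, -1);
 *
 * collapses into one batched update keyed off the first page:
 *
 *     mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);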
*/ + if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) + mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); - if (!(vm->flags & VM_MAP_PUT_PAGES)) - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations @@ -3671,12 +3714,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (gfp_mask & __GFP_ACCOUNT) { - int i; - - for (i = 0; i < area->nr_pages; i++) - mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); - } + /* All pages of vm should be charged to same memcg, so use first one. */ + if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) + mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, + area->nr_pages); /* * If not enough pages were obtained to accomplish an @@ -3943,9 +3984,10 @@ void *vmalloc_noprof(unsigned long size) EXPORT_SYMBOL(vmalloc_noprof); /** - * vmalloc_huge - allocate virtually contiguous memory, allow huge pages + * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator + * @node: node to use for allocation or NUMA_NO_NODE * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. @@ -3954,13 +3996,13 @@ EXPORT_SYMBOL(vmalloc_noprof); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) { return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - NUMA_NO_NODE, __builtin_return_address(0)); + gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); } -EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); +EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill @@ -4063,6 +4105,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); */ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) { + struct vm_struct *vm = NULL; + size_t alloced_size = 0; size_t old_size = 0; void *n; @@ -4072,15 +4116,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) } if (p) { - struct vm_struct *vm; - vm = find_vm_area(p); if (unlikely(!vm)) { WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); return NULL; } - old_size = get_vm_area_size(vm); + alloced_size = get_vm_area_size(vm); + old_size = vm->requested_size; + if (WARN(alloced_size < old_size, + "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) + return NULL; } /* @@ -4088,11 +4134,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) * would be a good heuristic for when to shrink the vm_area? */ if (size <= old_size) { - /* Zero out spare memory. */ - if (want_init_on_alloc(flags)) + /* Zero out "freed" memory, potentially for future realloc. */ + if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); + vm->requested_size = size; kasan_poison_vmalloc(p + size, old_size - size); - kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL); + return (void *)p; + } + + /* + * We already have the bytes available in the allocation; use them. 
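 *
 * Usage sketch (illustrative, assuming 4K pages and the usual
 * allocate-copy-free fallback after the fast paths in this function):
 *
 *     p = vmalloc(100);                  requested_size 100, one page backs it
 *     p = vrealloc(p, 200, GFP_KERNEL);  fits in the page: only the new tail
 *                                        is unpoisoned, p does not move
 *     p = vrealloc(p, 8192, GFP_KERNEL); larger than the area: a fresh
 *                                        allocation is made and data copied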
+ */ + if (size <= alloced_size) { + kasan_unpoison_vmalloc(p + old_size, size - old_size, + KASAN_VMALLOC_PROT_NORMAL); + /* + * No need to zero memory here, as unused memory will have + * already been zeroed at initial allocation time or during + * realloc shrink time. + */ + vm->requested_size = size; return (void *)p; } @@ -4914,39 +4975,37 @@ bool vmalloc_dump_obj(void *object) #endif #ifdef CONFIG_PROC_FS -static void show_numa_info(struct seq_file *m, struct vm_struct *v) -{ - if (IS_ENABLED(CONFIG_NUMA)) { - unsigned int nr, *counters = m->private; - unsigned int step = 1U << vm_area_page_order(v); - if (!counters) - return; +/* + * Print number of pages allocated on each memory node. + * + * This function can only be called if CONFIG_NUMA is enabled + * and VM_UNINITIALIZED bit in v->flags is disabled. + */ +static void show_numa_info(struct seq_file *m, struct vm_struct *v, + unsigned int *counters) +{ + unsigned int nr; + unsigned int step = 1U << vm_area_page_order(v); - if (v->flags & VM_UNINITIALIZED) - return; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); + if (!counters) + return; - memset(counters, 0, nr_node_ids * sizeof(unsigned int)); + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); - for (nr = 0; nr < v->nr_pages; nr += step) - counters[page_to_nid(v->pages[nr])] += step; - for_each_node_state(nr, N_HIGH_MEMORY) - if (counters[nr]) - seq_printf(m, " N%u=%u", nr, counters[nr]); - } + for (nr = 0; nr < v->nr_pages; nr += step) + counters[page_to_nid(v->pages[nr])] += step; + for_each_node_state(nr, N_HIGH_MEMORY) + if (counters[nr]) + seq_printf(m, " N%u=%u", nr, counters[nr]); } static void show_purge_info(struct seq_file *m) { struct vmap_node *vn; struct vmap_area *va; - int i; - - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + for_each_vmap_node(vn) { spin_lock(&vn->lazy.lock); list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", @@ -4962,11 +5021,12 @@ static int vmalloc_info_show(struct seq_file *m, void *p) struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; - int i; + unsigned int *counters; - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + if (IS_ENABLED(CONFIG_NUMA)) + counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + for_each_vmap_node(vn) { spin_lock(&vn->busy.lock); list_for_each_entry(va, &vn->busy.head, list) { if (!va->vm) { @@ -4979,6 +5039,11 @@ static int vmalloc_info_show(struct seq_file *m, void *p) } v = va->vm; + if (v->flags & VM_UNINITIALIZED) + continue; + + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); @@ -5013,7 +5078,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p) if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); - show_numa_info(m, v); + if (IS_ENABLED(CONFIG_NUMA)) + show_numa_info(m, v, counters); + seq_putc(m, '\n'); } spin_unlock(&vn->busy.lock); @@ -5023,19 +5090,14 @@ static int vmalloc_info_show(struct seq_file *m, void *p) * As a final step, dump "unpurged" areas. 
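 *
 * Ordering note for the VM_UNINITIALIZED check in the loop above (an
 * illustrative sketch): the writer publishes the fields first,
 *
 *     setup_vmalloc_vm(vm, ...);  then  smp_wmb();  then  clear the flag
 *
 * so once the reader here sees the flag cleared and issues smp_rmb(), the
 * v->addr, v->size and v->nr_pages values printed below are stable.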
*/ show_purge_info(m); + if (IS_ENABLED(CONFIG_NUMA)) + kfree(counters); return 0; } static int __init proc_vmalloc_init(void) { - void *priv_data = NULL; - - if (IS_ENABLED(CONFIG_NUMA)) - priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); - - proc_create_single_data("vmallocinfo", - 0400, NULL, vmalloc_info_show, priv_data); - + proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show); return 0; } module_init(proc_vmalloc_init); @@ -5087,7 +5149,7 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i, n; + int i; #if BITS_PER_LONG == 64 /* @@ -5104,7 +5166,7 @@ static void vmap_init_nodes(void) * set of cores. Therefore a per-domain purging is supposed to * be added as well as a per-domain balancing. */ - n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); + int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); @@ -5119,8 +5181,7 @@ static void vmap_init_nodes(void) } #endif - for (n = 0; n < nr_vmap_nodes; n++) { - vn = &vmap_nodes[n]; + for_each_vmap_node(vn) { vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); @@ -5141,15 +5202,13 @@ static void vmap_init_nodes(void) static unsigned long vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - unsigned long count; + unsigned long count = 0; struct vmap_node *vn; - int i, j; - - for (count = 0, i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; + int i; - for (j = 0; j < MAX_VA_SIZE_PAGES; j++) - count += READ_ONCE(vn->pool[j].len); + for_each_vmap_node(vn) { + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) + count += READ_ONCE(vn->pool[i].len); } return count ? count : SHRINK_EMPTY; @@ -5158,10 +5217,10 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) static unsigned long vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - int i; + struct vmap_node *vn; - for (i = 0; i < nr_vmap_nodes; i++) - decay_va_pool_node(&vmap_nodes[i], true); + for_each_vmap_node(vn) + decay_va_pool_node(vn, true); return SHRINK_STOP; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3783e45bfc92..f2858cc92c2c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -342,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc) } } -static bool can_demote(int nid, struct scan_control *sc) +static bool can_demote(int nid, struct scan_control *sc, + struct mem_cgroup *memcg) { + int demotion_nid; + if (!numa_demotion_enabled) return false; if (sc && sc->no_demotion) return false; - if (next_demotion_node(nid) == NUMA_NO_NODE) + + demotion_nid = next_demotion_node(nid); + if (demotion_nid == NUMA_NO_NODE) return false; - return true; + /* If demotion node isn't in the cgroup's mems_allowed, fall back */ + return mem_cgroup_node_allowed(memcg, demotion_nid); } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, @@ -376,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, * * Can it be reclaimed from this node via demotion? */ - return can_demote(nid, sc); + return can_demote(nid, sc, memcg); } /* @@ -648,21 +654,20 @@ typedef enum { /* * pageout is called by shrink_folio_list() for each dirty folio. - * Calls ->writepage(). 
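 *
 * The ->writepage() call is gone; a summary sketch of the new dispatch
 * (mirroring the function body below):
 *
 *     if (shmem_mapping(mapping))
 *             writeout = shmem_writeout;
 *     else if (folio_test_anon(folio))
 *             writeout = swap_writeout;
 *     else
 *             return PAGE_ACTIVATE;      left to the flusher threads
 *     ...
 *     res = writeout(folio, &wbc);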
*/ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) { + int (*writeout)(struct folio *, struct writeback_control *); + /* - * If the folio is dirty, only perform writeback if that write - * will be non-blocking. To prevent this allocation from being - * stalled by pagecache activity. But note that there may be - * stalls if we need to run get_block(). We could test - * PagePrivate for that. - * - * If this process is currently in __generic_file_write_iter() against - * this folio's queue, we can perform writeback even if that - * will block. + * We no longer attempt to writeback filesystem folios here, other + * than tmpfs/shmem. That's taken care of in page-writeback. + * If we find a dirty filesystem folio at the end of the LRU list, + * typically that means the filesystem is saturating the storage + * with contiguous writes and telling it to write a folio here + * would only make the situation worse by injecting an element + * of random access. * * If the folio is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because @@ -685,7 +690,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } return PAGE_KEEP; } - if (mapping->a_ops->writepage == NULL) + if (shmem_mapping(mapping)) + writeout = shmem_writeout; + else if (folio_test_anon(folio)) + writeout = swap_writeout; + else return PAGE_ACTIVATE; if (folio_clear_dirty_for_io(folio)) { @@ -708,7 +717,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, wbc.list = folio_list; folio_set_reclaim(folio); - res = mapping->a_ops->writepage(&folio->page, &wbc); + res = writeout(folio, &wbc); if (res < 0) handle_write_error(mapping, folio, res); if (res == AOP_WRITEPAGE_ACTIVATE) { @@ -717,7 +726,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } if (!folio_test_writeback(folio)) { - /* synchronous write or broken a_ops? */ + /* synchronous write? 
*/ folio_clear_reclaim(folio); } trace_mm_vmscan_write_folio(folio); @@ -1096,7 +1105,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) */ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, - struct reclaim_stat *stat, bool ignore_references) + struct reclaim_stat *stat, bool ignore_references, + struct mem_cgroup *memcg) { struct folio_batch free_folios; LIST_HEAD(ret_folios); @@ -1109,7 +1119,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); - do_demote_pass = can_demote(pgdat->node_id, sc); + do_demote_pass = can_demote(pgdat->node_id, sc, memcg); retry: while (!list_empty(folio_list)) { @@ -1658,7 +1668,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, */ noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, - &stat, true); + &stat, true, NULL); memalloc_noreclaim_restore(noreclaim_flag); list_splice(&clean_folios, folio_list); @@ -1725,13 +1735,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0; - unsigned long scan, total_scan, nr_pages; + unsigned long skipped = 0, total_scan = 0, scan = 0; + unsigned long nr_pages; unsigned long max_nr_skipped = 0; LIST_HEAD(folios_skipped); - total_scan = 0; - scan = 0; while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; struct folio *folio; @@ -2023,7 +2031,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2031,7 +2039,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, + lruvec_memcg(lruvec)); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); @@ -2042,7 +2051,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); @@ -2132,7 +2141,7 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!cgroup_reclaim(sc)) __count_vm_events(PGREFILL, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2189,7 +2198,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); __count_vm_events(PGDEACTIVATE, nr_deactivate); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, 
NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); @@ -2214,7 +2223,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); @@ -2503,6 +2512,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } + /* Proactive reclaim initiated by userspace for anonymous memory only */ + if (swappiness == SWAPPINESS_ANON_ONLY) { + WARN_ON_ONCE(!sc->proactive); + scan_balance = SCAN_ANON; + goto out; + } + /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally @@ -2523,7 +2539,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, /* * If there is enough inactive page cache, we do not reclaim - * anything from the anonymous working right now. + * anything from the anonymous working right now to make sure + * a streaming file access pattern doesn't cause swapping. */ if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; @@ -2646,7 +2663,7 @@ out: * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. */ -static bool can_age_anon_pages(struct pglist_data *pgdat, +static bool can_age_anon_pages(struct lruvec *lruvec, struct scan_control *sc) { /* Aging the anon LRU is valuable if swap is present: */ @@ -2654,7 +2671,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return true; /* Also valuable if anon pages can be demoted: */ - return can_demote(pgdat->node_id, sc); + return can_demote(lruvec_pgdat(lruvec)->node_id, sc, + lruvec_memcg(lruvec)); } #ifdef CONFIG_LRU_GEN @@ -2690,8 +2708,12 @@ static bool should_clear_pmd_young(void) READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } +/* Get the min/max evictable type based on swappiness */ +#define min_type(swappiness) (!(swappiness)) +#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY) + #define evictable_min_seq(min_seq, swappiness) \ - min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS]) + min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)]) #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ @@ -2699,7 +2721,7 @@ static bool should_clear_pmd_young(void) for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) #define for_each_evictable_type(type, swappiness) \ - for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++) + for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++) #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) @@ -2732,7 +2754,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) if (!sc->may_swap) return 0; - if (!can_demote(pgdat->node_id, sc) && + if (!can_demote(pgdat->node_id, sc, memcg) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; @@ -3850,7 +3872,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - if (type ? 
swappiness > MAX_SWAPPINESS : !swappiness) + /* For file type, skip the check if swappiness is anon only */ + if (type && (swappiness == SWAPPINESS_ANON_ONLY)) + goto done; + + /* For anon type, skip the check if swappiness is zero (file only) */ + if (!type && !swappiness) goto done; /* prevent cold/hot inversion if the type is evictable */ @@ -4588,8 +4615,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); } - __count_memcg_events(memcg, item, isolated); - __count_memcg_events(memcg, PGREFILL, sorted); + count_memcg_events(memcg, item, isolated); + count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, scanned, skipped, isolated, @@ -4695,7 +4722,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (list_empty(&list)) return scanned; retry: - reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, @@ -4739,7 +4766,7 @@ retry: item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); - __count_memcg_events(memcg, item, reclaimed); + count_memcg_events(memcg, item, reclaimed); __count_vm_events(PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); @@ -5516,7 +5543,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); - else if (swappiness > MAX_SWAPPINESS + 1) + else if (swappiness > SWAPPINESS_ANON_ONLY) goto done; switch (cmd) { @@ -5573,24 +5600,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, while ((cur = strsep(&next, ",;\n"))) { int n; int end; - char cmd; + char cmd, swap_string[5]; unsigned int memcg_id; unsigned int nid; unsigned long seq; - unsigned int swappiness = -1; + unsigned int swappiness; unsigned long opt = -1; cur = skip_spaces(cur); if (!*cur) continue; - n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, - &seq, &end, &swappiness, &end, &opt, &end); + n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, swap_string, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; break; } + if (n == 4) { + swappiness = -1; + } else if (!strcmp("max", swap_string)) { + /* set by userspace for anonymous memory only */ + swappiness = SWAPPINESS_ANON_ONLY; + } else { + err = kstrtouint(swap_string, 0, &swappiness); + if (err) + break; + } + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); if (err) break; @@ -5850,7 +5888,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. 
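 *
 * Illustrative note: can_age_anon_pages() now takes the lruvec rather than
 * the pgdat so the demotion check can consult the owning memcg's
 * mems_allowed; the kswapd_age_node() hunk further below does the same:
 *
 *     lruvec = mem_cgroup_lruvec(NULL, pgdat);
 *     if (!can_age_anon_pages(lruvec, sc))
 *             return;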
*/ - if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + if (can_age_anon_pages(lruvec, sc) && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -6681,10 +6719,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) return; } - if (!can_age_anon_pages(pgdat, sc)) + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!can_age_anon_pages(lruvec, sc)) return; - lruvec = mem_cgroup_lruvec(NULL, pgdat); if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) return; diff --git a/mm/vmstat.c b/mm/vmstat.c index 4c268ce39ff2..d888c248d99f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -7,7 +7,7 @@ * * zoned VM statistics * Copyright (C) 2006 Silicon Graphics, Inc., - * Christoph Lameter <christoph@lameter.com> + * Christoph Lameter <cl@gentwo.org> * Copyright (C) 2008-2014 Christoph Lameter */ #include <linux/fs.h> diff --git a/mm/workingset.c b/mm/workingset.c index 4841ae8af411..6e7f4cb1b9a7 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -612,7 +612,6 @@ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { - struct address_space *mapping; struct page *page = virt_to_page(node); /* @@ -623,8 +622,7 @@ void workingset_update_node(struct xa_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - mapping = container_of(node->array, struct address_space, i_pages); - lockdep_assert_held(&mapping->i_pages.xa_lock); + lockdep_assert_held(&node->array->xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { diff --git a/mm/zpdesc.h b/mm/zpdesc.h index fa47fece2237..57e7a4d6c6ca 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -7,6 +7,9 @@ #ifndef __MM_ZPDESC_H__ #define __MM_ZPDESC_H__ +#include <linux/migrate.h> +#include <linux/pagemap.h> + /* * struct zpdesc - Memory descriptor for zpool memory. * @flags: Page flags, mostly unused by zsmalloc. diff --git a/mm/zpool.c b/mm/zpool.c index 6d6d88930932..0a71d03369f1 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -226,20 +226,22 @@ const char *zpool_get_type(struct zpool *zpool) * @size: The amount of memory to allocate. * @gfp: The GFP flags to use when allocating memory. * @handle: Pointer to the handle to set + * @nid: The preferred node id. * * This allocates the requested amount of memory from the pool. * The gfp flags will be used when allocating memory, if the * implementation supports it. The provided @handle will be - * set to the allocated object handle. + * set to the allocated object handle. The allocation will + * prefer the NUMA node specified by @nid. * * Implementations must guarantee this to be thread-safe. * * Returns: 0 on success, negative value on error. 
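 *
 * Example (taken from the zswap_compress() hunk later in this diff): keep
 * the compressed copy on the same node as the page being compressed:
 *
 *     alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page));
 *     if (alloc_ret)
 *             goto unlock;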
*/ int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - return zpool->driver->malloc(zpool->pool, size, gfp, handle); + return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 961b270f023c..999b513c7fdf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -26,17 +26,10 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/bitops.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/pgtable.h> -#include <asm/tlbflush.h> -#include <linux/cpumask.h> -#include <linux/cpu.h> -#include <linux/vmalloc.h> -#include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/shrinker.h> @@ -44,11 +37,8 @@ #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> -#include <linux/migrate.h> -#include <linux/wait.h> -#include <linux/pagemap.h> #include <linux/fs.h> -#include <linux/local_lock.h> +#include <linux/workqueue.h> #include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 @@ -243,9 +233,9 @@ static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc) dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); } -static inline struct zpdesc *alloc_zpdesc(gfp_t gfp) +static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid) { - struct page *page = alloc_page(gfp); + struct page *page = alloc_pages_node(nid, gfp, 0); return page_zpdesc(page); } @@ -462,9 +452,9 @@ static void zs_zpool_destroy(void *pool) } static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - *handle = zs_malloc(pool, size, gfp); + *handle = zs_malloc(pool, size, gfp, nid); if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); @@ -1043,8 +1033,8 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, * Allocate a zspage for the given size class */ static struct zspage *alloc_zspage(struct zs_pool *pool, - struct size_class *class, - gfp_t gfp) + struct size_class *class, + gfp_t gfp, const int nid) { int i; struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; @@ -1061,7 +1051,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, for (i = 0; i < class->pages_per_zspage; i++) { struct zpdesc *zpdesc; - zpdesc = alloc_zpdesc(gfp); + zpdesc = alloc_zpdesc(gfp, nid); if (!zpdesc) { while (--i >= 0) { zpdesc_dec_zone_page_state(zpdescs[i]); @@ -1243,19 +1233,19 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { /* this object is contained entirely within a page */ void *dst = kmap_local_zpdesc(zpdesc); - if (!ZsHugePage(zspage)) - off += ZS_HANDLE_SIZE; memcpy(dst + off, handle_mem, mem_len); kunmap_local(dst); } else { /* this object spans two pages */ size_t sizes[2]; - off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = mem_len - sizes[0]; @@ -1336,12 +1326,14 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @pool: pool to allocate from * @size: size of block to allocate * @gfp: gfp flags when allocating object + * @nid: The preferred node id to allocate new zspage (if needed) * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). 
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, + const int nid) { unsigned long handle; struct size_class *class; @@ -1376,7 +1368,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) spin_unlock(&class->lock); - zspage = alloc_zspage(pool, class, gfp); + zspage = alloc_zspage(pool, class, gfp, nid); if (!zspage) { cache_free_handle(pool, handle); return (unsigned long)ERR_PTR(-ENOMEM); diff --git a/mm/zswap.c b/mm/zswap.c index 204fb59da33c..455e9425c5f5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -981,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, zpool = pool->zpool; gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; - alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page)); if (alloc_ret) goto unlock; |