Diffstat (limited to 'mm')
 mm/cma.c             |   5
 mm/dmapool.c         |  15
 mm/hugetlb.c         |  24
 mm/kasan/Makefile    |   3
 mm/kasan/shadow.c    |  92
 mm/memcontrol.c      |  10
 mm/migrate.c         |  60
 mm/mremap.c          |   3
 mm/nommu.c           |  18
 mm/page-writeback.c  |  28
 mm/page_alloc.c      |   8
 mm/page_io.c         |   3
 mm/readahead.c       |  20
 mm/shmem.c           |  33
 mm/show_mem.c        |   4
 mm/swap.h            |   4
 mm/swap_state.c      |   1
 mm/swapfile.c        |   2
 mm/truncate.c        |  20
 mm/vma.c             |   1
 mm/vmalloc.c         |  24
 mm/vmscan.c          |  29
 22 files changed, 234 insertions(+), 173 deletions(-)
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -608,7 +608,10 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
 	 * complain. Find the boundary by adding one to the last valid
 	 * address.
 	 */
-	highmem_start = __pa(high_memory - 1) + 1;
+	if (IS_ENABLED(CONFIG_HIGHMEM))
+		highmem_start = __pa(high_memory - 1) + 1;
+	else
+		highmem_start = memblock_end_of_DRAM();
 
 	pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
 		__func__, &size, &base, &limit, &alignment);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..5be8cc1c6529 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -56,6 +56,7 @@ struct dma_pool {	/* the pool */
 	unsigned int size;
 	unsigned int allocation;
 	unsigned int boundary;
+	int node;
 	char name[32];
 	struct list_head pools;
 };
@@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
 
 /**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma.
  * @name: name of pool, for diagnostics
  * @dev: device that will be doing the DMA
  * @size: size of the blocks in this pool.
  * @align: alignment requirement for blocks; must be a power of two
  * @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
  * Context: not in_interrupt()
  *
  * Given one of these pools, dma_pool_alloc()
@@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
  * Return: a dma allocation pool with the requested characteristics, or
  * %NULL if one can't be created.
  */
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
-				 size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+				      size_t size, size_t align, size_t boundary, int node)
 {
 	struct dma_pool *retval;
 	size_t allocation;
@@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 	boundary = min(boundary, allocation);
 
-	retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+	retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
 	if (!retval)
 		return retval;
 
@@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 	retval->size = size;
 	retval->boundary = boundary;
 	retval->allocation = allocation;
+	retval->node = node;
 	INIT_LIST_HEAD(&retval->pools);
 
 	/*
@@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 	mutex_unlock(&pools_reg_lock);
 	return retval;
 }
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
 
 static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
 {
@@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
 {
 	struct dma_page *page;
 
-	page = kmalloc(sizeof(*page), mem_flags);
+	page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
 	if (!page)
 		return NULL;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7ae38bfb9096..6a3cf7935c14 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1250,7 +1250,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 /*
  * Reset and decrement one ref on hugepage private reservation.
  * Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
  * same sized vma. It should never come here with last ref on the
  * reservation.
  */
@@ -2949,12 +2949,20 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
 	while (start_pfn < end_pfn) {
 		folio = pfn_folio(start_pfn);
+
+		/*
+		 * The folio might have been dissolved from under our feet, so make sure
+		 * to carefully check the state under the lock.
+		 */
+		spin_lock_irq(&hugetlb_lock);
 		if (folio_test_hugetlb(folio)) {
 			h = folio_hstate(folio);
 		} else {
+			spin_unlock_irq(&hugetlb_lock);
 			start_pfn++;
 			continue;
 		}
+		spin_unlock_irq(&hugetlb_lock);
 
 		if (!folio_ref_count(folio)) {
 			ret = alloc_and_dissolve_hugetlb_folio(h, folio,
@@ -7931,3 +7939,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
 			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
 }
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		clear_vma_resv_huge_pages(vma);
+}
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1a958e7c8a46..dd93ae8a6beb 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
 CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
 CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
 
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN)
 ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
 # If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
 # we need to treat them normally (as builtins), otherwise the compiler won't
@@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
 CFLAGS_KASAN_TEST += -fno-builtin
 endif
 
+CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1)
 CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
 RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
 
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..d2c70cd2afb1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
 {
 }
 
+struct vmalloc_populate_data {
+	unsigned long start;
+	struct page **pages;
+};
+
 static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
-				      void *unused)
+				      void *_data)
 {
-	unsigned long page;
+	struct vmalloc_populate_data *data = _data;
+	struct page *page;
 	pte_t pte;
+	int index;
 
 	if (likely(!pte_none(ptep_get(ptep))))
 		return 0;
 
-	page = __get_free_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
-
-	__memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
-	pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+	index = PFN_DOWN(addr - data->start);
+	page = data->pages[index];
+	__memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+	pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
 
 	spin_lock(&init_mm.page_table_lock);
 	if (likely(pte_none(ptep_get(ptep)))) {
 		set_pte_at(&init_mm, addr, ptep, pte);
-		page = 0;
+		data->pages[index] = NULL;
 	}
 	spin_unlock(&init_mm.page_table_lock);
-	if (page)
-		free_page(page);
+
+	return 0;
+}
+
+static void ___free_pages_bulk(struct page **pages, int nr_pages)
+{
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i]) {
+			__free_pages(pages[i], 0);
+			pages[i] = NULL;
+		}
+	}
+}
+
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+{
+	unsigned long nr_populated, nr_total = nr_pages;
+	struct page **page_array = pages;
+
+	while (nr_pages) {
+		nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+		if (!nr_populated) {
+			___free_pages_bulk(page_array, nr_total - nr_pages);
+			return -ENOMEM;
+		}
+		pages += nr_populated;
+		nr_pages -= nr_populated;
+	}
+
 	return 0;
 }
 
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+{
+	unsigned long nr_pages, nr_total = PFN_UP(end - start);
+	struct vmalloc_populate_data data;
+	int ret = 0;
+
+	data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (!data.pages)
+		return -ENOMEM;
+
+	while (nr_total) {
+		nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+		ret = ___alloc_pages_bulk(data.pages, nr_pages);
+		if (ret)
+			break;
+
+		data.start = start;
+		ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+					  kasan_populate_vmalloc_pte, &data);
+		___free_pages_bulk(data.pages, nr_pages);
+		if (ret)
+			break;
+
+		start += nr_pages * PAGE_SIZE;
+		nr_total -= nr_pages;
+	}
+
+	free_page((unsigned long)data.pages);
+
+	return ret;
+}
+
 int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
 {
 	unsigned long shadow_start, shadow_end;
@@ -348,9 +414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
 	shadow_start = PAGE_ALIGN_DOWN(shadow_start);
 	shadow_end = PAGE_ALIGN(shadow_end);
 
-	ret = apply_to_page_range(&init_mm, shadow_start,
-				  shadow_end - shadow_start,
-				  kasan_populate_vmalloc_pte, NULL);
+	ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
 	if (ret)
 		return ret;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c96c1f2b9cf5..ec39e62b172e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -582,7 +582,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 	if (!val)
 		return;
 
-	cgroup_rstat_updated(memcg->css.cgroup, cpu);
+	css_rstat_updated(&memcg->css, cpu);
 	statc = this_cpu_ptr(memcg->vmstats_percpu);
 	for (; statc; statc = statc->parent) {
 		stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
@@ -614,7 +614,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
 	if (mem_cgroup_is_root(memcg))
 		WRITE_ONCE(flush_last_time, jiffies_64);
 
-	cgroup_rstat_flush(memcg->css.cgroup);
+	css_rstat_flush(&memcg->css);
 }
 
 /*
@@ -1168,7 +1168,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 {
 	struct mem_cgroup *iter;
 	int ret = 0;
-	int i = 0;
 
 	BUG_ON(mem_cgroup_is_root(memcg));
 
@@ -1178,10 +1177,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
 		while (!ret && (task = css_task_iter_next(&it))) {
-			/* Avoid potential softlockup warning */
-			if ((++i & 1023) == 0)
-				cond_resched();
 			ret = fn(task, arg);
+			/* Avoid potential softlockup warning */
+			cond_resched();
 		}
 		css_task_iter_end(&it);
 		if (ret) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 676d9cfc7059..c80591514e66 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -947,66 +947,20 @@ int filemap_migrate_folio(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(filemap_migrate_folio);
 
 /*
- * Writeback a folio to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct folio *folio)
-{
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_NONE,
-		.nr_to_write = 1,
-		.range_start = 0,
-		.range_end = LLONG_MAX,
-		.for_reclaim = 1
-	};
-	int rc;
-
-	if (!mapping->a_ops->writepage)
-		/* No write method for the address space */
-		return -EINVAL;
-
-	if (!folio_clear_dirty_for_io(folio))
-		/* Someone else already triggered a write */
-		return -EAGAIN;
-
-	/*
-	 * A dirty folio may imply that the underlying filesystem has
-	 * the folio on some queue. So the folio must be clean for
-	 * migration. Writeout may mean we lose the lock and the
-	 * folio state is no longer what we checked for earlier.
-	 * At this point we know that the migration attempt cannot
-	 * be successful.
-	 */
-	remove_migration_ptes(folio, folio, 0);
-
-	rc = mapping->a_ops->writepage(&folio->page, &wbc);
-
-	if (rc != AOP_WRITEPAGE_ACTIVATE)
-		/* unlocked. Relock */
-		folio_lock(folio);
-
-	return (rc < 0) ? -EIO : -EAGAIN;
-}
-
-/*
  * Default handling if a filesystem does not provide a migration function.
  */
 static int fallback_migrate_folio(struct address_space *mapping,
 		struct folio *dst, struct folio *src, enum migrate_mode mode)
 {
-	if (folio_test_dirty(src)) {
-		/* Only writeback folios in full synchronous migration */
-		switch (mode) {
-		case MIGRATE_SYNC:
-			break;
-		default:
-			return -EBUSY;
-		}
-		return writeout(mapping, src);
-	}
+	WARN_ONCE(mapping->a_ops->writepages,
+		  "%ps does not implement migrate_folio\n",
+		  mapping->a_ops);
+	if (folio_test_dirty(src))
+		return -EBUSY;
 
 	/*
-	 * Buffers may be managed in a filesystem specific way.
-	 * We must have no buffers or drop them.
+	 * Filesystem may have private data at folio->private that we
+	 * can't migrate automatically.
 	 */
 	if (!filemap_release_folio(src, GFP_KERNEL))
 		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
diff --git a/mm/mremap.c b/mm/mremap.c
index 7db9da609c84..0d4948b720e2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1188,8 +1188,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
 		mremap_userfaultfd_prep(new_vma, vrm->uf);
 	}
 
-	if (is_vm_hugetlb_page(vma))
-		clear_vma_resv_huge_pages(vma);
+	fixup_hugetlb_reservations(vma);
 
 	/* Tell pfnmap has moved from this vma */
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
diff --git a/mm/nommu.c b/mm/nommu.c
index 617e7ba8022f..70f92f9a7fab 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -200,7 +200,23 @@ void *vmalloc_noprof(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc_noprof);
 
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
+/*
+ * vmalloc_huge_node - allocate virtually contiguous memory, on a node
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Due to NOMMU implications the node argument and HUGE page attribute is
+ * ignored.
+ */
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
+{
+	return __vmalloc_noprof(size, gfp_mask);
+}
 
 /*
  * vzalloc - allocate virtually contiguous memory with zero fill
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c81624bc3969..76200cd85fe7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2621,27 +2621,6 @@ int write_cache_pages(struct address_space *mapping,
 }
 EXPORT_SYMBOL(write_cache_pages);
 
-static int writeback_use_writepage(struct address_space *mapping,
-		struct writeback_control *wbc)
-{
-	struct folio *folio = NULL;
-	struct blk_plug plug;
-	int err;
-
-	blk_start_plug(&plug);
-	while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
-		err = mapping->a_ops->writepage(&folio->page, wbc);
-		if (err == AOP_WRITEPAGE_ACTIVATE) {
-			folio_unlock(folio);
-			err = 0;
-		}
-		mapping_set_error(mapping, err);
-	}
-	blk_finish_plug(&plug);
-
-	return err;
-}
-
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	int ret;
@@ -2652,14 +2631,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	wb = inode_to_wb_wbc(mapping->host, wbc);
 	wb_bandwidth_estimate_start(wb);
 	while (1) {
-		if (mapping->a_ops->writepages) {
+		if (mapping->a_ops->writepages)
 			ret = mapping->a_ops->writepages(mapping, wbc);
-		} else if (mapping->a_ops->writepage) {
-			ret = writeback_use_writepage(mapping, wbc);
-		} else {
+		else
 			/* deal with chardevs and other special files */
 			ret = 0;
-		}
 
 		if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
 			break;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8258349e49ac..47fa713ccb4d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4562,6 +4562,14 @@ restart:
 	}
 
 retry:
+	/*
+	 * Deal with possible cpuset update races or zonelist updates to avoid
+	 * infinite retries.
+	 */
+	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+	    check_retry_zonelist(zonelist_iter_cookie))
+		goto restart;
+
 	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
 	if (alloc_flags & ALLOC_KSWAPD)
 		wake_all_kswapds(order, gfp_mask, ac);
diff --git a/mm/page_io.c b/mm/page_io.c
index 4bce19df557b..f7716b6569fa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -237,9 +237,8 @@ static void swap_zeromap_folio_clear(struct folio *folio)
  * We may have stale swap cache pages in memory: notice
  * them here and get rid of the unnecessary final write.
  */
-int swap_writepage(struct page *page, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct writeback_control *wbc)
 {
-	struct folio *folio = page_folio(page);
 	int ret;
 
 	if (folio_free_swap(folio)) {
diff --git a/mm/readahead.c b/mm/readahead.c
index 6a4e96b69702..20d36d6b055e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
 
 ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
 {
+	struct file *file;
+	const struct inode *inode;
+
 	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
 
-	if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+	file = fd_file(f);
+	if (!(file->f_mode & FMODE_READ))
 		return -EBADF;
 
 	/*
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
 	 * that can execute readahead. If readahead is not possible
 	 * on this file, then we must return -EINVAL.
 	 */
-	if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
-	    (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
-	    !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+	if (!file->f_mapping)
+		return -EINVAL;
+	if (!file->f_mapping->a_ops)
+		return -EINVAL;
+
+	inode = file_inode(file);
+	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+		return -EINVAL;
+	if (IS_ANON_FILE(inode))
 		return -EINVAL;
 
 	return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
diff --git a/mm/shmem.c b/mm/shmem.c
index 99327c30507c..858cee02ca49 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
 #define SHORT_SYMLINK_LEN 128
 
 /*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
  * inode->i_private (with i_rwsem making sure that it has only one user at
  * a time): we would prefer not to enlarge the shmem inode just for that.
 */
@@ -107,7 +107,7 @@ struct shmem_falloc {
 	pgoff_t start;		/* start of range currently being fallocated */
 	pgoff_t next;		/* the next page offset to be fallocated */
 	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
-	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
+	pgoff_t nr_unswapped;	/* how often writeout refused to swap out */
 };
 
 struct shmem_options {
@@ -446,7 +446,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
 	/*
 	 * Special case: whereas normally shmem_recalc_inode() is called
 	 * after i_mapping->nrpages has already been adjusted (up or down),
-	 * shmem_writepage() has to raise swapped before nrpages is lowered -
+	 * shmem_writeout() has to raise swapped before nrpages is lowered -
 	 * to stop a racing shmem_recalc_inode() from thinking that a page has
 	 * been freed. Compensate here, to avoid the need for a followup call.
 	 */
@@ -1536,12 +1536,15 @@ int shmem_unuse(unsigned int type)
 	return error;
 }
 
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @wbc: How writeback is to be done
+ *
+ * Move the folio from the page cache to the swap cache.
  */
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct writeback_control *wbc)
 {
-	struct folio *folio = page_folio(page);
 	struct address_space *mapping = folio->mapping;
 	struct inode *inode = mapping->host;
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1550,13 +1553,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	int nr_pages;
 	bool split = false;
 
-	/*
-	 * Our capabilities prevent regular writeback or sync from ever calling
-	 * shmem_writepage; but a stacking filesystem might use ->writepage of
-	 * its underlying filesystem, in which case tmpfs should write out to
-	 * swap only in response to memory pressure, and not for the writeback
-	 * threads or sync.
-	 */
 	if (WARN_ON_ONCE(!wbc->for_reclaim))
 		goto redirty;
 
@@ -1586,9 +1582,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 try_split:
 		/* Ensure the subpages are still dirty */
 		folio_test_set_dirty(folio);
-		if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+		if (split_folio_to_list(folio, wbc->list))
 			goto redirty;
-		folio = page_folio(page);
 		folio_clear_dirty(folio);
 	}
 
@@ -1646,7 +1641,7 @@ try_split:
 		mutex_unlock(&shmem_swaplist_mutex);
 		BUG_ON(folio_mapped(folio));
-		return swap_writepage(&folio->page, wbc);
+		return swap_writeout(folio, wbc);
 	}
 
 	list_del_init(&info->swaplist);
@@ -1660,6 +1655,7 @@ redirty:
 	folio_unlock(folio);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(shmem_writeout);
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -3768,7 +3764,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 			index--;
 
 			/*
-			 * Inform shmem_writepage() how far we have reached.
+			 * Inform shmem_writeout() how far we have reached.
 			 * No need for lock or barrier: we have the page lock.
 			 */
 			if (!folio_test_uptodate(folio))
@@ -5191,7 +5187,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
 }
 
 static const struct address_space_operations shmem_aops = {
-	.writepage	= shmem_writepage,
 	.dirty_folio	= noop_dirty_folio,
 #ifdef CONFIG_TMPFS
 	.write_begin	= shmem_write_begin,
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 6af13bcd2ab3..5acb51a9fc49 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -223,7 +223,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 		global_node_page_state(NR_SHMEM),
 		global_node_page_state(NR_PAGETABLE),
 		global_node_page_state(NR_SECONDARY_PAGETABLE),
-		global_zone_page_state(NR_BOUNCE),
+		0UL,
 		global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
 		global_zone_page_state(NR_FREE_PAGES),
 		free_pcp,
@@ -341,7 +341,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			K(zone->present_pages),
 			K(zone_managed_pages(zone)),
 			K(zone_page_state(zone, NR_MLOCK)),
-			K(zone_page_state(zone, NR_BOUNCE)),
+			0UL,
 			K(free_pcp),
 			K(this_cpu_read(zone->per_cpu_pageset->count)),
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
diff --git a/mm/swap.h b/mm/swap.h
index 6f4a3f927edb..aa62463976d5 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -20,7 +20,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 		__swap_read_unplug(plug);
 }
 void swap_write_unplug(struct swap_iocb *sio);
-int swap_writepage(struct page *page, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct writeback_control *wbc);
 void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
 
 /* linux/mm/swap_state.c */
@@ -141,7 +141,7 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline int swap_writeout(struct folio *f, struct writeback_control *wbc)
 {
 	return 0;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 68fd981b514f..ec2b1c9c9926 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -30,7 +30,6 @@
  * vmscan's shrink_folio_list.
  */
 static const struct address_space_operations swap_aops = {
-	.writepage	= swap_writepage,
 	.dirty_folio	= noop_dirty_folio,
 #ifdef CONFIG_MIGRATION
 	.migrate_folio	= migrate_folio,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 412ccd6543b3..86643b181098 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2368,7 +2368,7 @@ retry:
 	 * Limit the number of retries? No: when mmget_not_zero()
 	 * above fails, that mm is likely to be freeing swap from
 	 * exit_mmap(), which proceeds at its own independent pace;
-	 * and even shmem_writepage() could have been preempted after
+	 * and even shmem_writeout() could have been preempted after
 	 * folio_alloc_swap(), temporarily hiding that swap. It's easy
 	 * and robust (though cpu-intensive) just to keep retrying.
 	 */
diff --git a/mm/truncate.c b/mm/truncate.c
index 5d98054094d1..f2aaf99f2990 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -191,6 +191,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
 bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
 {
 	loff_t pos = folio_pos(folio);
+	size_t size = folio_size(folio);
 	unsigned int offset, length;
 	struct page *split_at, *split_at2;
 
@@ -198,14 +199,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
 		offset = start - pos;
 	else
 		offset = 0;
-	length = folio_size(folio);
-	if (pos + length <= (u64)end)
-		length = length - offset;
+	if (pos + size <= (u64)end)
+		length = size - offset;
 	else
 		length = end + 1 - pos - offset;
 
 	folio_wait_writeback(folio);
-	if (length == folio_size(folio)) {
+	if (length == size) {
 		truncate_inode_folio(folio->mapping, folio);
 		return true;
 	}
@@ -224,16 +224,20 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
 		return true;
 
 	split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
-	split_at2 = folio_page(folio,
-			PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
-
 	if (!try_folio_split(folio, split_at, NULL)) {
 		/*
 		 * try to split at offset + length to make sure folios within
 		 * the range can be dropped, especially to avoid memory waste
 		 * for shmem truncate
 		 */
-		struct folio *folio2 = page_folio(split_at2);
+		struct folio *folio2;
+
+		if (offset + length == size)
+			goto no_split;
+
+		split_at2 = folio_page(folio,
+				PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+		folio2 = page_folio(split_at2);
 
 		if (!folio_try_get(folio2))
 			goto no_split;
diff --git a/mm/vma.c b/mm/vma.c
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1834,6 +1834,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	return new_vma;
 
 out_vma_link:
+	fixup_hugetlb_reservations(new_vma);
 	vma_close(new_vma);
 
 	if (new_vma->vm_file)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2d7511654831..8a1f7783bbdb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3944,9 +3944,10 @@ void *vmalloc_noprof(unsigned long size)
 EXPORT_SYMBOL(vmalloc_noprof);
 
 /**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
  * @size: allocation size
  * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
  *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
@@ -3955,13 +3956,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
 {
 	return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
-			gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
-			NUMA_NO_NODE, __builtin_return_address(0));
+			gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+			node, __builtin_return_address(0));
 }
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
 
 /**
 * vzalloc - allocate virtually contiguous memory with zero fill
@@ -4093,8 +4094,8 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
 	 * would be a good heuristic for when to shrink the vm_area?
 	 */
 	if (size <= old_size) {
-		/* Zero out "freed" memory. */
-		if (want_init_on_free())
+		/* Zero out "freed" memory, potentially for future realloc. */
+		if (want_init_on_free() || want_init_on_alloc(flags))
 			memset((void *)p + size, 0, old_size - size);
 		vm->requested_size = size;
 		kasan_poison_vmalloc(p + size, old_size - size);
@@ -4107,10 +4108,13 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
 	if (size <= alloced_size) {
 		kasan_unpoison_vmalloc(p + old_size, size - old_size,
 				KASAN_VMALLOC_PROT_NORMAL);
-		/* Zero out "alloced" memory. */
-		if (want_init_on_alloc(flags))
-			memset((void *)p + old_size, 0, size - old_size);
+		/*
+		 * No need to zero memory here, as unused memory will have
+		 * already been zeroed at initial allocation time or during
+		 * realloc shrink time.
+		 */
 		vm->requested_size = size;
+		return (void *)p;
 	}
 
 	/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3783e45bfc92..b6f4db6c240f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -648,21 +648,20 @@ typedef enum {
 /*
 * pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
 */
 static pageout_t pageout(struct folio *folio, struct address_space *mapping,
			 struct swap_iocb **plug, struct list_head *folio_list)
 {
+	int (*writeout)(struct folio *, struct writeback_control *);
+
 	/*
-	 * If the folio is dirty, only perform writeback if that write
-	 * will be non-blocking. To prevent this allocation from being
-	 * stalled by pagecache activity. But note that there may be
-	 * stalls if we need to run get_block(). We could test
-	 * PagePrivate for that.
-	 *
-	 * If this process is currently in __generic_file_write_iter() against
-	 * this folio's queue, we can perform writeback even if that
-	 * will block.
+	 * We no longer attempt to writeback filesystem folios here, other
+	 * than tmpfs/shmem. That's taken care of in page-writeback.
+	 * If we find a dirty filesystem folio at the end of the LRU list,
+	 * typically that means the filesystem is saturating the storage
+	 * with contiguous writes and telling it to write a folio here
+	 * would only make the situation worse by injecting an element
+	 * of random access.
	 *
	 * If the folio is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
@@ -685,7 +684,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
 		}
 		return PAGE_KEEP;
 	}
-	if (mapping->a_ops->writepage == NULL)
+	if (shmem_mapping(mapping))
+		writeout = shmem_writeout;
+	else if (folio_test_anon(folio))
+		writeout = swap_writeout;
+	else
 		return PAGE_ACTIVATE;
 
 	if (folio_clear_dirty_for_io(folio)) {
@@ -708,7 +711,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
 		wbc.list = folio_list;
 
 		folio_set_reclaim(folio);
-		res = mapping->a_ops->writepage(&folio->page, &wbc);
+		res = writeout(folio, &wbc);
 		if (res < 0)
 			handle_write_error(mapping, folio, res);
 		if (res == AOP_WRITEPAGE_ACTIVATE) {
@@ -717,7 +720,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
 		}
 
 		if (!folio_test_writeback(folio)) {
-			/* synchronous write or broken a_ops? */
+			/* synchronous write? */
 			folio_clear_reclaim(folio);
 		}
 		trace_mm_vmscan_write_folio(folio);
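
For reference, a hedged driver-side sketch (not part of the diff above) of how the dma_pool_create_node() interface added in mm/dmapool.c might be used; the "foo" names, block size, and alignment are made-up placeholders, and NUMA_NO_NODE preserves the old dma_pool_create() behaviour.

#include <linux/device.h>
#include <linux/dmapool.h>

/*
 * Hypothetical example: create a 64-byte descriptor pool whose internal
 * bookkeeping (struct dma_pool / struct dma_page) is allocated on the
 * device's local NUMA node rather than wherever the probing CPU happens
 * to run.
 */
static struct dma_pool *foo_create_desc_pool(struct device *dev)
{
	return dma_pool_create_node("foo-desc", dev, 64, 64, 0,
				    dev_to_node(dev));
}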