summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/cma.c24
-rw-r--r--mm/compaction.c6
-rw-r--r--mm/dmapool.c15
-rw-r--r--mm/execmem.c40
-rw-r--r--mm/filemap.c1
-rw-r--r--mm/gup.c4
-rw-r--r--mm/hmm.c262
-rw-r--r--mm/huge_memory.c11
-rw-r--r--mm/hugetlb.c81
-rw-r--r--mm/hugetlb_vmemmap.c6
-rw-r--r--mm/internal.h27
-rw-r--r--mm/kasan/Makefile3
-rw-r--r--mm/kasan/kasan_test_c.c21
-rw-r--r--mm/kasan/shadow.c92
-rw-r--r--mm/memblock.c21
-rw-r--r--mm/memcontrol-v1.c2
-rw-r--r--mm/memcontrol.c49
-rw-r--r--mm/memory.c12
-rw-r--r--mm/migrate.c68
-rw-r--r--mm/mm_init.c2
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/nommu.c18
-rw-r--r--mm/page-writeback.c32
-rw-r--r--mm/page_alloc.c220
-rw-r--r--mm/page_io.c3
-rw-r--r--mm/page_table_check.c34
-rw-r--r--mm/readahead.c20
-rw-r--r--mm/shmem.c33
-rw-r--r--mm/show_mem.c4
-rw-r--r--mm/slub.c38
-rw-r--r--mm/swap.h4
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c34
-rw-r--r--mm/truncate.c20
-rw-r--r--mm/userfaultfd.c25
-rw-r--r--mm/vma.c52
-rw-r--r--mm/vma.h9
-rw-r--r--mm/vmalloc.c85
-rw-r--r--mm/vmscan.c58
-rw-r--r--mm/zsmalloc.c8
42 files changed, 1010 insertions, 442 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d3fb3762887b..e113f713b493 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -201,7 +201,7 @@ config KVFREE_RCU_BATCHED
config SLUB_TINY
bool "Configure for minimal memory footprint"
- depends on EXPERT
+ depends on EXPERT && !COMPILE_TEST
select SLAB_MERGE_DEFAULT
help
Configures the slab allocator in a way to achieve minimal memory
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e61bbb1bd622..783904d8c5ef 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1151,7 +1151,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
- del_timer_sync(&bdi->laptop_mode_wb_timer);
+ timer_delete_sync(&bdi->laptop_mode_wb_timer);
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
diff --git a/mm/cma.c b/mm/cma.c
index b06d5fe73399..c04be488b099 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,7 +35,7 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned int cma_area_count;
-static int __init __cma_declare_contiguous_nid(phys_addr_t base,
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
bool fixed, const char *name, struct cma **res_cma,
@@ -370,7 +370,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size,
phys_addr_t align, unsigned int order_per_bit,
const char *name, struct cma **res_cma, int nid)
{
- phys_addr_t start, end;
+ phys_addr_t start = 0, end;
phys_addr_t size, sizesum, sizeleft;
struct cma_init_memrange *mrp, *mlp, *failed;
struct cma_memrange *cmrp;
@@ -384,7 +384,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size,
/*
* First, try it the normal way, producing just one range.
*/
- ret = __cma_declare_contiguous_nid(0, total_size, 0, align,
+ ret = __cma_declare_contiguous_nid(&start, total_size, 0, align,
order_per_bit, false, name, res_cma, nid);
if (ret != -ENOMEM)
goto out;
@@ -580,7 +580,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
{
int ret;
- ret = __cma_declare_contiguous_nid(base, size, limit, alignment,
+ ret = __cma_declare_contiguous_nid(&base, size, limit, alignment,
order_per_bit, fixed, name, res_cma, nid);
if (ret != 0)
pr_err("Failed to reserve %ld MiB\n",
@@ -592,14 +592,14 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
return ret;
}
-static int __init __cma_declare_contiguous_nid(phys_addr_t base,
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
bool fixed, const char *name, struct cma **res_cma,
int nid)
{
phys_addr_t memblock_end = memblock_end_of_DRAM();
- phys_addr_t highmem_start;
+ phys_addr_t highmem_start, base = *basep;
int ret;
/*
@@ -608,7 +608,10 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t base,
* complain. Find the boundary by adding one to the last valid
* address.
*/
- highmem_start = __pa(high_memory - 1) + 1;
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ highmem_start = __pa(high_memory - 1) + 1;
+ else
+ highmem_start = memblock_end_of_DRAM();
pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
__func__, &size, &base, &limit, &alignment);
@@ -722,12 +725,15 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t base,
}
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
- if (ret)
+ if (ret) {
memblock_phys_free(base, size);
+ return ret;
+ }
(*res_cma)->nid = nid;
+ *basep = base;
- return ret;
+ return 0;
}
static void cma_debug_show_areas(struct cma *cma)
diff --git a/mm/compaction.c b/mm/compaction.c
index 139f00c0308a..ca71fd3c3181 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -981,13 +981,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
if (PageHuge(page)) {
+ const unsigned int order = compound_order(page);
/*
* skip hugetlbfs if we are not compacting for pages
* bigger than its order. THPs and other compound pages
* are handled below.
*/
if (!cc->alloc_contig) {
- const unsigned int order = compound_order(page);
if (order <= MAX_PAGE_ORDER) {
low_pfn += (1UL << order) - 1;
@@ -1011,8 +1011,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Do not report -EBUSY down the chain */
if (ret == -EBUSY)
ret = 0;
- low_pfn += compound_nr(page) - 1;
- nr_scanned += compound_nr(page) - 1;
+ low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
goto isolate_fail;
}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..5be8cc1c6529 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -56,6 +56,7 @@ struct dma_pool { /* the pool */
unsigned int size;
unsigned int allocation;
unsigned int boundary;
+ int node;
char name[32];
struct list_head pools;
};
@@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma.
* @name: name of pool, for diagnostics
* @dev: device that will be doing the DMA
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
* Context: not in_interrupt()
*
* Given one of these pools, dma_pool_alloc()
@@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
* Return: a dma allocation pool with the requested characteristics, or
* %NULL if one can't be created.
*/
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node)
{
struct dma_pool *retval;
size_t allocation;
@@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
boundary = min(boundary, allocation);
- retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+ retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
@@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->node = node;
INIT_LIST_HEAD(&retval->pools);
/*
@@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
mutex_unlock(&pools_reg_lock);
return retval;
}
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
@@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
{
struct dma_page *page;
- page = kmalloc(sizeof(*page), mem_flags);
+ page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
if (!page)
return NULL;
diff --git a/mm/execmem.c b/mm/execmem.c
index e6c4f5076ca8..6f7a2653b280 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -254,6 +254,34 @@ out_unlock:
return ptr;
}
+static bool execmem_cache_rox = false;
+
+void execmem_cache_make_ro(void)
+{
+ struct maple_tree *free_areas = &execmem_cache.free_areas;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+ MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+ struct mutex *mutex = &execmem_cache.mutex;
+ void *area;
+
+ execmem_cache_rox = true;
+
+ mutex_lock(mutex);
+
+ mas_for_each(&mas_free, area, ULONG_MAX) {
+ unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
+ set_memory_ro(mas_free.index, pages);
+ }
+
+ mas_for_each(&mas_busy, area, ULONG_MAX) {
+ unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
+ set_memory_ro(mas_busy.index, pages);
+ }
+
+ mutex_unlock(mutex);
+}
+
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
@@ -274,9 +302,15 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- err = set_memory_rox((unsigned long)p, vm->nr_pages);
- if (err)
- goto err_free_mem;
+ if (execmem_cache_rox) {
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
+ if (err)
+ goto err_free_mem;
+ } else {
+ err = set_memory_x((unsigned long)p, vm->nr_pages);
+ if (err)
+ goto err_free_mem;
+ }
err = execmem_cache_add(p, alloc_size);
if (err)
diff --git a/mm/filemap.c b/mm/filemap.c
index b5e784f34d98..7b90cbeb4a1a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2244,6 +2244,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
*start = folio->index + nr;
goto out;
}
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
diff --git a/mm/gup.c b/mm/gup.c
index 92351e2fa876..84461d384ae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2207,8 +2207,8 @@ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
} while (start != end);
mmap_read_unlock(mm);
- if (size > (unsigned long)uaddr - start)
- return size - ((unsigned long)uaddr - start);
+ if (size > start - (unsigned long)uaddr)
+ return size - (start - (unsigned long)uaddr);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
diff --git a/mm/hmm.c b/mm/hmm.c
index 082f7b7c0b9e..feac86196a65 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}
@@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
+ uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
- return 0;
+ goto out;
}
if (!pte_present(pte)) {
@@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
- return 0;
+ new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
- if (!required_fault) {
- *hmm_pfn = 0;
- return 0;
- }
+ if (!required_fault)
+ goto out;
if (!non_swap_entry(entry))
goto fault;
@@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
- return 0;
+ new_pfn_flags = HMM_PFN_ERROR;
+ goto out;
}
- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ new_pfn_flags = pte_pfn(pte) | cpu_flags;
+out:
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}
@@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }
spin_unlock(ptl);
return 0;
@@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, -ENOMEM on failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
+/**
+ * hmm_dma_map_free - iFree HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and dma address arrays
+ * @p2pdma_state: PCI P2P state.
+ *
+ * dma_alloc_iova() allocates IOVA based on the size specified by their use in
+ * iova->size. Call this function after IOVA allocation to link whole @page
+ * to get the DMA address. Note that very first call to this function
+ * will have @offset set to 0 in the IOVA space allocated from
+ * dma_alloc_iova(). For subsequent calls to this function on same @iova,
+ * @offset needs to be advanced by the caller with the size of previous
+ * page that was linked + DMA address returned for the previous page that was
+ * linked by this function.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ unsigned long attrs = 0;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when page was already linked in prefetch call
+ * with READ flag and now we need to add WRITE flag
+ *
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+ * was a mapping. But !dma_need_unmap implies that the
+ * mapping stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+ /* Continue to remapping */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ pfns[idx] |= HMM_PFN_P2PDMA;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+ return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (ret)
+ goto error;
+
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+ if (ret) {
+ dma_iova_unlink(dev, state, offset, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ goto error;
+ }
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ goto error;
+
+ dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, dma_addr))
+ goto error;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+error:
+ pfns[idx] &= ~HMM_PFN_P2PDMA;
+ return DMA_MAPPING_ERROR;
+
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ unsigned long attrs = 0;
+
+ if ((pfns[idx] & valid_dma) != valid_dma)
+ return false;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state)) {
+ if (pfns[idx] & HMM_PFN_P2PDMA)
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+ } else if (dma_need_unmap(dev))
+ dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+
+ pfns[idx] &=
+ ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2a47682d1ab7..47d76d03ce30 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3075,6 +3075,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze, struct folio *folio)
{
+ bool pmd_migration = is_pmd_migration_entry(*pmd);
+
VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
@@ -3085,9 +3087,12 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
* require a folio to check the PMD against. Otherwise, there
* is a risk of replacing the wrong folio.
*/
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
- is_pmd_migration_entry(*pmd)) {
- if (folio && folio != pmd_folio(*pmd))
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) {
+ /*
+ * Do not apply pmd_folio() to a migration entry; and folio lock
+ * guarantees that it must be of the wrong folio anyway.
+ */
+ if (folio && (pmd_migration || folio != pmd_folio(*pmd)))
return;
__split_huge_pmd_locked(vma, pmd, address, freeze);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 39f92aad7bd1..6a3cf7935c14 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1250,7 +1250,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
* Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
*/
@@ -2271,7 +2271,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
* as surplus_pages, otherwise it might confuse
* persistent_huge_pages() momentarily.
*/
- __prep_account_new_huge_page(h, nid);
+ __prep_account_new_huge_page(h, folio_nid(folio));
/*
* We could have raced with the pool size change.
@@ -2949,12 +2949,20 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
while (start_pfn < end_pfn) {
folio = pfn_folio(start_pfn);
+
+ /*
+ * The folio might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ */
+ spin_lock_irq(&hugetlb_lock);
if (folio_test_hugetlb(folio)) {
h = folio_hstate(folio);
} else {
+ spin_unlock_irq(&hugetlb_lock);
start_pfn++;
continue;
}
+ spin_unlock_irq(&hugetlb_lock);
if (!folio_ref_count(folio)) {
ret = alloc_and_dissolve_hugetlb_folio(h, folio,
@@ -3010,7 +3018,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long retval, gbl_chg;
+ long retval, gbl_chg, gbl_reserve;
map_chg_state map_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
@@ -3163,8 +3171,16 @@ out_uncharge_cgroup_reservation:
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
- if (map_chg)
- hugepage_subpool_put_pages(spool, 1);
+ /*
+ * put page to subpool iff the quota of subpool's rsv_hpages is used
+ * during hugepage_subpool_get_pages.
+ */
+ if (map_chg && !gbl_chg) {
+ gbl_reserve = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -gbl_reserve);
+ }
+
+
out_end_reservation:
if (map_chg != MAP_CHG_ENFORCED)
vma_end_reservation(h, vma, addr);
@@ -3825,6 +3841,7 @@ found:
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
+ unsigned long persistent_free_count;
unsigned long min_count;
unsigned long allocated;
struct folio *folio;
@@ -3959,8 +3976,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
+ *
+ * min_count is the expected number of persistent pages, we
+ * shouldn't calculate min_count by using
+ * resv_huge_pages + persistent_huge_pages() - free_huge_pages,
+ * because there may exist free surplus huge pages, and this will
+ * lead to subtracting twice. Free surplus huge pages come from HVO
+ * failing to restore vmemmap, see comments in the callers of
+ * hugetlb_vmemmap_restore_folio(). Thus, we should calculate
+ * persistent free count first.
*/
- min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ persistent_free_count = h->free_huge_pages;
+ if (h->free_huge_pages > persistent_huge_pages(h)) {
+ if (h->free_huge_pages > h->surplus_huge_pages)
+ persistent_free_count -= h->surplus_huge_pages;
+ else
+ persistent_free_count = 0;
+ }
+ min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
@@ -4017,10 +4050,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
list_for_each_entry_safe(folio, next, src_list, lru) {
int i;
+ bool cma;
if (folio_test_hugetlb_vmemmap_optimized(folio))
continue;
+ cma = folio_test_hugetlb_cma(folio);
+
list_del(&folio->lru);
split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
@@ -4036,6 +4072,9 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
new_folio->mapping = NULL;
init_new_hugetlb_folio(dst, new_folio);
+ /* Copy the CMA flag so that it is freed correctly */
+ if (cma)
+ folio_set_hugetlb_cma(new_folio);
list_add(&new_folio->lru, &dst_list);
}
}
@@ -4630,7 +4669,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
}
#ifdef CONFIG_NUMA
@@ -7216,7 +7255,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg = -1, add = -1;
+ long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -7351,8 +7390,16 @@ bool hugetlb_reserve_pages(struct inode *inode,
return true;
out_put_pages:
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
+ spool_resv = chg - gbl_reserve;
+ if (spool_resv) {
+ /* put sub pool's reservation back, chg - gbl_reserve */
+ gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
+ /*
+ * subpool's reserved pages can not be put back due to race,
+ * return to hstate.
+ */
+ hugetlb_acct_memory(h, -gbl_resv);
+ }
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
@@ -7892,3 +7939,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
ALIGN_DOWN(vma->vm_end, PUD_SIZE));
}
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+ if (is_vm_hugetlb_page(vma))
+ clear_vma_resv_huge_pages(vma);
+}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 9a99dfa3c495..27245e86df25 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* struct page, the special metadata (e.g. page->flags or page->mapping)
* cannot copy to the tail struct page structs. The invalid value will be
* checked in the free_tail_page_prepare(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
+ * of "corrupted mapping in tail page". We need to reset at least 4 (one
+ * head struct page struct and three tail struct page structs) struct page
* structs.
*/
-#define NR_RESET_STRUCT_PAGE 3
+#define NR_RESET_STRUCT_PAGE 4
static inline void reset_struct_pages(struct page *start)
{
diff --git a/mm/internal.h b/mm/internal.h
index 50c2f590b2d0..5c7a2b43ad76 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -248,11 +248,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
bool *any_writable, bool *any_young, bool *any_dirty)
{
- unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
- const pte_t *end_ptep = start_ptep + max_nr;
pte_t expected_pte, *ptep;
bool writable, young, dirty;
- int nr;
+ int nr, cur_nr;
if (any_writable)
*any_writable = false;
@@ -265,11 +263,15 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
+
nr = pte_batch_hint(start_ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
ptep = start_ptep + nr;
- while (ptep < end_ptep) {
+ while (nr < max_nr) {
pte = ptep_get(ptep);
if (any_writable)
writable = !!pte_write(pte);
@@ -282,14 +284,6 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
if (!pte_same(pte, expected_pte))
break;
- /*
- * Stop immediately once we reached the end of the folio. In
- * corner cases the next PFN might fall into a different
- * folio.
- */
- if (pte_pfn(pte) >= folio_end_pfn)
- break;
-
if (any_writable)
*any_writable |= writable;
if (any_young)
@@ -297,12 +291,13 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
if (any_dirty)
*any_dirty |= dirty;
- nr = pte_batch_hint(ptep, pte);
- expected_pte = pte_advance_pfn(expected_pte, nr);
- ptep += nr;
+ cur_nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, cur_nr);
+ ptep += cur_nr;
+ nr += cur_nr;
}
- return min(ptep - start_ptep, max_nr);
+ return min(nr, max_nr);
}
/**
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1a958e7c8a46..dd93ae8a6beb 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN)
ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
# we need to treat them normally (as builtins), otherwise the compiler won't
@@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
CFLAGS_KASAN_TEST += -fno-builtin
endif
+CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1)
CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 3ea317837c2d..5f922dd38ffa 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1567,6 +1567,7 @@ static void kasan_memcmp(struct kunit *test)
static void kasan_strings(struct kunit *test)
{
char *ptr;
+ char *src;
size_t size = 24;
/*
@@ -1578,6 +1579,25 @@ static void kasan_strings(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE);
+
+ /*
+ * Make sure that strscpy() does not trigger KASAN if it overreads into
+ * poisoned memory.
+ *
+ * The expected size does not include the terminator '\0'
+ * so it is (KASAN_GRANULE_SIZE - 2) ==
+ * KASAN_GRANULE_SIZE - ("initial removed character" + "\0").
+ */
+ KUNIT_EXPECT_EQ(test, KASAN_GRANULE_SIZE - 2,
+ strscpy(ptr, src + 1, KASAN_GRANULE_SIZE));
+
+ /* strscpy should fail if the first byte is unreadable. */
+ KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
+ KASAN_GRANULE_SIZE));
+
+ kfree(src);
kfree(ptr);
/*
@@ -2127,4 +2147,5 @@ static struct kunit_suite kasan_kunit_test_suite = {
kunit_test_suite(kasan_kunit_test_suite);
+MODULE_DESCRIPTION("KUnit tests for checking KASAN bug-detection capabilities");
MODULE_LICENSE("GPL");
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..d2c70cd2afb1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
{
}
+struct vmalloc_populate_data {
+ unsigned long start;
+ struct page **pages;
+};
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+ void *_data)
{
- unsigned long page;
+ struct vmalloc_populate_data *data = _data;
+ struct page *page;
pte_t pte;
+ int index;
if (likely(!pte_none(ptep_get(ptep))))
return 0;
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+ index = PFN_DOWN(addr - data->start);
+ page = data->pages[index];
+ __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
+ data->pages[index] = NULL;
}
spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
+
+ return 0;
+}
+
+static void ___free_pages_bulk(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ __free_pages(pages[i], 0);
+ pages[i] = NULL;
+ }
+ }
+}
+
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+{
+ unsigned long nr_populated, nr_total = nr_pages;
+ struct page **page_array = pages;
+
+ while (nr_pages) {
+ nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+ if (!nr_populated) {
+ ___free_pages_bulk(page_array, nr_total - nr_pages);
+ return -ENOMEM;
+ }
+ pages += nr_populated;
+ nr_pages -= nr_populated;
+ }
+
return 0;
}
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+{
+ unsigned long nr_pages, nr_total = PFN_UP(end - start);
+ struct vmalloc_populate_data data;
+ int ret = 0;
+
+ data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!data.pages)
+ return -ENOMEM;
+
+ while (nr_total) {
+ nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+ ret = ___alloc_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ data.start = start;
+ ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+ kasan_populate_vmalloc_pte, &data);
+ ___free_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ start += nr_pages * PAGE_SIZE;
+ nr_total -= nr_pages;
+ }
+
+ free_page((unsigned long)data.pages);
+
+ return ret;
+}
+
int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
{
unsigned long shadow_start, shadow_end;
@@ -348,9 +414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
if (ret)
return ret;
diff --git a/mm/memblock.c b/mm/memblock.c
index 0a53db4d9f7b..0e9ebb8aa7fe 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -457,7 +457,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
min(new_area_start, memblock.current_limit),
new_alloc_size, PAGE_SIZE);
- new_array = addr ? __va(addr) : NULL;
+ if (addr) {
+ /* The memory may not have been accepted, yet. */
+ accept_memory(addr, new_alloc_size);
+
+ new_array = __va(addr);
+ } else {
+ new_array = NULL;
+ }
}
if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
@@ -2183,11 +2190,14 @@ static void __init memmap_init_reserved_pages(void)
struct memblock_region *region;
phys_addr_t start, end;
int nid;
+ unsigned long max_reserved;
/*
* set nid on all reserved pages and also treat struct
* pages for the NOMAP regions as PageReserved
*/
+repeat:
+ max_reserved = memblock.reserved.max;
for_each_mem_region(region) {
nid = memblock_get_region_node(region);
start = region->base;
@@ -2196,8 +2206,15 @@ static void __init memmap_init_reserved_pages(void)
if (memblock_is_nomap(region))
reserve_bootmem_region(start, end, nid);
- memblock_set_node(start, end, &memblock.reserved, nid);
+ memblock_set_node(start, region->size, &memblock.reserved, nid);
}
+ /*
+ * 'max' is changed means memblock.reserved has been doubled its
+ * array, which may result a new reserved region before current
+ * 'start'. Now we should repeat the procedure to set its node id.
+ */
+ if (max_reserved != memblock.reserved.max)
+ goto repeat;
/*
* initialize struct pages for reserved regions that don't have
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 8660908850dc..4a9cf27a70af 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -620,7 +620,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 421740f1bcdc..ec39e62b172e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -582,7 +582,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
if (!val)
return;
- cgroup_rstat_updated(memcg->css.cgroup, cpu);
+ css_rstat_updated(&memcg->css, cpu);
statc = this_cpu_ptr(memcg->vmstats_percpu);
for (; statc; statc = statc->parent) {
stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
@@ -614,7 +614,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
- cgroup_rstat_flush(memcg->css.cgroup);
+ css_rstat_flush(&memcg->css);
}
/*
@@ -1168,7 +1168,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
struct mem_cgroup *iter;
int ret = 0;
- int i = 0;
BUG_ON(mem_cgroup_is_root(memcg));
@@ -1178,10 +1177,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it))) {
- /* Avoid potential softlockup warning */
- if ((++i & 1023) == 0)
- cond_resched();
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
}
css_task_iter_end(&it);
if (ret) {
@@ -1759,7 +1757,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
struct memcg_stock_pcp {
- localtry_lock_t stock_lock;
+ local_trylock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -1774,7 +1772,7 @@ struct memcg_stock_pcp {
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCALTRY_LOCK(stock_lock),
+ .stock_lock = INIT_LOCAL_TRYLOCK(stock_lock),
};
static DEFINE_MUTEX(percpu_charge_mutex);
@@ -1805,11 +1803,10 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
- if (!gfpflags_allow_spinning(gfp_mask))
- return ret;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
- }
+ if (gfpflags_allow_spinning(gfp_mask))
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ else if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags))
+ return ret;
stock = this_cpu_ptr(&memcg_stock);
stock_pages = READ_ONCE(stock->nr_pages);
@@ -1818,7 +1815,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
ret = true;
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -1857,14 +1854,14 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -1894,7 +1891,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long flags;
- if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
/*
* In case of unlikely failure to lock percpu stock_lock
* uncharge memcg directly.
@@ -1907,7 +1904,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
return;
}
__refill_stock(memcg, nr_pages);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
/*
@@ -1964,9 +1961,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
stock = &per_cpu(memcg_stock, cpu);
/* drain_obj_stock requires stock_lock */
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
old = drain_obj_stock(stock);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
drain_stock(stock);
obj_cgroup_put(old);
@@ -2787,7 +2784,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
unsigned long flags;
int *bytes;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
/*
@@ -2836,7 +2833,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
__mod_objcg_mlstate(objcg, pgdat, idx, nr);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -2846,7 +2843,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
@@ -2854,7 +2851,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
ret = true;
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -2946,7 +2943,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
unsigned long flags;
unsigned int nr_pages = 0;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
@@ -2960,7 +2957,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
if (nr_pages)
diff --git a/mm/memory.c b/mm/memory.c
index 2d8c265fc7d6..49199410805c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1361,7 +1361,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long next, pfn;
+ unsigned long next, pfn = 0;
bool is_cow;
int ret;
@@ -2938,11 +2938,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
@@ -3734,8 +3734,6 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
return false;
VM_WARN_ON_ONCE(folio_test_ksm(folio));
- VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio));
- VM_WARN_ON_ONCE(folio_entire_mapcount(folio));
if (unlikely(folio_test_swapcache(folio))) {
/*
@@ -3753,13 +3751,15 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
/* Stabilize the mapcount vs. refcount and recheck. */
folio_lock_large_mapcount(folio);
- VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio));
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
if (folio_test_large_maybe_mapped_shared(folio))
goto unlock;
if (folio_large_mapcount(folio) != folio_ref_count(folio))
goto unlock;
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
diff --git a/mm/migrate.c b/mm/migrate.c
index f3ee6d8d5e2e..c80591514e66 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -845,9 +845,11 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return -EAGAIN;
if (check_refs) {
- bool busy;
+ bool busy, migrating;
bool invalidated = false;
+ migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
+ VM_WARN_ON_ONCE(migrating);
recheck_buffers:
busy = false;
spin_lock(&mapping->i_private_lock);
@@ -859,12 +861,12 @@ recheck_buffers:
}
bh = bh->b_this_page;
} while (bh != head);
+ spin_unlock(&mapping->i_private_lock);
if (busy) {
if (invalidated) {
rc = -EAGAIN;
goto unlock_buffers;
}
- spin_unlock(&mapping->i_private_lock);
invalidate_bh_lrus();
invalidated = true;
goto recheck_buffers;
@@ -883,7 +885,7 @@ recheck_buffers:
unlock_buffers:
if (check_refs)
- spin_unlock(&mapping->i_private_lock);
+ clear_bit_unlock(BH_Migrate, &head->b_state);
bh = head;
do {
unlock_buffer(bh);
@@ -945,66 +947,20 @@ int filemap_migrate_folio(struct address_space *mapping,
EXPORT_SYMBOL_GPL(filemap_migrate_folio);
/*
- * Writeback a folio to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1
- };
- int rc;
-
- if (!mapping->a_ops->writepage)
- /* No write method for the address space */
- return -EINVAL;
-
- if (!folio_clear_dirty_for_io(folio))
- /* Someone else already triggered a write */
- return -EAGAIN;
-
- /*
- * A dirty folio may imply that the underlying filesystem has
- * the folio on some queue. So the folio must be clean for
- * migration. Writeout may mean we lose the lock and the
- * folio state is no longer what we checked for earlier.
- * At this point we know that the migration attempt cannot
- * be successful.
- */
- remove_migration_ptes(folio, folio, 0);
-
- rc = mapping->a_ops->writepage(&folio->page, &wbc);
-
- if (rc != AOP_WRITEPAGE_ACTIVATE)
- /* unlocked. Relock */
- folio_lock(folio);
-
- return (rc < 0) ? -EIO : -EAGAIN;
-}
-
-/*
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (folio_test_dirty(src)) {
- /* Only writeback folios in full synchronous migration */
- switch (mode) {
- case MIGRATE_SYNC:
- break;
- default:
- return -EBUSY;
- }
- return writeout(mapping, src);
- }
+ WARN_ONCE(mapping->a_ops->writepages,
+ "%ps does not implement migrate_folio\n",
+ mapping->a_ops);
+ if (folio_test_dirty(src))
+ return -EBUSY;
/*
- * Buffers may be managed in a filesystem specific way.
- * We must have no buffers or drop them.
+ * Filesystem may have private data at folio->private that we
+ * can't migrate automatically.
*/
if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 84f14fa12d0d..eedce9321e13 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1785,7 +1785,7 @@ static bool arch_has_descending_max_zone_pfns(void)
return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}
-static void set_high_memory(void)
+static void __init set_high_memory(void)
{
phys_addr_t highmem = memblock_end_of_DRAM();
diff --git a/mm/mremap.c b/mm/mremap.c
index 7db9da609c84..0d4948b720e2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1188,8 +1188,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
mremap_userfaultfd_prep(new_vma, vrm->uf);
}
- if (is_vm_hugetlb_page(vma))
- clear_vma_resv_huge_pages(vma);
+ fixup_hugetlb_reservations(vma);
/* Tell pfnmap has moved from this vma */
if (unlikely(vma->vm_flags & VM_PFNMAP))
diff --git a/mm/nommu.c b/mm/nommu.c
index 617e7ba8022f..70f92f9a7fab 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -200,7 +200,23 @@ void *vmalloc_noprof(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_noprof);
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
+/*
+ * vmalloc_huge_node - allocate virtually contiguous memory, on a node
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Due to NOMMU implications the node argument and HUGE page attribute is
+ * ignored.
+ */
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
+{
+ return __vmalloc_noprof(size, gfp_mask);
+}
/*
* vzalloc - allocate virtually contiguous memory with zero fill
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 18456ddd463b..76200cd85fe7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -640,7 +640,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
- del_timer_sync(&dom->period_timer);
+ timer_delete_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
@@ -2229,7 +2229,7 @@ void laptop_sync_completion(void)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- del_timer(&bdi->laptop_mode_wb_timer);
+ timer_delete(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
@@ -2621,27 +2621,6 @@ int write_cache_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(write_cache_pages);
-static int writeback_use_writepage(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct folio *folio = NULL;
- struct blk_plug plug;
- int err;
-
- blk_start_plug(&plug);
- while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
- err = mapping->a_ops->writepage(&folio->page, wbc);
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- err = 0;
- }
- mapping_set_error(mapping, err);
- }
- blk_finish_plug(&plug);
-
- return err;
-}
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2652,14 +2631,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages) {
+ if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
- } else if (mapping->a_ops->writepage) {
- ret = writeback_use_writepage(mapping, wbc);
- } else {
+ else
/* deal with chardevs and other special files */
ret = 0;
- }
if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd6b865cb1ab..4f29e393f6af 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -290,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
-static bool cond_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
@@ -897,9 +898,7 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
-#ifdef CONFIG_PAGE_POOL
- ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
-#endif
+ page_pool_page_is_pp(page) |
(page->flags & check_flags)))
return false;
@@ -926,10 +925,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
-#ifdef CONFIG_PAGE_POOL
- if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
-#endif
return bad_reason;
}
@@ -1151,14 +1148,9 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
__pgalloc_tag_sub(page, nr);
}
-static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
+/* When tag is not NULL, assuming mem_alloc_profiling_enabled */
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
{
- struct alloc_tag *tag;
-
- if (!mem_alloc_profiling_enabled())
- return;
-
- tag = __pgalloc_tag_get(page);
if (tag)
this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}
@@ -1168,7 +1160,7 @@ static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
#endif /* CONFIG_MEM_ALLOC_PROFILING */
@@ -1400,11 +1392,12 @@ static void free_one_page(struct zone *zone, struct page *page,
struct llist_head *llhead;
unsigned long flags;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
add_page_to_zone_llist(zone, page, order);
return;
}
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
@@ -2182,23 +2175,15 @@ try_to_claim_block(struct zone *zone, struct page *page,
}
/*
- * Try finding a free buddy page on the fallback list.
- *
- * This will attempt to claim a whole pageblock for the requested type
- * to ensure grouping of such requests in the future.
- *
- * If a whole block cannot be claimed, steal an individual page, regressing to
- * __rmqueue_smallest() logic to at least break up as little contiguity as
- * possible.
+ * Try to allocate from some fallback migratetype by claiming the entire block,
+ * i.e. converting it to the allocation's start migratetype.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
- *
- * Return the stolen page, or NULL if none can be found.
*/
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2236,14 +2221,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
page = try_to_claim_block(zone, page, current_order, order,
start_migratetype, fallback_mt,
alloc_flags);
- if (page)
- goto got_one;
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
- if (alloc_flags & ALLOC_NOFRAGMENT)
- return NULL;
+ return NULL;
+}
+
+/*
+ * Try to steal a single page from some fallback migratetype. Leave the rest of
+ * the block as its current migratetype, potentially causing fragmentation.
+ */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
+ bool claim_block;
- /* No luck claiming pageblock. Find the smallest fallback page */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2253,25 +2253,28 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
page = get_page_from_free_area(area, fallback_mt);
page_del_and_expand(zone, page, order, current_order, fallback_mt);
- goto got_one;
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
}
return NULL;
-
-got_one:
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
-
- return page;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2290,16 +2293,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * First try the freelists of the requested migratetype, then try
+ * fallbacks modes with increasing levels of fragmentation risk.
+ *
+ * The fallback logic is expensive and rmqueue_bulk() calls in
+ * a loop with the zone->lock held, meaning the freelists are
+ * not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished preferred freelist, back to normal mode. */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
/*
@@ -2311,17 +2346,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
return 0;
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2937,15 +2974,18 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
return NULL;
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
+
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
@@ -3422,18 +3462,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
return false;
}
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx)
-{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
- if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
- free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
- free_pages);
-}
-
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
@@ -3580,7 +3608,7 @@ retry:
}
}
- cond_accept_memory(zone, order);
+ cond_accept_memory(zone, order, alloc_flags);
/*
* Detect whether the number of free pages is below high
@@ -3607,7 +3635,7 @@ check_alloc_wmark:
gfp_mask)) {
int ret;
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/*
@@ -3660,7 +3688,7 @@ try_this_zone:
return page;
} else {
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/* Try again if zone has deferred pages */
@@ -4530,6 +4558,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -4813,7 +4849,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto failed;
}
- cond_accept_memory(zone, 0);
+ cond_accept_memory(zone, 0, alloc_flags);
retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
@@ -4822,7 +4858,7 @@ retry_this_zone:
break;
}
- if (cond_accept_memory(zone, 0))
+ if (cond_accept_memory(zone, 0, alloc_flags))
goto retry_this_zone;
/* Try again if zone has deferred pages */
@@ -5029,11 +5065,13 @@ static void ___free_pages(struct page *page, unsigned int order,
{
/* get PageHead before we drop reference */
int head = PageHead(page);
+ /* get alloc tag in case the page is released by others */
+ struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
__free_frozen_pages(page, order, fpi_flags);
else if (!head) {
- pgalloc_tag_sub_pages(page, (1 << order) - 1);
+ pgalloc_tag_sub_pages(tag, (1 << order) - 1);
while (order-- > 0)
__free_frozen_pages(page + (1 << order), order,
fpi_flags);
@@ -7138,9 +7176,6 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
static int __init accept_memory_parse(char *p)
@@ -7167,11 +7202,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
static void __accept_page(struct zone *zone, unsigned long *flags,
struct page *page)
{
- bool last;
-
list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
__ClearPageUnaccepted(page);
@@ -7180,9 +7211,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags,
accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
}
void accept_page(struct page *page)
@@ -7219,20 +7247,17 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
long to_accept, wmark;
bool ret = false;
- if (!has_unaccepted_memory())
+ if (list_empty(&zone->unaccepted_pages))
return false;
- if (list_empty(&zone->unaccepted_pages))
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (alloc_flags & ALLOC_TRYLOCK)
return false;
wmark = promo_wmark_pages(zone);
@@ -7265,22 +7290,17 @@ static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
__SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
@@ -7291,7 +7311,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
return false;
}
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
return false;
}
@@ -7362,11 +7383,6 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
if (!pcp_allowed_order(order))
return NULL;
-#ifdef CONFIG_UNACCEPTED_MEMORY
- /* Bailout, since try_to_accept_memory_one() needs to take a lock */
- if (has_unaccepted_memory())
- return NULL;
-#endif
/* Bailout, since _deferred_grow_zone() needs to take a lock */
if (deferred_pages_enabled())
return NULL;
diff --git a/mm/page_io.c b/mm/page_io.c
index 4bce19df557b..f7716b6569fa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -237,9 +237,8 @@ static void swap_zeromap_folio_clear(struct folio *folio)
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
-int swap_writepage(struct page *page, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
int ret;
if (folio_free_swap(folio)) {
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 68109ee93841..4eeca782b888 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -218,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd)
WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}
-void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
+void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
+ unsigned int nr)
{
+ unsigned long stride = PMD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
page_table_check_pmd_flags(pmd);
- __page_table_check_pmd_clear(mm, *pmdp);
- if (pmd_user_accessible_page(pmd)) {
- page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
- pmd_write(pmd));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pmd_clear(mm, *(pmdp + i));
+ if (pmd_user_accessible_page(pmd))
+ page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
}
-EXPORT_SYMBOL(__page_table_check_pmd_set);
+EXPORT_SYMBOL(__page_table_check_pmds_set);
-void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
+void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
+ unsigned int nr)
{
+ unsigned long stride = PUD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
- __page_table_check_pud_clear(mm, *pudp);
- if (pud_user_accessible_page(pud)) {
- page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
- pud_write(pud));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pud_clear(mm, *(pudp + i));
+ if (pud_user_accessible_page(pud))
+ page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud));
}
-EXPORT_SYMBOL(__page_table_check_pud_set);
+EXPORT_SYMBOL(__page_table_check_puds_set);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
diff --git a/mm/readahead.c b/mm/readahead.c
index 6a4e96b69702..20d36d6b055e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
diff --git a/mm/shmem.c b/mm/shmem.c
index 99327c30507c..858cee02ca49 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
* inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
@@ -107,7 +107,7 @@ struct shmem_falloc {
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
- pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+ pgoff_t nr_unswapped; /* how often writeout refused to swap out */
};
struct shmem_options {
@@ -446,7 +446,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/*
* Special case: whereas normally shmem_recalc_inode() is called
* after i_mapping->nrpages has already been adjusted (up or down),
- * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * shmem_writeout() has to raise swapped before nrpages is lowered -
* to stop a racing shmem_recalc_inode() from thinking that a page has
* been freed. Compensate here, to avoid the need for a followup call.
*/
@@ -1536,12 +1536,15 @@ int shmem_unuse(unsigned int type)
return error;
}
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @wbc: How writeback is to be done
+ *
+ * Move the folio from the page cache to the swap cache.
*/
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1550,13 +1553,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
int nr_pages;
bool split = false;
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
@@ -1586,9 +1582,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+ if (split_folio_to_list(folio, wbc->list))
goto redirty;
- folio = page_folio(page);
folio_clear_dirty(folio);
}
@@ -1646,7 +1641,7 @@ try_split:
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- return swap_writepage(&folio->page, wbc);
+ return swap_writeout(folio, wbc);
}
list_del_init(&info->swaplist);
@@ -1660,6 +1655,7 @@ redirty:
folio_unlock(folio);
return 0;
}
+EXPORT_SYMBOL_GPL(shmem_writeout);
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -3768,7 +3764,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
index--;
/*
- * Inform shmem_writepage() how far we have reached.
+ * Inform shmem_writeout() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
if (!folio_test_uptodate(folio))
@@ -5191,7 +5187,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
}
static const struct address_space_operations shmem_aops = {
- .writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 6af13bcd2ab3..5acb51a9fc49 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -223,7 +223,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
+ 0UL,
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
@@ -341,7 +341,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
+ 0UL,
K(free_pcp),
K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
diff --git a/mm/slub.c b/mm/slub.c
index b46f87662e71..be8b09e09d30 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1973,6 +1973,11 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
__GFP_ACCOUNT | __GFP_NOFAIL)
+static inline void init_slab_obj_exts(struct slab *slab)
+{
+ slab->obj_exts = 0;
+}
+
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2023,8 +2028,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
return 0;
}
-/* Should be called only if mem_alloc_profiling_enabled() */
-static noinline void free_slab_obj_exts(struct slab *slab)
+static inline void free_slab_obj_exts(struct slab *slab)
{
struct slabobj_ext *obj_exts;
@@ -2044,20 +2048,12 @@ static noinline void free_slab_obj_exts(struct slab *slab)
slab->obj_exts = 0;
}
-static inline bool need_slab_obj_ext(void)
-{
- if (mem_alloc_profiling_enabled())
- return true;
+#else /* CONFIG_SLAB_OBJ_EXT */
- /*
- * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally
- * inside memcg_slab_post_alloc_hook. No other users for now.
- */
- return false;
+static inline void init_slab_obj_exts(struct slab *slab)
+{
}
-#else /* CONFIG_SLAB_OBJ_EXT */
-
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2068,11 +2064,6 @@ static inline void free_slab_obj_exts(struct slab *slab)
{
}
-static inline bool need_slab_obj_ext(void)
-{
- return false;
-}
-
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEM_ALLOC_PROFILING
@@ -2120,7 +2111,7 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
static inline void
alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
- if (need_slab_obj_ext())
+ if (mem_alloc_profiling_enabled())
__alloc_tagging_slab_alloc_hook(s, object, flags);
}
@@ -2592,8 +2583,12 @@ static __always_inline void account_slab(struct slab *slab, int order,
static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
- if (memcg_kmem_online() || need_slab_obj_ext())
- free_slab_obj_exts(slab);
+ /*
+ * The slab object extensions should now be freed regardless of
+ * whether mem_alloc_profiling_enabled() or not because profiling
+ * might have been disabled after slab->obj_exts got allocated.
+ */
+ free_slab_obj_exts(slab);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
@@ -2637,6 +2632,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;
+ init_slab_obj_exts(slab);
account_slab(slab, oo_order(oo), s, flags);
diff --git a/mm/swap.h b/mm/swap.h
index 6f4a3f927edb..aa62463976d5 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -20,7 +20,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
__swap_read_unplug(plug);
}
void swap_write_unplug(struct swap_iocb *sio);
-int swap_writepage(struct page *page, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct writeback_control *wbc);
void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
@@ -141,7 +141,7 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline int swap_writeout(struct folio *f, struct writeback_control *wbc)
{
return 0;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 68fd981b514f..ec2b1c9c9926 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -30,7 +30,6 @@
* vmscan's shrink_folio_list.
*/
static const struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2eff8b51a945..86643b181098 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1272,13 +1272,22 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
- VM_WARN_ON_ONCE(1);
- return -EINVAL;
+ if (order) {
+ /*
+ * Reject large allocation when THP_SWAP is disabled,
+ * the caller should split the folio and try again.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP))
+ return -EAGAIN;
+
+ /*
+ * Allocation size should never exceed cluster size
+ * (HPAGE_PMD_SIZE).
+ */
+ if (size > SWAPFILE_CLUSTER) {
+ VM_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
}
local_lock(&percpu_swap_cluster.lock);
@@ -2359,7 +2368,7 @@ retry:
* Limit the number of retries? No: when mmget_not_zero()
* above fails, that mm is likely to be freeing swap from
* exit_mmap(), which proceeds at its own independent pace;
- * and even shmem_writepage() could have been preempted after
+ * and even shmem_writeout() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
@@ -3323,6 +3332,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/*
+ * The swap subsystem needs a major overhaul to support this.
+ * It doesn't work yet so just disable it for now.
+ */
+ if (mapping_min_folio_order(mapping) > 0) {
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ /*
* Read the swap header.
*/
if (!mapping->a_ops->read_folio) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 5d98054094d1..f2aaf99f2990 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -191,6 +191,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
+ size_t size = folio_size(folio);
unsigned int offset, length;
struct page *split_at, *split_at2;
@@ -198,14 +199,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
offset = start - pos;
else
offset = 0;
- length = folio_size(folio);
- if (pos + length <= (u64)end)
- length = length - offset;
+ if (pos + size <= (u64)end)
+ length = size - offset;
else
length = end + 1 - pos - offset;
folio_wait_writeback(folio);
- if (length == folio_size(folio)) {
+ if (length == size) {
truncate_inode_folio(folio->mapping, folio);
return true;
}
@@ -224,16 +224,20 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
return true;
split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
- split_at2 = folio_page(folio,
- PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
-
if (!try_folio_split(folio, split_at, NULL)) {
/*
* try to split at offset + length to make sure folios within
* the range can be dropped, especially to avoid memory waste
* for shmem truncate
*/
- struct folio *folio2 = page_folio(split_at2);
+ struct folio *folio2;
+
+ if (offset + length == size)
+ goto no_split;
+
+ split_at2 = folio_page(folio,
+ PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+ folio2 = page_folio(split_at2);
if (!folio_try_get(folio2))
goto no_split;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index fbf2cf62ab9f..e0db855c89b4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1064,8 +1064,13 @@ static int move_present_pte(struct mm_struct *mm,
src_folio->index = linear_page_index(dst_vma, dst_addr);
orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+#endif
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
@@ -1100,6 +1105,9 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
+#endif
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1902,6 +1910,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
unsigned long end)
{
struct vm_area_struct *ret;
+ bool give_up_on_oom = false;
+
+ /*
+ * If we are modifying only and not splitting, just give up on the merge
+ * if OOM prevents us from merging successfully.
+ */
+ if (start == vma->vm_start && end == vma->vm_end)
+ give_up_on_oom = true;
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
@@ -1909,7 +1925,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, give_up_on_oom);
/*
* In the vma_merge() successful mprotect-like case 8:
@@ -1960,7 +1976,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
new_flags,
- (struct vm_userfaultfd_ctx){ctx});
+ (struct vm_userfaultfd_ctx){ctx},
+ /* give_up_on_oom = */false);
if (IS_ERR(vma))
return PTR_ERR(vma);
diff --git a/mm/vma.c b/mm/vma.c
index 5cdc5612bfc1..a468d4c29c0c 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -666,6 +666,9 @@ static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
/*
* Actually perform the VMA merge operation.
*
+ * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not
+ * modify any VMAs or cause inconsistent state should an OOM condition arise.
+ *
* Returns 0 on success, or an error value on failure.
*/
static int commit_merge(struct vma_merge_struct *vmg)
@@ -685,6 +688,12 @@ static int commit_merge(struct vma_merge_struct *vmg)
init_multi_vma_prep(&vp, vma, vmg);
+ /*
+ * If vmg->give_up_on_oom is set, we're safe, because we don't actually
+ * manipulate any VMAs until we succeed at preallocation.
+ *
+ * Past this point, we will not return an error.
+ */
if (vma_iter_prealloc(vmg->vmi, vma))
return -ENOMEM;
@@ -915,7 +924,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
if (anon_dup)
unlink_anon_vmas(anon_dup);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+ /*
+ * We've cleaned up any cloned anon_vma's, no VMAs have been
+ * modified, no harm no foul if the user requests that we not
+ * report this and just give up, leaving the VMAs unmerged.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -926,7 +941,15 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+
+ /*
+ * This means we have failed to clone anon_vma's correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
+ * user doesn't want this reported and instead just wants to give up on
+ * the merge, allow it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -1068,6 +1091,10 @@ int vma_expand(struct vma_merge_struct *vmg)
/* This should already have been checked by this point. */
VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
+ /*
+ * In this case we don't report OOM, so vmg->give_up_on_mm is
+ * safe.
+ */
ret = dup_anon_vma(middle, next, &anon_dup);
if (ret)
return ret;
@@ -1090,9 +1117,15 @@ int vma_expand(struct vma_merge_struct *vmg)
return 0;
nomem:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
if (anon_dup)
unlink_anon_vmas(anon_dup);
+ /*
+ * If the user requests that we just give upon OOM, we are safe to do so
+ * here, as commit merge provides this contract to us. Nothing has been
+ * changed - no harm no foul, just don't report it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return -ENOMEM;
}
@@ -1534,6 +1567,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
if (vmg_nomem(vmg))
return ERR_PTR(-ENOMEM);
+ /*
+ * Split can fail for reasons other than OOM, so if the user requests
+ * this it's probably a mistake.
+ */
+ VM_WARN_ON(vmg->give_up_on_oom &&
+ (vma->vm_start != start || vma->vm_end != end));
+
/* Split any preceding portion of the VMA. */
if (vma->vm_start < start) {
int err = split_vma(vmg->vmi, vma, start, 1);
@@ -1602,12 +1642,15 @@ struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
vmg.flags = new_flags;
vmg.uffd_ctx = new_ctx;
+ if (give_up_on_oom)
+ vmg.give_up_on_oom = true;
return vma_modify(&vmg);
}
@@ -1791,6 +1834,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
+ fixup_hugetlb_reservations(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
diff --git a/mm/vma.h b/mm/vma.h
index 7356ca5a22d3..149926e8a6d1 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -114,6 +114,12 @@ struct vma_merge_struct {
*/
bool just_expand :1;
+ /*
+ * If a merge is possible, but an OOM error occurs, give up and don't
+ * execute the merge, returning NULL.
+ */
+ bool give_up_on_oom :1;
+
/* Internal flags set during merge process: */
/*
@@ -255,7 +261,8 @@ __must_check struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx);
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom);
__must_check struct vm_area_struct
*vma_merge_new_range(struct vma_merge_struct *vmg);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3ed720a787ec..340edee108c0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
@@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
+ pte_t ptent;
+ unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ arch_enter_lazy_mmu_mode();
+
do {
- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_unmap_size(addr, pte);
+ if (size != PAGE_SIZE) {
+ if (WARN_ON(!IS_ALIGNED(addr, size))) {
+ addr = ALIGN_DOWN(addr, size);
+ pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
+ }
+ ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
+ if (WARN_ON(end - addr < size))
+ size = end - addr;
+ } else
+#endif
+ ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
}
@@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
if (cleared || pmd_bad(*pmd))
*mask |= PGTBL_PMD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PMD_SIZE);
continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next, mask);
@@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (cleared || pud_bad(*pud))
*mask |= PGTBL_PUD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PUD_SIZE);
continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next, mask);
@@ -497,6 +524,9 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
struct page *page = pages[*nr];
@@ -510,6 +540,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -1940,7 +1972,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
- vm->size = va_size(va);
+ vm->size = vm->requested_size = va_size(va);
vm->caller = caller;
va->vm = vm;
}
@@ -3133,6 +3165,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size,
area->flags = flags;
area->caller = caller;
+ area->requested_size = requested_size;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
@@ -3943,9 +3976,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof);
/**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
* @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
@@ -3954,13 +3988,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- NUMA_NO_NODE, __builtin_return_address(0));
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -4063,6 +4097,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof);
*/
void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
{
+ struct vm_struct *vm = NULL;
+ size_t alloced_size = 0;
size_t old_size = 0;
void *n;
@@ -4072,15 +4108,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
}
if (p) {
- struct vm_struct *vm;
-
vm = find_vm_area(p);
if (unlikely(!vm)) {
WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
return NULL;
}
- old_size = get_vm_area_size(vm);
+ alloced_size = get_vm_area_size(vm);
+ old_size = vm->requested_size;
+ if (WARN(alloced_size < old_size,
+ "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
+ return NULL;
}
/*
@@ -4088,11 +4126,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
* would be a good heuristic for when to shrink the vm_area?
*/
if (size <= old_size) {
- /* Zero out spare memory. */
- if (want_init_on_alloc(flags))
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
+ vm->requested_size = size;
kasan_poison_vmalloc(p + size, old_size - size);
- kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);
+ return (void *)p;
+ }
+
+ /*
+ * We already have the bytes available in the allocation; use them.
+ */
+ if (size <= alloced_size) {
+ kasan_unpoison_vmalloc(p + old_size, size - old_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
+ vm->requested_size = size;
return (void *)p;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b620d74b0f66..b6f4db6c240f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -648,21 +648,20 @@ typedef enum {
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
+ int (*writeout)(struct folio *, struct writeback_control *);
+
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to writeback filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -685,7 +684,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
+ if (shmem_mapping(mapping))
+ writeout = shmem_writeout;
+ else if (folio_test_anon(folio))
+ writeout = swap_writeout;
+ else
return PAGE_ACTIVATE;
if (folio_clear_dirty_for_io(folio)) {
@@ -708,7 +711,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
wbc.list = folio_list;
folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
+ res = writeout(folio, &wbc);
if (res < 0)
handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
@@ -717,7 +720,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
+ /* synchronous write? */
folio_clear_reclaim(folio);
}
trace_mm_vmscan_write_folio(folio);
@@ -6736,6 +6739,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* meet watermarks.
*/
for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
@@ -6746,11 +6750,33 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
/*
* In defrag_mode, watermarks must be met in whole
* blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
*/
- if (defrag_mode)
- free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
else
- free_pages = zone_page_state(zone, NR_FREE_PAGES);
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
0, free_pages))
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 961b270f023c..d14a7e317ac8 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1243,19 +1243,19 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle,
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- if (off + class->size <= PAGE_SIZE) {
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+
+ if (off + mem_len <= PAGE_SIZE) {
/* this object is contained entirely within a page */
void *dst = kmap_local_zpdesc(zpdesc);
- if (!ZsHugePage(zspage))
- off += ZS_HANDLE_SIZE;
memcpy(dst + off, handle_mem, mem_len);
kunmap_local(dst);
} else {
/* this object spans two pages */
size_t sizes[2];
- off += ZS_HANDLE_SIZE;
sizes[0] = PAGE_SIZE - off;
sizes[1] = mem_len - sizes[0];