Diffstat (limited to 'mm')
-rw-r--r--  mm/huge_memory.c      33
-rw-r--r--  mm/internal.h          2
-rw-r--r--  mm/kasan/common.c     12
-rw-r--r--  mm/kfence/core.c      14
-rw-r--r--  mm/memcontrol.c       40
-rw-r--r--  mm/memfd.c            14
-rw-r--r--  mm/memory.c           62
-rw-r--r--  mm/memory_hotplug.c   17
-rw-r--r--  mm/mempool.c         409
-rw-r--r--  mm/page_alloc.c       15
-rw-r--r--  mm/slab.h            112
-rw-r--r--  mm/slab_common.c      29
-rw-r--r--  mm/slub.c            694
-rw-r--r--  mm/sparse.c            3
-rw-r--r--  mm/usercopy.c         24
15 files changed, 811 insertions, 669 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6cba1cb14b23..1192e62531cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1642,17 +1642,30 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+/**
+ * touch_pmd - Mark page table pmd entry as accessed and dirty (for write)
+ * @vma: The VMA covering @addr
+ * @addr: The virtual address
+ * @pmd: pmd pointer into the page table mapping @addr
+ * @write: Whether it's a write access
+ *
+ * Return: whether the pmd entry is changed
+ */
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write)
{
- pmd_t _pmd;
+ pmd_t entry;
- _pmd = pmd_mkyoung(*pmd);
+ entry = pmd_mkyoung(*pmd);
if (write)
- _pmd = pmd_mkdirty(_pmd);
+ entry = pmd_mkdirty(entry);
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, write))
+ pmd, entry, write)) {
update_mmu_cache_pmd(vma, addr, pmd);
+ return true;
+ }
+
+ return false;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1842,18 +1855,14 @@ unlock:
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void huge_pmd_set_accessed(struct vm_fault *vmf)
+bool huge_pmd_set_accessed(struct vm_fault *vmf)
{
bool write = vmf->flags & FAULT_FLAG_WRITE;
- vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
- goto unlock;
-
- touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
+ return false;
-unlock:
- spin_unlock(vmf->ptl);
+ return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
}
static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
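With touch_pmd() and huge_pmd_set_accessed() now returning whether the entry changed, the PMD lock and the spurious-fault handling move to the caller. A condensed sketch of the resulting caller contract, mirroring the mm/memory.c hunk further down in this diff (no new names are introduced here):

    vmf.ptl = pmd_lock(mm, vmf.pmd);
    if (!huge_pmd_set_accessed(&vmf))
        /* entry was already young/dirty: possibly a spurious fault */
        fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
    spin_unlock(vmf.ptl);
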
diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..27ad37a41868 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1402,7 +1402,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
*/
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, bool write);
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write);
/*
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index d4c14359feaf..38e8bb0bf326 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -520,24 +520,20 @@ void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order,
bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
{
- struct folio *folio = virt_to_folio(ptr);
+ struct page *page = virt_to_page(ptr);
struct slab *slab;
- /*
- * This function can be called for large kmalloc allocation that get
- * their memory from page_alloc. Thus, the folio might not be a slab.
- */
- if (unlikely(!folio_test_slab(folio))) {
+ if (unlikely(PageLargeKmalloc(page))) {
if (check_page_allocation(ptr, ip))
return false;
- kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false);
+ kasan_poison(ptr, page_size(page), KASAN_PAGE_FREE, false);
return true;
}
if (is_kfence_address(ptr))
return true;
- slab = folio_slab(folio);
+ slab = page_slab(page);
if (check_slab_allocation(slab->slab_cache, ptr, ip))
return false;
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 727c20c94ac5..e62b5516bf48 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -612,14 +612,15 @@ static unsigned long kfence_init_pool(void)
* enters __slab_free() slow-path.
*/
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
- struct slab *slab;
+ struct page *page;
if (!i || (i % 2))
continue;
- slab = page_slab(pfn_to_page(start_pfn + i));
- __folio_set_slab(slab_folio(slab));
+ page = pfn_to_page(start_pfn + i);
+ __SetPageSlab(page);
#ifdef CONFIG_MEMCG
+ struct slab *slab = page_slab(page);
slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
MEMCG_DATA_OBJEXTS;
#endif
@@ -665,16 +666,17 @@ static unsigned long kfence_init_pool(void)
reset_slab:
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
- struct slab *slab;
+ struct page *page;
if (!i || (i % 2))
continue;
- slab = page_slab(pfn_to_page(start_pfn + i));
+ page = pfn_to_page(start_pfn + i);
#ifdef CONFIG_MEMCG
+ struct slab *slab = page_slab(page);
slab->obj_exts = 0;
#endif
- __folio_clear_slab(slab_folio(slab));
+ __ClearPageSlab(page);
}
return addr;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4deda33625f4..b46356da6c0e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2557,38 +2557,25 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
}
static __always_inline
-struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
+struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p)
{
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* slab->obj_exts.
*/
- if (folio_test_slab(folio)) {
- struct slabobj_ext *obj_exts;
- struct slab *slab;
- unsigned int off;
-
- slab = folio_slab(folio);
- obj_exts = slab_obj_exts(slab);
- if (!obj_exts)
- return NULL;
-
- off = obj_to_index(slab->slab_cache, slab, p);
- if (obj_exts[off].objcg)
- return obj_cgroup_memcg(obj_exts[off].objcg);
+ struct slabobj_ext *obj_exts;
+ unsigned int off;
+ obj_exts = slab_obj_exts(slab);
+ if (!obj_exts)
return NULL;
- }
- /*
- * folio_memcg_check() is used here, because in theory we can encounter
- * a folio where the slab flag has been cleared already, but
- * slab->obj_exts has not been freed yet
- * folio_memcg_check() will guarantee that a proper memory
- * cgroup pointer or NULL will be returned.
- */
- return folio_memcg_check(folio);
+ off = obj_to_index(slab->slab_cache, slab, p);
+ if (obj_exts[off].objcg)
+ return obj_cgroup_memcg(obj_exts[off].objcg);
+
+ return NULL;
}
/*
@@ -2602,10 +2589,15 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
*/
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
{
+ struct slab *slab;
+
if (mem_cgroup_disabled())
return NULL;
- return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+ slab = virt_to_slab(p);
+ if (slab)
+ return mem_cgroup_from_obj_slab(slab, p);
+ return folio_memcg_check(virt_to_folio(p));
}
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
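For illustration, a hedged sketch of how a caller sees the reworked lookup: the slab-vs-page dispatch now happens inside mem_cgroup_from_slab_obj(), so callers keep passing any slab object or other kernel pointer. The helper name below is hypothetical and only meant to show the calling convention; the rcu_read_lock() reflects the usual rule that the returned memcg pointer is only stable under RCU.

    /* Hypothetical debug helper, not part of this patch. */
    static void dump_obj_memcg(void *p)
    {
        struct mem_cgroup *memcg;

        rcu_read_lock();
        memcg = mem_cgroup_from_slab_obj(p);
        pr_debug("%p is charged to memcg %p\n", p, memcg);
        rcu_read_unlock();
    }
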
diff --git a/mm/memfd.c b/mm/memfd.c
index 805e297916e5..ab5312aff14b 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -460,6 +460,8 @@ static struct file *alloc_file(const char *name, unsigned int flags)
{
unsigned int *file_seals;
struct file *file;
+ struct inode *inode;
+ int err = 0;
if (flags & MFD_HUGETLB) {
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
@@ -471,12 +473,20 @@ static struct file *alloc_file(const char *name, unsigned int flags)
}
if (IS_ERR(file))
return file;
+
+ inode = file_inode(file);
+ err = security_inode_init_security_anon(inode,
+ &QSTR(MEMFD_ANON_NAME), NULL);
+ if (err) {
+ fput(file);
+ file = ERR_PTR(err);
+ return file;
+ }
+
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
if (flags & MFD_NOEXEC_SEAL) {
- struct inode *inode = file_inode(file);
-
inode->i_mode &= ~0111;
file_seals = memfd_file_seals_ptr(file);
if (file_seals) {
diff --git a/mm/memory.c b/mm/memory.c
index b59ae7ce42eb..aad432e71251 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6134,6 +6134,45 @@ split:
}
/*
+ * Page faults may be spurious because of racy access to the page
+ * table. For example, when a non-populated virtual page is accessed
+ * on 2 CPUs simultaneously, a page fault is triggered on both CPUs.
+ * However, one CPU (say CPU A) may be unable to find the reason for
+ * its page fault if the other CPU (say CPU B) has changed the page
+ * table before the PTE is checked on CPU A. Most of the time such
+ * spurious page faults can be ignored safely. However, if the page
+ * fault is for a write access, it's possible that a stale read-only
+ * TLB entry exists in the local CPU and needs to be flushed on some
+ * architectures. This is called spurious page fault fixing.
+ *
+ * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
+ * by default and is used as such on most architectures, while
+ * flush_tlb_fix_spurious_fault_pmd() is defined as a NOP by default
+ * and is likewise used as such on most architectures.
+ */
+static void fix_spurious_fault(struct vm_fault *vmf,
+ enum pgtable_level ptlevel)
+{
+ /* Skip spurious TLB flush for retried page fault */
+ if (vmf->flags & FAULT_FLAG_TRIED)
+ return;
+ /*
+ * This is needed only for protection faults but the arch code
+ * is not yet telling us if this is a protection fault or not.
+ * This still avoids useless tlb flushes for .text page faults
+ * with threads.
+ */
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (ptlevel == PGTABLE_LEVEL_PTE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+ vmf->pte);
+ else
+ flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
+ vmf->pmd);
+ }
+}
+/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
@@ -6214,23 +6253,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
- vmf->flags & FAULT_FLAG_WRITE)) {
+ vmf->flags & FAULT_FLAG_WRITE))
update_mmu_cache_range(vmf, vmf->vma, vmf->address,
vmf->pte, 1);
- } else {
- /* Skip spurious TLB flush for retried page fault */
- if (vmf->flags & FAULT_FLAG_TRIED)
- goto unlock;
- /*
- * This is needed only for protection faults but the arch code
- * is not yet telling us if this is a protection fault or not.
- * This still avoids useless tlb flushes for .text page faults
- * with threads.
- */
- if (vmf->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
- vmf->pte);
- }
+ else
+ fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -6327,7 +6354,10 @@ retry_pud:
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&vmf);
+ vmf.ptl = pmd_lock(mm, vmf.pmd);
+ if (!huge_pmd_set_accessed(&vmf))
+ fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
+ spin_unlock(vmf.ptl);
return 0;
}
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0be83039c3b5..238a6712738e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1088,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone, bool mhp_off_inaccessible)
+ struct zone *zone)
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
@@ -1097,15 +1097,6 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (ret)
return ret;
- /*
- * Memory block is accessible at this stage and hence poison the struct
- * pages now. If the memory block is accessible during memory hotplug
- * addition phase, then page poisining is already performed in
- * sparse_add_section().
- */
- if (mhp_off_inaccessible)
- page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
-
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
false);
@@ -1444,7 +1435,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
}
static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
- u64 start, u64 size, mhp_t mhp_flags)
+ u64 start, u64 size)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1460,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
};
mhp_altmap.free = memory_block_memmap_on_memory_pages();
- if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
- mhp_altmap.inaccessible = true;
params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
GFP_KERNEL);
if (!params.altmap) {
@@ -1555,7 +1544,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
*/
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
mhp_supports_memmap_on_memory()) {
- ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
+ ret = create_altmaps_and_memory_blocks(nid, group, start, size);
if (ret)
goto error;
} else {
diff --git a/mm/mempool.c b/mm/mempool.c
index d7bbf1189db9..c290e5261b47 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/mm/mempool.c
- *
* memory buffer pool support. Such pools are mostly used
* for guaranteed, deadlock-free memory allocations during
* extreme VM load.
@@ -9,7 +7,7 @@
* started by Ingo Molnar, Copyright (C) 2001
* debugging by David Rientjes, Copyright (C) 2015
*/
-
+#include <linux/fault-inject.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
@@ -20,8 +18,27 @@
#include <linux/writeback.h>
#include "slab.h"
+static DECLARE_FAULT_ATTR(fail_mempool_alloc);
+static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk);
+
+static int __init mempool_fault_inject_init(void)
+{
+ int error;
+
+ error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc",
+ NULL, &fail_mempool_alloc));
+ if (error)
+ return error;
+
+ /* booting will fail on error return here, don't bother to clean up */
+ return PTR_ERR_OR_ZERO(
+ fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL,
+ &fail_mempool_alloc_bulk));
+}
+late_initcall(mempool_fault_inject_init);
+
#ifdef CONFIG_SLUB_DEBUG_ON
-static void poison_error(mempool_t *pool, void *element, size_t size,
+static void poison_error(struct mempool *pool, void *element, size_t size,
size_t byte)
{
const int nr = pool->curr_nr;
@@ -38,7 +55,7 @@ static void poison_error(mempool_t *pool, void *element, size_t size,
dump_stack();
}
-static void __check_element(mempool_t *pool, void *element, size_t size)
+static void __check_element(struct mempool *pool, void *element, size_t size)
{
u8 *obj = element;
size_t i;
@@ -54,7 +71,7 @@ static void __check_element(mempool_t *pool, void *element, size_t size)
memset(obj, POISON_INUSE, size);
}
-static void check_element(mempool_t *pool, void *element)
+static void check_element(struct mempool *pool, void *element)
{
/* Skip checking: KASAN might save its metadata in the element. */
if (kasan_enabled())
@@ -93,7 +110,7 @@ static void __poison_element(void *element, size_t size)
obj[size - 1] = POISON_END;
}
-static void poison_element(mempool_t *pool, void *element)
+static void poison_element(struct mempool *pool, void *element)
{
/* Skip poisoning: KASAN might save its metadata in the element. */
if (kasan_enabled())
@@ -124,15 +141,16 @@ static void poison_element(mempool_t *pool, void *element)
}
}
#else /* CONFIG_SLUB_DEBUG_ON */
-static inline void check_element(mempool_t *pool, void *element)
+static inline void check_element(struct mempool *pool, void *element)
{
}
-static inline void poison_element(mempool_t *pool, void *element)
+static inline void poison_element(struct mempool *pool, void *element)
{
}
#endif /* CONFIG_SLUB_DEBUG_ON */
-static __always_inline bool kasan_poison_element(mempool_t *pool, void *element)
+static __always_inline bool kasan_poison_element(struct mempool *pool,
+ void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
return kasan_mempool_poison_object(element);
@@ -142,7 +160,7 @@ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element)
return true;
}
-static void kasan_unpoison_element(mempool_t *pool, void *element)
+static void kasan_unpoison_element(struct mempool *pool, void *element)
{
if (pool->alloc == mempool_kmalloc)
kasan_mempool_unpoison_object(element, (size_t)pool->pool_data);
@@ -154,7 +172,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
(unsigned long)pool->pool_data);
}
-static __always_inline void add_element(mempool_t *pool, void *element)
+static __always_inline void add_element(struct mempool *pool, void *element)
{
BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
poison_element(pool, element);
@@ -162,7 +180,7 @@ static __always_inline void add_element(mempool_t *pool, void *element)
pool->elements[pool->curr_nr++] = element;
}
-static void *remove_element(mempool_t *pool)
+static void *remove_element(struct mempool *pool)
{
void *element = pool->elements[--pool->curr_nr];
@@ -183,7 +201,7 @@ static void *remove_element(mempool_t *pool)
* May be called on a zeroed but uninitialized mempool (i.e. allocated with
* kzalloc()).
*/
-void mempool_exit(mempool_t *pool)
+void mempool_exit(struct mempool *pool)
{
while (pool->curr_nr) {
void *element = remove_element(pool);
@@ -202,7 +220,7 @@ EXPORT_SYMBOL(mempool_exit);
* Free all reserved elements in @pool and @pool itself. This function
* only sleeps if the free_fn() function sleeps.
*/
-void mempool_destroy(mempool_t *pool)
+void mempool_destroy(struct mempool *pool)
{
if (unlikely(!pool))
return;
@@ -212,9 +230,9 @@ void mempool_destroy(mempool_t *pool)
}
EXPORT_SYMBOL(mempool_destroy);
-int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
- mempool_free_t *free_fn, void *pool_data,
- gfp_t gfp_mask, int node_id)
+int mempool_init_node(struct mempool *pool, int min_nr,
+ mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+ void *pool_data, gfp_t gfp_mask, int node_id)
{
spin_lock_init(&pool->lock);
pool->min_nr = min_nr;
@@ -264,8 +282,9 @@ EXPORT_SYMBOL(mempool_init_node);
*
* Return: %0 on success, negative error code otherwise.
*/
-int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
- mempool_free_t *free_fn, void *pool_data)
+int mempool_init_noprof(struct mempool *pool, int min_nr,
+ mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+ void *pool_data)
{
return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
pool_data, GFP_KERNEL, NUMA_NO_NODE);
@@ -291,11 +310,11 @@ EXPORT_SYMBOL(mempool_init_noprof);
*
* Return: pointer to the created memory pool object or %NULL on error.
*/
-mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
- mempool_free_t *free_fn, void *pool_data,
- gfp_t gfp_mask, int node_id)
+struct mempool *mempool_create_node_noprof(int min_nr,
+ mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
+ void *pool_data, gfp_t gfp_mask, int node_id)
{
- mempool_t *pool;
+ struct mempool *pool;
pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
if (!pool)
@@ -329,7 +348,7 @@ EXPORT_SYMBOL(mempool_create_node_noprof);
*
* Return: %0 on success, negative error code otherwise.
*/
-int mempool_resize(mempool_t *pool, int new_min_nr)
+int mempool_resize(struct mempool *pool, int new_min_nr)
{
void *element;
void **new_elements;
@@ -391,140 +410,227 @@ out:
}
EXPORT_SYMBOL(mempool_resize);
-/**
- * mempool_alloc - allocate an element from a specific memory pool
- * @pool: pointer to the memory pool which was allocated via
- * mempool_create().
- * @gfp_mask: the usual allocation bitmask.
- *
- * this function only sleeps if the alloc_fn() function sleeps or
- * returns NULL. Note that due to preallocation, this function
- * *never* fails when called from process contexts. (it might
- * fail if called from an IRQ context.)
- * Note: using __GFP_ZERO is not supported.
- *
- * Return: pointer to the allocated element or %NULL on error.
- */
-void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask)
+static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems,
+ unsigned int count, unsigned int allocated,
+ gfp_t gfp_mask)
{
- void *element;
unsigned long flags;
- wait_queue_entry_t wait;
- gfp_t gfp_temp;
+ unsigned int i;
- VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
- might_alloc(gfp_mask);
-
- gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
- gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
- gfp_mask |= __GFP_NOWARN; /* failures are OK */
+ spin_lock_irqsave(&pool->lock, flags);
+ if (unlikely(pool->curr_nr < count - allocated))
+ goto fail;
+ for (i = 0; i < count; i++) {
+ if (!elems[i]) {
+ elems[i] = remove_element(pool);
+ allocated++;
+ }
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
- gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
+ /* Paired with rmb in mempool_free(), read comment there. */
+ smp_wmb();
-repeat_alloc:
+ /*
+ * Update the allocation stack trace as this is more useful for
+ * debugging.
+ */
+ for (i = 0; i < count; i++)
+ kmemleak_update_trace(elems[i]);
+ return allocated;
- element = pool->alloc(gfp_temp, pool->pool_data);
- if (likely(element != NULL))
- return element;
+fail:
+ if (gfp_mask & __GFP_DIRECT_RECLAIM) {
+ DEFINE_WAIT(wait);
- spin_lock_irqsave(&pool->lock, flags);
- if (likely(pool->curr_nr)) {
- element = remove_element(pool);
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irqrestore(&pool->lock, flags);
- /* paired with rmb in mempool_free(), read comment there */
- smp_wmb();
+
/*
- * Update the allocation stack trace as this is more useful
- * for debugging.
+ * Wait for someone else to return an element to @pool, but wake
+ * up occasionally as memory pressure might have eased in the
+ * meantime and the normal allocation in alloc_fn could succeed
+ * even if no element was returned.
*/
- kmemleak_update_trace(element);
- return element;
- }
-
- /*
- * We use gfp mask w/o direct reclaim or IO for the first round. If
- * alloc failed with that and @pool was empty, retry immediately.
- */
- if (gfp_temp != gfp_mask) {
+ io_schedule_timeout(5 * HZ);
+ finish_wait(&pool->wait, &wait);
+ } else {
+ /* We must not sleep if __GFP_DIRECT_RECLAIM is not set. */
spin_unlock_irqrestore(&pool->lock, flags);
- gfp_temp = gfp_mask;
- goto repeat_alloc;
}
- /* We must not sleep if !__GFP_DIRECT_RECLAIM */
- if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
- spin_unlock_irqrestore(&pool->lock, flags);
- return NULL;
- }
+ return allocated;
+}
- /* Let's wait for someone else to return an element to @pool */
- init_wait(&wait);
- prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+/*
+ * Adjust the gfp flags for mempool allocations, as we never want to dip into
+ * the global emergency reserves or retry in the page allocator.
+ *
+ * The first pass also doesn't want to enter direct reclaim, but later passes
+ * do, so return a separate subset of the flags for that first iteration.
+ */
+static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask)
+{
+ *gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
+}
- spin_unlock_irqrestore(&pool->lock, flags);
+/**
+ * mempool_alloc_bulk - allocate multiple elements from a memory pool
+ * @pool: pointer to the memory pool
+ * @elems: partially or fully populated elements array
+ * @count: number of entries in @elems to be populated
+ * @allocated: number of entries in @elems already allocated
+ *
+ * Allocate an element for each slot in @elems that is still %NULL. This is
+ * done by first calling into the alloc_fn supplied at pool initialization
+ * time, and dipping into the reserved pool when alloc_fn fails to allocate
+ * an element.
+ *
+ * On return all @count elements in @elems will be populated.
+ *
+ * Return: always 0; the return type is non-void only to accommodate alloc
+ * tags.
+ */
+int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems,
+ unsigned int count, unsigned int allocated)
+{
+ gfp_t gfp_mask = GFP_KERNEL;
+ gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask);
+ unsigned int i = 0;
+
+ VM_WARN_ON_ONCE(count > pool->min_nr);
+ might_alloc(gfp_mask);
/*
- * FIXME: this should be io_schedule(). The timeout is there as a
- * workaround for some DM problems in 2.6.18.
+ * If an error is injected, fail all elements in a bulk allocation so
+ * that we stress the path where multiple elements are missing.
*/
- io_schedule_timeout(5*HZ);
+ if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) {
+ pr_info("forcing mempool usage for %pS\n",
+ (void *)_RET_IP_);
+ goto use_pool;
+ }
+
+repeat_alloc:
+ /*
+ * Try to allocate the elements using the allocation callback first as
+ * that might succeed even when the caller's bulk allocation did not.
+ */
+ for (i = 0; i < count; i++) {
+ if (elems[i])
+ continue;
+ elems[i] = pool->alloc(gfp_temp, pool->pool_data);
+ if (unlikely(!elems[i]))
+ goto use_pool;
+ allocated++;
+ }
+
+ return 0;
- finish_wait(&pool->wait, &wait);
+use_pool:
+ allocated = mempool_alloc_from_pool(pool, elems, count, allocated,
+ gfp_temp);
+ gfp_temp = gfp_mask;
goto repeat_alloc;
}
-EXPORT_SYMBOL(mempool_alloc_noprof);
+EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof);
/**
- * mempool_alloc_preallocated - allocate an element from preallocated elements
- * belonging to a specific memory pool
- * @pool: pointer to the memory pool which was allocated via
- * mempool_create().
+ * mempool_alloc - allocate an element from a memory pool
+ * @pool: pointer to the memory pool
+ * @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported.
*
- * This function is similar to mempool_alloc, but it only attempts allocating
- * an element from the preallocated elements. It does not sleep and immediately
- * returns if no preallocated elements are available.
+ * Allocate an element from @pool. This is done by first calling into the
+ * alloc_fn supplied at pool initialization time, and dipping into the reserved
+ * pool when alloc_fn fails to allocate an element.
*
- * Return: pointer to the allocated element or %NULL if no elements are
- * available.
+ * This function only sleeps if the alloc_fn callback sleeps, or when waiting
+ * for elements to become available in the pool.
+ *
+ * Return: pointer to the allocated element or %NULL when failing to allocate
+ * an element. Allocation failure can only happen when @gfp_mask does not
+ * include %__GFP_DIRECT_RECLAIM.
*/
-void *mempool_alloc_preallocated(mempool_t *pool)
+void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask)
{
+ gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask);
void *element;
- unsigned long flags;
- spin_lock_irqsave(&pool->lock, flags);
- if (likely(pool->curr_nr)) {
- element = remove_element(pool);
- spin_unlock_irqrestore(&pool->lock, flags);
- /* paired with rmb in mempool_free(), read comment there */
- smp_wmb();
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+ might_alloc(gfp_mask);
+
+repeat_alloc:
+ if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) {
+ pr_info("forcing mempool usage for %pS\n",
+ (void *)_RET_IP_);
+ element = NULL;
+ } else {
+ element = pool->alloc(gfp_temp, pool->pool_data);
+ }
+
+ if (unlikely(!element)) {
/*
- * Update the allocation stack trace as this is more useful
- * for debugging.
+ * Try to allocate an element from the pool.
+ *
+ * The first pass won't have __GFP_DIRECT_RECLAIM and won't
+ * sleep in mempool_alloc_from_pool. Retry the allocation
+ * with all flags set in that case.
*/
- kmemleak_update_trace(element);
- return element;
+ if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) {
+ if (gfp_temp != gfp_mask) {
+ gfp_temp = gfp_mask;
+ goto repeat_alloc;
+ }
+ if (gfp_mask & __GFP_DIRECT_RECLAIM) {
+ goto repeat_alloc;
+ }
+ }
}
- spin_unlock_irqrestore(&pool->lock, flags);
- return NULL;
+ return element;
+}
+EXPORT_SYMBOL(mempool_alloc_noprof);
+
+/**
+ * mempool_alloc_preallocated - allocate an element from preallocated elements
+ * belonging to a memory pool
+ * @pool: pointer to the memory pool
+ *
+ * This function is similar to mempool_alloc(), but it only attempts allocating
+ * an element from the preallocated elements. It only takes a single spinlock_t
+ * and immediately returns if no preallocated elements are available.
+ *
+ * Return: pointer to the allocated element or %NULL if no elements are
+ * available.
+ */
+void *mempool_alloc_preallocated(struct mempool *pool)
+{
+ void *element = NULL;
+
+ mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT);
+ return element;
}
EXPORT_SYMBOL(mempool_alloc_preallocated);
/**
- * mempool_free - return an element to the pool.
- * @element: pool element pointer.
- * @pool: pointer to the memory pool which was allocated via
- * mempool_create().
+ * mempool_free_bulk - return elements to a mempool
+ * @pool: pointer to the memory pool
+ * @elems: elements to return
+ * @count: number of elements to return
*
- * this function only sleeps if the free_fn() function sleeps.
+ * Transfers a number of elements from the start of @elems to @pool if @pool
+ * needs replenishing. The remaining elements are left in @elems for the
+ * caller to free.
+ *
+ * Return: number of elements transferred to @pool. Elements are always
+ * transferred from the beginning of @elems, so the return value can be used
+ * as an offset into @elems when freeing the remaining elements in the caller.
*/
-void mempool_free(void *element, mempool_t *pool)
+unsigned int mempool_free_bulk(struct mempool *pool, void **elems,
+ unsigned int count)
{
unsigned long flags;
-
- if (unlikely(element == NULL))
- return;
+ unsigned int freed = 0;
+ bool added = false;
/*
* Paired with the wmb in mempool_alloc(). The preceding read is
@@ -558,21 +664,6 @@ void mempool_free(void *element, mempool_t *pool)
* Waiters happen iff curr_nr is 0 and the above guarantee also
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
- */
- if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
- spin_lock_irqsave(&pool->lock, flags);
- if (likely(pool->curr_nr < pool->min_nr)) {
- add_element(pool, element);
- spin_unlock_irqrestore(&pool->lock, flags);
- if (wq_has_sleeper(&pool->wait))
- wake_up(&pool->wait);
- return;
- }
- spin_unlock_irqrestore(&pool->lock, flags);
- }
-
- /*
- * Handle the min_nr = 0 edge case:
*
* For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds,
* so waiters sleeping on pool->wait would never be woken by the
@@ -580,20 +671,45 @@ void mempool_free(void *element, mempool_t *pool)
* allocation of element when both min_nr and curr_nr are 0, and
* any active waiters are properly awakened.
*/
- if (unlikely(pool->min_nr == 0 &&
+ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
+ spin_lock_irqsave(&pool->lock, flags);
+ while (pool->curr_nr < pool->min_nr && freed < count) {
+ add_element(pool, elems[freed++]);
+ added = true;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ } else if (unlikely(pool->min_nr == 0 &&
READ_ONCE(pool->curr_nr) == 0)) {
+ /* Handle the min_nr = 0 edge case: */
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr == 0)) {
- add_element(pool, element);
- spin_unlock_irqrestore(&pool->lock, flags);
- if (wq_has_sleeper(&pool->wait))
- wake_up(&pool->wait);
- return;
+ add_element(pool, elems[freed++]);
+ added = true;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
- pool->free(element, pool->pool_data);
+ if (unlikely(added) && wq_has_sleeper(&pool->wait))
+ wake_up(&pool->wait);
+
+ return freed;
+}
+EXPORT_SYMBOL_GPL(mempool_free_bulk);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element: element to return
+ * @pool: pointer to the memory pool
+ *
+ * Returns @element to @pool if it needs replenishing, else frees it using
+ * the free_fn callback in @pool.
+ *
+ * This function only sleeps if the free_fn callback sleeps.
+ */
+void mempool_free(void *element, struct mempool *pool)
+{
+ if (likely(element) && !mempool_free_bulk(pool, &element, 1))
+ pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);
@@ -632,19 +748,6 @@ void mempool_kfree(void *element, void *pool_data)
}
EXPORT_SYMBOL(mempool_kfree);
-void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t)pool_data;
- return kvmalloc(size, gfp_mask);
-}
-EXPORT_SYMBOL(mempool_kvmalloc);
-
-void mempool_kvfree(void *element, void *pool_data)
-{
- kvfree(element);
-}
-EXPORT_SYMBOL(mempool_kvfree);
-
/*
* A simple mempool-backed page allocator that allocates pages
* of the order specified by pool_data.
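Since the new bulk interfaces are only described by their kernel-doc, here is a minimal usage sketch. It assumes a slab-backed pool; my_pool, my_cache and NR_ELEMS are hypothetical (NR_ELEMS must not exceed the pool's min_nr), and the mempool_alloc_bulk() name is assumed to be the usual alloc_hooks() wrapper around mempool_alloc_bulk_noprof():

    void *elems[NR_ELEMS] = {};    /* only NULL slots get populated */
    unsigned int freed, i;

    /* May sleep; on return all NR_ELEMS slots are filled. */
    mempool_alloc_bulk(my_pool, elems, NR_ELEMS, 0);

    /* ... use the elements ... */

    /* Give the reserve pool first pick, then free the rest ourselves. */
    freed = mempool_free_bulk(my_pool, elems, NR_ELEMS);
    for (i = freed; i < NR_ELEMS; i++)
        kmem_cache_free(my_cache, elems[i]);

The fail_mempool_alloc and fail_mempool_alloc_bulk attributes added at the top of the file hook into the standard fault-injection framework, so the usual debugfs knobs (probability, times, ...) can be used to force the reserve-pool paths during testing.
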
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ed82ee55e66a..4074c07d02ca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4977,13 +4977,18 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
* @nr_pages: The number of pages desired in the array
* @page_array: Array to store the pages
*
- * This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly. Pages are added to the page_array.
+ * This is a batched version of the page allocator that attempts to allocate
+ * @nr_pages quickly. Pages are added to @page_array.
*
- * Note that only NULL elements are populated with pages and nr_pages
- * is the maximum number of pages that will be stored in the array.
+ * Note that only the elements in @page_array that were cleared to %NULL on
+ * entry are populated with newly allocated pages. @nr_pages is the maximum
+ * number of pages that will be stored in the array.
*
- * Returns the number of pages in the array.
+ * Returns the number of pages in @page_array, including ones already
+ * allocated on entry. This can be less than the number requested in @nr_pages,
+ * but all empty slots are filled from the beginning. I.e., if all slots in
+ * @page_array were set to %NULL on entry, the slots from 0 to the return value
+ * - 1 will be filled.
*/
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
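A short sketch of the NULL-slot contract spelled out in the comment above. The array size is arbitrary, and the alloc_pages_bulk() wrapper with a (gfp, nr_pages, page_array) signature is assumed to be the usual caller-facing name for the _noprof function:

    struct page *pages[8] = {};    /* NULL slots are the ones to fill */
    unsigned long nr;

    nr = alloc_pages_bulk(GFP_KERNEL, ARRAY_SIZE(pages), pages);
    /*
     * nr counts populated slots from the start of the array, including
     * any that were already non-NULL on entry; slots nr..7 remain NULL
     * and the caller may retry or fall back to single-page allocation.
     */
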
diff --git a/mm/slab.h b/mm/slab.h
index 078daecc7cf5..f730e012553c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -40,13 +40,29 @@ typedef u64 freelist_full_t;
* Freelist pointer and counter to cmpxchg together, avoids the typical ABA
* problems with cmpxchg of just a pointer.
*/
-typedef union {
- struct {
- void *freelist;
- unsigned long counter;
+struct freelist_counters {
+ union {
+ struct {
+ void *freelist;
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ /*
+ * If slab debugging is enabled then the
+ * frozen bit can be reused to indicate
+ * that the slab was corrupted
+ */
+ unsigned frozen:1;
+ };
+ };
+ };
+#ifdef system_has_freelist_aba
+ freelist_full_t freelist_counters;
+#endif
};
- freelist_full_t full;
-} freelist_aba_t;
+};
/* Reuses the bits in struct page */
struct slab {
@@ -69,27 +85,7 @@ struct slab {
#endif
};
/* Double-word boundary */
- union {
- struct {
- void *freelist; /* first free object */
- union {
- unsigned long counters;
- struct {
- unsigned inuse:16;
- unsigned objects:15;
- /*
- * If slab debugging is enabled then the
- * frozen bit can be reused to indicate
- * that the slab was corrupted
- */
- unsigned frozen:1;
- };
- };
- };
-#ifdef system_has_freelist_aba
- freelist_aba_t freelist_counter;
-#endif
- };
+ struct freelist_counters;
};
struct rcu_head rcu_head;
};
@@ -114,23 +110,10 @@ SLAB_MATCH(_unused_slab_obj_exts, obj_exts);
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
#if defined(system_has_freelist_aba)
-static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
+static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist_counters)));
#endif
/**
- * folio_slab - Converts from folio to slab.
- * @folio: The folio.
- *
- * Currently struct slab is a different representation of a folio where
- * folio_test_slab() is true.
- *
- * Return: The slab which contains this folio.
- */
-#define folio_slab(folio) (_Generic((folio), \
- const struct folio *: (const struct slab *)(folio), \
- struct folio *: (struct slab *)(folio)))
-
-/**
* slab_folio - The folio allocated for a slab
* @s: The slab.
*
@@ -146,20 +129,24 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
struct slab *: (struct folio *)s))
/**
- * page_slab - Converts from first struct page to slab.
- * @p: The first (either head of compound or single) page of slab.
- *
- * A temporary wrapper to convert struct page to struct slab in situations where
- * we know the page is the compound head, or single order-0 page.
- *
- * Long-term ideally everything would work with struct slab directly or go
- * through folio to struct slab.
+ * page_slab - Converts from struct page to its slab.
+ * @page: A page which may or may not belong to a slab.
*
- * Return: The slab which contains this page
+ * Return: The slab which contains this page, or NULL if the page does
+ * not belong to a slab. Pages returned from large kmalloc also yield NULL.
*/
-#define page_slab(p) (_Generic((p), \
- const struct page *: (const struct slab *)(p), \
- struct page *: (struct slab *)(p)))
+static inline struct slab *page_slab(const struct page *page)
+{
+ unsigned long head;
+
+ head = READ_ONCE(page->compound_head);
+ if (head & 1)
+ page = (struct page *)(head - 1);
+ if (data_race(page->page_type >> 24) != PGTY_slab)
+ page = NULL;
+
+ return (struct slab *)page;
+}
/**
* slab_page - The first struct page allocated for a slab
@@ -188,12 +175,7 @@ static inline pg_data_t *slab_pgdat(const struct slab *slab)
static inline struct slab *virt_to_slab(const void *addr)
{
- struct folio *folio = virt_to_folio(addr);
-
- if (!folio_test_slab(folio))
- return NULL;
-
- return folio_slab(folio);
+ return page_slab(virt_to_page(addr));
}
static inline int slab_order(const struct slab *slab)
@@ -236,10 +218,8 @@ struct kmem_cache_order_objects {
* Slab cache management.
*/
struct kmem_cache {
-#ifndef CONFIG_SLUB_TINY
struct kmem_cache_cpu __percpu *cpu_slab;
struct lock_class_key lock_key;
-#endif
struct slub_percpu_sheaves __percpu *cpu_sheaves;
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
@@ -601,6 +581,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
return s->size;
}
+static inline unsigned int large_kmalloc_order(const struct page *page)
+{
+ return page[1].flags.f & 0xff;
+}
+
+static inline size_t large_kmalloc_size(const struct page *page)
+{
+ return PAGE_SIZE << large_kmalloc_order(page);
+}
+
#ifdef CONFIG_SLUB_DEBUG
void dump_unreclaimable_slab(void);
#else
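To show how the new helpers compose, here is a hedged sketch of size lookup for a pointer known to come from kmalloc() or kmem_cache_alloc(); it mirrors the reworked __ksize() in mm/slab_common.c below, and the helper name is hypothetical:

    /* Hypothetical helper; assumes obj is a valid slab or large-kmalloc pointer. */
    static size_t obj_alloc_size(const void *obj)
    {
        const struct page *page = virt_to_page(obj);

        if (PageLargeKmalloc(page))
            return large_kmalloc_size(page);    /* size from page order */

        return slab_ksize(page_slab(page)->slab_cache);
    }

page_slab() returns NULL for anything that is not a slab page (including large kmalloc pages), so callers that cannot make the above assumption must check for NULL, as virt_to_slab() users do.
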
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 932d13ada36c..84dfff4f7b1f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -997,26 +997,27 @@ void __init create_kmalloc_caches(void)
*/
size_t __ksize(const void *object)
{
- struct folio *folio;
+ const struct page *page;
+ const struct slab *slab;
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
- folio = virt_to_folio(object);
+ page = virt_to_page(object);
- if (unlikely(!folio_test_slab(folio))) {
- if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
- return 0;
- if (WARN_ON(object != folio_address(folio)))
- return 0;
- return folio_size(folio);
- }
+ if (unlikely(PageLargeKmalloc(page)))
+ return large_kmalloc_size(page);
+
+ slab = page_slab(page);
+ /* Delete this after we're sure there are no users */
+ if (WARN_ON(!slab))
+ return page_size(page);
#ifdef CONFIG_SLUB_DEBUG
- skip_orig_size_check(folio_slab(folio)->slab_cache, object);
+ skip_orig_size_check(slab->slab_cache, object);
#endif
- return slab_ksize(folio_slab(folio)->slab_cache);
+ return slab_ksize(slab->slab_cache);
}
gfp_t kmalloc_fix_flags(gfp_t flags)
@@ -1614,17 +1615,15 @@ static void kfree_rcu_work(struct work_struct *work)
static bool kfree_rcu_sheaf(void *obj)
{
struct kmem_cache *s;
- struct folio *folio;
struct slab *slab;
if (is_vmalloc_addr(obj))
return false;
- folio = virt_to_folio(obj);
- if (unlikely(!folio_test_slab(folio)))
+ slab = virt_to_slab(obj);
+ if (unlikely(!slab))
return false;
- slab = folio_slab(folio);
s = slab->slab_cache;
if (s->cpu_sheaves) {
if (likely(!IS_ENABLED(CONFIG_NUMA) ||
diff --git a/mm/slub.c b/mm/slub.c
index a0b905c2a557..2acce22590f8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -410,19 +410,22 @@ enum stat_item {
NR_SLUB_STAT_ITEMS
};
-#ifndef CONFIG_SLUB_TINY
-/*
- * When changing the layout, make sure freelist and tid are still compatible
- * with this_cpu_cmpxchg_double() alignment requirements.
- */
-struct kmem_cache_cpu {
+struct freelist_tid {
union {
struct {
- void **freelist; /* Pointer to next available object */
+ void *freelist; /* Pointer to next available object */
unsigned long tid; /* Globally unique transaction id */
};
- freelist_aba_t freelist_tid;
+ freelist_full_t freelist_tid;
};
+};
+
+/*
+ * When changing the layout, make sure freelist and tid are still compatible
+ * with this_cpu_cmpxchg_double() alignment requirements.
+ */
+struct kmem_cache_cpu {
+ struct freelist_tid;
struct slab *slab; /* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct slab *partial; /* Partially allocated slabs */
@@ -432,7 +435,6 @@ struct kmem_cache_cpu {
unsigned int stat[NR_SLUB_STAT_ITEMS];
#endif
};
-#endif /* CONFIG_SLUB_TINY */
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
@@ -469,7 +471,10 @@ struct slab_sheaf {
struct rcu_head rcu_head;
struct list_head barn_list;
/* only used for prefilled sheafs */
- unsigned int capacity;
+ struct {
+ unsigned int capacity;
+ bool pfmemalloc;
+ };
};
struct kmem_cache *cache;
unsigned int size;
@@ -594,12 +599,10 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
return freelist_ptr_decode(s, p, ptr_addr);
}
-#ifndef CONFIG_SLUB_TINY
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
prefetchw(object + s->offset);
}
-#endif
/*
* When running under KMSAN, get_freepointer_safe() may return an uninitialized
@@ -711,10 +714,12 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
return s->cpu_partial_slabs;
}
#else
+#ifdef SLAB_SUPPORTS_SYSFS
static inline void
slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
{
}
+#endif
static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
{
@@ -755,32 +760,29 @@ static __always_inline void slab_unlock(struct slab *slab)
}
static inline bool
-__update_freelist_fast(struct slab *slab,
- void *freelist_old, unsigned long counters_old,
- void *freelist_new, unsigned long counters_new)
+__update_freelist_fast(struct slab *slab, struct freelist_counters *old,
+ struct freelist_counters *new)
{
#ifdef system_has_freelist_aba
- freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
- freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
-
- return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
+ return try_cmpxchg_freelist(&slab->freelist_counters,
+ &old->freelist_counters,
+ new->freelist_counters);
#else
return false;
#endif
}
static inline bool
-__update_freelist_slow(struct slab *slab,
- void *freelist_old, unsigned long counters_old,
- void *freelist_new, unsigned long counters_new)
+__update_freelist_slow(struct slab *slab, struct freelist_counters *old,
+ struct freelist_counters *new)
{
bool ret = false;
slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
+ if (slab->freelist == old->freelist &&
+ slab->counters == old->counters) {
+ slab->freelist = new->freelist;
+ slab->counters = new->counters;
ret = true;
}
slab_unlock(slab);
@@ -796,22 +798,18 @@ __update_freelist_slow(struct slab *slab,
* interrupt the operation.
*/
static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
- void *freelist_old, unsigned long counters_old,
- void *freelist_new, unsigned long counters_new,
- const char *n)
+ struct freelist_counters *old, struct freelist_counters *new, const char *n)
{
bool ret;
if (USE_LOCKLESS_FAST_PATH())
lockdep_assert_irqs_disabled();
- if (s->flags & __CMPXCHG_DOUBLE) {
- ret = __update_freelist_fast(slab, freelist_old, counters_old,
- freelist_new, counters_new);
- } else {
- ret = __update_freelist_slow(slab, freelist_old, counters_old,
- freelist_new, counters_new);
- }
+ if (s->flags & __CMPXCHG_DOUBLE)
+ ret = __update_freelist_fast(slab, old, new);
+ else
+ ret = __update_freelist_slow(slab, old, new);
+
if (likely(ret))
return true;
@@ -826,21 +824,17 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla
}
static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
- void *freelist_old, unsigned long counters_old,
- void *freelist_new, unsigned long counters_new,
- const char *n)
+ struct freelist_counters *old, struct freelist_counters *new, const char *n)
{
bool ret;
if (s->flags & __CMPXCHG_DOUBLE) {
- ret = __update_freelist_fast(slab, freelist_old, counters_old,
- freelist_new, counters_new);
+ ret = __update_freelist_fast(slab, old, new);
} else {
unsigned long flags;
local_irq_save(flags);
- ret = __update_freelist_slow(slab, freelist_old, counters_old,
- freelist_new, counters_new);
+ ret = __update_freelist_slow(slab, old, new);
local_irq_restore(flags);
}
if (likely(ret))
@@ -978,7 +972,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
static slab_flags_t slub_debug;
#endif
-static char *slub_debug_string;
+static const char *slub_debug_string __ro_after_init;
static int disable_higher_order_debug;
/*
@@ -1785,8 +1779,8 @@ static inline int free_consistency_checks(struct kmem_cache *s,
*
* returns the start of next block if there's any, or NULL
*/
-static char *
-parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
+static const char *
+parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init)
{
bool higher_order_disable = false;
@@ -1863,17 +1857,17 @@ check_slabs:
return NULL;
}
-static int __init setup_slub_debug(char *str)
+static int __init setup_slub_debug(const char *str, const struct kernel_param *kp)
{
slab_flags_t flags;
slab_flags_t global_flags;
- char *saved_str;
- char *slab_list;
+ const char *saved_str;
+ const char *slab_list;
bool global_slub_debug_changed = false;
bool slab_list_specified = false;
global_flags = DEBUG_DEFAULT_FLAGS;
- if (*str++ != '=' || !*str)
+ if (!str || !*str)
/*
* No options specified. Switch on full debugging.
*/
@@ -1917,11 +1911,15 @@ out:
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
- return 1;
+ return 0;
}
-__setup("slab_debug", setup_slub_debug);
-__setup_param("slub_debug", slub_debug, setup_slub_debug, 0);
+static const struct kernel_param_ops param_ops_slab_debug __initconst = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = setup_slub_debug,
+};
+__core_param_cb(slab_debug, &param_ops_slab_debug, NULL, 0);
+__core_param_cb(slub_debug, &param_ops_slab_debug, NULL, 0);
/*
* kmem_cache_flags - apply debugging options to the cache
@@ -1935,9 +1933,9 @@ __setup_param("slub_debug", slub_debug, setup_slub_debug, 0);
*/
slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
- char *iter;
+ const char *iter;
size_t len;
- char *next_block;
+ const char *next_block;
slab_flags_t block_flags;
slab_flags_t slub_debug_local = slub_debug;
@@ -1961,7 +1959,7 @@ slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
continue;
/* Found a block that has a slab list, search it */
while (*iter) {
- char *end, *glob;
+ const char *end, *glob;
size_t cmplen;
end = strchrnul(iter, ',');
@@ -2023,15 +2021,21 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-#ifndef CONFIG_SLUB_TINY
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
void **freelist, void *nextfree)
{
return false;
}
-#endif
#endif /* CONFIG_SLUB_DEBUG */
+/*
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
+ */
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
+ __GFP_ACCOUNT | __GFP_NOFAIL)
+
#ifdef CONFIG_SLAB_OBJ_EXT
#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
@@ -2086,14 +2090,6 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
-/*
- * The allocated objcg pointers array is not accounted directly.
- * Moreover, it should not come from DMA buffer and is not readily
- * reclaimable. So those GFP bits should be masked off.
- */
-#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
- __GFP_ACCOUNT | __GFP_NOFAIL)
-
static inline void init_slab_obj_exts(struct slab *slab)
{
slab->obj_exts = 0;
@@ -2373,33 +2369,34 @@ bool memcg_slab_post_charge(void *p, gfp_t flags)
{
struct slabobj_ext *slab_exts;
struct kmem_cache *s;
- struct folio *folio;
+ struct page *page;
struct slab *slab;
unsigned long off;
- folio = virt_to_folio(p);
- if (!folio_test_slab(folio)) {
+ page = virt_to_page(p);
+ if (PageLargeKmalloc(page)) {
+ unsigned int order;
int size;
- if (folio_memcg_kmem(folio))
+ if (PageMemcgKmem(page))
return true;
- if (__memcg_kmem_charge_page(folio_page(folio, 0), flags,
- folio_order(folio)))
+ order = large_kmalloc_order(page);
+ if (__memcg_kmem_charge_page(page, flags, order))
return false;
/*
- * This folio has already been accounted in the global stats but
+ * This page has already been accounted in the global stats but
* not in the memcg stats. So, subtract from the global and use
* the interface which adds to both global and memcg stats.
*/
- size = folio_size(folio);
- node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size);
- lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size);
+ size = PAGE_SIZE << order;
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size);
return true;
}
- slab = folio_slab(folio);
+ slab = page_slab(page);
s = slab->slab_cache;
/*
@@ -2601,8 +2598,24 @@ static void *setup_object(struct kmem_cache *s, void *object)
static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
{
- struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects,
- s->sheaf_capacity), gfp);
+ struct slab_sheaf *sheaf;
+ size_t sheaf_size;
+
+ if (gfp & __GFP_NO_OBJ_EXT)
+ return NULL;
+
+ gfp &= ~OBJCGS_CLEAR_MASK;
+
+ /*
+ * Prevent recursion to the same cache, or a deep stack of kmallocs of
+ * varying sizes (sheaf capacity might differ for each kmalloc size
+ * bucket)
+ */
+ if (s->flags & SLAB_KMALLOC)
+ gfp |= __GFP_NO_OBJ_EXT;
+
+ sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity);
+ sheaf = kzalloc(sheaf_size, gfp);
if (unlikely(!sheaf))
return NULL;
@@ -2655,7 +2668,7 @@ static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
if (!sheaf)
return NULL;
- if (refill_sheaf(s, sheaf, gfp)) {
+ if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) {
free_empty_sheaf(s, sheaf);
return NULL;
}
@@ -2733,12 +2746,13 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
sheaf->size = 0;
}
-static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
+static bool __rcu_free_sheaf_prepare(struct kmem_cache *s,
struct slab_sheaf *sheaf)
{
bool init = slab_want_init_on_free(s);
void **p = &sheaf->objects[0];
unsigned int i = 0;
+ bool pfmemalloc = false;
while (i < sheaf->size) {
struct slab *slab = virt_to_slab(p[i]);
@@ -2751,8 +2765,13 @@ static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
continue;
}
+ if (slab_test_pfmemalloc(slab))
+ pfmemalloc = true;
+
i++;
}
+
+ return pfmemalloc;
}
static void rcu_free_sheaf_nobarn(struct rcu_head *head)
@@ -3015,14 +3034,11 @@ static void barn_init(struct node_barn *barn)
static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
{
- struct list_head empty_list;
- struct list_head full_list;
+ LIST_HEAD(empty_list);
+ LIST_HEAD(full_list);
struct slab_sheaf *sheaf, *sheaf2;
unsigned long flags;
- INIT_LIST_HEAD(&empty_list);
- INIT_LIST_HEAD(&full_list);
-
spin_lock_irqsave(&barn->lock, flags);
list_splice_init(&barn->sheaves_full, &full_list);
@@ -3048,24 +3064,24 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
struct kmem_cache_order_objects oo,
bool allow_spin)
{
- struct folio *folio;
+ struct page *page;
struct slab *slab;
unsigned int order = oo_order(oo);
if (unlikely(!allow_spin))
- folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
+ page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
node, order);
else if (node == NUMA_NO_NODE)
- folio = (struct folio *)alloc_frozen_pages(flags, order);
+ page = alloc_frozen_pages(flags, order);
else
- folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
+ page = __alloc_frozen_pages(flags, order, node, NULL);
- if (!folio)
+ if (!page)
return NULL;
- slab = folio_slab(folio);
- __folio_set_slab(folio);
- if (folio_is_pfmemalloc(folio))
+ __SetPageSlab(page);
+ slab = page_slab(page);
+ if (page_is_pfmemalloc(page))
slab_set_pfmemalloc(slab);
return slab;
@@ -3289,16 +3305,16 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
static void __free_slab(struct kmem_cache *s, struct slab *slab)
{
- struct folio *folio = slab_folio(slab);
- int order = folio_order(folio);
+ struct page *page = slab_page(slab);
+ int order = compound_order(page);
int pages = 1 << order;
__slab_clear_pfmemalloc(slab);
- folio->mapping = NULL;
- __folio_clear_slab(folio);
+ page->mapping = NULL;
+ __ClearPageSlab(page);
mm_account_reclaimed_pages(pages);
unaccount_slab(slab, order, s);
- free_frozen_pages(&folio->page, order);
+ free_frozen_pages(page, order);
}
static void rcu_free_slab(struct rcu_head *h)
@@ -3618,8 +3634,6 @@ static struct slab *get_partial(struct kmem_cache *s, int node,
return get_any_partial(s, pc);
}
-#ifndef CONFIG_SLUB_TINY
-
#ifdef CONFIG_PREEMPTION
/*
* Calculate the next globally unique transaction for disambiguation
@@ -3723,8 +3737,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
unsigned long flags = 0;
- struct slab new;
- struct slab old;
+ struct freelist_counters old, new;
if (READ_ONCE(slab->freelist)) {
stat(s, DEACTIVATE_REMOTE_FREES);
@@ -3773,10 +3786,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
} else {
new.freelist = old.freelist;
}
- } while (!slab_update_freelist(s, slab,
- old.freelist, old.counters,
- new.freelist, new.counters,
- "unfreezing slab"));
+ } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab"));
/*
* Stage three: Manipulate the slab list based on the updated state.
@@ -4019,12 +4029,6 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s)
return c->slab || slub_percpu_partial(c);
}
-#else /* CONFIG_SLUB_TINY */
-static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
-static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; }
-static inline void flush_this_cpu_slab(struct kmem_cache *s) { }
-#endif /* CONFIG_SLUB_TINY */
-
static bool has_pcs_used(int cpu, struct kmem_cache *s)
{
struct slub_percpu_sheaves *pcs;
@@ -4365,17 +4369,16 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
return true;
}
-#ifndef CONFIG_SLUB_TINY
static inline bool
__update_cpu_freelist_fast(struct kmem_cache *s,
void *freelist_old, void *freelist_new,
unsigned long tid)
{
- freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
- freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
+ struct freelist_tid old = { .freelist = freelist_old, .tid = tid };
+ struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) };
- return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
- &old.full, new.full);
+ return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid,
+ &old.freelist_tid, new.freelist_tid);
}
/*
@@ -4388,27 +4391,24 @@ __update_cpu_freelist_fast(struct kmem_cache *s,
*/
static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
- struct slab new;
- unsigned long counters;
- void *freelist;
+ struct freelist_counters old, new;
lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
do {
- freelist = slab->freelist;
- counters = slab->counters;
+ old.freelist = slab->freelist;
+ old.counters = slab->counters;
+
+ new.freelist = NULL;
+ new.counters = old.counters;
- new.counters = counters;
+ new.inuse = old.objects;
+ new.frozen = old.freelist != NULL;
- new.inuse = slab->objects;
- new.frozen = freelist != NULL;
- } while (!__slab_update_freelist(s, slab,
- freelist, counters,
- NULL, new.counters,
- "get_freelist"));
+ } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist"));
- return freelist;
+ return old.freelist;
}
/*
@@ -4416,26 +4416,22 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
*/
static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
{
- struct slab new;
- unsigned long counters;
- void *freelist;
+ struct freelist_counters old, new;
do {
- freelist = slab->freelist;
- counters = slab->counters;
+ old.freelist = slab->freelist;
+ old.counters = slab->counters;
- new.counters = counters;
+ new.freelist = NULL;
+ new.counters = old.counters;
VM_BUG_ON(new.frozen);
- new.inuse = slab->objects;
+ new.inuse = old.objects;
new.frozen = 1;
- } while (!slab_update_freelist(s, slab,
- freelist, counters,
- NULL, new.counters,
- "freeze_slab"));
+ } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab"));
- return freelist;
+ return old.freelist;
}
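For readers following the conversion: every retry loop above now has the same shape, which is to snapshot the slab's freelist and counters into old, derive new purely from that snapshot, and let slab_update_freelist() (typically a double-word cmpxchg) retry until no other CPU raced with the update. The following is a minimal user-space sketch of that shape, with a packed 64-bit word and an object index standing in for the real freelist pointer and counters bitfields; it is an illustration, not kernel code.

/*
 * Hypothetical user-space sketch of the snapshot/derive/retry pattern used
 * by get_freelist() and freeze_slab() above.  A single atomic 64-bit word
 * stands in for the freelist/counters pair updated by slab_update_freelist().
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct state { uint32_t freelist_idx; uint32_t inuse; };

static _Atomic uint64_t slab_word;

static uint64_t pack(struct state s)
{
        return ((uint64_t)s.freelist_idx << 32) | s.inuse;
}

static struct state unpack(uint64_t w)
{
        return (struct state){ .freelist_idx = w >> 32, .inuse = (uint32_t)w };
}

/* take the whole freelist, marking all objects in use (cf. get_freelist()) */
static uint32_t take_freelist(uint32_t objects)
{
        uint64_t old_w = atomic_load(&slab_word);
        struct state old, new;

        do {
                old = unpack(old_w);            /* snapshot the current state */
                new.freelist_idx = 0;           /* NULL-like: nothing left free */
                new.inuse = objects;
        } while (!atomic_compare_exchange_weak(&slab_word, &old_w, pack(new)));

        return old.freelist_idx;                /* the head we atomically took */
}

int main(void)
{
        atomic_store(&slab_word, pack((struct state){ .freelist_idx = 7, .inuse = 3 }));
        printf("grabbed freelist head %u\n", take_freelist(16));
        return 0;
}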
/*
@@ -4629,7 +4625,7 @@ new_objects:
pc.orig_size = orig_size;
slab = get_partial(s, node, &pc);
if (slab) {
- if (kmem_cache_debug(s)) {
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
freelist = pc.object;
/*
* For debug caches here we had to go through
@@ -4667,7 +4663,7 @@ new_objects:
stat(s, ALLOC_SLAB);
- if (kmem_cache_debug(s)) {
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
if (unlikely(!freelist)) {
@@ -4879,32 +4875,6 @@ redo:
return object;
}
-#else /* CONFIG_SLUB_TINY */
-static void *__slab_alloc_node(struct kmem_cache *s,
- gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
-{
- struct partial_context pc;
- struct slab *slab;
- void *object;
-
- pc.flags = gfpflags;
- pc.orig_size = orig_size;
- slab = get_partial(s, node, &pc);
-
- if (slab)
- return pc.object;
-
- slab = new_slab(s, gfpflags, node);
- if (unlikely(!slab)) {
- slab_out_of_memory(s, gfpflags, node);
- return NULL;
- }
-
- object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
-
- return object;
-}
-#endif /* CONFIG_SLUB_TINY */
/*
* If the object has been wiped upon free, make sure it's fully initialized by
@@ -5045,7 +5015,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
return NULL;
if (empty) {
- if (!refill_sheaf(s, empty, gfp)) {
+ if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) {
full = empty;
} else {
/*
@@ -5156,7 +5126,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
* be false because of cpu migration during an unlocked part of
* the current allocation or previous freeing process.
*/
- if (folio_nid(virt_to_folio(object)) != node) {
+ if (page_to_nid(virt_to_page(object)) != node) {
local_unlock(&s->cpu_sheaves->lock);
return NULL;
}
@@ -5345,6 +5315,26 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod
}
EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
+static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
+ struct slab_sheaf *sheaf, gfp_t gfp)
+{
+ int ret = 0;
+
+ ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC);
+
+ if (likely(!ret || !gfp_pfmemalloc_allowed(gfp)))
+ return ret;
+
+ /*
+ * if pfmemalloc is allowed, retry the refill with it, but remember that so
+ * the sheaf is flushed rather than reused when it is returned
+ */
+ ret = refill_sheaf(s, sheaf, gfp);
+ sheaf->pfmemalloc = true;
+
+ return ret;
+}
+
/*
* returns a sheaf that has at least the requested size
* when prefilling is needed, do so with given gfp flags
@@ -5379,6 +5369,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
sheaf->cache = s;
sheaf->capacity = size;
+ /*
+ * we do not need to care about pfmemalloc here because oversize
+ * sheaves are always flushed and freed when returned
+ */
if (!__kmem_cache_alloc_bulk(s, gfp, size,
&sheaf->objects[0])) {
kfree(sheaf);
@@ -5415,17 +5409,18 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
if (!sheaf)
sheaf = alloc_empty_sheaf(s, gfp);
- if (sheaf && sheaf->size < size) {
- if (refill_sheaf(s, sheaf, gfp)) {
+ if (sheaf) {
+ sheaf->capacity = s->sheaf_capacity;
+ sheaf->pfmemalloc = false;
+
+ if (sheaf->size < size &&
+ __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) {
sheaf_flush_unused(s, sheaf);
free_empty_sheaf(s, sheaf);
sheaf = NULL;
}
}
- if (sheaf)
- sheaf->capacity = s->sheaf_capacity;
-
return sheaf;
}
@@ -5445,7 +5440,8 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
struct slub_percpu_sheaves *pcs;
struct node_barn *barn;
- if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
+ if (unlikely((sheaf->capacity != s->sheaf_capacity)
+ || sheaf->pfmemalloc)) {
sheaf_flush_unused(s, sheaf);
kfree(sheaf);
return;
@@ -5511,7 +5507,7 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
if (likely(sheaf->capacity >= size)) {
if (likely(sheaf->capacity == s->sheaf_capacity))
- return refill_sheaf(s, sheaf, gfp);
+ return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
&sheaf->objects[sheaf->size])) {
@@ -5544,6 +5540,9 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
*
* The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT
* memcg charging is forced over limit if necessary, to avoid failure.
+ *
+ * It is possible that the allocation is served by kfence, in which case the
+ * sheaf size is not decreased.
*/
void *
kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
@@ -5555,7 +5554,10 @@ kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
if (sheaf->size == 0)
goto out;
- ret = sheaf->objects[--sheaf->size];
+ ret = kfence_alloc(s, s->object_size, gfp);
+
+ if (likely(!ret))
+ ret = sheaf->objects[--sheaf->size];
init = slab_want_init_on_alloc(gfp, s);
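For context on why the pfmemalloc flag only matters on this path, here is a rough caller-side sketch of the prefilled-sheaf API. It uses the function names visible in the hunks above (callers would normally go through the non-_noprof wrappers, and the third parameter of the alloc/return helpers is assumed to be the sheaf, as the truncated signatures suggest); my_cache and nr_objs are invented for illustration.

/* hedged sketch of a sheaf user; error handling trimmed */
static int process_batch(struct kmem_cache *my_cache, unsigned int nr_objs)
{
        struct slab_sheaf *sheaf;
        void *obj;

        /* guarantees at least nr_objs allocations can be served from the sheaf */
        sheaf = kmem_cache_prefill_sheaf(my_cache, GFP_KERNEL, nr_objs);
        if (!sheaf)
                return -ENOMEM;

        while (nr_objs--) {
                obj = kmem_cache_alloc_from_sheaf_noprof(my_cache, GFP_KERNEL, sheaf);
                if (!obj)
                        break;          /* only possible past the prefilled count */
                /* ... use obj ... */
        }

        /* leftover objects are recycled, or flushed if the sheaf saw pfmemalloc */
        kmem_cache_return_sheaf(my_cache, GFP_KERNEL, sheaf);
        return 0;
}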
@@ -5578,7 +5580,7 @@ unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
*/
static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- struct folio *folio;
+ struct page *page;
void *ptr = NULL;
unsigned int order = get_order(size);
@@ -5588,15 +5590,15 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
flags |= __GFP_COMP;
if (node == NUMA_NO_NODE)
- folio = (struct folio *)alloc_frozen_pages_noprof(flags, order);
+ page = alloc_frozen_pages_noprof(flags, order);
else
- folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL);
+ page = __alloc_frozen_pages_noprof(flags, order, node, NULL);
- if (folio) {
- ptr = folio_address(folio);
- lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
+ if (page) {
+ ptr = page_address(page);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
- __folio_set_large_kmalloc(folio);
+ __SetPageLargeKmalloc(page);
}
ptr = kasan_kmalloc_large(ptr, size, flags);
@@ -5723,9 +5725,7 @@ retry:
* it did local_lock_irqsave(&s->cpu_slab->lock, flags).
* In this case fast path with __update_cpu_freelist_fast() is not safe.
*/
-#ifndef CONFIG_SLUB_TINY
if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
-#endif
ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
if (PTR_ERR(ret) == -EBUSY) {
@@ -5863,10 +5863,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
unsigned long addr)
{
- void *prior;
- int was_frozen;
- struct slab new;
- unsigned long counters;
+ bool was_frozen, was_full;
+ struct freelist_counters old, new;
struct kmem_cache_node *n = NULL;
unsigned long flags;
bool on_node_partial;
@@ -5878,20 +5876,43 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
return;
}
+ /*
+ * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below
+ * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s)
+ * is the only other reason it can be false, and it is already handled
+ * above.
+ */
+
do {
if (unlikely(n)) {
spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
- prior = slab->freelist;
- counters = slab->counters;
- set_freepointer(s, tail, prior);
- new.counters = counters;
- was_frozen = new.frozen;
+
+ old.freelist = slab->freelist;
+ old.counters = slab->counters;
+
+ was_full = (old.freelist == NULL);
+ was_frozen = old.frozen;
+
+ set_freepointer(s, tail, old.freelist);
+
+ new.freelist = head;
+ new.counters = old.counters;
new.inuse -= cnt;
- if ((!new.inuse || !prior) && !was_frozen) {
- /* Needs to be taken off a list */
- if (!kmem_cache_has_cpu_partial(s) || prior) {
+
+ /*
+ * The slab might need to be taken off the partial list (because it
+ * became empty) or added to it (because it is no longer full).
+ * Unless it's frozen.
+ */
+ if ((!new.inuse || was_full) && !was_frozen) {
+ /*
+ * If the slab becomes non-full and we have cpu partial
+ * lists, we put it there unconditionally to avoid
+ * taking the list_lock. Otherwise we need the lock.
+ */
+ if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) {
n = get_node(s, slab_nid(slab));
/*
@@ -5908,10 +5929,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
}
}
- } while (!slab_update_freelist(s, slab,
- prior, counters,
- head, new.counters,
- "__slab_free"));
+ } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free"));
if (likely(!n)) {
@@ -5921,7 +5939,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
* activity can be necessary.
*/
stat(s, FREE_FROZEN);
- } else if (kmem_cache_has_cpu_partial(s) && !prior) {
+ } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) {
/*
* If we started with a full slab then put it onto the
* per cpu partial list.
@@ -5930,6 +5948,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
stat(s, CPU_PARTIAL_FREE);
}
+ /*
+ * In other cases we didn't take the list_lock because the slab
+ * was already on the partial list and will remain there.
+ */
+
return;
}
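The branch above is the only point where the free slowpath takes n->list_lock. Distilled into a standalone predicate, purely for illustration (cpu_partial_enabled stands in for IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL)), the decision is:

/* illustrative-only distillation of when __slab_free() above takes list_lock */
static inline bool free_needs_list_lock(bool was_frozen, bool was_full,
                                        bool now_empty, bool cpu_partial_enabled)
{
        if (was_frozen)
                return false;   /* the owning CPU handles list placement */
        if (!now_empty && !was_full)
                return false;   /* slab stays wherever it currently is */
        if (cpu_partial_enabled && was_full)
                return false;   /* it goes to the per cpu partial list instead */
        return true;            /* partial list add/remove needed under the lock */
}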
@@ -5937,19 +5960,24 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
* This slab was partially empty but not on the per-node partial list,
* in which case we shouldn't manipulate its list, just return.
*/
- if (prior && !on_node_partial) {
+ if (!was_full && !on_node_partial) {
spin_unlock_irqrestore(&n->list_lock, flags);
return;
}
+ /*
+ * If the slab became empty, should we add/keep it on the partial list,
+ * or do we already have enough partial slabs?
+ */
if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
goto slab_empty;
/*
* Objects left in the slab. If it was not on the partial list before
- * then add it.
+ * then add it. This can only happen when the cache has no per cpu partial
+ * list; otherwise we would have put it there.
*/
- if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
+ if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) {
add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -5957,10 +5985,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
return;
slab_empty:
- if (prior) {
- /*
- * Slab on the partial list.
- */
+ /*
+ * The slab could have a single object and thus go from full to empty in
+ * a single free, but more likely it was on the partial list. Remove it.
+ */
+ if (likely(!was_full)) {
remove_partial(n, slab);
stat(s, FREE_REMOVE_PARTIAL);
}
@@ -6185,8 +6214,12 @@ static void rcu_free_sheaf(struct rcu_head *head)
* handles it fine. The only downside is that sheaf will serve fewer
* allocations when reused. It only happens due to debugging, which is a
* performance hit anyway.
+ *
+ * If it returns true, there was at least one object from a pfmemalloc
+ * slab, so simply flush everything.
*/
- __rcu_free_sheaf_prepare(s, sheaf);
+ if (__rcu_free_sheaf_prepare(s, sheaf))
+ goto flush;
n = get_node(s, sheaf->node);
if (!n)
@@ -6339,7 +6372,8 @@ next_remote_batch:
continue;
}
- if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
+ if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)
+ || slab_test_pfmemalloc(slab))) {
remote_objects[remote_nr] = p[i];
p[i] = p[--size];
if (++remote_nr >= PCS_BATCH_MAX)
@@ -6487,14 +6521,10 @@ static void free_deferred_objects(struct irq_work *work)
llist_for_each_safe(pos, t, llnode) {
struct slab *slab = container_of(pos, struct slab, llnode);
-#ifdef CONFIG_SLUB_TINY
- free_slab(slab->slab_cache, slab);
-#else
if (slab->frozen)
deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
else
free_slab(slab->slab_cache, slab);
-#endif
}
}
@@ -6530,7 +6560,6 @@ void defer_free_barrier(void)
irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
}
-#ifndef CONFIG_SLUB_TINY
/*
* Fastpath with forced inlining to produce a kfree and kmem_cache_free that
* can perform fastpath freeing without additional function calls.
@@ -6623,14 +6652,6 @@ redo:
}
stat_add(s, FREE_FASTPATH, cnt);
}
-#else /* CONFIG_SLUB_TINY */
-static void do_slab_free(struct kmem_cache *s,
- struct slab *slab, void *head, void *tail,
- int cnt, unsigned long addr)
-{
- __slab_free(s, slab, head, tail, cnt, addr);
-}
-#endif /* CONFIG_SLUB_TINY */
static __fastpath_inline
void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
@@ -6643,7 +6664,8 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
return;
if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
- slab_nid(slab) == numa_mem_id())) {
+ slab_nid(slab) == numa_mem_id())
+ && likely(!slab_test_pfmemalloc(slab))) {
if (likely(free_to_pcs(s, object)))
return;
}
@@ -6753,12 +6775,12 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
}
EXPORT_SYMBOL(kmem_cache_free);
-static void free_large_kmalloc(struct folio *folio, void *object)
+static void free_large_kmalloc(struct page *page, void *object)
{
- unsigned int order = folio_order(folio);
+ unsigned int order = compound_order(page);
- if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
- dump_page(&folio->page, "Not a kmalloc allocation");
+ if (WARN_ON_ONCE(!PageLargeKmalloc(page))) {
+ dump_page(page, "Not a kmalloc allocation");
return;
}
@@ -6769,10 +6791,10 @@ static void free_large_kmalloc(struct folio *folio, void *object)
kasan_kfree_large(object);
kmsan_kfree_large(object);
- lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
- __folio_clear_large_kmalloc(folio);
- free_frozen_pages(&folio->page, order);
+ __ClearPageLargeKmalloc(page);
+ free_frozen_pages(page, order);
}
/*
@@ -6782,7 +6804,7 @@ static void free_large_kmalloc(struct folio *folio, void *object)
void kvfree_rcu_cb(struct rcu_head *head)
{
void *obj = head;
- struct folio *folio;
+ struct page *page;
struct slab *slab;
struct kmem_cache *s;
void *slab_addr;
@@ -6793,20 +6815,20 @@ void kvfree_rcu_cb(struct rcu_head *head)
return;
}
- folio = virt_to_folio(obj);
- if (!folio_test_slab(folio)) {
+ page = virt_to_page(obj);
+ slab = page_slab(page);
+ if (!slab) {
/*
* rcu_head offset can be only less than page size so no need to
- * consider folio order
+ * consider allocation order
*/
obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
- free_large_kmalloc(folio, obj);
+ free_large_kmalloc(page, obj);
return;
}
- slab = folio_slab(folio);
s = slab->slab_cache;
- slab_addr = folio_address(folio);
+ slab_addr = slab_address(slab);
if (is_kfence_address(obj)) {
obj = kfence_object_start(obj);
@@ -6828,7 +6850,7 @@ void kvfree_rcu_cb(struct rcu_head *head)
*/
void kfree(const void *object)
{
- struct folio *folio;
+ struct page *page;
struct slab *slab;
struct kmem_cache *s;
void *x = (void *)object;
@@ -6838,13 +6860,13 @@ void kfree(const void *object)
if (unlikely(ZERO_OR_NULL_PTR(object)))
return;
- folio = virt_to_folio(object);
- if (unlikely(!folio_test_slab(folio))) {
- free_large_kmalloc(folio, (void *)object);
+ page = virt_to_page(object);
+ slab = page_slab(page);
+ if (!slab) {
+ free_large_kmalloc(page, (void *)object);
return;
}
- slab = folio_slab(folio);
s = slab->slab_cache;
slab_free(s, slab, x, _RET_IP_);
}
@@ -6861,7 +6883,6 @@ EXPORT_SYMBOL(kfree);
*/
void kfree_nolock(const void *object)
{
- struct folio *folio;
struct slab *slab;
struct kmem_cache *s;
void *x = (void *)object;
@@ -6869,13 +6890,12 @@ void kfree_nolock(const void *object)
if (unlikely(ZERO_OR_NULL_PTR(object)))
return;
- folio = virt_to_folio(object);
- if (unlikely(!folio_test_slab(folio))) {
+ slab = virt_to_slab(object);
+ if (unlikely(!slab)) {
WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
return;
}
- slab = folio_slab(folio);
s = slab->slab_cache;
memcg_slab_free_hook(s, slab, &x, 1);
@@ -6907,11 +6927,7 @@ void kfree_nolock(const void *object)
* since kasan quarantine takes locks and not supported from NMI.
*/
kasan_slab_free(s, x, false, false, /* skip quarantine */true);
-#ifndef CONFIG_SLUB_TINY
do_slab_free(s, slab, x, x, 0, _RET_IP_);
-#else
- defer_free(s, x);
-#endif
}
EXPORT_SYMBOL_GPL(kfree_nolock);
@@ -6943,16 +6959,16 @@ __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags,
if (is_kfence_address(p)) {
ks = orig_size = kfence_ksize(p);
} else {
- struct folio *folio;
+ struct page *page = virt_to_page(p);
+ struct slab *slab = page_slab(page);
- folio = virt_to_folio(p);
- if (unlikely(!folio_test_slab(folio))) {
+ if (!slab) {
/* Big kmalloc object */
- WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE);
- WARN_ON(p != folio_address(folio));
- ks = folio_size(folio);
+ ks = page_size(page);
+ WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE);
+ WARN_ON(p != page_address(page));
} else {
- s = folio_slab(folio)->slab_cache;
+ s = slab->slab_cache;
orig_size = get_orig_size(s, (void *)p);
ks = s->object_size;
}
@@ -7256,23 +7272,25 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
{
int lookahead = 3;
void *object;
- struct folio *folio;
+ struct page *page;
+ struct slab *slab;
size_t same;
object = p[--size];
- folio = virt_to_folio(object);
+ page = virt_to_page(object);
+ slab = page_slab(page);
if (!s) {
/* Handle kalloc'ed objects */
- if (unlikely(!folio_test_slab(folio))) {
- free_large_kmalloc(folio, object);
+ if (!slab) {
+ free_large_kmalloc(page, object);
df->slab = NULL;
return size;
}
/* Derive kmem_cache from object */
- df->slab = folio_slab(folio);
- df->s = df->slab->slab_cache;
+ df->slab = slab;
+ df->s = slab->slab_cache;
} else {
- df->slab = folio_slab(folio);
+ df->slab = slab;
df->s = cache_from_obj(s, object); /* Support for memcg */
}
@@ -7361,7 +7379,6 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-#ifndef CONFIG_SLUB_TINY
static inline
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
@@ -7379,14 +7396,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
local_lock_irqsave(&s->cpu_slab->lock, irqflags);
for (i = 0; i < size; i++) {
- void *object = kfence_alloc(s, s->object_size, flags);
-
- if (unlikely(object)) {
- p[i] = object;
- continue;
- }
+ void *object = c->freelist;
- object = c->freelist;
if (unlikely(!object)) {
/*
* We may have removed an object from c->freelist using
@@ -7432,41 +7443,13 @@ error:
return 0;
}
-#else /* CONFIG_SLUB_TINY */
-static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p)
-{
- int i;
-
- for (i = 0; i < size; i++) {
- void *object = kfence_alloc(s, s->object_size, flags);
-
- if (unlikely(object)) {
- p[i] = object;
- continue;
- }
-
- p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
- _RET_IP_, s->object_size);
- if (unlikely(!p[i]))
- goto error;
-
- maybe_wipe_obj_freeptr(s, p[i]);
- }
-
- return i;
-
-error:
- __kmem_cache_free_bulk(s, i, p);
- return 0;
-}
-#endif /* CONFIG_SLUB_TINY */
/* Note that interrupts must be enabled when calling this function. */
int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
unsigned int i = 0;
+ void *kfence_obj;
if (!size)
return 0;
@@ -7475,6 +7458,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
if (unlikely(!s))
return 0;
+ /*
+ * to make things simpler, assume at most one kfence-allocated object per
+ * bulk allocation and choose its index randomly
+ */
+ kfence_obj = kfence_alloc(s, s->object_size, flags);
+
+ if (unlikely(kfence_obj)) {
+ if (unlikely(size == 1)) {
+ p[0] = kfence_obj;
+ goto out;
+ }
+ size--;
+ }
+
if (s->cpu_sheaves)
i = alloc_from_pcs_bulk(s, size, p);
@@ -7486,10 +7483,23 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
if (i > 0)
__kmem_cache_free_bulk(s, i, p);
+ if (kfence_obj)
+ __kfence_free(kfence_obj);
return 0;
}
}
+ if (unlikely(kfence_obj)) {
+ int idx = get_random_u32_below(size + 1);
+
+ if (idx != size)
+ p[size] = p[idx];
+ p[idx] = kfence_obj;
+
+ size++;
+ }
+
+out:
/*
* memcg and kmem_cache debug support and memory initialization.
* Done outside of the IRQ disabled fastpath loop.
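The placement above is a single step of an inside-out Fisher-Yates shuffle: the kfence object lands on each of the size + 1 final slots with equal probability, and the displaced pointer moves to the end. Below is a self-contained user-space demonstration of the same move, with rand() standing in for get_random_u32_below(); it is an illustration, not kernel code.

/*
 * Hypothetical demo of the placement step used in kmem_cache_alloc_bulk_noprof()
 * above: insert one extra element at a uniformly random index and move the
 * displaced element to the end.
 */
#include <stdio.h>
#include <stdlib.h>

static void place_randomly(int *p, size_t size, int extra)
{
        /* stand-in for get_random_u32_below(size + 1) */
        size_t idx = (size_t)rand() % (size + 1);

        if (idx != size)
                p[size] = p[idx];
        p[idx] = extra;
}

int main(void)
{
        int p[5] = { 1, 2, 3, 4 };      /* four "bulk" objects, room for one more */

        srand(42);
        place_randomly(p, 4, 99);       /* 99 plays the role of the kfence object */

        for (size_t i = 0; i < 5; i++)
                printf("%d ", p[i]);
        printf("\n");
        return 0;
}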
@@ -7651,7 +7661,6 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
barn_init(barn);
}
-#ifndef CONFIG_SLUB_TINY
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
@@ -7672,12 +7681,6 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
return 1;
}
-#else
-static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
-{
- return 1;
-}
-#endif /* CONFIG_SLUB_TINY */
static int init_percpu_sheaves(struct kmem_cache *s)
{
@@ -7767,13 +7770,11 @@ void __kmem_cache_release(struct kmem_cache *s)
cache_random_seq_destroy(s);
if (s->cpu_sheaves)
pcs_destroy(s);
-#ifndef CONFIG_SLUB_TINY
#ifdef CONFIG_PREEMPT_RT
if (s->cpu_slab)
lockdep_unregister_key(&s->lock_key);
#endif
free_percpu(s->cpu_slab);
-#endif
free_kmem_cache_nodes(s);
}
@@ -8139,46 +8140,53 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
* Kmalloc subsystem
*******************************************************************/
-static int __init setup_slub_min_order(char *str)
+static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp)
{
- get_option(&str, (int *)&slub_min_order);
+ int ret;
+
+ ret = kstrtouint(str, 0, &slub_min_order);
+ if (ret)
+ return ret;
if (slub_min_order > slub_max_order)
slub_max_order = slub_min_order;
- return 1;
+ return 0;
}
-__setup("slab_min_order=", setup_slub_min_order);
-__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0);
-
+static const struct kernel_param_ops param_ops_slab_min_order __initconst = {
+ .set = setup_slub_min_order,
+};
+__core_param_cb(slab_min_order, &param_ops_slab_min_order, &slub_min_order, 0);
+__core_param_cb(slub_min_order, &param_ops_slab_min_order, &slub_min_order, 0);
-static int __init setup_slub_max_order(char *str)
+static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp)
{
- get_option(&str, (int *)&slub_max_order);
+ int ret;
+
+ ret = kstrtouint(str, 0, &slub_max_order);
+ if (ret)
+ return ret;
+
slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
if (slub_min_order > slub_max_order)
slub_min_order = slub_max_order;
- return 1;
+ return 0;
}
-__setup("slab_max_order=", setup_slub_max_order);
-__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0);
-
-static int __init setup_slub_min_objects(char *str)
-{
- get_option(&str, (int *)&slub_min_objects);
-
- return 1;
-}
+static const struct kernel_param_ops param_ops_slab_max_order __initconst = {
+ .set = setup_slub_max_order,
+};
+__core_param_cb(slab_max_order, &param_ops_slab_max_order, &slub_max_order, 0);
+__core_param_cb(slub_max_order, &param_ops_slab_max_order, &slub_max_order, 0);
-__setup("slab_min_objects=", setup_slub_min_objects);
-__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0);
+core_param(slab_min_objects, slub_min_objects, uint, 0);
+core_param(slub_min_objects, slub_min_objects, uint, 0);
#ifdef CONFIG_NUMA
-static int __init setup_slab_strict_numa(char *str)
+static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp)
{
if (nr_node_ids > 1) {
static_branch_enable(&strict_numa);
@@ -8187,10 +8195,14 @@ static int __init setup_slab_strict_numa(char *str)
pr_warn("slab_strict_numa parameter set on non NUMA system.\n");
}
- return 1;
+ return 0;
}
-__setup("slab_strict_numa", setup_slab_strict_numa);
+static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = setup_slab_strict_numa,
+};
+__core_param_cb(slab_strict_numa, &param_ops_slab_strict_numa, NULL, 0);
#endif
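The pattern for all four parameters above is the same: a .set callback wrapped in kernel_param_ops, registered once under the new slab_* name and once under the legacy slub_* spelling. A hedged sketch of the same pattern for a hypothetical new tunable follows (all names below are invented; the macros are used exactly as in the hunks above):

/* hypothetical example only -- not part of the patch */
static unsigned int slub_example_knob;

static int __init setup_slub_example_knob(const char *str,
                                          const struct kernel_param *kp)
{
        int ret;

        ret = kstrtouint(str, 0, &slub_example_knob);
        if (ret)
                return ret;

        /* clamp / validate here, as setup_slub_max_order() does */
        return 0;
}

static const struct kernel_param_ops param_ops_slab_example_knob __initconst = {
        .set = setup_slub_example_knob,
};
/* accepted as slab_example_knob= and slub_example_knob= on the command line */
__core_param_cb(slab_example_knob, &param_ops_slab_example_knob, &slub_example_knob, 0);
__core_param_cb(slub_example_knob, &param_ops_slab_example_knob, &slub_example_knob, 0);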
@@ -8516,10 +8528,8 @@ void __init kmem_cache_init(void)
void __init kmem_cache_init_late(void)
{
-#ifndef CONFIG_SLUB_TINY
flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
WARN_ON(!flushwq);
-#endif
}
struct kmem_cache *
diff --git a/mm/sparse.c b/mm/sparse.c
index 17c50a6415c2..b5b2b6f7041b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -951,8 +951,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- if (!altmap || !altmap->inaccessible)
- page_init_poison(memmap, sizeof(struct page) * nr_pages);
+ page_init_poison(memmap, sizeof(struct page) * nr_pages);
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index dbdcc43964fb..5de7a518b1b1 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -164,7 +164,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
{
unsigned long addr = (unsigned long)ptr;
unsigned long offset;
- struct folio *folio;
+ struct page *page;
+ struct slab *slab;
if (is_kmap_addr(ptr)) {
offset = offset_in_page(ptr);
@@ -189,16 +190,23 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
if (!virt_addr_valid(ptr))
return;
- folio = virt_to_folio(ptr);
-
- if (folio_test_slab(folio)) {
+ page = virt_to_page(ptr);
+ slab = page_slab(page);
+ if (slab) {
/* Check slab allocator for flags and size. */
- __check_heap_object(ptr, n, folio_slab(folio), to_user);
- } else if (folio_test_large(folio)) {
- offset = ptr - folio_address(folio);
- if (n > folio_size(folio) - offset)
+ __check_heap_object(ptr, n, slab, to_user);
+ } else if (PageCompound(page)) {
+ page = compound_head(page);
+ offset = ptr - page_address(page);
+ if (n > page_size(page) - offset)
usercopy_abort("page alloc", NULL, to_user, offset, n);
}
+
+ /*
+ * We cannot check non-compound pages. They might be part of
+ * a large allocation, in which case crossing a page boundary
+ * is fine.
+ */
}
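For clarity, the compound-page branch above reduces to a simple bounds test: the copy must not extend past the end of the head page's allocation. An illustrative helper expressing just that check, using the same page_address()/page_size() calls as the hunk:

/* illustrative only: the bounds test check_heap_object() applies to compound pages */
static inline bool usercopy_fits_page(const void *ptr, unsigned long n,
                                      struct page *page)
{
        unsigned long offset;

        page = compound_head(page);
        offset = (unsigned long)ptr - (unsigned long)page_address(page);

        return n <= page_size(page) - offset;
}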
DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,