author     Linus Torvalds <torvalds@linux-foundation.org>    2025-03-30 13:45:28 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>    2025-03-30 13:45:28 -0700
commit     aa918db707fba507e85217961643281ee8dfb2ed (patch)
tree       86d529825cc85a1d309f33efba97016dd64c8529
parent     494e7fe591bf834d57c6607cdc26ab8873708aa7 (diff)
parent     f90b474a35744b5d43009e4fab232e74a3024cae (diff)
Merge tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf try_alloc_pages() support from Alexei Starovoitov:
 "The pull includes work from Sebastian, Vlastimil and myself with a lot
  of help from Michal and Shakeel. This is a first step towards making
  kmalloc reentrant to get rid of slab wrappers: bpf_mem_alloc,
  kretprobe's objpool, etc. These patches make page allocator safe from
  any context.

  Vlastimil kicked off this effort at LSFMM 2024:
    https://lwn.net/Articles/974138/
  and we continued at LSFMM 2025:
    https://lore.kernel.org/all/CAADnVQKfkGxudNUkcPJgwe3nTZ=xohnRshx9kLZBTmR_E1DFEg@mail.gmail.com/

  Why:
  SLAB wrappers bind memory to a particular subsystem making it
  unavailable to the rest of the kernel. Some BPF maps in production
  consume Gbytes of preallocated memory. Top 5 in Meta: 1.5G, 1.2G,
  1.1G, 300M, 200M. Once we have kmalloc that works in any context BPF
  map preallocation won't be necessary.

  How:
  Synchronous kmalloc/page alloc stack has multiple stages going from
  fast to slow: cmpxchg16 -> slab_alloc -> new_slab -> alloc_pages ->
  rmqueue_pcplist -> __rmqueue, where rmqueue_pcplist was already
  relying on trylock.

  This set changes rmqueue_bulk/rmqueue_buddy to attempt a trylock and
  return ENOMEM if alloc_flags & ALLOC_TRYLOCK. It then wraps this
  functionality into try_alloc_pages() helper. We make sure that the
  logic is sane in PREEMPT_RT.

  End result: try_alloc_pages()/free_pages_nolock() are safe to call
  from any context.

  try_kmalloc() for any context with similar trylock approach will
  follow. It will use try_alloc_pages() when slab needs a new page.
  Though such try_kmalloc/page_alloc() is an opportunistic allocator,
  this design ensures that the probability of successful allocation of
  small objects (up to one page in size) is high.

  Even before we have try_kmalloc(), we already use try_alloc_pages()
  in BPF arena implementation and it's going to be used more
  extensively in BPF"

* tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  mm: Fix the flipped condition in gfpflags_allow_spinning()
  bpf: Use try_alloc_pages() to allocate pages for bpf needs.
  mm, bpf: Use memcg in try_alloc_pages().
  memcg: Use trylock to access memcg stock_lock.
  mm, bpf: Introduce free_pages_nolock()
  mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation
  locking/local_lock: Introduce localtry_lock_t
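For orientation before the diff, here is a minimal caller-side sketch of the API this merge introduces. It is not part of the pull: the demo_* helpers are invented for illustration and assume <linux/gfp.h> and <linux/mm.h>; only try_alloc_pages(), free_pages_nolock() and the "any context, best effort" contract come from the series itself.

/*
 * Hypothetical usage sketch, not part of this pull: grab one zeroed page
 * from whatever context the caller happens to be in and release it with
 * the matching NMI/IRQ-safe helper. Failure is expected and must be
 * handled; there is no reclaim, no retry and no warn_alloc() behind it.
 */
static int demo_grab_scratch_page(void **out)
{
	struct page *page = try_alloc_pages(NUMA_NO_NODE, 0);

	if (!page)
		return -ENOMEM;		/* best effort: caller must cope */

	*out = page_address(page);	/* already zeroed via __GFP_ZERO */
	return 0;
}

static void demo_release_scratch_page(void *addr)
{
	/* Safe under a raw spinlock, in hard IRQ or in NMI context. */
	free_pages_nolock(virt_to_page(addr), 0);
}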
-rw-r--r--  include/linux/bpf.h                  |   2
-rw-r--r--  include/linux/gfp.h                  |  23
-rw-r--r--  include/linux/local_lock.h           |  70
-rw-r--r--  include/linux/local_lock_internal.h  | 146
-rw-r--r--  include/linux/mm_types.h             |   4
-rw-r--r--  include/linux/mmzone.h               |   3
-rw-r--r--  kernel/bpf/arena.c                   |   5
-rw-r--r--  kernel/bpf/syscall.c                 |  23
-rw-r--r--  lib/stackdepot.c                     |  10
-rw-r--r--  mm/internal.h                        |   1
-rw-r--r--  mm/memcontrol.c                      |  57
-rw-r--r--  mm/page_alloc.c                      | 203
-rw-r--r--  mm/page_owner.c                      |   8
13 files changed, 511 insertions, 44 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d67490dc3a2b..3f0cc89c0622 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2385,7 +2385,7 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long nr_pages, struct page **page_array);
#ifdef CONFIG_MEMCG
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6bb1a5a7a4ae..c9fa6309c903 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}
+static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
+{
+ /*
+ * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
+ * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
+ * All GFP_* flags including GFP_NOWAIT use one or both flags.
+ * try_alloc_pages() is the only API that doesn't specify either flag.
+ *
+ * This is stronger than GFP_NOWAIT or GFP_ATOMIC because
+ * those are guaranteed to never block on a sleeping lock.
+ * Here we are enforcing that the allocation doesn't ever spin
+ * on any locks (i.e. only trylocks). There is no high level
+ * GFP_$FOO flag for this use in try_alloc_pages() as the
+ * regular page allocator doesn't fully support this
+ * allocation mode.
+ */
+ return !!(gfp_flags & __GFP_RECLAIM);
+}
+
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
@@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
}
#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
+struct page *try_alloc_pages_noprof(int nid, unsigned int order);
+#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))
+
extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
@@ -357,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
__get_free_pages((gfp_mask) | GFP_DMA, (order))
extern void __free_pages(struct page *page, unsigned int order);
+extern void free_pages_nolock(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
#define __free_page(page) __free_pages((page), 0)
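gfpflags_allow_spinning() above is meant for code that can be reached both with regular gfp masks and with the reclaim-flag-free mask used internally by try_alloc_pages(). A hedged sketch of the intended caller pattern follows; demo_charge() and its lock are invented, while the real conversions of this shape appear in mm/memcontrol.c and lib/stackdepot.c later in this merge.

/*
 * Illustration only: degrade to a trylock when the gfp mask says the
 * caller must not spin, and treat trylock failure as a normal,
 * recoverable event rather than an error.
 */
static bool demo_charge(spinlock_t *lock, gfp_t gfp_mask)
{
	if (!spin_trylock(lock)) {
		if (!gfpflags_allow_spinning(gfp_mask))
			return false;	/* reached via try_alloc_pages() */
		spin_lock(lock);	/* regular GFP: spinning is fine */
	}
	/* ... consume whatever state @lock protects ... */
	spin_unlock(lock);
	return true;
}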
diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index 091dc0b6bdfb..1a0bc35839e3 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -51,6 +51,76 @@
#define local_unlock_irqrestore(lock, flags) \
__local_unlock_irqrestore(lock, flags)
+/**
+ * localtry_lock_init - Runtime initialize a lock instance
+ */
+#define localtry_lock_init(lock) __localtry_lock_init(lock)
+
+/**
+ * localtry_lock - Acquire a per CPU local lock
+ * @lock: The lock variable
+ */
+#define localtry_lock(lock) __localtry_lock(lock)
+
+/**
+ * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts
+ * @lock: The lock variable
+ */
+#define localtry_lock_irq(lock) __localtry_lock_irq(lock)
+
+/**
+ * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable
+ * interrupts
+ * @lock: The lock variable
+ * @flags: Storage for interrupt flags
+ */
+#define localtry_lock_irqsave(lock, flags) \
+ __localtry_lock_irqsave(lock, flags)
+
+/**
+ * localtry_trylock - Try to acquire a per CPU local lock.
+ * @lock: The lock variable
+ *
+ * The function can be used in any context such as NMI or HARDIRQ. Due to
+ * locking constraints it will _always_ fail to acquire the lock in NMI or
+ * HARDIRQ context on PREEMPT_RT.
+ */
+#define localtry_trylock(lock) __localtry_trylock(lock)
+
+/**
+ * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
+ * interrupts if acquired
+ * @lock: The lock variable
+ * @flags: Storage for interrupt flags
+ *
+ * The function can be used in any context such as NMI or HARDIRQ. Due to
+ * locking constraints it will _always_ fail to acquire the lock in NMI or
+ * HARDIRQ context on PREEMPT_RT.
+ */
+#define localtry_trylock_irqsave(lock, flags) \
+ __localtry_trylock_irqsave(lock, flags)
+
+/**
+ * localtry_unlock - Release a per CPU local lock
+ * @lock: The lock variable
+ */
+#define localtry_unlock(lock) __localtry_unlock(lock)
+
+/**
+ * localtry_unlock_irq - Release a per CPU local lock and enable interrupts
+ * @lock: The lock variable
+ */
+#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock)
+
+/**
+ * localtry_unlock_irqrestore - Release a per CPU local lock and restore
+ * interrupt flags
+ * @lock: The lock variable
+ * @flags: Interrupt flags to restore
+ */
+#define localtry_unlock_irqrestore(lock, flags) \
+ __localtry_unlock_irqrestore(lock, flags)
+
DEFINE_GUARD(local_lock, local_lock_t __percpu*,
local_lock(_T),
local_unlock(_T))
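A hedged usage sketch for the localtry API documented above; the per-CPU cache and demo_* names are invented (assuming <linux/local_lock.h> and <linux/percpu-defs.h>), and the first real user is the memcg stock conversion further down in this merge.

/*
 * Illustration only: a per-CPU cache whose fast path may be entered from
 * any context. The trylock variant simply backs off if this CPU was
 * interrupted inside the critical section; on PREEMPT_RT it always
 * fails in NMI/hardirq context, as noted in the comments above.
 */
struct demo_pcp_cache {
	localtry_lock_t	lock;
	unsigned long	cached;
};

static DEFINE_PER_CPU(struct demo_pcp_cache, demo_cache) = {
	.lock = INIT_LOCALTRY_LOCK(lock),
};

static bool demo_try_consume(unsigned long nr)
{
	unsigned long flags;
	bool ret = false;

	if (!localtry_trylock_irqsave(&demo_cache.lock, flags))
		return false;		/* any-context caller: back off */

	if (this_cpu_read(demo_cache.cached) >= nr) {
		this_cpu_sub(demo_cache.cached, nr);
		ret = true;
	}
	localtry_unlock_irqrestore(&demo_cache.lock, flags);
	return ret;
}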
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index 8dd71fbbb6d2..67bd13d142fa 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -15,6 +15,11 @@ typedef struct {
#endif
} local_lock_t;
+typedef struct {
+ local_lock_t llock;
+ unsigned int acquired;
+} localtry_lock_t;
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define LOCAL_LOCK_DEBUG_INIT(lockname) \
.dep_map = { \
@@ -31,6 +36,13 @@ static inline void local_lock_acquire(local_lock_t *l)
l->owner = current;
}
+static inline void local_trylock_acquire(local_lock_t *l)
+{
+ lock_map_acquire_try(&l->dep_map);
+ DEBUG_LOCKS_WARN_ON(l->owner);
+ l->owner = current;
+}
+
static inline void local_lock_release(local_lock_t *l)
{
DEBUG_LOCKS_WARN_ON(l->owner != current);
@@ -45,11 +57,13 @@ static inline void local_lock_debug_init(local_lock_t *l)
#else /* CONFIG_DEBUG_LOCK_ALLOC */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
+static inline void local_trylock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
#define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) }
+#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }}
#define __local_lock_init(lock) \
do { \
@@ -118,6 +132,104 @@ do { \
#define __local_unlock_nested_bh(lock) \
local_lock_release(this_cpu_ptr(lock))
+/* localtry_lock_t variants */
+
+#define __localtry_lock_init(lock) \
+do { \
+ __local_lock_init(&(lock)->llock); \
+ WRITE_ONCE((lock)->acquired, 0); \
+} while (0)
+
+#define __localtry_lock(lock) \
+ do { \
+ localtry_lock_t *lt; \
+ preempt_disable(); \
+ lt = this_cpu_ptr(lock); \
+ local_lock_acquire(&lt->llock); \
+ WRITE_ONCE(lt->acquired, 1); \
+ } while (0)
+
+#define __localtry_lock_irq(lock) \
+ do { \
+ localtry_lock_t *lt; \
+ local_irq_disable(); \
+ lt = this_cpu_ptr(lock); \
+ local_lock_acquire(&lt->llock); \
+ WRITE_ONCE(lt->acquired, 1); \
+ } while (0)
+
+#define __localtry_lock_irqsave(lock, flags) \
+ do { \
+ localtry_lock_t *lt; \
+ local_irq_save(flags); \
+ lt = this_cpu_ptr(lock); \
+ local_lock_acquire(&lt->llock); \
+ WRITE_ONCE(lt->acquired, 1); \
+ } while (0)
+
+#define __localtry_trylock(lock) \
+ ({ \
+ localtry_lock_t *lt; \
+ bool _ret; \
+ \
+ preempt_disable(); \
+ lt = this_cpu_ptr(lock); \
+ if (!READ_ONCE(lt->acquired)) { \
+ WRITE_ONCE(lt->acquired, 1); \
+ local_trylock_acquire(&lt->llock); \
+ _ret = true; \
+ } else { \
+ _ret = false; \
+ preempt_enable(); \
+ } \
+ _ret; \
+ })
+
+#define __localtry_trylock_irqsave(lock, flags) \
+ ({ \
+ localtry_lock_t *lt; \
+ bool _ret; \
+ \
+ local_irq_save(flags); \
+ lt = this_cpu_ptr(lock); \
+ if (!READ_ONCE(lt->acquired)) { \
+ WRITE_ONCE(lt->acquired, 1); \
+ local_trylock_acquire(&lt->llock); \
+ _ret = true; \
+ } else { \
+ _ret = false; \
+ local_irq_restore(flags); \
+ } \
+ _ret; \
+ })
+
+#define __localtry_unlock(lock) \
+ do { \
+ localtry_lock_t *lt; \
+ lt = this_cpu_ptr(lock); \
+ WRITE_ONCE(lt->acquired, 0); \
+ local_lock_release(&lt->llock); \
+ preempt_enable(); \
+ } while (0)
+
+#define __localtry_unlock_irq(lock) \
+ do { \
+ localtry_lock_t *lt; \
+ lt = this_cpu_ptr(lock); \
+ WRITE_ONCE(lt->acquired, 0); \
+ local_lock_release(&lt->llock); \
+ local_irq_enable(); \
+ } while (0)
+
+#define __localtry_unlock_irqrestore(lock, flags) \
+ do { \
+ localtry_lock_t *lt; \
+ lt = this_cpu_ptr(lock); \
+ WRITE_ONCE(lt->acquired, 0); \
+ local_lock_release(&lt->llock); \
+ local_irq_restore(flags); \
+ } while (0)
+
#else /* !CONFIG_PREEMPT_RT */
/*
@@ -125,8 +237,10 @@ do { \
* critical section while staying preemptible.
*/
typedef spinlock_t local_lock_t;
+typedef spinlock_t localtry_lock_t;
#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
+#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname)
#define __local_lock_init(l) \
do { \
@@ -169,4 +283,36 @@ do { \
spin_unlock(this_cpu_ptr((lock))); \
} while (0)
+/* localtry_lock_t variants */
+
+#define __localtry_lock_init(lock) __local_lock_init(lock)
+#define __localtry_lock(lock) __local_lock(lock)
+#define __localtry_lock_irq(lock) __local_lock(lock)
+#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags)
+#define __localtry_unlock(lock) __local_unlock(lock)
+#define __localtry_unlock_irq(lock) __local_unlock(lock)
+#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags)
+
+#define __localtry_trylock(lock) \
+ ({ \
+ int __locked; \
+ \
+ if (in_nmi() | in_hardirq()) { \
+ __locked = 0; \
+ } else { \
+ migrate_disable(); \
+ __locked = spin_trylock(this_cpu_ptr((lock))); \
+ if (!__locked) \
+ migrate_enable(); \
+ } \
+ __locked; \
+ })
+
+#define __localtry_trylock_irqsave(lock, flags) \
+ ({ \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ __localtry_trylock(lock); \
+ })
+
#endif /* CONFIG_PREEMPT_RT */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0234f14f2aa6..75e8850cec3a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -99,6 +99,10 @@ struct page {
/* Or, free page */
struct list_head buddy_list;
struct list_head pcp_list;
+ struct {
+ struct llist_node pcp_llist;
+ unsigned int order;
+ };
};
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9540b41894da..e16939553930 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -972,6 +972,9 @@ struct zone {
/* Primarily protects free_area */
spinlock_t lock;
+ /* Pages to be freed when next trylock succeeds */
+ struct llist_head trylock_free_pages;
+
/* Write-intensive fields used by compaction and vmstats. */
CACHELINE_PADDING(_pad2_);
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 647b709d7d77..0d56cea71602 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_SIGSEGV;
/* Account into memcg of the process that created bpf_arena */
- ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
return VM_FAULT_SIGSEGV;
@@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (ret)
goto out_free_pages;
- ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
- node_id, page_cnt, pages);
+ ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
if (ret)
goto out;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 77062799143e..9794446bc8c6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+static bool can_alloc_pages(void)
+{
+ return preempt_count() == 0 && !irqs_disabled() &&
+ !IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+static struct page *__bpf_alloc_page(int nid)
+{
+ if (!can_alloc_pages())
+ return try_alloc_pages(nid, 0);
+
+ return alloc_pages_node(nid,
+ GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
+ | __GFP_NOWARN,
+ 0);
+}
+
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long nr_pages, struct page **pages)
{
unsigned long i, j;
@@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
old_memcg = set_active_memcg(memcg);
#endif
for (i = 0; i < nr_pages; i++) {
- pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+ pg = __bpf_alloc_page(nid);
if (pg) {
pages[i] = pg;
continue;
}
for (j = 0; j < i; j++)
- __free_page(pages[j]);
+ free_pages_nolock(pages[j], 0);
ret = -ENOMEM;
break;
}
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 245d5b416699..73d7b50924ef 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -591,7 +591,8 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
depot_stack_handle_t handle = 0;
struct page *page = NULL;
void *prealloc = NULL;
- bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
+ bool allow_spin = gfpflags_allow_spinning(alloc_flags);
+ bool can_alloc = (depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC) && allow_spin;
unsigned long flags;
u32 hash;
@@ -630,7 +631,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
prealloc = page_address(page);
}
- if (in_nmi()) {
+ if (in_nmi() || !allow_spin) {
/* We can never allocate in NMI context. */
WARN_ON_ONCE(can_alloc);
/* Best effort; bail if we fail to take the lock. */
@@ -671,7 +672,10 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
exit:
if (prealloc) {
/* Stack depot didn't use this memory, free it. */
- free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
+ if (!allow_spin)
+ free_pages_nolock(virt_to_page(prealloc), DEPOT_POOL_ORDER);
+ else
+ free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
}
if (found)
handle = found->handle.handle;
diff --git a/mm/internal.h b/mm/internal.h
index 7eb27ab83ed7..8d1bada7323a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1198,6 +1198,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
+#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
/* Flags that allow allocations below the min watermark. */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a037ec92881d..83c2df73e4b6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1739,7 +1739,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
struct memcg_stock_pcp {
- local_lock_t stock_lock;
+ localtry_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -1754,7 +1754,7 @@ struct memcg_stock_pcp {
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+ .stock_lock = INIT_LOCALTRY_LOCK(stock_lock),
};
static DEFINE_MUTEX(percpu_charge_mutex);
@@ -1766,6 +1766,7 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
+ * @gfp_mask: allocation mask.
*
* The charges will only happen if @memcg matches the current cpu's memcg
* stock, and at least @nr_pages are available in that stock. Failure to
@@ -1773,7 +1774,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
*
* returns true if successful, false otherwise.
*/
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+ gfp_t gfp_mask)
{
struct memcg_stock_pcp *stock;
unsigned int stock_pages;
@@ -1783,7 +1785,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ if (!gfpflags_allow_spinning(gfp_mask))
+ return ret;
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ }
stock = this_cpu_ptr(&memcg_stock);
stock_pages = READ_ONCE(stock->nr_pages);
@@ -1792,7 +1798,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
ret = true;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -1831,14 +1837,14 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -1868,9 +1874,20 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long flags;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ /*
+ * In case of unlikely failure to lock percpu stock_lock
+ * uncharge memcg directly.
+ */
+ if (mem_cgroup_is_root(memcg))
+ return;
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+ return;
+ }
__refill_stock(memcg, nr_pages);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
/*
@@ -1927,9 +1944,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
stock = &per_cpu(memcg_stock, cpu);
/* drain_obj_stock requires stock_lock */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
old = drain_obj_stock(stock);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
drain_stock(stock);
obj_cgroup_put(old);
@@ -2222,9 +2239,13 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long pflags;
retry:
- if (consume_stock(memcg, nr_pages))
+ if (consume_stock(memcg, nr_pages, gfp_mask))
return 0;
+ if (!gfpflags_allow_spinning(gfp_mask))
+ /* Avoid the refill and flush of the older stock */
+ batch = nr_pages;
+
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@@ -2708,7 +2729,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
unsigned long flags;
int *bytes;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
/*
@@ -2761,7 +2782,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
__mod_objcg_mlstate(objcg, pgdat, idx, nr);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -2771,7 +2792,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
@@ -2779,7 +2800,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
ret = true;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -2871,7 +2892,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
unsigned long flags;
unsigned int nr_pages = 0;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
@@ -2889,7 +2910,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
if (nr_pages)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 542d25f77be8..e3ea5bf5c459 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -1249,13 +1252,44 @@ static void split_large_buddy(struct zone *zone, struct page *page,
} while (1);
}
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+ unsigned int order)
+{
+ /* Remember the order */
+ page->order = order;
+ /* Add the page to the free list */
+ llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
+ struct llist_head *llhead;
unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+
+ /* The lock succeeded. Process deferred pages. */
+ llhead = &zone->trylock_free_pages;
+ if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+ struct llist_node *llnode;
+ struct page *p, *tmp;
+
+ llnode = llist_del_all(llhead);
+ llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+ unsigned int p_order = p->order;
+
+ split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+ __count_vm_events(PGFREE, 1 << p_order);
+ }
+ }
split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -2307,7 +2341,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long flags;
int i;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ return 0;
+ spin_lock_irqsave(&zone->lock, flags);
+ }
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
@@ -2595,7 +2633,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
static void free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
- unsigned int order)
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int pindex;
@@ -2630,6 +2668,14 @@ static void free_frozen_page_commit(struct zone *zone,
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
+
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ /*
+ * Do not attempt to take a zone lock. Let pcp->count get
+ * over high mark temporarily.
+ */
+ return;
+ }
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2644,7 +2690,8 @@ static void free_frozen_page_commit(struct zone *zone,
/*
* Free a pcp page
*/
-void free_frozen_pages(struct page *page, unsigned int order)
+static void __free_frozen_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
@@ -2653,7 +2700,7 @@ void free_frozen_pages(struct page *page, unsigned int order)
int migratetype;
if (!pcp_allowed_order(order)) {
- __free_pages_ok(page, order, FPI_NONE);
+ __free_pages_ok(page, order, fpi_flags);
return;
}
@@ -2671,23 +2718,33 @@ void free_frozen_pages(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
return;
}
migratetype = MIGRATE_MOVABLE;
}
+ if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+ && (in_nmi() || in_hardirq()))) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_frozen_page_commit(zone, pcp, page, migratetype, order);
+ free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
pcp_spin_unlock(pcp);
} else {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
pcp_trylock_finish(UP_flags);
}
+void free_frozen_pages(struct page *page, unsigned int order)
+{
+ __free_frozen_pages(page, order, FPI_NONE);
+}
+
/*
* Free a batch of folios
*/
@@ -2776,7 +2833,7 @@ void free_unref_folios(struct folio_batch *folios)
trace_mm_page_free_batched(&folio->page);
free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
- order);
+ order, FPI_NONE);
}
if (pcp) {
@@ -2907,7 +2964,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ return NULL;
+ spin_lock_irqsave(&zone->lock, flags);
+ }
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
@@ -4512,7 +4573,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
might_alloc(gfp_mask);
- if (should_fail_alloc_page(gfp_mask, order))
+ /*
+ * Don't invoke should_fail logic, since it may call
+ * get_random_u32() and printk() which need to spin_lock.
+ */
+ if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+ should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -4810,9 +4876,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
EXPORT_SYMBOL(get_zeroed_page_noprof);
/**
- * __free_pages - Free pages allocated with alloc_pages().
+ * ___free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
* @order: The order of the allocation.
+ * @fpi_flags: Free Page Internal flags.
*
* This function can free multi-page allocations that are not compound
* pages. It does not check that the @order passed in matches that of
@@ -4829,22 +4896,37 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
* Context: May be called in interrupt context or while holding a normal
* spinlock, but not in NMI context or while holding a raw spinlock.
*/
-void __free_pages(struct page *page, unsigned int order)
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
/* get PageHead before we drop reference */
int head = PageHead(page);
struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
- free_frozen_pages(page, order);
+ __free_frozen_pages(page, order, fpi_flags);
else if (!head) {
pgalloc_tag_sub_pages(tag, (1 << order) - 1);
while (order-- > 0)
- free_frozen_pages(page + (1 << order), order);
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
}
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_NONE);
+}
EXPORT_SYMBOL(__free_pages);
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * page type (not only those that came from try_alloc_pages)
+ */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_TRYLOCK);
+}
+
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@@ -7081,3 +7163,94 @@ static bool __free_unaccepted(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+
+/**
+ * try_alloc_pages - opportunistic reentrant allocation from any context
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> try_alloc_pages_noprof).
+ * Allocation is best effort and expected to fail easily, so nobody should
+ * rely on its success. Failures are not reported via warn_alloc().
+ * See the always-fail conditions below.
+ *
+ * Return: allocated page or NULL on failure.
+ */
+struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+{
+ /*
+ * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
+ * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
+ * is not safe in arbitrary context.
+ *
+ * These two are the conditions for gfpflags_allow_spinning() being true.
+ *
+ * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
+ * to warn. Also warn would trigger printk() which is unsafe from
+ * various contexts. We cannot use printk_deferred_enter() to mitigate,
+ * since the running context is unknown.
+ *
+ * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
+ * is safe in any context. Also zeroing the page is mandatory for
+ * BPF use cases.
+ *
+ * Though __GFP_NOMEMALLOC is not checked in the code path below,
+ * specify it here to highlight that try_alloc_pages()
+ * doesn't want to deplete reserves.
+ */
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
+ | __GFP_ACCOUNT;
+ unsigned int alloc_flags = ALLOC_TRYLOCK;
+ struct alloc_context ac = { };
+ struct page *page;
+
+ /*
+ * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
+ * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
+ * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
+ * mark the task as the owner of another rt_spin_lock which will
+ * confuse PI logic, so return immediately if called from hard IRQ or
+ * NMI.
+ *
+ * Note, irqs_disabled() case is ok. This function can be called
+ * from raw_spin_lock_irqsave region.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ return NULL;
+ if (!pcp_allowed_order(order))
+ return NULL;
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (has_unaccepted_memory())
+ return NULL;
+#endif
+ /* Bailout, since _deferred_grow_zone() needs to take a lock */
+ if (deferred_pages_enabled())
+ return NULL;
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_node_id();
+
+ prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+ &alloc_gfp, &alloc_flags);
+
+ /*
+ * Best effort allocation from percpu free list.
+ * If it's empty attempt to spin_trylock zone->lock.
+ */
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+ /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+ if (memcg_kmem_online() && page &&
+ unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
+ free_pages_nolock(page, order);
+ page = NULL;
+ }
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
+ return page;
+}
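To illustrate the free side added above: free_pages_nolock() only ever trylocks the pcp and zone locks, deferring the page to zone->trylock_free_pages when that fails, which is what makes a call like the following legal where the regular free path would not be. The lock and function names below are invented for illustration.

/*
 * Illustration only: freeing a page while a raw spinlock is held.
 * __free_pages()/free_pages() may need to take zone->lock or the pcp
 * lock and are not allowed here; free_pages_nolock() either frees the
 * page immediately (when its trylocks succeed) or parks it on
 * zone->trylock_free_pages, to be drained by a later regular free.
 */
static DEFINE_RAW_SPINLOCK(demo_raw_lock);

static void demo_release_under_raw_lock(struct page *page)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&demo_raw_lock, flags);
	/* ... unlink @page from some structure protected by the lock ... */
	free_pages_nolock(page, 0);
	raw_spin_unlock_irqrestore(&demo_raw_lock, flags);
}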
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 2d6360eaccbb..90e31d0e3ed7 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -294,7 +294,13 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_owner = get_page_owner(page_ext);
alloc_handle = page_owner->handle;
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+ /*
+ * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
+ * to prevent issues in stack_depot_save().
+ * This is similar to try_alloc_pages() gfp flags, but only used
+ * to signal stack_depot to avoid spin_locks.
+ */
+ handle = save_stack(__GFP_NOWARN);
__update_page_owner_free_handle(page_ext, handle, order, current->pid,
current->tgid, free_ts_nsec);
page_ext_put(page_ext);