summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-03-30 13:45:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-03-30 13:45:28 -0700
commitaa918db707fba507e85217961643281ee8dfb2ed (patch)
tree86d529825cc85a1d309f33efba97016dd64c8529 /include
parent494e7fe591bf834d57c6607cdc26ab8873708aa7 (diff)
parentf90b474a35744b5d43009e4fab232e74a3024cae (diff)
Merge tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf try_alloc_pages() support from Alexei Starovoitov: "The pull includes work from Sebastian, Vlastimil and myself with a lot of help from Michal and Shakeel. This is a first step towards making kmalloc reentrant to get rid of slab wrappers: bpf_mem_alloc, kretprobe's objpool, etc. These patches make page allocator safe from any context. Vlastimil kicked off this effort at LSFMM 2024: https://lwn.net/Articles/974138/ and we continued at LSFMM 2025: https://lore.kernel.org/all/CAADnVQKfkGxudNUkcPJgwe3nTZ=xohnRshx9kLZBTmR_E1DFEg@mail.gmail.com/ Why: SLAB wrappers bind memory to a particular subsystem making it unavailable to the rest of the kernel. Some BPF maps in production consume Gbytes of preallocated memory. Top 5 in Meta: 1.5G, 1.2G, 1.1G, 300M, 200M. Once we have kmalloc that works in any context BPF map preallocation won't be necessary. How: Synchronous kmalloc/page alloc stack has multiple stages going from fast to slow: cmpxchg16 -> slab_alloc -> new_slab -> alloc_pages -> rmqueue_pcplist -> __rmqueue, where rmqueue_pcplist was already relying on trylock. This set changes rmqueue_bulk/rmqueue_buddy to attempt a trylock and return ENOMEM if alloc_flags & ALLOC_TRYLOCK. It then wraps this functionality into try_alloc_pages() helper. We make sure that the logic is sane in PREEMPT_RT. End result: try_alloc_pages()/free_pages_nolock() are safe to call from any context. try_kmalloc() for any context with similar trylock approach will follow. It will use try_alloc_pages() when slab needs a new page. Though such try_kmalloc/page_alloc() is an opportunistic allocator, this design ensures that the probability of successful allocation of small objects (up to one page in size) is high. Even before we have try_kmalloc(), we already use try_alloc_pages() in BPF arena implementation and it's going to be used more extensively in BPF" * tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: mm: Fix the flipped condition in gfpflags_allow_spinning() bpf: Use try_alloc_pages() to allocate pages for bpf needs. mm, bpf: Use memcg in try_alloc_pages(). memcg: Use trylock to access memcg stock_lock. mm, bpf: Introduce free_pages_nolock() mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation locking/local_lock: Introduce localtry_lock_t
Diffstat (limited to 'include')
-rw-r--r--include/linux/bpf.h2
-rw-r--r--include/linux/gfp.h23
-rw-r--r--include/linux/local_lock.h70
-rw-r--r--include/linux/local_lock_internal.h146
-rw-r--r--include/linux/mm_types.h4
-rw-r--r--include/linux/mmzone.h3
6 files changed, 247 insertions, 1 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d67490dc3a2b..3f0cc89c0622 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2385,7 +2385,7 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long nr_pages, struct page **page_array);
#ifdef CONFIG_MEMCG
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6bb1a5a7a4ae..c9fa6309c903 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}
+static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
+{
+ /*
+ * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed.
+ * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
+ * All GFP_* flags including GFP_NOWAIT use one or both flags.
+ * try_alloc_pages() is the only API that doesn't specify either flag.
+ *
+ * This is stronger than GFP_NOWAIT or GFP_ATOMIC because
+ * those are guaranteed to never block on a sleeping lock.
+ * Here we are enforcing that the allocation doesn't ever spin
+ * on any locks (i.e. only trylocks). There is no high level
+ * GFP_$FOO flag for this use in try_alloc_pages() as the
+ * regular page allocator doesn't fully support this
+ * allocation mode.
+ */
+ return !!(gfp_flags & __GFP_RECLAIM);
+}
+
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
@@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
}
#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
+struct page *try_alloc_pages_noprof(int nid, unsigned int order);
+#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))
+
extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
@@ -357,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
__get_free_pages((gfp_mask) | GFP_DMA, (order))
extern void __free_pages(struct page *page, unsigned int order);
+extern void free_pages_nolock(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
#define __free_page(page) __free_pages((page), 0)
diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index 091dc0b6bdfb..1a0bc35839e3 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -51,6 +51,76 @@
#define local_unlock_irqrestore(lock, flags) \
__local_unlock_irqrestore(lock, flags)
+/**
+ * localtry_lock_init - Runtime initialize a lock instance
+ */
+#define localtry_lock_init(lock) __localtry_lock_init(lock)
+
+/**
+ * localtry_lock - Acquire a per CPU local lock
+ * @lock: The lock variable
+ */
+#define localtry_lock(lock) __localtry_lock(lock)
+
+/**
+ * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts
+ * @lock: The lock variable
+ */
+#define localtry_lock_irq(lock) __localtry_lock_irq(lock)
+
+/**
+ * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable
+ * interrupts
+ * @lock: The lock variable
+ * @flags: Storage for interrupt flags
+ */
+#define localtry_lock_irqsave(lock, flags) \
+ __localtry_lock_irqsave(lock, flags)
+
+/**
+ * localtry_trylock - Try to acquire a per CPU local lock.
+ * @lock: The lock variable
+ *
+ * The function can be used in any context such as NMI or HARDIRQ. Due to
+ * locking constrains it will _always_ fail to acquire the lock in NMI or
+ * HARDIRQ context on PREEMPT_RT.
+ */
+#define localtry_trylock(lock) __localtry_trylock(lock)
+
+/**
+ * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
+ * interrupts if acquired
+ * @lock: The lock variable
+ * @flags: Storage for interrupt flags
+ *
+ * The function can be used in any context such as NMI or HARDIRQ. Due to
+ * locking constrains it will _always_ fail to acquire the lock in NMI or
+ * HARDIRQ context on PREEMPT_RT.
+ */
+#define localtry_trylock_irqsave(lock, flags) \
+ __localtry_trylock_irqsave(lock, flags)
+
+/**
+ * local_unlock - Release a per CPU local lock
+ * @lock: The lock variable
+ */
+#define localtry_unlock(lock) __localtry_unlock(lock)
+
+/**
+ * local_unlock_irq - Release a per CPU local lock and enable interrupts
+ * @lock: The lock variable
+ */
+#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock)
+
+/**
+ * localtry_unlock_irqrestore - Release a per CPU local lock and restore
+ * interrupt flags
+ * @lock: The lock variable
+ * @flags: Interrupt flags to restore
+ */
+#define localtry_unlock_irqrestore(lock, flags) \
+ __localtry_unlock_irqrestore(lock, flags)
+
DEFINE_GUARD(local_lock, local_lock_t __percpu*,
local_lock(_T),
local_unlock(_T))
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index 8dd71fbbb6d2..67bd13d142fa 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -15,6 +15,11 @@ typedef struct {
#endif
} local_lock_t;
+typedef struct {
+ local_lock_t llock;
+ unsigned int acquired;
+} localtry_lock_t;
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define LOCAL_LOCK_DEBUG_INIT(lockname) \
.dep_map = { \
@@ -31,6 +36,13 @@ static inline void local_lock_acquire(local_lock_t *l)
l->owner = current;
}
+static inline void local_trylock_acquire(local_lock_t *l)
+{
+ lock_map_acquire_try(&l->dep_map);
+ DEBUG_LOCKS_WARN_ON(l->owner);
+ l->owner = current;
+}
+
static inline void local_lock_release(local_lock_t *l)
{
DEBUG_LOCKS_WARN_ON(l->owner != current);
@@ -45,11 +57,13 @@ static inline void local_lock_debug_init(local_lock_t *l)
#else /* CONFIG_DEBUG_LOCK_ALLOC */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
+static inline void local_trylock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
#define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) }
+#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }}
#define __local_lock_init(lock) \
do { \
@@ -118,6 +132,104 @@ do { \
#define __local_unlock_nested_bh(lock) \
local_lock_release(this_cpu_ptr(lock))
+/* localtry_lock_t variants */
+
+#define __localtry_lock_init(lock) \
+do { \
+ __local_lock_init(&(lock)->llock); \
+ WRITE_ONCE((lock)->acquired, 0); \
+} while (0)
+
+#define __localtry_lock(lock) \
+ do { \
+ localtry_lock_t *lt; \
+ preempt_disable(); \
+ lt = this_cpu_ptr(lock); \
+ local_lock_acquire(&lt->llock); \
+ WRITE_ONCE(lt->acquired, 1); \
+ } while (0)
+
+#define __localtry_lock_irq(lock) \
+ do { \
+ localtry_lock_t *lt; \
+ local_irq_disable(); \
+ lt = this_cpu_ptr(lock); \
+ local_lock_acquire(&lt->llock); \
+ WRITE_ONCE(lt->acquired, 1); \
+ } while (0)
+
+#define __localtry_lock_irqsave(lock, flags) \
+ do { \
+ localtry_lock_t *lt; \
+ local_irq_save(flags); \
+ lt = this_cpu_ptr(lock); \
+ local_lock_acquire(&lt->llock); \
+ WRITE_ONCE(lt->acquired, 1); \
+ } while (0)
+
+#define __localtry_trylock(lock) \
+ ({ \
+ localtry_lock_t *lt; \
+ bool _ret; \
+ \
+ preempt_disable(); \
+ lt = this_cpu_ptr(lock); \
+ if (!READ_ONCE(lt->acquired)) { \
+ WRITE_ONCE(lt->acquired, 1); \
+ local_trylock_acquire(&lt->llock); \
+ _ret = true; \
+ } else { \
+ _ret = false; \
+ preempt_enable(); \
+ } \
+ _ret; \
+ })
+
+#define __localtry_trylock_irqsave(lock, flags) \
+ ({ \
+ localtry_lock_t *lt; \
+ bool _ret; \
+ \
+ local_irq_save(flags); \
+ lt = this_cpu_ptr(lock); \
+ if (!READ_ONCE(lt->acquired)) { \
+ WRITE_ONCE(lt->acquired, 1); \
+ local_trylock_acquire(&lt->llock); \
+ _ret = true; \
+ } else { \
+ _ret = false; \
+ local_irq_restore(flags); \
+ } \
+ _ret; \
+ })
+
+#define __localtry_unlock(lock) \
+ do { \
+ localtry_lock_t *lt; \
+ lt = this_cpu_ptr(lock); \
+ WRITE_ONCE(lt->acquired, 0); \
+ local_lock_release(&lt->llock); \
+ preempt_enable(); \
+ } while (0)
+
+#define __localtry_unlock_irq(lock) \
+ do { \
+ localtry_lock_t *lt; \
+ lt = this_cpu_ptr(lock); \
+ WRITE_ONCE(lt->acquired, 0); \
+ local_lock_release(&lt->llock); \
+ local_irq_enable(); \
+ } while (0)
+
+#define __localtry_unlock_irqrestore(lock, flags) \
+ do { \
+ localtry_lock_t *lt; \
+ lt = this_cpu_ptr(lock); \
+ WRITE_ONCE(lt->acquired, 0); \
+ local_lock_release(&lt->llock); \
+ local_irq_restore(flags); \
+ } while (0)
+
#else /* !CONFIG_PREEMPT_RT */
/*
@@ -125,8 +237,10 @@ do { \
* critical section while staying preemptible.
*/
typedef spinlock_t local_lock_t;
+typedef spinlock_t localtry_lock_t;
#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
+#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname)
#define __local_lock_init(l) \
do { \
@@ -169,4 +283,36 @@ do { \
spin_unlock(this_cpu_ptr((lock))); \
} while (0)
+/* localtry_lock_t variants */
+
+#define __localtry_lock_init(lock) __local_lock_init(lock)
+#define __localtry_lock(lock) __local_lock(lock)
+#define __localtry_lock_irq(lock) __local_lock(lock)
+#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags)
+#define __localtry_unlock(lock) __local_unlock(lock)
+#define __localtry_unlock_irq(lock) __local_unlock(lock)
+#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags)
+
+#define __localtry_trylock(lock) \
+ ({ \
+ int __locked; \
+ \
+ if (in_nmi() | in_hardirq()) { \
+ __locked = 0; \
+ } else { \
+ migrate_disable(); \
+ __locked = spin_trylock(this_cpu_ptr((lock))); \
+ if (!__locked) \
+ migrate_enable(); \
+ } \
+ __locked; \
+ })
+
+#define __localtry_trylock_irqsave(lock, flags) \
+ ({ \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ __localtry_trylock(lock); \
+ })
+
#endif /* CONFIG_PREEMPT_RT */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0234f14f2aa6..75e8850cec3a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -99,6 +99,10 @@ struct page {
/* Or, free page */
struct list_head buddy_list;
struct list_head pcp_list;
+ struct {
+ struct llist_node pcp_llist;
+ unsigned int order;
+ };
};
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9540b41894da..e16939553930 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -972,6 +972,9 @@ struct zone {
/* Primarily protects free_area */
spinlock_t lock;
+ /* Pages to be freed when next trylock succeeds */
+ struct llist_head trylock_free_pages;
+
/* Write-intensive fields used by compaction and vmstats. */
CACHELINE_PADDING(_pad2_);