From a5f1783be29adae15666fd803efd7d2979130869 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 2 Mar 2022 12:02:22 +0100 Subject: lib/stackdepot: allow requesting early initialization dynamically In a later patch we want to add stackdepot support for object owner tracking in slub caches, which is enabled by slub_debug boot parameter. This creates a bootstrap problem as some caches are created early in boot when slab_is_available() is false and thus stack_depot_init() tries to use memblock. But, as reported by Hyeonggon Yoo [1] we are already beyond memblock_free_all(). Ideally memblock allocation should fail, yet it succeeds, but later the system crashes, which is a separately handled issue. To resolve this boostrap issue in a robust way, this patch adds another way to request stack_depot_early_init(), which happens at a well-defined point of time. In addition to build-time CONFIG_STACKDEPOT_ALWAYS_INIT, code that's e.g. processing boot parameters (which happens early enough) can call a new function stack_depot_want_early_init(), which sets a flag that stack_depot_early_init() will check. In this patch we also convert page_owner to this approach. While it doesn't have the bootstrap issue as slub, it's also a functionality enabled by a boot param and can thus request stack_depot_early_init() with memblock allocation instead of later initialization with kvmalloc(). As suggested by Mike, make stack_depot_early_init() only attempt memblock allocation and stack_depot_init() only attempt kvmalloc(). Also change the latter to kvcalloc(). In both cases we can lose the explicit array zeroing, which the allocations do already. As suggested by Marco, provide empty implementations of the init functions for !CONFIG_STACKDEPOT builds to simplify the callers. [1] https://lore.kernel.org/all/YhnUcqyeMgCrWZbd@ip-172-31-19-208.ap-northeast-1.compute.internal/ Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Suggested-by: Mike Rapoport Suggested-by: Marco Elver Signed-off-by: Vlastimil Babka Reviewed-by: Marco Elver Reviewed-and-tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Mike Rapoport Acked-by: David Rientjes --- lib/stackdepot.c | 67 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 22 deletions(-) (limited to 'lib/stackdepot.c') diff --git a/lib/stackdepot.c b/lib/stackdepot.c index bf5ba9af0500..5ca0d086ef4a 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -66,6 +66,9 @@ struct stack_record { unsigned long entries[]; /* Variable-sized array of entries. */ }; +static bool __stack_depot_want_early_init __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); +static bool __stack_depot_early_init_passed __initdata; + static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; static int depot_index; @@ -162,38 +165,58 @@ static int __init is_stack_depot_disabled(char *str) } early_param("stack_depot_disable", is_stack_depot_disabled); -/* - * __ref because of memblock_alloc(), which will not be actually called after - * the __init code is gone, because at that point slab_is_available() is true - */ -__ref int stack_depot_init(void) +void __init stack_depot_want_early_init(void) +{ + /* Too late to request early init now */ + WARN_ON(__stack_depot_early_init_passed); + + __stack_depot_want_early_init = true; +} + +int __init stack_depot_early_init(void) +{ + size_t size; + + /* This is supposed to be called only once, from mm_init() */ + if (WARN_ON(__stack_depot_early_init_passed)) + return 0; + + __stack_depot_early_init_passed = true; + + if (!__stack_depot_want_early_init || stack_depot_disable) + return 0; + + size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); + pr_info("Stack Depot early init allocating hash table with memblock_alloc, %zu bytes\n", + size); + stack_table = memblock_alloc(size, SMP_CACHE_BYTES); + + if (!stack_table) { + pr_err("Stack Depot hash table allocation failed, disabling\n"); + stack_depot_disable = true; + return -ENOMEM; + } + + return 0; +} + +int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); + int ret = 0; mutex_lock(&stack_depot_init_mutex); if (!stack_depot_disable && !stack_table) { - size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); - int i; - - if (slab_is_available()) { - pr_info("Stack Depot allocating hash table with kvmalloc\n"); - stack_table = kvmalloc(size, GFP_KERNEL); - } else { - pr_info("Stack Depot allocating hash table with memblock_alloc\n"); - stack_table = memblock_alloc(size, SMP_CACHE_BYTES); - } - if (stack_table) { - for (i = 0; i < STACK_HASH_SIZE; i++) - stack_table[i] = NULL; - } else { + pr_info("Stack Depot allocating hash table with kvcalloc\n"); + stack_table = kvcalloc(STACK_HASH_SIZE, sizeof(struct stack_record *), GFP_KERNEL); + if (!stack_table) { pr_err("Stack Depot hash table allocation failed, disabling\n"); stack_depot_disable = true; - mutex_unlock(&stack_depot_init_mutex); - return -ENOMEM; + ret = -ENOMEM; } } mutex_unlock(&stack_depot_init_mutex); - return 0; + return ret; } EXPORT_SYMBOL_GPL(stack_depot_init); -- cgit From f9987921cb541b1187a648141a9048547ea89ffb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 20 Jun 2022 17:02:49 +0200 Subject: lib/stackdepot: replace CONFIG_STACK_HASH_ORDER with automatic sizing As Linus explained [1], setting the stackdepot hash table size as a config option is suboptimal, especially as stackdepot becomes a dependency of less "expert" subsystems than initially (e.g. DRM, networking, SLUB_DEBUG): : (a) it introduces a new compile-time question that isn't sane to ask : a regular user, but is now exposed to regular users. : (b) this by default uses 1MB of memory for a feature that didn't in : the past, so now if you have small machines you need to make sure you : make a special kernel config for them. Ideally we would employ rhashtable for fully automatic resizing, which should be feasible for many of the new users, but problematic for the original users with restricted context that call __stack_depot_save() with can_alloc == false, i.e. KASAN. However we can easily remove the config option and scale the hash table automatically with system memory. The STACK_HASH_MASK constant becomes stack_hash_mask variable and is used only in one mask operation, so the overhead should be negligible to none. For early allocation we can employ the existing alloc_large_system_hash() function and perform similar scaling for the late allocation. The existing limits of the config option (between 4k and 1M buckets) are preserved, and scaling factor is set to one bucket per 16kB memory so on 64bit the max 1M buckets (8MB memory) is achieved with 16GB system, while a 1GB system will use 512kB. Because KASAN is reported to need the maximum number of buckets even with smaller amounts of memory [2], set it as such when kasan_enabled(). If needed, the automatic scaling could be complemented with a boot-time kernel parameter, but it feels pointless to add it without a specific use case. [1] https://lore.kernel.org/all/CAHk-=wjC5nS+fnf6EzRD9yQRJApAhxx7gRB87ZV+pAWo9oVrTg@mail.gmail.com/ [2] https://lore.kernel.org/all/CACT4Y+Y4GZfXOru2z5tFPzFdaSUd+GFc6KVL=bsa0+1m197cQQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220620150249.16814-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Linus Torvalds Acked-by: Dmitry Vyukov Cc: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Signed-off-by: Andrew Morton --- lib/stackdepot.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 10 deletions(-) (limited to 'lib/stackdepot.c') diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 5ca0d086ef4a..e73fda23388d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -32,6 +32,7 @@ #include #include #include +#include #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) @@ -145,10 +146,16 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return stack; } -#define STACK_HASH_SIZE (1L << CONFIG_STACK_HASH_ORDER) -#define STACK_HASH_MASK (STACK_HASH_SIZE - 1) +/* one hash table bucket entry per 16kB of memory */ +#define STACK_HASH_SCALE 14 +/* limited between 4k and 1M buckets */ +#define STACK_HASH_ORDER_MIN 12 +#define STACK_HASH_ORDER_MAX 20 #define STACK_HASH_SEED 0x9747b28c +static unsigned int stack_hash_order; +static unsigned int stack_hash_mask; + static bool stack_depot_disable; static struct stack_record **stack_table; @@ -175,7 +182,7 @@ void __init stack_depot_want_early_init(void) int __init stack_depot_early_init(void) { - size_t size; + unsigned long entries = 0; /* This is supposed to be called only once, from mm_init() */ if (WARN_ON(__stack_depot_early_init_passed)) @@ -183,13 +190,23 @@ int __init stack_depot_early_init(void) __stack_depot_early_init_passed = true; + if (kasan_enabled() && !stack_hash_order) + stack_hash_order = STACK_HASH_ORDER_MAX; + if (!__stack_depot_want_early_init || stack_depot_disable) return 0; - size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); - pr_info("Stack Depot early init allocating hash table with memblock_alloc, %zu bytes\n", - size); - stack_table = memblock_alloc(size, SMP_CACHE_BYTES); + if (stack_hash_order) + entries = 1UL << stack_hash_order; + stack_table = alloc_large_system_hash("stackdepot", + sizeof(struct stack_record *), + entries, + STACK_HASH_SCALE, + HASH_EARLY | HASH_ZERO, + NULL, + &stack_hash_mask, + 1UL << STACK_HASH_ORDER_MIN, + 1UL << STACK_HASH_ORDER_MAX); if (!stack_table) { pr_err("Stack Depot hash table allocation failed, disabling\n"); @@ -207,13 +224,35 @@ int stack_depot_init(void) mutex_lock(&stack_depot_init_mutex); if (!stack_depot_disable && !stack_table) { - pr_info("Stack Depot allocating hash table with kvcalloc\n"); - stack_table = kvcalloc(STACK_HASH_SIZE, sizeof(struct stack_record *), GFP_KERNEL); + unsigned long entries; + int scale = STACK_HASH_SCALE; + + if (stack_hash_order) { + entries = 1UL << stack_hash_order; + } else { + entries = nr_free_buffer_pages(); + entries = roundup_pow_of_two(entries); + + if (scale > PAGE_SHIFT) + entries >>= (scale - PAGE_SHIFT); + else + entries <<= (PAGE_SHIFT - scale); + } + + if (entries < 1UL << STACK_HASH_ORDER_MIN) + entries = 1UL << STACK_HASH_ORDER_MIN; + if (entries > 1UL << STACK_HASH_ORDER_MAX) + entries = 1UL << STACK_HASH_ORDER_MAX; + + pr_info("Stack Depot allocating hash table of %lu entries with kvcalloc\n", + entries); + stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { pr_err("Stack Depot hash table allocation failed, disabling\n"); stack_depot_disable = true; ret = -ENOMEM; } + stack_hash_mask = entries - 1; } mutex_unlock(&stack_depot_init_mutex); return ret; @@ -386,7 +425,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, goto fast_exit; hash = hash_stack(entries, nr_entries); - bucket = &stack_table[hash & STACK_HASH_MASK]; + bucket = &stack_table[hash & stack_hash_mask]; /* * Fast path: look the stack trace up without locking. -- cgit From 83a4f1ef45a90d740bc6edf6a2533b14a3e5d183 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:36 +0200 Subject: stackdepot: reserve 5 extra bits in depot_stack_handle_t Some users (currently only KMSAN) may want to use spare bits in depot_stack_handle_t. Let them do so by adding @extra_bits to __stack_depot_save() to store arbitrary flags, and providing stack_depot_get_extra_bits() to retrieve those flags. Also adapt KASAN to the new prototype by passing extra_bits=0, as KASAN does not intend to store additional information in the stack handle. Link: https://lkml.kernel.org/r/20220915150417.722975-3-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/stackdepot.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'lib/stackdepot.c') diff --git a/lib/stackdepot.c b/lib/stackdepot.c index e73fda23388d..79e894cf8406 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -43,7 +43,8 @@ #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ STACK_ALLOC_ALIGN) #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ - STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS) + STACK_ALLOC_NULL_PROTECTION_BITS - \ + STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) #define STACK_ALLOC_SLABS_CAP 8192 #define STACK_ALLOC_MAX_SLABS \ (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ @@ -56,6 +57,7 @@ union handle_parts { u32 slabindex : STACK_ALLOC_INDEX_BITS; u32 offset : STACK_ALLOC_OFFSET_BITS; u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; @@ -77,6 +79,14 @@ static int next_slab_inited; static size_t depot_offset; static DEFINE_RAW_SPINLOCK(depot_lock); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + + return parts.extra; +} +EXPORT_SYMBOL(stack_depot_get_extra_bits); + static bool init_stack_slab(void **prealloc) { if (!*prealloc) @@ -140,6 +150,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->handle.slabindex = depot_index; stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; stack->handle.valid = 1; + stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); depot_offset += required_size; @@ -382,6 +393,7 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * * @entries: Pointer to storage array * @nr_entries: Size of the storage array + * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags * @can_alloc: Allocate stack slabs (increased chance of failure if false) * @@ -393,6 +405,10 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * If the stack trace in @entries is from an interrupt, only the portion up to * interrupt entry is saved. * + * Additional opaque flags can be passed in @extra_bits, stored in the unused + * bits of the stack handle, and retrieved using stack_depot_get_extra_bits() + * without calling stack_depot_fetch(). + * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -402,10 +418,11 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; - depot_stack_handle_t retval = 0; + union handle_parts retval = { .handle = 0 }; struct page *page = NULL; void *prealloc = NULL; unsigned long flags; @@ -489,9 +506,11 @@ exit: free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); } if (found) - retval = found->handle.handle; + retval.handle = found->handle.handle; fast_exit: - return retval; + retval.extra = extra_bits; + + return retval.handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); @@ -511,6 +530,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, alloc_flags, true); + return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); -- cgit