diff options
Diffstat (limited to 'include')
56 files changed, 1483 insertions, 598 deletions
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 3c61c29ff6ab..11abad6c87e1 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1111,7 +1111,7 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot); void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot); + pgprot_t prot); void iounmap(volatile void __iomem *addr); void generic_iounmap(volatile void __iomem *addr); @@ -1120,7 +1120,7 @@ void generic_iounmap(volatile void __iomem *addr); static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { /* _PAGE_IOREMAP needs to be supplied by the architecture */ - return ioremap_prot(addr, size, _PAGE_IOREMAP); + return ioremap_prot(addr, size, __pgprot(_PAGE_IOREMAP)); } #endif #endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */ diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 6d1fb6162ac1..a3b5029aebbd 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,11 +19,12 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +/* avoid <linux/mm.h> include hell */ +extern unsigned long max_mapnr; + #ifndef pfn_valid static inline int pfn_valid(unsigned long pfn) { - /* avoid <linux/mm.h> include hell */ - extern unsigned long max_mapnr; unsigned long pfn_offset = ARCH_PFN_OFFSET; return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 94cbd50cc870..02aeca21479a 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -6,6 +6,19 @@ #include <linux/threads.h> #include <linux/percpu-defs.h> +/* + * __percpu_qual is the qualifier for the percpu named address space. + * + * Most arches use generic named address space for percpu variables but + * some arches define percpu variables in different named address space + * (on the x86 arch, percpu variable may be declared as being relative + * to the %fs or %gs segments using __seg_fs or __seg_gs named address + * space qualifier). + */ +#ifndef __percpu_qual +# define __percpu_qual +#endif + #ifdef CONFIG_SMP /* @@ -74,7 +87,7 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ \ *__p += val; \ *__p; \ @@ -82,8 +95,8 @@ do { \ #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __ret; \ __ret = *__p; \ *__p = nval; \ __ret; \ @@ -91,7 +104,7 @@ do { \ #define __cpu_fallback_try_cmpxchg(pcp, ovalp, nval, _cmpxchg) \ ({ \ - typeof(pcp) __val, __old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) __val, __old = *(ovalp); \ __val = _cmpxchg(pcp, __old, nval); \ if (__val != __old) \ *(ovalp) = __val; \ @@ -100,8 +113,8 @@ do { \ #define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __val = *__p, ___old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __val = *__p, ___old = *(ovalp); \ bool __ret; \ if (__val == ___old) { \ *__p = nval; \ @@ -115,14 +128,14 @@ do { \ #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __old = (oval); \ + TYPEOF_UNQUAL(pcp) __old = (oval); \ raw_cpu_generic_try_cmpxchg(pcp, &__old, nval); \ __old; \ }) #define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ preempt_disable_notrace(); \ ___ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ @@ -131,7 +144,7 @@ do { \ #define __this_cpu_generic_read_noirq(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ unsigned long ___flags; \ raw_local_irq_save(___flags); \ ___ret = raw_cpu_generic_read(pcp); \ @@ -141,7 +154,7 @@ do { \ #define this_cpu_generic_read(pcp) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ if (__native_word(pcp)) \ __ret = __this_cpu_generic_read_nopreempt(pcp); \ else \ @@ -160,7 +173,7 @@ do { \ #define this_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_add_return(pcp, val); \ @@ -170,7 +183,7 @@ do { \ #define this_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_xchg(pcp, nval); \ @@ -190,7 +203,7 @@ do { \ #define this_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e402aef79c93..d1adfba8387e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -67,22 +67,21 @@ * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * - * - tlb_remove_page() / __tlb_remove_page() - * - tlb_remove_page_size() / __tlb_remove_page_size() - * - __tlb_remove_folio_pages() + * - tlb_remove_page() / tlb_remove_page_size() + * - __tlb_remove_folio_pages() / __tlb_remove_page_size() + * - __tlb_remove_folio_pages_size() * - * __tlb_remove_page_size() is the basic primitive that queues a page for - * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a - * boolean indicating if the queue is (now) full and a call to - * tlb_flush_mmu() is required. + * __tlb_remove_folio_pages_size() is the basic primitive that queues pages + * for freeing. It will return a boolean indicating if the queue is (now) + * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * - * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however, - * instead of removing a single page, remove the given number of consecutive - * pages that are all part of the same (large) folio: just like calling - * __tlb_remove_page() on each page individually. + * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), + * however, instead of removing a single page, assume PAGE_SIZE and remove + * the given number of consecutive pages that are all part of the + * same (large) folio. * * - tlb_change_page_size() * @@ -489,16 +488,6 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, tlb_flush_mmu(tlb); } -static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, - struct page *page, bool delay_rmap) -{ - return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); -} - -/* tlb_remove_page - * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when - * required. - */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index bbc4730a6505..c0989b5b0407 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -13,7 +13,7 @@ * Don't use this unless you really need to: spin_lock() and spin_unlock() * are significantly faster. */ -static inline void bit_spin_lock(int bitnum, unsigned long *addr) +static __always_inline void bit_spin_lock(int bitnum, unsigned long *addr) { /* * Assuming the lock is uncontended, this never enters @@ -38,7 +38,7 @@ static inline void bit_spin_lock(int bitnum, unsigned long *addr) /* * Return true if it was acquired */ -static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +static __always_inline int bit_spin_trylock(int bitnum, unsigned long *addr) { preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -54,7 +54,7 @@ static inline int bit_spin_trylock(int bitnum, unsigned long *addr) /* * bit-based spin_unlock() */ -static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +static __always_inline void bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); @@ -71,7 +71,7 @@ static inline void bit_spin_unlock(int bitnum, unsigned long *addr) * non-atomic version, which can be used eg. if the bit lock itself is * protecting the rest of the flags in the word. */ -static inline void __bit_spin_unlock(int bitnum, unsigned long *addr) +static __always_inline void __bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index d8a8d245824a..4c506e76a808 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -18,6 +18,8 @@ enum bootmem_type { #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void __init register_page_bootmem_info_node(struct pglist_data *pgdat); +void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, + unsigned long nr_pages); void get_page_bootmem(unsigned long info, struct page *page, enum bootmem_type type); @@ -58,6 +60,11 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } +static inline void register_page_bootmem_memmap(unsigned long section_nr, + struct page *map, unsigned long nr_pages) +{ +} + static inline void put_page_bootmem(struct page *page) { } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index fab70b26e131..f0a4ad7839b6 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -270,7 +270,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); -void block_commit_write(struct page *page, unsigned int from, unsigned int to); +void block_commit_write(struct folio *folio, size_t from, size_t to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); diff --git a/include/linux/cma.h b/include/linux/cma.h index d15b64f51336..62d9c1cf6326 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -40,6 +40,9 @@ static inline int __init cma_declare_contiguous(phys_addr_t base, return cma_declare_contiguous_nid(base, size, limit, alignment, order_per_bit, fixed, name, res_cma, NUMA_NO_NODE); } +extern int __init cma_declare_contiguous_multi(phys_addr_t size, + phys_addr_t align, unsigned int order_per_bit, + const char *name, struct cma **res_cma, int nid); extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, @@ -50,12 +53,14 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); +extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern void cma_reserve_pages_on_error(struct cma *cma); #ifdef CONFIG_CMA struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); bool cma_free_folio(struct cma *cma, const struct folio *folio); +bool cma_validate_zones(struct cma *cma); #else static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { @@ -66,6 +71,10 @@ static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) { return false; } +static inline bool cma_validate_zones(struct cma *cma) +{ + return false; +} #endif #endif diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7bf0c521db63..173d9c07a895 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -95,7 +95,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern bool compaction_suitable(struct zone *zone, int order, - int highest_zoneidx); + unsigned long watermark, int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); @@ -113,7 +113,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline bool compaction_suitable(struct zone *zone, int order, - int highest_zoneidx) + unsigned long watermark, + int highest_zoneidx) { return false; } diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 2e7c2c282f3a..4fc8e26914ad 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -128,3 +128,11 @@ */ #define ASM_INPUT_G "ir" #define ASM_INPUT_RM "r" + +/* + * Declare compiler support for __typeof_unqual__() operator. + * + * Bindgen uses LLVM even if our C compiler is GCC, so we cannot + * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. + */ +#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ >= 19) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index c9b58188ec61..32048052c64a 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -137,3 +137,11 @@ #if GCC_VERSION < 90100 #undef __alloc_size__ #endif + +/* + * Declare compiler support for __typeof_unqual__() operator. + * + * Bindgen uses LLVM even if our C compiler is GCC, so we cannot + * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. + */ +#define CC_HAS_TYPEOF_UNQUAL (__GNUC__ >= 14) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 9fc30b6b80c9..27725f1ab5ab 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -226,6 +226,26 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __BUILD_BUG_ON_ZERO_MSG(!__is_noncstr(p), \ "must be non-C-string (not NUL-terminated)") +/* + * Use __typeof_unqual__() when available. + * + * XXX: Remove test for __CHECKER__ once + * sparse learns about __typeof_unqual__(). + */ +#if CC_HAS_TYPEOF_UNQUAL && !defined(__CHECKER__) +# define USE_TYPEOF_UNQUAL 1 +#endif + +/* + * Define TYPEOF_UNQUAL() to use __typeof_unqual__() as typeof + * operator when available, to return an unqualified type of the exp. + */ +#if defined(USE_TYPEOF_UNQUAL) +# define TYPEOF_UNQUAL(exp) __typeof_unqual__(exp) +#else +# define TYPEOF_UNQUAL(exp) __typeof__(exp) +#endif + #endif /* __KERNEL__ */ #if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e09d323be845..501cffddc2f4 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -57,7 +57,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __user BTF_TYPE_TAG(user) # endif # define __iomem -# define __percpu BTF_TYPE_TAG(percpu) +# define __percpu __percpu_qual BTF_TYPE_TAG(percpu) # define __rcu BTF_TYPE_TAG(rcu) # define __chk_user_ptr(x) (void)0 diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6cc5e484547c..1987400000b4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -116,7 +116,6 @@ enum cpuhp_state { CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, - CPUHP_MM_ZS_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, diff --git a/include/linux/damon.h b/include/linux/damon.h index c9074d569596..47e36e6ea203 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -36,6 +36,16 @@ struct damon_addr_range { }; /** + * struct damon_size_range - Represents size for filter to operate on [@min, @max]. + * @min: Min size (inclusive). + * @max: Max size (inclusive). + */ +struct damon_size_range { + unsigned long min; + unsigned long max; +}; + +/** * struct damon_region - Represents a monitoring target region. * @ar: The address range of the region. * @sampling_addr: Address of the sample for the next access check. @@ -324,8 +334,11 @@ struct damos_stat { /** * enum damos_filter_type - Type of memory for &struct damos_filter * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_ACTIVE: Active pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. + * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. + * @DAMOS_FILTER_TYPE_UNMAPPED: Unmapped pages. * @DAMOS_FILTER_TYPE_ADDR: Address range. * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. @@ -343,8 +356,11 @@ struct damos_stat { */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_ACTIVE, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, + DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, + DAMOS_FILTER_TYPE_UNMAPPED, DAMOS_FILTER_TYPE_ADDR, DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, @@ -360,6 +376,7 @@ enum damos_filter_type { * @target_idx: Index of the &struct damon_target of * &damon_ctx->adaptive_targets if @type is * DAMOS_FILTER_TYPE_TARGET. + * @sz_range: Size range if @type is DAMOS_FILTER_TYPE_HUGEPAGE_SIZE. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each @@ -376,6 +393,7 @@ struct damos_filter { unsigned short memcg_id; struct damon_addr_range addr_range; int target_idx; + struct damon_size_range sz_range; }; struct list_head list; }; @@ -432,6 +450,8 @@ struct damos_access_pattern { * @wmarks: Watermarks for automated (in)activation of this scheme. * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. + * @ops_filters: ops layer handling &struct damos_filter objects list. + * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -454,6 +474,15 @@ struct damos_access_pattern { * implementation could check pages of the region and skip &action to respect * &filters * + * The minimum entity that @action can be applied depends on the underlying + * &struct damon_operations. Since it may not be aligned with the core layer + * abstract, namely &struct damon_region, &struct damon_operations could apply + * @action to same entity multiple times. Large folios that underlying on + * multiple &struct damon region objects could be such examples. The &struct + * damon_operations can use @last_applied to avoid that. DAMOS core logic + * unsets @last_applied when each regions walking for applying the scheme is + * finished. + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -475,6 +504,9 @@ struct damos { * layer-handled filters. If true, operations layer allows it, too. */ bool core_filters_allowed; + /* whether to reject core/ops filters umatched regions */ + bool core_filters_default_reject; + bool ops_filters_default_reject; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; @@ -482,6 +514,8 @@ struct damos { int target_nid; }; struct list_head filters; + struct list_head ops_filters; + void *last_applied; struct damos_stat stat; struct list_head list; }; @@ -510,7 +544,6 @@ enum damon_ops_id { * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. - * @reset_aggregated: Reset aggregated accesses monitoring results. * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. @@ -522,8 +555,7 @@ enum damon_ops_id { * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after - * each &damon_attrs.aggr_interval. + * &damon_attrs.sample_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. @@ -538,8 +570,6 @@ enum damon_ops_id { * last preparation and update the number of observed accesses of each region. * It should also return max number of observed accesses that made as a result * of its update. The value will be used for regions adjustment threshold. - * @reset_aggregated should reset the access monitoring results that aggregated - * by @check_accesses. * @get_scheme_score should return the priority score of a region for a scheme * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided @@ -557,7 +587,6 @@ struct damon_operations { void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); - void (*reset_aggregated)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); @@ -571,43 +600,28 @@ struct damon_operations { /** * struct damon_callback - Monitoring events notification callbacks. * - * @before_start: Called before starting the monitoring. * @after_wmarks_check: Called after each schemes' watermarks check. - * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. - * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. - * @private: User private data. * - * The monitoring thread (&damon_ctx.kdamond) calls @before_start and - * @before_terminate just before starting and finishing the monitoring, - * respectively. Therefore, those are good places for installing and cleaning - * @private. + * The monitoring thread (&damon_ctx.kdamond) calls @before_terminate just + * before finishing the monitoring. * * The monitoring thread calls @after_wmarks_check after each DAMON-based * operation schemes' watermarks check. If users need to make changes to the * attributes of the monitoring context while it's deactivated due to the * watermarks, this is the good place to do. * - * The monitoring thread calls @after_sampling and @after_aggregation for each - * of the sampling intervals and aggregation intervals, respectively. - * Therefore, users can safely access the monitoring results without additional - * protection. For the reason, users are recommended to use these callback for - * the accesses to the results. + * The monitoring thread calls @after_aggregation for each of the aggregation + * intervals. Therefore, users can safely access the monitoring results + * without additional protection. For the reason, users are recommended to use + * these callback for the accesses to the results. * * If any callback returns non-zero, monitoring stops. */ struct damon_callback { - void *private; - - int (*before_start)(struct damon_ctx *context); int (*after_wmarks_check)(struct damon_ctx *context); - int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_damos_apply)(struct damon_ctx *context, - struct damon_target *target, - struct damon_region *region, - struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; @@ -633,11 +647,37 @@ struct damon_call_control { }; /** + * struct damon_intervals_goal - Monitoring intervals auto-tuning goal. + * + * @access_bp: Access events observation ratio to achieve in bp. + * @aggrs: Number of aggregations to acheive @access_bp within. + * @min_sample_us: Minimum resulting sampling interval in microseconds. + * @max_sample_us: Maximum resulting sampling interval in microseconds. + * + * DAMON automatically tunes &damon_attrs->sample_interval and + * &damon_attrs->aggr_interval aiming the ratio in bp (1/10,000) of + * DAMON-observed access events to theoretical maximum amount within @aggrs + * aggregations be same to @access_bp. The logic increases + * &damon_attrs->aggr_interval and &damon_attrs->sampling_interval in same + * ratio if the current access events observation ratio is lower than the + * target for each @aggrs aggregations, and vice versa. + * + * If @aggrs is zero, the tuning is disabled and hence this struct is ignored. + */ +struct damon_intervals_goal { + unsigned long access_bp; + unsigned long aggrs; + unsigned long min_sample_us; + unsigned long max_sample_us; +}; + +/** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @intervals_goal: Intervals auto-tuning goal. * @min_nr_regions: The minimum number of adaptive monitoring * regions. * @max_nr_regions: The maximum number of adaptive monitoring @@ -657,8 +697,20 @@ struct damon_attrs { unsigned long sample_interval; unsigned long aggr_interval; unsigned long ops_update_interval; + struct damon_intervals_goal intervals_goal; unsigned long min_nr_regions; unsigned long max_nr_regions; +/* private: internal use only */ + /* + * @aggr_interval to @sample_interval ratio. + * Core-external components call damon_set_attrs() with &damon_attrs + * that this field is unset. In the case, damon_set_attrs() sets this + * field of resulting &damon_attrs. Core-internal components such as + * kdamond_tune_intervals() calls damon_set_attrs() with &damon_attrs + * that this field is set. In the case, damon_set_attrs() just keep + * it. + */ + unsigned long aggr_samples; }; /** @@ -707,6 +759,11 @@ struct damon_ctx { * update */ unsigned long next_ops_update_sis; + /* + * number of sample intervals that should be passed before next + * intervals tuning + */ + unsigned long next_intervals_tune_sis; /* for waiting until the execution of the kdamond_fn is started */ struct completion kdamond_started; /* for scheme quotas prioritization */ @@ -788,6 +845,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damos_for_each_filter_safe(f, next, scheme) \ list_for_each_entry_safe(f, next, &(scheme)->filters, list) +#define damos_for_each_ops_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->ops_filters, list) + +#define damos_for_each_ops_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->ops_filters, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -813,6 +876,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damos_filter *damos_new_filter(enum damos_filter_type type, bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); +bool damos_filter_for_ops(enum damos_filter_type type); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( diff --git a/include/linux/dax.h b/include/linux/dax.h index df41a0017b31..dcc9fcdf14e4 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -207,6 +207,11 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); +static inline bool dax_page_is_idle(struct page *page) +{ + return page && page_ref_count(page) == 0; +} + #if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); @@ -220,6 +225,19 @@ static inline void dax_read_unlock(int id) { } #endif /* CONFIG_DAX */ + +#if !IS_ENABLED(CONFIG_FS_DAX) +static inline int __must_check dax_break_layout(struct inode *inode, + loff_t start, loff_t end, void (cb)(struct inode *)) +{ + return 0; +} + +static inline void dax_break_layout_final(struct inode *inode) +{ +} +#endif + bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, @@ -241,8 +259,18 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); +void dax_delete_mapping_range(struct address_space *mapping, + loff_t start, loff_t end); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int __must_check dax_break_layout(struct inode *inode, loff_t start, + loff_t end, void (cb)(struct inode *)); +static inline int __must_check dax_break_layout_inode(struct inode *inode, + void (cb)(struct inode *)) +{ + return dax_break_layout(inode, 0, LLONG_MAX, cb); +} +void dax_break_layout_final(struct inode *inode); int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, diff --git a/include/linux/fb.h b/include/linux/fb.h index 5ba187e08cf7..cd653862ab99 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -225,6 +225,7 @@ struct fb_deferred_io { int open_count; /* number of opened files; protected by fb_info lock */ struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ + struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 4d3f8074c137..45ad2408a80c 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -15,6 +15,7 @@ #define _LINUX_FOLIO_QUEUE_H #include <linux/pagevec.h> +#include <linux/mm.h> /* * Segment in a queue of running buffers. Each segment can hold a number of @@ -216,13 +217,6 @@ static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) clear_bit(slot, &folioq->marks3); } -static inline unsigned int __folio_order(struct folio *folio) -{ - if (!folio_test_large(folio)) - return 0; - return folio->_flags_1 & 0xff; -} - /** * folioq_append: Add a folio to a folio queue segment * @folioq: The segment to add to @@ -241,7 +235,7 @@ static inline unsigned int folioq_append(struct folio_queue *folioq, struct foli unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; - folioq->orders[slot] = __folio_order(folio); + folioq->orders[slot] = folio_order(folio); return slot; } @@ -263,7 +257,7 @@ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; - folioq->orders[slot] = __folio_order(folio); + folioq->orders[slot] = folio_order(folio); folioq_mark(folioq, slot); return slot; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 93e509b6c00e..e893d546a49f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,6 +39,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, + bool write); +vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, + bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, @@ -341,6 +345,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); +bool uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns); +bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns); +int folio_split(struct folio *folio, unsigned int new_order, struct page *page, + struct list_head *list); +/* + * try_folio_split - try to split a @folio at @page using non uniform split. + * @folio: folio to be split + * @page: split to order-0 at the given page + * @list: store the after-split folios + * + * Try to split a @folio at @page using non uniform split to order-0, if + * non uniform split is not supported, fall back to uniform split. + * + * Return: 0: split is successful, otherwise split failed. + */ +static inline int try_folio_split(struct folio *folio, struct page *page, + struct list_head *list) +{ + int ret = min_order_for_split(folio); + + if (ret < 0) + return ret; + + if (!non_uniform_split_supported(folio, 0, false)) + return split_huge_page_to_list_to_order(&folio->page, list, + ret); + return folio_split(folio, ret, page, list); +} static inline int split_huge_page(struct page *page) { struct folio *folio = page_folio(page); @@ -404,7 +438,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, - unsigned long end, long adjust_next); + unsigned long end, struct vm_area_struct *next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); @@ -533,6 +567,12 @@ static inline int split_folio_to_list(struct folio *folio, struct list_head *lis return 0; } +static inline int try_folio_split(struct folio *folio, struct page *page, + struct list_head *list) +{ + return 0; +} + static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) @@ -571,7 +611,7 @@ static inline int madvise_collapse(struct vm_area_struct *vma, static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, - long adjust_next) + struct vm_area_struct *next) { } static inline int is_swap_pmd(pmd_t pmd) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 76a75ec03dd6..8f3ac832ee7f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -174,6 +174,9 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; +void hugetlb_bootmem_alloc(void); +bool hugetlb_bootmem_allocated(void); + /* arch callbacks */ #ifndef CONFIG_HIGHPTE @@ -588,6 +591,7 @@ enum hugetlb_page_flags { HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, + HPG_cma, __NR_HPAGEFLAGS, }; @@ -647,6 +651,7 @@ HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) +HPAGEFLAG(Cma, cma) #ifdef CONFIG_HUGETLB_PAGE @@ -675,11 +680,21 @@ struct hstate { char name[HSTATE_NAME_LEN]; }; +struct cma; + struct huge_bootmem_page { struct list_head list; struct hstate *hstate; + unsigned long flags; + struct cma *cma; }; +#define HUGE_BOOTMEM_HVO 0x0001 +#define HUGE_BOOTMEM_ZONES_VALID 0x0002 +#define HUGE_BOOTMEM_CMA 0x0004 + +bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); + int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); @@ -815,6 +830,17 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, } #endif +#ifndef arch_has_huge_bootmem_alloc +/* + * Some architectures do their own bootmem allocation, so they can't use + * early CMA allocation. + */ +static inline bool arch_has_huge_bootmem_alloc(void) +{ + return false; +} +#endif + static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); @@ -1257,6 +1283,15 @@ static inline bool hugetlbfs_pagecache_present( { return false; } + +static inline void hugetlb_bootmem_alloc(void) +{ +} + +static inline bool hugetlb_bootmem_allocated(void) +{ + return false; +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e79eb6ac516f..ef5a1ecc6e59 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -133,7 +133,6 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); -void memblock_free_all(void); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e74b8254d9b..53364526d877 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -438,9 +438,7 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) */ static inline bool folio_memcg_charged(struct folio *folio) { - if (folio_memcg_kmem(folio)) - return __folio_objcg(folio) != NULL; - return __folio_memcg(folio) != NULL; + return folio->memcg_data != 0; } /* @@ -649,8 +647,6 @@ int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); - void __mem_cgroup_uncharge(struct folio *folio); /** @@ -1040,7 +1036,9 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, int old_order, int new_order); +void split_page_memcg(struct page *first, unsigned order); +void folio_split_memcg_refs(struct folio *folio, unsigned old_order, + unsigned new_order); static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { @@ -1165,10 +1163,6 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, return 0; } -static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) -{ -} - static inline void mem_cgroup_uncharge(struct folio *folio) { } @@ -1465,7 +1459,12 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, int old_order, int new_order) +static inline void split_page_memcg(struct page *first, unsigned order) +{ +} + +static inline void folio_split_memcg_refs(struct folio *folio, + unsigned old_order, unsigned new_order) { } @@ -1848,6 +1847,9 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } +void memcg1_swapout(struct folio *folio, swp_entry_t entry); +void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages); + #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -1875,6 +1877,14 @@ static inline void mem_cgroup_exit_user_fault(void) { } +static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) +{ +} + +static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +{ +} + #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/memory.h b/include/linux/memory.h index c0afee5d126e..12daa6ec7d09 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -25,7 +25,7 @@ /** * struct memory_group - a logical group of memory blocks * @nid: The node id for all memory blocks inside the memory group. - * @blocks: List of all memory blocks belonging to this memory group. + * @memory_blocks: List of all memory blocks belonging to this memory group. * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this * memory group. * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 3f7143ade32c..4aa151914eab 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -161,7 +161,7 @@ static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; + page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE; } static inline bool folio_is_device_private(const struct folio *folio) @@ -173,13 +173,13 @@ static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; + page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; } static inline bool is_device_coherent_page(const struct page *page) { return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_COHERENT; + page_pgmap(page)->type == MEMORY_DEVICE_COHERENT; } static inline bool folio_is_device_coherent(const struct folio *folio) @@ -187,6 +187,17 @@ static inline bool folio_is_device_coherent(const struct folio *folio) return is_device_coherent_page(&folio->page); } +static inline bool is_fsdax_page(const struct page *page) +{ + return is_zone_device_page(page) && + page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX; +} + +static inline bool folio_is_fsdax(const struct folio *folio) +{ + return is_fsdax_page(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 80891120cca9..aaa2114498d6 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -205,8 +205,8 @@ struct migrate_vma { unsigned long end; /* - * Set to the owner value also stored in page->pgmap->owner for - * migrating out of device private memory. The flags also need to + * Set to the owner value also stored in page_pgmap(page)->owner + * for migrating out of device private memory. The flags also need to * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE. * The caller should always set this field when using mmu notifier * callbacks to avoid device MMU invalidations for device private diff --git a/include/linux/mm.h b/include/linux/mm.h index beba5ba0fd97..32ba0e33422b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include <linux/memremap.h> #include <linux/slab.h> #include <linux/cacheinfo.h> +#include <linux/rcuwait.h> struct mempolicy; struct anon_vma; @@ -40,20 +41,10 @@ struct user_struct; struct pt_regs; struct folio_batch; +void arch_mm_preinit(void); void mm_core_init(void); void init_mm_internals(void); -#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; - -static inline void set_max_mapnr(unsigned long limit) -{ - max_mapnr = limit; -} -#else -static inline void set_max_mapnr(unsigned long limit) { } -#endif - extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { @@ -242,8 +233,6 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); -/* Use only if VMA has no other users */ -void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -682,13 +671,57 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); + vma->vm_lock_seq = UINT_MAX; +} + +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. + */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { + int oldcnt; + /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need @@ -696,16 +729,26 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) - return false; + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) + return NULL; - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) - return false; + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); /* - * Overflow might produce false locked result. + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are @@ -713,18 +756,47 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock->lock); - return false; + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); + return NULL; } + + return vma; +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + int oldcnt; + + mmap_assert_locked(vma->vm_mm); + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); return true; } +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked(struct vm_area_struct *vma) +{ + return vma_start_read_locked_nested(vma, 0); +} + static inline void vma_end_read(struct vm_area_struct *vma) { - rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock->lock); - rcu_read_unlock(); + vma_refcount_put(vma); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ @@ -740,6 +812,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l return (vma->vm_lock_seq == *mm_lock_seq); } +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently @@ -752,15 +826,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock->lock); - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); + __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -772,18 +838,36 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock->lock)) - vma_assert_write_locked(vma); + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma, &mm_lock_seq), vma); +} + +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ +static inline void vma_assert_attached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); } -static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +static inline void vma_assert_detached(struct vm_area_struct *vma) { - /* When detaching vma should be write-locked */ - if (detached) - vma_assert_write_locked(vma); - vma->detached = detached; + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); } +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set_release(&vma->vm_refcnt, 1); +} + +void vma_mark_detached(struct vm_area_struct *vma); + static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) @@ -805,14 +889,18 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ -static inline bool vma_start_read(struct vm_area_struct *vma) - { return false; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) + { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } -static inline void vma_mark_detached(struct vm_area_struct *vma, - bool detached) {} +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) @@ -839,18 +927,13 @@ static inline void assert_fault_locked(struct vm_fault *vmf) extern const struct vm_operations_struct vma_dummy_vm_ops; -/* - * WARNING: vma_init does not initialize vma->vm_lock. - * Use vm_area_alloc()/vm_area_free() if vma needs locking. - */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_mark_detached(vma, false); - vma_numab_state_init(vma); + vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ @@ -1043,6 +1126,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } @@ -1083,6 +1167,25 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +extern void prep_compound_page(struct page *page, unsigned int order); + +static inline unsigned int folio_large_order(const struct folio *folio) +{ + return folio->_flags_1 & 0xff; +} + +#ifdef NR_PAGES_IN_LARGE_FOLIO +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return folio->_nr_pages; +} +#else +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return 1L << folio_large_order(folio); +} +#endif + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -1096,7 +1199,7 @@ static inline unsigned int compound_order(struct page *page) if (!test_bit(PG_head, &folio->flags)) return 0; - return folio->_flags_1 & 0xff; + return folio_large_order(folio); } /** @@ -1112,7 +1215,7 @@ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; - return folio->_flags_1 & 0xff; + return folio_large_order(folio); } #include <linux/huge_mm.h> @@ -1205,6 +1308,8 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) + return 0; return atomic_read(&folio->_entire_mapcount) + 1; } @@ -1404,25 +1509,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf); * back into memory. */ -#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) -DECLARE_STATIC_KEY_FALSE(devmap_managed_key); - -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - if (!static_branch_unlikely(&devmap_managed_key)) - return false; - if (!folio_is_zone_device(folio)) - return false; - return __put_devmap_managed_folio_refs(folio, refs); -} -#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - return false; -} -#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ - /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) @@ -1543,12 +1629,6 @@ static inline void put_page(struct page *page) if (folio_test_slab(folio)) return; - /* - * For some devmap managed pages we need to catch refcount transition - * from 2 to 1: - */ - if (put_devmap_managed_folio_refs(folio, 1)) - return; folio_put(folio); } @@ -1907,6 +1987,13 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } +static inline bool folio_has_pincount(const struct folio *folio) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return folio_test_large(folio); + return folio_order(folio) > 1; +} + /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -1923,7 +2010,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * - * For large folios, the result will be exactly correct. That's because + * For most large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * @@ -1934,7 +2021,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { - if (folio_test_large(folio)) + if (folio_has_pincount(folio)) return atomic_read(&folio->_pincount) > 0; /* @@ -2006,6 +2093,13 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio) if (folio_is_device_coherent(folio)) return false; + /* + * Filesystems can only tolerate transient delays to truncate and + * hole-punch operations + */ + if (folio_is_fsdax(folio)) + return false; + /* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio); @@ -2049,11 +2143,7 @@ static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; -#ifdef CONFIG_64BIT - return folio->_folio_nr_pages; -#else - return 1L << (folio->_flags_1 & 0xff); -#endif + return folio_large_nr_pages(folio); } /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ @@ -2068,24 +2158,20 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline unsigned long compound_nr(struct page *page) +static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; -#ifdef CONFIG_64BIT - return folio->_folio_nr_pages; -#else - return 1L << (folio->_flags_1 & 0xff); -#endif + return folio_large_nr_pages(folio); } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ -static inline int thp_nr_pages(struct page *page) +static inline long thp_nr_pages(struct page *page) { return folio_nr_pages((struct folio *)page); } @@ -2140,23 +2226,18 @@ static inline size_t folio_size(const struct folio *folio) } /** - * folio_likely_mapped_shared - Estimate if the folio is mapped into the page - * tables of more than one MM + * folio_maybe_mapped_shared - Whether the folio is mapped into the page + * tables of more than one MM * @folio: The folio. * - * This function checks if the folio is currently mapped into more than one - * MM ("mapped shared"), or if the folio is only mapped into a single MM - * ("mapped exclusively"). + * This function checks if the folio maybe currently mapped into more than one + * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single + * MM ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * - * As precise information is not easily available for all folios, this function - * estimates the number of MMs ("sharers") that are currently mapping a folio - * using the number of times the first page of the folio is currently mapped - * into page tables. - * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are @@ -2164,8 +2245,8 @@ static inline size_t folio_size(const struct folio *folio) * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly - * indicate "mapped exclusively" (false negative) when the folio is - * only partially mapped into at least one MM. + * indicate "mapped shared" (false positive) if a folio was mapped by + * more than two MMs at one point in time. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. @@ -2182,7 +2263,7 @@ static inline size_t folio_size(const struct folio *folio) * * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline bool folio_likely_mapped_shared(struct folio *folio) +static inline bool folio_maybe_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); @@ -2190,16 +2271,22 @@ static inline bool folio_likely_mapped_shared(struct folio *folio) if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; - /* A single mapping implies "mapped exclusively". */ - if (mapcount <= 1) - return false; - - /* If any page is mapped more than once we treat it "mapped shared". */ - if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio)) + /* + * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... + * simply assume "mapped shared", nobody should really care + * about this for arbitrary kernel allocations. + */ + if (!IS_ENABLED(CONFIG_MM_ID)) return true; - /* Let's guess based on the first subpage. */ - return atomic_read(&folio->_mapcount) > 0; + /* + * A single mapping implies "mapped exclusively", even if the + * folio flag says something different: it's easier to handle this + * case here instead of on the RMAP hot path. + */ + if (mapcount <= 1) + return false; + return folio_test_large_maybe_mapped_shared(folio); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE @@ -3179,7 +3266,6 @@ extern void reserve_bootmem_region(phys_addr_t start, /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); -#define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { @@ -3539,6 +3625,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, @@ -3817,6 +3905,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) #endif void *sparse_buffer_alloc(unsigned long size); +unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); @@ -3825,7 +3914,8 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap, struct page *reuse); + struct vmem_altmap *altmap, unsigned long ptpfn, + unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, @@ -3841,6 +3931,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, @@ -3907,9 +4003,6 @@ static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, } #endif -void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, - unsigned long nr_pages); - enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 75e8850cec3a..56d07edd01f9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,6 +19,7 @@ #include <linux/workqueue.h> #include <linux/seqlock.h> #include <linux/percpu_counter.h> +#include <linux/types.h> #include <asm/mmu.h> @@ -133,8 +134,11 @@ struct page { unsigned long compound_head; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ - /** @pgmap: Points to the hosting device page map. */ - struct dev_pagemap *pgmap; + /* + * The first word is used for compound_head or folio + * pgmap + */ + void *_unused_pgmap_compound_head; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being @@ -287,6 +291,49 @@ typedef struct { unsigned long val; } swp_entry_t; +#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) +/* We have some extra room after the refcount in tail pages. */ +#define NR_PAGES_IN_LARGE_FOLIO +#endif + +/* + * On 32bit, we can cut the required metadata in half, because: + * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have, + * so we can limit MM IDs to 15 bit (32767). + * (b) We don't expect folios where even a single complete PTE mapping by + * one MM would exceed 15 bits (order-15). + */ +#ifdef CONFIG_64BIT +typedef int mm_id_mapcount_t; +#define MM_ID_MAPCOUNT_MAX INT_MAX +typedef unsigned int mm_id_t; +#else /* !CONFIG_64BIT */ +typedef short mm_id_mapcount_t; +#define MM_ID_MAPCOUNT_MAX SHRT_MAX +typedef unsigned short mm_id_t; +#endif /* CONFIG_64BIT */ + +/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */ +#define MM_ID_DUMMY 0 +#define MM_ID_MIN (MM_ID_DUMMY + 1) + +/* + * We leave the highest bit of each MM id unused, so we can store a flag + * in the highest bit of each folio->_mm_id[]. + */ +#define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1) +#define MM_ID_MASK ((1U << MM_ID_BITS) - 1) +#define MM_ID_MAX MM_ID_MASK + +/* + * In order to use bit_spin_lock(), which requires an unsigned long, we + * operate on folio->_mm_ids when working on flags. + */ +#define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS +#define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM) +#define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1) +#define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM) + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -296,6 +343,8 @@ typedef struct { * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. + * @share: number of DAX mappings that reference this folio. See + * dax_associate_entry. * @private: Filesystem per-folio data (see folio_attach_private()). * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. Use folio_mapcount() to @@ -303,13 +352,17 @@ typedef struct { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @pgmap: Metadata for ZONE_DEVICE mappings * @virtual: Virtual address in the kernel direct map. * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_large_mapcount: Do not use directly, call folio_mapcount(). * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). - * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). + * @_nr_pages: Do not use directly, call folio_nr_pages(). + * @_mm_id: Do not use outside of rmap code. + * @_mm_ids: Do not use outside of rmap code. + * @_mm_id_mapcount: Do not use outside of rmap code. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. @@ -341,9 +394,13 @@ struct folio { /* private: */ }; /* public: */ + struct dev_pagemap *pgmap; }; struct address_space *mapping; - pgoff_t index; + union { + pgoff_t index; + unsigned long share; + }; union { void *private; swp_entry_t swap; @@ -369,14 +426,30 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + union { + struct { /* public: */ - atomic_t _large_mapcount; - atomic_t _entire_mapcount; - atomic_t _nr_pages_mapped; - atomic_t _pincount; + atomic_t _large_mapcount; + atomic_t _nr_pages_mapped; #ifdef CONFIG_64BIT - unsigned int _folio_nr_pages; -#endif + atomic_t _entire_mapcount; + atomic_t _pincount; +#endif /* CONFIG_64BIT */ + mm_id_mapcount_t _mm_id_mapcount[2]; + union { + mm_id_t _mm_id[2]; + unsigned long _mm_ids; + }; + /* private: the union with struct page is transitional */ + }; + unsigned long _usable_1[4]; + }; + atomic_t _mapcount_1; + atomic_t _refcount_1; + /* public: */ +#ifdef NR_PAGES_IN_LARGE_FOLIO + unsigned int _nr_pages; +#endif /* NR_PAGES_IN_LARGE_FOLIO */ /* private: the union with struct page is transitional */ }; struct page __page_1; @@ -386,20 +459,27 @@ struct folio { unsigned long _flags_2; unsigned long _head_2; /* public: */ - void *_hugetlb_subpool; - void *_hugetlb_cgroup; - void *_hugetlb_cgroup_rsvd; - void *_hugetlb_hwpoison; + struct list_head _deferred_list; +#ifndef CONFIG_64BIT + atomic_t _entire_mapcount; + atomic_t _pincount; +#endif /* !CONFIG_64BIT */ /* private: the union with struct page is transitional */ }; + struct page __page_2; + }; + union { struct { - unsigned long _flags_2a; - unsigned long _head_2a; + unsigned long _flags_3; + unsigned long _head_3; /* public: */ - struct list_head _deferred_list; + void *_hugetlb_subpool; + void *_hugetlb_cgroup; + void *_hugetlb_cgroup_rsvd; + void *_hugetlb_hwpoison; /* private: the union with struct page is transitional */ }; - struct page __page_2; + struct page __page_3; }; }; @@ -428,14 +508,20 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); +FOLIO_MATCH(_mapcount, _mapcount_1); +FOLIO_MATCH(_refcount, _refcount_1); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); -FOLIO_MATCH(flags, _flags_2a); -FOLIO_MATCH(compound_head, _head_2a); +#undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + 3 * sizeof(struct page)) +FOLIO_MATCH(flags, _flags_3); +FOLIO_MATCH(compound_head, _head_3); #undef FOLIO_MATCH /** @@ -578,6 +664,12 @@ static inline void *folio_get_private(struct folio *folio) typedef unsigned long vm_flags_t; /* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + +/* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that * map parts of them. @@ -633,9 +725,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -struct vma_lock { - struct rw_semaphore lock; -}; +#define VMA_LOCK_OFFSET 0x40000000 +#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) struct vma_numab_state { /* @@ -681,6 +772,9 @@ struct vma_numab_state { * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference. + * + * WARNING: when adding new members, please update vm_area_init_from() to copy + * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -691,9 +785,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. */ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* @@ -714,18 +806,12 @@ struct vm_area_struct { #ifdef CONFIG_PER_VMA_LOCK /* - * Flag to indicate areas detached from the mm->mm_mt tree. - * Unstable RCU readers are allowed to read this. - */ - bool detached; - - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -734,20 +820,7 @@ struct vm_area_struct { * slowpath. */ unsigned int vm_lock_seq; - /* Unstable RCU readers are allowed to read this. */ - struct vma_lock *vm_lock; #endif - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma @@ -767,14 +840,6 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -787,6 +852,30 @@ struct vm_area_struct { #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + refcount_t vm_refcnt ____cacheline_aligned_in_smp; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map vmlock_dep_map; +#endif +#endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; @@ -922,6 +1011,7 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + struct rcuwait vma_writer_wait; /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. @@ -1074,6 +1164,9 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN_WALKS_MMU */ +#ifdef CONFIG_MM_ID + mm_id_t mm_id; +#endif /* CONFIG_MM_ID */ } __randomize_layout; /* diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 45a21faa3ff6..4706c6769902 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -122,12 +122,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int #endif /* CONFIG_PER_VMA_LOCK */ -static inline void mmap_init_lock(struct mm_struct *mm) -{ - init_rwsem(&mm->mmap_lock); - mm_lock_seqcount_init(mm); -} - static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e2dd57ca368b..bc2402a45741 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -43,10 +43,10 @@ struct mmu_interval_notifier; * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * - * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no - * longer have exclusive access to the page. When sent during creation of an - * exclusive range the owner will be initialised to the value provided by the - * caller of make_device_exclusive_range(), otherwise the owner will be NULL. + * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive. + * The owner is initialized to the value provided by the caller of + * make_device_exclusive(), such that this caller can filter out these + * events. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e16939553930..25e80b2ca7f4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -138,6 +138,7 @@ enum numa_stat_item { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, @@ -220,9 +221,11 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, + PGDEMOTE_PROACTIVE, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif + NR_BALLOON_PAGES, NR_VM_NODE_STAT_ITEMS }; @@ -1161,6 +1164,12 @@ static inline bool is_zone_device_page(const struct page *page) return page_zonenum(page) == ZONE_DEVICE; } +static inline struct dev_pagemap *page_pgmap(const struct page *page) +{ + VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); + return page_folio(page)->pgmap; +} + /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different @@ -1176,7 +1185,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, return false; if (!is_zone_device_page(a)) return true; - return a->pgmap == b->pgmap; + return page_pgmap(a) == page_pgmap(b); } extern void memmap_init_zone_device(struct zone *, unsigned long, @@ -1191,6 +1200,10 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, { return true; } +static inline struct dev_pagemap *page_pgmap(const struct page *page) +{ + return NULL; +} #endif static inline bool folio_is_zone_device(const struct folio *folio) @@ -1937,6 +1950,9 @@ enum { #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT + SECTION_IS_VMEMMAP_PREINIT_BIT, +#endif SECTION_MAP_LAST_BIT, }; @@ -1947,6 +1963,9 @@ enum { #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +#define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) +#endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT @@ -2001,6 +2020,30 @@ static inline int online_device_section(struct mem_section *section) } #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +static inline int preinited_vmemmap_section(struct mem_section *section) +{ + return (section && + (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); +} + +void sparse_vmemmap_init_nid_early(int nid); +void sparse_vmemmap_init_nid_late(int nid); + +#else +static inline int preinited_vmemmap_section(struct mem_section *section) +{ + return 0; +} +static inline void sparse_vmemmap_init_nid_early(int nid) +{ +} + +static inline void sparse_vmemmap_init_nid_late(int nid) +{ +} +#endif + static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); @@ -2038,6 +2081,9 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) } #endif +void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, + unsigned long flags); + #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN @@ -2100,6 +2146,11 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) return -1; } +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start - 1); \ + section_nr != -1; \ + section_nr = next_present_section_nr(section_nr)) + /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate @@ -2119,6 +2170,8 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0) +#define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index df9234e5f478..5bd9492a66ee 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -673,12 +673,6 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) -/* - * Different with flags above, this flag is used only for fsdax mode. It - * indicates that this page->mapping is now under reflink case. - */ -#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) - static __always_inline bool folio_mapping_flags(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; @@ -1106,6 +1100,12 @@ static inline bool is_page_hwpoison(const struct page *page) return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } +static inline bool folio_contain_hwpoisoned_page(struct folio *folio) +{ + return folio_test_hwpoison(folio) || + (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)); +} + bool is_free_buddy_page(const struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); @@ -1193,6 +1193,10 @@ static inline int folio_has_private(const struct folio *folio) return !!(folio->flags & PAGE_FLAGS_PRIVATE); } +static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio) +{ + return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); +} #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 46406f3fe34d..d649b6bbbc87 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -9,10 +9,12 @@ struct page_counter { /* - * Make sure 'usage' does not share cacheline with any other field. The - * memcg->memory.usage is a hot member of struct mem_cgroup. + * Make sure 'usage' does not share cacheline with any other field in + * v2. The memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; + unsigned long failcnt; /* v1-only field */ + CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ @@ -28,12 +30,12 @@ struct page_counter { unsigned long watermark; /* Latest cg2 reset watermark */ unsigned long local_watermark; - unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. */ CACHELINE_PADDING(_pad2_); bool protection_support; + bool track_failcnt; unsigned long min; unsigned long low; unsigned long high; @@ -58,6 +60,7 @@ static inline void page_counter_init(struct page_counter *counter, counter->max = PAGE_COUNTER_MAX; counter->parent = parent; counter->protection_support = protection_support; + counter->track_failcnt = false; } static inline unsigned long page_counter_read(struct page_counter *counter) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index e4b48a0dda24..76c817162d2f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -3,6 +3,7 @@ #define __LINUX_PAGE_EXT_H #include <linux/types.h> +#include <linux/mmzone.h> #include <linux/stacktrace.h> struct pglist_data; @@ -69,16 +70,31 @@ extern void page_ext_init(void); static inline void page_ext_init_flatmem_late(void) { } + +static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) +{ + /* + * page_ext is allocated per memory section. Once we cross a + * memory section, we have to fetch the new pointer. + */ + return next_pfn % PAGES_PER_SECTION; +} #else extern void page_ext_init_flatmem(void); extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } + +static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) +{ + return true; +} #endif extern struct page_ext *page_ext_get(const struct page *page); extern void page_ext_put(struct page_ext *page_ext); +extern struct page_ext *page_ext_lookup(unsigned long pfn); static inline void *page_ext_data(struct page_ext *page_ext, struct page_ext_operations *ops) @@ -93,6 +109,83 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) return next; } +struct page_ext_iter { + unsigned long index; + unsigned long start_pfn; + struct page_ext *page_ext; +}; + +/** + * page_ext_iter_begin() - Prepare for iterating through page extensions. + * @iter: page extension iterator. + * @pfn: PFN of the page we're interested in. + * + * Must be called with RCU read lock taken. + * + * Return: NULL if no page_ext exists for this page. + */ +static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, + unsigned long pfn) +{ + iter->index = 0; + iter->start_pfn = pfn; + iter->page_ext = page_ext_lookup(pfn); + + return iter->page_ext; +} + +/** + * page_ext_iter_next() - Get next page extension + * @iter: page extension iterator. + * + * Must be called with RCU read lock taken. + * + * Return: NULL if no next page_ext exists. + */ +static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter) +{ + unsigned long pfn; + + if (WARN_ON_ONCE(!iter->page_ext)) + return NULL; + + iter->index++; + pfn = iter->start_pfn + iter->index; + + if (page_ext_iter_next_fast_possible(pfn)) + iter->page_ext = page_ext_next(iter->page_ext); + else + iter->page_ext = page_ext_lookup(pfn); + + return iter->page_ext; +} + +/** + * page_ext_iter_get() - Get current page extension + * @iter: page extension iterator. + * + * Return: NULL if no page_ext exists for this iterator. + */ +static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter) +{ + return iter->page_ext; +} + +/** + * for_each_page_ext(): iterate through page_ext objects. + * @__page: the page we're interested in + * @__pgcount: how many pages to iterate through + * @__page_ext: struct page_ext pointer where the current page_ext + * object is returned + * @__iter: struct page_ext_iter object (defined in the stack) + * + * IMPORTANT: must be called with RCU read lock taken. + */ +#define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \ + for (__page_ext = page_ext_iter_begin(&__iter, page_to_pfn(__page));\ + __page_ext && __iter.index < __pgcount; \ + __page_ext = page_ext_iter_next(&__iter)) + #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7661be85136c..26baa78f1ca7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -536,26 +536,6 @@ struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); /** - * folio_file_mapping - Find the mapping this folio belongs to. - * @folio: The folio. - * - * For folios which are in the page cache, return the mapping that this - * page belongs to. Folios in the swap cache return the mapping of the - * swap file or swap device where the data is stored. This is different - * from the mapping returned by folio_mapping(). The only reason to - * use it is if, like NFS, you return 0 from ->activate_swapfile. - * - * Do not call this for folios which aren't in the page cache or swap cache. - */ -static inline struct address_space *folio_file_mapping(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return swapcache_mapping(folio); - - return folio->mapping; -} - -/** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. * @@ -575,11 +555,6 @@ static inline struct address_space *folio_flush_mapping(struct folio *folio) return folio_mapping(folio); } -static inline struct address_space *page_file_mapping(struct page *page) -{ - return folio_file_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index ac8c44dd8237..c5e9cac0575e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -33,7 +33,7 @@ struct disk_stats { #define part_stat_read(part, field) \ ({ \ - typeof((part)->bd_stats->field) res = 0; \ + TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 0fcacb909778..0aeb0e276a3e 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -222,7 +222,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ - (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) + (TYPEOF_UNQUAL(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP @@ -318,7 +318,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return(stem, variable) \ ({ \ - typeof(variable) pscr_ret__; \ + TYPEOF_UNQUAL(variable) pscr_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr_ret__ = stem##1(variable); break; \ @@ -333,7 +333,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return2(stem, variable, ...) \ ({ \ - typeof(variable) pscr2_ret__; \ + TYPEOF_UNQUAL(variable) pscr2_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 3469c4b20105..c74077977830 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -162,74 +162,32 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code } } -static inline void clear_page_tag_ref(struct page *page) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - set_codetag_empty(&ref); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} +/* Should be called only if mem_alloc_profiling_enabled() */ +void __clear_page_tag_ref(struct page *page); -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) +static inline void clear_page_tag_ref(struct page *page) { - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub(&ref, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } + if (mem_alloc_profiling_enabled()) + __clear_page_tag_ref(page); } -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) +/* Should be called only if mem_alloc_profiling_enabled() */ +static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; - - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub_check(&ref); - if (ref.ct) - tag = ct_to_alloc_tag(ref.ct); - put_page_tag_ref(handle); - } + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub_check(&ref); + if (ref.ct) + tag = ct_to_alloc_tag(ref.ct); + put_page_tag_ref(handle); } return tag; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) -{ - if (mem_alloc_profiling_enabled() && tag) - this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); -} - void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); void pgalloc_tag_swap(struct folio *new, struct folio *old); @@ -238,11 +196,6 @@ void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) {} -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 4c107e17c547..e2b705c14945 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -222,10 +222,14 @@ static inline int pmd_dirty(pmd_t pmd) * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be - * up to date. This mode can only be entered and left under the protection of - * the page table locks for all page tables which may be modified. In the UP - * case, this is required so that preemption is disabled, and in the SMP case, - * it must synchronize the delayed page table writes properly on other CPUs. + * up to date. + * + * In the general case, no lock is guaranteed to be held between entry and exit + * of the lazy mode. So the implementation must assume preemption may be enabled + * and cpu migration is possible; it must take steps to be robust against this. + * (In practice, for user PTE updates, the appropriate page table lock(s) are + * held, but for kernel PTE updates, no lock is held). Nesting is not permitted + * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) @@ -287,7 +291,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, { page_table_check_ptes_set(mm, ptep, pte, nr); - arch_enter_lazy_mmu_mode(); for (;;) { set_pte(ptep, pte); if (--nr == 0) @@ -295,7 +298,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, ptep++; pte = pte_next_pfn(pte); } - arch_leave_lazy_mmu_mode(); } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 27343424225c..9ad134a04b41 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -4,18 +4,7 @@ #include <linux/rcupdate.h> #include <linux/sched/signal.h> - -/* - * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner. - * - * The only time @task is non-nil is when a user is blocked (or - * checking if it needs to) on a condition, and reset as soon as we - * know that the condition has succeeded and are awoken. - */ -struct rcuwait { - struct task_struct __rcu *task; -}; +#include <linux/types.h> #define __RCUWAIT_INITIALIZER(name) \ { .task = NULL, } diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 35f039ecb272..80dc023ac2bf 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -87,6 +87,15 @@ * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide + * acquire and release ordering for cases when the memory occupied by the + * object might be reused to store another object. This is important for the + * cases where secondary validation is required to detect such reuse, e.g. + * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after + * the refcount is taken, hence acquire order is necessary. Similarly, when the + * object is initialized, all stores to its attributes should be visible before + * the refcount is set, otherwise a stale attribute value might be used by + * another task which succeeds in taking a refcount to the new object. */ #ifndef _LINUX_REFCOUNT_H @@ -126,6 +135,31 @@ static inline void refcount_set(refcount_t *r, int n) } /** + * refcount_set_release - set a refcount's value with release ordering + * @r: the refcount + * @n: value to which the refcount will be set + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides release memory ordering which will order previous memory operations + * against this store. This ensures all updates to this object are visible + * once the refcount is set and stale values from the object previously + * occupying this memory are overwritten with new ones. + * + * This function should be called only after new object is fully initialized. + * After this call the object should be considered visible to other tasks even + * if it was not yet added into an object collection normally used to discover + * it. This is because other tasks might have discovered the object previously + * occupying the same memory and after memory reuse they can succeed in taking + * refcount to the new object and start using it. + */ +static inline void refcount_set_release(refcount_t *r, int n) +{ + atomic_set_release(&r->refs, n); +} + +/** * refcount_read - get a refcount's value * @r: the refcount * @@ -178,6 +212,71 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) return __refcount_add_not_zero(i, r, NULL); } +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, + int limit) +{ + int old = refcount_read(r); + + do { + if (!old) + break; + + if (i > limit - old) { + if (oldp) + *oldp = old; + return false; + } + } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); + + if (oldp) + *oldp = old; + + if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); + + return old; +} + +static inline __must_check bool +__refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) +{ + return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); +} + +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); +} + +/** + * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 + * + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc_not_zero_acquire() should instead be used to increment a + * reference count. + * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r) +{ + return __refcount_add_not_zero_acquire(i, r, NULL); +} + static inline __signed_wrap void __refcount_add(int i, refcount_t *r, int *oldp) { @@ -236,6 +335,32 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) return __refcount_inc_not_zero(r, NULL); } +static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_acquire(1, r, oldp); +} + +/** + * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0 + * @r: the refcount to increment + * + * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on + * success. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ +static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r) +{ + return __refcount_inc_not_zero_acquire(r, NULL); +} + static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 683a04088f3f..6b82b618846e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -13,6 +13,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> +#include <linux/bit_spinlock.h> /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -173,6 +174,214 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *folio_get_anon_vma(const struct folio *folio); +#ifdef CONFIG_MM_ID +static __always_inline void folio_lock_large_mapcount(struct folio *folio) +{ + bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); +} + +static __always_inline void folio_unlock_large_mapcount(struct folio *folio) +{ + __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); +} + +static inline unsigned int folio_mm_id(const struct folio *folio, int idx) +{ + VM_WARN_ON_ONCE(idx != 0 && idx != 1); + return folio->_mm_id[idx] & MM_ID_MASK; +} + +static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id) +{ + VM_WARN_ON_ONCE(idx != 0 && idx != 1); + folio->_mm_id[idx] &= ~MM_ID_MASK; + folio->_mm_id[idx] |= id; +} + +static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio, + int diff, mm_id_t mm_id) +{ + VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio)); + VM_WARN_ON_ONCE(diff <= 0); + VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX); + + /* + * Make sure we can detect at least one complete PTE mapping of the + * folio in a single MM as "exclusively mapped". This is primarily + * a check on 32bit, where we currently reduce the size of the per-MM + * mapcount to a short. + */ + VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio)); + VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX); + + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY && + folio->_mm_id_mapcount[0] != -1); + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY && + folio->_mm_id_mapcount[0] < 0); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY && + folio->_mm_id_mapcount[1] != -1); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY && + folio->_mm_id_mapcount[1] < 0); + VM_WARN_ON_ONCE(!folio_mapped(folio) && + folio_test_large_maybe_mapped_shared(folio)); +} + +static __always_inline void folio_set_large_mapcount(struct folio *folio, + int mapcount, struct vm_area_struct *vma) +{ + __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id); + + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY); + + /* Note: mapcounts start at -1. */ + atomic_set(&folio->_large_mapcount, mapcount - 1); + folio->_mm_id_mapcount[0] = mapcount - 1; + folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); +} + +static __always_inline int folio_add_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + const mm_id_t mm_id = vma->vm_mm->mm_id; + int new_mapcount_val; + + folio_lock_large_mapcount(folio); + __folio_large_mapcount_sanity_checks(folio, diff, mm_id); + + new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff; + atomic_set(&folio->_large_mapcount, new_mapcount_val); + + /* + * If a folio is mapped more than once into an MM on 32bit, we + * can in theory overflow the per-MM mapcount (although only for + * fairly large folios), turning it negative. In that case, just + * free up the slot and mark the folio "mapped shared", otherwise + * we might be in trouble when unmapping pages later. + */ + if (folio_mm_id(folio, 0) == mm_id) { + folio->_mm_id_mapcount[0] += diff; + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) { + folio->_mm_id_mapcount[0] = -1; + folio_set_mm_id(folio, 0, MM_ID_DUMMY); + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + } else if (folio_mm_id(folio, 1) == mm_id) { + folio->_mm_id_mapcount[1] += diff; + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) { + folio->_mm_id_mapcount[1] = -1; + folio_set_mm_id(folio, 1, MM_ID_DUMMY); + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) { + folio_set_mm_id(folio, 0, mm_id); + folio->_mm_id_mapcount[0] = diff - 1; + /* We might have other mappings already. */ + if (new_mapcount_val != diff - 1) + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) { + folio_set_mm_id(folio, 1, mm_id); + folio->_mm_id_mapcount[1] = diff - 1; + /* Slot 0 certainly has mappings as well. */ + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + folio_unlock_large_mapcount(folio); + return new_mapcount_val + 1; +} +#define folio_add_large_mapcount folio_add_return_large_mapcount + +static __always_inline int folio_sub_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + const mm_id_t mm_id = vma->vm_mm->mm_id; + int new_mapcount_val; + + folio_lock_large_mapcount(folio); + __folio_large_mapcount_sanity_checks(folio, diff, mm_id); + + new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff; + atomic_set(&folio->_large_mapcount, new_mapcount_val); + + /* + * There are valid corner cases where we might underflow a per-MM + * mapcount (some mappings added when no slot was free, some mappings + * added once a slot was free), so we always set it to -1 once we go + * negative. + */ + if (folio_mm_id(folio, 0) == mm_id) { + folio->_mm_id_mapcount[0] -= diff; + if (folio->_mm_id_mapcount[0] >= 0) + goto out; + folio->_mm_id_mapcount[0] = -1; + folio_set_mm_id(folio, 0, MM_ID_DUMMY); + } else if (folio_mm_id(folio, 1) == mm_id) { + folio->_mm_id_mapcount[1] -= diff; + if (folio->_mm_id_mapcount[1] >= 0) + goto out; + folio->_mm_id_mapcount[1] = -1; + folio_set_mm_id(folio, 1, MM_ID_DUMMY); + } + + /* + * If one MM slot owns all mappings, the folio is mapped exclusively. + * Note that if the folio is now unmapped (new_mapcount_val == -1), both + * slots must be free (mapcount == -1), and we'll also mark it as + * exclusive. + */ + if (folio->_mm_id_mapcount[0] == new_mapcount_val || + folio->_mm_id_mapcount[1] == new_mapcount_val) + folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; +out: + folio_unlock_large_mapcount(folio); + return new_mapcount_val + 1; +} +#define folio_sub_large_mapcount folio_sub_return_large_mapcount +#else /* !CONFIG_MM_ID */ +/* + * See __folio_rmap_sanity_checks(), we might map large folios even without + * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now. + */ +static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, + struct vm_area_struct *vma) +{ + /* Note: mapcounts start at -1. */ + atomic_set(&folio->_large_mapcount, mapcount - 1); +} + +static inline void folio_add_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + atomic_add(diff, &folio->_large_mapcount); +} + +static inline int folio_add_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + BUILD_BUG(); +} + +static inline void folio_sub_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + atomic_sub(diff, &folio->_large_mapcount); +} + +static inline int folio_sub_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + BUILD_BUG(); +} +#endif /* CONFIG_MM_ID */ + +#define folio_inc_large_mapcount(folio, vma) \ + folio_add_large_mapcount(folio, 1, vma) +#define folio_inc_return_large_mapcount(folio, vma) \ + folio_add_return_large_mapcount(folio, 1, vma) +#define folio_dec_large_mapcount(folio, vma) \ + folio_sub_large_mapcount(folio, 1, vma) +#define folio_dec_return_large_mapcount(folio, vma) \ + folio_sub_return_large_mapcount(folio, 1, vma) + /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; @@ -192,6 +401,7 @@ typedef int __bitwise rmap_t; enum rmap_level { RMAP_LEVEL_PTE = 0, RMAP_LEVEL_PMD, + RMAP_LEVEL_PUD, }; static inline void __folio_rmap_sanity_checks(const struct folio *folio, @@ -228,6 +438,14 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; + case RMAP_LEVEL_PUD: + /* + * Assume that we are creating a single "entire" mapping of the + * folio. + */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); + break; default: VM_WARN_ON_ONCE(true); } @@ -251,12 +469,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_add_file_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ folio_remove_rmap_ptes(folio, page, 1, vma) void folio_remove_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_remove_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); @@ -322,7 +544,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio) } static __always_inline void __folio_dup_file_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + enum rmap_level level) { const int orig_nr_pages = nr_pages; @@ -335,14 +558,17 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, break; } - do { - atomic_inc(&page->_mapcount); - } while (page++, --nr_pages > 0); - atomic_add(orig_nr_pages, &folio->_large_mapcount); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { + do { + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + } + folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, dst_vma); break; } } @@ -352,45 +578,47 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated + * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_ptes(struct folio *folio, - struct page *page, int nr_pages) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, - struct page *page) + struct page *page, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE); } /** * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of + * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_pmd(struct folio *folio, - struct page *page) + struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif } static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, - struct page *page, int nr_pages, struct vm_area_struct *src_vma, - enum rmap_level level) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, enum rmap_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; @@ -432,18 +660,20 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, do { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); - atomic_inc(&page->_mapcount); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); - atomic_add(orig_nr_pages, &folio->_large_mapcount); + folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; ClearPageAnonExclusive(page); } atomic_inc(&folio->_entire_mapcount); - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, dst_vma); break; } return 0; @@ -455,6 +685,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated + * @dst_vma: The destination vm area * @src_vma: The vm area from which the mappings are duplicated * * The page range of the folio is defined by [page, page + nr_pages) @@ -473,16 +704,18 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. */ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, - struct page *page, int nr_pages, struct vm_area_struct *src_vma) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { - return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma, - RMAP_LEVEL_PTE); + return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, + src_vma, RMAP_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, - struct page *page, struct vm_area_struct *src_vma) + struct page *page, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { - return __folio_try_dup_anon_rmap(folio, page, 1, src_vma, + return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, RMAP_LEVEL_PTE); } @@ -491,6 +724,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, * of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of + * @dst_vma: The destination vm area * @src_vma: The vm area from which the mapping is duplicated * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) @@ -509,11 +743,12 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. */ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, - struct page *page, struct vm_area_struct *src_vma) + struct page *page, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma, - RMAP_LEVEL_PMD); + return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, + src_vma, RMAP_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; @@ -663,9 +898,8 @@ int folio_referenced(struct folio *, int is_locked, void try_to_migrate(struct folio *folio, enum ttu_flags flags); void try_to_unmap(struct folio *, enum ttu_flags flags); -int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct page **pages, - void *arg); +struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, + void *owner, struct folio **foliop); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) @@ -739,6 +973,9 @@ unsigned long page_address_in_vma(const struct folio *folio, */ int folio_mkclean(struct folio *); +int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, + unsigned long pfn, unsigned long nr_pages); + int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); diff --git a/include/linux/slab.h b/include/linux/slab.h index 98e07e9e9e58..d5a8ab98035c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -137,6 +137,15 @@ enum _slab_flag_bits { * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * + * Note that object identity check has to be done *after* acquiring a + * reference, therefore user has to ensure proper ordering for loads. + * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU, + * the newly allocated object has to be fully initialized *before* its + * refcount gets initialized and proper ordering for stores is required. + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are + * designed with the proper fences required for reference counting objects + * allocated with SLAB_TYPESAFE_BY_RCU. + * * Note that it is not possible to acquire a lock within a structure * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference * as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages @@ -236,12 +245,6 @@ enum _slab_flag_bits { #endif /* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - -/* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault. diff --git a/include/linux/swap.h b/include/linux/swap.h index a98c757400fe..db46b25a65ae 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -24,7 +24,6 @@ struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff -#define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ @@ -74,14 +73,13 @@ static inline int current_is_kswapd(void) * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page - * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. + * table entries to a special SWP_DEVICE_EXCLUSIVE entry. */ #ifdef CONFIG_DEVICE_PRIVATE -#define SWP_DEVICE_NUM 4 +#define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) -#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) -#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) +#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif @@ -286,12 +284,10 @@ enum swap_cluster_flags { #endif /* - * We assign a cluster to each CPU, so each CPU can allocate swap entry from - * its own cluster and swapout sequentially. The purpose is to optimize swapout - * throughput. + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. */ -struct percpu_cluster { - local_lock_t lock; /* Protect the percpu_cluster above */ +struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -317,8 +313,7 @@ struct swap_info_struct { atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ - struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ @@ -461,7 +456,6 @@ void free_pages_and_swap_cache(struct encoded_page **, int); extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; -extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) @@ -475,24 +469,22 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -swp_entry_t folio_alloc_swap(struct folio *folio); +int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); +extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; @@ -575,9 +567,9 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { - return 0; + return false; } static inline int swp_swapcount(swp_entry_t entry) @@ -585,11 +577,9 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline swp_entry_t folio_alloc_swap(struct folio *folio) +static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) { - swp_entry_t entry; - entry.val = 0; - return entry; + return -EINVAL; } static inline bool folio_free_swap(struct folio *folio) @@ -650,7 +640,6 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) @@ -671,10 +660,6 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else -static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) -{ -} - static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h deleted file mode 100644 index 840aec3523b2..000000000000 --- a/include/linux/swap_slots.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SWAP_SLOTS_H -#define _LINUX_SWAP_SLOTS_H - -#include <linux/swap.h> -#include <linux/spinlock.h> -#include <linux/mutex.h> - -#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH -#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE) -#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE) - -struct swap_slots_cache { - bool lock_initialized; - struct mutex alloc_lock; /* protects slots, nr, cur */ - swp_entry_t *slots; - int nr; - int cur; - int n_ret; -}; - -void disable_swap_slots_cache_lock(void); -void reenable_swap_slots_cache_unlock(void); -void enable_swap_slots_cache(void); - -extern bool swap_slot_cache_enabled; - -#endif /* _LINUX_SWAP_SLOTS_H */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 96f26e29fefe..64ea151a7ae3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -186,26 +186,16 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } -static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) +static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { - return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset); -} - -static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) -{ - return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset); + return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ || - swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE; + return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; } -static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE); -} #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -227,12 +217,7 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) return false; } -static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) -{ - return swp_entry(0, 0); -} - -static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) +static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } @@ -242,10 +227,6 @@ static inline bool is_device_exclusive_entry(swp_entry_t entry) return false; } -static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION diff --git a/include/linux/types.h b/include/linux/types.h index 1c509ce8f7f6..a3d2182c2686 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -248,5 +248,17 @@ typedef void (*swap_func_t)(void *a, void *b, int size); typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); typedef int (*cmp_func_t)(const void *a, const void *b); +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner. + * + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. + */ +struct rcuwait { + struct task_struct __rcu *task; +}; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 5a37cb2b6f93..9e15a088ba38 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,9 +41,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, PGSCAN_DIRECT_THROTTLE, PGSCAN_ANON, PGSCAN_FILE, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 4751e3ecc467..b2ccb6845595 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -504,7 +504,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index caf4f0b12235..eda4b62511f7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -313,6 +313,30 @@ static inline void cgroup_writeback_umount(struct super_block *sb) /* * mm/page-writeback.c */ +/* consolidated parameters for balance_dirty_pages() and its subroutines */ +struct dirty_throttle_control { +#ifdef CONFIG_CGROUP_WRITEBACK + struct wb_domain *dom; + struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ +#endif + struct bdi_writeback *wb; + struct fprop_local_percpu *wb_completions; + + unsigned long avail; /* dirtyable */ + unsigned long dirty; /* file_dirty + write + nfs */ + unsigned long thresh; /* dirty threshold */ + unsigned long bg_thresh; /* dirty background threshold */ + unsigned long limit; /* hard dirty limit */ + + unsigned long wb_dirty; /* per-wb counterparts */ + unsigned long wb_thresh; + unsigned long wb_bg_thresh; + + unsigned long pos_ratio; + bool freerun; + bool dirty_exceeded; +}; + void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_timer_fn(struct timer_list *t); diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 0b618ec04115..78eede109b1a 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1555,6 +1555,8 @@ int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); +void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); +unsigned int xas_try_split_min_order(unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { @@ -1576,6 +1578,17 @@ static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } + +static inline void xas_try_split(struct xa_state *xas, void *entry, + unsigned int order) +{ +} + +static inline unsigned int xas_try_split_min_order(unsigned int order) +{ + return 0; +} + #endif /** diff --git a/include/linux/zpool.h b/include/linux/zpool.h index a67d62b79698..52f30e526607 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -4,9 +4,8 @@ * * Copyright (C) 2014 Dan Streetman * - * This is a common frontend for the zbud and zsmalloc memory - * storage pool implementations. Typically, this is used to - * store compressed memory. + * This is a common frontend for the zswap compressed memory storage + * implementations. */ #ifndef _ZPOOL_H_ @@ -14,25 +13,6 @@ struct zpool; -/* - * Control how a handle is mapped. It will be ignored if the - * implementation does not support it. Its use is optional. - * Note that this does not refer to memory protection, it - * refers to how the memory will be copied in/out if copying - * is necessary during mapping; read-write is the safest as - * it copies the existing memory in on map, and copies the - * changed memory back out on unmap. Write-only does not copy - * in the memory and should only be used for initialization. - * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. - */ -enum zpool_mapmode { - ZPOOL_MM_RW, /* normal read-write mapping */ - ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ - ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ - - ZPOOL_MM_DEFAULT = ZPOOL_MM_RW -}; - bool zpool_has_pool(char *type); struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); @@ -41,17 +21,19 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); -bool zpool_malloc_support_movable(struct zpool *pool); - int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, unsigned long *handle); void zpool_free(struct zpool *pool, unsigned long handle); -void *zpool_map_handle(struct zpool *pool, unsigned long handle, - enum zpool_mapmode mm); +void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, + void *local_copy); + +void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, + void *handle_mem); -void zpool_unmap_handle(struct zpool *pool, unsigned long handle); +void zpool_obj_write(struct zpool *zpool, unsigned long handle, + void *handle_mem, size_t mem_len); u64 zpool_get_total_pages(struct zpool *pool); @@ -81,15 +63,16 @@ struct zpool_driver { void *(*create)(const char *name, gfp_t gfp); void (*destroy)(void *pool); - bool malloc_support_movable; int (*malloc)(void *pool, size_t size, gfp_t gfp, unsigned long *handle); void (*free)(void *pool, unsigned long handle); - bool sleep_mapped; - void *(*map)(void *pool, unsigned long handle, - enum zpool_mapmode mm); - void (*unmap)(void *pool, unsigned long handle); + void *(*obj_read_begin)(void *pool, unsigned long handle, + void *local_copy); + void (*obj_read_end)(void *pool, unsigned long handle, + void *handle_mem); + void (*obj_write)(void *pool, unsigned long handle, + void *handle_mem, size_t mem_len); u64 (*total_pages)(void *pool); }; diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index a48cd0ffe57d..c26baf9fb331 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -16,23 +16,6 @@ #include <linux/types.h> -/* - * zsmalloc mapping modes - * - * NOTE: These only make a difference when a mapped object spans pages. - */ -enum zs_mapmode { - ZS_MM_RW, /* normal read-write mapping */ - ZS_MM_RO, /* read-only (no copy-out at unmap time) */ - ZS_MM_WO /* write-only (no copy-in at map time) */ - /* - * NOTE: ZS_MM_WO should only be used for initializing new - * (uninitialized) allocations. Partial writes to already - * initialized allocations should use ZS_MM_RW to preserve the - * existing data. - */ -}; - struct zs_pool_stats { /* How many pages were migrated (freed) */ atomic_long_t pages_compacted; @@ -48,14 +31,18 @@ void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool); -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm); -void zs_unmap_object(struct zs_pool *pool, unsigned long handle); - unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); + +void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, + void *local_copy); +void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, + void *handle_mem); +void zs_obj_write(struct zs_pool *pool, unsigned long handle, + void *handle_mem, size_t mem_len); + #endif diff --git a/include/linux/zswap.h b/include/linux/zswap.h index d961ead91bf1..30c193a1207e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -26,7 +26,7 @@ struct zswap_lruvec_state { unsigned long zswap_total_pages(void); bool zswap_store(struct folio *folio); -bool zswap_load(struct folio *folio); +int zswap_load(struct folio *folio); void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); @@ -44,9 +44,9 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct folio *folio) +static inline int zswap_load(struct folio *folio) { - return false; + return -ENOENT; } static inline void zswap_invalidate(swp_entry_t swp) {} diff --git a/include/net/snmp.h b/include/net/snmp.h index 468a67836e2f..4cb4326dfebe 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -159,7 +159,7 @@ struct linux_tls_mib { #define __SNMP_ADD_STATS64(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -176,8 +176,7 @@ struct linux_tls_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = raw_cpu_ptr((mib)); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index b37eb0a7060f..f74925a6cf69 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -342,6 +342,84 @@ TRACE_EVENT(mm_alloc_contig_migrate_range_info, __entry->nr_mapped) ); +TRACE_EVENT(mm_setup_per_zone_wmarks, + + TP_PROTO(struct zone *zone), + + TP_ARGS(zone), + + TP_STRUCT__entry( + __field(int, node_id) + __string(name, zone->name) + __field(unsigned long, watermark_min) + __field(unsigned long, watermark_low) + __field(unsigned long, watermark_high) + __field(unsigned long, watermark_promo) + ), + + TP_fast_assign( + __entry->node_id = zone->zone_pgdat->node_id; + __assign_str(name); + __entry->watermark_min = zone->_watermark[WMARK_MIN]; + __entry->watermark_low = zone->_watermark[WMARK_LOW]; + __entry->watermark_high = zone->_watermark[WMARK_HIGH]; + __entry->watermark_promo = zone->_watermark[WMARK_PROMO]; + ), + + TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu", + __entry->node_id, + __get_str(name), + __entry->watermark_min, + __entry->watermark_low, + __entry->watermark_high, + __entry->watermark_promo) +); + +TRACE_EVENT(mm_setup_per_zone_lowmem_reserve, + + TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve), + + TP_ARGS(zone, upper_zone, lowmem_reserve), + + TP_STRUCT__entry( + __field(int, node_id) + __string(name, zone->name) + __string(upper_name, upper_zone->name) + __field(long, lowmem_reserve) + ), + + TP_fast_assign( + __entry->node_id = zone->zone_pgdat->node_id; + __assign_str(name); + __assign_str(upper_name); + __entry->lowmem_reserve = lowmem_reserve; + ), + + TP_printk("node_id=%d zone name=%s upper_zone name=%s lowmem_reserve_pages=%ld", + __entry->node_id, + __get_str(name), + __get_str(upper_name), + __entry->lowmem_reserve) +); + +TRACE_EVENT(mm_calculate_totalreserve_pages, + + TP_PROTO(unsigned long totalreserve_pages), + + TP_ARGS(totalreserve_pages), + + TP_STRUCT__entry( + __field(unsigned long, totalreserve_pages) + ), + + TP_fast_assign( + __entry->totalreserve_pages = totalreserve_pages; + ), + + TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages) +); + + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a261e86e61fa..0ff388131fc9 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -629,11 +629,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, + struct dirty_throttle_control *dtc, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, @@ -641,7 +637,7 @@ TRACE_EVENT(balance_dirty_pages, long pause, unsigned long start_time), - TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, + TP_ARGS(wb, dtc, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), @@ -650,8 +646,8 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) - __field(unsigned long, bdi_setpoint) - __field(unsigned long, bdi_dirty) + __field(unsigned long, wb_setpoint) + __field(unsigned long, wb_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) @@ -664,16 +660,15 @@ TRACE_EVENT(balance_dirty_pages, ), TP_fast_assign( - unsigned long freerun = (thresh + bg_thresh) / 2; + unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); - __entry->limit = global_wb_domain.dirty_limit; - __entry->setpoint = (global_wb_domain.dirty_limit + - freerun) / 2; - __entry->dirty = dirty; - __entry->bdi_setpoint = __entry->setpoint * - bdi_thresh / (thresh + 1); - __entry->bdi_dirty = bdi_dirty; + __entry->limit = dtc->limit; + __entry->setpoint = (dtc->limit + freerun) / 2; + __entry->dirty = dtc->dirty; + __entry->wb_setpoint = __entry->setpoint * + dtc->wb_thresh / (dtc->thresh + 1); + __entry->wb_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; @@ -689,7 +684,7 @@ TRACE_EVENT(balance_dirty_pages, TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " - "bdi_setpoint=%lu bdi_dirty=%lu " + "wb_setpoint=%lu wb_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", @@ -697,8 +692,8 @@ TRACE_EVENT(balance_dirty_pages, __entry->limit, __entry->setpoint, __entry->dirty, - __entry->bdi_setpoint, - __entry->bdi_dirty, + __entry->wb_setpoint, + __entry->wb_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, |