Diffstat (limited to 'include')
-rw-r--r--  include/asm-generic/io.h              |   4
-rw-r--r--  include/asm-generic/memory_model.h    |   5
-rw-r--r--  include/asm-generic/percpu.h          |  39
-rw-r--r--  include/asm-generic/tlb.h             |  31
-rw-r--r--  include/linux/bit_spinlock.h          |   8
-rw-r--r--  include/linux/bootmem_info.h          |   7
-rw-r--r--  include/linux/buffer_head.h           |   2
-rw-r--r--  include/linux/cma.h                   |   9
-rw-r--r--  include/linux/compaction.h            |   5
-rw-r--r--  include/linux/compiler-clang.h        |   8
-rw-r--r--  include/linux/compiler-gcc.h          |   8
-rw-r--r--  include/linux/compiler.h              |  20
-rw-r--r--  include/linux/compiler_types.h        |   2
-rw-r--r--  include/linux/cpuhotplug.h            |   1
-rw-r--r--  include/linux/damon.h                 | 118
-rw-r--r--  include/linux/dax.h                   |  28
-rw-r--r--  include/linux/fb.h                    |   1
-rw-r--r--  include/linux/folio_queue.h           |  12
-rw-r--r--  include/linux/huge_mm.h               |  44
-rw-r--r--  include/linux/hugetlb.h               |  35
-rw-r--r--  include/linux/memblock.h              |   1
-rw-r--r--  include/linux/memcontrol.h            |  32
-rw-r--r--  include/linux/memory.h                |   2
-rw-r--r--  include/linux/memremap.h              |  17
-rw-r--r--  include/linux/migrate.h               |   4
-rw-r--r--  include/linux/mm.h                    | 331
-rw-r--r--  include/linux/mm_types.h              | 203
-rw-r--r--  include/linux/mmap_lock.h             |   6
-rw-r--r--  include/linux/mmu_notifier.h          |   8
-rw-r--r--  include/linux/mmzone.h                |  55
-rw-r--r--  include/linux/page-flags.h            |  16
-rw-r--r--  include/linux/page_counter.h          |   9
-rw-r--r--  include/linux/page_ext.h              |  93
-rw-r--r--  include/linux/pagemap.h               |  25
-rw-r--r--  include/linux/part_stat.h             |   2
-rw-r--r--  include/linux/percpu-defs.h           |   6
-rw-r--r--  include/linux/pgalloc_tag.h           |  77
-rw-r--r--  include/linux/pgtable.h               |  14
-rw-r--r--  include/linux/rcuwait.h               |  13
-rw-r--r--  include/linux/refcount.h              | 125
-rw-r--r--  include/linux/rmap.h                  | 293
-rw-r--r--  include/linux/slab.h                  |  15
-rw-r--r--  include/linux/swap.h                  |  41
-rw-r--r--  include/linux/swap_slots.h            |  28
-rw-r--r--  include/linux/swapops.h               |  27
-rw-r--r--  include/linux/types.h                 |  12
-rw-r--r--  include/linux/vm_event_item.h         |   2
-rw-r--r--  include/linux/vmstat.h                |   2
-rw-r--r--  include/linux/writeback.h             |  24
-rw-r--r--  include/linux/xarray.h                |  13
-rw-r--r--  include/linux/zpool.h                 |  47
-rw-r--r--  include/linux/zsmalloc.h              |  29
-rw-r--r--  include/linux/zswap.h                 |   6
-rw-r--r--  include/net/snmp.h                    |   5
-rw-r--r--  include/trace/events/kmem.h           |  78
-rw-r--r--  include/trace/events/writeback.h      |  33
56 files changed, 1483 insertions, 598 deletions
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 3c61c29ff6ab..11abad6c87e1 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -1111,7 +1111,7 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size,
pgprot_t prot);
void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
- unsigned long prot);
+ pgprot_t prot);
void iounmap(volatile void __iomem *addr);
void generic_iounmap(volatile void __iomem *addr);
@@ -1120,7 +1120,7 @@ void generic_iounmap(volatile void __iomem *addr);
static inline void __iomem *ioremap(phys_addr_t addr, size_t size)
{
/* _PAGE_IOREMAP needs to be supplied by the architecture */
- return ioremap_prot(addr, size, _PAGE_IOREMAP);
+ return ioremap_prot(addr, size, __pgprot(_PAGE_IOREMAP));
}
#endif
#endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 6d1fb6162ac1..a3b5029aebbd 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -19,11 +19,12 @@
#define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \
ARCH_PFN_OFFSET)
+/* avoid <linux/mm.h> include hell */
+extern unsigned long max_mapnr;
+
#ifndef pfn_valid
static inline int pfn_valid(unsigned long pfn)
{
- /* avoid <linux/mm.h> include hell */
- extern unsigned long max_mapnr;
unsigned long pfn_offset = ARCH_PFN_OFFSET;
return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr;
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 94cbd50cc870..02aeca21479a 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -6,6 +6,19 @@
#include <linux/threads.h>
#include <linux/percpu-defs.h>
+/*
+ * __percpu_qual is the qualifier for the percpu named address space.
+ *
+ * Most arches use the generic named address space for percpu variables,
+ * but some arches define percpu variables in a different named address
+ * space (on the x86 arch, percpu variables may be declared as being
+ * relative to the %fs or %gs segments using the __seg_fs or __seg_gs
+ * named address space qualifier).
+ */
+#ifndef __percpu_qual
+# define __percpu_qual
+#endif
+
#ifdef CONFIG_SMP
/*
@@ -74,7 +87,7 @@ do { \
#define raw_cpu_generic_add_return(pcp, val) \
({ \
- typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \
+ TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \
\
*__p += val; \
*__p; \
@@ -82,8 +95,8 @@ do { \
#define raw_cpu_generic_xchg(pcp, nval) \
({ \
- typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \
- typeof(pcp) __ret; \
+ TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \
+ TYPEOF_UNQUAL(pcp) __ret; \
__ret = *__p; \
*__p = nval; \
__ret; \
@@ -91,7 +104,7 @@ do { \
#define __cpu_fallback_try_cmpxchg(pcp, ovalp, nval, _cmpxchg) \
({ \
- typeof(pcp) __val, __old = *(ovalp); \
+ TYPEOF_UNQUAL(pcp) __val, __old = *(ovalp); \
__val = _cmpxchg(pcp, __old, nval); \
if (__val != __old) \
*(ovalp) = __val; \
@@ -100,8 +113,8 @@ do { \
#define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \
({ \
- typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \
- typeof(pcp) __val = *__p, ___old = *(ovalp); \
+ TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \
+ TYPEOF_UNQUAL(pcp) __val = *__p, ___old = *(ovalp); \
bool __ret; \
if (__val == ___old) { \
*__p = nval; \
@@ -115,14 +128,14 @@ do { \
#define raw_cpu_generic_cmpxchg(pcp, oval, nval) \
({ \
- typeof(pcp) __old = (oval); \
+ TYPEOF_UNQUAL(pcp) __old = (oval); \
raw_cpu_generic_try_cmpxchg(pcp, &__old, nval); \
__old; \
})
#define __this_cpu_generic_read_nopreempt(pcp) \
({ \
- typeof(pcp) ___ret; \
+ TYPEOF_UNQUAL(pcp) ___ret; \
preempt_disable_notrace(); \
___ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \
preempt_enable_notrace(); \
@@ -131,7 +144,7 @@ do { \
#define __this_cpu_generic_read_noirq(pcp) \
({ \
- typeof(pcp) ___ret; \
+ TYPEOF_UNQUAL(pcp) ___ret; \
unsigned long ___flags; \
raw_local_irq_save(___flags); \
___ret = raw_cpu_generic_read(pcp); \
@@ -141,7 +154,7 @@ do { \
#define this_cpu_generic_read(pcp) \
({ \
- typeof(pcp) __ret; \
+ TYPEOF_UNQUAL(pcp) __ret; \
if (__native_word(pcp)) \
__ret = __this_cpu_generic_read_nopreempt(pcp); \
else \
@@ -160,7 +173,7 @@ do { \
#define this_cpu_generic_add_return(pcp, val) \
({ \
- typeof(pcp) __ret; \
+ TYPEOF_UNQUAL(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
__ret = raw_cpu_generic_add_return(pcp, val); \
@@ -170,7 +183,7 @@ do { \
#define this_cpu_generic_xchg(pcp, nval) \
({ \
- typeof(pcp) __ret; \
+ TYPEOF_UNQUAL(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
__ret = raw_cpu_generic_xchg(pcp, nval); \
@@ -190,7 +203,7 @@ do { \
#define this_cpu_generic_cmpxchg(pcp, oval, nval) \
({ \
- typeof(pcp) __ret; \
+ TYPEOF_UNQUAL(pcp) __ret; \
unsigned long __flags; \
raw_local_irq_save(__flags); \
__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \
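
For illustration, a minimal sketch (not part of the patch; names are placeholders) of why the generic accessors switch their temporaries to TYPEOF_UNQUAL(): on an arch where __percpu expands to a named address space qualifier such as __seg_gs, a plain __typeof__() copy would carry that qualifier, while TYPEOF_UNQUAL() yields an ordinary object of the underlying type.

	/* Hypothetical example, assuming <linux/percpu-defs.h> is available. */
	DEFINE_PER_CPU(int, demo_counter);

	static int demo_read(void)
	{
		/* With __percpu in a named address space (e.g. __seg_gs on x86),
		 * __typeof__(demo_counter) would keep that qualifier; the
		 * unqualified temporary below is what the generic macros now use.
		 */
		TYPEOF_UNQUAL(demo_counter) val = raw_cpu_read(demo_counter);

		return val;
	}
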
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index e402aef79c93..d1adfba8387e 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -67,22 +67,21 @@
*
* See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE.
*
- * - tlb_remove_page() / __tlb_remove_page()
- * - tlb_remove_page_size() / __tlb_remove_page_size()
- * - __tlb_remove_folio_pages()
+ * - tlb_remove_page() / tlb_remove_page_size()
+ * - __tlb_remove_folio_pages() / __tlb_remove_page_size()
+ * - __tlb_remove_folio_pages_size()
*
- * __tlb_remove_page_size() is the basic primitive that queues a page for
- * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
- * boolean indicating if the queue is (now) full and a call to
- * tlb_flush_mmu() is required.
+ * __tlb_remove_folio_pages_size() is the basic primitive that queues pages
+ * for freeing. It will return a boolean indicating if the queue is (now)
+ * full and a call to tlb_flush_mmu() is required.
*
* tlb_remove_page() and tlb_remove_page_size() imply the call to
* tlb_flush_mmu() when required and have no return value.
*
- * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however,
- * instead of removing a single page, remove the given number of consecutive
- * pages that are all part of the same (large) folio: just like calling
- * __tlb_remove_page() on each page individually.
+ * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size();
+ * however, instead of removing a single page, it assumes PAGE_SIZE and
+ * removes the given number of consecutive pages that are all part of the
+ * same (large) folio.
*
* - tlb_change_page_size()
*
@@ -489,16 +488,6 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
tlb_flush_mmu(tlb);
}
-static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb,
- struct page *page, bool delay_rmap)
-{
- return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE);
-}
-
-/* tlb_remove_page
- * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
- * required.
- */
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
return tlb_remove_page_size(tlb, page, PAGE_SIZE);
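
As a rough caller-side sketch of the contract described in the comment above (hedged; real call sites live in mm/, and 'page', 'other_page', 'nr' and 'delay_rmap' are placeholders): the double-underscore variants only queue pages and report when the gather buffer is full, leaving the flush to the caller, whereas tlb_remove_page() flushes by itself.

	/* Sketch only: queue 'nr' consecutive pages of one large folio and
	 * flush explicitly once the gather buffer fills up.
	 */
	if (__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))
		tlb_flush_mmu(tlb);

	/* Alternative pattern: tlb_remove_page() performs the flush itself. */
	tlb_remove_page(tlb, other_page);
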
diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
index bbc4730a6505..c0989b5b0407 100644
--- a/include/linux/bit_spinlock.h
+++ b/include/linux/bit_spinlock.h
@@ -13,7 +13,7 @@
* Don't use this unless you really need to: spin_lock() and spin_unlock()
* are significantly faster.
*/
-static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+static __always_inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
/*
* Assuming the lock is uncontended, this never enters
@@ -38,7 +38,7 @@ static inline void bit_spin_lock(int bitnum, unsigned long *addr)
/*
* Return true if it was acquired
*/
-static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+static __always_inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
@@ -54,7 +54,7 @@ static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
/*
* bit-based spin_unlock()
*/
-static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
+static __always_inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
BUG_ON(!test_bit(bitnum, addr));
@@ -71,7 +71,7 @@ static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
* non-atomic version, which can be used eg. if the bit lock itself is
* protecting the rest of the flags in the word.
*/
-static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
+static __always_inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
BUG_ON(!test_bit(bitnum, addr));
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index d8a8d245824a..4c506e76a808 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -18,6 +18,8 @@ enum bootmem_type {
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
+void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
+ unsigned long nr_pages);
void get_page_bootmem(unsigned long info, struct page *page,
enum bootmem_type type);
@@ -58,6 +60,11 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
}
+static inline void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *map, unsigned long nr_pages)
+{
+}
+
static inline void put_page_bootmem(struct page *page)
{
}
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index fab70b26e131..f0a4ad7839b6 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -270,7 +270,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
unsigned, struct folio **, void **,
get_block_t *, loff_t *);
int generic_cont_expand_simple(struct inode *inode, loff_t size);
-void block_commit_write(struct page *page, unsigned int from, unsigned int to);
+void block_commit_write(struct folio *folio, size_t from, size_t to);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
diff --git a/include/linux/cma.h b/include/linux/cma.h
index d15b64f51336..62d9c1cf6326 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -40,6 +40,9 @@ static inline int __init cma_declare_contiguous(phys_addr_t base,
return cma_declare_contiguous_nid(base, size, limit, alignment,
order_per_bit, fixed, name, res_cma, NUMA_NO_NODE);
}
+extern int __init cma_declare_contiguous_multi(phys_addr_t size,
+ phys_addr_t align, unsigned int order_per_bit,
+ const char *name, struct cma **res_cma, int nid);
extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
unsigned int order_per_bit,
const char *name,
@@ -50,12 +53,14 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned
extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
+extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end);
extern void cma_reserve_pages_on_error(struct cma *cma);
#ifdef CONFIG_CMA
struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp);
bool cma_free_folio(struct cma *cma, const struct folio *folio);
+bool cma_validate_zones(struct cma *cma);
#else
static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
{
@@ -66,6 +71,10 @@ static inline bool cma_free_folio(struct cma *cma, const struct folio *folio)
{
return false;
}
+static inline bool cma_validate_zones(struct cma *cma)
+{
+ return false;
+}
#endif
#endif
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 7bf0c521db63..173d9c07a895 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -95,7 +95,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx);
+ unsigned long watermark, int highest_zoneidx);
extern void compaction_defer_reset(struct zone *zone, int order,
bool alloc_success);
@@ -113,7 +113,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
}
static inline bool compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx)
+ unsigned long watermark,
+ int highest_zoneidx)
{
return false;
}
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 2e7c2c282f3a..4fc8e26914ad 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -128,3 +128,11 @@
*/
#define ASM_INPUT_G "ir"
#define ASM_INPUT_RM "r"
+
+/*
+ * Declare compiler support for __typeof_unqual__() operator.
+ *
+ * Bindgen uses LLVM even if our C compiler is GCC, so we cannot
+ * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL.
+ */
+#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ >= 19)
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index c9b58188ec61..32048052c64a 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -137,3 +137,11 @@
#if GCC_VERSION < 90100
#undef __alloc_size__
#endif
+
+/*
+ * Declare compiler support for __typeof_unqual__() operator.
+ *
+ * Bindgen uses LLVM even if our C compiler is GCC, so we cannot
+ * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL.
+ */
+#define CC_HAS_TYPEOF_UNQUAL (__GNUC__ >= 14)
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 9fc30b6b80c9..27725f1ab5ab 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -226,6 +226,26 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
__BUILD_BUG_ON_ZERO_MSG(!__is_noncstr(p), \
"must be non-C-string (not NUL-terminated)")
+/*
+ * Use __typeof_unqual__() when available.
+ *
+ * XXX: Remove test for __CHECKER__ once
+ * sparse learns about __typeof_unqual__().
+ */
+#if CC_HAS_TYPEOF_UNQUAL && !defined(__CHECKER__)
+# define USE_TYPEOF_UNQUAL 1
+#endif
+
+/*
+ * Define TYPEOF_UNQUAL() to use __typeof_unqual__() as the typeof
+ * operator when available, returning the unqualified type of the expression.
+ */
+#if defined(USE_TYPEOF_UNQUAL)
+# define TYPEOF_UNQUAL(exp) __typeof_unqual__(exp)
+#else
+# define TYPEOF_UNQUAL(exp) __typeof__(exp)
+#endif
+
#endif /* __KERNEL__ */
#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
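
A minimal illustration (not from the patch) of what TYPEOF_UNQUAL() provides over plain __typeof__() on a compiler that supports __typeof_unqual__(): the qualifiers of the expression's type are dropped, so the resulting object is freely writable.

	const volatile int x = 42;

	TYPEOF_UNQUAL(x) a = x;	/* 'a' is a plain int */
	__typeof__(x) b = x;	/* 'b' keeps const volatile */

	a = 0;			/* fine */
	/* b = 0; would fail to compile because 'b' stayed const */
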
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index e09d323be845..501cffddc2f4 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -57,7 +57,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
# define __user BTF_TYPE_TAG(user)
# endif
# define __iomem
-# define __percpu BTF_TYPE_TAG(percpu)
+# define __percpu __percpu_qual BTF_TYPE_TAG(percpu)
# define __rcu BTF_TYPE_TAG(rcu)
# define __chk_user_ptr(x) (void)0
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 6cc5e484547c..1987400000b4 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -116,7 +116,6 @@ enum cpuhp_state {
CPUHP_NET_IUCV_PREPARE,
CPUHP_ARM_BL_PREPARE,
CPUHP_TRACE_RB_PREPARE,
- CPUHP_MM_ZS_PREPARE,
CPUHP_MM_ZSWP_POOL_PREPARE,
CPUHP_KVM_PPC_BOOK3S_PREPARE,
CPUHP_ZCOMP_PREPARE,
diff --git a/include/linux/damon.h b/include/linux/damon.h
index c9074d569596..47e36e6ea203 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -36,6 +36,16 @@ struct damon_addr_range {
};
/**
+ * struct damon_size_range - Represents size for filter to operate on [@min, @max].
+ * @min: Min size (inclusive).
+ * @max: Max size (inclusive).
+ */
+struct damon_size_range {
+ unsigned long min;
+ unsigned long max;
+};
+
+/**
* struct damon_region - Represents a monitoring target region.
* @ar: The address range of the region.
* @sampling_addr: Address of the sample for the next access check.
@@ -324,8 +334,11 @@ struct damos_stat {
/**
* enum damos_filter_type - Type of memory for &struct damos_filter
* @DAMOS_FILTER_TYPE_ANON: Anonymous pages.
+ * @DAMOS_FILTER_TYPE_ACTIVE: Active pages.
* @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages.
* @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages.
+ * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage.
+ * @DAMOS_FILTER_TYPE_UNMAPPED: Unmapped pages.
* @DAMOS_FILTER_TYPE_ADDR: Address range.
* @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target.
* @NR_DAMOS_FILTER_TYPES: Number of filter types.
@@ -343,8 +356,11 @@ struct damos_stat {
*/
enum damos_filter_type {
DAMOS_FILTER_TYPE_ANON,
+ DAMOS_FILTER_TYPE_ACTIVE,
DAMOS_FILTER_TYPE_MEMCG,
DAMOS_FILTER_TYPE_YOUNG,
+ DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
+ DAMOS_FILTER_TYPE_UNMAPPED,
DAMOS_FILTER_TYPE_ADDR,
DAMOS_FILTER_TYPE_TARGET,
NR_DAMOS_FILTER_TYPES,
@@ -360,6 +376,7 @@ enum damos_filter_type {
* @target_idx: Index of the &struct damon_target of
* &damon_ctx->adaptive_targets if @type is
* DAMOS_FILTER_TYPE_TARGET.
+ * @sz_range: Size range if @type is DAMOS_FILTER_TYPE_HUGEPAGE_SIZE.
* @list: List head for siblings.
*
* Before applying the &damos->action to a memory region, DAMOS checks if each
@@ -376,6 +393,7 @@ struct damos_filter {
unsigned short memcg_id;
struct damon_addr_range addr_range;
int target_idx;
+ struct damon_size_range sz_range;
};
struct list_head list;
};
@@ -432,6 +450,8 @@ struct damos_access_pattern {
* @wmarks: Watermarks for automated (in)activation of this scheme.
* @target_nid: Destination node if @action is "migrate_{hot,cold}".
* @filters: Additional set of &struct damos_filter for &action.
+ * @ops_filters: List of &struct damos_filter objects handled by the ops layer.
+ * @last_applied: The last ops-managing entity that @action was applied to.
* @stat: Statistics of this scheme.
* @list: List head for siblings.
*
@@ -454,6 +474,15 @@ struct damos_access_pattern {
* implementation could check pages of the region and skip &action to respect
* &filters
*
+ * The minimum entity to which @action can be applied depends on the
+ * underlying &struct damon_operations. Since it may not be aligned with the
+ * core layer abstraction, namely &struct damon_region, &struct
+ * damon_operations could apply @action to the same entity multiple times.
+ * Large folios that underlie multiple &struct damon_region objects could be
+ * such examples. The &struct damon_operations can use @last_applied to
+ * avoid that. DAMOS core logic unsets @last_applied when each walk of the
+ * regions for applying the scheme is finished.
+ *
* After applying the &action to each region, &stat_count and &stat_sz is
* updated to reflect the number of regions and total size of regions that the
* &action is applied.
@@ -475,6 +504,9 @@ struct damos {
* layer-handled filters. If true, operations layer allows it, too.
*/
bool core_filters_allowed;
+ /* whether to reject regions unmatched by core/ops filters */
+ bool core_filters_default_reject;
+ bool ops_filters_default_reject;
/* public: */
struct damos_quota quota;
struct damos_watermarks wmarks;
@@ -482,6 +514,8 @@ struct damos {
int target_nid;
};
struct list_head filters;
+ struct list_head ops_filters;
+ void *last_applied;
struct damos_stat stat;
struct list_head list;
};
@@ -510,7 +544,6 @@ enum damon_ops_id {
* @update: Update operations-related data structures.
* @prepare_access_checks: Prepare next access check of target regions.
* @check_accesses: Check the accesses to target regions.
- * @reset_aggregated: Reset aggregated accesses monitoring results.
* @get_scheme_score: Get the score of a region for a scheme.
* @apply_scheme: Apply a DAMON-based operation scheme.
* @target_valid: Determine if the target is valid.
@@ -522,8 +555,7 @@ enum damon_ops_id {
* (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting
* the monitoring, @update after each &damon_attrs.ops_update_interval, and
* @check_accesses, @target_valid and @prepare_access_checks after each
- * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after
- * each &damon_attrs.aggr_interval.
+ * &damon_attrs.sample_interval.
*
* Each &struct damon_operations instance having valid @id can be registered
* via damon_register_ops() and selected by damon_select_ops() later.
@@ -538,8 +570,6 @@ enum damon_ops_id {
* last preparation and update the number of observed accesses of each region.
* It should also return max number of observed accesses that made as a result
* of its update. The value will be used for regions adjustment threshold.
- * @reset_aggregated should reset the access monitoring results that aggregated
- * by @check_accesses.
* @get_scheme_score should return the priority score of a region for a scheme
* as an integer in [0, &DAMOS_MAX_SCORE].
* @apply_scheme is called from @kdamond when a region for user provided
@@ -557,7 +587,6 @@ struct damon_operations {
void (*update)(struct damon_ctx *context);
void (*prepare_access_checks)(struct damon_ctx *context);
unsigned int (*check_accesses)(struct damon_ctx *context);
- void (*reset_aggregated)(struct damon_ctx *context);
int (*get_scheme_score)(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme);
@@ -571,43 +600,28 @@ struct damon_operations {
/**
* struct damon_callback - Monitoring events notification callbacks.
*
- * @before_start: Called before starting the monitoring.
* @after_wmarks_check: Called after each schemes' watermarks check.
- * @after_sampling: Called after each sampling.
* @after_aggregation: Called after each aggregation.
- * @before_damos_apply: Called before applying DAMOS action.
* @before_terminate: Called before terminating the monitoring.
- * @private: User private data.
*
- * The monitoring thread (&damon_ctx.kdamond) calls @before_start and
- * @before_terminate just before starting and finishing the monitoring,
- * respectively. Therefore, those are good places for installing and cleaning
- * @private.
+ * The monitoring thread (&damon_ctx.kdamond) calls @before_terminate just
+ * before finishing the monitoring.
*
* The monitoring thread calls @after_wmarks_check after each DAMON-based
* operation schemes' watermarks check. If users need to make changes to the
* attributes of the monitoring context while it's deactivated due to the
* watermarks, this is the good place to do.
*
- * The monitoring thread calls @after_sampling and @after_aggregation for each
- * of the sampling intervals and aggregation intervals, respectively.
- * Therefore, users can safely access the monitoring results without additional
- * protection. For the reason, users are recommended to use these callback for
- * the accesses to the results.
+ * The monitoring thread calls @after_aggregation for each of the aggregation
+ * intervals. Therefore, users can safely access the monitoring results
+ * without additional protection. For that reason, users are recommended to
+ * use this callback for accessing the results.
*
* If any callback returns non-zero, monitoring stops.
*/
struct damon_callback {
- void *private;
-
- int (*before_start)(struct damon_ctx *context);
int (*after_wmarks_check)(struct damon_ctx *context);
- int (*after_sampling)(struct damon_ctx *context);
int (*after_aggregation)(struct damon_ctx *context);
- int (*before_damos_apply)(struct damon_ctx *context,
- struct damon_target *target,
- struct damon_region *region,
- struct damos *scheme);
void (*before_terminate)(struct damon_ctx *context);
};
@@ -633,11 +647,37 @@ struct damon_call_control {
};
/**
+ * struct damon_intervals_goal - Monitoring intervals auto-tuning goal.
+ *
+ * @access_bp: Access events observation ratio to achieve in bp.
+ * @aggrs: Number of aggregations to achieve @access_bp within.
+ * @min_sample_us: Minimum resulting sampling interval in microseconds.
+ * @max_sample_us: Maximum resulting sampling interval in microseconds.
+ *
+ * DAMON automatically tunes &damon_attrs->sample_interval and
+ * &damon_attrs->aggr_interval, aiming for the ratio, in bp (1/10,000), of
+ * DAMON-observed access events to the theoretical maximum amount within
+ * @aggrs aggregations to be the same as @access_bp. The logic increases
+ * &damon_attrs->aggr_interval and &damon_attrs->sample_interval by the same
+ * ratio if the current access events observation ratio is lower than the
+ * target for each @aggrs aggregations, and vice versa.
+ *
+ * If @aggrs is zero, the tuning is disabled and hence this struct is ignored.
+ */
+struct damon_intervals_goal {
+ unsigned long access_bp;
+ unsigned long aggrs;
+ unsigned long min_sample_us;
+ unsigned long max_sample_us;
+};
+
+/**
* struct damon_attrs - Monitoring attributes for accuracy/overhead control.
*
* @sample_interval: The time between access samplings.
* @aggr_interval: The time between monitor results aggregations.
* @ops_update_interval: The time between monitoring operations updates.
+ * @intervals_goal: Intervals auto-tuning goal.
* @min_nr_regions: The minimum number of adaptive monitoring
* regions.
* @max_nr_regions: The maximum number of adaptive monitoring
@@ -657,8 +697,20 @@ struct damon_attrs {
unsigned long sample_interval;
unsigned long aggr_interval;
unsigned long ops_update_interval;
+ struct damon_intervals_goal intervals_goal;
unsigned long min_nr_regions;
unsigned long max_nr_regions;
+/* private: internal use only */
+ /*
+ * @aggr_interval to @sample_interval ratio.
+ * Core-external components call damon_set_attrs() with &damon_attrs
+ * in which this field is unset. In that case, damon_set_attrs() sets
+ * this field of the resulting &damon_attrs. Core-internal components
+ * such as kdamond_tune_intervals() call damon_set_attrs() with
+ * &damon_attrs in which this field is set. In that case,
+ * damon_set_attrs() just keeps it.
+ */
+ unsigned long aggr_samples;
};
/**
@@ -707,6 +759,11 @@ struct damon_ctx {
* update
*/
unsigned long next_ops_update_sis;
+ /*
+ * number of sample intervals that should be passed before next
+ * intervals tuning
+ */
+ unsigned long next_intervals_tune_sis;
/* for waiting until the execution of the kdamond_fn is started */
struct completion kdamond_started;
/* for scheme quotas prioritization */
@@ -788,6 +845,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
#define damos_for_each_filter_safe(f, next, scheme) \
list_for_each_entry_safe(f, next, &(scheme)->filters, list)
+#define damos_for_each_ops_filter(f, scheme) \
+ list_for_each_entry(f, &(scheme)->ops_filters, list)
+
+#define damos_for_each_ops_filter_safe(f, next, scheme) \
+ list_for_each_entry_safe(f, next, &(scheme)->ops_filters, list)
+
#ifdef CONFIG_DAMON
struct damon_region *damon_new_region(unsigned long start, unsigned long end);
@@ -813,6 +876,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed,
struct damos_filter *damos_new_filter(enum damos_filter_type type,
bool matching, bool allow);
void damos_add_filter(struct damos *s, struct damos_filter *f);
+bool damos_filter_for_ops(enum damos_filter_type type);
void damos_destroy_filter(struct damos_filter *f);
struct damos_quota_goal *damos_new_quota_goal(
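
For the new DAMOS_FILTER_TYPE_HUGEPAGE_SIZE type and &struct damon_size_range, a hedged usage sketch (the 'scheme' pointer and the size bounds are placeholders chosen for illustration):

	/* Sketch only: let the scheme act on hugepages whose size is between
	 * 2 MiB and 1 GiB, both ends inclusive.
	 */
	struct damos_filter *filter;

	filter = damos_new_filter(DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
				  /* matching */ true, /* allow */ true);
	if (!filter)
		return -ENOMEM;
	filter->sz_range.min = SZ_2M;
	filter->sz_range.max = SZ_1G;
	damos_add_filter(scheme, filter);
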
diff --git a/include/linux/dax.h b/include/linux/dax.h
index df41a0017b31..dcc9fcdf14e4 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -207,6 +207,11 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
+static inline bool dax_page_is_idle(struct page *page)
+{
+ return page && page_ref_count(page) == 0;
+}
+
#if IS_ENABLED(CONFIG_DAX)
int dax_read_lock(void);
void dax_read_unlock(int id);
@@ -220,6 +225,19 @@ static inline void dax_read_unlock(int id)
{
}
#endif /* CONFIG_DAX */
+
+#if !IS_ENABLED(CONFIG_FS_DAX)
+static inline int __must_check dax_break_layout(struct inode *inode,
+ loff_t start, loff_t end, void (cb)(struct inode *))
+{
+ return 0;
+}
+
+static inline void dax_break_layout_final(struct inode *inode)
+{
+}
+#endif
+
bool dax_alive(struct dax_device *dax_dev);
void *dax_get_private(struct dax_device *dax_dev);
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
@@ -241,8 +259,18 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
unsigned int order, pfn_t pfn);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+void dax_delete_mapping_range(struct address_space *mapping,
+ loff_t start, loff_t end);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
+int __must_check dax_break_layout(struct inode *inode, loff_t start,
+ loff_t end, void (cb)(struct inode *));
+static inline int __must_check dax_break_layout_inode(struct inode *inode,
+ void (cb)(struct inode *))
+{
+ return dax_break_layout(inode, 0, LLONG_MAX, cb);
+}
+void dax_break_layout_final(struct inode *inode);
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dest, loff_t destoff,
loff_t len, bool *is_same,
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 5ba187e08cf7..cd653862ab99 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -225,6 +225,7 @@ struct fb_deferred_io {
int open_count; /* number of opened files; protected by fb_info lock */
struct mutex lock; /* mutex that protects the pageref list */
struct list_head pagereflist; /* list of pagerefs for touched pages */
+ struct address_space *mapping; /* page cache object for fb device */
/* callback */
struct page *(*get_page)(struct fb_info *info, unsigned long offset);
void (*deferred_io)(struct fb_info *info, struct list_head *pagelist);
diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h
index 4d3f8074c137..45ad2408a80c 100644
--- a/include/linux/folio_queue.h
+++ b/include/linux/folio_queue.h
@@ -15,6 +15,7 @@
#define _LINUX_FOLIO_QUEUE_H
#include <linux/pagevec.h>
+#include <linux/mm.h>
/*
* Segment in a queue of running buffers. Each segment can hold a number of
@@ -216,13 +217,6 @@ static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot)
clear_bit(slot, &folioq->marks3);
}
-static inline unsigned int __folio_order(struct folio *folio)
-{
- if (!folio_test_large(folio))
- return 0;
- return folio->_flags_1 & 0xff;
-}
-
/**
* folioq_append: Add a folio to a folio queue segment
* @folioq: The segment to add to
@@ -241,7 +235,7 @@ static inline unsigned int folioq_append(struct folio_queue *folioq, struct foli
unsigned int slot = folioq->vec.nr++;
folioq->vec.folios[slot] = folio;
- folioq->orders[slot] = __folio_order(folio);
+ folioq->orders[slot] = folio_order(folio);
return slot;
}
@@ -263,7 +257,7 @@ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct
unsigned int slot = folioq->vec.nr++;
folioq->vec.folios[slot] = folio;
- folioq->orders[slot] = __folio_order(folio);
+ folioq->orders[slot] = folio_order(folio);
folioq_mark(folioq, slot);
return slot;
}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 93e509b6c00e..e893d546a49f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
+ bool write);
+vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
+ bool write);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_UNSUPPORTED,
@@ -341,6 +345,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order);
int min_order_for_split(struct folio *folio);
int split_folio_to_list(struct folio *folio, struct list_head *list);
+bool uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns);
+bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns);
+int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
+ struct list_head *list);
+/*
+ * try_folio_split - try to split a @folio at @page using non uniform split.
+ * @folio: folio to be split
+ * @page: split to order-0 at the given page
+ * @list: store the after-split folios
+ *
+ * Try to split a @folio at @page using non uniform split to order-0, if
+ * non uniform split is not supported, fall back to uniform split.
+ *
+ * Return: 0: split is successful, otherwise split failed.
+ */
+static inline int try_folio_split(struct folio *folio, struct page *page,
+ struct list_head *list)
+{
+ int ret = min_order_for_split(folio);
+
+ if (ret < 0)
+ return ret;
+
+ if (!non_uniform_split_supported(folio, 0, false))
+ return split_huge_page_to_list_to_order(&folio->page, list,
+ ret);
+ return folio_split(folio, ret, page, list);
+}
static inline int split_huge_page(struct page *page)
{
struct folio *folio = page_folio(page);
@@ -404,7 +438,7 @@ int madvise_collapse(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, long adjust_next);
+ unsigned long end, struct vm_area_struct *next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma);
@@ -533,6 +567,12 @@ static inline int split_folio_to_list(struct folio *folio, struct list_head *lis
return 0;
}
+static inline int try_folio_split(struct folio *folio, struct page *page,
+ struct list_head *list)
+{
+ return 0;
+}
+
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
@@ -571,7 +611,7 @@ static inline int madvise_collapse(struct vm_area_struct *vma,
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
- long adjust_next)
+ struct vm_area_struct *next)
{
}
static inline int is_swap_pmd(pmd_t pmd)
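
A hedged usage sketch of the try_folio_split() helper added above ('folio' and 'page' are placeholders, and the folio is assumed to be locked as the split paths require):

	/* Sketch only: split 'folio' around 'page', preferring the
	 * non-uniform split and falling back to the uniform one.
	 */
	LIST_HEAD(after_split);
	int err;

	err = try_folio_split(folio, page, &after_split);
	if (err)
		return err;	/* non-zero: split failed */
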
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 76a75ec03dd6..8f3ac832ee7f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -174,6 +174,9 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
extern int sysctl_hugetlb_shm_group;
extern struct list_head huge_boot_pages[MAX_NUMNODES];
+void hugetlb_bootmem_alloc(void);
+bool hugetlb_bootmem_allocated(void);
+
/* arch callbacks */
#ifndef CONFIG_HIGHPTE
@@ -588,6 +591,7 @@ enum hugetlb_page_flags {
HPG_freed,
HPG_vmemmap_optimized,
HPG_raw_hwp_unreliable,
+ HPG_cma,
__NR_HPAGEFLAGS,
};
@@ -647,6 +651,7 @@ HPAGEFLAG(Temporary, temporary)
HPAGEFLAG(Freed, freed)
HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
+HPAGEFLAG(Cma, cma)
#ifdef CONFIG_HUGETLB_PAGE
@@ -675,11 +680,21 @@ struct hstate {
char name[HSTATE_NAME_LEN];
};
+struct cma;
+
struct huge_bootmem_page {
struct list_head list;
struct hstate *hstate;
+ unsigned long flags;
+ struct cma *cma;
};
+#define HUGE_BOOTMEM_HVO 0x0001
+#define HUGE_BOOTMEM_ZONES_VALID 0x0002
+#define HUGE_BOOTMEM_CMA 0x0004
+
+bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);
+
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
void wait_for_freed_hugetlb_folios(void);
@@ -815,6 +830,17 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
}
#endif
+#ifndef arch_has_huge_bootmem_alloc
+/*
+ * Some architectures do their own bootmem allocation, so they can't use
+ * early CMA allocation.
+ */
+static inline bool arch_has_huge_bootmem_alloc(void)
+{
+ return false;
+}
+#endif
+
static inline struct hstate *folio_hstate(struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
@@ -1257,6 +1283,15 @@ static inline bool hugetlbfs_pagecache_present(
{
return false;
}
+
+static inline void hugetlb_bootmem_alloc(void)
+{
+}
+
+static inline bool hugetlb_bootmem_allocated(void)
+{
+ return false;
+}
#endif /* CONFIG_HUGETLB_PAGE */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index e79eb6ac516f..ef5a1ecc6e59 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -133,7 +133,6 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);
-void memblock_free_all(void);
void memblock_free(void *ptr, size_t size);
void reset_all_zones_managed_pages(void);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6e74b8254d9b..53364526d877 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -438,9 +438,7 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio)
*/
static inline bool folio_memcg_charged(struct folio *folio)
{
- if (folio_memcg_kmem(folio))
- return __folio_objcg(folio) != NULL;
- return __folio_memcg(folio) != NULL;
+ return folio->memcg_data != 0;
}
/*
@@ -649,8 +647,6 @@ int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp);
int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
-
void __mem_cgroup_uncharge(struct folio *folio);
/**
@@ -1040,7 +1036,9 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
rcu_read_unlock();
}
-void split_page_memcg(struct page *head, int old_order, int new_order);
+void split_page_memcg(struct page *first, unsigned order);
+void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
+ unsigned new_order);
static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
{
@@ -1165,10 +1163,6 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
return 0;
}
-static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr)
-{
-}
-
static inline void mem_cgroup_uncharge(struct folio *folio)
{
}
@@ -1465,7 +1459,12 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
-static inline void split_page_memcg(struct page *head, int old_order, int new_order)
+static inline void split_page_memcg(struct page *first, unsigned order)
+{
+}
+
+static inline void folio_split_memcg_refs(struct folio *folio,
+ unsigned old_order, unsigned new_order)
{
}
@@ -1848,6 +1847,9 @@ static inline void mem_cgroup_exit_user_fault(void)
current->in_user_fault = 0;
}
+void memcg1_swapout(struct folio *folio, swp_entry_t entry);
+void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages);
+
#else /* CONFIG_MEMCG_V1 */
static inline
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
@@ -1875,6 +1877,14 @@ static inline void mem_cgroup_exit_user_fault(void)
{
}
+static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry)
+{
+}
+
+static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
+{
+}
+
#endif /* CONFIG_MEMCG_V1 */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index c0afee5d126e..12daa6ec7d09 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -25,7 +25,7 @@
/**
* struct memory_group - a logical group of memory blocks
* @nid: The node id for all memory blocks inside the memory group.
- * @blocks: List of all memory blocks belonging to this memory group.
+ * @memory_blocks: List of all memory blocks belonging to this memory group.
* @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this
* memory group.
* @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 3f7143ade32c..4aa151914eab 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -161,7 +161,7 @@ static inline bool is_device_private_page(const struct page *page)
{
return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
is_zone_device_page(page) &&
- page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+ page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE;
}
static inline bool folio_is_device_private(const struct folio *folio)
@@ -173,13 +173,13 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
is_zone_device_page(page) &&
- page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
+ page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA;
}
static inline bool is_device_coherent_page(const struct page *page)
{
return is_zone_device_page(page) &&
- page->pgmap->type == MEMORY_DEVICE_COHERENT;
+ page_pgmap(page)->type == MEMORY_DEVICE_COHERENT;
}
static inline bool folio_is_device_coherent(const struct folio *folio)
@@ -187,6 +187,17 @@ static inline bool folio_is_device_coherent(const struct folio *folio)
return is_device_coherent_page(&folio->page);
}
+static inline bool is_fsdax_page(const struct page *page)
+{
+ return is_zone_device_page(page) &&
+ page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX;
+}
+
+static inline bool folio_is_fsdax(const struct folio *folio)
+{
+ return is_fsdax_page(&folio->page);
+}
+
#ifdef CONFIG_ZONE_DEVICE
void zone_device_page_init(struct page *page);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 80891120cca9..aaa2114498d6 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -205,8 +205,8 @@ struct migrate_vma {
unsigned long end;
/*
- * Set to the owner value also stored in page->pgmap->owner for
- * migrating out of device private memory. The flags also need to
+ * Set to the owner value also stored in page_pgmap(page)->owner
+ * for migrating out of device private memory. The flags also need to
* be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE.
* The caller should always set this field when using mmu notifier
* callbacks to avoid device MMU invalidations for device private
diff --git a/include/linux/mm.h b/include/linux/mm.h
index beba5ba0fd97..32ba0e33422b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -32,6 +32,7 @@
#include <linux/memremap.h>
#include <linux/slab.h>
#include <linux/cacheinfo.h>
+#include <linux/rcuwait.h>
struct mempolicy;
struct anon_vma;
@@ -40,20 +41,10 @@ struct user_struct;
struct pt_regs;
struct folio_batch;
+void arch_mm_preinit(void);
void mm_core_init(void);
void init_mm_internals(void);
-#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */
-extern unsigned long max_mapnr;
-
-static inline void set_max_mapnr(unsigned long limit)
-{
- max_mapnr = limit;
-}
-#else
-static inline void set_max_mapnr(unsigned long limit) { }
-#endif
-
extern atomic_long_t _totalram_pages;
static inline unsigned long totalram_pages(void)
{
@@ -242,8 +233,6 @@ void setup_initial_init_mm(void *start_code, void *end_code,
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
-/* Use only if VMA has no other users */
-void __vm_area_free(struct vm_area_struct *vma);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
@@ -682,13 +671,57 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key lockdep_key;
+
+ lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+ if (reset_refcnt)
+ refcount_set(&vma->vm_refcnt, 0);
+ vma->vm_lock_seq = UINT_MAX;
+}
+
+static inline bool is_vma_writer_only(int refcnt)
+{
+ /*
+ * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
+ * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
+ * a detached vma happens only in vma_mark_detached() and is a rare
+ * case, therefore most of the time there will be no unnecessary wakeup.
+ */
+ return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
+}
+
+static inline void vma_refcount_put(struct vm_area_struct *vma)
+{
+ /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+ struct mm_struct *mm = vma->vm_mm;
+ int oldcnt;
+
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+ if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
+
+ if (is_vma_writer_only(oldcnt - 1))
+ rcuwait_wake_up(&mm->vma_writer_wait);
+ }
+}
+
/*
* Try to read-lock a vma. The function is allowed to occasionally yield false
* locked result to avoid performance overhead, in which case we fall back to
* using mmap_lock. The function should never yield false unlocked result.
+ * False locked result is possible if mm_lock_seq overflows or if vma gets
+ * reused and attached to a different mm before we lock it.
+ * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
+ * detached.
*/
-static inline bool vma_start_read(struct vm_area_struct *vma)
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+ struct vm_area_struct *vma)
{
+ int oldcnt;
+
/*
* Check before locking. A race might cause false locked result.
* We can use READ_ONCE() for the mm_lock_seq here, and don't need
@@ -696,16 +729,26 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* we don't rely on for anything - the mm_lock_seq read against which we
* need ordering is below.
*/
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
- return false;
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
+ return NULL;
- if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
- return false;
+ /*
+ * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+ * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+ * Acquire fence is required here to avoid reordering against later
+ * vm_lock_seq check and checks inside lock_vma_under_rcu().
+ */
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT))) {
+ /* return EAGAIN if vma got detached from under us */
+ return oldcnt ? NULL : ERR_PTR(-EAGAIN);
+ }
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
/*
- * Overflow might produce false locked result.
+ * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
* False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+ * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
* modification invalidates all existing locks.
*
* We must use ACQUIRE semantics for the mm_lock_seq so that if we are
@@ -713,18 +756,47 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* after it has been unlocked.
* This pairs with RELEASE semantics in vma_end_write_all().
*/
- if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
- up_read(&vma->vm_lock->lock);
- return false;
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+ vma_refcount_put(vma);
+ return NULL;
}
+
+ return vma;
+}
+
+/*
+ * Use only while holding mmap read lock which guarantees that locking will not
+ * fail (nobody can concurrently write-lock the vma). vma_start_read() should
+ * not be used in such cases because it might fail due to mm_lock_seq overflow.
+ * This functionality is used to obtain vma read lock and drop the mmap read lock.
+ */
+static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
+{
+ int oldcnt;
+
+ mmap_assert_locked(vma->vm_mm);
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT)))
+ return false;
+
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
return true;
}
+/*
+ * Use only while holding mmap read lock which guarantees that locking will not
+ * fail (nobody can concurrently write-lock the vma). vma_start_read() should
+ * not be used in such cases because it might fail due to mm_lock_seq overflow.
+ * This functionality is used to obtain vma read lock and drop the mmap read lock.
+ */
+static inline bool vma_start_read_locked(struct vm_area_struct *vma)
+{
+ return vma_start_read_locked_nested(vma, 0);
+}
+
static inline void vma_end_read(struct vm_area_struct *vma)
{
- rcu_read_lock(); /* keeps vma alive till the end of up_read */
- up_read(&vma->vm_lock->lock);
- rcu_read_unlock();
+ vma_refcount_put(vma);
}
/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
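
To put the refcount-based scheme in context: vm_refcnt is 0 for a detached vma, 1 when attached with no readers, each reader adds 1, and a writer adds VMA_LOCK_OFFSET. A rough caller sketch (hedged; modeled on the lock_vma_under_rcu() pattern, with the maple-tree lookup and the retry policy assumed for illustration):

	/* Sketch only: look the VMA up under RCU, try the per-VMA read lock,
	 * and fall back to mmap_lock when that fails.
	 */
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (vma)
		vma = vma_start_read(mm, vma);
	rcu_read_unlock();

	/* ERR_PTR(-EAGAIN) means the vma was detached; real callers retry
	 * the lookup in that case instead of giving up.
	 */
	if (IS_ERR_OR_NULL(vma))
		return NULL;	/* fall back to taking mmap_lock */

	/* ... fault handling under the per-VMA read lock ... */
	vma_end_read(vma);
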
@@ -740,6 +812,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l
return (vma->vm_lock_seq == *mm_lock_seq);
}
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
+
/*
* Begin writing to a VMA.
* Exclude concurrent readers under the per-VMA lock until the currently
@@ -752,15 +826,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
if (__is_vma_write_locked(vma, &mm_lock_seq))
return;
- down_write(&vma->vm_lock->lock);
- /*
- * We should use WRITE_ONCE() here because we can have concurrent reads
- * from the early lockless pessimistic check in vma_start_read().
- * We don't really care about the correctness of that early check, but
- * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
- */
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- up_write(&vma->vm_lock->lock);
+ __vma_start_write(vma, mm_lock_seq);
}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -772,18 +838,36 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
- if (!rwsem_is_locked(&vma->vm_lock->lock))
- vma_assert_write_locked(vma);
+ unsigned int mm_lock_seq;
+
+ VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
+ !__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+/*
+ * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
+ * assertions should be made either under mmap_write_lock or when the object
+ * has been isolated under mmap_write_lock, ensuring no competing writers.
+ */
+static inline void vma_assert_attached(struct vm_area_struct *vma)
+{
+ WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
}
-static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+static inline void vma_assert_detached(struct vm_area_struct *vma)
{
- /* When detaching vma should be write-locked */
- if (detached)
- vma_assert_write_locked(vma);
- vma->detached = detached;
+ WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
}
+static inline void vma_mark_attached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_detached(vma);
+ refcount_set_release(&vma->vm_refcnt, 1);
+}
+
+void vma_mark_detached(struct vm_area_struct *vma);
+
static inline void release_fault_lock(struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_VMA_LOCK)
@@ -805,14 +889,18 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
#else /* CONFIG_PER_VMA_LOCK */
-static inline bool vma_start_read(struct vm_area_struct *vma)
- { return false; }
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+ { return NULL; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{ mmap_assert_write_locked(vma->vm_mm); }
-static inline void vma_mark_detached(struct vm_area_struct *vma,
- bool detached) {}
+static inline void vma_assert_attached(struct vm_area_struct *vma) {}
+static inline void vma_assert_detached(struct vm_area_struct *vma) {}
+static inline void vma_mark_attached(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma) {}
static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
unsigned long address)
@@ -839,18 +927,13 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
extern const struct vm_operations_struct vma_dummy_vm_ops;
-/*
- * WARNING: vma_init does not initialize vma->vm_lock.
- * Use vm_area_alloc()/vm_area_free() if vma needs locking.
- */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_ops = &vma_dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
- vma_mark_detached(vma, false);
- vma_numab_state_init(vma);
+ vma_lock_init(vma, false);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
@@ -1043,6 +1126,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;
+ vma_mark_attached(vma);
return 0;
}
@@ -1083,6 +1167,25 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
struct mmu_gather;
struct inode;
+extern void prep_compound_page(struct page *page, unsigned int order);
+
+static inline unsigned int folio_large_order(const struct folio *folio)
+{
+ return folio->_flags_1 & 0xff;
+}
+
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+static inline long folio_large_nr_pages(const struct folio *folio)
+{
+ return folio->_nr_pages;
+}
+#else
+static inline long folio_large_nr_pages(const struct folio *folio)
+{
+ return 1L << folio_large_order(folio);
+}
+#endif
+
/*
* compound_order() can be called without holding a reference, which means
* that niceties like page_folio() don't work. These callers should be
@@ -1096,7 +1199,7 @@ static inline unsigned int compound_order(struct page *page)
if (!test_bit(PG_head, &folio->flags))
return 0;
- return folio->_flags_1 & 0xff;
+ return folio_large_order(folio);
}
/**
@@ -1112,7 +1215,7 @@ static inline unsigned int folio_order(const struct folio *folio)
{
if (!folio_test_large(folio))
return 0;
- return folio->_flags_1 & 0xff;
+ return folio_large_order(folio);
}
#include <linux/huge_mm.h>
@@ -1205,6 +1308,8 @@ static inline int is_vmalloc_or_module_addr(const void *x)
static inline int folio_entire_mapcount(const struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1))
+ return 0;
return atomic_read(&folio->_entire_mapcount) + 1;
}
@@ -1404,25 +1509,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
* back into memory.
*/
-#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
-DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-
-bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
-static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
- if (!static_branch_unlikely(&devmap_managed_key))
- return false;
- if (!folio_is_zone_device(folio))
- return false;
- return __put_devmap_managed_folio_refs(folio, refs);
-}
-#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
- return false;
-}
-#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1543,12 +1629,6 @@ static inline void put_page(struct page *page)
if (folio_test_slab(folio))
return;
- /*
- * For some devmap managed pages we need to catch refcount transition
- * from 2 to 1:
- */
- if (put_devmap_managed_folio_refs(folio, 1))
- return;
folio_put(folio);
}
@@ -1907,6 +1987,13 @@ static inline struct folio *pfn_folio(unsigned long pfn)
return page_folio(pfn_to_page(pfn));
}
+static inline bool folio_has_pincount(const struct folio *folio)
+{
+ if (IS_ENABLED(CONFIG_64BIT))
+ return folio_test_large(folio);
+ return folio_order(folio) > 1;
+}
+
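
A hedged sketch of what the new helper implies for callers: _pincount only exists where folio_has_pincount() says so (all large folios on 64-bit, only order > 1 on 32-bit), so exact pin counts must otherwise fall back to the GUP_PIN_COUNTING_BIAS scheme. The helper name below is hypothetical.

static inline int example_folio_exact_pincount(struct folio *folio)
{
	if (!folio_has_pincount(folio))
		return -1;	/* no dedicated _pincount for this folio size/arch */
	return atomic_read(&folio->_pincount);
}
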
/**
* folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
* @folio: The folio.
@@ -1923,7 +2010,7 @@ static inline struct folio *pfn_folio(unsigned long pfn)
* get that many refcounts, and b) all the callers of this routine are
* expected to be able to deal gracefully with a false positive.
*
- * For large folios, the result will be exactly correct. That's because
+ * For most large folios, the result will be exactly correct. That's because
* we have more tracking data available: the _pincount field is used
* instead of the GUP_PIN_COUNTING_BIAS scheme.
*
@@ -1934,7 +2021,7 @@ static inline struct folio *pfn_folio(unsigned long pfn)
*/
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
return atomic_read(&folio->_pincount) > 0;
/*
@@ -2006,6 +2093,13 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio)
if (folio_is_device_coherent(folio))
return false;
+ /*
+ * Filesystems can only tolerate transient delays to truncate and
+ * hole-punch operations
+ */
+ if (folio_is_fsdax(folio))
+ return false;
+
/* Otherwise, non-movable zone folios can be pinned. */
return !folio_is_zone_movable(folio);
@@ -2049,11 +2143,7 @@ static inline long folio_nr_pages(const struct folio *folio)
{
if (!folio_test_large(folio))
return 1;
-#ifdef CONFIG_64BIT
- return folio->_folio_nr_pages;
-#else
- return 1L << (folio->_flags_1 & 0xff);
-#endif
+ return folio_large_nr_pages(folio);
}
/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
@@ -2068,24 +2158,20 @@ static inline long folio_nr_pages(const struct folio *folio)
* page. compound_nr() can be called on a tail page, and is defined to
* return 1 in that case.
*/
-static inline unsigned long compound_nr(struct page *page)
+static inline long compound_nr(struct page *page)
{
struct folio *folio = (struct folio *)page;
if (!test_bit(PG_head, &folio->flags))
return 1;
-#ifdef CONFIG_64BIT
- return folio->_folio_nr_pages;
-#else
- return 1L << (folio->_flags_1 & 0xff);
-#endif
+ return folio_large_nr_pages(folio);
}
/**
* thp_nr_pages - The number of regular pages in this huge page.
* @page: The head page of a huge page.
*/
-static inline int thp_nr_pages(struct page *page)
+static inline long thp_nr_pages(struct page *page)
{
return folio_nr_pages((struct folio *)page);
}
@@ -2140,23 +2226,18 @@ static inline size_t folio_size(const struct folio *folio)
}
/**
- * folio_likely_mapped_shared - Estimate if the folio is mapped into the page
- * tables of more than one MM
+ * folio_maybe_mapped_shared - Whether the folio is mapped into the page
+ * tables of more than one MM
* @folio: The folio.
*
- * This function checks if the folio is currently mapped into more than one
- * MM ("mapped shared"), or if the folio is only mapped into a single MM
- * ("mapped exclusively").
+ * This function checks if the folio may be currently mapped into more than one
+ * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single
+ * MM ("mapped exclusively").
*
* For KSM folios, this function also returns "mapped shared" when a folio is
* mapped multiple times into the same MM, because the individual page mappings
* are independent.
*
- * As precise information is not easily available for all folios, this function
- * estimates the number of MMs ("sharers") that are currently mapping a folio
- * using the number of times the first page of the folio is currently mapped
- * into page tables.
- *
* For small anonymous folios and anonymous hugetlb folios, the return
* value will be exactly correct: non-KSM folios can only be mapped at most once
* into an MM, and they cannot be partially mapped. KSM folios are
@@ -2164,8 +2245,8 @@ static inline size_t folio_size(const struct folio *folio)
*
* For other folios, the result can be fuzzy:
* #. For partially-mappable large folios (THP), the return value can wrongly
- * indicate "mapped exclusively" (false negative) when the folio is
- * only partially mapped into at least one MM.
+ * indicate "mapped shared" (false positive) if a folio was mapped by
+ * more than two MMs at one point in time.
* #. For pagecache folios (including hugetlb), the return value can wrongly
* indicate "mapped shared" (false positive) when two VMAs in the same MM
* cover the same file range.
@@ -2182,7 +2263,7 @@ static inline size_t folio_size(const struct folio *folio)
*
* Return: Whether the folio is estimated to be mapped into more than one MM.
*/
-static inline bool folio_likely_mapped_shared(struct folio *folio)
+static inline bool folio_maybe_mapped_shared(struct folio *folio)
{
int mapcount = folio_mapcount(folio);
@@ -2190,16 +2271,22 @@ static inline bool folio_likely_mapped_shared(struct folio *folio)
if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
return mapcount > 1;
- /* A single mapping implies "mapped exclusively". */
- if (mapcount <= 1)
- return false;
-
- /* If any page is mapped more than once we treat it "mapped shared". */
- if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio))
+ /*
+ * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ...
+ * simply assume "mapped shared", nobody should really care
+ * about this for arbitrary kernel allocations.
+ */
+ if (!IS_ENABLED(CONFIG_MM_ID))
return true;
- /* Let's guess based on the first subpage. */
- return atomic_read(&folio->_mapcount) > 0;
+ /*
+ * A single mapping implies "mapped exclusively", even if the
+ * folio flag says something different: it's easier to handle this
+ * case here instead of on the RMAP hot path.
+ */
+ if (mapcount <= 1)
+ return false;
+ return folio_test_large_maybe_mapped_shared(folio);
}
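
An illustrative caller (not from the patch) showing how the fuzzy semantics are meant to be used: a false positive merely skips an optimization, so callers only act when the folio is certainly exclusive.

static bool example_can_treat_as_exclusive(struct folio *folio)
{
	if (folio_maybe_mapped_shared(folio))
		return false;	/* possibly mapped by another MM: be conservative */
	return true;		/* certainly mapped into a single MM only */
}
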
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
@@ -3179,7 +3266,6 @@ extern void reserve_bootmem_region(phys_addr_t start,
/* Free the reserved page into the buddy system, so it gets managed. */
void free_reserved_page(struct page *page);
-#define free_highmem_page(page) free_reserved_page(page)
static inline void mark_page_reserved(struct page *page)
{
@@ -3539,6 +3625,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
+ bool write);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
@@ -3817,6 +3905,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
#endif
void *sparse_buffer_alloc(unsigned long size);
+unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
@@ -3825,7 +3914,8 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
- struct vmem_altmap *altmap, struct page *reuse);
+ struct vmem_altmap *altmap, unsigned long ptpfn,
+ unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
@@ -3841,6 +3931,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap);
+int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
+int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
+void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
@@ -3907,9 +4003,6 @@ static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
}
#endif
-void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
- unsigned long nr_pages);
-
enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
MF_ACTION_REQUIRED = 1 << 1,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 75e8850cec3a..56d07edd01f9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -19,6 +19,7 @@
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
+#include <linux/types.h>
#include <asm/mmu.h>
@@ -133,8 +134,11 @@ struct page {
unsigned long compound_head; /* Bit zero is set */
};
struct { /* ZONE_DEVICE pages */
- /** @pgmap: Points to the hosting device page map. */
- struct dev_pagemap *pgmap;
+ /*
+ * The first word is used for compound_head or folio
+ * pgmap
+ */
+ void *_unused_pgmap_compound_head;
void *zone_device_data;
/*
* ZONE_DEVICE private pages are counted as being
@@ -287,6 +291,49 @@ typedef struct {
unsigned long val;
} swp_entry_t;
+#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT)
+/* We have some extra room after the refcount in tail pages. */
+#define NR_PAGES_IN_LARGE_FOLIO
+#endif
+
+/*
+ * On 32bit, we can cut the required metadata in half, because:
+ * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have,
+ * so we can limit MM IDs to 15 bit (32767).
+ * (b) We don't expect folios where even a single complete PTE mapping by
+ * one MM would exceed 15 bits (order-15).
+ */
+#ifdef CONFIG_64BIT
+typedef int mm_id_mapcount_t;
+#define MM_ID_MAPCOUNT_MAX INT_MAX
+typedef unsigned int mm_id_t;
+#else /* !CONFIG_64BIT */
+typedef short mm_id_mapcount_t;
+#define MM_ID_MAPCOUNT_MAX SHRT_MAX
+typedef unsigned short mm_id_t;
+#endif /* CONFIG_64BIT */
+
+/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */
+#define MM_ID_DUMMY 0
+#define MM_ID_MIN (MM_ID_DUMMY + 1)
+
+/*
+ * We leave the highest bit of each MM id unused, so we can store a flag
+ * in the highest bit of each folio->_mm_id[].
+ */
+#define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1)
+#define MM_ID_MASK ((1U << MM_ID_BITS) - 1)
+#define MM_ID_MAX MM_ID_MASK
+
+/*
+ * In order to use bit_spin_lock(), which requires an unsigned long, we
+ * operate on folio->_mm_ids when working on flags.
+ */
+#define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS
+#define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM)
+#define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1)
+#define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM)
+
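
For illustration (not part of the patch), the resulting _mm_ids layout on 64-bit, where mm_id_t is unsigned int: bits 0-30 hold _mm_id[0], bit 31 is the lock bit, bits 32-62 hold _mm_id[1], and bit 63 is the "maybe mapped shared" flag; on 32-bit the same picture holds with 15-bit IDs packed into a 32-bit word. A hypothetical compile-time check of that assumption:

static_assert(FOLIO_MM_IDS_LOCK_BITNUM < BITS_PER_LONG);
static_assert(FOLIO_MM_IDS_SHARED_BITNUM < BITS_PER_LONG);
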
/**
* struct folio - Represents a contiguous set of bytes.
* @flags: Identical to the page flags.
@@ -296,6 +343,8 @@ typedef struct {
* anonymous memory.
* @index: Offset within the file, in units of pages. For anonymous memory,
* this is the index from the beginning of the mmap.
+ * @share: number of DAX mappings that reference this folio. See
+ * dax_associate_entry.
* @private: Filesystem per-folio data (see folio_attach_private()).
* @swap: Used for swp_entry_t if folio_test_swapcache().
* @_mapcount: Do not access this member directly. Use folio_mapcount() to
@@ -303,13 +352,17 @@ typedef struct {
* @_refcount: Do not access this member directly. Use folio_ref_count()
* to find how many references there are to this folio.
* @memcg_data: Memory Control Group data.
+ * @pgmap: Metadata for ZONE_DEVICE mappings
* @virtual: Virtual address in the kernel direct map.
* @_last_cpupid: IDs of last CPU and last process that accessed the folio.
* @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
* @_large_mapcount: Do not use directly, call folio_mapcount().
* @_nr_pages_mapped: Do not use outside of rmap and debug code.
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
- * @_folio_nr_pages: Do not use directly, call folio_nr_pages().
+ * @_nr_pages: Do not use directly, call folio_nr_pages().
+ * @_mm_id: Do not use outside of rmap code.
+ * @_mm_ids: Do not use outside of rmap code.
+ * @_mm_id_mapcount: Do not use outside of rmap code.
* @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
* @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h.
* @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
@@ -341,9 +394,13 @@ struct folio {
/* private: */
};
/* public: */
+ struct dev_pagemap *pgmap;
};
struct address_space *mapping;
- pgoff_t index;
+ union {
+ pgoff_t index;
+ unsigned long share;
+ };
union {
void *private;
swp_entry_t swap;
@@ -369,14 +426,30 @@ struct folio {
struct {
unsigned long _flags_1;
unsigned long _head_1;
+ union {
+ struct {
/* public: */
- atomic_t _large_mapcount;
- atomic_t _entire_mapcount;
- atomic_t _nr_pages_mapped;
- atomic_t _pincount;
+ atomic_t _large_mapcount;
+ atomic_t _nr_pages_mapped;
#ifdef CONFIG_64BIT
- unsigned int _folio_nr_pages;
-#endif
+ atomic_t _entire_mapcount;
+ atomic_t _pincount;
+#endif /* CONFIG_64BIT */
+ mm_id_mapcount_t _mm_id_mapcount[2];
+ union {
+ mm_id_t _mm_id[2];
+ unsigned long _mm_ids;
+ };
+ /* private: the union with struct page is transitional */
+ };
+ unsigned long _usable_1[4];
+ };
+ atomic_t _mapcount_1;
+ atomic_t _refcount_1;
+ /* public: */
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ unsigned int _nr_pages;
+#endif /* NR_PAGES_IN_LARGE_FOLIO */
/* private: the union with struct page is transitional */
};
struct page __page_1;
@@ -386,20 +459,27 @@ struct folio {
unsigned long _flags_2;
unsigned long _head_2;
/* public: */
- void *_hugetlb_subpool;
- void *_hugetlb_cgroup;
- void *_hugetlb_cgroup_rsvd;
- void *_hugetlb_hwpoison;
+ struct list_head _deferred_list;
+#ifndef CONFIG_64BIT
+ atomic_t _entire_mapcount;
+ atomic_t _pincount;
+#endif /* !CONFIG_64BIT */
/* private: the union with struct page is transitional */
};
+ struct page __page_2;
+ };
+ union {
struct {
- unsigned long _flags_2a;
- unsigned long _head_2a;
+ unsigned long _flags_3;
+ unsigned long _head_3;
/* public: */
- struct list_head _deferred_list;
+ void *_hugetlb_subpool;
+ void *_hugetlb_cgroup;
+ void *_hugetlb_cgroup_rsvd;
+ void *_hugetlb_hwpoison;
/* private: the union with struct page is transitional */
};
- struct page __page_2;
+ struct page __page_3;
};
};
@@ -428,14 +508,20 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid);
offsetof(struct page, pg) + sizeof(struct page))
FOLIO_MATCH(flags, _flags_1);
FOLIO_MATCH(compound_head, _head_1);
+FOLIO_MATCH(_mapcount, _mapcount_1);
+FOLIO_MATCH(_refcount, _refcount_1);
#undef FOLIO_MATCH
#define FOLIO_MATCH(pg, fl) \
static_assert(offsetof(struct folio, fl) == \
offsetof(struct page, pg) + 2 * sizeof(struct page))
FOLIO_MATCH(flags, _flags_2);
FOLIO_MATCH(compound_head, _head_2);
-FOLIO_MATCH(flags, _flags_2a);
-FOLIO_MATCH(compound_head, _head_2a);
+#undef FOLIO_MATCH
+#define FOLIO_MATCH(pg, fl) \
+ static_assert(offsetof(struct folio, fl) == \
+ offsetof(struct page, pg) + 3 * sizeof(struct page))
+FOLIO_MATCH(flags, _flags_3);
+FOLIO_MATCH(compound_head, _head_3);
#undef FOLIO_MATCH
/**
@@ -578,6 +664,12 @@ static inline void *folio_get_private(struct folio *folio)
typedef unsigned long vm_flags_t;
/*
+ * freeptr_t represents a SLUB freelist pointer, which might be encoded
+ * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
+ */
+typedef struct { unsigned long v; } freeptr_t;
+
+/*
* A region containing a mapping of a non-memory backed file under NOMMU
* conditions. These are held in a global tree and are pinned by the VMAs that
* map parts of them.
@@ -633,9 +725,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
}
#endif
-struct vma_lock {
- struct rw_semaphore lock;
-};
+#define VMA_LOCK_OFFSET 0x40000000
+#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1)
struct vma_numab_state {
/*
@@ -681,6 +772,9 @@ struct vma_numab_state {
*
* Only explicitly marked struct members may be accessed by RCU readers before
* getting a stable reference.
+ *
+ * WARNING: when adding new members, please update vm_area_init_from() to copy
+ * them during vm_area_struct content duplication.
*/
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
@@ -691,9 +785,7 @@ struct vm_area_struct {
unsigned long vm_start;
unsigned long vm_end;
};
-#ifdef CONFIG_PER_VMA_LOCK
- struct rcu_head vm_rcu; /* Used for deferred freeing. */
-#endif
+ freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
};
/*
@@ -714,18 +806,12 @@ struct vm_area_struct {
#ifdef CONFIG_PER_VMA_LOCK
/*
- * Flag to indicate areas detached from the mm->mm_mt tree.
- * Unstable RCU readers are allowed to read this.
- */
- bool detached;
-
- /*
* Can only be written (using WRITE_ONCE()) while holding both:
* - mmap_lock (in write mode)
- * - vm_lock->lock (in write mode)
+ * - vm_refcnt bit at VMA_LOCK_OFFSET is set
* Can be read reliably while holding one of:
* - mmap_lock (in read or write mode)
- * - vm_lock->lock (in read or write mode)
+ * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
* Can be read unreliably (using READ_ONCE()) for pessimistic bailout
* while holding nothing (except RCU to keep the VMA struct allocated).
*
@@ -734,20 +820,7 @@ struct vm_area_struct {
* slowpath.
*/
unsigned int vm_lock_seq;
- /* Unstable RCU readers are allowed to read this. */
- struct vma_lock *vm_lock;
#endif
-
- /*
- * For areas with an address space and backing store,
- * linkage into the address_space->i_mmap interval tree.
- *
- */
- struct {
- struct rb_node rb;
- unsigned long rb_subtree_last;
- } shared;
-
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
@@ -767,14 +840,6 @@ struct vm_area_struct {
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */
-#ifdef CONFIG_ANON_VMA_NAME
- /*
- * For private and shared anonymous mappings, a pointer to a null
- * terminated string containing the name given to the vma, or NULL if
- * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
- */
- struct anon_vma_name *anon_name;
-#endif
#ifdef CONFIG_SWAP
atomic_long_t swap_readahead_info;
#endif
@@ -787,6 +852,30 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA_BALANCING
struct vma_numab_state *numab_state; /* NUMA Balancing state */
#endif
+#ifdef CONFIG_PER_VMA_LOCK
+ /* Unstable RCU readers are allowed to read this. */
+ refcount_t vm_refcnt ____cacheline_aligned_in_smp;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map vmlock_dep_map;
+#endif
+#endif
+ /*
+ * For areas with an address space and backing store,
+ * linkage into the address_space->i_mmap interval tree.
+ *
+ */
+ struct {
+ struct rb_node rb;
+ unsigned long rb_subtree_last;
+ } shared;
+#ifdef CONFIG_ANON_VMA_NAME
+ /*
+ * For private and shared anonymous mappings, a pointer to a null
+ * terminated string containing the name given to the vma, or NULL if
+ * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
+ */
+ struct anon_vma_name *anon_name;
+#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
@@ -922,6 +1011,7 @@ struct mm_struct {
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
+ struct rcuwait vma_writer_wait;
/*
* This field has lock-like semantics, meaning it is sometimes
* accessed with ACQUIRE/RELEASE semantics.
@@ -1074,6 +1164,9 @@ struct mm_struct {
#endif
} lru_gen;
#endif /* CONFIG_LRU_GEN_WALKS_MMU */
+#ifdef CONFIG_MM_ID
+ mm_id_t mm_id;
+#endif /* CONFIG_MM_ID */
} __randomize_layout;
/*
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 45a21faa3ff6..4706c6769902 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -122,12 +122,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
#endif /* CONFIG_PER_VMA_LOCK */
-static inline void mmap_init_lock(struct mm_struct *mm)
-{
- init_rwsem(&mm->mmap_lock);
- mm_lock_seqcount_init(mm);
-}
-
static inline void mmap_write_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, true);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index e2dd57ca368b..bc2402a45741 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -43,10 +43,10 @@ struct mmu_interval_notifier;
* a device driver to possibly ignore the invalidation if the
* owner field matches the driver's device private pgmap owner.
*
- * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
- * longer have exclusive access to the page. When sent during creation of an
- * exclusive range the owner will be initialised to the value provided by the
- * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
+ * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive.
+ * The owner is initialized to the value provided by the caller of
+ * make_device_exclusive(), such that this caller can filter out these
+ * events.
*/
enum mmu_notifier_event {
MMU_NOTIFY_UNMAP = 0,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e16939553930..25e80b2ca7f4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -138,6 +138,7 @@ enum numa_stat_item {
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
+ NR_FREE_PAGES_BLOCKS,
NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
NR_ZONE_ACTIVE_ANON,
@@ -220,9 +221,11 @@ enum node_stat_item {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+ PGDEMOTE_PROACTIVE,
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
+ NR_BALLOON_PAGES,
NR_VM_NODE_STAT_ITEMS
};
@@ -1161,6 +1164,12 @@ static inline bool is_zone_device_page(const struct page *page)
return page_zonenum(page) == ZONE_DEVICE;
}
+static inline struct dev_pagemap *page_pgmap(const struct page *page)
+{
+ VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page);
+ return page_folio(page)->pgmap;
+}
+
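
An illustrative conversion (not part of the patch): code that used to read page->pgmap directly now goes through the helper, which additionally warns on non-ZONE_DEVICE pages; <linux/memremap.h> is assumed for the struct dev_pagemap definition.

static inline void *example_zone_device_owner(const struct page *page)
{
	return page_pgmap(page)->owner;
}
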
/*
* Consecutive zone device pages should not be merged into the same sgl
* or bvec segment with other types of pages or if they belong to different
@@ -1176,7 +1185,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
return false;
if (!is_zone_device_page(a))
return true;
- return a->pgmap == b->pgmap;
+ return page_pgmap(a) == page_pgmap(b);
}
extern void memmap_init_zone_device(struct zone *, unsigned long,
@@ -1191,6 +1200,10 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
{
return true;
}
+static inline struct dev_pagemap *page_pgmap(const struct page *page)
+{
+ return NULL;
+}
#endif
static inline bool folio_is_zone_device(const struct folio *folio)
@@ -1937,6 +1950,9 @@ enum {
#ifdef CONFIG_ZONE_DEVICE
SECTION_TAINT_ZONE_DEVICE_BIT,
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+ SECTION_IS_VMEMMAP_PREINIT_BIT,
+#endif
SECTION_MAP_LAST_BIT,
};
@@ -1947,6 +1963,9 @@ enum {
#ifdef CONFIG_ZONE_DEVICE
#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+#define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT)
+#endif
#define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1))
#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT
@@ -2001,6 +2020,30 @@ static inline int online_device_section(struct mem_section *section)
}
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+static inline int preinited_vmemmap_section(struct mem_section *section)
+{
+ return (section &&
+ (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT));
+}
+
+void sparse_vmemmap_init_nid_early(int nid);
+void sparse_vmemmap_init_nid_late(int nid);
+
+#else
+static inline int preinited_vmemmap_section(struct mem_section *section)
+{
+ return 0;
+}
+static inline void sparse_vmemmap_init_nid_early(int nid)
+{
+}
+
+static inline void sparse_vmemmap_init_nid_late(int nid)
+{
+}
+#endif
+
static inline int online_section_nr(unsigned long nr)
{
return online_section(__nr_to_section(nr));
@@ -2038,6 +2081,9 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
}
#endif
+void sparse_init_early_section(int nid, struct page *map, unsigned long pnum,
+ unsigned long flags);
+
#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/**
* pfn_valid - check if there is a valid memory map entry for a PFN
@@ -2100,6 +2146,11 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
return -1;
}
+#define for_each_present_section_nr(start, section_nr) \
+ for (section_nr = next_present_section_nr(start - 1); \
+ section_nr != -1; \
+ section_nr = next_present_section_nr(section_nr))
+
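
An illustrative use of the new iterator macro (not part of the patch); passing a start of 0 relies on the start - 1 wrap-around that next_present_section_nr() already handles for existing callers, which is an assumption worth noting.

static unsigned long example_count_present_sections(void)
{
	unsigned long section_nr, nr = 0;

	for_each_present_section_nr(0, section_nr)
		nr++;
	return nr;
}
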
/*
* These are _only_ used during initialisation, therefore they
* can use __initdata ... They could have names to indicate
@@ -2119,6 +2170,8 @@ void sparse_init(void);
#else
#define sparse_init() do {} while (0)
#define sparse_index_init(_sec, _nid) do {} while (0)
+#define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0)
+#define sparse_vmemmap_init_nid_late(_nid) do {} while (0)
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index df9234e5f478..5bd9492a66ee 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -673,12 +673,6 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
#define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
-/*
- * Different with flags above, this flag is used only for fsdax mode. It
- * indicates that this page->mapping is now under reflink case.
- */
-#define PAGE_MAPPING_DAX_SHARED ((void *)0x1)
-
static __always_inline bool folio_mapping_flags(const struct folio *folio)
{
return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
@@ -1106,6 +1100,12 @@ static inline bool is_page_hwpoison(const struct page *page)
return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
}
+static inline bool folio_contain_hwpoisoned_page(struct folio *folio)
+{
+ return folio_test_hwpoison(folio) ||
+ (folio_test_large(folio) && folio_test_has_hwpoisoned(folio));
+}
+
bool is_free_buddy_page(const struct page *page);
PAGEFLAG(Isolated, isolated, PF_ANY);
@@ -1193,6 +1193,10 @@ static inline int folio_has_private(const struct folio *folio)
return !!(folio->flags & PAGE_FLAGS_PRIVATE);
}
+static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio)
+{
+ return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
+}
#undef PF_ANY
#undef PF_HEAD
#undef PF_NO_TAIL
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 46406f3fe34d..d649b6bbbc87 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -9,10 +9,12 @@
struct page_counter {
/*
- * Make sure 'usage' does not share cacheline with any other field. The
- * memcg->memory.usage is a hot member of struct mem_cgroup.
+	 * Make sure 'usage' does not share a cacheline with any other field in
+ * v2. The memcg->memory.usage is a hot member of struct mem_cgroup.
*/
atomic_long_t usage;
+ unsigned long failcnt; /* v1-only field */
+
CACHELINE_PADDING(_pad1_);
/* effective memory.min and memory.min usage tracking */
@@ -28,12 +30,12 @@ struct page_counter {
unsigned long watermark;
/* Latest cg2 reset watermark */
unsigned long local_watermark;
- unsigned long failcnt;
	/* Keep all the read-mostly fields in a separate cacheline. */
CACHELINE_PADDING(_pad2_);
bool protection_support;
+ bool track_failcnt;
unsigned long min;
unsigned long low;
unsigned long high;
@@ -58,6 +60,7 @@ static inline void page_counter_init(struct page_counter *counter,
counter->max = PAGE_COUNTER_MAX;
counter->parent = parent;
counter->protection_support = protection_support;
+ counter->track_failcnt = false;
}
static inline unsigned long page_counter_read(struct page_counter *counter)
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index e4b48a0dda24..76c817162d2f 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -3,6 +3,7 @@
#define __LINUX_PAGE_EXT_H
#include <linux/types.h>
+#include <linux/mmzone.h>
#include <linux/stacktrace.h>
struct pglist_data;
@@ -69,16 +70,31 @@ extern void page_ext_init(void);
static inline void page_ext_init_flatmem_late(void)
{
}
+
+static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn)
+{
+ /*
+ * page_ext is allocated per memory section. Once we cross a
+ * memory section, we have to fetch the new pointer.
+ */
+ return next_pfn % PAGES_PER_SECTION;
+}
#else
extern void page_ext_init_flatmem(void);
extern void page_ext_init_flatmem_late(void);
static inline void page_ext_init(void)
{
}
+
+static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn)
+{
+ return true;
+}
#endif
extern struct page_ext *page_ext_get(const struct page *page);
extern void page_ext_put(struct page_ext *page_ext);
+extern struct page_ext *page_ext_lookup(unsigned long pfn);
static inline void *page_ext_data(struct page_ext *page_ext,
struct page_ext_operations *ops)
@@ -93,6 +109,83 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr)
return next;
}
+struct page_ext_iter {
+ unsigned long index;
+ unsigned long start_pfn;
+ struct page_ext *page_ext;
+};
+
+/**
+ * page_ext_iter_begin() - Prepare for iterating through page extensions.
+ * @iter: page extension iterator.
+ * @pfn: PFN of the page we're interested in.
+ *
+ * Must be called with RCU read lock taken.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ */
+static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter,
+ unsigned long pfn)
+{
+ iter->index = 0;
+ iter->start_pfn = pfn;
+ iter->page_ext = page_ext_lookup(pfn);
+
+ return iter->page_ext;
+}
+
+/**
+ * page_ext_iter_next() - Get next page extension
+ * @iter: page extension iterator.
+ *
+ * Must be called with RCU read lock taken.
+ *
+ * Return: NULL if no next page_ext exists.
+ */
+static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter)
+{
+ unsigned long pfn;
+
+ if (WARN_ON_ONCE(!iter->page_ext))
+ return NULL;
+
+ iter->index++;
+ pfn = iter->start_pfn + iter->index;
+
+ if (page_ext_iter_next_fast_possible(pfn))
+ iter->page_ext = page_ext_next(iter->page_ext);
+ else
+ iter->page_ext = page_ext_lookup(pfn);
+
+ return iter->page_ext;
+}
+
+/**
+ * page_ext_iter_get() - Get current page extension
+ * @iter: page extension iterator.
+ *
+ * Return: NULL if no page_ext exists for this iterator.
+ */
+static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter)
+{
+ return iter->page_ext;
+}
+
+/**
+ * for_each_page_ext(): iterate through page_ext objects.
+ * @__page: the page we're interested in
+ * @__pgcount: how many pages to iterate through
+ * @__page_ext: struct page_ext pointer where the current page_ext
+ * object is returned
+ * @__iter: struct page_ext_iter object (defined in the stack)
+ *
+ * IMPORTANT: must be called with RCU read lock taken.
+ */
+#define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \
+ for (__page_ext = page_ext_iter_begin(&__iter, page_to_pfn(__page));\
+ __page_ext && __iter.index < __pgcount; \
+ __page_ext = page_ext_iter_next(&__iter))
+
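
A hedged usage sketch (not from the patch): walk the page_ext of each page backing an order-N allocation, with rcu_read_lock() satisfying the locking requirement documented above.

static void example_walk_page_ext(struct page *page, unsigned int order)
{
	struct page_ext_iter iter;
	struct page_ext *page_ext;

	rcu_read_lock();
	for_each_page_ext(page, 1UL << order, page_ext, iter) {
		/* use page_ext_data(page_ext, &your_ops) on each extension here */
	}
	rcu_read_unlock();
}
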
#else /* !CONFIG_PAGE_EXTENSION */
struct page_ext;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7661be85136c..26baa78f1ca7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -536,26 +536,6 @@ struct address_space *folio_mapping(struct folio *);
struct address_space *swapcache_mapping(struct folio *);
/**
- * folio_file_mapping - Find the mapping this folio belongs to.
- * @folio: The folio.
- *
- * For folios which are in the page cache, return the mapping that this
- * page belongs to. Folios in the swap cache return the mapping of the
- * swap file or swap device where the data is stored. This is different
- * from the mapping returned by folio_mapping(). The only reason to
- * use it is if, like NFS, you return 0 from ->activate_swapfile.
- *
- * Do not call this for folios which aren't in the page cache or swap cache.
- */
-static inline struct address_space *folio_file_mapping(struct folio *folio)
-{
- if (unlikely(folio_test_swapcache(folio)))
- return swapcache_mapping(folio);
-
- return folio->mapping;
-}
-
-/**
* folio_flush_mapping - Find the file mapping this folio belongs to.
* @folio: The folio.
*
@@ -575,11 +555,6 @@ static inline struct address_space *folio_flush_mapping(struct folio *folio)
return folio_mapping(folio);
}
-static inline struct address_space *page_file_mapping(struct page *page)
-{
- return folio_file_mapping(page_folio(page));
-}
-
/**
* folio_inode - Get the host inode for this folio.
* @folio: The folio.
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index ac8c44dd8237..c5e9cac0575e 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -33,7 +33,7 @@ struct disk_stats {
#define part_stat_read(part, field) \
({ \
- typeof((part)->bd_stats->field) res = 0; \
+ TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \
unsigned int _cpu; \
for_each_possible_cpu(_cpu) \
res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 0fcacb909778..0aeb0e276a3e 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -222,7 +222,7 @@ do { \
} while (0)
#define PERCPU_PTR(__p) \
- (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p))
+ (TYPEOF_UNQUAL(*(__p)) __force __kernel *)((__force unsigned long)(__p))
#ifdef CONFIG_SMP
@@ -318,7 +318,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { }
#define __pcpu_size_call_return(stem, variable) \
({ \
- typeof(variable) pscr_ret__; \
+ TYPEOF_UNQUAL(variable) pscr_ret__; \
__verify_pcpu_ptr(&(variable)); \
switch(sizeof(variable)) { \
case 1: pscr_ret__ = stem##1(variable); break; \
@@ -333,7 +333,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { }
#define __pcpu_size_call_return2(stem, variable, ...) \
({ \
- typeof(variable) pscr2_ret__; \
+ TYPEOF_UNQUAL(variable) pscr2_ret__; \
__verify_pcpu_ptr(&(variable)); \
switch(sizeof(variable)) { \
case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 3469c4b20105..c74077977830 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -162,74 +162,32 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code
}
}
-static inline void clear_page_tag_ref(struct page *page)
-{
- if (mem_alloc_profiling_enabled()) {
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- set_codetag_empty(&ref);
- update_page_tag_ref(handle, &ref);
- put_page_tag_ref(handle);
- }
- }
-}
-
-static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
- unsigned int nr)
-{
- if (mem_alloc_profiling_enabled()) {
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
- update_page_tag_ref(handle, &ref);
- put_page_tag_ref(handle);
- }
- }
-}
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page);
-static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+static inline void clear_page_tag_ref(struct page *page)
{
- if (mem_alloc_profiling_enabled()) {
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- alloc_tag_sub(&ref, PAGE_SIZE * nr);
- update_page_tag_ref(handle, &ref);
- put_page_tag_ref(handle);
- }
- }
+ if (mem_alloc_profiling_enabled())
+ __clear_page_tag_ref(page);
}
-static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
+/* Should be called only if mem_alloc_profiling_enabled() */
+static inline struct alloc_tag *__pgalloc_tag_get(struct page *page)
{
struct alloc_tag *tag = NULL;
-
- if (mem_alloc_profiling_enabled()) {
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- alloc_tag_sub_check(&ref);
- if (ref.ct)
- tag = ct_to_alloc_tag(ref.ct);
- put_page_tag_ref(handle);
- }
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub_check(&ref);
+ if (ref.ct)
+ tag = ct_to_alloc_tag(ref.ct);
+ put_page_tag_ref(handle);
}
return tag;
}
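
A sketch (not part of the patch) of the calling convention the split enforces: __pgalloc_tag_get() assumes the static key was already checked, so any wrapper keeps the disabled fast path free of the page_ext lookup.

static inline struct alloc_tag *example_pgalloc_tag_get(struct page *page)
{
	if (!mem_alloc_profiling_enabled())
		return NULL;		/* fast path: profiling disabled */
	return __pgalloc_tag_get(page);
}
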
-static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
-{
- if (mem_alloc_profiling_enabled() && tag)
- this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
-}
-
void pgalloc_tag_split(struct folio *folio, int old_order, int new_order);
void pgalloc_tag_swap(struct folio *new, struct folio *old);
@@ -238,11 +196,6 @@ void __init alloc_tag_sec_init(void);
#else /* CONFIG_MEM_ALLOC_PROFILING */
static inline void clear_page_tag_ref(struct page *page) {}
-static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
- unsigned int nr) {}
-static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
-static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
static inline void alloc_tag_sec_init(void) {}
static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {}
static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {}
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 4c107e17c547..e2b705c14945 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -222,10 +222,14 @@ static inline int pmd_dirty(pmd_t pmd)
* hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
* a raw PTE pointer after it has been modified are not guaranteed to be
- * up to date. This mode can only be entered and left under the protection of
- * the page table locks for all page tables which may be modified. In the UP
- * case, this is required so that preemption is disabled, and in the SMP case,
- * it must synchronize the delayed page table writes properly on other CPUs.
+ * up to date.
+ *
+ * In the general case, no lock is guaranteed to be held between entry and exit
+ * of the lazy mode. So the implementation must assume preemption may be enabled
+ * and cpu migration is possible; it must take steps to be robust against this.
+ * (In practice, for user PTE updates, the appropriate page table lock(s) are
+ * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
+ * and the mode cannot be used in interrupt context.
*/
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
#define arch_enter_lazy_mmu_mode() do {} while (0)
@@ -287,7 +291,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
{
page_table_check_ptes_set(mm, ptep, pte, nr);
- arch_enter_lazy_mmu_mode();
for (;;) {
set_pte(ptep, pte);
if (--nr == 0)
@@ -295,7 +298,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
ptep++;
pte = pte_next_pfn(pte);
}
- arch_leave_lazy_mmu_mode();
}
#endif
#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)
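
Since set_ptes() no longer enters lazy MMU mode itself, a caller that wants the old batching behaviour has to bracket the call explicitly, within the constraints spelled out in the updated comment (no nesting, not from interrupt context). A minimal sketch with a hypothetical helper name:

static inline void example_set_ptes_batched(struct mm_struct *mm,
		unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr)
{
	arch_enter_lazy_mmu_mode();
	set_ptes(mm, addr, ptep, pte, nr);
	arch_leave_lazy_mmu_mode();
}
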
diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 27343424225c..9ad134a04b41 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -4,18 +4,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/signal.h>
-
-/*
- * rcuwait provides a way of blocking and waking up a single
- * task in an rcu-safe manner.
- *
- * The only time @task is non-nil is when a user is blocked (or
- * checking if it needs to) on a condition, and reset as soon as we
- * know that the condition has succeeded and are awoken.
- */
-struct rcuwait {
- struct task_struct __rcu *task;
-};
+#include <linux/types.h>
#define __RCUWAIT_INITIALIZER(name) \
{ .task = NULL, }
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 35f039ecb272..80dc023ac2bf 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -87,6 +87,15 @@
* The decrements dec_and_test() and sub_and_test() also provide acquire
* ordering on success.
*
+ * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide
+ * acquire and release ordering for cases when the memory occupied by the
+ * object might be reused to store another object. This is important for the
+ * cases where secondary validation is required to detect such reuse, e.g.
+ * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after
+ * the refcount is taken, hence acquire order is necessary. Similarly, when the
+ * object is initialized, all stores to its attributes should be visible before
+ * the refcount is set, otherwise a stale attribute value might be used by
+ * another task which succeeds in taking a refcount to the new object.
*/
#ifndef _LINUX_REFCOUNT_H
@@ -126,6 +135,31 @@ static inline void refcount_set(refcount_t *r, int n)
}
/**
+ * refcount_set_release - set a refcount's value with release ordering
+ * @r: the refcount
+ * @n: value to which the refcount will be set
+ *
+ * This function should be used when memory occupied by the object might be
+ * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU.
+ *
+ * Provides release memory ordering which will order previous memory operations
+ * against this store. This ensures all updates to this object are visible
+ * once the refcount is set and stale values from the object previously
+ * occupying this memory are overwritten with new ones.
+ *
+ * This function should be called only after the new object is fully initialized.
+ * After this call the object should be considered visible to other tasks even
+ * if it was not yet added into an object collection normally used to discover
+ * it. This is because other tasks might have discovered the object previously
+ * occupying the same memory and after memory reuse they can succeed in taking
+ * refcount to the new object and start using it.
+ */
+static inline void refcount_set_release(refcount_t *r, int n)
+{
+ atomic_set_release(&r->refs, n);
+}
+
+/**
* refcount_read - get a refcount's value
* @r: the refcount
*
@@ -178,6 +212,71 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
return __refcount_add_not_zero(i, r, NULL);
}
+static inline __must_check __signed_wrap
+bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp,
+ int limit)
+{
+ int old = refcount_read(r);
+
+ do {
+ if (!old)
+ break;
+
+ if (i > limit - old) {
+ if (oldp)
+ *oldp = old;
+ return false;
+ }
+ } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i));
+
+ if (oldp)
+ *oldp = old;
+
+ if (unlikely(old < 0 || old + i < 0))
+ refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);
+
+ return old;
+}
+
+static inline __must_check bool
+__refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit)
+{
+ return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit);
+}
+
+static inline __must_check __signed_wrap
+bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp)
+{
+ return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX);
+}
+
+/**
+ * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0
+ *
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Will saturate at REFCOUNT_SATURATED and WARN.
+ *
+ * This function should be used when memory occupied by the object might be
+ * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU.
+ *
+ * Provides acquire memory ordering on success, it is assumed the caller has
+ * guaranteed the object memory to be stable (RCU, etc.). It does provide a
+ * control dependency and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time. In these
+ * cases, refcount_inc_not_zero_acquire() should instead be used to increment a
+ * reference count.
+ *
+ * Return: false if the passed refcount is 0, true otherwise
+ */
+static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r)
+{
+ return __refcount_add_not_zero_acquire(i, r, NULL);
+}
+
static inline __signed_wrap
void __refcount_add(int i, refcount_t *r, int *oldp)
{
@@ -236,6 +335,32 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
return __refcount_inc_not_zero(r, NULL);
}
+static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp)
+{
+ return __refcount_add_not_zero_acquire(1, r, oldp);
+}
+
+/**
+ * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0
+ * @r: the refcount to increment
+ *
+ * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on
+ * success.
+ *
+ * This function should be used when memory occupied by the object might be
+ * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU.
+ *
+ * Provides acquire memory ordering on success, it is assumed the caller has
+ * guaranteed the object memory to be stable (RCU, etc.). It does provide a
+ * control dependency and thereby orders future stores. See the comment on top.
+ *
+ * Return: true if the increment was successful, false otherwise
+ */
+static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r)
+{
+ return __refcount_inc_not_zero_acquire(r, NULL);
+}
+
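
A minimal sketch of the SLAB_TYPESAFE_BY_RCU pattern these acquire/release helpers target; 'struct obj', obj_put() and the key-based revalidation are hypothetical, only the ordering is the point.

struct obj {
	unsigned long key;
	refcount_t ref;
};

/* Hypothetical release path for 'struct obj'; details are out of scope here. */
static void obj_put(struct obj *obj);

static void obj_publish(struct obj *obj, unsigned long key)
{
	obj->key = key;				/* initialize everything first ... */
	refcount_set_release(&obj->ref, 1);	/* ... then make the object live */
}

static struct obj *obj_tryget(struct obj *candidate, unsigned long key)
{
	/* Caller holds rcu_read_lock(); the memory may be reused but not freed. */
	if (!refcount_inc_not_zero_acquire(&candidate->ref))
		return NULL;
	/* Secondary validation after the acquire: was the slot reused? */
	if (READ_ONCE(candidate->key) != key) {
		obj_put(candidate);
		return NULL;
	}
	return candidate;
}
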
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
__refcount_add(1, r, oldp);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 683a04088f3f..6b82b618846e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -13,6 +13,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
+#include <linux/bit_spinlock.h>
/*
* The anon_vma heads a list of private "related" vmas, to scan if
@@ -173,6 +174,214 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
struct anon_vma *folio_get_anon_vma(const struct folio *folio);
+#ifdef CONFIG_MM_ID
+static __always_inline void folio_lock_large_mapcount(struct folio *folio)
+{
+ bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
+}
+
+static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
+{
+ __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
+}
+
+static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
+{
+ VM_WARN_ON_ONCE(idx != 0 && idx != 1);
+ return folio->_mm_id[idx] & MM_ID_MASK;
+}
+
+static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
+{
+ VM_WARN_ON_ONCE(idx != 0 && idx != 1);
+ folio->_mm_id[idx] &= ~MM_ID_MASK;
+ folio->_mm_id[idx] |= id;
+}
+
+static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
+ int diff, mm_id_t mm_id)
+{
+ VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
+ VM_WARN_ON_ONCE(diff <= 0);
+ VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);
+
+ /*
+ * Make sure we can detect at least one complete PTE mapping of the
+ * folio in a single MM as "exclusively mapped". This is primarily
+ * a check on 32bit, where we currently reduce the size of the per-MM
+ * mapcount to a short.
+ */
+ VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
+ VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);
+
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[0] != -1);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[0] < 0);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[1] != -1);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[1] < 0);
+ VM_WARN_ON_ONCE(!folio_mapped(folio) &&
+ folio_test_large_maybe_mapped_shared(folio));
+}
+
+static __always_inline void folio_set_large_mapcount(struct folio *folio,
+ int mapcount, struct vm_area_struct *vma)
+{
+ __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);
+
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);
+
+ /* Note: mapcounts start at -1. */
+ atomic_set(&folio->_large_mapcount, mapcount - 1);
+ folio->_mm_id_mapcount[0] = mapcount - 1;
+ folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
+}
+
+static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ const mm_id_t mm_id = vma->vm_mm->mm_id;
+ int new_mapcount_val;
+
+ folio_lock_large_mapcount(folio);
+ __folio_large_mapcount_sanity_checks(folio, diff, mm_id);
+
+ new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
+ atomic_set(&folio->_large_mapcount, new_mapcount_val);
+
+ /*
+ * If a folio is mapped more than once into an MM on 32bit, we
+ * can in theory overflow the per-MM mapcount (although only for
+ * fairly large folios), turning it negative. In that case, just
+ * free up the slot and mark the folio "mapped shared", otherwise
+ * we might be in trouble when unmapping pages later.
+ */
+ if (folio_mm_id(folio, 0) == mm_id) {
+ folio->_mm_id_mapcount[0] += diff;
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
+ folio->_mm_id_mapcount[0] = -1;
+ folio_set_mm_id(folio, 0, MM_ID_DUMMY);
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ } else if (folio_mm_id(folio, 1) == mm_id) {
+ folio->_mm_id_mapcount[1] += diff;
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
+ folio->_mm_id_mapcount[1] = -1;
+ folio_set_mm_id(folio, 1, MM_ID_DUMMY);
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
+ folio_set_mm_id(folio, 0, mm_id);
+ folio->_mm_id_mapcount[0] = diff - 1;
+ /* We might have other mappings already. */
+ if (new_mapcount_val != diff - 1)
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
+ folio_set_mm_id(folio, 1, mm_id);
+ folio->_mm_id_mapcount[1] = diff - 1;
+ /* Slot 0 certainly has mappings as well. */
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ folio_unlock_large_mapcount(folio);
+ return new_mapcount_val + 1;
+}
+#define folio_add_large_mapcount folio_add_return_large_mapcount
+
+static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ const mm_id_t mm_id = vma->vm_mm->mm_id;
+ int new_mapcount_val;
+
+ folio_lock_large_mapcount(folio);
+ __folio_large_mapcount_sanity_checks(folio, diff, mm_id);
+
+ new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
+ atomic_set(&folio->_large_mapcount, new_mapcount_val);
+
+ /*
+ * There are valid corner cases where we might underflow a per-MM
+ * mapcount (some mappings added when no slot was free, some mappings
+ * added once a slot was free), so we always set it to -1 once we go
+ * negative.
+ */
+ if (folio_mm_id(folio, 0) == mm_id) {
+ folio->_mm_id_mapcount[0] -= diff;
+ if (folio->_mm_id_mapcount[0] >= 0)
+ goto out;
+ folio->_mm_id_mapcount[0] = -1;
+ folio_set_mm_id(folio, 0, MM_ID_DUMMY);
+ } else if (folio_mm_id(folio, 1) == mm_id) {
+ folio->_mm_id_mapcount[1] -= diff;
+ if (folio->_mm_id_mapcount[1] >= 0)
+ goto out;
+ folio->_mm_id_mapcount[1] = -1;
+ folio_set_mm_id(folio, 1, MM_ID_DUMMY);
+ }
+
+ /*
+ * If one MM slot owns all mappings, the folio is mapped exclusively.
+ * Note that if the folio is now unmapped (new_mapcount_val == -1), both
+ * slots must be free (mapcount == -1), and we'll also mark it as
+ * exclusive.
+ */
+ if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
+ folio->_mm_id_mapcount[1] == new_mapcount_val)
+ folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
+out:
+ folio_unlock_large_mapcount(folio);
+ return new_mapcount_val + 1;
+}
+#define folio_sub_large_mapcount folio_sub_return_large_mapcount
+#else /* !CONFIG_MM_ID */
+/*
+ * See __folio_rmap_sanity_checks(), we might map large folios even without
+ * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
+ */
+static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
+ struct vm_area_struct *vma)
+{
+ /* Note: mapcounts start at -1. */
+ atomic_set(&folio->_large_mapcount, mapcount - 1);
+}
+
+static inline void folio_add_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ atomic_add(diff, &folio->_large_mapcount);
+}
+
+static inline int folio_add_return_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ BUILD_BUG();
+}
+
+static inline void folio_sub_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ atomic_sub(diff, &folio->_large_mapcount);
+}
+
+static inline int folio_sub_return_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_MM_ID */
+
+#define folio_inc_large_mapcount(folio, vma) \
+ folio_add_large_mapcount(folio, 1, vma)
+#define folio_inc_return_large_mapcount(folio, vma) \
+ folio_add_return_large_mapcount(folio, 1, vma)
+#define folio_dec_large_mapcount(folio, vma) \
+ folio_sub_large_mapcount(folio, 1, vma)
+#define folio_dec_return_large_mapcount(folio, vma) \
+ folio_sub_return_large_mapcount(folio, 1, vma)
+
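
For orientation, a hedged sketch of which helper the rmap side is expected to pair with which event; the real callers live in mm/rmap.c and this only shows the shape.

static void example_account_large_folio_mapping(struct folio *folio,
		int nr_pages, struct vm_area_struct *vma, bool mapping)
{
	if (mapping)
		folio_add_large_mapcount(folio, nr_pages, vma);	/* fork/fault path */
	else
		folio_sub_large_mapcount(folio, nr_pages, vma);	/* zap/unmap path */
	/* The per-MM slots keep folio_maybe_mapped_shared() cheap to answer. */
}
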
/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;
@@ -192,6 +401,7 @@ typedef int __bitwise rmap_t;
enum rmap_level {
RMAP_LEVEL_PTE = 0,
RMAP_LEVEL_PMD,
+ RMAP_LEVEL_PUD,
};
static inline void __folio_rmap_sanity_checks(const struct folio *folio,
@@ -228,6 +438,14 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
break;
+ case RMAP_LEVEL_PUD:
+ /*
+ * Assume that we are creating a single "entire" mapping of the
+ * folio.
+ */
+ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
+ VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
+ break;
default:
VM_WARN_ON_ONCE(true);
}
@@ -251,12 +469,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
struct vm_area_struct *);
+void folio_add_file_rmap_pud(struct folio *, struct page *,
+ struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
struct vm_area_struct *);
+void folio_remove_rmap_pud(struct folio *, struct page *,
+ struct vm_area_struct *);
void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
@@ -322,7 +544,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
}
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level)
+ struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
+ enum rmap_level level)
{
const int orig_nr_pages = nr_pages;
@@ -335,14 +558,17 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
break;
}
- do {
- atomic_inc(&page->_mapcount);
- } while (page++, --nr_pages > 0);
- atomic_add(orig_nr_pages, &folio->_large_mapcount);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
+ do {
+ atomic_inc(&page->_mapcount);
+ } while (page++, --nr_pages > 0);
+ }
+ folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
atomic_inc(&folio->_entire_mapcount);
- atomic_inc(&folio->_large_mapcount);
+ folio_inc_large_mapcount(folio, dst_vma);
break;
}
}
@@ -352,45 +578,47 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
* @folio: The folio to duplicate the mappings of
* @page: The first page to duplicate the mappings of
* @nr_pages: The number of pages of which the mapping will be duplicated
+ * @dst_vma: The destination vm area
*
* The page range of the folio is defined by [page, page + nr_pages)
*
* The caller needs to hold the page table lock.
*/
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
- struct page *page, int nr_pages)
+ struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
- __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE);
}
static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
- struct page *page)
+ struct page *page, struct vm_area_struct *dst_vma)
{
- __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE);
}
/**
* folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
* @folio: The folio to duplicate the mapping of
* @page: The first page to duplicate the mapping of
+ * @dst_vma: The destination vm area
*
* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
*
* The caller needs to hold the page table lock.
*/
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
- struct page *page)
+ struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE);
+ __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE);
#else
WARN_ON_ONCE(true);
#endif
}
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
- struct page *page, int nr_pages, struct vm_area_struct *src_vma,
- enum rmap_level level)
+ struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, enum rmap_level level)
{
const int orig_nr_pages = nr_pages;
bool maybe_pinned;
@@ -432,18 +660,20 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
do {
if (PageAnonExclusive(page))
ClearPageAnonExclusive(page);
- atomic_inc(&page->_mapcount);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_inc(&page->_mapcount);
} while (page++, --nr_pages > 0);
- atomic_add(orig_nr_pages, &folio->_large_mapcount);
+ folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
if (PageAnonExclusive(page)) {
if (unlikely(maybe_pinned))
return -EBUSY;
ClearPageAnonExclusive(page);
}
atomic_inc(&folio->_entire_mapcount);
- atomic_inc(&folio->_large_mapcount);
+ folio_inc_large_mapcount(folio, dst_vma);
break;
}
return 0;
@@ -455,6 +685,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
* @folio: The folio to duplicate the mappings of
* @page: The first page to duplicate the mappings of
* @nr_pages: The number of pages of which the mapping will be duplicated
+ * @dst_vma: The destination vm area
* @src_vma: The vm area from which the mappings are duplicated
*
* The page range of the folio is defined by [page, page + nr_pages)
@@ -473,16 +704,18 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
* Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
*/
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
- struct page *page, int nr_pages, struct vm_area_struct *src_vma)
+ struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
{
- return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
- RMAP_LEVEL_PTE);
+ return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
+ src_vma, RMAP_LEVEL_PTE);
}
static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
- struct page *page, struct vm_area_struct *src_vma)
+ struct page *page, struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
{
- return __folio_try_dup_anon_rmap(folio, page, 1, src_vma,
+ return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
RMAP_LEVEL_PTE);
}
@@ -491,6 +724,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
* of a folio
* @folio: The folio to duplicate the mapping of
* @page: The first page to duplicate the mapping of
+ * @dst_vma: The destination vm area
* @src_vma: The vm area from which the mapping is duplicated
*
* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
@@ -509,11 +743,12 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
* Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
*/
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
- struct page *page, struct vm_area_struct *src_vma)
+ struct page *page, struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
- RMAP_LEVEL_PMD);
+ return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
+ src_vma, RMAP_LEVEL_PMD);
#else
WARN_ON_ONCE(true);
return -EBUSY;
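A minimal sketch of the fork-time pattern described by the kernel-doc above, now taking both the destination and source VMAs; the copy fallback is hypothetical:

static int dup_one_anon_pte(struct folio *folio, struct page *page,
			    struct vm_area_struct *dst_vma,
			    struct vm_area_struct *src_vma,
			    unsigned long addr)
{
	/* Share the existing page when possible. */
	if (!folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))
		return 0;

	/*
	 * -EBUSY: the page may be pinned and cannot be shared; the caller
	 * must allocate a fresh page and copy the contents instead.
	 */
	return copy_into_new_page(dst_vma, addr, page);	/* hypothetical */
}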
@@ -663,9 +898,8 @@ int folio_referenced(struct folio *, int is_locked,
void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *arg);
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop);
/* Avoid racy checks */
#define PVMW_SYNC (1 << 0)
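The range-based helper is replaced by a single-address call; a hedged sketch of the expected usage, assuming make_device_exclusive() returns the mapped page or an ERR_PTR() and hands back a locked, referenced folio that the caller releases (treat both assumptions as unverified here):

static int claim_exclusive_access(struct mm_struct *mm, unsigned long addr,
				  void *owner)
{
	struct folio *folio;
	struct page *page;

	page = make_device_exclusive(mm, addr, owner, &folio);
	if (IS_ERR(page))
		return PTR_ERR(page);

	/* ... hand the page to the device for exclusive use ... */

	folio_unlock(folio);
	folio_put(folio);
	return 0;
}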
@@ -739,6 +973,9 @@ unsigned long page_address_in_vma(const struct folio *folio,
*/
int folio_mkclean(struct folio *);
+int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
+ unsigned long pfn, unsigned long nr_pages);
+
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma);
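A hedged sketch of a caller of the new mapping_wrprotect_range(), assuming a driver that exposes device pages through an address_space and wants write-notifications for dirty tracking, and assuming the return value counts the mappings that were write-protected (as folio_mkclean() and pfn_mkclean_range() do):

static void arm_write_notify(struct address_space *mapping, pgoff_t pgoff,
			     unsigned long pfn, unsigned long nr_pages)
{
	int cleaned = mapping_wrprotect_range(mapping, pgoff, pfn, nr_pages);

	if (cleaned)
		pr_debug("write-protected %d mappings\n", cleaned);
}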
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 98e07e9e9e58..d5a8ab98035c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -137,6 +137,15 @@ enum _slab_flag_bits {
* rcu_read_lock before reading the address, then rcu_read_unlock after
* taking the spinlock within the structure expected at that address.
*
+ * Note that the object identity check has to be done *after* acquiring a
+ * reference; therefore, the user has to ensure proper ordering for loads.
+ * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU,
+ * the newly allocated object has to be fully initialized *before* its
+ * refcount gets initialized and proper ordering for stores is required.
+ * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are
+ * designed with the proper fences required for reference counting objects
+ * allocated with SLAB_TYPESAFE_BY_RCU.
+ *
* Note that it is not possible to acquire a lock within a structure
* allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference
* as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages
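The load/store ordering rules in the new comment translate into a small lookup/creation pattern. In the sketch below only the RCU, refcount and slab calls are real; the object type, cache and table lookup are invented for illustration:

struct obj {
	refcount_t ref;
	unsigned long key;
};

static struct kmem_cache *obj_cache;	/* created with SLAB_TYPESAFE_BY_RCU (hypothetical) */

static struct obj *obj_lookup(unsigned long key)
{
	struct obj *o;

	rcu_read_lock();
	o = obj_table_find(key);		/* hypothetical lockless lookup */
	/* Acquire pairs with refcount_set_release() in obj_create(). */
	if (o && !refcount_inc_not_zero_acquire(&o->ref))
		o = NULL;
	/* Identity check only after the reference is held. */
	if (o && o->key != key) {
		obj_put(o);			/* hypothetical */
		o = NULL;
	}
	rcu_read_unlock();
	return o;
}

static struct obj *obj_create(unsigned long key)
{
	struct obj *o = kmem_cache_alloc(obj_cache, GFP_KERNEL);

	if (!o)
		return NULL;
	o->key = key;
	/* Publish only after the object is fully initialized. */
	refcount_set_release(&o->ref, 1);
	return o;
}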
@@ -236,12 +245,6 @@ enum _slab_flag_bits {
#endif
/*
- * freeptr_t represents a SLUB freelist pointer, which might be encoded
- * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
- */
-typedef struct { unsigned long v; } freeptr_t;
-
-/*
* ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
*
* Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a98c757400fe..db46b25a65ae 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -24,7 +24,6 @@ struct pagevec;
#define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK 0x7fff
-#define SWAP_FLAG_PRIO_SHIFT 0
#define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */
#define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */
@@ -74,14 +73,13 @@ static inline int current_is_kswapd(void)
* to a special SWP_DEVICE_{READ|WRITE} entry.
*
* When a page is mapped by the device for exclusive access we set the CPU page
- * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
+ * table entries to a special SWP_DEVICE_EXCLUSIVE entry.
*/
#ifdef CONFIG_DEVICE_PRIVATE
-#define SWP_DEVICE_NUM 4
+#define SWP_DEVICE_NUM 3
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
-#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
-#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
+#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
#else
#define SWP_DEVICE_NUM 0
#endif
@@ -286,12 +284,10 @@ enum swap_cluster_flags {
#endif
/*
- * We assign a cluster to each CPU, so each CPU can allocate swap entry from
- * its own cluster and swapout sequentially. The purpose is to optimize swapout
- * throughput.
+ * We keep using the same cluster for a rotational device so I/O will be
+ * sequential. The purpose is to optimize swap throughput on these devices.
*/
-struct percpu_cluster {
- local_lock_t lock; /* Protect the percpu_cluster above */
+struct swap_sequential_cluster {
unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};
@@ -317,8 +313,7 @@ struct swap_info_struct {
atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int pages; /* total of usable pages of swap */
atomic_long_t inuse_pages; /* number of those currently in use */
- struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
- struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
+ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
spinlock_t global_cluster_lock; /* Serialize usage of global cluster */
struct rb_root swap_extent_root;/* root of the swap extent rbtree */
struct block_device *bdev; /* swap device or bdev of swap file */
@@ -461,7 +456,6 @@ void free_pages_and_swap_cache(struct encoded_page **, int);
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;
-extern bool has_usable_swap(void);
/* Swap 50% full? Release swapcache more aggressively.. */
static inline bool vm_swap_full(void)
@@ -475,24 +469,22 @@ static inline long get_nr_swap_pages(void)
}
extern void si_swapinfo(struct sysinfo *);
-swp_entry_t folio_alloc_swap(struct folio *folio);
+int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t, int);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t entry, int nr);
extern void swap_free_nr(swp_entry_t entry, int nr_pages);
-extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
-extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry);
+extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
@@ -575,9 +567,9 @@ static inline int __swap_count(swp_entry_t entry)
return 0;
}
-static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
- return 0;
+ return false;
}
static inline int swp_swapcount(swp_entry_t entry)
@@ -585,11 +577,9 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-static inline swp_entry_t folio_alloc_swap(struct folio *folio)
+static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask)
{
- swp_entry_t entry;
- entry.val = 0;
- return entry;
+ return -EINVAL;
}
static inline bool folio_free_swap(struct folio *folio)
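folio_alloc_swap() now returns an error code rather than a swp_entry_t; a minimal sketch of a caller, assuming 0 means a swap entry was assigned to the folio and a negative errno (e.g. the -EINVAL of the CONFIG_SWAP=n stub above) means the folio stays in memory — the fallback helper is hypothetical:

static int try_to_swap_out(struct folio *folio)
{
	int err = folio_alloc_swap(folio, GFP_KERNEL);

	if (err)
		return keep_in_memory(folio);	/* hypothetical fallback */

	/* The folio now has a swap entry; proceed with writeback. */
	return 0;
}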
@@ -650,7 +640,6 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
#endif
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
-void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
swp_entry_t entry)
@@ -671,10 +660,6 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct folio *folio);
#else
-static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
-{
-}
-
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
swp_entry_t entry)
{
diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
deleted file mode 100644
index 840aec3523b2..000000000000
--- a/include/linux/swap_slots.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SWAP_SLOTS_H
-#define _LINUX_SWAP_SLOTS_H
-
-#include <linux/swap.h>
-#include <linux/spinlock.h>
-#include <linux/mutex.h>
-
-#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH
-#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE)
-#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE)
-
-struct swap_slots_cache {
- bool lock_initialized;
- struct mutex alloc_lock; /* protects slots, nr, cur */
- swp_entry_t *slots;
- int nr;
- int cur;
- int n_ret;
-};
-
-void disable_swap_slots_cache_lock(void);
-void reenable_swap_slots_cache_unlock(void);
-void enable_swap_slots_cache(void);
-
-extern bool swap_slot_cache_enabled;
-
-#endif /* _LINUX_SWAP_SLOTS_H */
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 96f26e29fefe..64ea151a7ae3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -186,26 +186,16 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry)
return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
}
-static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
{
- return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset);
-}
-
-static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
-{
- return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset);
+ return swp_entry(SWP_DEVICE_EXCLUSIVE, offset);
}
static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
- return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
- swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
+ return swp_type(entry) == SWP_DEVICE_EXCLUSIVE;
}
-static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
-{
- return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE);
-}
#else /* CONFIG_DEVICE_PRIVATE */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
@@ -227,12 +217,7 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry)
return false;
}
-static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
-{
- return swp_entry(0, 0);
-}
-
-static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
{
return swp_entry(0, 0);
}
@@ -242,10 +227,6 @@ static inline bool is_device_exclusive_entry(swp_entry_t entry)
return false;
}
-static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
-{
- return false;
-}
#endif /* CONFIG_DEVICE_PRIVATE */
#ifdef CONFIG_MIGRATION
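A minimal sketch of building and recognizing the single remaining device-exclusive entry type, assuming the swap offset encodes the pfn as it does for device-private entries:

static pte_t make_exclusive_pte(struct page *page)
{
	swp_entry_t entry = make_device_exclusive_entry(page_to_pfn(page));

	return swp_entry_to_pte(entry);
}

static bool pte_is_device_exclusive(pte_t pte)
{
	return is_swap_pte(pte) &&
	       is_device_exclusive_entry(pte_to_swp_entry(pte));
}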
diff --git a/include/linux/types.h b/include/linux/types.h
index 1c509ce8f7f6..a3d2182c2686 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -248,5 +248,17 @@ typedef void (*swap_func_t)(void *a, void *b, int size);
typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv);
typedef int (*cmp_func_t)(const void *a, const void *b);
+/*
+ * rcuwait provides a way of blocking and waking up a single
+ * task in an rcu-safe manner.
+ *
+ * The only time @task is non-nil is when a user is blocked (or is
+ * checking whether it needs to block) on a condition; it is reset as soon
+ * as we know the condition has been met and the waiting task is awoken.
+ */
+struct rcuwait {
+ struct task_struct __rcu *task;
+};
+
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_TYPES_H */
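The comment above describes a single-waiter primitive; its helpers live in <linux/rcuwait.h>. A minimal sketch of the wait/wake pairing, assuming a simple one-shot flag as the condition:

struct my_ctx {
	struct rcuwait wait;
	bool done;
};

static void my_ctx_init(struct my_ctx *ctx)
{
	rcuwait_init(&ctx->wait);
	ctx->done = false;
}

static void my_ctx_wait(struct my_ctx *ctx)
{
	/* Only a single task may block on the rcuwait at a time. */
	rcuwait_wait_event(&ctx->wait, READ_ONCE(ctx->done),
			   TASK_UNINTERRUPTIBLE);
}

static void my_ctx_complete(struct my_ctx *ctx)
{
	WRITE_ONCE(ctx->done, true);
	rcuwait_wake_up(&ctx->wait);
}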
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 5a37cb2b6f93..9e15a088ba38 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -41,9 +41,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
+ PGSTEAL_PROACTIVE,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
+ PGSCAN_PROACTIVE,
PGSCAN_DIRECT_THROTTLE,
PGSCAN_ANON,
PGSCAN_FILE,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 4751e3ecc467..b2ccb6845595 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -504,7 +504,7 @@ static inline const char *node_stat_name(enum node_stat_item item)
static inline const char *lru_list_name(enum lru_list lru)
{
- return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_"
+ return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
}
#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index caf4f0b12235..eda4b62511f7 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -313,6 +313,30 @@ static inline void cgroup_writeback_umount(struct super_block *sb)
/*
* mm/page-writeback.c
*/
+/* consolidated parameters for balance_dirty_pages() and its subroutines */
+struct dirty_throttle_control {
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct wb_domain *dom;
+ struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
+#endif
+ struct bdi_writeback *wb;
+ struct fprop_local_percpu *wb_completions;
+
+ unsigned long avail; /* dirtyable */
+ unsigned long dirty; /* file_dirty + write + nfs */
+ unsigned long thresh; /* dirty threshold */
+ unsigned long bg_thresh; /* dirty background threshold */
+ unsigned long limit; /* hard dirty limit */
+
+ unsigned long wb_dirty; /* per-wb counterparts */
+ unsigned long wb_thresh;
+ unsigned long wb_bg_thresh;
+
+ unsigned long pos_ratio;
+ bool freerun;
+ bool dirty_exceeded;
+};
+
void laptop_io_completion(struct backing_dev_info *info);
void laptop_sync_completion(void);
void laptop_mode_timer_fn(struct timer_list *t);
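The consolidated fields make the derived control points easy to compute; a minimal sketch of the freerun/setpoint arithmetic (the same expressions the balance_dirty_pages tracepoint uses further below), purely as illustration:

static unsigned long dtc_freerun(const struct dirty_throttle_control *dtc)
{
	/* midpoint between the background and foreground thresholds */
	return (dtc->thresh + dtc->bg_thresh) / 2;
}

static unsigned long dtc_setpoint(const struct dirty_throttle_control *dtc)
{
	/* aim halfway between freerun and the hard dirty limit */
	return (dtc->limit + dtc_freerun(dtc)) / 2;
}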
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 0b618ec04115..78eede109b1a 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1555,6 +1555,8 @@ int xa_get_order(struct xarray *, unsigned long index);
int xas_get_order(struct xa_state *xas);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
+void xas_try_split(struct xa_state *xas, void *entry, unsigned int order);
+unsigned int xas_try_split_min_order(unsigned int order);
#else
static inline int xa_get_order(struct xarray *xa, unsigned long index)
{
@@ -1576,6 +1578,17 @@ static inline void xas_split_alloc(struct xa_state *xas, void *entry,
unsigned int order, gfp_t gfp)
{
}
+
+static inline void xas_try_split(struct xa_state *xas, void *entry,
+ unsigned int order)
+{
+}
+
+static inline unsigned int xas_try_split_min_order(unsigned int order)
+{
+ return 0;
+}
+
#endif
/**
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index a67d62b79698..52f30e526607 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -4,9 +4,8 @@
*
* Copyright (C) 2014 Dan Streetman
*
- * This is a common frontend for the zbud and zsmalloc memory
- * storage pool implementations. Typically, this is used to
- * store compressed memory.
+ * This is a common frontend for the zswap compressed memory storage
+ * implementations.
*/
#ifndef _ZPOOL_H_
@@ -14,25 +13,6 @@
struct zpool;
-/*
- * Control how a handle is mapped. It will be ignored if the
- * implementation does not support it. Its use is optional.
- * Note that this does not refer to memory protection, it
- * refers to how the memory will be copied in/out if copying
- * is necessary during mapping; read-write is the safest as
- * it copies the existing memory in on map, and copies the
- * changed memory back out on unmap. Write-only does not copy
- * in the memory and should only be used for initialization.
- * If in doubt, use ZPOOL_MM_DEFAULT which is read-write.
- */
-enum zpool_mapmode {
- ZPOOL_MM_RW, /* normal read-write mapping */
- ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */
- ZPOOL_MM_WO, /* write-only (no copy-in at map time) */
-
- ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
-};
-
bool zpool_has_pool(char *type);
struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp);
@@ -41,17 +21,19 @@ const char *zpool_get_type(struct zpool *pool);
void zpool_destroy_pool(struct zpool *pool);
-bool zpool_malloc_support_movable(struct zpool *pool);
-
int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
unsigned long *handle);
void zpool_free(struct zpool *pool, unsigned long handle);
-void *zpool_map_handle(struct zpool *pool, unsigned long handle,
- enum zpool_mapmode mm);
+void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
+ void *local_copy);
+
+void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
+ void *handle_mem);
-void zpool_unmap_handle(struct zpool *pool, unsigned long handle);
+void zpool_obj_write(struct zpool *zpool, unsigned long handle,
+ void *handle_mem, size_t mem_len);
u64 zpool_get_total_pages(struct zpool *pool);
@@ -81,15 +63,16 @@ struct zpool_driver {
void *(*create)(const char *name, gfp_t gfp);
void (*destroy)(void *pool);
- bool malloc_support_movable;
int (*malloc)(void *pool, size_t size, gfp_t gfp,
unsigned long *handle);
void (*free)(void *pool, unsigned long handle);
- bool sleep_mapped;
- void *(*map)(void *pool, unsigned long handle,
- enum zpool_mapmode mm);
- void (*unmap)(void *pool, unsigned long handle);
+ void *(*obj_read_begin)(void *pool, unsigned long handle,
+ void *local_copy);
+ void (*obj_read_end)(void *pool, unsigned long handle,
+ void *handle_mem);
+ void (*obj_write)(void *pool, unsigned long handle,
+ void *handle_mem, size_t mem_len);
u64 (*total_pages)(void *pool);
};
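A hedged sketch of the read/write flow that replaces map/unmap, assuming zpool_obj_read_begin() may return either a direct pointer into the pool or the supplied local_copy buffer filled with the data (so a caller copies only when the two differ), and that zpool_obj_write() copies mem_len bytes into the handle:

static void zpool_read_into(struct zpool *pool, unsigned long handle,
			    void *dst, size_t len)
{
	void *src = zpool_obj_read_begin(pool, handle, dst);

	if (src != dst)
		memcpy(dst, src, len);
	zpool_obj_read_end(pool, handle, src);
}

static void zpool_write_from(struct zpool *pool, unsigned long handle,
			     void *src, size_t len)
{
	zpool_obj_write(pool, handle, src, len);
}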
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index a48cd0ffe57d..c26baf9fb331 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -16,23 +16,6 @@
#include <linux/types.h>
-/*
- * zsmalloc mapping modes
- *
- * NOTE: These only make a difference when a mapped object spans pages.
- */
-enum zs_mapmode {
- ZS_MM_RW, /* normal read-write mapping */
- ZS_MM_RO, /* read-only (no copy-out at unmap time) */
- ZS_MM_WO /* write-only (no copy-in at map time) */
- /*
- * NOTE: ZS_MM_WO should only be used for initializing new
- * (uninitialized) allocations. Partial writes to already
- * initialized allocations should use ZS_MM_RW to preserve the
- * existing data.
- */
-};
-
struct zs_pool_stats {
/* How many pages were migrated (freed) */
atomic_long_t pages_compacted;
@@ -48,14 +31,18 @@ void zs_free(struct zs_pool *pool, unsigned long obj);
size_t zs_huge_class_size(struct zs_pool *pool);
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
- enum zs_mapmode mm);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
-
unsigned long zs_get_total_pages(struct zs_pool *pool);
unsigned long zs_compact(struct zs_pool *pool);
unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size);
void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
+
+void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
+ void *local_copy);
+void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
+ void *handle_mem);
+void zs_obj_write(struct zs_pool *pool, unsigned long handle,
+ void *handle_mem, size_t mem_len);
+
#endif
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index d961ead91bf1..30c193a1207e 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -26,7 +26,7 @@ struct zswap_lruvec_state {
unsigned long zswap_total_pages(void);
bool zswap_store(struct folio *folio);
-bool zswap_load(struct folio *folio);
+int zswap_load(struct folio *folio);
void zswap_invalidate(swp_entry_t swp);
int zswap_swapon(int type, unsigned long nr_pages);
void zswap_swapoff(int type);
@@ -44,9 +44,9 @@ static inline bool zswap_store(struct folio *folio)
return false;
}
-static inline bool zswap_load(struct folio *folio)
+static inline int zswap_load(struct folio *folio)
{
- return false;
+ return -ENOENT;
}
static inline void zswap_invalidate(swp_entry_t swp) {}
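With zswap_load() returning an int, callers can distinguish "not stored in zswap" from hard failures. A hedged sketch of a swap-in path, assuming 0 means the folio was filled from zswap, -ENOENT (as in the stub above) means fall back to the swap device, and other negatives are errors — the fallback helper is hypothetical:

static int swap_in_folio(struct folio *folio)
{
	int ret = zswap_load(folio);

	if (ret == 0)
		return 0;			/* decompressed from zswap */
	if (ret != -ENOENT)
		return ret;			/* hard error */
	return read_from_swap_device(folio);	/* hypothetical */
}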
diff --git a/include/net/snmp.h b/include/net/snmp.h
index 468a67836e2f..4cb4326dfebe 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -159,7 +159,7 @@ struct linux_tls_mib {
#define __SNMP_ADD_STATS64(mib, field, addend) \
do { \
- __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \
+ TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \
u64_stats_update_begin(&ptr->syncp); \
ptr->mibs[field] += addend; \
u64_stats_update_end(&ptr->syncp); \
@@ -176,8 +176,7 @@ struct linux_tls_mib {
#define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1)
#define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \
do { \
- __typeof__(*mib) *ptr; \
- ptr = raw_cpu_ptr((mib)); \
+ TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \
u64_stats_update_begin(&ptr->syncp); \
ptr->mibs[basefield##PKTS]++; \
ptr->mibs[basefield##OCTETS] += addend; \
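TYPEOF_UNQUAL() yields the operand's type with qualifiers (including the sparse __percpu address space) stripped, which is why the pointer returned by raw_cpu_ptr() can be stored in a plain local. A minimal sketch, assuming an ordinary per-CPU allocation of the mib structure:

static void tls_mib_bump(struct linux_tls_mib __percpu *mib, int field)
{
	/* plain "struct linux_tls_mib *": the __percpu qualifier is dropped */
	TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib);

	ptr->mibs[field]++;
}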
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index b37eb0a7060f..f74925a6cf69 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -342,6 +342,84 @@ TRACE_EVENT(mm_alloc_contig_migrate_range_info,
__entry->nr_mapped)
);
+TRACE_EVENT(mm_setup_per_zone_wmarks,
+
+ TP_PROTO(struct zone *zone),
+
+ TP_ARGS(zone),
+
+ TP_STRUCT__entry(
+ __field(int, node_id)
+ __string(name, zone->name)
+ __field(unsigned long, watermark_min)
+ __field(unsigned long, watermark_low)
+ __field(unsigned long, watermark_high)
+ __field(unsigned long, watermark_promo)
+ ),
+
+ TP_fast_assign(
+ __entry->node_id = zone->zone_pgdat->node_id;
+ __assign_str(name);
+ __entry->watermark_min = zone->_watermark[WMARK_MIN];
+ __entry->watermark_low = zone->_watermark[WMARK_LOW];
+ __entry->watermark_high = zone->_watermark[WMARK_HIGH];
+ __entry->watermark_promo = zone->_watermark[WMARK_PROMO];
+ ),
+
+ TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu",
+ __entry->node_id,
+ __get_str(name),
+ __entry->watermark_min,
+ __entry->watermark_low,
+ __entry->watermark_high,
+ __entry->watermark_promo)
+);
+
+TRACE_EVENT(mm_setup_per_zone_lowmem_reserve,
+
+ TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve),
+
+ TP_ARGS(zone, upper_zone, lowmem_reserve),
+
+ TP_STRUCT__entry(
+ __field(int, node_id)
+ __string(name, zone->name)
+ __string(upper_name, upper_zone->name)
+ __field(long, lowmem_reserve)
+ ),
+
+ TP_fast_assign(
+ __entry->node_id = zone->zone_pgdat->node_id;
+ __assign_str(name);
+ __assign_str(upper_name);
+ __entry->lowmem_reserve = lowmem_reserve;
+ ),
+
+ TP_printk("node_id=%d zone name=%s upper_zone name=%s lowmem_reserve_pages=%ld",
+ __entry->node_id,
+ __get_str(name),
+ __get_str(upper_name),
+ __entry->lowmem_reserve)
+);
+
+TRACE_EVENT(mm_calculate_totalreserve_pages,
+
+ TP_PROTO(unsigned long totalreserve_pages),
+
+ TP_ARGS(totalreserve_pages),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, totalreserve_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->totalreserve_pages = totalreserve_pages;
+ ),
+
+ TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages)
+);
+
+
/*
* Required for uniquely and securely identifying mm in rss_stat tracepoint.
*/
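As with any TRACE_EVENT(), the definitions above generate trace_<name>() emitters; a minimal sketch of how they would be invoked (placement hypothetical):

static void report_zone_watermarks(struct zone *zone)
{
	trace_mm_setup_per_zone_wmarks(zone);
}

static void report_totalreserve(unsigned long pages)
{
	trace_mm_calculate_totalreserve_pages(pages);
}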
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index a261e86e61fa..0ff388131fc9 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -629,11 +629,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
TRACE_EVENT(balance_dirty_pages,
TP_PROTO(struct bdi_writeback *wb,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty,
+ struct dirty_throttle_control *dtc,
unsigned long dirty_ratelimit,
unsigned long task_ratelimit,
unsigned long dirtied,
@@ -641,7 +637,7 @@ TRACE_EVENT(balance_dirty_pages,
long pause,
unsigned long start_time),
- TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+ TP_ARGS(wb, dtc,
dirty_ratelimit, task_ratelimit,
dirtied, period, pause, start_time),
@@ -650,8 +646,8 @@ TRACE_EVENT(balance_dirty_pages,
__field(unsigned long, limit)
__field(unsigned long, setpoint)
__field(unsigned long, dirty)
- __field(unsigned long, bdi_setpoint)
- __field(unsigned long, bdi_dirty)
+ __field(unsigned long, wb_setpoint)
+ __field(unsigned long, wb_dirty)
__field(unsigned long, dirty_ratelimit)
__field(unsigned long, task_ratelimit)
__field(unsigned int, dirtied)
@@ -664,16 +660,15 @@ TRACE_EVENT(balance_dirty_pages,
),
TP_fast_assign(
- unsigned long freerun = (thresh + bg_thresh) / 2;
+ unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2;
strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
- __entry->limit = global_wb_domain.dirty_limit;
- __entry->setpoint = (global_wb_domain.dirty_limit +
- freerun) / 2;
- __entry->dirty = dirty;
- __entry->bdi_setpoint = __entry->setpoint *
- bdi_thresh / (thresh + 1);
- __entry->bdi_dirty = bdi_dirty;
+ __entry->limit = dtc->limit;
+ __entry->setpoint = (dtc->limit + freerun) / 2;
+ __entry->dirty = dtc->dirty;
+ __entry->wb_setpoint = __entry->setpoint *
+ dtc->wb_thresh / (dtc->thresh + 1);
+ __entry->wb_dirty = dtc->wb_dirty;
__entry->dirty_ratelimit = KBps(dirty_ratelimit);
__entry->task_ratelimit = KBps(task_ratelimit);
__entry->dirtied = dirtied;
@@ -689,7 +684,7 @@ TRACE_EVENT(balance_dirty_pages,
TP_printk("bdi %s: "
"limit=%lu setpoint=%lu dirty=%lu "
- "bdi_setpoint=%lu bdi_dirty=%lu "
+ "wb_setpoint=%lu wb_dirty=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu "
"dirtied=%u dirtied_pause=%u "
"paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
@@ -697,8 +692,8 @@ TRACE_EVENT(balance_dirty_pages,
__entry->limit,
__entry->setpoint,
__entry->dirty,
- __entry->bdi_setpoint,
- __entry->bdi_dirty,
+ __entry->wb_setpoint,
+ __entry->wb_dirty,
__entry->dirty_ratelimit,
__entry->task_ratelimit,
__entry->dirtied,