Diffstat (limited to 'include/linux')
33 files changed, 686 insertions, 290 deletions
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index e1a3c9c9754c..cffa38a73618 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -60,7 +60,7 @@ static inline void get_page_bootmem(unsigned long info, struct page *page,  static inline void free_bootmem_page(struct page *page)  { -	kmemleak_free_part(page_to_virt(page), PAGE_SIZE); +	kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);  	free_reserved_page(page);  }  #endif diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 44e9de51eedf..5f23ee599889 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -198,13 +198,11 @@ void touch_buffer(struct buffer_head *bh);  void folio_set_bh(struct buffer_head *bh, struct folio *folio,  		  unsigned long offset);  struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, -					bool retry); +					gfp_t gfp);  struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,  		bool retry); -void create_empty_buffers(struct page *, unsigned long, -			unsigned long b_state); -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, -				unsigned long b_state); +struct buffer_head *create_empty_buffers(struct folio *folio, +		unsigned long blocksize, unsigned long b_state);  void end_buffer_read_sync(struct buffer_head *bh, int uptodate);  void end_buffer_write_sync(struct buffer_head *bh, int uptodate);  void end_buffer_async_write(struct buffer_head *bh, int uptodate); @@ -227,8 +225,8 @@ void __wait_on_buffer(struct buffer_head *);  wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);  struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,  			unsigned size); -struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, -				  unsigned size, gfp_t gfp); +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, +		unsigned size, gfp_t gfp);  void __brelse(struct buffer_head *);  void __bforget(struct buffer_head *);  void __breadahead(struct block_device *, sector_t block, unsigned int size); @@ -338,17 +336,38 @@ sb_breadahead(struct super_block *sb, sector_t block)  	__breadahead(sb->s_bdev, block, sb->s_blocksize);  } -static inline struct buffer_head * -sb_getblk(struct super_block *sb, sector_t block) +static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, +		sector_t block, unsigned size)  { -	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); +	gfp_t gfp; + +	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); +	gfp |= __GFP_NOFAIL; + +	return bdev_getblk(bdev, block, size, gfp);  } +static inline struct buffer_head *__getblk(struct block_device *bdev, +		sector_t block, unsigned size) +{ +	gfp_t gfp; -static inline struct buffer_head * -sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp) +	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); +	gfp |= __GFP_MOVABLE | __GFP_NOFAIL; + +	return bdev_getblk(bdev, block, size, gfp); +} + +static inline struct buffer_head *sb_getblk(struct super_block *sb, +		sector_t block)  { -	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp); +	return __getblk(sb->s_bdev, block, sb->s_blocksize); +} + +static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb, +		sector_t block, gfp_t gfp) +{ +	return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp);  }  static inline struct buffer_head * @@ -385,20 +404,6 @@ static 
inline void lock_buffer(struct buffer_head *bh)  		__lock_buffer(bh);  } -static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, -						   sector_t block, -						   unsigned size) -{ -	return __getblk_gfp(bdev, block, size, 0); -} - -static inline struct buffer_head *__getblk(struct block_device *bdev, -					   sector_t block, -					   unsigned size) -{ -	return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); -} -  static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)  {  	if (!buffer_uptodate(bh) && trylock_buffer(bh)) { @@ -450,6 +455,28 @@ __bread(struct block_device *bdev, sector_t block, unsigned size)  	return __bread_gfp(bdev, block, size, __GFP_MOVABLE);  } +/** + * get_nth_bh - Get a reference on the n'th buffer after this one. + * @bh: The buffer to start counting from. + * @count: How many buffers to skip. + * + * This is primarily useful for finding the nth buffer in a folio; in + * that case you pass the head buffer and the byte offset in the folio + * divided by the block size.  It can be used for other purposes, but + * it will wrap at the end of the folio rather than returning NULL or + * proceeding to the next folio for you. + * + * Return: The requested buffer with an elevated refcount. + */ +static inline __must_check +struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count) +{ +	while (count--) +		bh = bh->b_this_page; +	get_bh(bh); +	return bh; +} +  bool block_dirty_folio(struct address_space *mapping, struct folio *folio);  #ifdef CONFIG_BUFFER_HEAD diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index a5cfd44fab45..d504eb4b49ab 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -73,6 +73,7 @@ struct cacheinfo {  struct cpu_cacheinfo {  	struct cacheinfo *info_list; +	unsigned int per_cpu_data_slice_size;  	unsigned int num_levels;  	unsigned int num_leaves;  	bool cpu_map_populated; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 265da00a1a8b..4a6b6b77ccb6 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -115,6 +115,11 @@ enum {  	 * Enable recursive subtree protection  	 */  	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), + +	/* +	 * Enable hugetlb accounting for the memory controller. +	 */ +	 CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),  };  /* cftype->flags */ diff --git a/include/linux/damon.h b/include/linux/damon.h index ae2664d1d5f1..ab2f17d9926b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -40,9 +40,24 @@ struct damon_addr_range {   * @ar:			The address range of the region.   * @sampling_addr:	Address of the sample for the next access check.   * @nr_accesses:	Access frequency of this region. + * @nr_accesses_bp:	@nr_accesses in basis point (0.01%) that updated for + *			each sampling interval.   * @list:		List head for siblings.   * @age:		Age of this region.   * + * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be + * increased for every &damon_attrs->sample_interval if an access to the region + * during the last sampling interval is found.  The update of this field should + * not be done with direct access but with the helper function, + * damon_update_region_access_rate(). + * + * @nr_accesses_bp is another representation of @nr_accesses in basis point + * (1 in 10,000) that updated for every &damon_attrs->sample_interval in a + * manner similar to moving sum.  
By the algorithm, this value becomes + * @nr_accesses * 10000 for every &struct damon_attrs->aggr_interval.  This can + * be used when the aggregation interval is too huge and therefore cannot wait + * for it before getting the access monitoring results. + *   * @age is initially zero, increased for each aggregation interval, and reset   * to zero again if the access frequency is significantly changed.  If two   * regions are merged into a new region, both @nr_accesses and @age of the new @@ -52,6 +67,7 @@ struct damon_region {  	struct damon_addr_range ar;  	unsigned long sampling_addr;  	unsigned int nr_accesses; +	unsigned int nr_accesses_bp;  	struct list_head list;  	unsigned int age; @@ -298,24 +314,24 @@ struct damos_access_pattern {   * struct damos - Represents a Data Access Monitoring-based Operation Scheme.   * @pattern:		Access pattern of target regions.   * @action:		&damo_action to be applied to the target regions. + * @apply_interval_us:	The time between applying the @action.   * @quota:		Control the aggressiveness of this scheme.   * @wmarks:		Watermarks for automated (in)activation of this scheme.   * @filters:		Additional set of &struct damos_filter for &action.   * @stat:		Statistics of this scheme.   * @list:		List head for siblings.   * - * For each aggregation interval, DAMON finds regions which fit in the + * For each @apply_interval_us, DAMON finds regions which fit in the   * &pattern and applies &action to those. To avoid consuming too much   * CPU time or IO resources for the &action, &quota is used.   * + * If @apply_interval_us is zero, &damon_attrs->aggr_interval is used instead. + *   * To do the work only when needed, schemes can be activated for specific   * system situations using &wmarks.  If all schemes that registered to the   * monitoring context are inactive, DAMON stops monitoring either, and just   * repeatedly checks the watermarks.   * - * If all schemes that registered to a &struct damon_ctx are inactive, DAMON - * stops monitoring and just repeatedly checks the watermarks. -   * Before applying the &action to a memory region, &struct damon_operations   * implementation could check pages of the region and skip &action to respect   * &filters @@ -327,6 +343,14 @@ struct damos_access_pattern {  struct damos {  	struct damos_access_pattern pattern;  	enum damos_action action; +	unsigned long apply_interval_us; +/* private: internal use only */ +	/* +	 * number of sample intervals that should be passed before applying +	 * @action +	 */ +	unsigned long next_apply_sis; +/* public: */  	struct damos_quota quota;  	struct damos_watermarks wmarks;  	struct list_head filters; @@ -472,13 +496,14 @@ struct damon_callback {   *				regions.   *   * For each @sample_interval, DAMON checks whether each region is accessed or - * not.  It aggregates and keeps the access information (number of accesses to - * each region) for @aggr_interval time.  DAMON also checks whether the target - * memory regions need update (e.g., by ``mmap()`` calls from the application, - * in case of virtual memory monitoring) and applies the changes for each - * @ops_update_interval.  All time intervals are in micro-seconds. - * Please refer to &struct damon_operations and &struct damon_callback for more - * detail. + * not during the last @sample_interval.  If such access is found, DAMON + * aggregates the information by increasing &damon_region->nr_accesses for + * @aggr_interval time.  For each @aggr_interval, the count is reset.  
DAMON + * also checks whether the target memory regions need update (e.g., by + * ``mmap()`` calls from the application, in case of virtual memory monitoring) + * and applies the changes for each @ops_update_interval.  All time intervals + * are in micro-seconds.  Please refer to &struct damon_operations and &struct + * damon_callback for more detail.   */  struct damon_attrs {  	unsigned long sample_interval; @@ -522,8 +547,18 @@ struct damon_ctx {  	struct damon_attrs attrs;  /* private: internal use only */ -	struct timespec64 last_aggregation; -	struct timespec64 last_ops_update; +	/* number of sample intervals that passed since this context started */ +	unsigned long passed_sample_intervals; +	/* +	 * number of sample intervals that should be passed before next +	 * aggregation +	 */ +	unsigned long next_aggregation_sis; +	/* +	 * number of sample intervals that should be passed before next ops +	 * update +	 */ +	unsigned long next_ops_update_sis;  /* public: */  	struct task_struct *kdamond; @@ -608,6 +643,8 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);  void damon_destroy_region(struct damon_region *r, struct damon_target *t);  int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,  		unsigned int nr_ranges); +void damon_update_region_access_rate(struct damon_region *r, bool accessed, +		struct damon_attrs *attrs);  struct damos_filter *damos_new_filter(enum damos_filter_type type,  		bool matching); @@ -615,7 +652,9 @@ void damos_add_filter(struct damos *s, struct damos_filter *f);  void damos_destroy_filter(struct damos_filter *f);  struct damos *damon_new_scheme(struct damos_access_pattern *pattern, -			enum damos_action action, struct damos_quota *quota, +			enum damos_action action, +			unsigned long apply_interval_us, +			struct damos_quota *quota,  			struct damos_watermarks *wmarks);  void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);  void damon_destroy_scheme(struct damos *s); @@ -642,6 +681,13 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx)  	return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR;  } +static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs) +{ +	/* {aggr,sample}_interval are unsigned long, hence could overflow */ +	return min(attrs->aggr_interval / attrs->sample_interval, +			(unsigned long)UINT_MAX); +} +  int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);  int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/include/linux/dax.h b/include/linux/dax.h index 22cd9902345d..b463502b16e1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -159,8 +159,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,  struct page *dax_layout_busy_page(struct address_space *mapping);  struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); -dax_entry_t dax_lock_page(struct page *page); -void dax_unlock_page(struct page *page, dax_entry_t cookie); +dax_entry_t dax_lock_folio(struct folio *folio); +void dax_unlock_folio(struct folio *folio, dax_entry_t cookie);  dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,  		unsigned long index, struct page **page);  void dax_unlock_mapping_entry(struct address_space *mapping, @@ -182,14 +182,14 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping,  	return -EOPNOTSUPP;  } -static inline dax_entry_t dax_lock_page(struct page *page) +static inline dax_entry_t 
dax_lock_folio(struct folio *folio)  { -	if (IS_DAX(page->mapping->host)) +	if (IS_DAX(folio->mapping->host))  		return ~0UL;  	return 0;  } -static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) +static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)  {  } diff --git a/include/linux/fs.h b/include/linux/fs.h index c27c324ba58a..98b7a7a8c42e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops;   *   It is also used to block modification of page cache contents through   *   memory mappings.   * @gfp_mask: Memory allocation flags to use for allocating pages. - * @i_mmap_writable: Number of VM_SHARED mappings. + * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.   * @nr_thps: Number of THPs in the pagecache (non-shmem only).   * @i_mmap: Tree of private and shared mappings.   * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. @@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping)  /*   * Might pages of this file have been modified in userspace? - * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap + * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap   * marks vma as VM_SHARED if it is shared, and the file was opened for   * writing i.e. vma may be mprotected writable even if now readonly.   * @@ -1270,7 +1270,7 @@ struct super_block {  	const struct dentry_operations *s_d_op; /* default d_op for dentries */ -	struct shrinker s_shrink;	/* per-sb shrinker handle */ +	struct shrinker *s_shrink;	/* per-sb shrinker handle */  	/* Number of inodes with nlink == 0 but still referenced */  	atomic_long_t s_remove_count; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 665f06675c83..de292a007138 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -8,6 +8,7 @@  #include <linux/topology.h>  struct vm_area_struct; +struct mempolicy;  /* Convert GFP flags to their corresponding migrate type */  #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,  #ifdef CONFIG_NUMA  struct page *alloc_pages(gfp_t gfp, unsigned int order); -struct folio *folio_alloc(gfp_t gfp, unsigned order); +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, +		struct mempolicy *mpol, pgoff_t ilx, int nid); +struct folio *folio_alloc(gfp_t gfp, unsigned int order);  struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,  		unsigned long addr, bool hugepage);  #else @@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)  {  	return alloc_pages_node(numa_node_id(), gfp_mask, order);  } +static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, +		struct mempolicy *mpol, pgoff_t ilx, int nid) +{ +	return alloc_pages(gfp, order); +}  static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)  {  	return __folio_alloc_node(gfp, order, numa_node_id()); @@ -320,11 +328,13 @@ extern void page_frag_free(void *addr);  #define free_page(addr) free_pages((addr), 0)  void page_alloc_init_cpuhp(void); +int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);  void drain_all_pages(struct zone *zone);  void drain_local_pages(struct zone *zone);  void page_alloc_init_late(void); +void setup_pcp_cacheinfo(void);  /*   * gfp_allowed_mask is set to 
GFP_BOOT_MASK during early boot to restrict what diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 47d25a5e1933..d3acecc5db4b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -30,7 +30,7 @@ void free_huge_folio(struct folio *folio);  #ifdef CONFIG_HUGETLB_PAGE -#include <linux/mempolicy.h> +#include <linux/pagemap.h>  #include <linux/shm.h>  #include <asm/tlbflush.h> @@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,  		unsigned long cp_flags);  bool is_hugetlb_entry_migration(pte_t pte); +bool is_hugetlb_entry_hwpoisoned(pte_t pte);  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);  #else /* !CONFIG_HUGETLB_PAGE */ @@ -544,7 +545,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)  }  struct hugetlbfs_inode_info { -	struct shared_policy policy;  	struct inode vfs_inode;  	unsigned int seals;  }; @@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,  				unsigned long addr, int avoid_reserve);  struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,  				nodemask_t *nmask, gfp_t gfp_mask); -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, -				unsigned long address);  int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,  			pgoff_t idx);  void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, @@ -844,6 +842,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)  	return huge_page_size(h) / 512;  } +static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, +				struct address_space *mapping, pgoff_t idx) +{ +	return filemap_lock_folio(mapping, idx << huge_page_order(h)); +} +  #include <asm/hugetlb.h>  #ifndef is_hugepage_only_range @@ -1040,6 +1044,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio  	return NULL;  } +static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, +				struct address_space *mapping, pgoff_t idx) +{ +	return NULL; +} +  static inline int isolate_or_dissolve_huge_page(struct page *page,  						struct list_head *list)  { @@ -1060,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,  	return NULL;  } -static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, -					       struct vm_area_struct *vma, -					       unsigned long address) -{ -	return NULL; -} -  static inline int __alloc_bootmem_huge_page(struct hstate *h)  {  	return 0; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 3d82d91f49ac..e5d64b8b59c2 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -22,13 +22,6 @@ struct resv_map;  struct file_region;  #ifdef CONFIG_CGROUP_HUGETLB -/* - * Minimum page order trackable by hugetlb cgroup. - * At least 3 pages are necessary for all the tracking information. - * The second tail page contains all of the hugetlb-specific fields. 
- */ -#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE) -  enum hugetlb_memory_event {  	HUGETLB_MAX,  	HUGETLB_NR_MEMORY_EVENTS, @@ -68,8 +61,6 @@ static inline struct hugetlb_cgroup *  __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd)  {  	VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); -	if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) -		return NULL;  	if (rsvd)  		return folio->_hugetlb_cgroup_rsvd;  	else @@ -91,8 +82,6 @@ static inline void __set_hugetlb_cgroup(struct folio *folio,  				       struct hugetlb_cgroup *h_cg, bool rsvd)  {  	VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); -	if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) -		return;  	if (rsvd)  		folio->_hugetlb_cgroup_rsvd = h_cg;  	else diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 52772c826c86..6dcbb4eb80fb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -886,7 +886,7 @@ struct journal_s  	 * Journal head shrinker, reclaim buffer's journal head which  	 * has been written back.  	 */ -	struct shrinker		j_shrinker; +	struct shrinker		*j_shrinker;  	/**  	 * @j_checkpoint_jh_count: diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 1c1072e3ca06..ae3bde302f70 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -40,6 +40,8 @@ extern unsigned long long max_possible_pfn;   * via a driver, and never indicated in the firmware-provided memory map as   * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the   * kernel resource tree. + * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are + * not initialized (only for reserved regions).   */  enum memblock_flags {  	MEMBLOCK_NONE		= 0x0,	/* No special request */ @@ -47,6 +49,7 @@ enum memblock_flags {  	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */  	MEMBLOCK_NOMAP		= 0x4,	/* don't add to kernel direct mapping */  	MEMBLOCK_DRIVER_MANAGED = 0x8,	/* always detected via a driver */ +	MEMBLOCK_RSRV_NOINIT	= 0x10,	/* don't initialize struct pages */  };  /** @@ -125,6 +128,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);  int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);  int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);  int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); +int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);  void memblock_free_all(void);  void memblock_free(void *ptr, size_t size); @@ -259,6 +263,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)  	return m->flags & MEMBLOCK_NOMAP;  } +static inline bool memblock_is_reserved_noinit(struct memblock_region *m) +{ +	return m->flags & MEMBLOCK_RSRV_NOINIT; +} +  static inline bool memblock_is_driver_managed(struct memblock_region *m)  {  	return m->flags & MEMBLOCK_DRIVER_MANAGED; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e4e24da16d2c..7bdcf3020d7a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@  #include <linux/vmstat.h>  #include <linux/writeback.h>  #include <linux/page-flags.h> +#include <linux/shrinker.h>  struct mem_cgroup;  struct obj_cgroup; @@ -88,17 +89,6 @@ struct mem_cgroup_reclaim_iter {  	unsigned int generation;  }; -/* - * Bitmap and deferred work of shrinker::id corresponding to memcg-aware - * shrinkers, which have elements charged to this memcg. 
- */ -struct shrinker_info { -	struct rcu_head rcu; -	atomic_long_t *nr_deferred; -	unsigned long *map; -	int map_nr_max; -}; -  struct lruvec_stats_percpu {  	/* Local (CPU and cgroup) state */  	long state[NR_VM_NODE_STAT_ITEMS]; @@ -153,7 +143,7 @@ struct mem_cgroup_threshold_ary {  	/* Size of entries[] */  	unsigned int size;  	/* Array of thresholds */ -	struct mem_cgroup_threshold entries[]; +	struct mem_cgroup_threshold entries[] __counted_by(size);  };  struct mem_cgroup_thresholds { @@ -299,7 +289,13 @@ struct mem_cgroup {  #ifdef CONFIG_MEMCG_KMEM  	int kmemcg_id; -	struct obj_cgroup __rcu *objcg; +	/* +	 * memcg->objcg is wiped out as a part of the objcg repaprenting +	 * process. memcg->orig_objcg preserves a pointer (and a reference) +	 * to the original objcg until the end of live of memcg. +	 */ +	struct obj_cgroup __rcu	*objcg; +	struct obj_cgroup	*orig_objcg;  	/* list of inherited objcgs, protected by objcg_lock */  	struct list_head objcg_list;  #endif @@ -662,6 +658,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,  		page_counter_read(&memcg->memory);  } +void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); +  int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);  /** @@ -686,6 +684,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,  	return __mem_cgroup_charge(folio, mm, gfp);  } +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, +		long nr_pages); +  int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,  				  gfp_t gfp, swp_entry_t entry);  void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -713,6 +714,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)  	__mem_cgroup_uncharge_list(page_list);  } +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); + +void mem_cgroup_replace_folio(struct folio *old, struct folio *new); +  void mem_cgroup_migrate(struct folio *old, struct folio *new);  /** @@ -769,6 +774,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);  struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); +struct mem_cgroup *get_mem_cgroup_from_current(void); +  struct lruvec *folio_lruvec_lock(struct folio *folio);  struct lruvec *folio_lruvec_lock_irq(struct folio *folio);  struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, @@ -1080,15 +1087,6 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,  	local_irq_restore(flags);  } -static inline void count_memcg_page_event(struct page *page, -					  enum vm_event_item idx) -{ -	struct mem_cgroup *memcg = page_memcg(page); - -	if (memcg) -		count_memcg_events(memcg, idx, 1); -} -  static inline void count_memcg_folio_events(struct folio *folio,  		enum vm_event_item idx, unsigned long nr)  { @@ -1249,12 +1247,23 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,  	return false;  } +static inline void mem_cgroup_commit_charge(struct folio *folio, +		struct mem_cgroup *memcg) +{ +} +  static inline int mem_cgroup_charge(struct folio *folio,  		struct mm_struct *mm, gfp_t gfp)  {  	return 0;  } +static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, +		gfp_t gfp, long nr_pages) +{ +	return 0; +} +  static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,  			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)  { @@ -1273,6 +1282,16 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)  
{  } +static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, +		unsigned int nr_pages) +{ +} + +static inline void mem_cgroup_replace_folio(struct folio *old, +		struct folio *new) +{ +} +  static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)  {  } @@ -1310,6 +1329,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)  	return NULL;  } +static inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ +	return NULL; +} +  static inline  struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)  { @@ -1565,11 +1589,6 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg,  {  } -static inline void count_memcg_page_event(struct page *page, -					  int idx) -{ -} -  static inline void count_memcg_folio_events(struct folio *folio,  		enum vm_event_item idx, unsigned long nr)  { @@ -1763,9 +1782,27 @@ bool mem_cgroup_kmem_disabled(void);  int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);  void __memcg_kmem_uncharge_page(struct page *page, int order); -struct obj_cgroup *get_obj_cgroup_from_current(void); +/* + * The returned objcg pointer is safe to use without additional + * protection within a scope. The scope is defined either by + * the current task (similar to the "current" global variable) + * or by set_active_memcg() pair. + * Please, use obj_cgroup_get() to get a reference if the pointer + * needs to be used outside of the local scope. + */ +struct obj_cgroup *current_obj_cgroup(void);  struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); +static inline struct obj_cgroup *get_obj_cgroup_from_current(void) +{ +	struct obj_cgroup *objcg = current_obj_cgroup(); + +	if (objcg) +		obj_cgroup_get(objcg); + +	return objcg; +} +  int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);  void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 437441cdf78f..1e39d27bee41 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -6,6 +6,7 @@  #include <linux/nodemask.h>  #include <linux/kref.h>  #include <linux/mmzone.h> +#include <linux/notifier.h>  /*   * Each tier cover a abstrace distance chunk size of 128   */ @@ -22,7 +23,9 @@  struct memory_tier;  struct memory_dev_type {  	/* list of memory types that are part of same tier as this type */ -	struct list_head tier_sibiling; +	struct list_head tier_sibling; +	/* list of memory types that are managed by one driver */ +	struct list_head list;  	/* abstract distance for this specific memory type */  	int adistance;  	/* Nodes of same abstract distance */ @@ -30,12 +33,21 @@ struct memory_dev_type {  	struct kref kref;  }; +struct node_hmem_attrs; +  #ifdef CONFIG_NUMA  extern bool numa_demotion_enabled; +extern struct memory_dev_type *default_dram_type;  struct memory_dev_type *alloc_memory_type(int adistance);  void put_memory_type(struct memory_dev_type *memtype);  void init_node_memory_type(int node, struct memory_dev_type *default_type);  void clear_node_memory_type(int node, struct memory_dev_type *memtype); +int register_mt_adistance_algorithm(struct notifier_block *nb); +int unregister_mt_adistance_algorithm(struct notifier_block *nb); +int mt_calc_adistance(int node, int *adist); +int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, +			     const char *source); +int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);  #ifdef CONFIG_MIGRATION  int 
next_demotion_node(int node);  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); @@ -60,6 +72,7 @@ static inline bool node_is_toptier(int node)  #else  #define numa_demotion_enabled	false +#define default_dram_type	NULL  /*   * CONFIG_NUMA implementation returns non NULL error.   */ @@ -97,5 +110,31 @@ static inline bool node_is_toptier(int node)  {  	return true;  } + +static inline int register_mt_adistance_algorithm(struct notifier_block *nb) +{ +	return 0; +} + +static inline int unregister_mt_adistance_algorithm(struct notifier_block *nb) +{ +	return 0; +} + +static inline int mt_calc_adistance(int node, int *adist) +{ +	return NOTIFY_DONE; +} + +static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, +					   const char *source) +{ +	return -EIO; +} + +static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist) +{ +	return -EIO; +}  #endif	/* CONFIG_NUMA */  #endif  /* _LINUX_MEMORY_TIERS_H */ diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index d232de7cdc56..931b118336f4 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -17,6 +17,8 @@  struct mm_struct; +#define NO_INTERLEAVE_INDEX (-1UL)	/* use task il_prev for interleaving */ +  #ifdef CONFIG_NUMA  /* @@ -89,8 +91,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)  	return pol;  } -#define vma_policy(vma) ((vma)->vm_policy) -  static inline void mpol_get(struct mempolicy *pol)  {  	if (pol) @@ -107,35 +107,30 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)  /*   * Tree of shared policies for a shared memory region. - * Maintain the policies in a pseudo mm that contains vmas. The vmas - * carry the policy. As a special twist the pseudo mm is indexed in pages, not - * bytes, so that we can work with shared memory segments bigger than - * unsigned long.   
*/ - -struct sp_node { -	struct rb_node nd; -	unsigned long start, end; -	struct mempolicy *policy; -}; -  struct shared_policy {  	struct rb_root root;  	rwlock_t lock;  }; +struct sp_node { +	struct rb_node nd; +	pgoff_t start, end; +	struct mempolicy *policy; +};  int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);  void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); -int mpol_set_shared_policy(struct shared_policy *info, -				struct vm_area_struct *vma, -				struct mempolicy *new); -void mpol_free_shared_policy(struct shared_policy *p); +int mpol_set_shared_policy(struct shared_policy *sp, +			   struct vm_area_struct *vma, struct mempolicy *mpol); +void mpol_free_shared_policy(struct shared_policy *sp);  struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, -					    unsigned long idx); +					    pgoff_t idx);  struct mempolicy *get_task_policy(struct task_struct *p);  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, -		unsigned long addr); +		unsigned long addr, pgoff_t *ilx); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +		unsigned long addr, int order, pgoff_t *ilx);  bool vma_policy_mof(struct vm_area_struct *vma);  extern void numa_default_policy(void); @@ -149,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma,  extern bool init_nodemask_of_mempolicy(nodemask_t *mask);  extern bool mempolicy_in_oom_domain(struct task_struct *tsk,  				const nodemask_t *mask); -extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); -  extern unsigned int mempolicy_slab_node(void);  extern enum zone_type policy_zone; @@ -174,7 +167,7 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);  /* Check if a vma is migratable */  extern bool vma_migratable(struct vm_area_struct *vma); -extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); +int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long);  extern void mpol_put_task_policy(struct task_struct *);  static inline bool mpol_is_preferred_many(struct mempolicy *pol) @@ -188,12 +181,17 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);  struct mempolicy {}; +static inline struct mempolicy *get_task_policy(struct task_struct *p) +{ +	return NULL; +} +  static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)  {  	return true;  } -static inline void mpol_put(struct mempolicy *p) +static inline void mpol_put(struct mempolicy *pol)  {  } @@ -212,17 +210,22 @@ static inline void mpol_shared_policy_init(struct shared_policy *sp,  {  } -static inline void mpol_free_shared_policy(struct shared_policy *p) +static inline void mpol_free_shared_policy(struct shared_policy *sp)  {  }  static inline struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)  {  	return NULL;  } -#define vma_policy(vma) NULL +static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +				unsigned long addr, int order, pgoff_t *ilx) +{ +	*ilx = 0; +	return NULL; +}  static inline int  vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -278,7 +281,8 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)  }  #endif -static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, +static inline int mpol_misplaced(struct folio *folio, +				 struct vm_area_struct *vma,  				 unsigned long 
address)  {  	return -1; /* no node preference */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 711dd9412561..2ce13e8a309b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -142,10 +142,10 @@ const struct movable_operations *page_movable_ops(struct page *page)  }  #ifdef CONFIG_NUMA_BALANCING -int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, +int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,  			   int node);  #else -static inline int migrate_misplaced_page(struct page *page, +static inline int migrate_misplaced_folio(struct folio *folio,  					 struct vm_area_struct *vma, int node)  {  	return -EAGAIN; /* can't migrate now */ diff --git a/include/linux/mm.h b/include/linux/mm.h index ba896e946651..418d26608ece 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -617,7 +617,7 @@ struct vm_operations_struct {  	 * policy.  	 */  	struct mempolicy *(*get_policy)(struct vm_area_struct *vma, -					unsigned long addr); +					unsigned long addr, pgoff_t *ilx);  #endif  	/*  	 * Called by vm_normal_page() for special PTEs to find the @@ -935,6 +935,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)  	return vma->vm_flags & VM_ACCESS_FLAGS;  } +static inline bool is_shared_maywrite(vm_flags_t vm_flags) +{ +	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == +		(VM_SHARED | VM_MAYWRITE); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ +	return is_shared_maywrite(vma->vm_flags); +} +  static inline  struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)  { @@ -1335,7 +1346,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,  		struct page *page, unsigned int nr, unsigned long addr);  vm_fault_t finish_fault(struct vm_fault *vmf); -vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);  #endif  /* @@ -1684,26 +1694,26 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)  #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)  #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS -static inline int page_cpupid_xchg_last(struct page *page, int cpupid) +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)  { -	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK); +	return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);  } -static inline int page_cpupid_last(struct page *page) +static inline int folio_last_cpupid(struct folio *folio)  { -	return page->_last_cpupid; +	return folio->_last_cpupid;  }  static inline void page_cpupid_reset_last(struct page *page)  {  	page->_last_cpupid = -1 & LAST_CPUPID_MASK;  }  #else -static inline int page_cpupid_last(struct page *page) +static inline int folio_last_cpupid(struct folio *folio)  { -	return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; +	return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;  } -extern int page_cpupid_xchg_last(struct page *page, int cpupid); +int folio_xchg_last_cpupid(struct folio *folio, int cpupid);  static inline void page_cpupid_reset_last(struct page *page)  { @@ -1711,11 +1721,12 @@ static inline void page_cpupid_reset_last(struct page *page)  }  #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ -static inline int xchg_page_access_time(struct page *page, int time) +static inline int folio_xchg_access_time(struct folio *folio, int time)  {  	int last_time; -	last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); +	last_time = folio_xchg_last_cpupid(folio, +			
		   time >> PAGE_ACCESS_TIME_BUCKETS);  	return last_time << PAGE_ACCESS_TIME_BUCKETS;  } @@ -1729,19 +1740,19 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)  	}  }  #else /* !CONFIG_NUMA_BALANCING */ -static inline int page_cpupid_xchg_last(struct page *page, int cpupid) +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)  { -	return page_to_nid(page); /* XXX */ +	return folio_nid(folio); /* XXX */  } -static inline int xchg_page_access_time(struct page *page, int time) +static inline int folio_xchg_access_time(struct folio *folio, int time)  {  	return 0;  } -static inline int page_cpupid_last(struct page *page) +static inline int folio_last_cpupid(struct folio *folio)  { -	return page_to_nid(page); /* XXX */ +	return folio_nid(folio); /* XXX */  }  static inline int cpupid_to_nid(int cpupid) @@ -2325,6 +2336,8 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,  			     pte_t pte);  struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,  			     pte_t pte); +struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, +				  unsigned long addr, pmd_t pmd);  struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,  				pmd_t pmd); @@ -2411,8 +2424,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,  		void *buf, int len, unsigned int gup_flags);  extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,  		void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, -			      void *buf, int len, unsigned int gup_flags);  long get_user_pages_remote(struct mm_struct *mm,  			   unsigned long start, unsigned long nr_pages, @@ -2423,6 +2434,9 @@ long pin_user_pages_remote(struct mm_struct *mm,  			   unsigned int gup_flags, struct page **pages,  			   int *locked); +/* + * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. + */  static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,  						    unsigned long addr,  						    int gup_flags, @@ -2430,12 +2444,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,  {  	struct page *page;  	struct vm_area_struct *vma; -	int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); +	int got; + +	if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) +		return ERR_PTR(-EINVAL); + +	got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);  	if (got < 0)  		return ERR_PTR(got); -	if (got == 0) -		return NULL;  	vma = vma_lookup(mm, addr);  	if (WARN_ON_ONCE(!vma)) { @@ -2478,7 +2495,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);  extern unsigned long move_page_tables(struct vm_area_struct *vma,  		unsigned long old_addr, struct vm_area_struct *new_vma,  		unsigned long new_addr, unsigned long len, -		bool need_rmap_locks); +		bool need_rmap_locks, bool for_stack);  /*   * Flags used by change_protection().  
For now we make it a bitmap so @@ -2626,14 +2643,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,  		*maxrss = hiwater_rss;  } -#if defined(SPLIT_RSS_COUNTING) -void sync_mm_rss(struct mm_struct *mm); -#else -static inline void sync_mm_rss(struct mm_struct *mm) -{ -} -#endif -  #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL  static inline int pte_special(pte_t pte)  { @@ -3055,6 +3064,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)  	return ptl;  } +static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) +{ +	struct folio *folio = ptdesc_folio(ptdesc); + +	__folio_set_pgtable(folio); +	lruvec_stat_add_folio(folio, NR_PAGETABLE); +} + +static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) +{ +	struct folio *folio = ptdesc_folio(ptdesc); + +	__folio_clear_pgtable(folio); +	lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} +  extern void __init pagecache_init(void);  extern void free_initmem(void); @@ -3219,22 +3244,73 @@ extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,  		      struct vm_area_struct *next);  extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,  		       unsigned long start, unsigned long end, pgoff_t pgoff); -extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, -	struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, -	unsigned long end, unsigned long vm_flags, struct anon_vma *, -	struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, -	struct anon_vma_name *);  extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, -		       unsigned long addr, int new_below); -extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, -			 unsigned long addr, int new_below);  extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);  extern void unlink_file_vma(struct vm_area_struct *);  extern struct vm_area_struct *copy_vma(struct vm_area_struct **,  	unsigned long addr, unsigned long len, pgoff_t pgoff,  	bool *need_rmap_locks);  extern void exit_mmap(struct mm_struct *); +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, +				  struct vm_area_struct *prev, +				  struct vm_area_struct *vma, +				  unsigned long start, unsigned long end, +				  unsigned long vm_flags, +				  struct mempolicy *policy, +				  struct vm_userfaultfd_ctx uffd_ctx, +				  struct anon_vma_name *anon_name); + +/* We are about to modify the VMA's flags. */ +static inline struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, +		  struct vm_area_struct *prev, +		  struct vm_area_struct *vma, +		  unsigned long start, unsigned long end, +		  unsigned long new_flags) +{ +	return vma_modify(vmi, prev, vma, start, end, new_flags, +			  vma_policy(vma), vma->vm_userfaultfd_ctx, +			  anon_vma_name(vma)); +} + +/* We are about to modify the VMA's flags and/or anon_name. */ +static inline struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, +		       struct vm_area_struct *prev, +		       struct vm_area_struct *vma, +		       unsigned long start, +		       unsigned long end, +		       unsigned long new_flags, +		       struct anon_vma_name *new_name) +{ +	return vma_modify(vmi, prev, vma, start, end, new_flags, +			  vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); +} + +/* We are about to modify the VMA's memory policy. 
*/ +static inline struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, +		   struct vm_area_struct *prev, +		   struct vm_area_struct *vma, +		   unsigned long start, unsigned long end, +		   struct mempolicy *new_pol) +{ +	return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, +			  new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); +} + +/* We are about to modify the VMA's flags and/or uffd context. */ +static inline struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, +		       struct vm_area_struct *prev, +		       struct vm_area_struct *vma, +		       unsigned long start, unsigned long end, +		       unsigned long new_flags, +		       struct vm_userfaultfd_ctx new_ctx) +{ +	return vma_modify(vmi, prev, vma, start, end, new_flags, +			  vma_policy(vma), new_ctx, anon_vma_name(vma)); +}  static inline int check_data_rlimit(unsigned long rlim,  				    unsigned long new, @@ -3997,25 +4073,26 @@ static inline void mem_dump_obj(void *object) {}  #endif  /** - * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it + * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and + *                    handle them.   * @seals: the seals to check   * @vma: the vma to operate on   * - * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on - * the vma flags.  Return 0 if check pass, or <0 for errors. + * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper + * check/handling on the vma flags.  Return 0 if check pass, or <0 for errors.   */ -static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) +static inline int seal_check_write(int seals, struct vm_area_struct *vma)  { -	if (seals & F_SEAL_FUTURE_WRITE) { +	if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {  		/*  		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when -		 * "future write" seal active. +		 * write seals are active.  		 */  		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))  			return -EPERM;  		/* -		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as +		 * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as  		 * MAP_SHARED and read-only, take care to not allow mprotect to  		 * revert protections on such mappings. Do this only for shared  		 * mappings. For private mappings, don't need to mask @@ -4059,4 +4136,11 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)  #endif +static inline bool pfn_is_unaccepted_memory(unsigned long pfn) +{ +	phys_addr_t paddr = pfn << PAGE_SHIFT; + +	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); +} +  #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8148b30a9df1..9ae7def16cb2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@  #include <linux/atomic.h>  #include <linux/huge_mm.h> +#include <linux/mm_types.h>  #include <linux/swap.h>  #include <linux/string.h>  #include <linux/userfaultfd_k.h> @@ -352,15 +353,6 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)  }  #ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling anon_vma_name(). Caller should - * either keep holding the lock while using the returned pointer or it should - * raise anon_vma_name refcount before releasing the lock. 
- */ -extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); -extern struct anon_vma_name *anon_vma_name_alloc(const char *name); -extern void anon_vma_name_free(struct kref *kref); -  /* mmap_lock should be read-locked */  static inline void anon_vma_name_get(struct anon_vma_name *anon_name)  { @@ -415,16 +407,6 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,  }  #else /* CONFIG_ANON_VMA_NAME */ -static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) -{ -	return NULL; -} - -static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) -{ -	return NULL; -} -  static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}  static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}  static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4be8e310b189..957ce38768b2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -188,6 +188,10 @@ struct page {  					   not kmapped, ie. highmem) */  #endif /* WANT_PAGE_VIRTUAL */ +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +	int _last_cpupid; +#endif +  #ifdef CONFIG_KMSAN  	/*  	 * KMSAN metadata for this page: @@ -199,10 +203,6 @@ struct page {  	struct page *kmsan_shadow;  	struct page *kmsan_origin;  #endif - -#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS -	int _last_cpupid; -#endif  } _struct_page_alignment;  /* @@ -261,6 +261,8 @@ typedef struct {   * @_refcount: Do not access this member directly.  Use folio_ref_count()   *    to find how many references there are to this folio.   * @memcg_data: Memory Control Group data. + * @virtual: Virtual address in the kernel direct map. + * @_last_cpupid: IDs of last CPU and last process that accessed the folio.   * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().   * @_nr_pages_mapped: Do not use directly, call folio_mapcount().   * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). @@ -307,6 +309,12 @@ struct folio {  #ifdef CONFIG_MEMCG  			unsigned long memcg_data;  #endif +#if defined(WANT_PAGE_VIRTUAL) +			void *virtual; +#endif +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +			int _last_cpupid; +#endif  	/* private: the union with struct page is transitional */  		};  		struct page page; @@ -362,6 +370,12 @@ FOLIO_MATCH(_refcount, _refcount);  #ifdef CONFIG_MEMCG  FOLIO_MATCH(memcg_data, memcg_data);  #endif +#if defined(WANT_PAGE_VIRTUAL) +FOLIO_MATCH(virtual, virtual); +#endif +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +FOLIO_MATCH(_last_cpupid, _last_cpupid); +#endif  #undef FOLIO_MATCH  #define FOLIO_MATCH(pg, fl)						\  	static_assert(offsetof(struct folio, fl) ==			\ @@ -535,6 +549,27 @@ struct anon_vma_name {  	char name[];  }; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. 
+ */ +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +struct anon_vma_name *anon_vma_name_alloc(const char *name); +void anon_vma_name_free(struct kref *kref); +#else /* CONFIG_ANON_VMA_NAME */ +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ +	return NULL; +} + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ +	return NULL; +} +#endif +  struct vma_lock {  	struct rw_semaphore lock;  }; @@ -678,6 +713,12 @@ struct vm_area_struct {  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;  } __randomize_layout; +#ifdef CONFIG_NUMA +#define vma_policy(vma) ((vma)->vm_policy) +#else +#define vma_policy(vma) NULL +#endif +  #ifdef CONFIG_SCHED_MM_CID  struct mm_cid {  	u64 time; diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 6e3c857606f1..f349e08a9dfe 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -459,7 +459,14 @@ mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)  	lock_map_release(&__mmu_notifier_invalidate_range_start_map);  } -static inline int +/* + * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it + * can return an error if a notifier can't proceed without blocking, in which + * case you're not allowed to modify PTEs in the specified range. + * + * This is mainly intended for OOM handling. + */ +static inline int __must_check  mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)  {  	int ret = 0; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4106fbc5b4b3..3c25226beeed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -639,8 +639,6 @@ struct lruvec {  #endif  }; -/* Isolate unmapped pages */ -#define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x2)  /* Isolate for asynchronous migration */  #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)  /* Isolate unevictable pages */ @@ -676,15 +674,34 @@ enum zone_watermarks {  #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)  #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) +/* + * Flags used in pcp->flags field. + * + * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the + * previous page freeing.  To avoid to drain PCP for an accident + * high-order page freeing. + * + * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before + * draining PCP for consecutive high-order pages freeing without + * allocation if data cache slice of CPU is large enough.  To reduce + * zone lock contention and keep cache-hot pages reusing. 
+ */ +#define	PCPF_PREV_FREE_HIGH_ORDER	BIT(0) +#define	PCPF_FREE_HIGH_BATCH		BIT(1) +  struct per_cpu_pages {  	spinlock_t lock;	/* Protects lists field */  	int count;		/* number of pages in the list */  	int high;		/* high watermark, emptying needed */ +	int high_min;		/* min high watermark */ +	int high_max;		/* max high watermark */  	int batch;		/* chunk size for buddy add/remove */ -	short free_factor;	/* batch scaling factor during free */ +	u8 flags;		/* protected by pcp->lock */ +	u8 alloc_factor;	/* batch scaling factor during allocate */  #ifdef CONFIG_NUMA -	short expire;		/* When 0, remote pagesets are drained */ +	u8 expire;		/* When 0, remote pagesets are drained */  #endif +	short free_count;	/* consecutive free count */  	/* Lists of pages, one per migrate type stored on the pcp-lists */  	struct list_head lists[NR_PCP_LISTS]; @@ -837,7 +854,8 @@ struct zone {  	 * the high and batch values are copied to individual pagesets for  	 * faster access  	 */ -	int pageset_high; +	int pageset_high_min; +	int pageset_high_max;  	int pageset_batch;  #ifndef CONFIG_SPARSEMEM @@ -998,6 +1016,7 @@ enum zone_flags {  					 * Cleared when kswapd is woken.  					 */  	ZONE_RECLAIM_ACTIVE,		/* kswapd may be scanning the zone. */ +	ZONE_BELOW_HIGH,		/* zone is below high watermark. */  };  static inline unsigned long zone_managed_pages(struct zone *zone) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5c02720c53a5..a88e64acebfe 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -693,6 +693,25 @@ TESTPAGEFLAG_FALSE(Ksm, ksm)  u64 stable_page_flags(struct page *page);  /** + * folio_xor_flags_has_waiters - Change some folio flags. + * @folio: The folio. + * @mask: Bits set in this word will be changed. + * + * This must only be used for flags which are changed with the folio + * lock held.  For example, it is unsafe to use for PG_dirty as that + * can be set without the folio lock held.  It can also only be used + * on flags which are in the range 0-6 as some of the implementations + * only affect those bits. + * + * Return: Whether there are tasks waiting on the folio. + */ +static inline bool folio_xor_flags_has_waiters(struct folio *folio, +		unsigned long mask) +{ +	return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0)); +} + +/**   * folio_test_uptodate - Is this folio up to date?   * @folio: The folio.   * diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 351c3b7f93a1..bcc1ea44b4e8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio)   */  static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)  { -	/* HugeTLBfs indexes the page cache in units of hpage_size */ -	if (folio_test_hugetlb(folio)) -		return &folio->page;  	return folio_page(folio, index & (folio_nr_pages(folio) - 1));  } @@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)   */  static inline bool folio_contains(struct folio *folio, pgoff_t index)  { -	/* HugeTLBfs indexes the page cache in units of hpage_size */ -	if (folio_test_hugetlb(folio)) -		return folio->index == index;  	return index - folio_index(folio) < folio_nr_pages(folio);  } @@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,  }  /* - * Get index of the page within radix-tree (but not for hugetlb pages). 
- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) + * Get the offset in PAGE_SIZE (even for hugetlb pages).   */ -static inline pgoff_t page_to_index(struct page *page) +static inline pgoff_t page_to_pgoff(struct page *page)  {  	struct page *head; @@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page)  	return head->index + page - head;  } -extern pgoff_t hugetlb_basepage_index(struct page *page); - -/* - * Get the offset in PAGE_SIZE (even for hugetlb pages). - * (TODO: hugetlb pages should have ->index in PAGE_SIZE) - */ -static inline pgoff_t page_to_pgoff(struct page *page) -{ -	if (unlikely(PageHuge(page))) -		return hugetlb_basepage_index(page); -	return page_to_index(page); -} -  /*   * Return byte-offset into filesystem object for page.   */ @@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio)  /*   * Get the offset in PAGE_SIZE (even for hugetlb folios). - * (TODO: hugetlb folios should have ->index in PAGE_SIZE)   */  static inline pgoff_t folio_pgoff(struct folio *folio)  { -	if (unlikely(folio_test_hugetlb(folio))) -		return hugetlb_basepage_index(&folio->page);  	return folio->index;  } -extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, -				     unsigned long address); -  static inline pgoff_t linear_page_index(struct vm_area_struct *vma,  					unsigned long address)  {  	pgoff_t pgoff; -	if (unlikely(is_vm_hugetlb_page(vma))) -		return linear_hugepage_index(vma, address);  	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;  	pgoff += vma->vm_pgoff;  	return pgoff; @@ -1129,6 +1101,7 @@ static inline void wait_on_page_locked(struct page *page)  	folio_wait_locked(page_folio(page));  } +void folio_end_read(struct folio *folio, bool success);  void wait_on_page_writeback(struct page *page);  void folio_wait_writeback(struct folio *folio);  int folio_wait_writeback_killable(struct folio *folio); diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index d01351b1526f..3a44dd1e33d2 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,  			      s32 batch);  s64 __percpu_counter_sum(struct percpu_counter *fbc);  int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); +bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, +				  s64 amount, s32 batch);  void percpu_counter_sync(struct percpu_counter *fbc);  static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) @@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)  	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);  } +static inline bool +percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) +{ +	return __percpu_counter_limited_add(fbc, limit, amount, +					    percpu_counter_batch); +} +  /*   * With percpu_counter_add_local() and percpu_counter_sub_local(), counts   * are accumulated in local per cpu counter and not in fbc->count until @@ -185,6 +194,27 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)  	local_irq_restore(flags);  } +static inline bool +percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) +{ +	unsigned long flags; +	bool good = false; +	s64 count; + +	if (amount == 0) +		return true; + +	local_irq_save(flags); +	count = fbc->count + amount; +	if ((amount > 0 && count <= limit) || +	    (amount < 0 && count >= limit)) { +		
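#include <linux/bio.h>
#include <linux/pagemap.h>

/*
 * Hypothetical read-completion handler (not from any in-tree filesystem)
 * showing the newly declared folio_end_read(): on success it marks each folio
 * uptodate and unlocks it in a single call, replacing the usual
 * folio_mark_uptodate() + folio_unlock() pair.
 */
static void example_read_end_io(struct bio *bio)
{
	bool ok = (bio->bi_status == BLK_STS_OK);
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio)
		folio_end_read(fi.folio, ok);
	bio_put(bio);
}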
fbc->count = count; +		good = true; +	} +	local_irq_restore(flags); +	return good; +} +  /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */  static inline void  percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 51cc21ebb568..b26fe858fd44 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -189,7 +189,7 @@ typedef int __bitwise rmap_t;  /*   * rmap interfaces called when adding or removing pte of page   */ -void page_move_anon_rmap(struct page *, struct vm_area_struct *); +void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);  void page_add_anon_rmap(struct page *, struct vm_area_struct *,  		unsigned long address, rmap_t flags);  void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, @@ -203,7 +203,7 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,  void page_remove_rmap(struct page *, struct vm_area_struct *,  		bool compound); -void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, +void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *,  		unsigned long address, rmap_t flags);  void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,  		unsigned long address); diff --git a/include/linux/sched.h b/include/linux/sched.h index 12ec109ce8c9..b49ca40f6335 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1448,6 +1448,10 @@ struct task_struct {  	struct mem_cgroup		*active_memcg;  #endif +#ifdef CONFIG_MEMCG_KMEM +	struct obj_cgroup		*objcg; +#endif +  #ifdef CONFIG_BLK_CGROUP  	struct gendisk			*throttle_disk;  #endif diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 0ee96ea7a0e9..02f5090ffea2 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm)  #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */  #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */  #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)  #define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */  #define MMF_MULTIPROCESS	26	/* mm is shared between processes */  /* @@ -85,10 +86,22 @@ static inline int get_dumpable(struct mm_struct *mm)  #define MMF_HAS_MDWE		28  #define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE) -#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP) + +#define MMF_HAS_MDWE_NO_INHERIT	29 + +#define MMF_VM_MERGE_ANY	30 +#define MMF_VM_MERGE_ANY_MASK	(1 << MMF_VM_MERGE_ANY)  #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ -				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) +				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ +				 MMF_VM_MERGE_ANY_MASK) + +static inline unsigned long mmf_init_flags(unsigned long flags) +{ +	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) +		flags &= ~((1UL << MMF_HAS_MDWE) | +			   (1UL << MMF_HAS_MDWE_NO_INHERIT)); +	return flags & MMF_INIT_MASK; +} -#define MMF_VM_MERGE_ANY	29  #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8d89c8c4fac1..9a19f1b42f64 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -403,6 +403,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);   * __GFP_ACCOUNT allocations till the end of the scope will be charged to the   * given memcg.   
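#include <linux/errno.h>
#include <linux/percpu_counter.h>

/*
 * Illustrative use of the new percpu_counter_limited_add() (the counter and
 * EXAMPLE_MAX_OBJECTS are invented for this sketch): the add only happens if
 * it keeps the counter within the limit, giving callers a cheap, mostly
 * per-CPU way to enforce a cap.
 */
#define EXAMPLE_MAX_OBJECTS	1024

static int example_reserve_object(struct percpu_counter *nr_objects)
{
	if (!percpu_counter_limited_add(nr_objects, EXAMPLE_MAX_OBJECTS, 1))
		return -ENOSPC;	/* adding one would exceed the limit */
	return 0;
}

static void example_release_object(struct percpu_counter *nr_objects)
{
	percpu_counter_sub(nr_objects, 1);
}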
* + * Please, make sure that caller has a reference to the passed memcg structure, + * so its lifetime is guaranteed to exceed the scope between two + * set_active_memcg() calls. + *   * NOTE: This function can nest. Users must save the return value and   * reset the previous value after their own charging scope is over.   */ diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index b69afb8630db..52b22c5c396d 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -30,8 +30,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);  extern pid_t task_numa_group_id(struct task_struct *p);  extern void set_numabalancing_state(bool enabled);  extern void task_numa_free(struct task_struct *p, bool final); -extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, -					int src_nid, int dst_cpu); +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, +				int src_nid, int dst_cpu);  #else  static inline void task_numa_fault(int last_node, int node, int pages,  				   int flags) @@ -48,7 +48,7 @@ static inline void task_numa_free(struct task_struct *p, bool final)  {  }  static inline bool should_numa_migrate_memory(struct task_struct *p, -				struct page *page, int src_nid, int dst_cpu) +				struct folio *folio, int src_nid, int dst_cpu)  {  	return true;  } diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 6b0c626620f5..2caa6b86106a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -23,18 +23,22 @@ struct shmem_inode_info {  	unsigned long		flags;  	unsigned long		alloced;	/* data pages alloced to file */  	unsigned long		swapped;	/* subtotal assigned to swap */ -	pgoff_t			fallocend;	/* highest fallocate endindex */ -	struct list_head        shrinklist;     /* shrinkable hpage inodes */ -	struct list_head	swaplist;	/* chain of maybes on swap */ +	union { +	    struct offset_ctx	dir_offsets;	/* stable directory offsets */ +	    struct { +		struct list_head shrinklist;	/* shrinkable hpage inodes */ +		struct list_head swaplist;	/* chain of maybes on swap */ +	    }; +	}; +	struct timespec64	i_crtime;	/* file creation time */  	struct shared_policy	policy;		/* NUMA memory alloc policy */  	struct simple_xattrs	xattrs;		/* list of xattrs */ +	pgoff_t			fallocend;	/* highest fallocate endindex */ +	unsigned int		fsflags;	/* for FS_IOC_[SG]ETFLAGS */  	atomic_t		stop_eviction;	/* hold when working on inode */ -	struct timespec64	i_crtime;	/* file creation time */ -	unsigned int		fsflags;	/* flags for FS_IOC_[SG]ETFLAGS */  #ifdef CONFIG_TMPFS_QUOTA  	struct dquot		*i_dquot[MAXQUOTAS];  #endif -	struct offset_ctx	dir_offsets;	/* stable entry offsets */  	struct inode		vfs_inode;  }; diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 224293b2dd06..1a00be90d93a 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -4,6 +4,25 @@  #include <linux/atomic.h>  #include <linux/types.h> +#include <linux/refcount.h> +#include <linux/completion.h> + +#define SHRINKER_UNIT_BITS	BITS_PER_LONG + +/* + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to the memcg. 
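#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

/*
 * Minimal sketch of the set_active_memcg() pattern the sched/mm.h comment
 * describes, assuming CONFIG_MEMCG: pin the memcg for the whole scope, save
 * the previous active memcg, and restore it after the accounted allocation.
 * The function itself is hypothetical.
 */
static void *example_alloc_charged_to(struct mem_cgroup *memcg, size_t size)
{
	struct mem_cgroup *old;
	void *p;

	if (!css_tryget(&memcg->css))	/* keep memcg alive across the scope */
		return NULL;

	old = set_active_memcg(memcg);
	p = kmalloc(size, GFP_KERNEL_ACCOUNT);	/* charged to memcg */
	set_active_memcg(old);

	css_put(&memcg->css);
	return p;
}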
+ */ +struct shrinker_info_unit { +	atomic_long_t nr_deferred[SHRINKER_UNIT_BITS]; +	DECLARE_BITMAP(map, SHRINKER_UNIT_BITS); +}; + +struct shrinker_info { +	struct rcu_head rcu; +	int map_nr_max; +	struct shrinker_info_unit *unit[]; +};  /*   * This struct is used to pass information from page reclaim to the shrinkers. @@ -70,6 +89,19 @@ struct shrinker {  	int seeks;	/* seeks to recreate an obj */  	unsigned flags; +	/* +	 * The reference count of this shrinker. Registered shrinker have an +	 * initial refcount of 1, then the lookup operations are now allowed +	 * to use it via shrinker_try_get(). Later in the unregistration step, +	 * the initial refcount will be discarded, and will free the shrinker +	 * asynchronously via RCU after its refcount reaches 0. +	 */ +	refcount_t refcount; +	struct completion done;	/* use to wait for refcount to reach 0 */ +	struct rcu_head rcu; + +	void *private_data; +  	/* These are for internal use */  	struct list_head list;  #ifdef CONFIG_MEMCG @@ -86,48 +118,39 @@ struct shrinker {  };  #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ -/* Flags */ -#define SHRINKER_REGISTERED	(1 << 0) -#define SHRINKER_NUMA_AWARE	(1 << 1) -#define SHRINKER_MEMCG_AWARE	(1 << 2) +/* Internal flags */ +#define SHRINKER_REGISTERED	BIT(0) +#define SHRINKER_ALLOCATED	BIT(1) + +/* Flags for users to use */ +#define SHRINKER_NUMA_AWARE	BIT(2) +#define SHRINKER_MEMCG_AWARE	BIT(3)  /*   * It just makes sense when the shrinker is also MEMCG_AWARE for now,   * non-MEMCG_AWARE shrinker should not have this flag set.   */ -#define SHRINKER_NONSLAB	(1 << 3) +#define SHRINKER_NONSLAB	BIT(4) -extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, -					    const char *fmt, ...); -extern void register_shrinker_prepared(struct shrinker *shrinker); -extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, -					    const char *fmt, ...); -extern void unregister_shrinker(struct shrinker *shrinker); -extern void free_prealloced_shrinker(struct shrinker *shrinker); -extern void synchronize_shrinkers(void); +__printf(2, 3) +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); +void shrinker_register(struct shrinker *shrinker); +void shrinker_free(struct shrinker *shrinker); -#ifdef CONFIG_SHRINKER_DEBUG -extern int shrinker_debugfs_add(struct shrinker *shrinker); -extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, -					      int *debugfs_id); -extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, -				    int debugfs_id); -extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, -						  const char *fmt, ...); -#else /* CONFIG_SHRINKER_DEBUG */ -static inline int shrinker_debugfs_add(struct shrinker *shrinker) -{ -	return 0; -} -static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, -						     int *debugfs_id) +static inline bool shrinker_try_get(struct shrinker *shrinker)  { -	*debugfs_id = -1; -	return NULL; +	return refcount_inc_not_zero(&shrinker->refcount);  } -static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, -					   int debugfs_id) + +static inline void shrinker_put(struct shrinker *shrinker)  { +	if (refcount_dec_and_test(&shrinker->refcount)) +		complete(&shrinker->done);  } + +#ifdef CONFIG_SHRINKER_DEBUG +extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, +						  const char *fmt, ...); +#else /* CONFIG_SHRINKER_DEBUG */  static inline __printf(2, 3)  int 
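#include <linux/errno.h>
#include <linux/shrinker.h>

/*
 * Sketch of the new shrinker lifecycle based only on the declarations above
 * (shrinker_alloc()/shrinker_register()/shrinker_free()); the callbacks,
 * names and the my_data pointer are invented for illustration.
 */
static unsigned long example_count(struct shrinker *s, struct shrink_control *sc)
{
	return 0;		/* nothing to reclaim in this toy cache */
}

static unsigned long example_scan(struct shrinker *s, struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static struct shrinker *example_shrinker;

static int example_cache_init(void *my_data)
{
	example_shrinker = shrinker_alloc(0, "example-cache");
	if (!example_shrinker)
		return -ENOMEM;

	example_shrinker->count_objects = example_count;
	example_shrinker->scan_objects = example_scan;
	example_shrinker->seeks = DEFAULT_SEEKS;
	example_shrinker->private_data = my_data;

	shrinker_register(example_shrinker);
	return 0;
}

static void example_cache_exit(void)
{
	/* drops the initial reference; actual freeing is RCU-deferred */
	shrinker_free(example_shrinker);
}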
shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)  { diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac8c6854097c..f2dc19f40d05 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -161,11 +161,22 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)  }  static inline bool vma_can_userfault(struct vm_area_struct *vma, -				     unsigned long vm_flags) +				     unsigned long vm_flags, +				     bool wp_async)  { +	vm_flags &= __VM_UFFD_FLAGS; +  	if ((vm_flags & VM_UFFD_MINOR) &&  	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))  		return false; + +	/* +	 * If wp async enabled, and WP is the only mode enabled, allow any +	 * memory type. +	 */ +	if (wp_async && (vm_flags == VM_UFFD_WP)) +		return true; +  #ifndef CONFIG_PTE_MARKER_UFFD_WP  	/*  	 * If user requested uffd-wp but not enabled pte markers for @@ -175,6 +186,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,  	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))  		return false;  #endif + +	/* By default, allow any of anon|shmem|hugetlb */  	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||  	    vma_is_shmem(vma);  } @@ -197,6 +210,7 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,  extern void userfaultfd_unmap_complete(struct mm_struct *mm,  				       struct list_head *uf);  extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); +extern bool userfaultfd_wp_async(struct vm_area_struct *vma);  #else /* CONFIG_USERFAULTFD */ @@ -207,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf,  	return VM_FAULT_SIGBUS;  } +static inline long uffd_wp_range(struct vm_area_struct *vma, +				 unsigned long start, unsigned long len, +				 bool enable_wp) +{ +	return false; +} +  static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,  					struct vm_userfaultfd_ctx vm_ctx)  { @@ -297,6 +318,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)  	return false;  } +static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ +	return false; +} +  #endif /* CONFIG_USERFAULTFD */  static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) diff --git a/include/linux/wait.h b/include/linux/wait.h index 5ec7739400f4..3473b663176f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int  /* wait_queue_entry::flags */  #define WQ_FLAG_EXCLUSIVE	0x01  #define WQ_FLAG_WOKEN		0x02 -#define WQ_FLAG_BOOKMARK	0x04 -#define WQ_FLAG_CUSTOM		0x08 -#define WQ_FLAG_DONE		0x10 -#define WQ_FLAG_PRIORITY	0x20 +#define WQ_FLAG_CUSTOM		0x04 +#define WQ_FLAG_DONE		0x08 +#define WQ_FLAG_PRIORITY	0x10  /*   * A single wait-queue entry structure: @@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq  int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);  void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);  void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); -void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, -		unsigned int mode, void *key, wait_queue_entry_t *bookmark);  void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);  void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int 
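#include <linux/mm.h>
#include <linux/userfaultfd_k.h>

/*
 * Sketch (not the real UFFDIO_REGISTER code) of how a caller might use the
 * extended vma_can_userfault(): with wp_async set and VM_UFFD_WP as the only
 * requested mode, any VMA type passes; otherwise the usual anon/shmem/hugetlb
 * rules apply.  example_range_can_userfault() is hypothetical and assumes the
 * caller holds mmap_lock for read.
 */
static bool example_range_can_userfault(struct mm_struct *mm,
					unsigned long start, unsigned long end,
					unsigned long vm_flags, bool wp_async)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, start);

	for_each_vma_range(vmi, vma, end)
		if (!vma_can_userfault(vma, vm_flags, wp_async))
			return false;

	return true;
}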
mode, void *key);  void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
