Diffstat (limited to 'include/linux')
62 files changed, 2481 insertions, 832 deletions
diff --git a/include/linux/cache_coherency.h b/include/linux/cache_coherency.h new file mode 100644 index 000000000000..cc81c5733e31 --- /dev/null +++ b/include/linux/cache_coherency.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Cache coherency maintenance operation device drivers + * + * Copyright Huawei 2025 + */ +#ifndef _LINUX_CACHE_COHERENCY_H_ +#define _LINUX_CACHE_COHERENCY_H_ + +#include <linux/list.h> +#include <linux/kref.h> +#include <linux/types.h> + +struct cc_inval_params { + phys_addr_t addr; + size_t size; +}; + +struct cache_coherency_ops_inst; + +struct cache_coherency_ops { + int (*wbinv)(struct cache_coherency_ops_inst *cci, + struct cc_inval_params *invp); + int (*done)(struct cache_coherency_ops_inst *cci); +}; + +struct cache_coherency_ops_inst { + struct kref kref; + struct list_head node; + const struct cache_coherency_ops *ops; +}; + +int cache_coherency_ops_instance_register(struct cache_coherency_ops_inst *cci); +void cache_coherency_ops_instance_unregister(struct cache_coherency_ops_inst *cci); + +struct cache_coherency_ops_inst * +_cache_coherency_ops_instance_alloc(const struct cache_coherency_ops *ops, + size_t size); +/** + * cache_coherency_ops_instance_alloc - Allocate cache coherency ops instance + * @ops: Cache maintenance operations + * @drv_struct: structure that contains the struct cache_coherency_ops_inst + * @member: Name of the struct cache_coherency_ops_inst member in @drv_struct. + * + * This allocates a driver specific structure and initializes the + * cache_coherency_ops_inst embedded in the drv_struct. Upon success the + * pointer must be freed via cache_coherency_ops_instance_put(). + * + * Returns a &drv_struct * on success, %NULL on error. + */ +#define cache_coherency_ops_instance_alloc(ops, drv_struct, member) \ + ({ \ + static_assert(__same_type(struct cache_coherency_ops_inst, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_cache_coherency_ops_instance_alloc(ops, \ + sizeof(drv_struct)); \ + }) +void cache_coherency_ops_instance_put(struct cache_coherency_ops_inst *cci); + +#endif diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 698520b1bfdb..ef65c75beeaa 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -64,8 +64,8 @@ extern void config_item_put(struct config_item *); struct config_item_type { struct module *ct_owner; - struct configfs_item_operations *ct_item_ops; - struct configfs_group_operations *ct_group_ops; + const struct configfs_item_operations *ct_item_ops; + const struct configfs_group_operations *ct_group_ops; struct configfs_attribute **ct_attrs; struct configfs_bin_attribute **ct_bin_attrs; }; diff --git a/include/linux/damon.h b/include/linux/damon.h index cae8c613c5fc..3813373a9200 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,17 +91,23 @@ struct damon_region { * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. + * @obsolete: Whether the commit destination target is obsolete. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The * @pid should be set for appropriate &struct damon_operations including the * virtual address spaces monitoring operations. 
+ * + * @obsolete is used only for damon_commit_targets() source targets, to specify + * the matching destination targets are obsolete. Read damon_commit_targets() + * to see how it is handled. */ struct damon_target { struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; + bool obsolete; }; /** @@ -147,6 +153,8 @@ enum damos_action { * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. + * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. + * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -156,6 +164,8 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_SOME_MEM_PSI_US, DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, + DAMOS_QUOTA_NODE_MEMCG_USED_BP, + DAMOS_QUOTA_NODE_MEMCG_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -166,6 +176,7 @@ enum damos_quota_goal_metric { * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI * @nid: Node id. + * @memcg_id: Memcg id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -176,6 +187,12 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually * entered by the user, probably inside the kdamond callbacks. Otherwise, * DAMON sets @current_value with self-measured value of @metric. + * + * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node + * id of the target node to account the used/free memory. + * + * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id + * represents the node id and the cgroup to account the used memory for. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; @@ -184,7 +201,10 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; - int nid; + struct { + int nid; + unsigned short memcg_id; + }; }; struct list_head list; }; @@ -472,7 +492,7 @@ struct damos_migrate_dests { * @wmarks: Watermarks for automated (in)activation of this scheme. * @migrate_dests: Destination nodes if @action is "migrate_{hot,cold}". * @target_nid: Destination node if @action is "migrate_{hot,cold}". - * @filters: Additional set of &struct damos_filter for &action. + * @core_filters: Additional set of &struct damos_filter for &action. * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. @@ -498,7 +518,7 @@ struct damos_migrate_dests { * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect - * &filters + * &core_filters * * The minimum entity that @action can be applied depends on the underlying * &struct damon_operations. 
Since it may not be aligned with the core layer @@ -542,7 +562,7 @@ struct damos { struct damos_migrate_dests migrate_dests; }; }; - struct list_head filters; + struct list_head core_filters; struct list_head ops_filters; void *last_applied; struct damos_stat stat; @@ -851,11 +871,11 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damos_for_each_quota_goal_safe(goal, next, quota) \ list_for_each_entry_safe(goal, next, &(quota)->goals, list) -#define damos_for_each_filter(f, scheme) \ - list_for_each_entry(f, &(scheme)->filters, list) +#define damos_for_each_core_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->core_filters, list) -#define damos_for_each_filter_safe(f, next, scheme) \ - list_for_each_entry_safe(f, next, &(scheme)->filters, list) +#define damos_for_each_core_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->core_filters, list) #define damos_for_each_ops_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->ops_filters, list) @@ -947,7 +967,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end); + unsigned long *start, unsigned long *end, + unsigned long min_sz_region); #endif /* CONFIG_DAMON */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c83e02b94389..898c60d21c92 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -198,7 +198,6 @@ enum dentry_flags { DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. */ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), - DCACHE_GENOCIDE = BIT(9), DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* @@ -225,6 +224,7 @@ enum dentry_flags { DCACHE_PAR_LOOKUP = BIT(24), /* being looked up (with parent locked shared) */ DCACHE_DENTRY_CURSOR = BIT(25), DCACHE_NORCU = BIT(26), /* No RCU delay for freeing */ + DCACHE_PERSISTENT = BIT(27) }; #define DCACHE_MANAGED_DENTRY \ @@ -268,6 +268,8 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); +extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); @@ -610,5 +612,7 @@ static inline struct dentry *d_next_sibling(const struct dentry *dentry) } void set_default_d_op(struct super_block *, const struct dentry_operations *); +struct dentry *d_make_persistent(struct dentry *, struct inode *); +void d_make_discardable(struct dentry *dentry); #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/err.h b/include/linux/err.h index 1d60aa86db53..8c37be0620ab 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -41,6 +41,14 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } +/** + * INIT_ERR_PTR - Init a const error pointer. + * @error: A negative error code. + * + * Like ERR_PTR(), but usable to initialize static variables. + */ +#define INIT_ERR_PTR(error) ((void *)(error)) + /* Return the pointer in the percpu address space. 
*/ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) diff --git a/include/linux/firmware/qcom/qcom_tzmem.h b/include/linux/firmware/qcom/qcom_tzmem.h index 48ac0e5454c7..23173e0c3ddd 100644 --- a/include/linux/firmware/qcom/qcom_tzmem.h +++ b/include/linux/firmware/qcom/qcom_tzmem.h @@ -17,11 +17,20 @@ struct qcom_tzmem_pool; * enum qcom_tzmem_policy - Policy for pool growth. */ enum qcom_tzmem_policy { - /**< Static pool, never grow above initial size. */ + /** + * @QCOM_TZMEM_POLICY_STATIC: Static pool, + * never grow above initial size. + */ QCOM_TZMEM_POLICY_STATIC = 1, - /**< When out of memory, add increment * current size of memory. */ + /** + * @QCOM_TZMEM_POLICY_MULTIPLIER: When out of memory, + * add increment * current size of memory. + */ QCOM_TZMEM_POLICY_MULTIPLIER, - /**< When out of memory add as much as is needed until max_size. */ + /** + * @QCOM_TZMEM_POLICY_ON_DEMAND: When out of memory + * add as much as is needed until max_size. + */ QCOM_TZMEM_POLICY_ON_DEMAND, }; diff --git a/include/linux/firmware/xlnx-zynqmp-ufs.h b/include/linux/firmware/xlnx-zynqmp-ufs.h new file mode 100644 index 000000000000..d3538dd5822a --- /dev/null +++ b/include/linux/firmware/xlnx-zynqmp-ufs.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Firmware layer for UFS APIs. + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + */ + +#ifndef __FIRMWARE_XLNX_ZYNQMP_UFS_H__ +#define __FIRMWARE_XLNX_ZYNQMP_UFS_H__ + +#if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) +int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready); +int zynqmp_pm_is_sram_init_done(bool *is_done); +int zynqmp_pm_set_sram_bypass(void); +int zynqmp_pm_get_ufs_calibration_values(u32 *val); +#else +static inline int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_is_sram_init_done(bool *is_done) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_sram_bypass(void) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_get_ufs_calibration_values(u32 *val) +{ + return -ENODEV; +} +#endif + +#endif /* __FIRMWARE_XLNX_ZYNQMP_UFS_H__ */ diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ae48d619c4e0..15fdbd089bbf 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -3,7 +3,7 @@ * Xilinx Zynq MPSoC Firmware layer * * Copyright (C) 2014-2021 Xilinx - * Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. + * Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc. 
* * Michal Simek <michal.simek@amd.com> * Davorin Mista <davorin.mista@aggios.com> @@ -16,6 +16,7 @@ #include <linux/types.h> #include <linux/err.h> +#include <linux/firmware/xlnx-zynqmp-ufs.h> #define ZYNQMP_PM_VERSION_MAJOR 1 #define ZYNQMP_PM_VERSION_MINOR 0 @@ -51,16 +52,10 @@ #define PM_PINCTRL_PARAM_SET_VERSION 2 -#define ZYNQMP_FAMILY_CODE 0x23 -#define VERSAL_FAMILY_CODE 0x26 - -/* When all subfamily of platform need to support */ -#define ALL_SUB_FAMILY_CODE 0x00 -#define VERSAL_SUB_FAMILY_CODE 0x01 -#define VERSALNET_SUB_FAMILY_CODE 0x03 - -#define FAMILY_CODE_MASK GENMASK(27, 21) -#define SUB_FAMILY_CODE_MASK GENMASK(20, 19) +/* Family codes */ +#define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */ +#define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */ +#define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */ #define API_ID_MASK GENMASK(7, 0) #define MODULE_ID_MASK GENMASK(11, 8) @@ -164,6 +159,7 @@ enum pm_api_cb_id { enum pm_api_id { PM_API_FEATURES = 0, PM_GET_API_VERSION = 1, + PM_GET_NODE_STATUS = 3, PM_REGISTER_NOTIFIER = 5, PM_FORCE_POWERDOWN = 8, PM_REQUEST_WAKEUP = 10, @@ -241,6 +237,7 @@ enum pm_ioctl_id { IOCTL_GET_FEATURE_CONFIG = 27, /* IOCTL for Secure Read/Write Interface */ IOCTL_READ_REG = 28, + IOCTL_MASK_WRITE_REG = 29, /* Dynamic SD/GEM configuration */ IOCTL_SET_SD_CONFIG = 30, IOCTL_SET_GEM_CONFIG = 31, @@ -564,7 +561,7 @@ int zynqmp_pm_invoke_fw_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...); #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); -int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily); +int zynqmp_pm_get_family_info(u32 *family); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); int zynqmp_pm_clock_enable(u32 clock_id); int zynqmp_pm_clock_disable(u32 clock_id); @@ -619,6 +616,9 @@ int zynqmp_pm_feature(const u32 api_id); int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); +int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value); +int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, + u32 mask, u32 value); int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset); int zynqmp_pm_force_pwrdwn(const u32 target, const enum zynqmp_pm_request_ack ack); @@ -629,6 +629,8 @@ int zynqmp_pm_request_wake(const u32 node, int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode); int zynqmp_pm_set_rpu_mode(u32 node_id, enum rpu_oper_mode rpu_mode); int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mode); +int zynqmp_pm_get_node_status(const u32 node, u32 *const status, + u32 *const requirements, u32 *const usage); int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, u32 value); @@ -643,7 +645,7 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) return -ENODEV; } -static inline int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily) +static inline int zynqmp_pm_get_family_info(u32 *family) { return -ENODEV; } @@ -916,6 +918,17 @@ static inline int zynqmp_pm_request_wake(const u32 node, return -ENODEV; } +static inline int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 
offset, + u32 mask, u32 value) +{ + return -ENODEV; +} + static inline int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode) { return -ENODEV; @@ -931,6 +944,13 @@ static inline int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mo return -ENODEV; } +static inline int zynqmp_pm_get_node_status(const u32 node, u32 *const status, + u32 *const requirements, + u32 *const usage) +{ + return -ENODEV; +} + static inline int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value) diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 7964db96e41a..0a3bcd1718f3 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -7,6 +7,7 @@ #include <linux/ftrace.h> #include <linux/rcupdate.h> #include <linux/refcount.h> +#include <linux/rhashtable.h> #include <linux/slab.h> struct fprobe; @@ -26,7 +27,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, * @fp: The fprobe which owns this. */ struct fprobe_hlist_node { - struct hlist_node hlist; + struct rhlist_head hlist; unsigned long addr; struct fprobe *fp; }; diff --git a/include/linux/fs.h b/include/linux/fs.h index ce25feb06727..04ceeca12a0d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2041,14 +2041,14 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma); -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } @@ -2310,7 +2310,6 @@ void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); -void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); @@ -3191,6 +3190,8 @@ extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); +extern void __simple_unlink(struct inode *, struct dentry *); +extern void __simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, @@ -3200,6 +3201,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); +extern void simple_remove_by_name(struct dentry *, const char *, + void (*callback)(struct dentry *)); extern void locked_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); @@ -3229,6 +3232,7 @@ extern int simple_fill_super(struct super_block *, unsigned long, extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void 
simple_release_fs(struct vfsmount **mount, int *count); struct dentry *simple_start_creating(struct dentry *, const char *); +void simple_done_creating(struct dentry *); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015dd1049bea..770f0dc993cc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1167,17 +1167,14 @@ static inline void ftrace_init(void) { } */ struct ftrace_graph_ent { unsigned long func; /* Current function */ - int depth; + unsigned long depth; } __packed; /* * Structure that defines an entry function trace with retaddr. - * It's already packed but the attribute "packed" is needed - * to remove extra padding at the end. */ struct fgraph_retaddr_ent { - unsigned long func; /* Current function */ - int depth; + struct ftrace_graph_ent ent; unsigned long retaddr; /* Return address */ } __packed; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 623bee335383..b155929af5b1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order); #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 11cab07f322a..ae7f21aad0ac 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -364,20 +364,35 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); +enum split_type { + SPLIT_TYPE_UNIFORM, + SPLIT_TYPE_NON_UNIFORM, +}; + bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); +int folio_split_unmapped(struct folio *folio, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); -/* - * try_folio_split_to_order - try to split a @folio at @page to @new_order using - * non uniform split. + +static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) +{ + return __split_huge_page_to_list_to_order(page, list, new_order); +} +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + return split_huge_page_to_list_to_order(page, NULL, new_order); +} + +/** + * try_folio_split_to_order() - try to split a @folio at @page to @new_order + * using non uniform split. 
* @folio: folio to be split * @page: split to @new_order at the given page * @new_order: the target split order @@ -387,14 +402,13 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page, * folios are put back to LRU list. Use min_order_for_split() to get the lower * bound of @new_order. * - * Return: 0: split is successful, otherwise split failed. + * Return: 0 - split is successful, otherwise split failed. */ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { - if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) - return split_huge_page_to_list_to_order(&folio->page, NULL, - new_order); + if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false)) + return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) @@ -402,14 +416,43 @@ static inline int split_huge_page(struct page *page) return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg); +#endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); +/** + * pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry? + * @pmd: The PMD to check. + * + * A huge PMD entry is a non-empty entry which is present and marked huge or a + * software leaf entry. This check be performed without the appropriate locks + * held, in which case the condition should be rechecked after they are + * acquired. + * + * Returns: true if this PMD is huge, false otherwise. + */ +static inline bool pmd_is_huge(pmd_t pmd) +{ + if (pmd_present(pmd)) { + return pmd_trans_huge(pmd); + } else if (!pmd_none(pmd)) { + /* + * Non-present PMDs must be valid huge non-present entries. We + * cannot assert that here due to header dependency issues. + */ + return true; + } + + return false; +} + #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)) \ + if (pmd_is_huge(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false); \ } while (0) @@ -447,19 +490,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); -static inline int is_swap_pmd(pmd_t pmd) -{ - return !pmd_none(pmd) && !pmd_present(pmd); -} - /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_is_huge(*pmd)) return __pmd_trans_huge_lock(pmd, vma); - else - return NULL; + + return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) @@ -473,6 +511,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, /** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test + * + * Return: true - @folio can be mapped, false - @folio cannot be mapped. 
*/ static inline bool folio_test_pmd_mappable(struct folio *folio) { @@ -481,6 +521,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf); + extern struct folio *huge_zero_folio; extern unsigned long huge_zero_pfn; @@ -524,6 +566,8 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -576,6 +620,11 @@ split_huge_page_to_list_to_order(struct page *page, struct list_head *list, VM_WARN_ON_ONCE_PAGE(1, page); return -EINVAL; } +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; +} static inline int split_huge_page(struct page *page) { VM_WARN_ON_ONCE_PAGE(1, page); @@ -602,6 +651,7 @@ static inline int try_folio_split_to_order(struct folio *folio, } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} +static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) @@ -642,10 +692,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, struct vm_area_struct *next) { } -static inline int is_swap_pmd(pmd_t pmd) -{ - return 0; -} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { @@ -662,6 +708,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) return 0; } +static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + return 0; +} + static inline bool is_huge_zero_folio(const struct folio *folio) { return false; @@ -682,12 +733,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm) return; } -static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - return NULL; -} - static inline bool thp_migration_supported(void) { return false; @@ -720,6 +765,11 @@ static inline struct folio *get_persistent_huge_zero_folio(void) { return NULL; } + +static inline bool pmd_is_huge(pmd_t pmd) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8e63e46b8e1f..019a1c5281e4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags); + struct vm_area_desc *desc, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -172,7 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); -extern int sysctl_hugetlb_shm_group; +extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); @@ -275,11 +274,10 @@ 
void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); -bool is_hugetlb_entry_migration(pte_t pte); -bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -466,6 +464,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} +static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + return 0; +} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..a27aa0162918 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -2,22 +2,27 @@ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H -#ifdef CONFIG_HUGETLB_PAGE - #include <linux/mm.h> -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +#ifdef CONFIG_HUGETLB_PAGE + +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(vm_flags & VM_HUGETLB); } #else -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { return false; } #endif +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return is_vm_hugetlb_flags(vma->vm_flags); +} + #endif diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 801b2bd9e8d4..8c66284a91a8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1135,7 +1135,9 @@ struct iommu_sva { struct iommu_mm_data { u32 pasid; + struct mm_struct *mm; struct list_head sva_domains; + struct list_head mm_list_elm; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); @@ -1616,6 +1618,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1640,6 +1643,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 2223f95079ce..d45fa19f9e47 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -86,7 +86,13 @@ #define GICH_HCR_EN (1 << 0) #define GICH_HCR_UIE (1 << 1) +#define GICH_HCR_LRENPIE (1 << 2) #define GICH_HCR_NPIE (1 << 3) +#define GICH_HCR_VGrp0EIE (1 << 4) +#define GICH_HCR_VGrp0DIE (1 << 5) +#define GICH_HCR_VGrp1EIE (1 << 6) +#define GICH_HCR_VGrp1DIE (1 << 7) +#define GICH_HCR_EOICOUNT GENMASK(31, 27) #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h index a470a73a805a..67d9d960273b 100644 
--- a/include/linux/irqchip/arm-vgic-info.h +++ b/include/linux/irqchip/arm-vgic-info.h @@ -24,6 +24,8 @@ struct gic_kvm_info { enum gic_type type; /* Virtual CPU interface */ struct resource vcpu; + /* GICv2 GICC VA */ + void __iomem *gicc_base; /* Interrupt number */ unsigned int maint_irq; /* No interrupt mask, no need to use the above field */ diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 0d1927da8055..fdef2c155c27 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -611,4 +611,16 @@ extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 +struct ctl_table; +int proc_dointvec_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos); +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos); +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos); + #endif diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d12e1a5f5a9a..f335c1d7b61d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); -void kasan_release_vmalloc(unsigned long start, unsigned long end, +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); +static inline int kasan_populate_vmalloc(unsigned long addr, + unsigned long size, gfp_t gfp_mask) +{ + if (kasan_enabled()) + return __kasan_populate_vmalloc(addr, size, gfp_mask); + return 0; +} +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags); +static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end, + unsigned long flags) +{ + if (kasan_enabled()) + return __kasan_release_vmalloc(start, end, free_region_start, + free_region_end, flags); +} #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index f2fd221107bb..7da9fd506b39 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr); * @prot: page protection flags used for vmap. * @pages: array of pages. * @page_shift: page_shift passed to vmap_range_noflush(). + * @gfp_mask: gfp_mask to use internally. * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in * vmalloc metadata address range. Returns 0 on success, callers must check @@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift); + unsigned int page_shift, + gfp_t gfp_mask); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. 
@@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr) static inline int __must_check kmsan_vmap_pages_range_noflush( unsigned long start, unsigned long end, pgprot_t prot, - struct page **pages, unsigned int page_shift) + struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return 0; } diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 067538fc4d58..c982694c987b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,7 +17,7 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); @@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, +static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { return vm_flags; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..d93f75b05ae2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -2437,18 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -#else -static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, - unsigned long arg) -{ - return -ENOIOCTLCMD; -} -#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/include/linux/leafops.h b/include/linux/leafops.h new file mode 100644 index 000000000000..cfafe7a5e7b1 --- /dev/null +++ b/include/linux/leafops.h @@ -0,0 +1,619 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Describes operations that can be performed on software-defined page table + * leaf entries. These are abstracted from the hardware page table entries + * themselves by the softleaf_t type, see mm_types.h. + */ +#ifndef _LINUX_LEAFOPS_H +#define _LINUX_LEAFOPS_H + +#include <linux/mm_types.h> +#include <linux/swapops.h> +#include <linux/swap.h> + +#ifdef CONFIG_MMU + +/* Temporary until swp_entry_t eliminated. */ +#define LEAF_TYPE_SHIFT SWP_TYPE_SHIFT + +enum softleaf_type { + /* Fundamental types. */ + SOFTLEAF_NONE, + SOFTLEAF_SWAP, + /* Migration types. */ + SOFTLEAF_MIGRATION_READ, + SOFTLEAF_MIGRATION_READ_EXCLUSIVE, + SOFTLEAF_MIGRATION_WRITE, + /* Device types. */ + SOFTLEAF_DEVICE_PRIVATE_READ, + SOFTLEAF_DEVICE_PRIVATE_WRITE, + SOFTLEAF_DEVICE_EXCLUSIVE, + /* H/W posion types. */ + SOFTLEAF_HWPOISON, + /* Marker types. */ + SOFTLEAF_MARKER, +}; + +/** + * softleaf_mk_none() - Create an empty ('none') leaf entry. + * Returns: empty leaf entry. 
+ */ +static inline softleaf_t softleaf_mk_none(void) +{ + return ((softleaf_t) { 0 }); +} + +/** + * softleaf_from_pte() - Obtain a leaf entry from a PTE entry. + * @pte: PTE entry. + * + * If @pte is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pte(pte_t pte) +{ + softleaf_t arch_entry; + + if (pte_present(pte) || pte_none(pte)) + return softleaf_mk_none(); + + pte = pte_swp_clear_flags(pte); + arch_entry = __pte_to_swp_entry(pte); + + /* Temporary until swp_entry_t eliminated. */ + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +/** + * softleaf_to_pte() - Obtain a PTE entry from a leaf entry. + * @entry: Leaf entry. + * + * This generates an architecture-specific PTE entry that can be utilised to + * encode the metadata the leaf entry encodes. + * + * Returns: Architecture-specific PTE entry encoding leaf entry. + */ +static inline pte_t softleaf_to_pte(softleaf_t entry) +{ + /* Temporary until swp_entry_t eliminated. */ + return swp_entry_to_pte(entry); +} + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/** + * softleaf_from_pmd() - Obtain a leaf entry from a PMD entry. + * @pmd: PMD entry. + * + * If @pmd is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + softleaf_t arch_entry; + + if (pmd_present(pmd) || pmd_none(pmd)) + return softleaf_mk_none(); + + if (pmd_swp_soft_dirty(pmd)) + pmd = pmd_swp_clear_soft_dirty(pmd); + if (pmd_swp_uffd_wp(pmd)) + pmd = pmd_swp_clear_uffd_wp(pmd); + arch_entry = __pmd_to_swp_entry(pmd); + + /* Temporary until swp_entry_t eliminated. */ + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +#else + +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + return softleaf_mk_none(); +} + +#endif + +/** + * softleaf_is_none() - Is the leaf entry empty? + * @entry: Leaf entry. + * + * Empty entries are typically the result of a 'none' page table leaf entry + * being converted to a leaf entry. + * + * Returns: true if the entry is empty, false otherwise. + */ +static inline bool softleaf_is_none(softleaf_t entry) +{ + return entry.val == 0; +} + +/** + * softleaf_type() - Identify the type of leaf entry. + * @enntry: Leaf entry. + * + * Returns: the leaf entry type associated with @entry. + */ +static inline enum softleaf_type softleaf_type(softleaf_t entry) +{ + unsigned int type_num; + + if (softleaf_is_none(entry)) + return SOFTLEAF_NONE; + + type_num = entry.val >> LEAF_TYPE_SHIFT; + + if (type_num < MAX_SWAPFILES) + return SOFTLEAF_SWAP; + + switch (type_num) { +#ifdef CONFIG_MIGRATION + case SWP_MIGRATION_READ: + return SOFTLEAF_MIGRATION_READ; + case SWP_MIGRATION_READ_EXCLUSIVE: + return SOFTLEAF_MIGRATION_READ_EXCLUSIVE; + case SWP_MIGRATION_WRITE: + return SOFTLEAF_MIGRATION_WRITE; +#endif +#ifdef CONFIG_DEVICE_PRIVATE + case SWP_DEVICE_WRITE: + return SOFTLEAF_DEVICE_PRIVATE_WRITE; + case SWP_DEVICE_READ: + return SOFTLEAF_DEVICE_PRIVATE_READ; + case SWP_DEVICE_EXCLUSIVE: + return SOFTLEAF_DEVICE_EXCLUSIVE; +#endif +#ifdef CONFIG_MEMORY_FAILURE + case SWP_HWPOISON: + return SOFTLEAF_HWPOISON; +#endif + case SWP_PTE_MARKER: + return SOFTLEAF_MARKER; + } + + /* Unknown entry type. 
*/ + VM_WARN_ON_ONCE(1); + return SOFTLEAF_NONE; +} + +/** + * softleaf_is_swap() - Is this leaf entry a swap entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a swap entry, otherwise false. + */ +static inline bool softleaf_is_swap(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_SWAP; +} + +/** + * softleaf_is_migration_write() - Is this leaf entry a writable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a writable migration entry, otherwise + * false. + */ +static inline bool softleaf_is_migration_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE; +} + +/** + * softleaf_is_migration_read() - Is this leaf entry a readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a readable migration entry, otherwise + * false. + */ +static inline bool softleaf_is_migration_read(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ; +} + +/** + * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive + * readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is an exclusive readable migration entry, + * otherwise false. + */ +static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE; +} + +/** + * softleaf_is_migration() - Is this leaf entry a migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a migration entry, otherwise false. + */ +static inline bool softleaf_is_migration(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_MIGRATION_READ: + case SOFTLEAF_MIGRATION_READ_EXCLUSIVE: + case SOFTLEAF_MIGRATION_WRITE: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_private_write() - Is this leaf entry a device private + * writable entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private writable entry, otherwise + * false. + */ +static inline bool softleaf_is_device_private_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_PRIVATE_WRITE; +} + +/** + * softleaf_is_device_private() - Is this leaf entry a device private entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private entry, otherwise false. + */ +static inline bool softleaf_is_device_private(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_DEVICE_PRIVATE_WRITE: + case SOFTLEAF_DEVICE_PRIVATE_READ: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_exclusive() - Is this leaf entry a device-exclusive entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device-exclusive entry, otherwise false. + */ +static inline bool softleaf_is_device_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_EXCLUSIVE; +} + +/** + * softleaf_is_hwpoison() - Is this leaf entry a hardware poison entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a hardware poison entry, otherwise false. + */ +static inline bool softleaf_is_hwpoison(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_HWPOISON; +} + +/** + * softleaf_is_marker() - Is this leaf entry a marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a marker entry, otherwise false. 
+ */ +static inline bool softleaf_is_marker(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MARKER; +} + +/** + * softleaf_to_marker() - Obtain marker associated with leaf entry. + * @entry: Leaf entry, softleaf_is_marker(@entry) must return true. + * + * Returns: Marker associated with the leaf entry. + */ +static inline pte_marker softleaf_to_marker(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_marker(entry)); + + return swp_offset(entry) & PTE_MARKER_MASK; +} + +/** + * softleaf_has_pfn() - Does this leaf entry encode a valid PFN number? + * @entry: Leaf entry. + * + * A pfn swap entry is a special type of swap entry that always has a pfn stored + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. + * + * Returns: true if the leaf entry encodes a PFN, otherwise false. + */ +static inline bool softleaf_has_pfn(softleaf_t entry) +{ + /* Make sure the swp offset can always store the needed fields. */ + BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); + + if (softleaf_is_migration(entry)) + return true; + if (softleaf_is_device_private(entry)) + return true; + if (softleaf_is_device_exclusive(entry)) + return true; + if (softleaf_is_hwpoison(entry)) + return true; + + return false; +} + +/** + * softleaf_to_pfn() - Obtain PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: The PFN associated with the leaf entry. + */ +static inline unsigned long softleaf_to_pfn(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return swp_offset(entry) & SWP_PFN_MASK; +} + +/** + * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct page associated with the leaf entry's PFN. + */ +static inline struct page *softleaf_to_page(softleaf_t entry) +{ + struct page *page = pfn_to_page(softleaf_to_pfn(entry)); + + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + + return page; +} + +/** + * softleaf_to_folio() - Obtains struct folio for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct folio associated with the leaf entry's PFN. + */ +static inline struct folio *softleaf_to_folio(softleaf_t entry) +{ + struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); + + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked. + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && + !folio_test_locked(folio)); + + return folio; +} + +/** + * softleaf_is_poison_marker() - Is this leaf entry a poison marker? + * @entry: Leaf entry. + * + * The poison marker is set via UFFDIO_POISON. Userfaultfd-specific. + * + * Returns: true if the leaf entry is a poison marker, otherwise false. + */ +static inline bool softleaf_is_poison_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_POISONED; +} + +/** + * softleaf_is_guard_marker() - Is this leaf entry a guard region marker? + * @entry: Leaf entry. 
+ * + * Returns: true if the leaf entry is a guard marker, otherwise false. + */ +static inline bool softleaf_is_guard_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_GUARD; +} + +/** + * softleaf_is_uffd_wp_marker() - Is this leaf entry a userfautlfd write protect + * marker? + * @entry: Leaf entry. + * + * Userfaultfd-specific. + * + * Returns: true if the leaf entry is a UFFD WP marker, otherwise false. + */ +static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; +} + +#ifdef CONFIG_MIGRATION + +/** + * softleaf_is_migration_young() - Does this migration entry contain an accessed + * bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the accessed (or 'young') bit was set on the migrated page + * table entry. + * + * Returns: true if the entry contains an accessed bit, otherwise false. + */ +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_YOUNG; + /* Keep the old behavior of aging page after migration */ + return false; +} + +/** + * softleaf_is_migration_dirty() - Does this migration entry contain a dirty bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the dirty bit was set on the migrated page table entry. + * + * Returns: true if the entry contains a dirty bit, otherwise false. + */ +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_DIRTY; + /* Keep the old behavior of clean page after migration */ + return false; +} + +#else /* CONFIG_MIGRATION */ + +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + return false; +} + +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + return false; +} +#endif /* CONFIG_MIGRATION */ + +/** + * pte_is_marker() - Does the PTE entry encode a marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a marker leaf entry, otherwise false. + */ +static inline bool pte_is_marker(pte_t pte) +{ + return softleaf_is_marker(softleaf_from_pte(pte)); +} + +/** + * pte_is_uffd_wp_marker() - Does this PTE entry encode a userfaultfd write + * protect marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a UFFD WP marker leaf entry, otherwise false. + */ +static inline bool pte_is_uffd_wp_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + return softleaf_is_uffd_wp_marker(entry); +} + +/** + * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker + * leaf entry? + * @entry: Leaf entry. + * + * It's useful to be able to determine which leaf entries encode UFFD-specific + * markers so we can handle these correctly. + * + * Returns: true if this PTE entry is a UFFD-specific marker, otherwise false. + */ +static inline bool pte_is_uffd_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + if (!softleaf_is_marker(entry)) + return false; + + /* UFFD WP, poisoned swap entries are UFFD-handled. 
*/ + if (softleaf_is_uffd_wp_marker(entry)) + return true; + if (softleaf_is_poison_marker(entry)) + return true; + + return false; +} + +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * pmd_is_device_private_entry() - Check if PMD contains a device private swap + * entry. + * @pmd: The PMD to check. + * + * Returns true if the PMD contains a swap entry that represents a device private + * page mapping. This is used for zone device private pages that have been + * swapped out but still need special handling during various memory management + * operations. + * + * Return: true if PMD contains device private entry, false otherwise + */ +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return softleaf_is_device_private(softleaf_from_pmd(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return false; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +/** + * pmd_is_migration_entry() - Does this PMD entry encode a migration entry? + * @pmd: PMD entry. + * + * Returns: true if the PMD encodes a migration entry, otherwise false. + */ +static inline bool pmd_is_migration_entry(pmd_t pmd) +{ + return softleaf_is_migration(softleaf_from_pmd(pmd)); +} + +/** + * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry? + * @pmd: PMD entry. + * + * PMD leaf entries are valid only if they are device private or migration + * entries. This function asserts that a PMD leaf entry is valid in this + * respect. + * + * Returns: true if the PMD entry is a valid leaf entry, otherwise false. + */ +static inline bool pmd_is_valid_softleaf(pmd_t pmd) +{ + const softleaf_t entry = softleaf_from_pmd(pmd); + + /* Only device private, migration entries valid for PMD. */ + return softleaf_is_device_private(entry) || + softleaf_is_migration(entry); +} + +#endif /* CONFIG_MMU */ +#endif /* _LINUX_LEAFOPS_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d..0651865a4564 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,6 +52,7 @@ enum memcg_memory_event { MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, + MEMCG_SOCK_THROTTLED, MEMCG_NR_MEMORY_EVENTS, }; @@ -956,17 +957,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); -void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); - -static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_kmem_state(p, idx, val); - local_irq_restore(flags); -} +void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); @@ -1001,36 +992,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void __memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event, - bool allow_spinning) -{ - bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || - event == MEMCG_SWAP_FAIL; - - /* For now only MEMCG_MAX can happen with !allow_spinning context. 
*/ - VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); - - atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event && allow_spinning) - cgroup_file_notify(&memcg->events_local_file); - - do { - atomic_long_inc(&memcg->memory_events[event]); - if (allow_spinning) { - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); - } - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; - if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) - break; - } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); -} +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning); static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) @@ -1430,14 +1393,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } -static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - struct page *page = virt_to_head_page(p); - - __mod_node_page_state(page_pgdat(page), idx, val); -} - static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1497,16 +1452,6 @@ struct slabobj_ext { #endif } __aligned(8); -static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) -{ - __mod_lruvec_kmem_state(p, idx, 1); -} - -static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) -{ - __mod_lruvec_kmem_state(p, idx, -1); -} - static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; @@ -1674,6 +1619,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return shrinker->id; +} #else #define mem_cgroup_sockets_enabled 0 @@ -1705,6 +1655,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return -1; +} #endif #ifdef CONFIG_MEMCG @@ -1791,6 +1746,13 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); +void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return memcg ? 
css_is_dying(&memcg->css) : false; +} + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1857,6 +1819,15 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return true; } + +static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) +{ +} + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return false; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 000000000000..bc326503d2d2 --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include <linux/interval_tree.h> + +struct pfn_address_space; + +struct pfn_address_space { + struct interval_tree_node node; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/memory.h b/include/linux/memory.h index ba1515160894..faeaa921e55b 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -64,9 +64,19 @@ struct memory_group { }; }; +enum memory_block_state { + /* These states are exposed to userspace as text strings in sysfs */ + MEM_ONLINE, /* exposed to userspace */ + MEM_GOING_OFFLINE, /* exposed to userspace */ + MEM_OFFLINE, /* exposed to userspace */ + MEM_GOING_ONLINE, + MEM_CANCEL_ONLINE, + MEM_CANCEL_OFFLINE, +}; + struct memory_block { unsigned long start_section_nr; - unsigned long state; /* serialized by the dev->lock */ + enum memory_block_state state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* @@ -89,14 +99,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn); unsigned long memory_block_size_bytes(void); int set_memory_block_size_order(unsigned int order); -/* These states are exposed to userspace as text strings in sysfs */ -#define MEM_ONLINE (1<<0) /* exposed to userspace */ -#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ -#define MEM_OFFLINE (1<<2) /* exposed to userspace */ -#define MEM_GOING_ONLINE (1<<3) -#define MEM_CANCEL_ONLINE (1<<4) -#define MEM_CANCEL_OFFLINE (1<<5) - struct memory_notify { unsigned long start_pfn; unsigned long nr_pages; @@ -130,7 +132,7 @@ static inline int register_memory_notifier(struct notifier_block *nb) static inline void unregister_memory_notifier(struct notifier_block *nb) { } -static inline int memory_notify(unsigned long val, void *v) +static inline int memory_notify(enum memory_block_state state, void *v) { return 0; } @@ -154,7 +156,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); -extern int memory_notify(unsigned long val, void *v); +extern int memory_notify(enum memory_block_state state, void *v); extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, diff --git a/include/linux/memregion.h b/include/linux/memregion.h index c01321467789..a55f62cc5266 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h 
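For illustration only, not part of the patch: a driver that exposes device PFNs through a file mapping might hook into the new include/linux/memory-failure.h interface above roughly as follows. The function and variable names are hypothetical, and treating the interval_tree_node start/last fields as an inclusive PFN range is an assumption about how the interval tree is keyed.

static struct pfn_address_space my_pfn_space;	/* hypothetical driver state */

static int my_driver_enable_mf(struct address_space *mapping,
			       unsigned long first_pfn, unsigned long nr_pages)
{
	/* Describe the PFN interval this mapping covers (inclusive). */
	my_pfn_space.node.start = first_pfn;
	my_pfn_space.node.last = first_pfn + nr_pages - 1;
	my_pfn_space.mapping = mapping;

	/* Allow memory-failure handling to find the mapping for these PFNs. */
	return register_pfn_address_space(&my_pfn_space);
}

static void my_driver_disable_mf(void)
{
	unregister_pfn_address_space(&my_pfn_space);
}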
@@ -26,8 +26,10 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for - * memregions described by @res_desc - * @res_desc: one of the IORES_DESC_* types + * memregion + * @start: start physical address of the target memory region. + * @len: length of the target memory region. -1 for all the regions of + * the target type. * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -46,7 +48,7 @@ static inline void memregion_free(int id) * the cache maintenance. */ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(int res_desc); +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -54,10 +56,16 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(int res_desc) +static inline int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; } #endif + +static inline int cpu_cache_invalidate_all(void) +{ + return cpu_cache_invalidate_memregion(0, -1); +} + #endif /* _MEMREGION_H_ */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 30c7aecbd245..713ec0435b48 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -76,11 +76,11 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 0. The reference count will be + * Called once the folio refcount reaches 0. The reference count will be * reset to one by the core code after the method is called to prepare - * for handing out the page again. + * for handing out the folio again. */ - void (*page_free)(struct page *page); + void (*folio_free)(struct folio *folio); /* * Used for private (un-addressable) device memory only. Must migrate @@ -99,6 +99,13 @@ struct dev_pagemap_ops { */ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, unsigned long nr_pages, int mf_flags); + + /* + * Used for private (un-addressable) device memory only. 
+ * This callback is used when a folio is split into + * a smaller folio + */ + void (*folio_split)(struct folio *head, struct folio *tail); }; #define PGMAP_ALTMAP_VALID (1 << 0) @@ -176,6 +183,18 @@ static inline bool folio_is_pci_p2pdma(const struct folio *folio) folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } +static inline void *folio_zone_device_data(const struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + return folio->page.zone_device_data; +} + +static inline void folio_set_zone_device_data(struct folio *folio, void *data) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + folio->page.zone_device_data = data; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && @@ -205,7 +224,7 @@ static inline bool is_fsdax_page(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void zone_device_page_init(struct page *page); +void zone_device_page_init(struct page *page, unsigned int order); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -214,6 +233,31 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); + +static inline void zone_device_folio_init(struct folio *folio, unsigned int order) +{ + zone_device_page_init(&folio->page, order); + if (order) + folio_set_large_rmappable(folio); +} + +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ + if (folio_is_device_private(original_folio)) { + if (!original_folio->pgmap->ops->folio_split) { + if (new_folio) { + new_folio->pgmap = original_folio->pgmap; + new_folio->page.mapping = + original_folio->page.mapping; + } + } else { + original_folio->pgmap->ops->folio_split(original_folio, + new_folio); + } + } +} + #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) @@ -247,6 +291,11 @@ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } + +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ +} #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1f0ac122c3bf..26ca00c325d9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, @@ -125,6 +125,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node) #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_COMPOUND (1UL << 4) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) @@ -143,6 +144,7 @@ enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, MIGRATE_VMA_SELECT_DEVICE_COHERENT = 
1 << 2, + MIGRATE_VMA_SELECT_COMPOUND = 1 << 3, }; struct migrate_vma { diff --git a/include/linux/mm.h b/include/linux/mm.h index 8dc0a07570cc..7a1819c20643 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly; # endif #endif +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + #include <asm/page.h> #include <asm/processor.h> @@ -273,178 +275,235 @@ extern unsigned int kobjsize(const void *objp); * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ -#define VM_NONE 0x00000000 -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 +#define VM_NONE 0x00000000 -/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT) +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ #ifdef CONFIG_MMU -#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ -#else /* CONFIG_MMU */ -#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ -#define VM_UFFD_MISSING 0 + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), #endif /* CONFIG_MMU */ -#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ - -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ -#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ -#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ -#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_SYNC 0x00800000 /* Synchronous page faults */ -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. 
*/ -#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ - + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. 
+ */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; +#undef DECLARE_VMA_BIT +#undef DECLARE_VMA_BIT_ALIAS + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) #ifdef CONFIG_MEM_SOFT_DIRTY -# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) #else -# define VM_SOFTDIRTY 0 +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE #endif - -#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ -#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ -#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */ - -#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS -#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit 
architectures */ -#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) -#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) -#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) -#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) -#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) -#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) -#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) -#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ - #ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. */ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) #if CONFIG_ARCH_PKEY_BITS > 3 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) #else -# define VM_PKEY_BIT3 0 -#endif +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ #if CONFIG_ARCH_PKEY_BITS > 4 -# define VM_PKEY_BIT4 VM_HIGH_ARCH_4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) #else -# define VM_PKEY_BIT4 0 -#endif +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ #endif /* CONFIG_ARCH_HAS_PKEYS */ - -#ifdef CONFIG_X86_USER_SHADOW_STACK -/* - * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of - * support core mm. - * - * These VMAs will get a single end guard page. This helps userspace protect - * itself from attacks. A single page is enough for current shadow stack archs - * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c - * for more details on the guard size. - */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#endif - -#if defined(CONFIG_ARM64_GCS) -/* - * arm64's Guarded Control Stack implements similar functionality and - * has similar constraints to shadow stacks. - */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_6 -#endif - -#ifndef VM_SHADOW_STACK -# define VM_SHADOW_STACK VM_NONE +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE #endif - #if defined(CONFIG_PPC64) -# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +#define VM_SAO INIT_VM_FLAG(SAO) #elif defined(CONFIG_PARISC) -# define VM_GROWSUP VM_ARCH_1 +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) #elif defined(CONFIG_SPARC64) -# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ -# define VM_ARCH_CLEAR VM_SPARC_ADI +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif defined(CONFIG_ARM64) -# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. 
GP bit */ -# define VM_ARCH_CLEAR VM_ARM64_BTI +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif !defined(CONFIG_MMU) -# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ -#endif - -#if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ -#else -# define VM_MTE VM_NONE -# define VM_MTE_ALLOWED VM_NONE +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) #endif - #ifndef VM_GROWSUP -# define VM_GROWSUP VM_NONE +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE #endif - #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -# define VM_UFFD_MINOR_BIT 41 -# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ -#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -# define VM_UFFD_MINOR VM_NONE -#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ - -/* - * This flag is used to connect VFIO to arch specific KVM code. It - * indicates that the memory under this VMA is safe for use with any - * non-cachable memory type inside KVM. Some VFIO devices, on some - * platforms, are thought to be unsafe and can cause machine crashes - * if KVM does not lock down the memory type. - */ -#ifdef CONFIG_64BIT -#define VM_ALLOW_ANY_UNCACHED_BIT 39 -#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) #else -#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_UFFD_MINOR VM_NONE #endif - #ifdef CONFIG_64BIT -#define VM_DROPPABLE_BIT 40 -#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) -#elif defined(CONFIG_PPC32) -#define VM_DROPPABLE VM_ARCH_1 +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) #else -#define VM_DROPPABLE VM_NONE +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE #endif - -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) #else -#define VM_SEALED VM_NONE +#define VM_DROPPABLE VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ @@ -470,12 +529,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 +#define VM_SEALED_SYSMAP VM_NONE #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) @@ -483,12 +540,26 @@ extern unsigned int kobjsize(const void *objp); /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) - /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). 
+ * VM_DONTDUMP
+ *	Omit vma from core dump, even when VM_IO turned off.
+ */
+#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)
+
 /* This mask prevents VMA from being scanned with khugepaged */
 #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
@@ -498,13 +569,69 @@ extern unsigned int kobjsize(const void *objp);
 /* This mask represents all the VMA flag bits used by mlock */
 #define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
+/* These flags can be updated atomically via VMA/mmap read lock. */
+#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD
+
 /* Arch-specific flags to clear when updating VM flags on protection change */
 #ifndef VM_ARCH_CLEAR
-# define VM_ARCH_CLEAR VM_NONE
+#define VM_ARCH_CLEAR VM_NONE
 #endif
 #define VM_FLAGS_CLEAR	(ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
 /*
+ * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
+ * possesses it but the other does not, the merged VMA should nonetheless have
+ * applied to it:
+ *
+ * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is, has not had its
+ *                references cleared via /proc/$pid/clear_refs, any merged VMA
+ *                should be considered soft-dirty also as it operates at a VMA
+ *                granularity.
+ *
+ * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that
+ *                  mapped page tables may contain metadata not described by the
+ *                  VMA and thus any merged VMA may also contain this metadata,
+ *                  and thus we must make this flag sticky.
+ */
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
+
+/*
+ * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
+ * of these flags and the other not does not preclude a merge.
+ *
+ * VM_STICKY - When merging VMAs, VMA flags must match, unless they are
+ *             'sticky'. If any sticky flags exist in either VMA, we simply
+ *             set all of them on the merged VMA.
+ */
+#define VM_IGNORE_MERGE VM_STICKY
+
+/*
+ * Flags which should result in page tables being copied on fork. These are
+ * flags which indicate that the VMA maps page tables which cannot be
+ * reconstituted upon page fault, and so necessitate page table copying upon
+ * fork:
+ *
+ * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
+ *                           reasonably reconstructed on page fault.
+ *
+ * VM_UFFD_WP - Encodes metadata about an installed uffd
+ *              write protect handler, which cannot be
+ *              reconstructed on page fault.
+ *
+ *              We always copy pgtables when dst_vma has uffd-wp
+ *              enabled even if it's file-backed
+ *              (e.g. shmem). Because when uffd-wp is enabled,
+ *              pgtable contains uffd-wp protection information,
+ *              that's something we can't retrieve from page cache,
+ *              and skipping the copy would lose that information.
+ *
+ * VM_MAYBE_GUARD - Could contain page guard region markers which
+ *                  by design are a property of the page tables
+ *                  only and thus cannot be reconstructed on page
+ *                  fault.
+ */
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
+/*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
*/ @@ -783,7 +910,9 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { - ACCESS_PRIVATE(vma, __vm_flags) = flags; + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); } /* @@ -794,6 +923,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma, static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_assert_write_locked(vma); vm_flags_init(vma, flags); } @@ -802,21 +932,33 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); - WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + /* + * If VMA flags exist beyond the first system word, also clear these. It + * is assumed the write once behaviour is required only for the first + * system word. + */ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + + bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } + + vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) |= flags; + vma_flags_set_word(&vma->flags, flags); } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + vma_flags_clear_word(&vma->flags, flags); } /* @@ -840,6 +982,51 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } +static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, + vma_flag_t bit) +{ + const vm_flags_t mask = BIT((__force int)bit); + + /* Only specific flags are permitted */ + if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) + return false; + + return true; +} + +/* + * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific + * valid flags are allowed to do this. + */ +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, + vma_flag_t bit) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + + /* mmap read lock/VMA read lock must be held. */ + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) + vma_assert_locked(vma); + + if (__vma_flag_atomic_valid(vma, bit)) + set_bit((__force int)bit, bitmap); +} + +/* + * Test for VMA flag atomically. Requires no locks. Only specific valid flags + * are allowed to do this. + * + * This is necessarily racey, so callers must ensure that serialisation is + * achieved through some other means, or that races are permissible. 
+ */
+static inline bool vma_flag_test_atomic(struct vm_area_struct *vma,
+		vma_flag_t bit)
+{
+	if (__vma_flag_atomic_valid(vma, bit))
+		return test_bit((__force int)bit, &vma->vm_flags);
+
+	return false;
+}
+
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
 {
 	vma->vm_ops = NULL;
@@ -2438,7 +2625,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
 }
 void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		struct vm_area_struct *start_vma, unsigned long start,
-		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
+		unsigned long end, unsigned long tree_end);
 
 struct mmu_notifier_range;
 
@@ -2922,6 +3109,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 #endif /* CONFIG_MMU */
 
 enum pt_flags {
+	PT_kernel = PG_referenced,
 	PT_reserved = PG_reserved,
 	/* High bits are used for zone/node/section */
 };
@@ -2948,6 +3136,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
 }
 
 /**
+ * ptdesc_set_kernel - Mark a ptdesc used to map the kernel
+ * @ptdesc: The ptdesc to be marked
+ *
+ * Kernel page tables often need special handling. Set a flag so that
+ * the handling code knows this ptdesc will not be used for userspace.
+ */
+static inline void ptdesc_set_kernel(struct ptdesc *ptdesc)
+{
+	set_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel
+ * @ptdesc: The ptdesc to be unmarked
+ *
+ * Use when the ptdesc is no longer used to map the kernel and no longer
+ * needs special handling.
+ */
+static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc)
+{
+	/*
+	 * Note: the 'PG_referenced' bit does not strictly need to be
+	 * cleared before freeing the page. But this is nice for
+	 * symmetry.
+	 */
+	clear_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel
+ * @ptdesc: The ptdesc being tested
+ *
+ * Call to tell if the ptdesc is used to map the kernel.
+ */
+static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
+{
+	return test_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
  * pagetable_alloc - Allocate pagetables
  * @gfp: GFP flags
 * @order: desired pagetable order
@@ -2965,6 +3193,21 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
 }
 
 #define pagetable_alloc(...)
alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) +static inline void __pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free_kernel(struct ptdesc *pt); +#else +static inline void pagetable_free_kernel(struct ptdesc *pt) +{ + __pagetable_free(pt); +} +#endif /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -2974,9 +3217,12 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde */ static inline void pagetable_free(struct ptdesc *pt) { - struct page *page = ptdesc_page(pt); - - __free_pages(page, compound_order(page)); + if (ptdesc_test_kernel(pt)) { + ptdesc_clear_kernel(pt); + pagetable_free_kernel(pt); + } else { + __pagetable_free(pt); + } } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) @@ -3560,6 +3806,90 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) +{ + return desc->end - desc->start; +} + +static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) +{ + return vma_desc_size(desc) >> PAGE_SHIFT; +} + +/** + * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN + * remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_remap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + /* [start, start + size) must be within the VMA. */ + WARN_ON_ONCE(start < desc->start || start >= desc->end); + WARN_ON_ONCE(start + size > desc->end); + + action->type = MMAP_REMAP_PFN; + action->remap.start = start; + action->remap.start_pfn = start_pfn; + action->remap.size = size; + action->remap.pgprot = desc->page_prot; +} + +/** + * mmap_action_remap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_remap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +/** + * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN + * I/O remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_ioremap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + mmap_action_remap(desc, start, start_pfn, size); + desc->action.type = MMAP_IO_REMAP_PFN; +} + +/** + * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN I/O remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. 
+ */ +static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc); +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma); + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) @@ -3601,10 +3931,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot); +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); @@ -3637,15 +3966,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -#ifndef io_remap_pfn_range -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, - unsigned long size, pgprot_t prot) +#ifndef io_remap_pfn_range_pfn +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { - return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); + return pfn; } #endif +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, + unsigned long size, pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range(vma, addr, pfn, size, prot); +} + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) @@ -4094,6 +4432,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; @@ -4222,16 +4561,6 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); - -/* - * mseal of userspace process's system mappings. - */ -#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS -#define VM_SEALED_SYSMAP VM_SEALED -#else -#define VM_SEALED_SYSMAP VM_NONE -#endif - /* * DMA mapping IDs for page_pool * diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f6a2b2d20016..fa2d6ba811b5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -8,7 +8,7 @@ #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> -#include <linux/swapops.h> +#include <linux/leafops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? 
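For illustration only, not part of the patch: the mmap_action_*() helpers added to include/linux/mm.h above let an mmap_prepare handler ask the core to PFN-remap the range once the VMA is fully set up, instead of calling remap_pfn_range() directly. A hypothetical character driver might use them as sketched below; struct my_dev, its fields and the fops wiring are assumptions, as is the file_operations ->mmap_prepare hook that receives the struct vm_area_desc.

struct my_dev {
	unsigned long base_pfn;	/* first PFN of the device aperture */
	unsigned long size;	/* aperture size in bytes */
};

static int my_dev_mmap_prepare(struct vm_area_desc *desc)
{
	struct my_dev *dev = desc->vm_file->private_data;
	unsigned long off = desc->pgoff << PAGE_SHIFT;

	if (off + vma_desc_size(desc) > dev->size)
		return -EINVAL;

	/* Have the core remap_pfn_range() the whole VMA after setup. */
	mmap_action_remap_full(desc, dev->base_pfn + desc->pgoff);

	return 0;
}

static const struct file_operations my_dev_fops = {
	.owner		= THIS_MODULE,
	.mmap_prepare	= my_dev_mmap_prepare,
};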
@@ -44,7 +44,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); - __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } @@ -541,9 +541,9 @@ static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( - swp_entry_t entry, struct vm_area_struct *dst_vma) + softleaf_t entry, struct vm_area_struct *dst_vma) { - pte_marker srcm = pte_marker_get(entry); + const pte_marker srcm = softleaf_to_marker(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); @@ -553,7 +553,6 @@ static inline pte_marker copy_pte_marker( return dstm; } -#endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to @@ -571,9 +570,11 @@ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; + if (!uffd_supports_wp_marker()) + return false; + /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); @@ -602,7 +603,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } -#endif + return false; } @@ -616,6 +617,7 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma) return true; } +#endif /** * num_pages_contiguous() - determine the number of contiguous pages diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b7d05e7169c..9f6de068295d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -286,6 +286,31 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * typedef softleaf_t - Describes a page table software leaf entry, abstracted + * from its architecture-specific encoding. + * + * Page table leaf entries are those which do not reference any descendent page + * tables but rather either reference a data page, are an empty (or 'none' + * entry), or contain a non-present entry. + * + * If referencing another page table or a data page then the page table entry is + * pertinent to hardware - that is it tells the hardware how to decode the page + * table entry. + * + * Otherwise it is a software-defined leaf page table entry, which this type + * describes. See leafops.h and specifically @softleaf_type for a list of all + * possible kinds of software leaf entry. + * + * A softleaf_t entry is abstracted from the hardware page table entry, so is + * not architecture-specific. + * + * NOTE: While we transition from the confusing swp_entry_t type used for this + * purpose, we simply alias this type. This will be removed once the + * transition is complete. + */ +typedef swp_entry_t softleaf_t; + #if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) /* We have some extra room after the refcount in tail pages. */ #define NR_PAGES_IN_LARGE_FOLIO @@ -774,6 +799,65 @@ struct pfnmap_track_ctx { }; #endif +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. 
*/
+};
+
+/*
+ * Describes an action an mmap_prepare hook can instruct to be taken to complete
+ * the mapping of a VMA. Specified in vm_area_desc.
+ */
+struct mmap_action {
+	union {
+		/* Remap range. */
+		struct {
+			unsigned long start;
+			unsigned long start_pfn;
+			unsigned long size;
+			pgprot_t pgprot;
+		} remap;
+	};
+	enum mmap_action_type type;
+
+	/*
+	 * If specified, this hook is invoked after the selected action has been
+	 * successfully completed. Note that the VMA write lock is still held.
+	 *
+	 * The absolute minimum ought to be done here.
+	 *
+	 * Returns 0 on success, or an error code.
+	 */
+	int (*success_hook)(const struct vm_area_struct *vma);
+
+	/*
+	 * If specified, this hook is invoked when an error occurs while
+	 * attempting the selected action.
+	 *
+	 * The hook can return an error code in order to filter the error, but
+	 * it is not valid to clear the error here.
+	 */
+	int (*error_hook)(int err);
+
+	/*
+	 * This should be set in rare instances where the operation requires
+	 * that the rmap should not be able to access the VMA until it is
+	 * completely set up.
+	 */
+	bool hide_from_rmap_until_complete :1;
+};
+
+/*
+ * Opaque type representing current VMA (vm_area_struct) flag state. Must be
+ * accessed via vma_flags_xxx() helper functions.
+ */
+#define NUM_VMA_FLAG_BITS BITS_PER_LONG
+typedef struct {
+	DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
+} __private vma_flags_t;
+
 /*
  * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
  * manipulate mutable fields which will cause those fields to be updated in the
@@ -791,12 +875,18 @@ struct vm_area_desc {
 	/* Mutable fields. Populated with initial state. */
 	pgoff_t pgoff;
 	struct file *vm_file;
-	vm_flags_t vm_flags;
+	union {
+		vm_flags_t vm_flags;
+		vma_flags_t vma_flags;
+	};
 	pgprot_t page_prot;
 
 	/* Write-only fields. */
 	const struct vm_operations_struct *vm_ops;
 	void *private_data;
+
+	/* Take further action? */
+	struct mmap_action action;
 };
 
 /*
@@ -833,10 +923,12 @@ struct vm_area_struct {
 	/*
 	 * Flags, see mm.h.
 	 * To modify use vm_flags_{init|reset|set|clear|mod} functions.
+	 * Preferably, use vma_flags_xxx() functions.
 	 */
 	union {
+		/* Temporary while VMA flags are being converted. */
 		const vm_flags_t vm_flags;
-		vm_flags_t __private __vm_flags;
+		vma_flags_t flags;
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
@@ -917,6 +1009,52 @@ struct vm_area_struct {
 #endif
 } __randomize_layout;
 
+/* Clears all bits in the VMA flags bitmap, non-atomically. */
+static inline void vma_flags_clear_all(vma_flags_t *flags)
+{
+	bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Copy value to the first system word of VMA flags, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
+{
+	*ACCESS_PRIVATE(flags, __vma_flags) = value;
+}
+
+/*
+ * Copy value to the first system word of VMA flags ONCE, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	WRITE_ONCE(*bitmap, value);
+}
+
+/* Update the first system word of VMA flags setting bits, non-atomically.
*/ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy) #else @@ -1194,15 +1332,13 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; -/* Set the first system word of mm flags, non-atomically. */ -static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +/* Copy value to the first system word of mm flags, non-atomically. */ +static inline void __mm_flags_overwrite_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); - - bitmap_copy(bitmap, &value, BITS_PER_LONG); + *ACCESS_PRIVATE(&mm->flags, __mm_flags) = value; } -/* Obtain a read-only view of the bitmap. */ +/* Obtain a read-only view of the mm flags bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); @@ -1211,9 +1347,7 @@ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct /* Read the first system word of mm flags, non-atomically. */ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) { - const unsigned long *bitmap = __mm_flags_get_bitmap(mm); - - return bitmap_read(bitmap, 0, BITS_PER_LONG); + return *__mm_flags_get_bitmap(mm); } /* diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 2c9fffa58714..d53f72dba7fe 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt) * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -195,7 +195,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l return (vma->vm_lock_seq == *mm_lock_seq); } -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, + int state); /* * Begin writing to a VMA. @@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - __vma_start_write(vma, mm_lock_seq); + __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE); +} + +/** + * vma_start_write_killable - Begin writing to a VMA. + * @vma: The VMA we are going to modify. + * + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. 
+ * + * Context: May sleep while waiting for readers to drop the vma read lock. + * Caller must already hold the mmap_lock for write. + * + * Return: 0 for a successful acquisition. -EINTR if a fatal signal was + * received. + */ +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) + return 0; + return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -281,11 +305,10 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) { return 0; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_assert_attached(struct vm_area_struct *vma) {} diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7fb7331c5725..4398e027f450 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1060,10 +1060,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail - * of the LRU. - */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ diff --git a/include/linux/node.h b/include/linux/node.h index 866e3323f1fd..0269b064ba65 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void) } #endif -extern void unregister_node(struct node *node); - struct node_notify { int nid; }; @@ -176,8 +174,8 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) #ifdef CONFIG_NUMA extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ -extern int register_one_node(int nid); -extern void unregister_one_node(int nid); +int register_node(int nid); +void unregister_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); @@ -189,11 +187,11 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid, static inline void node_dev_init(void) { } -static inline int register_one_node(int nid) +static inline int register_node(int nid) { return 0; } -static inline int unregister_one_node(int nid) +static inline int unregister_node(int nid) { return 0; } diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 725f95f7e416..736f633b2d5f 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -459,6 +459,18 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) struct_size((type *)NULL, member, count) /** + * struct_offset() - Calculate the offset of a member within a struct + * @p: Pointer to the struct + * @member: Name of the member to get the offset of + * + * Calculates the offset of a particular @member of the structure pointed + * to by @p. 
+ * + * Return: number of bytes to the location of @member. + */ +#define struct_offset(p, member) (offsetof(typeof(*(p)), member)) + +/** * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. * Enables caller macro to pass arbitrary trailing expressions * diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e601a3144f28..31a848485ad9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -651,9 +651,11 @@ static inline void *detach_page_private(struct page *page) } #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy); #else -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { return folio_alloc_noprof(gfp, order); } @@ -664,7 +666,7 @@ static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int o static inline struct page *__page_cache_alloc(gfp_t gfp) { - return &filemap_alloc_folio(gfp, 0)->page; + return &filemap_alloc_folio(gfp, 0, NULL)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) @@ -750,11 +752,17 @@ static inline fgf_t fgf_set_order(size_t size) } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp); +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); +static inline struct folio *__filemap_get_folio(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp) +{ + return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL); +} + /** * write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ee3148ef87f6..652f287c1ef6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1557,6 +1557,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * Some platforms can customize the PTE soft-dirty bit making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. + * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY + * is part of this macro. 
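+ *
+ * An architecture override could then look roughly like this (illustrative
+ * sketch; cpu_pte_soft_dirty_usable() is a made-up platform helper, not an
+ * existing interface):
+ *
+ *	#define pgtable_supports_soft_dirty()			\
+ *		(IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&		\
+ *		 cpu_pte_soft_dirty_usable())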
+ */ +#ifndef pgtable_supports_soft_dirty +#define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index f139377f4b31..19d1c5e5f335 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -66,8 +66,6 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; - struct dentry *proc_self; /* For /proc/self */ - struct dentry *proc_thread_self; /* For /proc/thread-self */ kgid_t pid_gid; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index e0dbcb4b4fd9..abcdee256c65 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -14,6 +14,39 @@ #include <uapi/linux/psp-sev.h> +/* As defined by SEV API, under "Guest Policy". */ +#define SEV_POLICY_MASK_NODBG BIT(0) +#define SEV_POLICY_MASK_NOKS BIT(1) +#define SEV_POLICY_MASK_ES BIT(2) +#define SEV_POLICY_MASK_NOSEND BIT(3) +#define SEV_POLICY_MASK_DOMAIN BIT(4) +#define SEV_POLICY_MASK_SEV BIT(5) +#define SEV_POLICY_MASK_API_MAJOR GENMASK(23, 16) +#define SEV_POLICY_MASK_API_MINOR GENMASK(31, 24) + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) +#define SNP_POLICY_MASK_CXL_ALLOW BIT_ULL(21) +#define SNP_POLICY_MASK_MEM_AES_256_XTS BIT_ULL(22) +#define SNP_POLICY_MASK_RAPL_DIS BIT_ULL(23) +#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM BIT_ULL(24) +#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE BIT_ULL(25) + +/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */ +#define SNP_POLICY_MASK_BASE (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_MIGRATE_MA | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) + #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ /** @@ -849,7 +882,10 @@ struct snp_feature_info { u32 edx; } __packed; +#define SNP_RAPL_DISABLE_SUPPORTED BIT(2) #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) +#define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4) +#define SNP_CXL_ALLOW_POLICY_SUPPORTED BIT(5) #ifdef CONFIG_CRYPTO_DEV_SP_PSP @@ -995,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); bool sev_is_snp_ciphertext_hiding_supported(void); +u64 sev_get_snp_policy_bits(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 357df16ede32..46514cb1b9e0 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -27,31 +27,6 @@ struct device_node; struct of_phandle_args; /** - * struct reset_control_lookup - represents a single lookup entry - * - * @list: internal list of all reset lookup entries - * @provider: name of the reset controller device controlling this reset line - * @index: ID of the reset controller in the reset controller device - * @dev_id: name of the device associated with this reset line - * @con_id: name of the reset line (can be NULL) - */ -struct 
reset_control_lookup { - struct list_head list; - const char *provider; - unsigned int index; - const char *dev_id; - const char *con_id; -}; - -#define RESET_LOOKUP(_provider, _index, _dev_id, _con_id) \ - { \ - .provider = _provider, \ - .index = _index, \ - .dev_id = _dev_id, \ - .con_id = _con_id, \ - } - -/** * struct reset_controller_dev - reset controller entity that might * provide multiple reset controls * @ops: a pointer to device specific struct reset_control_ops @@ -90,9 +65,6 @@ void reset_controller_unregister(struct reset_controller_dev *rcdev); struct device; int devm_reset_controller_register(struct device *dev, struct reset_controller_dev *rcdev); - -void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries); #else static inline int reset_controller_register(struct reset_controller_dev *rcdev) { @@ -108,11 +80,6 @@ static inline int devm_reset_controller_register(struct device *dev, { return 0; } - -static inline void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries) -{ -} #endif #endif diff --git a/include/linux/reset.h b/include/linux/reset.h index 840d75d172f6..44f9e3415f92 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -2,6 +2,7 @@ #ifndef _LINUX_RESET_H_ #define _LINUX_RESET_H_ +#include <linux/bits.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/types.h> diff --git a/include/linux/rv.h b/include/linux/rv.h index 9520aab34bcb..92fd467547e7 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -88,7 +88,7 @@ union rv_task_monitor { struct rv_reactor { const char *name; const char *description; - __printf(1, 2) void (*react)(const char *msg, ...); + __printf(1, 0) void (*react)(const char *msg, va_list args); struct list_head list; }; #endif @@ -102,7 +102,7 @@ struct rv_monitor { void (*reset)(void); #ifdef CONFIG_RV_REACTORS struct rv_reactor *reactor; - __printf(1, 2) void (*react)(const char *msg, ...); + __printf(1, 0) void (*react)(const char *msg, va_list args); #endif struct list_head list; struct rv_monitor *parent; @@ -116,13 +116,14 @@ int rv_get_task_monitor_slot(void); void rv_put_task_monitor_slot(int slot); #ifdef CONFIG_RV_REACTORS -bool rv_reacting_on(void); int rv_unregister_reactor(struct rv_reactor *reactor); int rv_register_reactor(struct rv_reactor *reactor); +__printf(2, 3) +void rv_react(struct rv_monitor *monitor, const char *msg, ...); #else -static inline bool rv_reacting_on(void) +__printf(2, 3) +static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...) 
{ - return false; } #endif /* CONFIG_RV_REACTORS */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0232d983b715..0e1d73955fa5 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -189,12 +189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); -unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, - struct file *filp, +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -318,6 +317,9 @@ static inline void might_alloc(gfp_t gfp_mask) fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); + if (current->flags & PF_MEMALLOC) + return; + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } diff --git a/include/linux/security.h b/include/linux/security.h index eb36451ce41f..83a646d72f6f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2257,8 +2257,6 @@ static inline void securityfs_remove(struct dentry *dentry) #endif -#define securityfs_recursive_remove securityfs_remove - #ifdef CONFIG_BPF_SYSCALL union bpf_attr; struct bpf_map; diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 52791e070506..9f2839e73f8a 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -149,6 +149,23 @@ static inline void seq_buf_commit(struct seq_buf *s, int num) } } +/** + * seq_buf_pop - pop off the last written character + * @s: the seq_buf handle + * + * Removes the last written character to the seq_buf @s. + * + * Returns the last character or -1 if it is empty. 
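+ *
+ * Example (illustrative sketch): drop a trailing separator after building
+ * a comma-separated list with seq_buf_printf():
+ *
+ *	seq_buf_printf(s, "%s,", name);
+ *	seq_buf_pop(s);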
+ */ +static inline int seq_buf_pop(struct seq_buf *s) +{ + if (!s->len) + return -1; + + s->len--; + return (unsigned int)s->buffer[s->len]; +} + extern __printf(2, 3) int seq_buf_printf(struct seq_buf *s, const char *fmt, ...); extern __printf(2, 0) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 774efe592a9a..5e4b3c1ae5c2 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); -extern int shmem_zero_setup(struct vm_area_struct *); +int shmem_zero_setup(struct vm_area_struct *vma); +int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); @@ -135,11 +136,16 @@ static inline bool shmem_hpage_pmd_enabled(void) #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); +extern void shmem_uncharge(struct inode *inode, long pages); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } + +static inline void shmem_uncharge(struct inode *inode, long pages) +{ +} #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -193,7 +199,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) } extern bool shmem_charge(struct inode *inode, long pages); -extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 7a69210a250c..0287f9182c4d 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -74,7 +74,14 @@ #define LLCC_CAMSRTIP 73 #define LLCC_CAMRTRF 74 #define LLCC_CAMSRTRF 75 +#define LLCC_VIDEO_APV 83 +#define LLCC_COMPUTE1 87 +#define LLCC_CPUSS_OPP 88 #define LLCC_CPUSSMPAM 89 +#define LLCC_CAM_IPE_STROV 92 +#define LLCC_CAM_OFE_STROV 93 +#define LLCC_CPUSS_HEU 94 +#define LLCC_MDM_PNG_FIXED 100 /** * struct llcc_slice_desc - Cache slice descriptor diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index 608950443eee..ba823a0013c5 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -82,6 +82,10 @@ struct socinfo { __le32 num_func_clusters; __le32 boot_cluster; __le32 boot_core; + /* Version 20 */ + __le32 raw_package_type; + /* Version 21, 22, 23 */ + __le32 reserve1[4]; }; /* Internal feature codes */ diff --git a/include/linux/soc/qcom/ubwc.h b/include/linux/soc/qcom/ubwc.h index 1ed8b1b16bc9..0a4edfe3d96d 100644 --- a/include/linux/soc/qcom/ubwc.h +++ b/include/linux/soc/qcom/ubwc.h @@ -52,6 +52,7 @@ struct qcom_ubwc_cfg_data { #define UBWC_4_0 0x40000000 #define UBWC_4_3 0x40030000 #define UBWC_5_0 0x50000000 +#define UBWC_6_0 0x60000000 #if IS_ENABLED(CONFIG_QCOM_UBWC_CONFIG) const struct qcom_ubwc_cfg_data *qcom_ubwc_config_get_data(void); diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index 71e0c09a49eb..532c6c2d1195 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -672,14 +672,341 @@ /* For Tensor GS101 */ /* PMU 
ALIVE */ -#define GS101_SYSIP_DAT0 (0x810) -#define GS101_CPU0_INFORM (0x860) -#define GS101_CPU_INFORM(cpu) \ - (GS101_CPU0_INFORM + (cpu*4)) -#define GS101_SYSTEM_CONFIGURATION (0x3A00) -#define GS101_EINT_WAKEUP_MASK (0x3A80) -#define GS101_PHY_CTRL_USB20 (0x3EB0) -#define GS101_PHY_CTRL_USBDP (0x3EB4) +#define GS101_OM_STAT 0x0000 +#define GS101_VERSION 0x0004 +#define GS101_PORESET_CHECK 0x0008 +#define GS101_OTP_STATUS 0x000c +#define GS101_SYSTEM_INFO 0x0010 +#define GS101_IDLE_IP(n) (0x03e0 + ((n) & 3) * 4) +#define GS101_IDLE_IP_MASK(n) (0x03f0 + ((n) & 3) * 4) +#define GS101_SLC_CH_OFFSET(ch) (0x0400 + ((ch) & 3) * 0x10) +#define GS101_DATARAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x00) +#define GS101_TAGRAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x04) +#define GS101_LRURAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x08) +#define GS101_PPMPURAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x0c) +#define GS101_DATARAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x40) +#define GS101_TAGRAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x44) +#define GS101_LRURAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x48) +#define GS101_PPMPURAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x4c) +#define GS101_INFORM0 0x0800 +#define GS101_INFORM1 0x0804 +#define GS101_INFORM2 0x0808 +#define GS101_INFORM3 0x080c +#define GS101_SYSIP_DAT(n) (0x0810 + ((n) & 3) * 4) +#define GS101_PWR_HOLD_HW_TRIP 0x0820 +#define GS101_PWR_HOLD_SW_TRIP 0x0824 +#define GS101_GSA_INFORM(n) (0x0830 + ((n) & 1) * 4) +#define GS101_INFORM4 0x0840 +#define GS101_INFORM5 0x0844 +#define GS101_INFORM6 0x0848 +#define GS101_INFORM7 0x084c +#define GS101_INFORM8 0x0850 +#define GS101_INFORM9 0x0854 +#define GS101_INFORM10 0x0858 +#define GS101_INFORM11 0x085c +#define GS101_CPU_INFORM(cpu) (0x0860 + ((cpu) & 7) * 4) +#define GS101_IROM_INFORM 0x0880 +#define GS101_IROM_CPU_INFORM(cpu) (0x0890 + ((cpu) & 7) * 4) +#define GS101_PMU_SPARE(n) (0x0900 + ((n) & 3) * 4) +#define GS101_IROM_DATA_REG(n) (0x0980 + ((n) & 3) * 4) +#define GS101_IROM_PWRMODE 0x0990 +#define GS101_DREX_CALIBRATION(n) (0x09a0 + ((n) & 7) * 4) + +#define GS101_CLUSTER0_OFFSET 0x1000 +#define GS101_CLUSTER1_OFFSET 0x1300 +#define GS101_CLUSTER2_OFFSET 0x1500 +#define GS101_CLUSTER_CPU_OFFSET(cl, cpu) ((cl) + ((cpu) * 0x80)) +#define GS101_CLUSTER_CPU_CONFIGURATION(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x00) +#define GS101_CLUSTER_CPU_STATUS(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x04) +#define GS101_CLUSTER_CPU_STATES(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x08) +#define GS101_CLUSTER_CPU_OPTION(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x0c) +#define GS101_CLUSTER_CPU_OUT(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x20) +#define GS101_CLUSTER_CPU_IN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x24) +#define GS101_CLUSTER_CPU_INT_IN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x40) +#define GS101_CLUSTER_CPU_INT_EN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x44) +#define GS101_CLUSTER_CPU_INT_TYPE(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x48) +#define GS101_CLUSTER_CPU_INT_DIR(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x4c) + +#define GS101_CLUSTER_NONCPU_OFFSET(cl) (0x1200 + ((cl) * 0x200)) +#define GS101_CLUSTER_NONCPU_CONFIGURATION(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x00) +#define GS101_CLUSTER_NONCPU_STATUS(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x04) +#define GS101_CLUSTER_NONCPU_STATES(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x08) 
+#define GS101_CLUSTER_NONCPU_OPTION(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x0c) +#define GS101_CLUSTER_NONCPU_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x20) +#define GS101_CLUSTER_NONCPU_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x24) +#define GS101_CLUSTER_NONCPU_INT_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x40) +#define GS101_CLUSTER_NONCPU_INT_EN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x44) +#define GS101_CLUSTER_NONCPU_INT_TYPE(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x48) +#define GS101_CLUSTER_NONCPU_INT_DIR(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x4c) +#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x60) +#define GS101_CLUSTER_NONCPU_DUALRAIL_POS_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x64) +#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x6c) +#define GS101_CLUSTER0_NONCPU_DSU_PCH \ + (GS101_CLUSTER_NONCPU_OFFSET(0) + 0x80) + +#define GS101_SUBBBLK_OFFSET_ALIVE 0x1800 +#define GS101_SUBBBLK_OFFSET_AOC 0x1880 +#define GS101_SUBBBLK_OFFSET_APM 0x1900 +#define GS101_SUBBBLK_OFFSET_CMU 0x1980 +#define GS101_SUBBBLK_OFFSET_BUS0 0x1a00 +#define GS101_SUBBBLK_OFFSET_BUS1 0x1a80 +#define GS101_SUBBBLK_OFFSET_BUS2 0x1b00 +#define GS101_SUBBBLK_OFFSET_CORE 0x1b80 +#define GS101_SUBBBLK_OFFSET_EH 0x1c00 +#define GS101_SUBBBLK_OFFSET_CPUCL0 0x1c80 +#define GS101_SUBBBLK_OFFSET_CPUCL1 0x1d00 +#define GS101_SUBBBLK_OFFSET_CPUCL2 0x1d80 +#define GS101_SUBBBLK_OFFSET_G3D 0x1e00 +#define GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0 0x1e80 +#define GS101_SUBBBLK_OFFSET_EMBEDDED_G3D 0x2000 +#define GS101_SUBBBLK_OFFSET_HSI0 0x2080 +#define GS101_SUBBBLK_OFFSET_HSI1 0x2100 +#define GS101_SUBBBLK_OFFSET_HSI2 0x2180 +#define GS101_SUBBBLK_OFFSET_DPU 0x2200 +#define GS101_SUBBBLK_OFFSET_DISP 0x2280 +#define GS101_SUBBBLK_OFFSET_G2D 0x2300 +#define GS101_SUBBBLK_OFFSET_MFC 0x2380 +#define GS101_SUBBBLK_OFFSET_CSIS 0x2400 +#define GS101_SUBBBLK_OFFSET_PDP 0x2480 +#define GS101_SUBBBLK_OFFSET_DNS 0x2500 +#define GS101_SUBBBLK_OFFSET_G3AA 0x2580 +#define GS101_SUBBBLK_OFFSET_IPP 0x2600 +#define GS101_SUBBBLK_OFFSET_ITP 0x2680 +#define GS101_SUBBBLK_OFFSET_MCSC 0x2700 +#define GS101_SUBBBLK_OFFSET_GDC 0x2780 +#define GS101_SUBBBLK_OFFSET_TNR 0x2800 +#define GS101_SUBBBLK_OFFSET_BO 0x2880 +#define GS101_SUBBBLK_OFFSET_TPU 0x2900 +#define GS101_SUBBBLK_OFFSET_MIF0 0x2980 +#define GS101_SUBBBLK_OFFSET_MIF1 0x2a00 +#define GS101_SUBBBLK_OFFSET_MIF2 0x2a80 +#define GS101_SUBBBLK_OFFSET_MIF3 0x2b00 +#define GS101_SUBBBLK_OFFSET_MISC 0x2b80 +#define GS101_SUBBBLK_OFFSET_PERIC0 0x2c00 +#define GS101_SUBBBLK_OFFSET_PERIC1 0x2c80 +#define GS101_SUBBBLK_OFFSET_S2D 0x2d00 +#define GS101_SUBBLK_CONFIGURATION(blk) ((blk) + 0x00) +#define GS101_SUBBLK_STATUS(blk) ((blk) + 0x04) +#define GS101_SUBBLK_STATES(blk) ((blk) + 0x08) +#define GS101_SUBBLK_OPTION(blk) ((blk) + 0x0c) +#define GS101_SUBBLK_CTRL(blk) ((blk) + 0x10) +#define GS101_SUBBLK_OUT(blk) ((blk) + 0x20) +#define GS101_SUBBLK_IN(blk) ((blk) + 0x24) +#define GS101_SUBBLK_INT_IN(blk) ((blk) + 0x40) +#define GS101_SUBBLK_INT_EN(blk) ((blk) + 0x44) +#define GS101_SUBBLK_INT_TYPE(blk) ((blk) + 0x48) +#define GS101_SUBBLK_INT_DIR(blk) ((blk) + 0x4c) +#define GS101_SUBBLK_MEMORY_OUT(blk) ((blk) + 0x60) +#define GS101_SUBBLK_MEMORY_IN(blk) ((blk) + 0x64) + +#define GS101_SUBBBLK_CPU_OFFSET_APM 0x3000 +#define GS101_SUBBBLK_CPU_OFFSET_DBGCORE 0x3080 +#define GS101_SUBBBLK_CPU_OFFSET_SSS 0x3100 +#define GS101_SUBBLK_CPU_CONFIGURATION(blk) ((blk) + 0x00) 
+#define GS101_SUBBLK_CPU_STATUS(blk) ((blk) + 0x04) +#define GS101_SUBBLK_CPU_STATES(blk) ((blk) + 0x08) +#define GS101_SUBBLK_CPU_OPTION(blk) ((blk) + 0x0c) +#define GS101_SUBBLK_CPU_OUT(blk) ((blk) + 0x20) +#define GS101_SUBBLK_CPU_IN(blk) ((blk) + 0x24) +#define GS101_SUBBLK_CPU_INT_IN(blk) ((blk) + 0x40) +#define GS101_SUBBLK_CPU_INT_EN(blk) ((blk) + 0x44) +#define GS101_SUBBLK_CPU_INT_TYPE(blk) ((blk) + 0x48) +#define GS101_SUBBLK_CPU_INT_DIR(blk) ((blk) + 0x4c) + +#define GS101_MIF_CONFIGURATION 0x3800 +#define GS101_MIF_STATUS 0x3804 +#define GS101_MIF_STATES 0x3808 +#define GS101_MIF_OPTION 0x380c +#define GS101_MIF_CTRL 0x3810 +#define GS101_MIF_OUT 0x3820 +#define GS101_MIF_IN 0x3824 +#define GS101_MIF_INT_IN 0x3840 +#define GS101_MIF_INT_EN 0x3844 +#define GS101_MIF_INT_TYPE 0x3848 +#define GS101_MIF_INT_DIR 0x384c +#define GS101_TOP_CONFIGURATION 0x3900 +#define GS101_TOP_STATUS 0x3904 +#define GS101_TOP_STATES 0x3908 +#define GS101_TOP_OPTION 0x390c +#define GS101_TOP_OUT 0x3920 +#define GS101_TOP_IN 0x3924 +#define GS101_TOP_INT_IN 0x3940 +#define GS101_TOP_INT_EN 0x3944 +#define GS101_TOP_INT_TYPE 0x3948 +#define GS101_TOP_INT_DIR 0x394c +#define GS101_WAKEUP_STAT 0x3950 +#define GS101_WAKEUP2_STAT 0x3954 +#define GS101_WAKEUP2_INT_IN 0x3960 +#define GS101_WAKEUP2_INT_EN 0x3964 +#define GS101_WAKEUP2_INT_TYPE 0x3968 +#define GS101_WAKEUP2_INT_DIR 0x396c +#define GS101_SYSTEM_CONFIGURATION 0x3a00 +#define GS101_SYSTEM_STATUS 0x3a04 +#define GS101_SYSTEM_STATES 0x3a08 +#define GS101_SYSTEM_OPTION 0x3a0c +#define GS101_SYSTEM_CTRL 0x3a10 +#define GS101_SPARE_CTRL 0x3a14 +#define GS101_USER_DEFINED_OUT 0x3a18 +#define GS101_SYSTEM_OUT 0x3a20 +#define GS101_SYSTEM_IN 0x3a24 +#define GS101_SYSTEM_INT_IN 0x3a40 +#define GS101_SYSTEM_INT_EN 0x3a44 +#define GS101_SYSTEM_INT_TYPE 0x3a48 +#define GS101_SYSTEM_INT_DIR 0x3a4c +#define GS101_EINT_INT_IN 0x3a50 +#define GS101_EINT_INT_EN 0x3a54 +#define GS101_EINT_INT_TYPE 0x3a58 +#define GS101_EINT_INT_DIR 0x3a5c +#define GS101_EINT2_INT_IN 0x3a60 +#define GS101_EINT2_INT_EN 0x3a64 +#define GS101_EINT2_INT_TYPE 0x3a68 +#define GS101_EINT2_INT_DIR 0x3a6c +#define GS101_EINT3_INT_IN 0x3a70 +#define GS101_EINT3_INT_EN 0x3a74 +#define GS101_EINT3_INT_TYPE 0x3a78 +#define GS101_EINT3_INT_DIR 0x3a7c +#define GS101_EINT_WAKEUP_MASK 0x3a80 +#define GS101_EINT_WAKEUP_MASK2 0x3a84 +#define GS101_EINT_WAKEUP_MASK3 0x3a88 +#define GS101_USER_DEFINED_INT_IN 0x3a90 +#define GS101_USER_DEFINED_INT_EN 0x3a94 +#define GS101_USER_DEFINED_INT_TYPE 0x3a98 +#define GS101_USER_DEFINED_INT_DIR 0x3a9c +#define GS101_SCAN2DRAM_INT_IN 0x3aa0 +#define GS101_SCAN2DRAM_INT_EN 0x3aa4 +#define GS101_SCAN2DRAM_INT_TYPE 0x3aa8 +#define GS101_SCAN2DRAM_INT_DIR 0x3aac +#define GS101_HCU_START 0x3ab0 +#define GS101_CUSTOM_OUT 0x3ac0 +#define GS101_CUSTOM_IN 0x3ac4 +#define GS101_CUSTOM_INT_IN 0x3ad0 +#define GS101_CUSTOM_INT_EN 0x3ad4 +#define GS101_CUSTOM_INT_TYPE 0x3ad8 +#define GS101_CUSTOM_INT_DIR 0x3adc +#define GS101_ACK_LAST_CPU 0x3afc +#define GS101_HCU_R(n) (0x3b00 + ((n) & 3) * 4) +#define GS101_HCU_SP 0x3b14 +#define GS101_HCU_PC 0x3b18 +#define GS101_PMU_RAM_CTRL 0x3b20 +#define GS101_APM_HCU_CTRL 0x3b24 +#define GS101_APM_NMI_ENABLE 0x3b30 +#define GS101_DBGCORE_NMI_ENABLE 0x3b34 +#define GS101_HCU_NMI_ENABLE 0x3b38 +#define GS101_PWR_HOLD_WDT_ENABLE 0x3b3c +#define GS101_NMI_SRC_IN 0x3b40 +#define GS101_RST_STAT 0x3b44 +#define GS101_RST_STAT_PMU 0x3b48 +#define GS101_HPM_INT_IN 0x3b60 +#define GS101_HPM_INT_EN 0x3b64 +#define GS101_HPM_INT_TYPE 0x3b68 
+#define GS101_HPM_INT_DIR 0x3b6c +#define GS101_S2D_AUTH 0x3b70 +#define GS101_BOOT_STAT 0x3b74 +#define GS101_PMLINK_OUT 0x3c00 +#define GS101_PMLINK_AOC_OUT 0x3c04 +#define GS101_PMLINK_AOC_CTRL 0x3c08 +#define GS101_TCXO_BUF_CTRL 0x3c10 +#define GS101_ADD_CTRL 0x3c14 +#define GS101_HCU_TIMEOUT_RESET 0x3c20 +#define GS101_HCU_TIMEOUT_SCAN2DRAM 0x3c24 +#define GS101_TIMER(n) (0x3c80 + ((n) & 3) * 4) +#define GS101_PPC_MIF(n) (0x3c90 + ((n) & 3) * 4) +#define GS101_PPC_CORE 0x3ca0 +#define GS101_PPC_EH 0x3ca4 +#define GS101_PPC_CPUCL1_0 0x3ca8 +#define GS101_PPC_CPUCL1_1 0x3cac +#define GS101_EXT_REGULATOR_MIF_DURATION 0x3cb0 +#define GS101_EXT_REGULATOR_TOP_DURATION 0x3cb4 +#define GS101_EXT_REGULATOR_CPUCL2_DURATION 0x3cb8 +#define GS101_EXT_REGULATOR_CPUCL1_DURATION 0x3cbc +#define GS101_EXT_REGULATOR_G3D_DURATION 0x3cc0 +#define GS101_EXT_REGULATOR_TPU_DURATION 0x3cc4 +#define GS101_TCXO_DURATION 0x3cc8 +#define GS101_BURNIN_CTRL 0x3cd0 +#define GS101_JTAG_DBG_DET 0x3cd4 +#define GS101_MMC_CONWKUP_CTRL 0x3cd8 +#define GS101_USBDPPHY0_USBDP_WAKEUP 0x3cdc +#define GS101_TMU_TOP_TRIP 0x3ce0 +#define GS101_TMU_SUB_TRIP 0x3ce4 +#define GS101_MEMORY_CEN 0x3d00 +#define GS101_MEMORY_PGEN 0x3d04 +#define GS101_MEMORY_RET 0x3d08 +#define GS101_MEMORY_PGEN_FEEDBACK 0x3d0c +#define GS101_MEMORY_SMX 0x3d10 +#define GS101_MEMORY_SMX_FEEDBACK 0x3d14 +#define GS101_SLC_PCH_CHANNEL 0x3d20 +#define GS101_SLC_PCH_CB 0x3d24 +#define GS101_FORCE_NOMC 0x3d3c +#define GS101_FORCE_BOOST 0x3d4c +#define GS101_PMLINK_SLC_REQ 0x3d50 +#define GS101_PMLINK_SLC_ACK 0x3d54 +#define GS101_PMLINK_SLC_BUSY 0x3d58 +#define GS101_BOOTSYNC_OUT 0x3d80 +#define GS101_BOOTSYNC_IN 0x3d84 +#define GS101_SCAN_READY_OUT 0x3d88 +#define GS101_SCAN_READY_IN 0x3d8c +#define GS101_GSA_RESTORE 0x3d90 +#define GS101_ALIVE_OTP_LATCH 0x3d94 +#define GS101_DEBUG_OVERRIDE 0x3d98 +#define GS101_WDT_OPTION 0x3d9c +#define GS101_AOC_WDT_CFG 0x3da0 +#define GS101_CTRL_SECJTAG_ALIVE 0x3da4 +#define GS101_CTRL_DIV_PLL_ALV_DIVLOW 0x3e00 +#define GS101_CTRL_MUX_CLK_APM_REFSRC_AUTORESTORE 0x3e04 +#define GS101_CTRL_MUX_CLK_APM_REFSRC 0x3e08 +#define GS101_CTRL_MUX_CLK_APM_REF 0x3e0c +#define GS101_CTRL_MUX_PLL_ALV_DIV4 0x3e10 +#define GS101_CTRL_PLL_ALV_DIV4 0x3e14 +#define GS101_CTRL_OSCCLK_APMGSA 0x3e18 +#define GS101_CTRL_BLK_AOC_CLKS 0x3e1c +#define GS101_CTRL_PLL_ALV_LOCK 0x3e20 +#define GS101_CTRL_CLKDIV__CLKRTC 0x3e24 +#define GS101_CTRL_SOC32K 0x3e30 +#define GS101_CTRL_STM_PMU 0x3e34 +#define GS101_CTRL_PMU_DEBUG 0x3e38 +#define GS101_CTRL_DEBUG_UART 0x3e3c +#define GS101_CTRL_TCK 0x3e40 +#define GS101_CTRL_SBU_SW_EN 0x3e44 +#define GS101_PAD_CTRL_CLKOUT0 0x3e80 +#define GS101_PAD_CTRL_CLKOUT1 0x3e84 +#define GS101_PAD_CTRL_APM_24MOUT_0 0x3e88 +#define GS101_PAD_CTRL_APM_24MOUT_1 0x3e8c +#define GS101_PAD_CTRL_IO_FORCE_RETENTION 0x3e90 +#define GS101_PAD_CTRL_APACTIVE_n 0x3e94 +#define GS101_PAD_CTRL_TCXO_ON 0x3e98 +#define GS101_PAD_CTRL_PWR_HOLD 0x3e9c +#define GS101_PAD_CTRL_RESETO_n 0x3ea0 +#define GS101_PAD_CTRL_WRESETO_n 0x3ea4 +#define GS101_PHY_CTRL_USB20 0x3eb0 +#define GS101_PHY_CTRL_USBDP 0x3eb4 +#define GS101_PHY_CTRL_MIPI_DCPHY_M4M4 0x3eb8 +#define GS101_PHY_CTRL_MIPI_DCPHY_S4S4S4S4 0x3ebc +#define GS101_PHY_CTRL_PCIE_GEN4_0 0x3ec0 +#define GS101_PHY_CTRL_PCIE_GEN4_1 0x3ec4 +#define GS101_PHY_CTRL_UFS 0x3ec8 /* PMU INTR GEN */ #define GS101_GRP1_INTR_BID_UPEND (0x0108) diff --git a/include/linux/string.h b/include/linux/string.h index 0266dbdaa4cd..1b564c36d721 100644 --- a/include/linux/string.h +++ 
b/include/linux/string.h @@ -371,6 +371,10 @@ static inline void memzero_explicit(void *s, size_t count) * kbasename - return the last part of a pathname. * * @path: path to extract the filename from. + * + * Returns: + * Pointer to the filename portion inside @path. If no '/' exists, + * returns @path unchanged. */ static inline const char *kbasename(const char *path) { @@ -556,6 +560,9 @@ static __always_inline size_t str_has_prefix(const char *str, const char *prefix * strstarts - does @str start with @prefix? * @str: string to examine * @prefix: prefix to look for. + * + * Returns: + * True if @str begins with @prefix. False in all other cases. */ static inline bool strstarts(const char *str, const char *prefix) { diff --git a/include/linux/swap.h b/include/linux/swap.h index e818fbade1e2..38ca3df68716 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,16 +301,7 @@ struct swap_info_struct { struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) @@ -462,7 +453,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); +int folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -560,7 +551,7 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) +static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 64ea151a7ae3..8cfc966eae48 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -28,7 +28,7 @@ #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* - * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To + * Definitions only for PFN swap entries (see leafeant_has_pfn()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ @@ -66,8 +66,6 @@ #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) -static inline bool is_pfn_swap_entry(swp_entry_t entry); - /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { @@ -110,36 +108,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry) } /* - * This should only be called upon a pfn swap entry to get the PFN stored - * in the swap entry. Please refers to is_pfn_swap_entry() for definition - * of pfn swap entry. 
- */ -static inline unsigned long swp_offset_pfn(swp_entry_t entry) -{ - VM_BUG_ON(!is_pfn_swap_entry(entry)); - return swp_offset(entry) & SWP_PFN_MASK; -} - -/* check whether a pte points to a swap entry */ -static inline int is_swap_pte(pte_t pte) -{ - return !pte_none(pte) && !pte_present(pte); -} - -/* - * Convert the arch-dependent pte representation of a swp_entry_t into an - * arch-independent swp_entry_t. - */ -static inline swp_entry_t pte_to_swp_entry(pte_t pte) -{ - swp_entry_t arch_entry; - - pte = pte_swp_clear_flags(pte); - arch_entry = __pte_to_swp_entry(pte); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - -/* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. */ @@ -175,27 +143,11 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(SWP_DEVICE_WRITE, offset); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - int type = swp_type(entry); - return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; -} - #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -207,50 +159,14 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - return false; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} - #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -static inline int is_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ || - swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || - swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ); -} - -static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); -} static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -289,14 +205,6 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_YOUNG; - /* Keep the old behavior of aging page after migration */ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) @@ -305,14 +213,6 @@ static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_DIRTY; - /* Keep 
the old behavior of clean page after migration */ - return false; -} - extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); @@ -332,43 +232,21 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline int is_migration_entry(swp_entry_t swp) -{ - return 0; -} - static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return 0; -} -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return 0; -} static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - return false; -} #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE @@ -426,21 +304,6 @@ static inline swp_entry_t make_pte_marker_entry(pte_marker marker) return swp_entry(SWP_PTE_MARKER, marker); } -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_PTE_MARKER; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) -{ - return swp_offset(entry) & PTE_MARKER_MASK; -} - -static inline bool is_pte_marker(pte_t pte) -{ - return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); -} - static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); @@ -451,83 +314,11 @@ static inline swp_entry_t make_poisoned_swp_entry(void) return make_pte_marker_entry(PTE_MARKER_POISONED); } -static inline int is_poisoned_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_POISONED); - -} - static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } -static inline int is_guard_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_GUARD); -} - -/* - * This is a special version to check pte_none() just to cover the case when - * the pte is a pte marker. It existed because in many cases the pte marker - * should be seen as a none pte; it's just that we have stored some information - * onto the none pte so it becomes not-none any more. - * - * It should be used when the pte is file-backed, ram-based and backing - * userspace pages, like shmem. It is not needed upon pgtables that do not - * support pte markers at all. For example, it's not needed on anonymous - * memory, kernel-only memory (including when the system is during-boot), - * non-ram based generic file-system. It's fine to be used even there, but the - * extra pte marker check will be pure overhead. 
- */ -static inline int pte_none_mostly(pte_t pte) -{ - return pte_none(pte) || is_pte_marker(pte); -} - -static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) -{ - struct page *p = pfn_to_page(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - BUG_ON(is_migration_entry(entry) && !PageLocked(p)); - - return p; -} - -static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) -{ - struct folio *folio = pfn_folio(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked - */ - BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); - - return folio; -} - -/* - * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They can either be used to represent unaddressable device - * memory, to restrict access to a page undergoing migration or to represent a - * pfn which has been hwpoisoned and unmapped. - */ -static inline bool is_pfn_swap_entry(swp_entry_t entry) -{ - /* Make sure the swp offset can always store the needed fields */ - BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); - - return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); -} - struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION @@ -539,18 +330,6 @@ extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - swp_entry_t arch_entry; - - if (pmd_swp_soft_dirty(pmd)) - pmd = pmd_swp_clear_soft_dirty(pmd); - if (pmd_swp_uffd_wp(pmd)) - pmd = pmd_swp_clear_uffd_wp(pmd); - arch_entry = __pmd_to_swp_entry(pmd); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; @@ -559,10 +338,6 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) return __swp_entry_to_pmd(arch_entry); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); -} #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) @@ -578,26 +353,12 @@ static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - return swp_entry(0, 0); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - return 0; -} #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static inline int non_swap_entry(swp_entry_t entry) -{ - return swp_type(entry) >= MAX_SWAPFILES; -} - #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/include/linux/syscore_ops.h b/include/linux/syscore_ops.h index ae4d48e4c970..ac6d71be5c38 100644 --- a/include/linux/syscore_ops.h +++ b/include/linux/syscore_ops.h @@ -11,14 +11,19 @@ #include <linux/list.h> struct syscore_ops { + int (*suspend)(void *data); + void (*resume)(void *data); + void (*shutdown)(void *data); +}; + +struct syscore { struct list_head node; - int (*suspend)(void); - void (*resume)(void); - void (*shutdown)(void); + const struct syscore_ops *ops; + void *data; }; -extern void register_syscore_ops(struct 
syscore_ops *ops); -extern void unregister_syscore_ops(struct syscore_ops *ops); +extern void register_syscore(struct syscore *syscore); +extern void unregister_syscore(struct syscore *syscore); #ifdef CONFIG_PM_SLEEP extern int syscore_suspend(void); extern void syscore_resume(void); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 92e9146b1104..288fe0055cd5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -59,6 +59,121 @@ extern const int sysctl_vals[]; #define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) #define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) +#define SYSCTL_CONV_IDENTITY(val) (val) +/** + * + * "dir" originates from read_iter (dir = 0) or write_iter (dir = 1) + * in the file_operations struct at proc/proc_sysctl.c. Its value means + * one of two things for sysctl: + * 1. SYSCTL_USER_TO_KERN(dir) Writing to an internal kernel variable from user + * space (dir > 0) + * 2. SYSCTL_KERN_TO_USER(dir) Writing to a user space buffer from a kernel + * variable (dir == 0). + */ +#define SYSCTL_USER_TO_KERN(dir) (!!(dir)) +#define SYSCTL_KERN_TO_USER(dir) (!dir) + +#define SYSCTL_USER_TO_KERN_INT_CONV(name, u_ptr_op) \ +int sysctl_user_to_kern_int_conv##name(const bool *negp, \ + const unsigned long *u_ptr,\ + int *k_ptr) \ +{ \ + unsigned long u = u_ptr_op(*u_ptr); \ + if (*negp) { \ + if (u > (unsigned long) INT_MAX + 1) \ + return -EINVAL; \ + WRITE_ONCE(*k_ptr, -u); \ + } else { \ + if (u > (unsigned long) INT_MAX) \ + return -EINVAL; \ + WRITE_ONCE(*k_ptr, u); \ + } \ + return 0; \ +} + +#define SYSCTL_KERN_TO_USER_INT_CONV(name, k_ptr_op) \ +int sysctl_kern_to_user_int_conv##name(bool *negp, \ + unsigned long *u_ptr, \ + const int *k_ptr) \ +{ \ + int val = READ_ONCE(*k_ptr); \ + if (val < 0) { \ + *negp = true; \ + *u_ptr = -k_ptr_op((unsigned long)val); \ + } else { \ + *negp = false; \ + *u_ptr = k_ptr_op((unsigned long)val); \ + } \ + return 0; \ +} + +/** + * To range check on a converted value, use a temp k_ptr + * When checking range, value should be within (tbl->extra1, tbl->extra2) + */ +#define SYSCTL_INT_CONV_CUSTOM(name, user_to_kern, kern_to_user, \ + k_ptr_range_check) \ +int do_proc_int_conv##name(bool *negp, unsigned long *u_ptr, int *k_ptr,\ + int dir, const struct ctl_table *tbl) \ +{ \ + if (SYSCTL_KERN_TO_USER(dir)) \ + return kern_to_user(negp, u_ptr, k_ptr); \ + \ + if (k_ptr_range_check) { \ + int tmp_k, ret; \ + if (!tbl) \ + return -EINVAL; \ + ret = user_to_kern(negp, u_ptr, &tmp_k); \ + if (ret) \ + return ret; \ + if ((tbl->extra1 && *(int *)tbl->extra1 > tmp_k) || \ + (tbl->extra2 && *(int *)tbl->extra2 < tmp_k)) \ + return -EINVAL; \ + WRITE_ONCE(*k_ptr, tmp_k); \ + } else \ + return user_to_kern(negp, u_ptr, k_ptr); \ + return 0; \ +} + +#define SYSCTL_USER_TO_KERN_UINT_CONV(name, u_ptr_op) \ +int sysctl_user_to_kern_uint_conv##name(const unsigned long *u_ptr,\ + unsigned int *k_ptr) \ +{ \ + unsigned long u = u_ptr_op(*u_ptr); \ + if (u > UINT_MAX) \ + return -EINVAL; \ + WRITE_ONCE(*k_ptr, u); \ + return 0; \ +} + +#define SYSCTL_UINT_CONV_CUSTOM(name, user_to_kern, kern_to_user, \ + k_ptr_range_check) \ +int do_proc_uint_conv##name(unsigned long *u_ptr, unsigned int *k_ptr, \ + int dir, const struct ctl_table *tbl) \ +{ \ + if (SYSCTL_KERN_TO_USER(dir)) \ + return kern_to_user(u_ptr, k_ptr); \ + \ + if (k_ptr_range_check) { \ + unsigned int tmp_k; \ + int ret; \ + if (!tbl) \ + return -EINVAL; \ + ret = user_to_kern(u_ptr, &tmp_k); \ + if (ret) \ + return ret; \ + if ((tbl->extra1 && \ + *(unsigned int 
*)tbl->extra1 > tmp_k) || \ + (tbl->extra2 && \ + *(unsigned int *)tbl->extra2 < tmp_k)) \ + return -ERANGE; \ + WRITE_ONCE(*k_ptr, tmp_k); \ + } else \ + return user_to_kern(u_ptr, k_ptr); \ + return 0; \ +} + + extern const unsigned long sysctl_long_vals[]; typedef int proc_handler(const struct ctl_table *ctl, int write, void *buffer, @@ -68,25 +183,30 @@ int proc_dostring(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, + int dir, const struct ctl_table *table)); int proc_douintvec(const struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_dointvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +int proc_douintvec_conv(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, unsigned int *valp, + int write, const struct ctl_table *table)); + int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_dointvec_jiffies(const struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int proc_dointvec_userhz_jiffies(const struct ctl_table *, int, void *, size_t *, - loff_t *); -int proc_dointvec_ms_jiffies(const struct ctl_table *, int, void *, size_t *, - loff_t *); int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int, void *, - size_t *, loff_t *); +int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv); int proc_do_large_bitmap(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_do_static_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +int sysctl_kern_to_user_uint_conv(unsigned long *u_ptr, const unsigned int *k_ptr); /* * Register a set of sysctl names by calling register_sysctl @@ -156,6 +276,10 @@ struct ctl_node { * @nreg: When nreg drops to 0 the ctl_table_header will be unregistered. * @rcu: Delays the freeing of the inode. 
Introduced with "unfuck proc_sysctl ->d_compare()" * + * @type: Enumeration to differentiate between ctl target types + * @type.SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations + * @type.SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Identifies a permanently empty dir + * target to serve as a mount point */ struct ctl_table_header { union { @@ -175,13 +299,6 @@ struct ctl_table_header { struct ctl_dir *parent; struct ctl_node *node; struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */ - /** - * enum type - Enumeration to differentiate between ctl target types - * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations - * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently - * empty directory target to serve - * as mount point. - */ enum { SYSCTL_TABLE_TYPE_DEFAULT, SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY, @@ -235,12 +352,6 @@ extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); bool sysctl_is_alias(char *param); -int do_proc_douintvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data); extern int unaligned_enabled; extern int no_unaligned_warning; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 557780fe1c77..4a0b8c172d27 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -80,6 +80,19 @@ static inline bool trace_seq_has_overflowed(struct trace_seq *s) return s->full || seq_buf_has_overflowed(&s->seq); } +/** + * trace_seq_pop - pop off the last written character + * @s: trace sequence descriptor + * + * Removes the last written character to the trace_seq @s. + * + * Returns the last character or -1 if it is empty. + */ +static inline int trace_seq_pop(struct trace_seq *s) +{ + return seq_buf_pop(&s->seq); +} + /* * Currently only defined when tracing is enabled. */ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 826ce3f8e1f8..8a56f3278b1b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -222,6 +222,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } /* + * When a tracepoint is used, it's name is added to the __tracepoint_check + * section. This section is only used at build time to make sure all + * defined tracepoints are used. It is discarded after the build. + */ +# define TRACEPOINT_CHECK(name) \ + static const char __used __section("__tracepoint_check") \ + __trace_check_##name[] = #name; + +/* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. 
@@ -270,6 +279,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ + TRACEPOINT_CHECK(name) \ if (cond) { \ guard(preempt_notrace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ @@ -289,6 +299,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ + TRACEPOINT_CHECK(name) \ guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ @@ -371,10 +382,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args)); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ + TRACEPOINT_CHECK(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name); \ EXPORT_SYMBOL_GPL(__traceiter_##name); \ EXPORT_STATIC_CALL_GPL(tp_func_##name) #define EXPORT_TRACEPOINT_SYMBOL(name) \ + TRACEPOINT_CHECK(name) \ EXPORT_SYMBOL(__tracepoint_##name); \ EXPORT_SYMBOL(__traceiter_##name); \ EXPORT_STATIC_CALL(tp_func_##name) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c0e716aec26a..fd5f42765497 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -16,7 +16,7 @@ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <asm-generic/pgtable_uffd.h> #include <linux/hugetlb_inline.h> @@ -228,15 +228,14 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, if (wp_async && (vm_flags == VM_UFFD_WP)) return true; -#ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. */ - if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) return false; -#endif /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || @@ -291,6 +290,43 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx); void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx); +static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) +{ + /* Only wr-protect mode uses pte markers */ + if (!userfaultfd_wp(vma)) + return false; + + /* File-based uffd-wp always need markers */ + if (!vma_is_anonymous(vma)) + return true; + + /* + * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED + * enabled (to apply markers on zero pages). + */ + return userfaultfd_wp_unpopulated(vma); +} + +/* + * Returns true if this is a swap pte and was uffd-wp wr-protected in either + * forms (pte marker or a normal swap pte), false otherwise. 
+ */ +static inline bool pte_swp_uffd_wp_any(pte_t pte) +{ + if (!uffd_supports_wp_marker()) + return false; + + if (pte_present(pte)) + return false; + + if (pte_swp_uffd_wp(pte)) + return true; + + if (pte_is_uffd_wp_marker(pte)) + return true; + + return false; +} #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -415,49 +451,9 @@ static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) return false; } -#endif /* CONFIG_USERFAULTFD */ - static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { - /* Only wr-protect mode uses pte markers */ - if (!userfaultfd_wp(vma)) - return false; - - /* File-based uffd-wp always need markers */ - if (!vma_is_anonymous(vma)) - return true; - - /* - * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED - * enabled (to apply markers on zero pages). - */ - return userfaultfd_wp_unpopulated(vma); -} - -static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); -#else - return false; -#endif -} - -static inline bool pte_marker_uffd_wp(pte_t pte) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - swp_entry_t entry; - - if (!is_swap_pte(pte)) - return false; - - entry = pte_to_swp_entry(pte); - - return pte_marker_entry_uffd_wp(entry); -#else return false; -#endif } /* @@ -466,17 +462,7 @@ static inline bool pte_marker_uffd_wp(pte_t pte) */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP - if (!is_swap_pte(pte)) - return false; - - if (pte_swp_uffd_wp(pte)) - return true; - - if (pte_marker_uffd_wp(pte)) - return true; -#endif return false; } - +#endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb54b7b3202f..e8e94f90d686 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ #endif struct vm_struct { - struct vm_struct *next; + union { + struct vm_struct *next; /* Early registration of vm_areas. */ + struct llist_node llnode; /* Asynchronous freeing on error paths. 
*/ + }; + void *addr; unsigned long size; unsigned long flags; @@ -328,4 +332,6 @@ bool vmalloc_dump_obj(void *object); static inline bool vmalloc_dump_obj(void *object) { return false; } #endif +unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); +void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c287998908bf..3398a345bda8 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -520,32 +520,12 @@ static inline const char *vm_event_name(enum vm_event_item item) #ifdef CONFIG_MEMCG -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_state(lruvec, idx, val); - local_irq_restore(flags); -} - -void __lruvec_stat_mod_folio(struct folio *folio, +void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __lruvec_stat_mod_folio(folio, idx, val); - local_irq_restore(flags); -} - static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { @@ -554,24 +534,12 @@ static inline void mod_lruvec_page_state(struct page *page, #else -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } -static inline void __lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(folio_pgdat(folio), idx, val); -} - static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { @@ -586,18 +554,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __lruvec_stat_add_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); -} - -static inline void __lruvec_stat_sub_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { |
