Diffstat (limited to 'include/linux')
79 files changed, 1775 insertions, 761 deletions
diff --git a/include/linux/adreno-smmu-priv.h b/include/linux/adreno-smmu-priv.h index abec23c7744f..d83c9175828f 100644 --- a/include/linux/adreno-smmu-priv.h +++ b/include/linux/adreno-smmu-priv.h @@ -45,9 +45,9 @@ struct adreno_smmu_fault_info { * TTBR0 translation is enabled with the specified cfg * @get_fault_info: Called by the GPU fault handler to get information about * the fault - * @set_stall: Configure whether stall on fault (CFCFG) is enabled. Call - * before set_ttbr0_cfg(). If stalling on fault is enabled, - * the GPU driver must call resume_translation() + * @set_stall: Configure whether stall on fault (CFCFG) is enabled. If + * stalling on fault is enabled, the GPU driver must call + * resume_translation() * @resume_translation: Resume translation after a fault * * @set_prr_bit: [optional] Configure the GPU's Partially Resident diff --git a/include/linux/bus/stm32_firewall_device.h b/include/linux/bus/stm32_firewall_device.h index 5178b72bc920..eaa7a3f54450 100644 --- a/include/linux/bus/stm32_firewall_device.h +++ b/include/linux/bus/stm32_firewall_device.h @@ -114,27 +114,30 @@ void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 su #else /* CONFIG_STM32_FIREWALL */ -int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall, - unsigned int nb_firewall) +static inline int stm32_firewall_get_firewall(struct device_node *np, + struct stm32_firewall *firewall, + unsigned int nb_firewall) { return -ENODEV; } -int stm32_firewall_grant_access(struct stm32_firewall *firewall) +static inline int stm32_firewall_grant_access(struct stm32_firewall *firewall) { return -ENODEV; } -void stm32_firewall_release_access(struct stm32_firewall *firewall) +static inline void stm32_firewall_release_access(struct stm32_firewall *firewall) { } -int stm32_firewall_grant_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id) +static inline int stm32_firewall_grant_access_by_id(struct stm32_firewall *firewall, + u32 subsystem_id) { return -ENODEV; } -void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id) +static inline void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, + u32 subsystem_id) { } diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 20881cc761fa..2b77d12e07b2 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -530,6 +530,12 @@ struct ftrace_likely_data { sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) #ifdef __OPTIMIZE__ +/* + * #ifdef __OPTIMIZE__ is only a good approximation; for instance "make + * CFLAGS_foo.o=-Og" defines __OPTIMIZE__, does not elide the conditional code + * and can break compilation with wrong error message(s). Combine with + * -U__OPTIMIZE__ when needed. 
+ */ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ /* \ @@ -543,7 +549,7 @@ struct ftrace_likely_data { prefix ## suffix(); \ } while (0) #else -# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) +# define __compiletime_assert(condition, msg, prefix, suffix) ((void)(condition)) #endif #define _compiletime_assert(condition, msg, prefix, suffix) \ diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 5466c96a33db..2ddb256187b5 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -82,11 +82,11 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern bool cpuset_node_allowed(int node, gfp_t gfp_mask); +extern bool cpuset_current_node_allowed(int node, gfp_t gfp_mask); static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return cpuset_node_allowed(zone_to_nid(z), gfp_mask); + return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask); } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) @@ -173,6 +173,7 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } +extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -293,6 +294,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } +static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +{ + return true; +} #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 44305336314e..d35726d6a415 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -34,7 +34,12 @@ static inline void arch_kexec_protect_crashkres(void) { } static inline void arch_kexec_unprotect_crashkres(void) { } #endif - +#ifdef CONFIG_CRASH_DM_CRYPT +int crash_load_dm_crypt_keys(struct kimage *image); +ssize_t dm_crypt_keys_read(char *buf, size_t count, u64 *ppos); +#else +static inline int crash_load_dm_crypt_keys(struct kimage *image) {return 0; } +#endif #ifndef arch_crash_handle_hotplug_event static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { } diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 2f2555e6407c..dd6fc3b2133b 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -15,6 +15,8 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; +extern unsigned long long dm_crypt_keys_addr; + #ifdef CONFIG_CRASH_DUMP extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); diff --git a/include/linux/damon.h b/include/linux/damon.h index 47e36e6ea203..a4011726cb3b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -145,6 +145,8 @@ enum damos_action { * * @DAMOS_QUOTA_USER_INPUT: User-input value. * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. + * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. + * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. 
@@ -152,6 +154,8 @@ enum damos_action { enum damos_quota_goal_metric { DAMOS_QUOTA_USER_INPUT, DAMOS_QUOTA_SOME_MEM_PSI_US, + DAMOS_QUOTA_NODE_MEM_USED_BP, + DAMOS_QUOTA_NODE_MEM_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -161,6 +165,7 @@ enum damos_quota_goal_metric { * @target_value: Target value of @metric to achieve with the tuning. * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI + * @nid: Node id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -179,6 +184,7 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; + int nid; }; struct list_head list; }; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c24f8bc01045..5206d63b3386 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -78,6 +78,7 @@ enum stop_cp_reason { STOP_CP_REASON_UPDATE_INODE, STOP_CP_REASON_FLUSH_FAIL, STOP_CP_REASON_NO_SEGMENT, + STOP_CP_REASON_CORRUPTED_FREE_BITMAP, STOP_CP_REASON_MAX, }; diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index 9b85a3f028d1..a8a17eeb7d90 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -8,6 +8,7 @@ #include <linux/bitfield.h> #include <linux/errno.h> +#include <linux/scmi_imx_protocol.h> #include <linux/types.h> #define SCMI_IMX_CTRL_PDM_CLK_SEL 0 /* AON PDM clock sel */ @@ -20,4 +21,22 @@ int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); +int scmi_imx_cpu_start(u32 cpuid, bool start); +int scmi_imx_cpu_started(u32 cpuid, bool *started); +int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, bool boot, + bool resume); + +enum scmi_imx_lmm_op { + SCMI_IMX_LMM_BOOT, + SCMI_IMX_LMM_POWER_ON, + SCMI_IMX_LMM_SHUTDOWN, +}; + +/* For shutdown operation */ +#define SCMI_IMX_LMM_OP_FORCEFUL 0 +#define SCMI_IMX_LMM_OP_GRACEFUL BIT(0) + +int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags); +int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info); +int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector); #endif diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index 76255b5d06b1..f628bf1862c2 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -11,6 +11,7 @@ #include <linux/types.h> struct acpm_handle; +struct device_node; struct acpm_pmic_ops { int (*read_reg)(const struct acpm_handle *handle, @@ -44,6 +45,7 @@ struct acpm_handle { struct device; -const struct acpm_handle *devm_acpm_get_by_phandle(struct device *dev, - const char *property); +const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np); + #endif /* __EXYNOS_ACPM_PROTOCOL_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0db87f8e676c..da86fcb11882 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1250,7 +1250,6 @@ extern int send_sigurg(struct file *file); /* These sb flags are internal to the kernel */ #define SB_DEAD BIT(21) #define SB_DYING BIT(24) -#define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) #define SB_BORN BIT(29) @@ -2190,6 +2189,7 @@ struct file_operations { int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); + int 
(*mmap_prepare)(struct vm_area_desc *); } __randomize_layout; /* Supports async buffered reads */ @@ -2259,11 +2259,35 @@ struct inode_operations { struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; +/* Did the driver provide valid mmap hook configuration? */ +static inline bool file_has_valid_mmap_hooks(struct file *file) +{ + bool has_mmap = file->f_op->mmap; + bool has_mmap_prepare = file->f_op->mmap_prepare; + + /* Hooks are mutually exclusive. */ + if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) + return false; + if (!has_mmap && !has_mmap_prepare) + return false; + + return true; +} + static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { + if (WARN_ON_ONCE(file->f_op->mmap_prepare)) + return -EINVAL; + return file->f_op->mmap(file, vma); } +static inline int __call_mmap_prepare(struct file *file, + struct vm_area_desc *desc) +{ + return file->f_op->mmap_prepare(desc); +} + extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 95851a6fb942..a4d3816d252a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -328,6 +328,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * DIRECT - Used by the direct ftrace_ops helper for direct functions * (internal ftrace only, should not be used by others) * SUBOP - Is controlled by another op in field managed. + * GRAPH - Is a component of the fgraph_ops structure */ enum { FTRACE_OPS_FL_ENABLED = BIT(0), @@ -349,6 +350,7 @@ enum { FTRACE_OPS_FL_PERMANENT = BIT(16), FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), + FTRACE_OPS_FL_GRAPH = BIT(19), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c9fa6309c903..be160e8d8bcb 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -45,13 +45,13 @@ static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. * All GFP_* flags including GFP_NOWAIT use one or both flags. - * try_alloc_pages() is the only API that doesn't specify either flag. + * alloc_pages_nolock() is the only API that doesn't specify either flag. * * This is stronger than GFP_NOWAIT or GFP_ATOMIC because * those are guaranteed to never block on a sleeping lock. * Here we are enforcing that the allocation doesn't ever spin * on any locks (i.e. only trylocks). There is no high level - * GFP_$FOO flag for this use in try_alloc_pages() as the + * GFP_$FOO flag for this use in alloc_pages_nolock() as the * regular page allocator doesn't fully support this * allocation mode. */ @@ -354,8 +354,8 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -struct page *try_alloc_pages_noprof(int nid, unsigned int order); -#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__)) +struct page *alloc_pages_nolock_noprof(int nid, unsigned int order); +#define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) 
alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h index d2a9fc96424b..af5fb4ad77eb 100644 --- a/include/linux/habanalabs/hl_boot_if.h +++ b/include/linux/habanalabs/hl_boot_if.h @@ -295,7 +295,7 @@ enum cpu_boot_dev_sts { * Initialized in: linux * * CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN GIC access permission only from - * previleged entity. FW sets this status + * privileged entity. FW sets this status * bit for host. If this bit is set then * GIC can not be accessed from host. * Initialized in: linux diff --git a/include/linux/highmem.h b/include/linux/highmem.h index c698f8415675..e48d7f27b0b9 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -404,6 +404,33 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off, kunmap_local(dst); } +static inline void memcpy_folio(struct folio *dst_folio, size_t dst_off, + struct folio *src_folio, size_t src_off, size_t len) +{ + VM_BUG_ON(dst_off + len > folio_size(dst_folio)); + VM_BUG_ON(src_off + len > folio_size(src_folio)); + + do { + char *dst = kmap_local_folio(dst_folio, dst_off); + const char *src = kmap_local_folio(src_folio, src_off); + size_t chunk = len; + + if (folio_test_highmem(dst_folio) && + chunk > PAGE_SIZE - offset_in_page(dst_off)) + chunk = PAGE_SIZE - offset_in_page(dst_off); + if (folio_test_highmem(src_folio) && + chunk > PAGE_SIZE - offset_in_page(src_off)) + chunk = PAGE_SIZE - offset_in_page(src_off); + memcpy(dst, src, chunk); + kunmap_local(src); + kunmap_local(dst); + + dst_off += chunk; + src_off += chunk; + len -= chunk; + } while (len > 0); +} + static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { diff --git a/include/linux/hmm-dma.h b/include/linux/hmm-dma.h new file mode 100644 index 000000000000..f58b9fc71999 --- /dev/null +++ b/include/linux/hmm-dma.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */ +#ifndef LINUX_HMM_DMA_H +#define LINUX_HMM_DMA_H + +#include <linux/dma-mapping.h> + +struct dma_iova_state; +struct pci_p2pdma_map_state; + +/* + * struct hmm_dma_map - array of PFNs and DMA addresses + * + * @state: DMA IOVA state + * @pfn_list: array of PFNs + * @dma_list: array of DMA addresses + * @dma_entry_size: size of each DMA entry in the array + */ +struct hmm_dma_map { + struct dma_iova_state state; + unsigned long *pfn_list; + dma_addr_t *dma_list; + size_t dma_entry_size; +}; + +int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, + size_t nr_entries, size_t dma_entry_size); +void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map); +dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, + size_t idx, + struct pci_p2pdma_map_state *p2pdma_state); +bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx); +#endif /* LINUX_HMM_DMA_H */ diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 126a36571667..db75ffc949a7 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -23,6 +23,10 @@ struct mmu_interval_notifier; * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) * HMM_PFN_ERROR - accessing the pfn is impossible and the device should * fail. 
ie poisoned memory, special pages, no vma, etc + * HMM_PFN_P2PDMA - P2P page + * HMM_PFN_P2PDMA_BUS - Bus mapped P2P transfer + * HMM_PFN_DMA_MAPPED - Flag preserved on input-to-output transformation + * to mark that page is already DMA mapped * * On input: * 0 - Return the current state of the page, do not fault it. @@ -36,13 +40,21 @@ enum hmm_pfn_flags { HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8), + /* + * Sticky flags, carried from input to output, + * don't forget to update HMM_PFN_INOUT_FLAGS + */ + HMM_PFN_DMA_MAPPED = 1UL << (BITS_PER_LONG - 4), + HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5), + HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6), + + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11), /* Input flags */ HMM_PFN_REQ_FAULT = HMM_PFN_VALID, HMM_PFN_REQ_WRITE = HMM_PFN_WRITE, - HMM_PFN_FLAGS = 0xFFUL << HMM_PFN_ORDER_SHIFT, + HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1), }; /* @@ -58,6 +70,14 @@ static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn) } /* + * hmm_pfn_to_phys() - return physical address pointed to by a device entry + */ +static inline phys_addr_t hmm_pfn_to_phys(unsigned long hmm_pfn) +{ + return __pfn_to_phys(hmm_pfn & ~HMM_PFN_FLAGS); +} + +/* * hmm_pfn_to_map_order() - return the CPU mapping size order * * This is optionally useful to optimize processing of the pfn result diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e893d546a49f..2f190c90192d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -395,7 +395,7 @@ static inline int split_huge_page(struct page *page) void deferred_split_folio(struct folio *folio, bool partially_mapped); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct folio *folio); + unsigned long address, bool freeze); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ @@ -403,12 +403,11 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ - false, NULL); \ + false); \ } while (0) - void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, - bool freeze, struct folio *folio); + bool freeze); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); @@ -495,15 +494,13 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); -#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) - static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, bool freeze, struct folio *folio); + pmd_t *pmd, bool freeze); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); @@ -578,12 +575,12 @@ static inline void deferred_split_folio(struct folio *folio, bool partially_mapp do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct folio *folio) {} + unsigned long address, bool freeze) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address, bool freeze, struct folio *folio) {} + unsigned long address, bool freeze) 
{} static inline void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, - bool freeze, struct folio *folio) {} + bool freeze) {} static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4861a7e304bb..0598f36931de 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -14,6 +14,7 @@ #include <linux/pgtable.h> #include <linux/gfp.h> #include <linux/userfaultfd_k.h> +#include <linux/nodemask.h> struct ctl_table; struct user_struct; @@ -128,12 +129,12 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); void unmap_hugepage_range(struct vm_area_struct *, - unsigned long, unsigned long, struct page *, - zap_flags_t); + unsigned long start, unsigned long end, + struct folio *, zap_flags_t); void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page, zap_flags_t zap_flags); + struct folio *, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo_node(int nid); @@ -176,6 +177,8 @@ extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); bool hugetlb_bootmem_allocated(void); +extern nodemask_t hugetlb_bootmem_nodes; +void hugetlb_bootmem_set_nodes(void); /* arch callbacks */ @@ -453,7 +456,7 @@ static inline long hugetlb_change_protection( static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page, + unsigned long end, struct folio *folio, zap_flags_t zap_flags) { BUG(); } @@ -700,7 +703,7 @@ struct huge_bootmem_page { bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, @@ -1088,7 +1091,7 @@ static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, return NULL; } -static inline int isolate_or_dissolve_huge_page(struct page *page, +static inline int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { return -ENOMEM; } diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h new file mode 100644 index 000000000000..1bc2b3244613 --- /dev/null +++ b/include/linux/hung_task.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Detect Hung Task: detecting tasks stuck in D state + * + * Copyright (C) 2025 Tongcheng Travel (www.ly.com) + * Author: Lance Yang <mingzhe.yang@ly.com> + */ +#ifndef __LINUX_HUNG_TASK_H +#define __LINUX_HUNG_TASK_H + +#include <linux/bug.h> +#include <linux/sched.h> +#include <linux/compiler.h> + +/* + * @blocker: Combines lock address and blocking type. + * + * Since lock pointers are at least 4-byte aligned (32-bit) or 8-byte + * aligned (64-bit), the 2 least significant bits (LSBs) of the pointer + * are always zero. So we can use these bits to encode the specific blocking + * type. 
+ * + * Type encoding: + * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) + * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) + * 10 - Blocked on rt-mutex (BLOCKER_TYPE_RTMUTEX) + * 11 - Blocked on rw-semaphore (BLOCKER_TYPE_RWSEM) + */ +#define BLOCKER_TYPE_MUTEX 0x00UL +#define BLOCKER_TYPE_SEM 0x01UL +#define BLOCKER_TYPE_RTMUTEX 0x02UL +#define BLOCKER_TYPE_RWSEM 0x03UL + +#define BLOCKER_TYPE_MASK 0x03UL + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_set_blocker(void *lock, unsigned long type) +{ + unsigned long lock_ptr = (unsigned long)lock; + + WARN_ON_ONCE(!lock_ptr); + WARN_ON_ONCE(READ_ONCE(current->blocker)); + + /* + * If the lock pointer matches the BLOCKER_TYPE_MASK, return + * without writing anything. + */ + if (WARN_ON_ONCE(lock_ptr & BLOCKER_TYPE_MASK)) + return; + + WRITE_ONCE(current->blocker, lock_ptr | type); +} + +static inline void hung_task_clear_blocker(void) +{ + WARN_ON_ONCE(!READ_ONCE(current->blocker)); + + WRITE_ONCE(current->blocker, 0UL); +} + +/* + * hung_task_get_blocker_type - Extracts blocker type from encoded blocker + * address. + * + * @blocker: Blocker pointer with encoded type (via LSB bits) + * + * Returns: BLOCKER_TYPE_MUTEX, BLOCKER_TYPE_SEM, etc. + */ +static inline unsigned long hung_task_get_blocker_type(unsigned long blocker) +{ + WARN_ON_ONCE(!blocker); + + return blocker & BLOCKER_TYPE_MASK; +} + +static inline void *hung_task_blocker_to_lock(unsigned long blocker) +{ + WARN_ON_ONCE(!blocker); + + return (void *)(blocker & ~BLOCKER_TYPE_MASK); +} +#else +static inline void hung_task_set_blocker(void *lock, unsigned long type) +{ +} +static inline void hung_task_clear_blocker(void) +{ +} +static inline unsigned long hung_task_get_blocker_type(unsigned long blocker) +{ + return 0UL; +} +static inline void *hung_task_blocker_to_lock(unsigned long blocker) +{ + return NULL; +} +#endif + +#endif /* __LINUX_HUNG_TASK_H */ diff --git a/include/linux/i2c-atr.h b/include/linux/i2c-atr.h index 4d5da161c225..2bb54dc87c8e 100644 --- a/include/linux/i2c-atr.h +++ b/include/linux/i2c-atr.h @@ -19,21 +19,59 @@ struct fwnode_handle; struct i2c_atr; /** + * enum i2c_atr_flags - Flags for an I2C ATR driver + * + * @I2C_ATR_F_STATIC: ATR does not support dynamic mapping, use static mapping. + * Mappings will only be added or removed as a result of + * devices being added or removed from a child bus. + * The ATR pool will have to be big enough to accommodate all + * devices expected to be added to the child buses. + * @I2C_ATR_F_PASSTHROUGH: Allow unmapped incoming addresses to pass through +*/ +enum i2c_atr_flags { + I2C_ATR_F_STATIC = BIT(0), + I2C_ATR_F_PASSTHROUGH = BIT(1), +}; + +/** * struct i2c_atr_ops - Callbacks from ATR to the device driver. - * @attach_client: Notify the driver of a new device connected on a child - * bus, with the alias assigned to it. The driver must - * configure the hardware to use the alias. - * @detach_client: Notify the driver of a device getting disconnected. The - * driver must configure the hardware to stop using the - * alias. + * @attach_addr: Notify the driver of a new device connected on a child + * bus, with the alias assigned to it. The driver must + * configure the hardware to use the alias. + * @detach_addr: Notify the driver of a device getting disconnected. The + * driver must configure the hardware to stop using the + * alias. * * All these functions return 0 on success, a negative error code otherwise. 
*/ struct i2c_atr_ops { - int (*attach_client)(struct i2c_atr *atr, u32 chan_id, - const struct i2c_client *client, u16 alias); - void (*detach_client)(struct i2c_atr *atr, u32 chan_id, - const struct i2c_client *client); + int (*attach_addr)(struct i2c_atr *atr, u32 chan_id, + u16 addr, u16 alias); + void (*detach_addr)(struct i2c_atr *atr, u32 chan_id, + u16 addr); +}; + +/** + * struct i2c_atr_adap_desc - An ATR downstream bus descriptor + * @chan_id: Index of the new adapter (0 .. max_adapters-1). This value is + * passed to the callbacks in `struct i2c_atr_ops`. + * @parent: The device used as the parent of the new i2c adapter, or NULL + * to use the i2c-atr device as the parent. + * @bus_handle: The fwnode handle that points to the adapter's i2c + * peripherals, or NULL. + * @num_aliases: The number of aliases in this adapter's private alias pool. Set + * to zero if this adapter uses the ATR's global alias pool. + * @aliases: An optional array of private aliases used by the adapter + * instead of the ATR's global pool of aliases. Must contain + * exactly num_aliases entries if num_aliases > 0, is ignored + * otherwise. + */ +struct i2c_atr_adap_desc { + u32 chan_id; + struct device *parent; + struct fwnode_handle *bus_handle; + size_t num_aliases; + u16 *aliases; }; /** @@ -42,6 +80,7 @@ struct i2c_atr_ops { * @dev: The device acting as an ATR * @ops: Driver-specific callbacks * @max_adapters: Maximum number of child adapters + * @flags: Flags for ATR * * The new ATR helper is connected to the parent adapter but has no child * adapters. Call i2c_atr_add_adapter() to add some. @@ -51,7 +90,8 @@ struct i2c_atr_ops { * Return: pointer to the new ATR helper object, or ERR_PTR */ struct i2c_atr *i2c_atr_new(struct i2c_adapter *parent, struct device *dev, - const struct i2c_atr_ops *ops, int max_adapters); + const struct i2c_atr_ops *ops, int max_adapters, + u32 flags); /** * i2c_atr_delete - Delete an I2C ATR helper. @@ -65,12 +105,7 @@ void i2c_atr_delete(struct i2c_atr *atr); /** * i2c_atr_add_adapter - Create a child ("downstream") I2C bus. * @atr: The I2C ATR - * @chan_id: Index of the new adapter (0 .. max_adapters-1). This value is - * passed to the callbacks in `struct i2c_atr_ops`. - * @adapter_parent: The device used as the parent of the new i2c adapter, or NULL - * to use the i2c-atr device as the parent. - * @bus_handle: The fwnode handle that points to the adapter's i2c - * peripherals, or NULL. + * @desc: An ATR adapter descriptor * * After calling this function a new i2c bus will appear. Adding and removing * devices on the downstream bus will result in calls to the @@ -85,9 +120,7 @@ void i2c_atr_delete(struct i2c_atr *atr); * * Return: 0 on success, a negative error code otherwise. 
*/ -int i2c_atr_add_adapter(struct i2c_atr *atr, u32 chan_id, - struct device *adapter_parent, - struct fwnode_handle *bus_handle); +int i2c_atr_add_adapter(struct i2c_atr *atr, struct i2c_atr_adap_desc *desc); /** * i2c_atr_del_adapter - Remove a child ("downstream") I2C bus added by diff --git a/include/linux/i2c-smbus.h b/include/linux/i2c-smbus.h index ced1c6ead52a..dc1bd2ab4c13 100644 --- a/include/linux/i2c-smbus.h +++ b/include/linux/i2c-smbus.h @@ -44,9 +44,11 @@ static inline void i2c_free_slave_host_notify_device(struct i2c_client *client) #endif #if IS_ENABLED(CONFIG_I2C_SMBUS) && IS_ENABLED(CONFIG_DMI) -void i2c_register_spd(struct i2c_adapter *adap); +void i2c_register_spd_write_disable(struct i2c_adapter *adap); +void i2c_register_spd_write_enable(struct i2c_adapter *adap); #else -static inline void i2c_register_spd(struct i2c_adapter *adap) { } +static inline void i2c_register_spd_write_disable(struct i2c_adapter *adap) { } +static inline void i2c_register_spd_write_enable(struct i2c_adapter *adap) { } #endif #endif /* _LINUX_I2C_SMBUS_H */ diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2e4903b7f7bc..20fd41b51d5c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -405,7 +405,6 @@ static inline bool i2c_detect_slave_mode(struct device *dev) { return false; } * @addr: stored in i2c_client.addr * @dev_name: Overrides the default <busnr>-<addr> dev_name if set * @platform_data: stored in i2c_client.dev.platform_data - * @of_node: pointer to OpenFirmware device node * @fwnode: device node supplied by the platform firmware * @swnode: software node for the device * @resources: resources associated with the device @@ -429,7 +428,6 @@ struct i2c_board_info { unsigned short addr; const char *dev_name; void *platform_data; - struct device_node *of_node; struct fwnode_handle *fwnode; const struct software_node *swnode; const struct resource *resources; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index bba2a51c87d2..138fbd89b1e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -88,6 +88,13 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable. * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits + * + * IO_PGTABLE_QUIRK_NO_WARN: Do not WARN_ON() on conflicting + * mappings, but silently return -EEXIST. Normally an attempt + * to map over an existing mapping would indicate some sort of + * kernel bug, which would justify the WARN_ON(). But for GPU + * drivers, this could be under control of userspace, which + * deserves an error return, but should not spam dmesg. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -97,6 +104,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) #define IO_PGTABLE_QUIRK_ARM_HD BIT(7) #define IO_PGTABLE_QUIRK_ARM_S2FWB BIT(8) + #define IO_PGTABLE_QUIRK_NO_WARN BIT(9) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 15cdadace993..156732807994 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -316,23 +316,6 @@ struct iommu_iort_rmr_data { u32 num_sids; }; -/** - * enum iommu_dev_features - Per device IOMMU features - * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses - * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. 
Generally - * enabling %IOMMU_DEV_FEAT_SVA requires - * %IOMMU_DEV_FEAT_IOPF, but some devices manage I/O Page - * Faults themselves instead of relying on the IOMMU. When - * supported, this feature must be enabled before and - * disabled after %IOMMU_DEV_FEAT_SVA. - * - * Device drivers enable a feature using iommu_dev_enable_feature(). - */ -enum iommu_dev_features { - IOMMU_DEV_FEAT_SVA, - IOMMU_DEV_FEAT_IOPF, -}; - #define IOMMU_NO_PASID (0U) /* Reserved for DMA w/o PASID */ #define IOMMU_FIRST_GLOBAL_PASID (1U) /*starting range for allocation */ #define IOMMU_PASID_INVALID (-1U) @@ -341,6 +324,18 @@ typedef unsigned int ioasid_t; /* Read but do not clear any dirty bits */ #define IOMMU_DIRTY_NO_CLEAR (1 << 0) +/* + * Pages allocated through iommu_alloc_pages_node_sz() can be placed on this + * list using iommu_pages_list_add(). Note: ONLY pages from + * iommu_alloc_pages_node_sz() can be used this way! + */ +struct iommu_pages_list { + struct list_head pages; +}; + +#define IOMMU_PAGES_LIST_INIT(name) \ + ((struct iommu_pages_list){ .pages = LIST_HEAD_INIT(name.pages) }) + #ifdef CONFIG_IOMMU_API /** @@ -363,7 +358,7 @@ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; - struct list_head freelist; + struct iommu_pages_list freelist; bool queued; }; @@ -569,9 +564,10 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * op is allocated in the iommu driver and freed by the caller after * use. The information type is one of enum iommu_hw_info_type defined * in include/uapi/linux/iommufd.h. - * @domain_alloc: allocate and return an iommu domain if success. Otherwise - * NULL is returned. The domain is not fully initialized until - * the caller iommu_domain_alloc() returns. + * @domain_alloc: Do not use in new drivers + * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to + * use identity_domain instead. This should only be used + * if dynamic logic is necessary. * @domain_alloc_paging_flags: Allocate an iommu domain corresponding to the * input parameters as defined in * include/uapi/linux/iommufd.h. The @user_data can be @@ -594,8 +590,6 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * @of_xlate: add OF master IDs to iommu grouping * @is_attach_deferred: Check if domain attach should be deferred from iommu * driver init to device driver init (default no) - * @dev_enable/disable_feat: per device entries to enable/disable - * iommu specific features. 
* @page_response: handle page request response * @def_domain_type: device default domain type, return value: * - IOMMU_DOMAIN_IDENTITY: must use an identity domain @@ -629,7 +623,10 @@ struct iommu_ops { void *(*hw_info)(struct device *dev, u32 *length, u32 *type); /* Domain allocation and freeing by the iommu driver */ +#if IS_ENABLED(CONFIG_FSL_PAMU) struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); +#endif + struct iommu_domain *(*domain_alloc_identity)(struct device *dev); struct iommu_domain *(*domain_alloc_paging_flags)( struct device *dev, u32 flags, const struct iommu_user_data *user_data); @@ -652,9 +649,6 @@ struct iommu_ops { bool (*is_attach_deferred)(struct device *dev); /* Per device IOMMU features */ - int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); - int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - void (*page_response)(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); @@ -750,6 +744,7 @@ struct iommu_domain_ops { * @dev: struct device for sysfs handling * @singleton_group: Used internally for drivers that have only one group * @max_pasids: number of supported PASIDs + * @ready: set once iommu_device_register() has completed successfully */ struct iommu_device { struct list_head list; @@ -758,6 +753,7 @@ struct iommu_device { struct device *dev; struct iommu_group *singleton_group; u32 max_pasids; + bool ready; }; /** @@ -852,7 +848,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) { *gather = (struct iommu_iotlb_gather) { .start = ULONG_MAX, - .freelist = LIST_HEAD_INIT(gather->freelist), + .freelist = IOMMU_PAGES_LIST_INIT(gather->freelist), }; } @@ -1127,9 +1123,6 @@ void dev_iommu_priv_set(struct device *dev, void *priv); extern struct mutex iommu_probe_device_lock; int iommu_probe_device(struct device *dev); -int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); -int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); - int iommu_device_use_default_domain(struct device *dev); void iommu_device_unuse_default_domain(struct device *dev); @@ -1414,18 +1407,6 @@ static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, return -ENODEV; } -static inline int -iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) -{ - return -ENODEV; -} - -static inline int -iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) -{ - return -ENODEV; -} - static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 2f74dd90c271..7da6602eab71 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -93,7 +93,8 @@ struct ipmi_user_hndl { /* * Called when the interface detects a watchdog pre-timeout. If - * this is NULL, it will be ignored for the user. + * this is NULL, it will be ignored for the user. Note that you + * can't do any IPMI calls from here, it's called with locks held. */ void (*ipmi_watchdog_pretimeout)(void *handler_data); @@ -343,4 +344,14 @@ extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); /* Helper function for computing the IPMB checksum of some data. */ unsigned char ipmb_checksum(unsigned char *data, int size); +/* + * For things that must send messages at panic time, like the IPMI watchdog + * driver that extends the reset time on a panic, use this to send messages + * from panic context. 
Note that this puts the driver into a mode that + * only works at panic time, so only use it then. + */ +void ipmi_panic_request_and_wait(struct ipmi_user *user, + struct ipmi_addr *addr, + struct kernel_ipmi_msg *msg); + #endif /* __LINUX_IPMI_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index be2e8c0a187e..1cce1f6410a9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -33,6 +33,7 @@ #include <linux/sprintf.h> #include <linux/static_call_types.h> #include <linux/instruction_pointer.h> +#include <linux/util_macros.h> #include <linux/wordpart.h> #include <asm/byteorder.h> @@ -41,19 +42,6 @@ #define STACK_MAGIC 0xdeadbeef -/* generic data direction definitions */ -#define READ 0 -#define WRITE 1 - -#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) - -#define u64_to_user_ptr(x) ( \ -{ \ - typecheck(u64, (x)); \ - (void __user *)(uintptr_t)(x); \ -} \ -) - struct completion; struct user; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 53ef1b6c8712..03f85ad03025 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -25,6 +25,10 @@ extern note_buf_t __percpu *crash_notes; +#ifdef CONFIG_CRASH_DUMP +#include <linux/prandom.h> +#endif + #ifdef CONFIG_KEXEC_CORE #include <linux/list.h> #include <linux/compat.h> @@ -169,6 +173,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image); * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. * @top_down: Allocate from top of memory. + * @random: Place the buffer at a random position. */ struct kexec_buf { struct kimage *image; @@ -180,8 +185,33 @@ struct kexec_buf { unsigned long buf_min; unsigned long buf_max; bool top_down; +#ifdef CONFIG_CRASH_DUMP + bool random; +#endif }; + +#ifdef CONFIG_CRASH_DUMP +static inline void kexec_random_range_start(unsigned long start, + unsigned long end, + struct kexec_buf *kbuf, + unsigned long *temp_start) +{ + unsigned short i; + + if (kbuf->random) { + get_random_bytes(&i, sizeof(unsigned short)); + *temp_start = start + (end - start) / USHRT_MAX * i; + } +} +#else +static inline void kexec_random_range_start(unsigned long start, + unsigned long end, + struct kexec_buf *kbuf, + unsigned long *temp_start) +{} +#endif + int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf); int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, @@ -374,10 +404,19 @@ struct kimage { bool is_ima_segment_index_set; #endif + struct { + struct kexec_segment *scratch; + phys_addr_t fdt; + } kho; + /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_sz; unsigned long elf_load_addr; + + /* dm crypt keys buffer */ + unsigned long dm_crypt_keys_addr; + unsigned long dm_crypt_keys_sz; }; /* kexec interface functions */ diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h new file mode 100644 index 000000000000..348844cffb13 --- /dev/null +++ b/include/linux/kexec_handover.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_H +#define LINUX_KEXEC_HANDOVER_H + +#include <linux/types.h> +#include <linux/errno.h> + +struct kho_scratch { + phys_addr_t addr; + phys_addr_t size; +}; + +/* KHO Notifier index */ +enum kho_event { + KEXEC_KHO_FINALIZE = 0, + KEXEC_KHO_ABORT = 1, +}; + +struct folio; +struct notifier_block; + +#define DECLARE_KHOSER_PTR(name, type) \ + union { \ + phys_addr_t phys; \ + type ptr; \ + } name +#define KHOSER_STORE_PTR(dest, val) \ + ({ 
\ + typeof(val) v = val; \ + typecheck(typeof((dest).ptr), v); \ + (dest).phys = virt_to_phys(v); \ + }) +#define KHOSER_LOAD_PTR(src) \ + ({ \ + typeof(src) s = src; \ + (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \ + }) + +struct kho_serialization; + +#ifdef CONFIG_KEXEC_HANDOVER +bool kho_is_enabled(void); + +int kho_preserve_folio(struct folio *folio); +int kho_preserve_phys(phys_addr_t phys, size_t size); +struct folio *kho_restore_folio(phys_addr_t phys); +int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); +int kho_retrieve_subtree(const char *name, phys_addr_t *phys); + +int register_kho_notifier(struct notifier_block *nb); +int unregister_kho_notifier(struct notifier_block *nb); + +void kho_memory_init(void); + +void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, + u64 scratch_len); +#else +static inline bool kho_is_enabled(void) +{ + return false; +} + +static inline int kho_preserve_folio(struct folio *folio) +{ + return -EOPNOTSUPP; +} + +static inline int kho_preserve_phys(phys_addr_t phys, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline struct folio *kho_restore_folio(phys_addr_t phys) +{ + return NULL; +} + +static inline int kho_add_subtree(struct kho_serialization *ser, + const char *name, void *fdt) +{ + return -EOPNOTSUPP; +} + +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +{ + return -EOPNOTSUPP; +} + +static inline int register_kho_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline int unregister_kho_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline void kho_memory_init(void) +{ +} + +static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, + phys_addr_t scratch_phys, u64 scratch_len) +{ +} +#endif /* CONFIG_KEXEC_HANDOVER */ + +#endif /* LINUX_KEXEC_HANDOVER_H */ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 1f46046080f5..b8d69cfbb58b 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,16 +15,8 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); -#ifdef CONFIG_SHMEM extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); -#else -static inline int collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr, bool install_pmd) -{ - return 0; -} -#endif static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { diff --git a/include/linux/list.h b/include/linux/list.h index 29a375889fb8..e7e28afd28f8 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -50,9 +50,9 @@ static inline void INIT_LIST_HEAD(struct list_head *list) * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ -extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, - struct list_head *prev, - struct list_head *next); +bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, + struct list_head *prev, + struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a @@ -93,7 +93,7 @@ static __always_inline bool __list_add_valid(struct list_head *new, * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. 
*/ -extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); +bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a diff --git a/include/linux/llist.h b/include/linux/llist.h index 2c982ff7475a..27b17f64bcee 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -223,9 +223,26 @@ static inline struct llist_node *llist_next(struct llist_node *node) return node->next; } -extern bool llist_add_batch(struct llist_node *new_first, - struct llist_node *new_last, - struct llist_head *head); +/** + * llist_add_batch - add several linked entries in batch + * @new_first: first entry in batch to be added + * @new_last: last entry in batch to be added + * @head: the head for your lock-less list + * + * Return whether list is empty before adding. + */ +static inline bool llist_add_batch(struct llist_node *new_first, + struct llist_node *new_last, + struct llist_head *head) +{ + struct llist_node *first = READ_ONCE(head->first); + + do { + new_last->next = first; + } while (!try_cmpxchg(&head->first, &first, new_first)); + + return !first; +} static inline bool __llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index cbbcd18d4186..9ef129038224 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -463,6 +463,8 @@ struct ma_wr_state { void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ + unsigned char vacant_height; /* Height of lowest node with free space */ + unsigned char sufficient_height;/* Height of lowest node with min sufficiency + 1 nodes */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) @@ -498,6 +500,8 @@ struct ma_wr_state { .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ + .vacant_height = 0, \ + .sufficient_height = 0 \ } #define MA_TOPIARY(name, tree) \ diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ef5a1ecc6e59..bb19a2534224 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -42,6 +42,14 @@ extern unsigned long long max_possible_pfn; * kernel resource tree. * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are * not initialized (only for reserved regions). + * @MEMBLOCK_RSRV_KERN: memory region that is reserved for kernel use, + * either explicitly with memblock_reserve_kern() or via memblock + * allocation APIs. All memblock allocations set this flag. + * @MEMBLOCK_KHO_SCRATCH: memory region that kexec can pass to the next + * kernel in handover mode. During early boot, we do not know about all + * memory reservations yet, so we get scratch memory from the previous + * kernel that we know is good to use. It is the only memory that + * allocations may happen from in this phase. 
*/ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ @@ -50,6 +58,8 @@ enum memblock_flags { MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */ + MEMBLOCK_RSRV_KERN = 0x20, /* memory reserved for kernel use */ + MEMBLOCK_KHO_SCRATCH = 0x40, /* scratch memory for kexec handover */ }; /** @@ -116,7 +126,19 @@ int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid, int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_phys_free(phys_addr_t base, phys_addr_t size); -int memblock_reserve(phys_addr_t base, phys_addr_t size); +int __memblock_reserve(phys_addr_t base, phys_addr_t size, int nid, + enum memblock_flags flags); + +static __always_inline int memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return __memblock_reserve(base, size, NUMA_NO_NODE, 0); +} + +static __always_inline int memblock_reserve_kern(phys_addr_t base, phys_addr_t size) +{ + return __memblock_reserve(base, size, NUMA_NO_NODE, MEMBLOCK_RSRV_KERN); +} + #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP int memblock_physmem_add(phys_addr_t base, phys_addr_t size); #endif @@ -132,6 +154,8 @@ int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); +int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size); +int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); @@ -275,6 +299,11 @@ static inline bool memblock_is_driver_managed(struct memblock_region *m) return m->flags & MEMBLOCK_DRIVER_MANAGED; } +static inline bool memblock_is_kho_scratch(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_KHO_SCRATCH; +} + int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -476,6 +505,7 @@ static inline __init_memblock bool memblock_bottom_up(void) phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); +phys_addr_t memblock_reserved_kern_size(phys_addr_t limit, int nid); unsigned long memblock_estimated_nr_free_pages(void); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); @@ -602,5 +632,14 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } static inline void memtest_report_meminfo(struct seq_file *m) { } #endif +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +void memblock_set_kho_scratch_only(void); +void memblock_clear_kho_scratch_only(void); +void memmap_init_kho_scratch_pages(void); +#else +static inline void memblock_set_kho_scratch_only(void) { } +static inline void memblock_clear_kho_scratch_only(void) { } +static inline void memmap_init_kho_scratch_pages(void) {} +#endif #endif /* _LINUX_MEMBLOCK_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 53364526d877..f7848f73f41c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -903,19 +903,9 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); -void __mod_memcg_state(struct mem_cgroup 
*memcg, enum memcg_stat_item idx, - int val); - /* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_memcg_state(memcg, idx, val); - local_irq_restore(flags); -} +void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val); static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) @@ -952,19 +942,8 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, local_irq_restore(flags); } -void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, - unsigned long count); - -static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ - unsigned long flags; - - local_irq_save(flags); - __count_memcg_events(memcg, idx, count); - local_irq_restore(flags); -} +void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count); static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) @@ -1057,6 +1036,7 @@ static inline u64 cgroup_id_from_mm(struct mm_struct *mm) return id; } +extern int mem_cgroup_init(void); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1374,12 +1354,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } -static inline void __mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, - int nr) -{ -} - static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) @@ -1433,12 +1407,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, } static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ -} - -static inline void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { @@ -1472,6 +1440,8 @@ static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { return 0; } + +static inline int mem_cgroup_init(void) { return 0; } #endif /* CONFIG_MEMCG */ /* @@ -1736,6 +1706,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, rcu_read_unlock(); } +bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1793,6 +1765,15 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, { } +static inline ino_t page_cgroup_ino(struct page *page) +{ + return 0; +} + +static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) +{ + return true; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/include/linux/memory.h b/include/linux/memory.h index 12daa6ec7d09..5ec4e6d209b9 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -149,6 +149,14 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) { return 0; } +static inline int memory_block_advise_max_size(unsigned long size) +{ + return -ENODEV; +} +static inline unsigned long memory_block_advised_max_size(void) +{ + return 0; +} #else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); @@ -181,6 +189,8 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, void memory_block_add_nid(struct memory_block *mem, int nid, enum meminit_context context); 
#endif /* CONFIG_NUMA */ +int memory_block_advise_max_size(unsigned long size); +unsigned long memory_block_advised_max_size(void); #endif /* CONFIG_MEMORY_HOTPLUG */ /* diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ce9885e0178a..0fe96f3ab3ef 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> +#include <linux/node.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <uapi/linux/mempolicy.h> @@ -178,6 +179,9 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); +extern int mempolicy_set_node_perf(unsigned int node, + struct access_coordinate *coords); + #else struct mempolicy {}; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d1dfbad9a447..e6ba8f4f4bd1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -398,6 +398,7 @@ struct mlx5_core_rsc_common { enum mlx5_res_type res; refcount_t refcount; struct completion free; + bool invalid; }; struct mlx5_uars_page { diff --git a/include/linux/mm.h b/include/linux/mm.h index e51dba8398f7..9e221ffcb868 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -12,6 +12,7 @@ #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> +#include <linux/compiler.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> @@ -356,9 +357,7 @@ extern unsigned int kobjsize(const void *objp); # define VM_SHADOW_STACK VM_NONE #endif -#if defined(CONFIG_X86) -# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ -#elif defined(CONFIG_PPC64) +#if defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 @@ -670,204 +669,11 @@ static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_PER_VMA_LOCK -static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - static struct lock_class_key lockdep_key; - - lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); -#endif - if (reset_refcnt) - refcount_set(&vma->vm_refcnt, 0); - vma->vm_lock_seq = UINT_MAX; -} - -static inline bool is_vma_writer_only(int refcnt) -{ - /* - * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma - * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on - * a detached vma happens only in vma_mark_detached() and is a rare - * case, therefore most of the time there will be no unnecessary wakeup. - */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; -} - -static inline void vma_refcount_put(struct vm_area_struct *vma) -{ - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ - struct mm_struct *mm = vma->vm_mm; - int oldcnt; - - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); - if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { - - if (is_vma_writer_only(oldcnt - 1)) - rcuwait_wake_up(&mm->vma_writer_wait); - } -} - -/* - * Try to read-lock a vma. The function is allowed to occasionally yield false - * locked result to avoid performance overhead, in which case we fall back to - * using mmap_lock. The function should never yield false unlocked result. 
- * False locked result is possible if mm_lock_seq overflows or if vma gets - * reused and attached to a different mm before we lock it. - * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got - * detached. - */ -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - int oldcnt; - - /* - * Check before locking. A race might cause false locked result. - * We can use READ_ONCE() for the mm_lock_seq here, and don't need - * ACQUIRE semantics, because this is just a lockless check whose result - * we don't rely on for anything - the mm_lock_seq read against which we - * need ordering is below. - */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) - return NULL; - - /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. - * Acquire fence is required here to avoid reordering against later - * vm_lock_seq check and checks inside lock_vma_under_rcu(). - */ - if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { - /* return EAGAIN if vma got detached from under us */ - return oldcnt ? NULL : ERR_PTR(-EAGAIN); - } - - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - /* - * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. - * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq - * modification invalidates all existing locks. - * - * We must use ACQUIRE semantics for the mm_lock_seq so that if we are - * racing with vma_end_write_all(), we only start reading from the VMA - * after it has been unlocked. - * This pairs with RELEASE semantics in vma_end_write_all(). - */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { - vma_refcount_put(vma); - return NULL; - } - - return vma; -} - /* - * Use only while holding mmap read lock which guarantees that locking will not - * fail (nobody can concurrently write-lock the vma). vma_start_read() should - * not be used in such cases because it might fail due to mm_lock_seq overflow. - * This functionality is used to obtain vma read lock and drop the mmap read lock. + * These must be here rather than mmap_lock.h as dependent on vm_fault type, + * declared in this header. */ -static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) -{ - int oldcnt; - - mmap_assert_locked(vma->vm_mm); - if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) - return false; - - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - return true; -} - -/* - * Use only while holding mmap read lock which guarantees that locking will not - * fail (nobody can concurrently write-lock the vma). vma_start_read() should - * not be used in such cases because it might fail due to mm_lock_seq overflow. - * This functionality is used to obtain vma read lock and drop the mmap read lock. - */ -static inline bool vma_start_read_locked(struct vm_area_struct *vma) -{ - return vma_start_read_locked_nested(vma, 0); -} - -static inline void vma_end_read(struct vm_area_struct *vma) -{ - vma_refcount_put(vma); -} - -/* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) -{ - mmap_assert_write_locked(vma->vm_mm); - - /* - * current task is holding mmap_write_lock, both vma->vm_lock_seq and - * mm->mm_lock_seq can't be concurrently modified. - */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; - return (vma->vm_lock_seq == *mm_lock_seq); -} - -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); - -/* - * Begin writing to a VMA. - * Exclude concurrent readers under the per-VMA lock until the currently - * write-locked mmap_lock is dropped or downgraded. - */ -static inline void vma_start_write(struct vm_area_struct *vma) -{ - unsigned int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) - return; - - __vma_start_write(vma, mm_lock_seq); -} - -static inline void vma_assert_write_locked(struct vm_area_struct *vma) -{ - unsigned int mm_lock_seq; - - VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); -} - -static inline void vma_assert_locked(struct vm_area_struct *vma) -{ - unsigned int mm_lock_seq; - - VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && - !__is_vma_write_locked(vma, &mm_lock_seq), vma); -} - -/* - * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these - * assertions should be made either under mmap_write_lock or when the object - * has been isolated under mmap_write_lock, ensuring no competing writers. - */ -static inline void vma_assert_attached(struct vm_area_struct *vma) -{ - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); -} - -static inline void vma_assert_detached(struct vm_area_struct *vma) -{ - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); -} - -static inline void vma_mark_attached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_detached(vma); - refcount_set_release(&vma->vm_refcnt, 1); -} - -void vma_mark_detached(struct vm_area_struct *vma); - +#ifdef CONFIG_PER_VMA_LOCK static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) @@ -883,36 +689,7 @@ static inline void assert_fault_locked(struct vm_fault *vmf) else mmap_assert_locked(vmf->vma->vm_mm); } - -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address); - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } -static inline void vma_end_read(struct vm_area_struct *vma) {} -static inline void vma_start_write(struct vm_area_struct *vma) {} -static inline void vma_assert_write_locked(struct vm_area_struct *vma) - { mmap_assert_write_locked(vma->vm_mm); } -static inline void vma_assert_attached(struct vm_area_struct *vma) {} -static inline void vma_assert_detached(struct vm_area_struct *vma) {} -static inline void vma_mark_attached(struct vm_area_struct *vma) {} -static inline void vma_mark_detached(struct vm_area_struct *vma) {} - -static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address) -{ - return NULL; -} - -static inline void vma_assert_locked(struct vm_area_struct *vma) -{ - mmap_assert_locked(vma->vm_mm); -} - +#else static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); @@ -922,7 +699,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf) { 
mmap_assert_locked(vmf->vma->vm_mm); } - #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; @@ -1459,7 +1235,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); @@ -2004,6 +1780,45 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } +#ifdef CONFIG_MMU +static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +{ + return pfn_pte(page_to_pfn(page), pgprot); +} + +/** + * folio_mk_pte - Create a PTE for this folio + * @folio: The folio to create a PTE for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_ptes(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) +{ + return pfn_pte(folio_pfn(folio), pgprot); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/** + * folio_mk_pmd - Create a PMD for this folio + * @folio: The folio to create a PMD for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_pmd_at(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) +{ + return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); +} +#endif +#endif /* CONFIG_MMU */ + static inline bool folio_has_pincount(const struct folio *folio) { if (IS_ENABLED(CONFIG_64BIT)) @@ -2185,15 +2000,6 @@ static inline long compound_nr(struct page *page) } /** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline long thp_nr_pages(struct page *page) -{ - return folio_nr_pages((struct folio *)page); -} - -/** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. * @@ -2303,7 +2109,62 @@ static inline bool folio_maybe_mapped_shared(struct folio *folio) */ if (mapcount <= 1) return false; - return folio_test_large_maybe_mapped_shared(folio); + return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); +} + +/** + * folio_expected_ref_count - calculate the expected folio refcount + * @folio: the folio + * + * Calculate the expected folio refcount, taking references from the pagecache, + * swapcache, PG_private and page table mappings into account. Useful in + * combination with folio_ref_count() to detect unexpected references (e.g., + * GUP or other temporary references). + * + * Does currently not consider references from the LRU cache. If the folio + * was isolated from the LRU (which is the case during migration or split), + * the LRU cache does not apply. + * + * Calling this function on an unmapped folio -- !folio_mapped() -- that is + * locked will return a stable result. + * + * Calling this function on a mapped folio will not result in a stable result, + * because nothing stops additional page table mappings from coming (e.g., + * fork()) or going (e.g., munmap()). 
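+ *
+ * As an illustrative sketch only (not lifted from any real caller), code
+ * holding one extra reference obtained via folio_try_get() could check for
+ * unexpected references like this:
+ *
+ *	if (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)
+ *		return false;	/* e.g. a temporary GUP pin; skip this folio */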
+ * + * Calling this function without the folio lock will also not result in a + * stable result: for example, the folio might get dropped from the swapcache + * concurrently. + * + * However, even when called without the folio lock or on a mapped folio, + * this function can be used to detect unexpected references early (for example, + * if it makes sense to even lock the folio and unmap it). + * + * The caller must add any reference (e.g., from folio_try_get()) it might be + * holding itself to the result. + * + * Returns the expected folio refcount. + */ +static inline int folio_expected_ref_count(const struct folio *folio) +{ + const int order = folio_order(folio); + int ref_count = 0; + + if (WARN_ON_ONCE(folio_test_slab(folio))) + return 0; + + if (folio_test_anon(folio)) { + /* One reference per page from the swapcache. */ + ref_count += folio_test_swapcache(folio) << order; + } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) { + /* One reference per page from the pagecache. */ + ref_count += !!folio->mapping << order; + /* One reference from PG_private. */ + ref_count += folio_test_private(folio); + } + + /* One reference per page table mapping. */ + return ref_count + folio_mapcount(folio); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE @@ -2406,7 +2267,6 @@ static inline void clear_page_pfmemalloc(struct page *page) extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) -#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* @@ -2767,7 +2627,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); - if ((mm)->hiwater_rss < _rss) + if (data_race(mm->hiwater_rss) < _rss) (mm)->hiwater_rss = _rss; } @@ -3117,9 +2977,10 @@ static inline void pagetable_dtor_free(struct ptdesc *ptdesc) pagetable_free(ptdesc); } -static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +static inline bool pagetable_pte_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) { - if (!ptlock_init(ptdesc)) + if (mm != &init_mm && !ptlock_init(ptdesc)) return false; __pagetable_ctor(ptdesc); return true; @@ -3223,9 +3084,10 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) +static inline bool pagetable_pmd_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) { - if (!pmd_ptlock_init(ptdesc)) + if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); @@ -3414,7 +3276,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, bool write); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f9157a0c42a5..89b518ff097e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -447,6 +447,8 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, #endif /* CONFIG_ANON_VMA_NAME */ +void pfnmap_track_ctx_release(struct kref *ref); + static inline void init_tlb_flush_pending(struct mm_struct *mm) { 
atomic_set(&mm->tlb_flush_pending, 0); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 32ba5126e221..d3cffd8828c9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -28,7 +28,6 @@ #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) -#define INIT_PASID 0 struct address_space; struct futex_private_hash; @@ -765,6 +764,38 @@ struct vma_numab_state { int prev_scan_seq; }; +#ifdef __HAVE_PFNMAP_TRACKING +struct pfnmap_track_ctx { + struct kref kref; + unsigned long pfn; + unsigned long size; /* in bytes */ +}; +#endif + +/* + * Describes a VMA that is about to be mmap()'ed. Drivers may choose to + * manipulate mutable fields which will cause those fields to be updated in the + * resultant VMA. + * + * Helper functions are not required for manipulating any field. + */ +struct vm_area_desc { + /* Immutable state. */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + + /* Mutable fields. Populated with initial state. */ + pgoff_t pgoff; + struct file *file; + vm_flags_t vm_flags; + pgprot_t page_prot; + + /* Write-only fields. */ + const struct vm_operations_struct *vm_ops; + void *private_data; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -878,6 +909,9 @@ struct vm_area_struct { struct anon_vma_name *anon_name; #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef __HAVE_PFNMAP_TRACKING + struct pfnmap_track_ctx *pfnmap_track_ctx; +#endif } __randomize_layout; #ifdef CONFIG_NUMA diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index e0eddfd306ef..5da384bd0a26 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,6 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H +/* Avoid a dependency loop by declaring here. */ +extern int rcuwait_wake_up(struct rcuwait *w); + #include <linux/lockdep.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> @@ -105,6 +109,206 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return read_seqcount_retry(&mm->mm_lock_seq, seq); } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); + vma->vm_lock_seq = UINT_MAX; +} + +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. + */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + +/* + * Try to read-lock a vma. The function is allowed to occasionally yield false + * locked result to avoid performance overhead, in which case we fall back to + * using mmap_lock. 
The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. + */ +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + int oldcnt; + + /* + * Check before locking. A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) + return NULL; + + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. + * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq + * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). + */ + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); + return NULL; + } + + return vma; +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + int oldcnt; + + mmap_assert_locked(vma->vm_mm); + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + return true; +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked(struct vm_area_struct *vma) +{ + return vma_start_read_locked_nested(vma, 0); +} + +static inline void vma_end_read(struct vm_area_struct *vma) +{ + vma_refcount_put(vma); +} + +/* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +{ + mmap_assert_write_locked(vma->vm_mm); + + /* + * current task is holding mmap_write_lock, both vma->vm_lock_seq and + * mm->mm_lock_seq can't be concurrently modified. + */ + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; + return (vma->vm_lock_seq == *mm_lock_seq); +} + +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + +/* + * Begin writing to a VMA. + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + */ +static inline void vma_start_write(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) + return; + + __vma_start_write(vma, mm_lock_seq); +} + +static inline void vma_assert_write_locked(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); +} + +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma, &mm_lock_seq), vma); +} + +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ +static inline void vma_assert_attached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); +} + +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); +} + +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set_release(&vma->vm_refcnt, 1); +} + +void vma_mark_detached(struct vm_area_struct *vma); + +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address); + #else /* CONFIG_PER_VMA_LOCK */ static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} @@ -120,6 +324,29 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int { return true; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) + { return NULL; } +static inline void vma_end_read(struct vm_area_struct *vma) {} +static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline void vma_assert_write_locked(struct vm_area_struct *vma) + { mmap_assert_write_locked(vma->vm_mm); } +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} + +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} + +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); +} #endif /* CONFIG_PER_VMA_LOCK */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b1c459f7a485..28066b4ced81 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2074,11 +2074,37 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned 
long pfn) return usage ? test_bit(idx, usage->subsection_map) : 0; } + +static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) +{ + struct mem_section_usage *usage = READ_ONCE(ms->usage); + int idx = subsection_map_index(*pfn); + unsigned long bit; + + if (!usage) + return false; + + if (test_bit(idx, usage->subsection_map)) + return true; + + /* Find the next subsection that exists */ + bit = find_next_bit(usage->subsection_map, SUBSECTIONS_PER_SECTION, idx); + if (bit == SUBSECTIONS_PER_SECTION) + return false; + + *pfn = (*pfn & PAGE_SECTION_MASK) + (bit * PAGES_PER_SUBSECTION); + return true; +} #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } + +static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) +{ + return true; +} #endif void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, @@ -2127,6 +2153,58 @@ static inline int pfn_valid(unsigned long pfn) return ret; } + +/* Returns end_pfn or higher if no valid PFN remaining in range */ +static inline unsigned long first_valid_pfn(unsigned long pfn, unsigned long end_pfn) +{ + unsigned long nr = pfn_to_section_nr(pfn); + + rcu_read_lock_sched(); + + while (nr <= __highest_present_section_nr && pfn < end_pfn) { + struct mem_section *ms = __pfn_to_section(pfn); + + if (valid_section(ms) && + (early_section(ms) || pfn_section_first_valid(ms, &pfn))) { + rcu_read_unlock_sched(); + return pfn; + } + + /* Nothing left in this section? Skip to next section */ + nr++; + pfn = section_nr_to_pfn(nr); + } + + rcu_read_unlock_sched(); + return end_pfn; +} + +static inline unsigned long next_valid_pfn(unsigned long pfn, unsigned long end_pfn) +{ + pfn++; + + if (pfn >= end_pfn) + return end_pfn; + + /* + * Either every PFN within the section (or subsection for VMEMMAP) is + * valid, or none of them are. So there's no point repeating the check + * for every PFN; only call first_valid_pfn() again when crossing a + * (sub)section boundary (i.e. !(pfn & ~PAGE_{SUB,}SECTION_MASK)). + */ + if (pfn & ~(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) ? + PAGE_SUBSECTION_MASK : PAGE_SECTION_MASK)) + return pfn; + + return first_valid_pfn(pfn, end_pfn); +} + + +#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ + for ((_pfn) = first_valid_pfn((_start_pfn), (_end_pfn)); \ + (_pfn) < (_end_pfn); \ + (_pfn) = next_valid_pfn((_pfn), (_end_pfn))) + #endif static inline int pfn_in_present_section(unsigned long pfn) @@ -2176,6 +2254,16 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ +/* + * Fallback case for when the architecture provides its own pfn_valid() but + * not a corresponding for_each_valid_pfn(). 
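+ *
+ * Usage is the same with either definition; a hypothetical walker
+ * (scan_page() is an invented name, purely for illustration) would be:
+ *
+ *	unsigned long pfn;
+ *
+ *	for_each_valid_pfn(pfn, start_pfn, end_pfn)
+ *		scan_page(pfn_to_page(pfn));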
+ */ +#ifndef for_each_valid_pfn +#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ + for ((_pfn) = (_start_pfn); (_pfn) < (_end_pfn); (_pfn)++) \ + if (pfn_valid(_pfn)) +#endif + #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 6904ad33ee7a..d3ee0e5162f0 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -101,9 +101,6 @@ extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); -extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, - struct file_system_type *type, - const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index dd85613cdd86..991076cba7c5 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -22,6 +22,7 @@ struct numa_meminfo { }; int __init numa_add_memblk(int nodeid, u64 start, u64 end); +int __init numa_add_reserved_memblk(int nid, u64 start, u64 end); void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h index 6f9242259edc..6de479ebbe5d 100644 --- a/include/linux/oid_registry.h +++ b/include/linux/oid_registry.h @@ -151,6 +151,5 @@ enum OID { extern enum OID look_up_OID(const void *data, size_t datasize); extern int parse_OID(const void *data, size_t datasize, enum OID *oid); extern int sprint_oid(const void *, size_t, char *, size_t); -extern int sprint_OID(enum OID, char *, size_t); #endif /* _LINUX_OID_REGISTRY_H */ diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 4f5c9e979bb9..760006b1c480 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -72,8 +72,10 @@ #define NODE_NOT_IN_PAGE_FLAGS 1 #endif -#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +#if defined(CONFIG_KASAN_SW_TAGS) #define KASAN_TAG_WIDTH 8 +#elif defined(CONFIG_KASAN_HW_TAGS) +#define KASAN_TAG_WIDTH 4 #else #define KASAN_TAG_WIDTH 0 #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3b814ce08331..4fe5ee67535b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -915,20 +915,6 @@ FOLIO_FLAG_FALSE(partially_mapped) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * PageHuge() only returns true for hugetlbfs pages, but not for - * normal or transparent huge pages. - * - * PageTransHuge() returns true for both transparent huge and - * hugetlbfs pages, but not normal pages. PageTransHuge() can only be - * called only in the core VM paths where hugetlbfs pages can't exist. - */ -static inline int PageTransHuge(const struct page *page) -{ - VM_BUG_ON_PAGE(PageTail(page), page); - return PageHead(page); -} - -/* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. 
@@ -938,7 +924,6 @@ static inline int PageTransCompound(const struct page *page) return PageCompound(page); } #else -TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) #endif @@ -989,7 +974,7 @@ static inline bool page_mapcount_is_type(unsigned int mapcount) static inline bool page_has_type(const struct page *page) { - return page_mapcount_is_type(data_race(page->page_type)); + return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ @@ -1237,10 +1222,6 @@ static inline int folio_has_private(const struct folio *folio) return !!(folio->flags & PAGE_FLAGS_PRIVATE); } -static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio) -{ - return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); -} #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26baa78f1ca7..d2ced9920992 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -533,7 +533,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) } struct address_space *folio_mapping(struct folio *); -struct address_space *swapcache_mapping(struct folio *); /** * folio_flush_mapping - Find the file mapping this folio belongs to. @@ -884,26 +883,6 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -extern pgoff_t __folio_swap_cache_index(struct folio *folio); - -/** - * folio_index - File index of a folio. - * @folio: The folio. - * - * For a folio which is either in the page cache or the swap cache, - * return its index within the address_space it belongs to. If you know - * the page is definitely in the page cache, you can look at the folio's - * index directly. - * - * Return: The index (offset in units of pages) of a folio in its file. - */ -static inline pgoff_t folio_index(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return __folio_swap_cache_index(folio); - return folio->index; -} - /** * folio_next_index - Get the index of the next folio. * @folio: The current folio. @@ -935,27 +914,14 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) * @folio: The folio. * @index: The page index within the file. * - * Context: The caller should have the page locked in order to prevent - * (eg) shmem from moving the page between the page cache and swap cache - * and changing its index in the middle of the operation. + * Context: The caller should have the folio locked and ensure + * e.g., shmem did not move this folio to the swap cache. * Return: true or false. */ static inline bool folio_contains(struct folio *folio, pgoff_t index) { - return index - folio_index(folio) < folio_nr_pages(folio); -} - -/* - * Given the page we found in the page cache, return the page corresponding - * to this index in the file - */ -static inline struct page *find_subpage(struct page *head, pgoff_t index) -{ - /* HugeTLBfs wants the head page regardless */ - if (PageHuge(head)) - return head; - - return head + (index & (thp_nr_pages(head) - 1)); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + return index - folio->index < folio_nr_pages(folio); } unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, @@ -1308,9 +1274,9 @@ static inline bool filemap_range_needs_writeback(struct address_space *mapping, * struct readahead_control - Describes a readahead request. * * A readahead request is for consecutive pages. 
Filesystems which - * implement the ->readahead method should call readahead_page() or - * readahead_page_batch() in a loop and attempt to start I/O against - * each page in the request. + * implement the ->readahead method should call readahead_folio() or + * __readahead_batch() in a loop and attempt to start reads into each + * folio in the request. * * Most of the fields in this struct are private and should be accessed * by the functions below. @@ -1416,22 +1382,6 @@ static inline struct folio *__readahead_folio(struct readahead_control *ractl) } /** - * readahead_page - Get the next page to read. - * @ractl: The current readahead request. - * - * Context: The page is locked and has an elevated refcount. The caller - * should decreases the refcount once the page has been submitted for I/O - * and unlock the page once all I/O to that page has completed. - * Return: A pointer to the next page, or %NULL if we are done. - */ -static inline struct page *readahead_page(struct readahead_control *ractl) -{ - struct folio *folio = __readahead_folio(ractl); - - return &folio->page; -} - -/** * readahead_folio - Get the next folio to read. * @ractl: The current readahead request. * @@ -1453,7 +1403,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); - struct page *page; + struct folio *folio; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; @@ -1462,13 +1412,12 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, xas_set(&xas, rac->_index); rcu_read_lock(); - xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) { + if (xas_retry(&xas, folio)) continue; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - array[i++] = page; - rac->_batch_count += thp_nr_pages(page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + array[i++] = folio_page(folio, 0); + rac->_batch_count += folio_nr_pages(folio); if (i == array_sz) break; } @@ -1478,20 +1427,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, } /** - * readahead_page_batch - Get a batch of pages to read. - * @rac: The current readahead request. - * @array: An array of pointers to struct page. - * - * Context: The pages are locked and have an elevated refcount. The caller - * should decreases the refcount once the page has been submitted for I/O - * and unlock the page once all I/O to that page has completed. - * Return: The number of pages placed in the array. 0 indicates the request - * is complete. - */ -#define readahead_page_batch(rac, array) \ - __readahead_batch(rac, array, ARRAY_SIZE(array)) - -/** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. 
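 *
 * As a sketch (my_fs_start_read() is an invented helper), a ->readahead
 * implementation might note the starting offset and then submit reads
 * folio by folio:
 *
 *	loff_t pos = readahead_pos(rac);
 *	struct folio *folio;
 *
 *	while ((folio = readahead_folio(rac)) != NULL) {
 *		my_fs_start_read(folio, pos);
 *		pos += folio_size(folio);
 *	}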
*/ diff --git a/include/linux/pe.h b/include/linux/pe.h index fdf9c95709ba..cd2b7275385f 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -39,113 +39,160 @@ */ #define LINUX_PE_MAGIC 0x818223cd -#define MZ_MAGIC 0x5a4d /* "MZ" */ +#define IMAGE_DOS_SIGNATURE 0x5a4d /* "MZ" */ -#define PE_MAGIC 0x00004550 /* "PE\0\0" */ -#define PE_OPT_MAGIC_PE32 0x010b -#define PE_OPT_MAGIC_PE32_ROM 0x0107 -#define PE_OPT_MAGIC_PE32PLUS 0x020b +#define IMAGE_NT_SIGNATURE 0x00004550 /* "PE\0\0" */ + +#define IMAGE_ROM_OPTIONAL_HDR_MAGIC 0x0107 /* ROM image (for R3000/R4000/R10000/ALPHA), without MZ and PE\0\0 sign */ +#define IMAGE_NT_OPTIONAL_HDR32_MAGIC 0x010b /* PE32 executable image */ +#define IMAGE_NT_OPTIONAL_HDR64_MAGIC 0x020b /* PE32+ executable image */ /* machine type */ -#define IMAGE_FILE_MACHINE_UNKNOWN 0x0000 -#define IMAGE_FILE_MACHINE_AM33 0x01d3 -#define IMAGE_FILE_MACHINE_AMD64 0x8664 -#define IMAGE_FILE_MACHINE_ARM 0x01c0 -#define IMAGE_FILE_MACHINE_ARMV7 0x01c4 -#define IMAGE_FILE_MACHINE_ARM64 0xaa64 -#define IMAGE_FILE_MACHINE_EBC 0x0ebc -#define IMAGE_FILE_MACHINE_I386 0x014c -#define IMAGE_FILE_MACHINE_IA64 0x0200 -#define IMAGE_FILE_MACHINE_M32R 0x9041 -#define IMAGE_FILE_MACHINE_MIPS16 0x0266 -#define IMAGE_FILE_MACHINE_MIPSFPU 0x0366 -#define IMAGE_FILE_MACHINE_MIPSFPU16 0x0466 -#define IMAGE_FILE_MACHINE_POWERPC 0x01f0 -#define IMAGE_FILE_MACHINE_POWERPCFP 0x01f1 -#define IMAGE_FILE_MACHINE_R4000 0x0166 -#define IMAGE_FILE_MACHINE_RISCV32 0x5032 -#define IMAGE_FILE_MACHINE_RISCV64 0x5064 -#define IMAGE_FILE_MACHINE_RISCV128 0x5128 -#define IMAGE_FILE_MACHINE_SH3 0x01a2 -#define IMAGE_FILE_MACHINE_SH3DSP 0x01a3 -#define IMAGE_FILE_MACHINE_SH3E 0x01a4 -#define IMAGE_FILE_MACHINE_SH4 0x01a6 -#define IMAGE_FILE_MACHINE_SH5 0x01a8 -#define IMAGE_FILE_MACHINE_THUMB 0x01c2 -#define IMAGE_FILE_MACHINE_WCEMIPSV2 0x0169 -#define IMAGE_FILE_MACHINE_LOONGARCH32 0x6232 -#define IMAGE_FILE_MACHINE_LOONGARCH64 0x6264 +#define IMAGE_FILE_MACHINE_UNKNOWN 0x0000 /* Unknown architecture */ +#define IMAGE_FILE_MACHINE_TARGET_HOST 0x0001 /* Interacts with the host and not a WOW64 guest (not for file image) */ +#define IMAGE_FILE_MACHINE_ALPHA_OLD 0x0183 /* DEC Alpha AXP 32-bit (old images) */ +#define IMAGE_FILE_MACHINE_ALPHA 0x0184 /* DEC Alpha AXP 32-bit */ +#define IMAGE_FILE_MACHINE_ALPHA64 0x0284 /* DEC Alpha AXP 64-bit (with 8kB page size) */ +#define IMAGE_FILE_MACHINE_AXP64 IMAGE_FILE_MACHINE_ALPHA64 +#define IMAGE_FILE_MACHINE_AM33 0x01d3 /* Matsushita AM33, now Panasonic MN103 */ +#define IMAGE_FILE_MACHINE_AMD64 0x8664 /* AMD64 (x64) */ +#define IMAGE_FILE_MACHINE_ARM 0x01c0 /* ARM Little-Endian (ARMv4) */ +#define IMAGE_FILE_MACHINE_THUMB 0x01c2 /* ARM Thumb Little-Endian (ARMv4T) */ +#define IMAGE_FILE_MACHINE_ARMNT 0x01c4 /* ARM Thumb-2 Little-Endian (ARMv7) */ +#define IMAGE_FILE_MACHINE_ARMV7 IMAGE_FILE_MACHINE_ARMNT +#define IMAGE_FILE_MACHINE_ARM64 0xaa64 /* ARM64 Little-Endian (Classic ABI) */ +#define IMAGE_FILE_MACHINE_ARM64EC 0xa641 /* ARM64 Little-Endian (Emulation Compatible ABI for AMD64) */ +#define IMAGE_FILE_MACHINE_ARM64X 0xa64e /* ARM64 Little-Endian (fat binary with both Classic ABI and EC ABI code) */ +#define IMAGE_FILE_MACHINE_CEE 0xc0ee /* COM+ Execution Engine (CLR pure MSIL object files) */ +#define IMAGE_FILE_MACHINE_CEF 0x0cef /* Windows CE 3.0 Common Executable Format (CEF bytecode) */ +#define IMAGE_FILE_MACHINE_CHPE_X86 0x3a64 /* ARM64 Little-Endian (Compiled Hybrid PE ABI for I386) */ +#define IMAGE_FILE_MACHINE_HYBRID_X86 
IMAGE_FILE_MACHINE_CHPE_X86 +#define IMAGE_FILE_MACHINE_EBC 0x0ebc /* EFI/UEFI Byte Code */ +#define IMAGE_FILE_MACHINE_I386 0x014c /* Intel 386 (x86) */ +#define IMAGE_FILE_MACHINE_I860 0x014d /* Intel 860 (N10) */ +#define IMAGE_FILE_MACHINE_IA64 0x0200 /* Intel IA-64 (with 8kB page size) */ +#define IMAGE_FILE_MACHINE_LOONGARCH32 0x6232 /* LoongArch 32-bit processor family */ +#define IMAGE_FILE_MACHINE_LOONGARCH64 0x6264 /* LoongArch 64-bit processor family */ +#define IMAGE_FILE_MACHINE_M32R 0x9041 /* Mitsubishi M32R 32-bit Little-Endian */ +#define IMAGE_FILE_MACHINE_M68K 0x0268 /* Motorola 68000 series */ +#define IMAGE_FILE_MACHINE_MIPS16 0x0266 /* MIPS III with MIPS16 ASE Little-Endian */ +#define IMAGE_FILE_MACHINE_MIPSFPU 0x0366 /* MIPS III with FPU Little-Endian */ +#define IMAGE_FILE_MACHINE_MIPSFPU16 0x0466 /* MIPS III with MIPS16 ASE and FPU Little-Endian */ +#define IMAGE_FILE_MACHINE_MPPC_601 0x0601 /* PowerPC 32-bit Big-Endian */ +#define IMAGE_FILE_MACHINE_OMNI 0xace1 /* Microsoft OMNI VM (omniprox.dll) */ +#define IMAGE_FILE_MACHINE_PARISC 0x0290 /* HP PA-RISC */ +#define IMAGE_FILE_MACHINE_POWERPC 0x01f0 /* PowerPC 32-bit Little-Endian */ +#define IMAGE_FILE_MACHINE_POWERPCFP 0x01f1 /* PowerPC 32-bit with FPU Little-Endian */ +#define IMAGE_FILE_MACHINE_POWERPCBE 0x01f2 /* PowerPC 64-bit Big-Endian */ +#define IMAGE_FILE_MACHINE_R3000 0x0162 /* MIPS I Little-Endian */ +#define IMAGE_FILE_MACHINE_R3000_BE 0x0160 /* MIPS I Big-Endian */ +#define IMAGE_FILE_MACHINE_R4000 0x0166 /* MIPS III Little-Endian (with 1kB or 4kB page size) */ +#define IMAGE_FILE_MACHINE_R10000 0x0168 /* MIPS IV Little-Endian */ +#define IMAGE_FILE_MACHINE_RISCV32 0x5032 /* RISC-V 32-bit address space */ +#define IMAGE_FILE_MACHINE_RISCV64 0x5064 /* RISC-V 64-bit address space */ +#define IMAGE_FILE_MACHINE_RISCV128 0x5128 /* RISC-V 128-bit address space */ +#define IMAGE_FILE_MACHINE_SH3 0x01a2 /* Hitachi SH-3 32-bit Little-Endian (with 1kB page size) */ +#define IMAGE_FILE_MACHINE_SH3DSP 0x01a3 /* Hitachi SH-3 DSP 32-bit (with 1kB page size) */ +#define IMAGE_FILE_MACHINE_SH3E 0x01a4 /* Hitachi SH-3E Little-Endian (with 1kB page size) */ +#define IMAGE_FILE_MACHINE_SH4 0x01a6 /* Hitachi SH-4 32-bit Little-Endian (with 1kB page size) */ +#define IMAGE_FILE_MACHINE_SH5 0x01a8 /* Hitachi SH-5 64-bit */ +#define IMAGE_FILE_MACHINE_TAHOE 0x07cc /* Intel EM machine */ +#define IMAGE_FILE_MACHINE_TRICORE 0x0520 /* Infineon AUDO 32-bit */ +#define IMAGE_FILE_MACHINE_WCEMIPSV2 0x0169 /* MIPS Windows CE v2 Little-Endian */ /* flags */ -#define IMAGE_FILE_RELOCS_STRIPPED 0x0001 -#define IMAGE_FILE_EXECUTABLE_IMAGE 0x0002 -#define IMAGE_FILE_LINE_NUMS_STRIPPED 0x0004 -#define IMAGE_FILE_LOCAL_SYMS_STRIPPED 0x0008 -#define IMAGE_FILE_AGGRESSIVE_WS_TRIM 0x0010 -#define IMAGE_FILE_LARGE_ADDRESS_AWARE 0x0020 -#define IMAGE_FILE_16BIT_MACHINE 0x0040 -#define IMAGE_FILE_BYTES_REVERSED_LO 0x0080 -#define IMAGE_FILE_32BIT_MACHINE 0x0100 -#define IMAGE_FILE_DEBUG_STRIPPED 0x0200 -#define IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP 0x0400 -#define IMAGE_FILE_NET_RUN_FROM_SWAP 0x0800 -#define IMAGE_FILE_SYSTEM 0x1000 -#define IMAGE_FILE_DLL 0x2000 -#define IMAGE_FILE_UP_SYSTEM_ONLY 0x4000 -#define IMAGE_FILE_BYTES_REVERSED_HI 0x8000 - -#define IMAGE_FILE_OPT_ROM_MAGIC 0x107 -#define IMAGE_FILE_OPT_PE32_MAGIC 0x10b -#define IMAGE_FILE_OPT_PE32_PLUS_MAGIC 0x20b - -#define IMAGE_SUBSYSTEM_UNKNOWN 0 -#define IMAGE_SUBSYSTEM_NATIVE 1 -#define IMAGE_SUBSYSTEM_WINDOWS_GUI 2 -#define IMAGE_SUBSYSTEM_WINDOWS_CUI 3 -#define 
IMAGE_SUBSYSTEM_POSIX_CUI 7 -#define IMAGE_SUBSYSTEM_WINDOWS_CE_GUI 9 -#define IMAGE_SUBSYSTEM_EFI_APPLICATION 10 -#define IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER 11 -#define IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER 12 -#define IMAGE_SUBSYSTEM_EFI_ROM_IMAGE 13 -#define IMAGE_SUBSYSTEM_XBOX 14 - -#define IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE 0x0040 -#define IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY 0x0080 -#define IMAGE_DLL_CHARACTERISTICS_NX_COMPAT 0x0100 -#define IMAGE_DLLCHARACTERISTICS_NO_ISOLATION 0x0200 -#define IMAGE_DLLCHARACTERISTICS_NO_SEH 0x0400 -#define IMAGE_DLLCHARACTERISTICS_NO_BIND 0x0800 -#define IMAGE_DLLCHARACTERISTICS_WDM_DRIVER 0x2000 -#define IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE 0x8000 - -#define IMAGE_DLLCHARACTERISTICS_EX_CET_COMPAT 0x0001 -#define IMAGE_DLLCHARACTERISTICS_EX_FORWARD_CFI_COMPAT 0x0040 - -/* they actually defined 0x00000000 as well, but I think we'll skip that one. */ -#define IMAGE_SCN_RESERVED_0 0x00000001 -#define IMAGE_SCN_RESERVED_1 0x00000002 -#define IMAGE_SCN_RESERVED_2 0x00000004 -#define IMAGE_SCN_TYPE_NO_PAD 0x00000008 /* don't pad - obsolete */ -#define IMAGE_SCN_RESERVED_3 0x00000010 +#define IMAGE_FILE_RELOCS_STRIPPED 0x0001 /* Relocation info stripped from file */ +#define IMAGE_FILE_EXECUTABLE_IMAGE 0x0002 /* File is executable (i.e. no unresolved external references) */ +#define IMAGE_FILE_LINE_NUMS_STRIPPED 0x0004 /* Line numbers stripped from file */ +#define IMAGE_FILE_LOCAL_SYMS_STRIPPED 0x0008 /* Local symbols stripped from file */ +#define IMAGE_FILE_AGGRESSIVE_WS_TRIM 0x0010 /* Aggressively trim working set */ +#define IMAGE_FILE_LARGE_ADDRESS_AWARE 0x0020 /* App can handle >2GB addresses (image can be loaded at address above 2GB) */ +#define IMAGE_FILE_16BIT_MACHINE 0x0040 /* 16 bit word machine */ +#define IMAGE_FILE_BYTES_REVERSED_LO 0x0080 /* Bytes of machine word are reversed (should be set together with IMAGE_FILE_BYTES_REVERSED_HI) */ +#define IMAGE_FILE_32BIT_MACHINE 0x0100 /* 32 bit word machine */ +#define IMAGE_FILE_DEBUG_STRIPPED 0x0200 /* Debugging info stripped from file in .DBG file */ +#define IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP 0x0400 /* If Image is on removable media, copy and run from the swap file */ +#define IMAGE_FILE_NET_RUN_FROM_SWAP 0x0800 /* If Image is on Net, copy and run from the swap file */ +#define IMAGE_FILE_SYSTEM 0x1000 /* System kernel-mode file (can't be loaded in user-mode) */ +#define IMAGE_FILE_DLL 0x2000 /* File is a DLL */ +#define IMAGE_FILE_UP_SYSTEM_ONLY 0x4000 /* File should only be run on a UP (uniprocessor) machine */ +#define IMAGE_FILE_BYTES_REVERSED_HI 0x8000 /* Bytes of machine word are reversed (should be set together with IMAGE_FILE_BYTES_REVERSED_LO) */ + +/* subsys */ +#define IMAGE_SUBSYSTEM_UNKNOWN 0 /* Unknown subsystem */ +#define IMAGE_SUBSYSTEM_NATIVE 1 /* No subsystem required (NT device drivers and NT native system processes) */ +#define IMAGE_SUBSYSTEM_WINDOWS_GUI 2 /* Windows graphical user interface (GUI) subsystem */ +#define IMAGE_SUBSYSTEM_WINDOWS_CUI 3 /* Windows character-mode user interface (CUI) subsystem */ +#define IMAGE_SUBSYSTEM_WINDOWS_OLD_CE_GUI 4 /* Old Windows CE subsystem */ +#define IMAGE_SUBSYSTEM_OS2_CUI 5 /* OS/2 CUI subsystem */ +#define IMAGE_SUBSYSTEM_RESERVED_6 6 +#define IMAGE_SUBSYSTEM_POSIX_CUI 7 /* POSIX CUI subsystem */ +#define IMAGE_SUBSYSTEM_MMOSA 8 /* MMOSA/Native Win32E */ +#define IMAGE_SUBSYSTEM_WINDOWS_CE_GUI 9 /* Windows CE subsystem */ +#define IMAGE_SUBSYSTEM_EFI_APPLICATION 10 /* Extensible Firmware Interface (EFI) application */
+#define IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER 11 /* EFI driver with boot services */ +#define IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER 12 /* EFI driver with run-time services */ +#define IMAGE_SUBSYSTEM_EFI_ROM_IMAGE 13 /* EFI ROM image */ +#define IMAGE_SUBSYSTEM_XBOX 14 /* Xbox system */ +#define IMAGE_SUBSYSTEM_RESERVED_15 15 +#define IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION 16 /* Windows Boot application */ +#define IMAGE_SUBSYSTEM_XBOX_CODE_CATALOG 17 /* Xbox Code Catalog */ + +/* dll_flags */ +#define IMAGE_LIBRARY_PROCESS_INIT 0x0001 /* DLL initialization function called just after process initialization */ +#define IMAGE_LIBRARY_PROCESS_TERM 0x0002 /* DLL initialization function called just before process termination */ +#define IMAGE_LIBRARY_THREAD_INIT 0x0004 /* DLL initialization function called just after thread initialization */ +#define IMAGE_LIBRARY_THREAD_TERM 0x0008 /* DLL initialization function called just before thread termination */ +#define IMAGE_DLLCHARACTERISTICS_RESERVED_4 0x0010 +#define IMAGE_DLLCHARACTERISTICS_HIGH_ENTROPY_VA 0x0020 /* ASLR with 64 bit address space (image can be loaded at address above 4GB) */ +#define IMAGE_DLLCHARACTERISTICS_DYNAMIC_BASE 0x0040 /* The DLL can be relocated at load time */ +#define IMAGE_DLLCHARACTERISTICS_FORCE_INTEGRITY 0x0080 /* Code integrity checks are forced */ +#define IMAGE_DLLCHARACTERISTICS_NX_COMPAT 0x0100 /* Image is compatible with data execution prevention */ +#define IMAGE_DLLCHARACTERISTICS_NO_ISOLATION 0x0200 /* Image is isolation aware, but should not be isolated (prevents loading of manifest file) */ +#define IMAGE_DLLCHARACTERISTICS_NO_SEH 0x0400 /* Image does not use SEH, no SE handler may reside in this image */ +#define IMAGE_DLLCHARACTERISTICS_NO_BIND 0x0800 /* Do not bind the image */ +#define IMAGE_DLLCHARACTERISTICS_X86_THUNK 0x1000 /* Image is a Wx86 Thunk DLL (for non-x86/risc DLL files) */ +#define IMAGE_DLLCHARACTERISTICS_APPCONTAINER 0x1000 /* Image should execute in an AppContainer (for EXE Metro Apps in Windows 8) */ +#define IMAGE_DLLCHARACTERISTICS_WDM_DRIVER 0x2000 /* A WDM driver */ +#define IMAGE_DLLCHARACTERISTICS_GUARD_CF 0x4000 /* Image supports Control Flow Guard */ +#define IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE 0x8000 /* The image is terminal server (Remote Desktop Services) aware */ + +/* IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS flags */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_COMPAT 0x0001 /* Image is Control-flow Enforcement Technology Shadow Stack compatible */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_COMPAT_STRICT_MODE 0x0002 /* CET is enforced in strict mode */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_SET_CONTEXT_IP_VALIDATION_RELAXED_MODE 0x0004 /* Relaxed mode for Context IP Validation under CET is allowed */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_DYNAMIC_APIS_ALLOW_IN_PROC 0x0008 /* Use of dynamic APIs is restricted to processes only */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_RESERVED_1 0x0010 +#define IMAGE_DLLCHARACTERISTICS_EX_CET_RESERVED_2 0x0020 +#define IMAGE_DLLCHARACTERISTICS_EX_FORWARD_CFI_COMPAT 0x0040 /* All branch targets in all image code sections are annotated with forward-edge control flow integrity guard instructions */ +#define IMAGE_DLLCHARACTERISTICS_EX_HOTPATCH_COMPATIBLE 0x0080 /* Image can be modified while in use, hotpatch-compatible */ + +/* section_header flags */ +#define IMAGE_SCN_SCALE_INDEX 0x00000001 /* address of tls index is scaled = multiplied by 4 (for .tls section on MIPS only) */ +#define IMAGE_SCN_TYPE_NO_LOAD
0x00000002 /* reserved */ +#define IMAGE_SCN_TYPE_GROUPED 0x00000004 /* obsolete (used for 16-bit offset code) */ +#define IMAGE_SCN_TYPE_NO_PAD 0x00000008 /* .o only - don't pad - obsolete (same as IMAGE_SCN_ALIGN_1BYTES) */ +#define IMAGE_SCN_TYPE_COPY 0x00000010 /* reserved */ #define IMAGE_SCN_CNT_CODE 0x00000020 /* .text */ #define IMAGE_SCN_CNT_INITIALIZED_DATA 0x00000040 /* .data */ #define IMAGE_SCN_CNT_UNINITIALIZED_DATA 0x00000080 /* .bss */ -#define IMAGE_SCN_LNK_OTHER 0x00000100 /* reserved */ -#define IMAGE_SCN_LNK_INFO 0x00000200 /* .drectve comments */ -#define IMAGE_SCN_RESERVED_4 0x00000400 +#define IMAGE_SCN_LNK_OTHER 0x00000100 /* .o only - other type than code, data or info */ +#define IMAGE_SCN_LNK_INFO 0x00000200 /* .o only - .drectve comments */ +#define IMAGE_SCN_LNK_OVERLAY 0x00000400 /* section contains overlay */ #define IMAGE_SCN_LNK_REMOVE 0x00000800 /* .o only - scn to be rm'd*/ #define IMAGE_SCN_LNK_COMDAT 0x00001000 /* .o only - COMDAT data */ -#define IMAGE_SCN_RESERVED_5 0x00002000 /* spec omits this */ -#define IMAGE_SCN_RESERVED_6 0x00004000 /* spec omits this */ -#define IMAGE_SCN_GPREL 0x00008000 /* global pointer referenced data */ -/* spec lists 0x20000 twice, I suspect they meant 0x10000 for one of them */ -#define IMAGE_SCN_MEM_PURGEABLE 0x00010000 /* reserved for "future" use */ -#define IMAGE_SCN_16BIT 0x00020000 /* reserved for "future" use */ -#define IMAGE_SCN_LOCKED 0x00040000 /* reserved for "future" use */ -#define IMAGE_SCN_PRELOAD 0x00080000 /* reserved for "future" use */ +#define IMAGE_SCN_RESERVED_13 0x00002000 /* spec omits this */ +#define IMAGE_SCN_MEM_PROTECTED 0x00004000 /* section is memory protected (for M68K) */ +#define IMAGE_SCN_NO_DEFER_SPEC_EXC 0x00004000 /* reset speculative exceptions handling bits in the TLB entries (for non-M68K) */ +#define IMAGE_SCN_MEM_FARDATA 0x00008000 /* section uses FAR_EXTERNAL relocations (for M68K) */ +#define IMAGE_SCN_GPREL 0x00008000 /* global pointer referenced data (for non-M68K) */ +#define IMAGE_SCN_MEM_SYSHEAP 0x00010000 /* use system heap (for M68K) */ +#define IMAGE_SCN_MEM_PURGEABLE 0x00020000 /* section can be released from RAM (for M68K) */ +#define IMAGE_SCN_MEM_16BIT 0x00020000 /* section is 16-bit (for non-M68K where it makes sense: I386, THUMB, MIPS16, MIPSFPU16, ...) 
*/ +#define IMAGE_SCN_MEM_LOCKED 0x00040000 /* prevent the section from being moved (for M68K and .o I386) */ +#define IMAGE_SCN_MEM_PRELOAD 0x00080000 /* section is preloaded to RAM (for M68K and .o I386) */ /* and here they just stuck a 1-byte integer in the middle of a bitfield */ -#define IMAGE_SCN_ALIGN_1BYTES 0x00100000 /* it does what it says on the box */ +#define IMAGE_SCN_ALIGN_1BYTES 0x00100000 /* .o only - it does what it says on the box */ #define IMAGE_SCN_ALIGN_2BYTES 0x00200000 #define IMAGE_SCN_ALIGN_4BYTES 0x00300000 #define IMAGE_SCN_ALIGN_8BYTES 0x00400000 @@ -159,7 +206,9 @@ #define IMAGE_SCN_ALIGN_2048BYTES 0x00c00000 #define IMAGE_SCN_ALIGN_4096BYTES 0x00d00000 #define IMAGE_SCN_ALIGN_8192BYTES 0x00e00000 -#define IMAGE_SCN_LNK_NRELOC_OVFL 0x01000000 /* extended relocations */ +#define IMAGE_SCN_ALIGN_RESERVED 0x00f00000 +#define IMAGE_SCN_ALIGN_MASK 0x00f00000 +#define IMAGE_SCN_LNK_NRELOC_OVFL 0x01000000 /* .o only - extended relocations */ #define IMAGE_SCN_MEM_DISCARDABLE 0x02000000 /* scn can be discarded */ #define IMAGE_SCN_MEM_NOT_CACHED 0x04000000 /* cannot be cached */ #define IMAGE_SCN_MEM_NOT_PAGED 0x08000000 /* not pageable */ @@ -168,8 +217,28 @@ #define IMAGE_SCN_MEM_READ 0x40000000 /* readable */ #define IMAGE_SCN_MEM_WRITE 0x80000000 /* writeable */ -#define IMAGE_DEBUG_TYPE_CODEVIEW 2 -#define IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS 20 +#define IMAGE_DEBUG_TYPE_UNKNOWN 0 /* Unknown value, ignored by all tools */ +#define IMAGE_DEBUG_TYPE_COFF 1 /* COFF debugging information */ +#define IMAGE_DEBUG_TYPE_CODEVIEW 2 /* CodeView debugging information or Visual C++ Program Database debugging information */ +#define IMAGE_DEBUG_TYPE_FPO 3 /* Frame pointer omission (FPO) information */ +#define IMAGE_DEBUG_TYPE_MISC 4 /* Location of DBG file with CodeView debugging information */ +#define IMAGE_DEBUG_TYPE_EXCEPTION 5 /* Exception information, copy of .pdata section */ +#define IMAGE_DEBUG_TYPE_FIXUP 6 /* Fixup information */ +#define IMAGE_DEBUG_TYPE_OMAP_TO_SRC 7 /* The mapping from an RVA in image to an RVA in source image */ +#define IMAGE_DEBUG_TYPE_OMAP_FROM_SRC 8 /* The mapping from an RVA in source image to an RVA in image */ +#define IMAGE_DEBUG_TYPE_BORLAND 9 /* Borland debugging information */ +#define IMAGE_DEBUG_TYPE_RESERVED10 10 /* Coldpath / Hotpatch debug information */ +#define IMAGE_DEBUG_TYPE_CLSID 11 /* CLSID */ +#define IMAGE_DEBUG_TYPE_VC_FEATURE 12 /* Visual C++ counts / statistics */ +#define IMAGE_DEBUG_TYPE_POGO 13 /* COFF group information, data for profile-guided optimization */ +#define IMAGE_DEBUG_TYPE_ILTCG 14 /* Incremental link-time code generation */ +#define IMAGE_DEBUG_TYPE_MPX 15 /* Intel Memory Protection Extensions */ +#define IMAGE_DEBUG_TYPE_REPRO 16 /* PE determinism or reproducibility */ +#define IMAGE_DEBUG_TYPE_EMBEDDED_PORTABLE_PDB 17 /* Embedded Portable PDB debugging information */ +#define IMAGE_DEBUG_TYPE_SPGO 18 /* Sample profile-guided optimization */ +#define IMAGE_DEBUG_TYPE_PDBCHECKSUM 19 /* PDB Checksum */ +#define IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS 20 /* Extended DLL characteristics bits */ +#define IMAGE_DEBUG_TYPE_PERFMAP 21 /* Location of associated Ready To Run PerfMap file */ #ifndef __ASSEMBLY__ @@ -235,7 +304,7 @@ struct pe32_opt_hdr { uint16_t image_minor; /* minor image version */ uint16_t subsys_major; /* major subsystem version */ uint16_t subsys_minor; /* minor subsystem version */ - uint32_t win32_version; /* reserved, must be 0 */ + uint32_t win32_version; /* win32 version reported at 
runtime */ uint32_t image_size; /* image size */ uint32_t header_size; /* header size rounded up to file_align */ @@ -246,7 +315,7 @@ struct pe32_opt_hdr { uint32_t stack_size; /* amt of stack required */ uint32_t heap_size_req; /* amt of heap requested */ uint32_t heap_size; /* amt of heap required */ - uint32_t loader_flags; /* reserved, must be 0 */ + uint32_t loader_flags; /* loader flags */ uint32_t data_dirs; /* number of data dir entries */ }; @@ -269,7 +338,7 @@ struct pe32plus_opt_hdr { uint16_t image_minor; /* minor image version */ uint16_t subsys_major; /* major subsystem version */ uint16_t subsys_minor; /* minor subsystem version */ - uint32_t win32_version; /* reserved, must be 0 */ + uint32_t win32_version; /* win32 version reported at runtime */ uint32_t image_size; /* image size */ uint32_t header_size; /* header size rounded up to file_align */ @@ -280,7 +349,7 @@ struct pe32plus_opt_hdr { uint64_t stack_size; /* amt of stack required */ uint64_t heap_size_req; /* amt of heap requested */ uint64_t heap_size; /* amt of heap required */ - uint32_t loader_flags; /* reserved, must be 0 */ + uint32_t loader_flags; /* loader flags */ uint32_t data_dirs; /* number of data dir entries */ }; @@ -301,10 +370,10 @@ struct data_directory { struct data_dirent global_ptr; /* global pointer reg. Size=0 */ struct data_dirent tls; /* .tls */ struct data_dirent load_config; /* load configuration structure */ - struct data_dirent bound_imports; /* no idea */ + struct data_dirent bound_imports; /* bound import table */ struct data_dirent import_addrs; /* import address table */ struct data_dirent delay_imports; /* delay-load import table */ - struct data_dirent clr_runtime_hdr; /* .cor (object only) */ + struct data_dirent clr_runtime_hdr; /* .cor (clr/.net executables) */ struct data_dirent reserved; }; diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 0aeb0e276a3e..c16cdeaa505e 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -375,7 +375,7 @@ do { \ } while (0) /* - * this_cpu operations (C) 2008-2013 Christoph Lameter <cl@linux.com> + * this_cpu operations (C) 2008-2013 Christoph Lameter <cl@gentwo.org> * * Optimized manipulation for memory allocated through the per cpu * allocator or for addresses of per cpu variables. diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b50447ef1c92..0b6e1f781d86 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1164,10 +1164,6 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) } #endif -#ifndef __HAVE_ARCH_PGD_OFFSET_GATE -#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) -#endif - #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, old_addr, new_addr) (pte) #endif @@ -1489,83 +1485,92 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) * vmf_insert_pfn. */ -/* - * track_pfn_remap is called when a _new_ pfn mapping is being established - * by remap_pfn_range() for physical range indicated by pfn and size. - */ -static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long addr, - unsigned long size) +static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, + pgprot_t *prot) { return 0; } -/* - * track_pfn_insert is called when a _new_ single pfn is established - * by vmf_insert_pfn(). 
- */ -static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn) +static inline int pfnmap_track(unsigned long pfn, unsigned long size, + pgprot_t *prot) { + return 0; } -/* - * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page - * tables copied during copy_page_range(). Will store the pfn to be - * passed to untrack_pfn_copy() only if there is something to be untracked. - * Callers should initialize the pfn to 0. - */ -static inline int track_pfn_copy(struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, unsigned long *pfn) +static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) { - return 0; } +#else +/** + * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * @prot: the pgprot to modify + * + * Lookup the cachemode for the pfn range starting at @pfn with the size + * @size and store it in @prot, leaving other data in @prot unchanged. + * + * This allows for a hardware implementation to have fine-grained control of + * memory cache behavior at page level granularity. Without a hardware + * implementation, this function does nothing. + * + * Currently there is only one implementation for this - x86 Page Attribute + * Table (PAT). See Documentation/arch/x86/pat.rst for more details. + * + * This function can fail if the pfn range spans pfns that require differing + * cachemodes. If the pfn range was previously verified to have a single + * cachemode, it is sufficient to query only a single pfn. The assumption is + * that this is the case for drivers using the vmf_insert_pfn*() interface. + * + * Returns 0 on success and -EINVAL on error. + */ +int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, + pgprot_t *prot); -/* - * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during - * copy_page_range(), but after track_pfn_copy() was already called. Can - * be called even if track_pfn_copy() did not actually track anything: - * handled internally. +/** + * pfnmap_track - track a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * @prot: the pgprot to track + * + * Requested the pfn range to be 'tracked' by a hardware implementation and + * setup the cachemode in @prot similar to pfnmap_setup_cachemode(). + * + * This allows for fine-grained control of memory cache behaviour at page + * level granularity. Tracking memory this way is persisted across VMA splits + * (VMA merging does not apply for VM_PFNMAP). + * + * Currently, there is only one implementation for this - x86 Page Attribute + * Table (PAT). See Documentation/arch/x86/pat.rst for more details. + * + * Returns 0 on success and -EINVAL on error. */ -static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma, - unsigned long pfn) -{ -} +int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); -/* - * untrack_pfn is called while unmapping a pfnmap for a region. - * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case pfn, size are zero). +/** + * pfnmap_untrack - untrack a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * + * Untrack a pfn range previously tracked through pfnmap_track(). 
*/ -static inline void untrack_pfn(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size, - bool mm_wr_locked) -{ -} +void pfnmap_untrack(unsigned long pfn, unsigned long size); +#endif -/* - * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA: +/** + * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn + * @pfn: the pfn + * @prot: the pgprot to modify + * + * Lookup the cachemode for @pfn and store it in @prot, leaving other + * data in @prot unchanged. * - * 1) During mremap() on the src VMA after the page tables were moved. - * 2) During fork() on the dst VMA, immediately after duplicating the src VMA. + * See pfnmap_setup_cachemode() for details. */ -static inline void untrack_pfn_clear(struct vm_area_struct *vma) +static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) { + pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } -#else -extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long addr, - unsigned long size); -extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn); -extern int track_pfn_copy(struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, unsigned long *pfn); -extern void untrack_pfn_copy(struct vm_area_struct *dst_vma, - unsigned long pfn); -extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size, bool mm_wr_locked); -extern void untrack_pfn_clear(struct vm_area_struct *vma); -#endif #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h index 673e96df453b..25620229b1d6 100644 --- a/include/linux/pinctrl/machine.h +++ b/include/linux/pinctrl/machine.h @@ -149,14 +149,18 @@ struct pinctrl_map { #define PIN_MAP_CONFIGS_GROUP_HOG_DEFAULT(dev, grp, cfgs) \ PIN_MAP_CONFIGS_GROUP(dev, PINCTRL_STATE_DEFAULT, dev, grp, cfgs) +struct device; struct pinctrl_map; #ifdef CONFIG_PINCTRL -extern int pinctrl_register_mappings(const struct pinctrl_map *map, - unsigned int num_maps); -extern void pinctrl_unregister_mappings(const struct pinctrl_map *map); -extern void pinctrl_provide_dummies(void); +int pinctrl_register_mappings(const struct pinctrl_map *map, + unsigned int num_maps); +int devm_pinctrl_register_mappings(struct device *dev, + const struct pinctrl_map *map, + unsigned int num_maps); +void pinctrl_unregister_mappings(const struct pinctrl_map *map); +void pinctrl_provide_dummies(void); #else static inline int pinctrl_register_mappings(const struct pinctrl_map *map, @@ -165,6 +169,13 @@ static inline int pinctrl_register_mappings(const struct pinctrl_map *map, return 0; } +static inline int devm_pinctrl_register_mappings(struct device *dev, + const struct pinctrl_map *map, + unsigned int num_maps) +{ + return 0; +} + static inline void pinctrl_unregister_mappings(const struct pinctrl_map *map) { } diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 8dbd51ea8626..240bd3bff18d 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -11,10 +11,17 @@ struct ptdump_range { }; struct ptdump_state { - /* level is 0:PGD to 4:PTE, or -1 if unknown */ - void (*note_page)(struct ptdump_state *st, unsigned long addr, - int level, u64 val); - void (*effective_prot)(struct ptdump_state *st, int level, u64 val); + void (*note_page_pte)(struct ptdump_state *st, unsigned long addr, pte_t pte); + void (*note_page_pmd)(struct ptdump_state *st, unsigned long addr, pmd_t pmd); + void 
(*note_page_pud)(struct ptdump_state *st, unsigned long addr, pud_t pud); + void (*note_page_p4d)(struct ptdump_state *st, unsigned long addr, p4d_t p4d); + void (*note_page_pgd)(struct ptdump_state *st, unsigned long addr, pgd_t pgd); + void (*note_page_flush)(struct ptdump_state *st); + void (*effective_prot_pte)(struct ptdump_state *st, pte_t pte); + void (*effective_prot_pmd)(struct ptdump_state *st, pmd_t pmd); + void (*effective_prot_pud)(struct ptdump_state *st, pud_t pud); + void (*effective_prot_p4d)(struct ptdump_state *st, p4d_t p4d); + void (*effective_prot_pgd)(struct ptdump_state *st, pgd_t pgd); const struct ptdump_range *range; }; diff --git a/include/linux/relay.h b/include/linux/relay.h index 72b876dd5cb8..b3224111d074 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -159,9 +159,6 @@ struct rchan *relay_open(const char *base_filename, size_t n_subbufs, const struct rchan_callbacks *cb, void *private_data); -extern int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent); extern void relay_close(struct rchan *chan); extern void relay_flush(struct rchan *chan); extern void relay_subbufs_consumed(struct rchan *chan, diff --git a/include/linux/reset.h b/include/linux/reset.h index 2986ced69a02..840d75d172f6 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -1005,6 +1005,12 @@ devm_reset_control_array_get_exclusive(struct device *dev) } static inline struct reset_control * +devm_reset_control_array_get_exclusive_released(struct device *dev) +{ + return devm_reset_control_array_get(dev, RESET_CONTROL_EXCLUSIVE_RELEASED); +} + +static inline struct reset_control * devm_reset_control_array_get_shared(struct device *dev) { return devm_reset_control_array_get(dev, RESET_CONTROL_SHARED); diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 56e27263acf8..cd7f0ae26615 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -192,6 +192,7 @@ void ring_buffer_record_off(struct trace_buffer *buffer); void ring_buffer_record_on(struct trace_buffer *buffer); bool ring_buffer_record_is_on(struct trace_buffer *buffer); bool ring_buffer_record_is_set_on(struct trace_buffer *buffer); +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu); diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index e49c32b0f394..dd8afe511242 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -391,13 +391,8 @@ struct rio_dev *rio_dev_get(struct rio_dev *); void rio_dev_put(struct rio_dev *); #ifdef CONFIG_RAPIDIO_DMA_ENGINE -extern struct dma_chan *rio_request_dma(struct rio_dev *rdev); extern struct dma_chan *rio_request_mport_dma(struct rio_mport *mport); extern void rio_release_dma(struct dma_chan *dchan); -extern struct dma_async_tx_descriptor *rio_dma_prep_slave_sg( - struct rio_dev *rdev, struct dma_chan *dchan, - struct rio_dma_data *data, - enum dma_transfer_direction direction, unsigned long flags); extern struct dma_async_tx_descriptor *rio_dma_prep_xfer( struct dma_chan *dchan, u16 destid, struct rio_dma_data *data, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6b82b618846e..c4f4903b1088 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -223,7 +223,7 @@ static inline void __folio_large_mapcount_sanity_checks(const struct folio *foli 
VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY && folio->_mm_id_mapcount[1] < 0); VM_WARN_ON_ONCE(!folio_mapped(folio) && - folio_test_large_maybe_mapped_shared(folio)); + test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)); } static __always_inline void folio_set_large_mapcount(struct folio *folio, diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 138e2f1bd08f..0cdbfc42f153 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -95,6 +95,28 @@ static inline bool sg_is_last(struct scatterlist *sg) } /** + * sg_next - return the next scatterlist entry in a list + * @sg: The current sg entry + * + * Description: + * Usually the next entry will be @sg@ + 1, but if this sg element is part + * of a chained scatterlist, it could jump to the start of a new + * scatterlist array. + * + **/ +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ + if (sg_is_last(sg)) + return NULL; + + sg++; + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + +/** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry * @page: The page @@ -418,7 +440,6 @@ static inline void sg_init_marker(struct scatterlist *sgl, int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); -struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); diff --git a/include/linux/sched.h b/include/linux/sched.h index 1f054f1f11b5..aa9c5be7a632 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1240,7 +1240,11 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - struct mutex *blocker_mutex; + /* + * Encoded lock address causing task block (lower 2 bits = type from + * <linux/hung_task.h>). Accessed via hung_task_*() helpers. 
+ */ + unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index cffad65bdc6a..85c5a6392e02 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -106,7 +106,6 @@ static inline unsigned long stack_not_used(struct task_struct *p) #endif extern void set_task_stack_end_magic(struct task_struct *tsk); -#ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: @@ -114,6 +113,5 @@ static inline int kstack_end(void *addr) */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } -#endif #endif /* _LINUX_SCHED_TASK_STACK_H */ diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h index 53b356a26414..27bd372cbfb1 100644 --- a/include/linux/scmi_imx_protocol.h +++ b/include/linux/scmi_imx_protocol.h @@ -11,9 +11,12 @@ #include <linux/bitfield.h> #include <linux/device.h> #include <linux/notifier.h> +#include <linux/scmi_protocol.h> #include <linux/types.h> +#define SCMI_PROTOCOL_IMX_LMM 0x80 #define SCMI_PROTOCOL_IMX_BBM 0x81 +#define SCMI_PROTOCOL_IMX_CPU 0x82 #define SCMI_PROTOCOL_IMX_MISC 0x84 #define SCMI_IMX_VENDOR "NXP" @@ -57,4 +60,43 @@ struct scmi_imx_misc_proto_ops { int (*misc_ctrl_req_notify)(const struct scmi_protocol_handle *ph, u32 ctrl_id, u32 evt_id, u32 flags); }; + +/* See LMM_ATTRIBUTES in imx95.rst */ +#define LMM_ID_DISCOVER 0xFFFFFFFFU +#define LMM_MAX_NAME 16 + +enum scmi_imx_lmm_state { + LMM_STATE_LM_OFF, + LMM_STATE_LM_ON, + LMM_STATE_LM_SUSPEND, + LMM_STATE_LM_POWERED, +}; + +struct scmi_imx_lmm_info { + u32 lmid; + enum scmi_imx_lmm_state state; + u32 errstatus; + u8 name[LMM_MAX_NAME]; +}; + +struct scmi_imx_lmm_proto_ops { + int (*lmm_power_boot)(const struct scmi_protocol_handle *ph, u32 lmid, + bool boot); + int (*lmm_info)(const struct scmi_protocol_handle *ph, u32 lmid, + struct scmi_imx_lmm_info *info); + int (*lmm_reset_vector_set)(const struct scmi_protocol_handle *ph, + u32 lmid, u32 cpuid, u32 flags, u64 vector); + int (*lmm_shutdown)(const struct scmi_protocol_handle *ph, u32 lmid, + u32 flags); +}; + +struct scmi_imx_cpu_proto_ops { + int (*cpu_reset_vector_set)(const struct scmi_protocol_handle *ph, + u32 cpuid, u64 vector, bool start, + bool boot, bool resume); + int (*cpu_start)(const struct scmi_protocol_handle *ph, u32 cpuid, + bool start); + int (*cpu_started)(const struct scmi_protocol_handle *ph, u32 cpuid, + bool *started); +}; #endif diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 04655faadc2d..89706157e622 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -16,13 +16,25 @@ struct semaphore { raw_spinlock_t lock; unsigned int count; struct list_head wait_list; + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + unsigned long last_holder; +#endif }; +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +#define __LAST_HOLDER_SEMAPHORE_INITIALIZER \ + , .last_holder = 0UL +#else +#define __LAST_HOLDER_SEMAPHORE_INITIALIZER +#endif + #define __SEMAPHORE_INITIALIZER(name, n) \ { \ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ + .wait_list = LIST_HEAD_INIT((name).wait_list) \ + __LAST_HOLDER_SEMAPHORE_INITIALIZER \ } /* @@ -47,5 +59,6 @@ extern int __must_check down_killable(struct semaphore *sem); extern int __must_check down_trylock(struct semaphore *sem); extern int __must_check down_timeout(struct semaphore *sem, long jiffies); 
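/*
 * Editor's note -- a hedged sketch, not kernel code: the encoded "blocker"
 * word added to task_struct above, and the semaphore last_holder field
 * added in this hunk, are classic tagged words. Lock addresses are at
 * least 4-byte aligned, so the low two bits are free to carry the lock
 * type from <linux/hung_task.h>. Hypothetical helpers could look like:
 *
 *	#define BLOCKER_TYPE_MASK	0x3UL
 *
 *	static inline unsigned long blocker_encode(void *lock, unsigned long type)
 *	{
 *		return (unsigned long)lock | (type & BLOCKER_TYPE_MASK);
 *	}
 *
 *	static inline void *blocker_lock(unsigned long blocker)
 *	{
 *		return (void *)(blocker & ~BLOCKER_TYPE_MASK);
 *	}
 *
 *	static inline unsigned long blocker_type(unsigned long blocker)
 *	{
 *		return blocker & BLOCKER_TYPE_MASK;
 *	}
 *
 * sem_last_holder(), declared just below, gives the hung-task detector the
 * task that most recently acquired a semaphore a blocked task sleeps on.
 */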
extern void up(struct semaphore *sem); +extern unsigned long sem_last_holder(struct semaphore *sem); #endif /* __LINUX_SEMAPHORE_H */ diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 8e5d78fb4847..7a69210a250c 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -24,6 +24,7 @@ #define LLCC_CMPTDMA 15 #define LLCC_DISP 16 #define LLCC_VIDFW 17 +#define LLCC_CAMFW 18 #define LLCC_MDMHPFX 20 #define LLCC_MDMPNG 21 #define LLCC_AUDHW 22 @@ -67,6 +68,13 @@ #define LLCC_EVCS_LEFT 67 #define LLCC_EVCS_RIGHT 68 #define LLCC_SPAD 69 +#define LLCC_VIDDEC 70 +#define LLCC_CAMOFE 71 +#define LLCC_CAMRTIP 72 +#define LLCC_CAMSRTIP 73 +#define LLCC_CAMRTRF 74 +#define LLCC_CAMSRTRF 75 +#define LLCC_CPUSSMPAM 89 /** * struct llcc_slice_desc - Cache slice descriptor diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index ce1a3790d6fb..0d5a17ea8fb8 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -658,9 +658,20 @@ #define EXYNOS5433_PAD_RETENTION_FSYSGENIO_OPTION (0x32A8) /* For Tensor GS101 */ +/* PMU ALIVE */ #define GS101_SYSIP_DAT0 (0x810) +#define GS101_CPU0_INFORM (0x860) +#define GS101_CPU_INFORM(cpu) \ + (GS101_CPU0_INFORM + (cpu*4)) #define GS101_SYSTEM_CONFIGURATION (0x3A00) #define GS101_PHY_CTRL_USB20 (0x3EB0) #define GS101_PHY_CTRL_USBDP (0x3EB4) +/* PMU INTR GEN */ +#define GS101_GRP1_INTR_BID_UPEND (0x0108) +#define GS101_GRP1_INTR_BID_CLEAR (0x010c) +#define GS101_GRP2_INTR_BID_ENABLE (0x0200) +#define GS101_GRP2_INTR_BID_UPEND (0x0208) +#define GS101_GRP2_INTR_BID_CLEAR (0x020c) + #endif /* __LINUX_SOC_EXYNOS_REGS_PMU_H */ diff --git a/include/linux/sort.h b/include/linux/sort.h index 8e5603b10941..c01ef804a0eb 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -4,6 +4,16 @@ #include <linux/types.h> +/** + * cmp_int - perform a three-way comparison of the arguments + * @l: the left argument + * @r: the right argument + * + * Return: 1 if the left argument is greater than the right one; 0 if the + * arguments are equal; -1 if the left argument is less than the right one. 
+ */ +#define cmp_int(l, r) (((l) > (r)) - ((l) < (r))) + void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, diff --git a/include/linux/swap.h b/include/linux/swap.h index db46b25a65ae..bc0e1c275fc0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -414,6 +414,10 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_PROACTIVE (1 << 2) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 + +/* Reclaim only from anon folios in proactive memory reclaim */ +#define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1) + extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, @@ -450,7 +454,7 @@ static inline unsigned long total_swapcache_pages(void) } void free_swap_cache(struct folio *folio); -void free_page_and_swap_cache(struct page *); +void free_folio_and_swap_cache(struct folio *folio); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; @@ -520,10 +524,8 @@ static inline void put_swap_device(struct swap_info_struct *si) #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) -/* only sparc can not include linux/pagemap.h in this file * so leave put_page and release_pages undeclared... */ -#define free_page_and_swap_cache(page) \ - put_page(page) +#define free_folio_and_swap_cache(folio) \ + folio_put(folio) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index a351763e6965..826ce3f8e1f8 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -464,16 +464,30 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #endif #define DECLARE_TRACE(name, proto, args) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + __DECLARE_TRACE(name##_tp, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()), \ PARAMS(void *__data, proto)) #define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + __DECLARE_TRACE(name##_tp, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto)) #define DECLARE_TRACE_SYSCALL(name, proto, args) \ + __DECLARE_TRACE_SYSCALL(name##_tp, PARAMS(proto), PARAMS(args), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT(name, proto, args) \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT_CONDITION(name, proto, args, cond) \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT_SYSCALL(name, proto, args) \ __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \ PARAMS(void *__data, proto)) @@ -591,32 +605,32 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print) #define DEFINE_EVENT(template, name, proto, args) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_PRINT(template,
name, proto, args, print) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_CONDITION(template, name, proto, \ args, cond) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FN(name, proto, args, struct, \ assign, print, reg, unreg) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT_FN_COND(name, proto, args, cond, struct, \ + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_FN_COND(name, proto, args, cond, struct, \ assign, print, reg, unreg) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT_CONDITION(name, proto, args, cond, \ struct, assign, print) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \ print, reg, unreg) \ - DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT_SYSCALL(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FLAGS(event, flag) diff --git a/include/linux/tsm-mr.h b/include/linux/tsm-mr.h new file mode 100644 index 000000000000..50a521f4ac97 --- /dev/null +++ b/include/linux/tsm-mr.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __TSM_MR_H +#define __TSM_MR_H + +#include <crypto/hash_info.h> + +/** + * struct tsm_measurement_register - describes an architectural measurement + * register (MR) + * @mr_name: name of the MR + * @mr_value: buffer containing the current value of the MR + * @mr_size: size of the MR - typically the digest size of @mr_hash + * @mr_flags: bitwise OR of one or more flags, detailed below + * @mr_hash: optional hash identifier defined in include/uapi/linux/hash_info.h. + * + * A CC guest driver encloses an array of this structure in struct + * tsm_measurements to detail the measurement facility supported by the + * underlying CC hardware. + * + * @mr_name and @mr_value must stay valid until this structure is no longer in + * use. + * + * @mr_flags is the bitwise-OR of zero or more of the flags below. + * + * * %TSM_MR_F_READABLE - the sysfs attribute corresponding to this MR is readable. + * * %TSM_MR_F_WRITABLE - the sysfs attribute corresponding to this MR is writable. + * The semantics is typically to extend the MR but could vary depending on the + * architecture and the MR. + * * %TSM_MR_F_LIVE - this MR's value may differ from the last value written, so + * must be read back from the underlying CC hardware/firmware. + * * %TSM_MR_F_RTMR - bitwise-OR of %TSM_MR_F_LIVE and %TSM_MR_F_WRITABLE. + * * %TSM_MR_F_NOHASH - this MR does NOT have an associated hash algorithm. + * @mr_hash will be ignored when this flag is set. 
+ */ +struct tsm_measurement_register { + const char *mr_name; + void *mr_value; + u32 mr_size; + u32 mr_flags; + enum hash_algo mr_hash; +}; + +#define TSM_MR_F_NOHASH 1 +#define TSM_MR_F_WRITABLE 2 +#define TSM_MR_F_READABLE 4 +#define TSM_MR_F_LIVE 8 +#define TSM_MR_F_RTMR (TSM_MR_F_LIVE | TSM_MR_F_WRITABLE) + +#define TSM_MR_(mr, hash) \ + .mr_name = #mr, .mr_size = hash##_DIGEST_SIZE, \ + .mr_hash = HASH_ALGO_##hash, .mr_flags = TSM_MR_F_READABLE + +/** + * struct tsm_measurements - defines the CC architecture specific measurement + * facility and methods for updating measurement registers (MRs) + * @mrs: Array of MR definitions. + * @nr_mrs: Number of elements in @mrs. + * @refresh: Callback function to load/sync all MRs from TVM hardware/firmware + * into the kernel cache. + * @write: Callback function to write to the MR specified by the parameter @mr. + * Typically, writing to an MR extends the input buffer to that MR. + * + * The @refresh callback is invoked when an MR with %TSM_MR_F_LIVE set is being + * read and the cache is stale. It must reload all MRs with %TSM_MR_F_LIVE set. + * The function parameter @tm is a pointer pointing back to this structure. + * + * The @write callback is invoked whenever an MR is being written. It takes two + * additional parameters besides @tm: + * + * * @mr - points to the MR (an element of @tm->mrs) being written. + * * @data - contains the bytes to write and whose size is @mr->mr_size. + * + * Both @refresh and @write should return 0 on success and an appropriate error + * code on failure. + */ +struct tsm_measurements { + const struct tsm_measurement_register *mrs; + size_t nr_mrs; + int (*refresh)(const struct tsm_measurements *tm); + int (*write)(const struct tsm_measurements *tm, + const struct tsm_measurement_register *mr, const u8 *data); +}; + +const struct attribute_group * +tsm_mr_create_attribute_group(const struct tsm_measurements *tm); +void tsm_mr_free_attribute_group(const struct attribute_group *attr_grp); + +#endif diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 11b0c525be30..431054810dca 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -6,17 +6,17 @@ #include <linux/types.h> #include <linux/uuid.h> -#define TSM_INBLOB_MAX 64 -#define TSM_OUTBLOB_MAX SZ_32K +#define TSM_REPORT_INBLOB_MAX 64 +#define TSM_REPORT_OUTBLOB_MAX SZ_32K /* * Privilege level is a nested permission concept to allow confidential * guests to partition address space, 4-levels are supported. 
 */ -#define TSM_PRIVLEVEL_MAX 3 +#define TSM_REPORT_PRIVLEVEL_MAX 3 /** - * struct tsm_desc - option descriptor for generating tsm report blobs + * struct tsm_report_desc - option descriptor for generating tsm report blobs * @privlevel: optional privilege level to associate with @outblob * @inblob_len: sizeof @inblob * @inblob: arbitrary input data @@ -24,10 +24,10 @@ * @service_guid: optional service-provider service guid to attest * @service_manifest_version: optional service-provider service manifest version requested */ -struct tsm_desc { +struct tsm_report_desc { unsigned int privlevel; size_t inblob_len; - u8 inblob[TSM_INBLOB_MAX]; + u8 inblob[TSM_REPORT_INBLOB_MAX]; char *service_provider; guid_t service_guid; unsigned int service_manifest_version; @@ -44,7 +44,7 @@ struct tsm_desc { * @manifestblob: (optional) manifest data associated with the report */ struct tsm_report { - struct tsm_desc desc; + struct tsm_report_desc desc; size_t outblob_len; u8 *outblob; size_t auxblob_len; @@ -88,7 +88,7 @@ enum tsm_bin_attr_index { }; /** - * struct tsm_ops - attributes and operations for tsm instances + * struct tsm_report_ops - attributes and operations for tsm_report instances * @name: tsm id reflected in /sys/kernel/config/tsm/report/$report/provider * @privlevel_floor: convey base privlevel for nested scenarios * @report_new: Populate @report with the report blob and auxblob @@ -99,7 +99,7 @@ enum tsm_bin_attr_index { * Implementation specific ops, only one is expected to be registered at * a time i.e. only one of "sev-guest", "tdx-guest", etc. */ -struct tsm_ops { +struct tsm_report_ops { const char *name; unsigned int privlevel_floor; int (*report_new)(struct tsm_report *report, void *data); @@ -107,6 +107,6 @@ struct tsm_ops { bool (*report_bin_attr_visible)(int n); }; -int tsm_register(const struct tsm_ops *ops, void *priv); -int tsm_unregister(const struct tsm_ops *ops); +int tsm_report_register(const struct tsm_report_ops *ops, void *priv); +int tsm_report_unregister(const struct tsm_report_ops *ops); #endif /* __TSM_H */ diff --git a/include/linux/turris-signing-key.h b/include/linux/turris-signing-key.h new file mode 100644 index 000000000000..8a435b73c3a9 --- /dev/null +++ b/include/linux/turris-signing-key.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * 2025 by Marek Behún <kabel@kernel.org> + */ + +#ifndef __TURRIS_SIGNING_KEY_H +#define __TURRIS_SIGNING_KEY_H + +#include <linux/key.h> +#include <linux/types.h> + +struct device; + +#ifdef CONFIG_KEYS +struct turris_signing_key_subtype { + u16 key_size; + u8 data_size; + u8 sig_size; + u8 public_key_size; + const char *hash_algo; + const void *(*get_public_key)(const struct key *key); + int (*sign)(const struct key *key, const void *msg, void *signature); +}; + +static inline struct device *turris_signing_key_get_dev(const struct key *key) +{ + return key->payload.data[1]; +} + +int +devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, + const char *desc); +#endif + +#endif /* __TURRIS_SIGNING_KEY_H */ diff --git a/include/linux/types.h b/include/linux/types.h index 49b79c8bb1a9..6dfdb8e8e4c3 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -136,6 +136,10 @@ typedef s64 ktime_t; typedef u64 sector_t; typedef u64 blkcnt_t; +/* generic data direction definitions */ +#define READ 0 +#define WRITE 1 + /* * The type of an index into the pagecache.
 */ diff --git a/include/linux/unroll.h b/include/linux/unroll.h index 863fb69f6a7e..186b71de740f 100644 --- a/include/linux/unroll.h +++ b/include/linux/unroll.h @@ -11,10 +11,8 @@ #ifdef CONFIG_CC_IS_CLANG #define __pick_unrolled(x, y) _Pragma(#x) -#elif CONFIG_GCC_VERSION >= 80000 -#define __pick_unrolled(x, y) _Pragma(#y) #else -#define __pick_unrolled(x, y) /* not supported */ +#define __pick_unrolled(x, y) _Pragma(#y) #endif /** diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 2e46b69ff0a6..516217c39094 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -188,13 +188,13 @@ struct uprobes_state { }; extern void __init uprobes_init(void); -extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); +extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 3b570b765b75..9373962aade9 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -2,7 +2,10 @@ #ifndef _LINUX_HELPER_MACROS_H_ #define _LINUX_HELPER_MACROS_H_ +#include <linux/compiler_attributes.h> #include <linux/math.h> +#include <linux/typecheck.h> +#include <linux/stddef.h> /** * for_each_if - helper for handling conditionals in various for_each macros @@ -80,6 +83,72 @@ }) /** + * PTR_IF - evaluate to @ptr if @cond is true, or to NULL otherwise. + * @cond: A conditional, usually in a form of IS_ENABLED(CONFIG_FOO) + * @ptr: A pointer to assign if @cond is true. + * + * PTR_IF(IS_ENABLED(CONFIG_FOO), ptr) evaluates to @ptr if CONFIG_FOO is set + * to 'y' or 'm', or to NULL otherwise. The @ptr argument must be a pointer. + * + * The macro can be very useful to help the compiler drop dead code. + * + * For instance, consider the following:: + * + * #ifdef CONFIG_FOO_SUSPEND + * static int foo_suspend(struct device *dev) + * { + * ... + * } + * #endif + * + * static struct pm_ops foo_ops = { + * #ifdef CONFIG_FOO_SUSPEND + * .suspend = foo_suspend, + * #endif + * }; + * + * While this works, the foo_suspend() function is compiled conditionally, + * only when CONFIG_FOO_SUSPEND is set. This is problematic: if there were + * a build bug in this function, we wouldn't have a way to know about it + * unless the configuration option is set. + * + * An alternative is to declare foo_suspend() always, but mark it + * as __maybe_unused.
This works, but the __maybe_unused attribute + * is required to instruct the compiler that the function may not + * be referenced anywhere, and is safe to remove without making + * a fuss about it. This makes the programmer responsible for tagging + * the functions that can be garbage-collected. + * + * With the macro it is possible to write the following: + * + * static int foo_suspend(struct device *dev) + * { + * ... + * } + * + * static struct pm_ops foo_ops = { + * .suspend = PTR_IF(IS_ENABLED(CONFIG_FOO_SUSPEND), foo_suspend), + * }; + * + * The foo_suspend() function will now be automatically dropped by the + * compiler, and it does not require any specific attribute. + */ +#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) + +/** + * u64_to_user_ptr - cast a pointer passed as u64 from user space to void __user * + * @x: The u64 value from user space, usually via IOCTL + * + * u64_to_user_ptr() simply casts a pointer passed as u64 from user space to void + * __user * correctly. Using this lets us get rid of all the tiresome casts. + */ +#define u64_to_user_ptr(x) \ +({ \ + typecheck(u64, (x)); \ + (void __user *)(uintptr_t)(x); \ +}) + +/** * is_insidevar - check if the @ptr points inside the @var memory range. * @ptr: the pointer to a memory address. * @var: the variable which address and size identify the memory range. diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 78eede109b1a..be850174e802 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -965,10 +965,12 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * + * Note that callers interested in whether wrapping has occurred should + * use __xa_alloc_cyclic() instead. + * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. - * Return: 0 if the allocation succeeded without wrapping. 1 if the - * allocation succeeded after wrapping, -ENOMEM if memory could not be + * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, @@ -981,7 +983,7 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); - return err; + return err < 0 ?
err : 0; } /** @@ -1039,10 +1043,12 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * + * Note that callers interested in whether wrapping has occurred should + * use __xa_alloc_cyclic() instead. + * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. - * Return: 0 if the allocation succeeded without wrapping. 1 if the - * allocation succeeded after wrapping, -ENOMEM if memory could not be + * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, @@ -1055,7 +1061,7 @@ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); - return err; + return err < 0 ? err : 0; } /** diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 52f30e526607..369ef068fad8 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -22,7 +22,7 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, - unsigned long *handle); + unsigned long *handle, const int nid); void zpool_free(struct zpool *pool, unsigned long handle); @@ -64,7 +64,7 @@ struct zpool_driver { void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, - unsigned long *handle); + unsigned long *handle, const int nid); void (*free)(void *pool, unsigned long handle); void *(*obj_read_begin)(void *pool, unsigned long handle, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index c26baf9fb331..13e9cc5490f7 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -26,7 +26,8 @@ struct zs_pool; struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags, + const int nid); void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool);
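The remaining notes are editor's usage sketches for interfaces reworked above; every foo_* identifier is hypothetical and nothing below is taken from the tree. First, the pfnmap_* API from the pgtable.h hunk: per its kernel-doc, a driver inserting individual pfns only needs the single-pfn cachemode lookup, while range mappings pair pfnmap_track() with pfnmap_untrack() on teardown.

static pgprot_t foo_prot_for_pfn(unsigned long pfn, pgprot_t base)
{
	pgprot_t prot = base;

	/* On x86 this consults PAT; without a hardware implementation it
	 * is a no-op that leaves @prot unchanged. */
	pfnmap_setup_cachemode_pfn(pfn, &prot);
	return prot;
}

static int foo_track_range(unsigned long pfn, unsigned long size, pgprot_t *prot)
{
	int err;

	err = pfnmap_track(pfn, size, prot);	/* also sets up the cachemode */
	if (err)
		return err;	/* e.g. the range spans conflicting cachemodes */
	/* ... establish the mapping; pfnmap_untrack(pfn, size) on teardown ... */
	return 0;
}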
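sg_next(), made a static inline in the scatterlist.h hunk above, returns NULL after the last entry and transparently hops over chain entries, so a plain walk needs no chaining awareness:

static size_t foo_sg_total_len(struct scatterlist *sgl)
{
	struct scatterlist *sg;
	size_t total = 0;

	/* works for chained lists too; sg_next() follows sg_chain_ptr() */
	for (sg = sgl; sg; sg = sg_next(sg))
		total += sg->length;

	return total;
}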
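cmp_int() from the sort.h hunk above yields exactly the three-way result a cmp_func_t must produce, and avoids the overflow of the naive "a - b" idiom on wide or unsigned types. A minimal comparator sketch (struct foo_range is illustrative):

struct foo_range {
	u64 start;
	u64 len;
};

static int foo_range_cmp(const void *a, const void *b)
{
	const struct foo_range *ra = a, *rb = b;

	/* subtracting two u64s and truncating to int would be wrong here */
	return cmp_int(ra->start, rb->start);
}

/* sort(ranges, nr, sizeof(*ranges), foo_range_cmp, NULL); */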
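The tracepoint.h hunk makes DECLARE_TRACE() emit symbols carrying a _tp suffix, while the new DECLARE_TRACE_EVENT() variants keep the bare names the TRACE_EVENT() family expands to. Under that reading, a bare tracepoint is declared and called as follows (sched_foo is illustrative):

/* in a shared header */
DECLARE_TRACE(sched_foo,
	      TP_PROTO(int cpu),
	      TP_ARGS(cpu));

/* at the call site: the generated static inline now has the _tp suffix */
static void foo_tick(int cpu)
{
	trace_sched_foo_tp(cpu);
}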
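For the new tsm-mr.h API, a CC guest driver describes its registers in struct tsm_measurements and hands it to tsm_mr_create_attribute_group(). A hedged sketch with one SHA-384 runtime-extendable register; buffer, names and callback bodies are illustrative:

#include <crypto/sha2.h>	/* SHA384_DIGEST_SIZE */
#include <linux/tsm-mr.h>

static u8 foo_rtmr0[SHA384_DIGEST_SIZE];

static const struct tsm_measurement_register foo_mrs[] = {
	{
		.mr_name  = "rtmr0",
		.mr_value = foo_rtmr0,
		.mr_size  = SHA384_DIGEST_SIZE,
		.mr_hash  = HASH_ALGO_SHA384,
		.mr_flags = TSM_MR_F_READABLE | TSM_MR_F_RTMR,
	},
};

static int foo_refresh(const struct tsm_measurements *tm)
{
	/* reload every TSM_MR_F_LIVE register from firmware into mr_value */
	return 0;
}

static int foo_write(const struct tsm_measurements *tm,
		     const struct tsm_measurement_register *mr, const u8 *data)
{
	/* extend @mr with mr->mr_size bytes from @data via firmware */
	return 0;
}

static const struct tsm_measurements foo_tm = {
	.mrs	 = foo_mrs,
	.nr_mrs	 = ARRAY_SIZE(foo_mrs),
	.refresh = foo_refresh,
	.write	 = foo_write,
};

/* later: group = tsm_mr_create_attribute_group(&foo_tm); */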
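u64_to_user_ptr(), now documented in the util_macros.h hunk, is the usual way to carry a userspace pointer through a fixed-layout ioctl argument struct, keeping the ABI identical for 32-bit and 64-bit userspace. A minimal sketch (struct foo_args is illustrative):

struct foo_args {
	__u64 data_ptr;		/* userspace buffer address, always 64-bit */
	__u32 len;
	__u32 pad;
};

static int foo_copy_in(const struct foo_args *args, void *dst)
{
	void __user *uptr = u64_to_user_ptr(args->data_ptr);

	/* typecheck() inside the macro rejects anything that is not a u64 */
	return copy_from_user(dst, uptr, args->len) ? -EFAULT : 0;
}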
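Finally, with the xarray.h change above the xa_alloc_cyclic*() wrappers return only 0 or a negative errno. A caller that still wants the wrapped indication (1) mirrors the wrapper around __xa_alloc_cyclic() directly:

static int foo_assign_id(struct xarray *xa, void *obj, u32 *id, u32 *next)
{
	int err;

	xa_lock(xa);
	err = __xa_alloc_cyclic(xa, id, obj, XA_LIMIT(1, 1023), next, GFP_KERNEL);
	xa_unlock(xa);

	if (err == 1)
		pr_debug("foo: id space wrapped\n");	/* wrapped, still success */
	return err < 0 ? err : 0;
}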