Diffstat (limited to 'arch/x86')
28 files changed, 507 insertions, 320 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0be4937203c7..340e5468980e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1858,8 +1858,7 @@ endchoice
 config X86_SGX
 	bool "Software Guard eXtensions (SGX)"
 	depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
-	depends on CRYPTO=y
-	depends on CRYPTO_SHA256=y
+	select CRYPTO_LIB_SHA256
 	select MMU_NOTIFIER
 	select NUMA_KEEP_MEMINFO if NUMA
 	select XARRAY_MULTI
@@ -2006,6 +2005,9 @@ config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
 config ARCH_SUPPORTS_KEXEC_JUMP
 	def_bool y
 
+config ARCH_SUPPORTS_KEXEC_HANDOVER
+	def_bool X86_64
+
 config ARCH_SUPPORTS_CRASH_DUMP
 	def_bool X86_64 || (X86_32 && HIGHMEM)
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index f03d59ea6e40..3b0948ad449f 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -760,6 +760,49 @@ static void process_e820_entries(unsigned long minimum,
 	}
 }
 
+/*
+ * If KHO is active, only process its scratch areas to ensure we are not
+ * stepping onto preserved memory.
+ */
+static bool process_kho_entries(unsigned long minimum, unsigned long image_size)
+{
+	struct kho_scratch *kho_scratch;
+	struct setup_data *ptr;
+	struct kho_data *kho;
+	int i, nr_areas = 0;
+
+	if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+		return false;
+
+	ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
+	while (ptr) {
+		if (ptr->type == SETUP_KEXEC_KHO) {
+			kho = (struct kho_data *)(unsigned long)ptr->data;
+			kho_scratch = (void *)(unsigned long)kho->scratch_addr;
+			nr_areas = kho->scratch_size / sizeof(*kho_scratch);
+			break;
+		}
+
+		ptr = (struct setup_data *)(unsigned long)ptr->next;
+	}
+
+	if (!nr_areas)
+		return false;
+
+	for (i = 0; i < nr_areas; i++) {
+		struct kho_scratch *area = &kho_scratch[i];
+		struct mem_vector region = {
+			.start = area->addr,
+			.size = area->size,
+		};
+
+		if (process_mem_region(&region, minimum, image_size))
+			break;
+	}
+
+	return true;
+}
+
 static unsigned long find_random_phys_addr(unsigned long minimum,
 					   unsigned long image_size)
 {
@@ -775,7 +818,12 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
 		return 0;
 	}
 
-	if (!process_efi_entries(minimum, image_size))
+	/*
+	 * During kexec handover only process KHO scratch areas that are known
+	 * not to contain any data that must be preserved.
+	 */
+	if (!process_kho_entries(minimum, image_size) &&
+	    !process_efi_entries(minimum, image_size))
 		process_e820_entries(minimum, image_size);
 
 	phys_addr = slots_fetch_random();
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index e30649e44d8f..e1f4fd5bc8ee 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -43,7 +43,7 @@ SYSSEG		= 0x1000		/* historical load address >> 4 */
 	.section ".bstext", "ax"
 #ifdef CONFIG_EFI_STUB
 	# "MZ", MS-DOS header
-	.word	MZ_MAGIC
+	.word	IMAGE_DOS_SIGNATURE
 	.org	0x38
 	#
 	# Offset to the PE header.
@@ -51,16 +51,16 @@ SYSSEG		= 0x1000		/* historical load address >> 4 */
 	.long	LINUX_PE_MAGIC
 	.long	pe_header
 pe_header:
-	.long	PE_MAGIC
+	.long	IMAGE_NT_SIGNATURE
 
 coff_header:
 #ifdef CONFIG_X86_32
 	.set	image_file_add_flags, IMAGE_FILE_32BIT_MACHINE
-	.set	pe_opt_magic, PE_OPT_MAGIC_PE32
+	.set	pe_opt_magic, IMAGE_NT_OPTIONAL_HDR32_MAGIC
 	.word	IMAGE_FILE_MACHINE_I386
 #else
 	.set	image_file_add_flags, 0
-	.set	pe_opt_magic, PE_OPT_MAGIC_PE32PLUS
+	.set	pe_opt_magic, IMAGE_NT_OPTIONAL_HDR64_MAGIC
 	.word	IMAGE_FILE_MACHINE_AMD64
 #endif
 	.word	section_count			# nr_sections
@@ -111,7 +111,7 @@ extra_header_fields:
 	.long	salign				# SizeOfHeaders
 	.long	0				# CheckSum
 	.word	IMAGE_SUBSYSTEM_EFI_APPLICATION	# Subsystem (EFI application)
-	.word	IMAGE_DLL_CHARACTERISTICS_NX_COMPAT	# DllCharacteristics
+	.word	IMAGE_DLLCHARACTERISTICS_NX_COMPAT	# DllCharacteristics
#ifdef CONFIG_X86_32
 	.long	0				# SizeOfStackReserve
 	.long	0				# SizeOfStackCommit
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index edab6d6049be..7b2833705d47 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -36,6 +36,7 @@
 /* TDX Module call error codes */
 #define TDCALL_RETURN_CODE(a)	((a) >> 32)
 #define TDCALL_INVALID_OPERAND	0xc0000100
+#define TDCALL_OPERAND_BUSY	0x80000200
 
 #define TDREPORT_SUBTYPE_0	0
 
@@ -109,12 +110,13 @@ static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
  *               REPORTDATA to be included into TDREPORT.
  * @tdreport: Address of the output buffer to store TDREPORT.
  *
- * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
- * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
+ * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.REPORT TDCALL.
+ *
  * It is used in the TDX guest driver module to get the TDREPORT0.
 *
- * Return 0 on success, -EINVAL for invalid operands, or -EIO on
- * other TDCALL failures.
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
  */
 int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
 {
@@ -128,7 +130,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
 	ret = __tdcall(TDG_MR_REPORT, &args);
 	if (ret) {
 		if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
-			return -EINVAL;
+			return -ENXIO;
+		else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+			return -EBUSY;
 		return -EIO;
 	}
 
@@ -137,6 +141,42 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
 EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
 
 /**
+ * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using
+ *                           TDG.MR.RTMR.EXTEND TDCALL.
+ * @index: Index of RTMR register to be extended.
+ * @data: Address of the input buffer with RTMR register extend data.
+ *
+ * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.RTMR.EXTEND TDCALL.
+ *
+ * It is used in the TDX guest driver module to allow user to extend the RTMR
+ * registers.
+ *
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
+ */
+int tdx_mcall_extend_rtmr(u8 index, u8 *data)
+{
+	struct tdx_module_args args = {
+		.rcx = virt_to_phys(data),
+		.rdx = index,
+	};
+	u64 ret;
+
+	ret = __tdcall(TDG_MR_RTMR_EXTEND, &args);
+	if (ret) {
+		if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
+			return -ENXIO;
+		if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+			return -EBUSY;
+		return -EIO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr);
+
+/**
  * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
  *                         hypercall.
  * @buf: Address of the directly mapped shared kernel buffer which
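A minimal caller sketch for the new helper (illustrative only, not part of this diff): RTMRs take a SHA-384 sized digest, and the TDX module spec asks for an aligned extend buffer, so the sketch below uses a power-of-two kmalloc() for natural alignment. The function name, the RTMR index, and the retry policy are assumptions.

/* Hypothetical sketch: extend RTMR2 with a SHA-384 digest. */
static int extend_rtmr2_example(const u8 digest[SHA384_DIGEST_SIZE])
{
	u8 *buf;
	int ret;

	/* power-of-two kmalloc() sizes are naturally aligned */
	buf = kmalloc(64, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	memcpy(buf, digest, SHA384_DIGEST_SIZE);
	/* -EBUSY means the TDX module asked us to try again */
	ret = tdx_mcall_extend_rtmr(2, buf);

	kfree(buf);
	return ret;
}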
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5ddba366d3b4..774430c3abff 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -777,6 +777,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot)
 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
 {
 	phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+	/* This bit combination is used to mark shadow stacks */
+	WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) ==
+		     _PAGE_DIRTY);
 	pfn ^= protnone_mask(pgprot_val(pgprot));
 	pfn &= PTE_PFN_MASK;
 	return __pte(pfn | check_pgprot(pgprot));
@@ -1073,22 +1076,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  */
 #define pmd_page(pmd)	pfn_to_page(pmd_pfn(pmd))
 
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * (Currently stuck as a macro because of indirect forward reference
- * to linux/mm.h:page_to_nid())
- */
-#define mk_pte(page, pgprot)						  \
-({									  \
-	pgprot_t __pgprot = pgprot;					  \
-									  \
-	WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
-		    _PAGE_DIRTY);					  \
-	pfn_pte(page_to_pfn(page), __pgprot);				  \
-})
-
 static inline int pmd_bad(pmd_t pmd)
 {
 	return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
@@ -1353,8 +1340,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 
 #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
 
-#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
-
 #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp,
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 8d9f1c9aaa4c..61f56cdaccb5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -4,6 +4,7 @@
 
 #include <asm/page.h>
 #include <asm-generic/set_memory.h>
+#include <asm/pgtable.h>
 
 #define set_memory_rox set_memory_rox
 int set_memory_rox(unsigned long addr, int numpages);
@@ -37,6 +38,7 @@ int set_memory_rox(unsigned long addr, int numpages);
  * The caller is required to take care of these.
  */
 
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
 int _set_memory_uc(unsigned long addr, int numpages);
 int _set_memory_wc(unsigned long addr, int numpages);
 int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 6324f4c6c545..692af46603a1 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -68,6 +68,8 @@ extern void x86_ce4100_early_setup(void);
 static inline void x86_ce4100_early_setup(void) { }
 #endif
 
+#include <linux/kexec_handover.h>
+
 #ifndef _SETUP
 
 #include <asm/espfix.h>
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index fd9209e996e7..2f3820342598 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -13,6 +13,7 @@
 /* TDX module Call Leaf IDs */
 #define TDG_VP_VMCALL			0
 #define TDG_VP_INFO			1
+#define TDG_MR_RTMR_EXTEND		2
 #define TDG_VP_VEINFO_GET		3
 #define TDG_MR_REPORT			4
 #define TDG_MEM_PAGE_ACCEPT		6
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 7c488ff0c764..c10dbb74cd00 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -38,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
 	return regs->orig_ax;
 }
 
+static inline void syscall_set_nr(struct task_struct *task,
+				  struct pt_regs *regs,
+				  int nr)
+{
+	regs->orig_ax = nr;
+}
+
 static inline void syscall_rollback(struct task_struct *task,
 				    struct pt_regs *regs)
 {
@@ -90,6 +97,18 @@ static inline void syscall_get_arguments(struct task_struct *task,
 	args[5] = regs->bp;
 }
 
+static inline void syscall_set_arguments(struct task_struct *task,
+					 struct pt_regs *regs,
+					 const unsigned long *args)
+{
+	regs->bx = args[0];
+	regs->cx = args[1];
+	regs->dx = args[2];
+	regs->si = args[3];
+	regs->di = args[4];
+	regs->bp = args[5];
+}
+
 static inline int syscall_get_arch(struct task_struct *task)
 {
 	return AUDIT_ARCH_I386;
@@ -121,6 +140,30 @@ static inline void syscall_get_arguments(struct task_struct *task,
 	}
 }
 
+static inline void syscall_set_arguments(struct task_struct *task,
+					 struct pt_regs *regs,
+					 const unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+	if (task->thread_info.status & TS_COMPAT) {
+		regs->bx = *args++;
+		regs->cx = *args++;
+		regs->dx = *args++;
+		regs->si = *args++;
+		regs->di = *args++;
+		regs->bp = *args;
+	} else
+# endif
+	{
+		regs->di = *args++;
+		regs->si = *args++;
+		regs->dx = *args++;
+		regs->r10 = *args++;
+		regs->r8 = *args++;
+		regs->r9 = *args;
+	}
+}
+
 static inline int syscall_get_arch(struct task_struct *task)
 {
 	/* x32 tasks should be considered AUDIT_ARCH_X86_64. */
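These setters are the mirror image of syscall_get_nr()/syscall_get_arguments(): they let generic code rewrite a stopped task's pending syscall (the kind of plumbing a PTRACE_SET_SYSCALL_INFO-style interface needs). A hedged sketch of the usual pattern; the function name and the deny policy are invented for illustration:

/* Hypothetical sketch: cancel a stopped tracee's pending syscall. */
static void example_deny_syscall(struct task_struct *task, struct pt_regs *regs)
{
	/* nr == -1 makes the syscall entry path skip dispatch entirely */
	syscall_set_nr(task, regs, -1);
	syscall_set_return_value(task, regs, -EPERM, 0);
}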
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 26ffc792e673..8b19294600c4 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -68,6 +68,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs);
 
 int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
 
+int tdx_mcall_extend_rtmr(u8 index, u8 *data);
+
 u64 tdx_hcall_get_quote(u8 *buf, size_t size);
 
 void __init tdx_dump_attributes(u64 td_attr);
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
index 50c45ead4e7c..2671c4e1b3a0 100644
--- a/arch/x86/include/uapi/asm/setup_data.h
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -13,7 +13,8 @@
 #define SETUP_CC_BLOB			7
 #define SETUP_IMA			8
 #define SETUP_RNG_SEED			9
-#define SETUP_ENUM_MAX			SETUP_RNG_SEED
+#define SETUP_KEXEC_KHO			10
+#define SETUP_ENUM_MAX			SETUP_KEXEC_KHO
 
 #define SETUP_INDIRECT			(1<<31)
 #define SETUP_TYPE_MAX			(SETUP_ENUM_MAX | SETUP_INDIRECT)
@@ -78,6 +79,16 @@ struct ima_setup_data {
 	__u64 size;
 } __attribute__((packed));
 
+/*
+ * Locations of kexec handover metadata
+ */
+struct kho_data {
+	__u64 fdt_addr;
+	__u64 fdt_size;
+	__u64 scratch_addr;
+	__u64 scratch_size;
+} __attribute__((packed));
+
 #endif /* __ASSEMBLER__ */
 
 #endif /* _UAPI_ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
index 4eddb4d571ef..30f39f92c98f 100644
--- a/arch/x86/kernel/cpu/sgx/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -2,7 +2,6 @@
 #ifndef __ARCH_SGX_DRIVER_H__
 #define __ARCH_SGX_DRIVER_H__
 
-#include <crypto/hash.h>
 #include <linux/kref.h>
 #include <linux/mmu_notifier.h>
 #include <linux/radix-tree.h>
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 776a20172867..66f1efa16fbb 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -3,6 +3,7 @@
 
 #include <asm/mman.h>
 #include <asm/sgx.h>
+#include <crypto/sha2.h>
 #include <linux/mman.h>
 #include <linux/delay.h>
 #include <linux/file.h>
@@ -463,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
 	return ret;
 }
 
-static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
-			      void *hash)
-{
-	SHASH_DESC_ON_STACK(shash, tfm);
-
-	shash->tfm = tfm;
-
-	return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
-}
-
-static int sgx_get_key_hash(const void *modulus, void *hash)
-{
-	struct crypto_shash *tfm;
-	int ret;
-
-	tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(tfm))
-		return PTR_ERR(tfm);
-
-	ret = __sgx_get_key_hash(tfm, modulus, hash);
-
-	crypto_free_shash(tfm);
-	return ret;
-}
-
 static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 			 void *token)
 {
@@ -523,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 	    sgx_xfrm_reserved_mask)
 		return -EINVAL;
 
-	ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
-	if (ret)
-		return ret;
+	sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner);
 
 	mutex_lock(&encl->lock);
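The deleted wrappers existed only to drive the allocation-heavy crypto_shash API. With CRYPTO_LIB_SHA256 selected, <crypto/sha2.h> provides direct, non-failing library calls. For reference, a sketch of the two usual forms as of this series (the example function names are ours):

#include <crypto/sha2.h>

/* One-shot: exactly what sgx_encl_init() now does. */
static void digest_oneshot(const void *buf, size_t len,
			   u8 out[SHA256_DIGEST_SIZE])
{
	sha256(buf, len, out);
}

/* Incremental form, for data that arrives in pieces. */
static void digest_two_parts(const void *a, size_t alen,
			     const void *b, size_t blen,
			     u8 out[SHA256_DIGEST_SIZE])
{
	struct sha256_state s;

	sha256_init(&s);
	sha256_update(&s, a, alen);
	sha256_update(&s, b, blen);
	sha256_final(&s, out);
}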
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 6722b2fc82cf..2de01b379aa3 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -720,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags)
 		goto out;
 	}
 
+	sgx_unmark_page_reclaimable(page);
+
 	/*
 	 * TBD: Add additional plumbing to enable pre-emptive
 	 * action for asynchronous poison notification. Until
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 0be61c45400c..bcb534688dfe 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -278,6 +278,7 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
 				 unsigned long long mend)
 {
 	unsigned long start, end;
+	int ret;
 
 	cmem->ranges[0].start = mstart;
 	cmem->ranges[0].end = mend;
@@ -286,22 +287,43 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
 	/* Exclude elf header region */
 	start = image->elf_load_addr;
 	end = start + image->elf_headers_sz - 1;
-	return crash_exclude_mem_range(cmem, start, end);
+	ret = crash_exclude_mem_range(cmem, start, end);
+
+	if (ret)
+		return ret;
+
+	/* Exclude dm crypt keys region */
+	if (image->dm_crypt_keys_addr) {
+		start = image->dm_crypt_keys_addr;
+		end = start + image->dm_crypt_keys_sz - 1;
+		return crash_exclude_mem_range(cmem, start, end);
+	}
+
+	return ret;
 }
 
 /* Prepare memory map for crash dump kernel */
 int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
 {
+	unsigned int nr_ranges = 0;
 	int i, ret = 0;
 	unsigned long flags;
 	struct e820_entry ei;
 	struct crash_memmap_data cmd;
 	struct crash_mem *cmem;
 
-	cmem = vzalloc(struct_size(cmem, ranges, 1));
+	/*
+	 * Using random kexec_buf for passing dm crypt keys may cause a range
+	 * split. So use two slots here.
+	 */
+	nr_ranges = 2;
+	cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
 	if (!cmem)
 		return -ENOMEM;
 
+	cmem->max_nr_ranges = nr_ranges;
+	cmem->nr_ranges = 0;
+
 	memset(&cmd, 0, sizeof(struct crash_memmap_data));
 	cmd.params = params;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9920122018a0..c3acbd26408b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1300,6 +1300,24 @@ void __init e820__memblock_setup(void)
 	}
 
 	/*
+	 * At this point memblock is only allowed to allocate from memory
+	 * below 1M (aka ISA_END_ADDRESS) up until direct map is completely set
+	 * up in init_mem_mapping().
+	 *
+	 * KHO kernels are special and use only scratch memory for memblock
+	 * allocations, but memory below 1M is ignored by kernel after early
+	 * boot and cannot be naturally marked as scratch.
+	 *
+	 * To allow allocation of the real-mode trampoline and a few (if any)
+	 * other very early allocations from below 1M forcibly mark the memory
+	 * below 1M as scratch.
+	 *
+	 * After real mode trampoline is allocated, we clear that scratch
+	 * marking.
+	 */
+	memblock_mark_kho_scratch(0, SZ_1M);
+
+	/*
 	 * 32-bit systems are limited to 4BG of memory even with HIGHMEM and
 	 * to even less without it.
 	 * Discard memory after max_pfn - the actual limit detected at runtime.
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 68530fad05f7..24a41f0e0cf1 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -27,6 +27,8 @@
 #include <asm/kexec-bzimage64.h>
 
 #define MAX_ELFCOREHDR_STR_LEN	30	/* elfcorehdr=0x<64bit-value> */
+#define MAX_DMCRYPTKEYS_STR_LEN	31	/* dmcryptkeys=0x<64bit-value> */
+
 
 /*
  * Defines lowest physical address for various segments. Not sure where
@@ -76,6 +78,10 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
 	if (image->type == KEXEC_TYPE_CRASH) {
 		len = sprintf(cmdline_ptr,
 			"elfcorehdr=0x%lx ", image->elf_load_addr);
+
+		if (image->dm_crypt_keys_addr != 0)
+			len += sprintf(cmdline_ptr + len,
+					"dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr);
 	}
 	memcpy(cmdline_ptr + len, cmdline, cmdline_len);
 	cmdline_len += len;
@@ -233,6 +239,32 @@ setup_ima_state(const struct kimage *image, struct boot_params *params,
 #endif /* CONFIG_IMA_KEXEC */
 }
 
+static void setup_kho(const struct kimage *image, struct boot_params *params,
+		      unsigned long params_load_addr,
+		      unsigned int setup_data_offset)
+{
+	struct setup_data *sd = (void *)params + setup_data_offset;
+	struct kho_data *kho = (void *)sd + sizeof(*sd);
+
+	if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+		return;
+
+	sd->type = SETUP_KEXEC_KHO;
+	sd->len = sizeof(struct kho_data);
+
+	/* Only add if we have all KHO images in place */
+	if (!image->kho.fdt || !image->kho.scratch)
+		return;
+
+	/* Add setup data */
+	kho->fdt_addr = image->kho.fdt;
+	kho->fdt_size = PAGE_SIZE;
+	kho->scratch_addr = image->kho.scratch->mem;
+	kho->scratch_size = image->kho.scratch->bufsz;
+	sd->next = params->hdr.setup_data;
+	params->hdr.setup_data = params_load_addr + setup_data_offset;
+}
+
 static int
 setup_boot_parameters(struct kimage *image, struct boot_params *params,
 		      unsigned long params_load_addr,
@@ -312,6 +344,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
 				sizeof(struct ima_setup_data);
 	}
 
+	if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+		/* Setup space to store preservation metadata */
+		setup_kho(image, params, params_load_addr, setup_data_offset);
+		setup_data_offset += sizeof(struct setup_data) +
+				     sizeof(struct kho_data);
+	}
+
 	/* Setup RNG seed */
 	setup_rng_seed(params, params_load_addr, setup_data_offset);
 
@@ -441,6 +480,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 		ret = crash_load_segments(image);
 		if (ret)
 			return ERR_PTR(ret);
+		ret = crash_load_dm_crypt_keys(image);
+		if (ret == -ENOENT) {
+			kexec_dprintk("No dm crypt key to load\n");
+		} else if (ret) {
+			pr_err("Failed to load dm crypt keys\n");
+			return ERR_PTR(ret);
+		}
+		if (image->dm_crypt_keys_addr &&
+		    cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN >
+		    header->cmdline_size) {
+			pr_err("Appending dmcryptkeys=<addr> to command line exceeds maximum allowed length\n");
+			return ERR_PTR(-EINVAL);
+		}
 	}
 #endif
 
@@ -468,6 +520,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	efi_map_sz = efi_get_runtime_map_size();
 	params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
 				MAX_ELFCOREHDR_STR_LEN;
+	if (image->dm_crypt_keys_addr)
+		params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN;
 	params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
 	kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
 				sizeof(struct setup_data) +
@@ -479,6 +533,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 		kbuf.bufsz += sizeof(struct setup_data) +
 			      sizeof(struct ima_setup_data);
 
+	if (IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+		kbuf.bufsz += sizeof(struct setup_data) +
+			      sizeof(struct kho_data);
+
 	params = kzalloc(kbuf.bufsz, GFP_KERNEL);
 	if (!params)
 		return ERR_PTR(-ENOMEM);
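setup_kho() above is the producer side of the boot protocol: a struct setup_data header immediately followed by its payload, prepended to the singly linked list rooted at boot_params.hdr.setup_data. The consumer side (both the decompressor's process_kho_entries() and setup.c's parse_setup_data() follow this shape) reduces to a list walk. A condensed sketch, assuming an environment where physical addresses are directly dereferenceable; real code uses early_memremap():

/* Hypothetical sketch: find a setup_data entry by type. */
static struct setup_data *find_setup_data(struct boot_params *bp, u32 type)
{
	unsigned long pa = bp->hdr.setup_data;

	while (pa) {
		struct setup_data *sd = (struct setup_data *)pa;

		if (sd->type == type)
			return sd;	/* payload starts at sd->data */
		pa = sd->next;
	}

	return NULL;
}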
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 949c9e4bfad2..697fb99406e6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -630,13 +630,35 @@ static void kexec_mark_crashkres(bool protect)
 	kexec_mark_range(control, crashk_res.end, protect);
 }
 
+/* make the memory storing dm crypt keys in/accessible */
+static void kexec_mark_dm_crypt_keys(bool protect)
+{
+	unsigned long start_paddr, end_paddr;
+	unsigned int nr_pages;
+
+	if (kexec_crash_image->dm_crypt_keys_addr) {
+		start_paddr = kexec_crash_image->dm_crypt_keys_addr;
+		end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1;
+		nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE;
+		if (protect)
+			set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages);
+		else
+			__set_memory_prot(
+				(unsigned long)phys_to_virt(start_paddr),
+				nr_pages,
+				__pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW));
+	}
+}
+
 void arch_kexec_protect_crashkres(void)
 {
 	kexec_mark_crashkres(true);
+	kexec_mark_dm_crypt_keys(true);
 }
 
 void arch_kexec_unprotect_crashkres(void)
 {
+	kexec_mark_dm_crypt_keys(false);
 	kexec_mark_crashkres(false);
 }
 #endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 7d9ed79a93c0..fb27be697128 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -282,8 +282,8 @@ static void __init cleanup_highmap(void)
 static void __init reserve_brk(void)
 {
 	if (_brk_end > _brk_start)
-		memblock_reserve(__pa_symbol(_brk_start),
-				 _brk_end - _brk_start);
+		memblock_reserve_kern(__pa_symbol(_brk_start),
+				      _brk_end - _brk_start);
 
 	/* Mark brk area as locked down and no longer taking any
 	   new allocations */
@@ -356,7 +356,7 @@ static void __init early_reserve_initrd(void)
 	    !ramdisk_image || !ramdisk_size)
 		return;		/* No initrd provided by bootloader */
 
-	memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+	memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image);
 }
 
 static void __init reserve_initrd(void)
@@ -409,7 +409,7 @@ static void __init add_early_ima_buffer(u64 phys_addr)
 	}
 
 	if (data->size) {
-		memblock_reserve(data->addr, data->size);
+		memblock_reserve_kern(data->addr, data->size);
 		ima_kexec_buffer_phys = data->addr;
 		ima_kexec_buffer_size = data->size;
 	}
@@ -447,6 +447,29 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size)
 }
 #endif
 
+static void __init add_kho(u64 phys_addr, u32 data_len)
+{
+	struct kho_data *kho;
+	u64 addr = phys_addr + sizeof(struct setup_data);
+	u64 size = data_len - sizeof(struct setup_data);
+
+	if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+		pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n");
+		return;
+	}
+
+	kho = early_memremap(addr, size);
+	if (!kho) {
+		pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n",
+			addr, size);
+		return;
+	}
+
+	kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size);
+
+	early_memunmap(kho, size);
+}
+
 static void __init parse_setup_data(void)
 {
 	struct setup_data *data;
@@ -475,6 +498,9 @@ static void __init parse_setup_data(void)
 		case SETUP_IMA:
 			add_early_ima_buffer(pa_data);
 			break;
+		case SETUP_KEXEC_KHO:
+			add_kho(pa_data, data_len);
+			break;
 		case SETUP_RNG_SEED:
 			data = early_memremap(pa_data, data_len);
 			add_bootloader_randomness(data->data, data->len);
@@ -549,7 +575,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 		len = sizeof(*data);
 		pa_next = data->next;
 
-		memblock_reserve(pa_data, sizeof(*data) + data->len);
+		memblock_reserve_kern(pa_data, sizeof(*data) + data->len);
 
 		if (data->type == SETUP_INDIRECT) {
 			len += data->len;
@@ -563,7 +589,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 			indirect = (struct setup_indirect *)data->data;
 
 			if (indirect->type != SETUP_INDIRECT)
-				memblock_reserve(indirect->addr, indirect->len);
+				memblock_reserve_kern(indirect->addr, indirect->len);
 		}
 
 		pa_data = pa_next;
@@ -766,8 +792,8 @@ static void __init early_reserve_memory(void)
 	 * __end_of_kernel_reserve symbol must be explicitly reserved with a
 	 * separate memblock_reserve() or they will be discarded.
 	 */
-	memblock_reserve(__pa_symbol(_text),
-			 (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+	memblock_reserve_kern(__pa_symbol(_text),
+			      (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
 
 	/*
 	 * The first 4Kb of memory is a BIOS owned area, but generally it is
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b90d872aa0c8..1ba92ac9441d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1244,10 +1244,6 @@ void play_dead_common(void)
 	local_irq_disable();
 }
 
-/*
- * We need to flush the caches before going to sleep, lest we have
- * dirty data in our caches when we come back up.
- */
 void __noreturn mwait_play_dead(unsigned int eax_hint)
 {
 	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
@@ -1294,6 +1290,50 @@ void __noreturn mwait_play_dead(unsigned int eax_hint)
 }
 
 /*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead_cpuid_hint(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	unsigned int highest_cstate = 0;
+	unsigned int highest_subcstate = 0;
+	int i;
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+		return;
+	if (!this_cpu_has(X86_FEATURE_MWAIT))
+		return;
+	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
+		return;
+
+	eax = CPUID_LEAF_MWAIT;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	/*
+	 * eax will be 0 if EDX enumeration is not valid.
+	 * Initialized below to cstate, sub_cstate value when EDX is valid.
+	 */
+	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+		eax = 0;
+	} else {
+		edx >>= MWAIT_SUBSTATE_SIZE;
+		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+			if (edx & MWAIT_SUBSTATE_MASK) {
+				highest_cstate = i;
+				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+			}
+		}
+		eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+		      (highest_subcstate - 1);
+	}
+
+	mwait_play_dead(eax);
+}
+
+/*
  * Kick all "offline" CPUs out of mwait on kexec(). See comment in
  * mwait_play_dead().
  */
@@ -1343,9 +1383,9 @@ void native_play_dead(void)
 	play_dead_common();
 	tboot_shutdown(TB_SHUTDOWN_WFS);
 
-	/* Below returns only on error. */
-	cpuidle_play_dead();
-	hlt_play_dead();
+	mwait_play_dead_cpuid_hint();
+	if (cpuidle_play_dead())
+		hlt_play_dead();
 }
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
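The hint handed to mwait_play_dead() packs the target C-state into the upper nibble and the sub-state into the lower one. A worked example with invented CPUID values:

/*
 * Illustrative numbers: CPUID(5) returns EDX = 0x00002220 and ECX has
 * CPUID5_ECX_EXTENSIONS_SUPPORTED set. After the low nibble is shifted
 * out, the loop sees sub-state counts 2, 2, 2 for i = 0..2, leaving
 * highest_cstate = 2 and highest_subcstate = 2. The resulting hint is
 *
 *	eax = (2 << MWAIT_SUBSTATE_SIZE) | (2 - 1) = 0x21
 *
 * i.e. the deepest enumerated C-state at its deepest sub-state. When the
 * extension bit is clear, eax stays 0 (the shallowest MWAIT state).
 */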
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 89079ea73e65..a4700ef6eb64 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -266,6 +266,32 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
 	st->prot_levels[level] = effective;
 }
 
+static void effective_prot_pte(struct ptdump_state *st, pte_t pte)
+{
+	effective_prot(st, 4, pte_val(pte));
+}
+
+static void effective_prot_pmd(struct ptdump_state *st, pmd_t pmd)
+{
+	effective_prot(st, 3, pmd_val(pmd));
+}
+
+static void effective_prot_pud(struct ptdump_state *st, pud_t pud)
+{
+	effective_prot(st, 2, pud_val(pud));
+}
+
+static void effective_prot_p4d(struct ptdump_state *st, p4d_t p4d)
+{
+	effective_prot(st, 1, p4d_val(p4d));
+}
+
+static void effective_prot_pgd(struct ptdump_state *st, pgd_t pgd)
+{
+	effective_prot(st, 0, pgd_val(pgd));
+}
+
+
 /*
  * This function gets called on a break in a continuous series
  * of PTE entries; the next one is different so we need to
@@ -362,6 +388,38 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 	}
 }
 
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+	note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+	note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+	note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+	note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+	note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+	pte_t pte_zero = {0};
+
+	note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
 bool ptdump_walk_pgd_level_core(struct seq_file *m,
 				struct mm_struct *mm, pgd_t *pgd,
 				bool checkwx, bool dmesg)
@@ -378,8 +436,17 @@ bool ptdump_walk_pgd_level_core(struct seq_file *m,
 	struct pg_state st = {
 		.ptdump = {
-			.note_page	= note_page,
-			.effective_prot = effective_prot,
+			.note_page_pte	= note_page_pte,
+			.note_page_pmd	= note_page_pmd,
+			.note_page_pud	= note_page_pud,
+			.note_page_p4d	= note_page_p4d,
+			.note_page_pgd	= note_page_pgd,
+			.note_page_flush = note_page_flush,
+			.effective_prot_pte = effective_prot_pte,
+			.effective_prot_pmd = effective_prot_pmd,
+			.effective_prot_pud = effective_prot_pud,
+			.effective_prot_p4d = effective_prot_p4d,
+			.effective_prot_pgd = effective_prot_pgd,
 			.range		= ptdump_ranges
 		},
 		.level = -1,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 66330fe4e18c..ee66fae9ebcc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1467,16 +1467,21 @@ static unsigned long probe_memory_block_size(void)
 	}
 
 	/*
-	 * Use max block size to minimize overhead on bare metal, where
-	 * alignment for memory hotplug isn't a concern.
+	 * When hotplug alignment is not a concern, maximize blocksize
+	 * to minimize overhead. Otherwise, align to the lesser of advice
+	 * alignment and end of memory alignment.
 	 */
-	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+	bz = memory_block_advised_max_size();
+	if (!bz) {
 		bz = MAX_BLOCK_SIZE;
-		goto done;
+		if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+			goto done;
+	} else {
+		bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE);
 	}
 
 	/* Find the largest allowed block size that aligns to memory end */
-	for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
+	for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
 		if (IS_ALIGNED(boot_mem_end, bz))
 			break;
 	}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 331e101bf801..12c8180ca1ba 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -71,7 +71,7 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 static unsigned int __ioremap_check_ram(struct resource *res)
 {
 	unsigned long start_pfn, stop_pfn;
-	unsigned long i;
+	unsigned long pfn;
 
 	if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
 		return 0;
@@ -79,9 +79,8 @@ static unsigned int __ioremap_check_ram(struct resource *res)
 	start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	stop_pfn = (res->end + 1) >> PAGE_SHIFT;
 	if (stop_pfn > start_pfn) {
-		for (i = 0; i < (stop_pfn - start_pfn); ++i)
-			if (pfn_valid(start_pfn + i) &&
-			    !PageReserved(pfn_to_page(start_pfn + i)))
+		for_each_valid_pfn(pfn, start_pfn, stop_pfn)
+			if (!PageReserved(pfn_to_page(pfn)))
 				return IORES_MAP_SYSTEM_RAM;
 	}
 
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index c97b527c66fe..2e7923844afe 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -775,6 +775,12 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 	return vma_prot;
 }
 
+static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
+{
+	*prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
+			 cachemode2protval(pcm));
+}
+
 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t *vma_prot)
 {
@@ -789,8 +795,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	if (file->f_flags & O_DSYNC)
 		pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
-	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
-			     cachemode2protval(pcm));
+	pgprot_set_cachemode(vma_prot, pcm);
 	return 1;
 }
 
@@ -831,8 +836,7 @@ int memtype_kernel_map_sync(u64 base, unsigned long size,
  * Reserved non RAM regions only and after successful memtype_reserve,
  * this func also keeps identity mapping (if any) in sync with this new prot.
  */
-static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
-				int strict_prot)
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot)
 {
 	int is_ram = 0;
 	int ret;
@@ -858,9 +862,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 				(unsigned long long)paddr,
 				(unsigned long long)(paddr + size - 1),
 				cattr_name(pcm));
-			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
-					     (~_PAGE_CACHE_MASK)) |
-					     cachemode2protval(pcm));
+			pgprot_set_cachemode(vma_prot, pcm);
 		}
 		return 0;
 	}
@@ -870,8 +872,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		return ret;
 
 	if (pcm != want_pcm) {
-		if (strict_prot ||
-		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
+		if (!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 			memtype_free(paddr, paddr + size);
 			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
 			       current->comm, current->pid,
@@ -881,13 +882,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 			       cattr_name(pcm));
 			return -EINVAL;
 		}
-		/*
-		 * We allow returning different type than the one requested in
-		 * non strict case.
-		 */
-		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
-				      (~_PAGE_CACHE_MASK)) |
-				     cachemode2protval(pcm));
+		pgprot_set_cachemode(vma_prot, pcm);
 	}
 
 	if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
@@ -910,124 +905,14 @@ static void free_pfn_range(u64 paddr, unsigned long size)
 	memtype_free(paddr, paddr + size);
 }
 
-static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
-		resource_size_t *phys)
-{
-	struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
-
-	if (follow_pfnmap_start(&args))
-		return -EINVAL;
-
-	/* Never return PFNs of anon folios in COW mappings. */
-	if (!args.special) {
-		follow_pfnmap_end(&args);
-		return -EINVAL;
-	}
-
-	*prot = pgprot_val(args.pgprot);
-	*phys = (resource_size_t)args.pfn << PAGE_SHIFT;
-	follow_pfnmap_end(&args);
-	return 0;
-}
-
-static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
-		pgprot_t *pgprot)
-{
-	unsigned long prot;
-
-	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
-
-	/*
-	 * We need the starting PFN and cachemode used for track_pfn_remap()
-	 * that covered the whole VMA. For most mappings, we can obtain that
-	 * information from the page tables. For COW mappings, we might now
-	 * suddenly have anon folios mapped and follow_phys() will fail.
-	 *
-	 * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
-	 * detect the PFN. If we need the cachemode as well, we're out of luck
-	 * for now and have to fail fork().
-	 */
-	if (!follow_phys(vma, &prot, paddr)) {
-		if (pgprot)
-			*pgprot = __pgprot(prot);
-		return 0;
-	}
-	if (is_cow_mapping(vma->vm_flags)) {
-		if (pgprot)
-			return -EINVAL;
-		*paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
-		return 0;
-	}
-	WARN_ON_ONCE(1);
-	return -EINVAL;
-}
-
-int track_pfn_copy(struct vm_area_struct *dst_vma,
-		struct vm_area_struct *src_vma, unsigned long *pfn)
-{
-	const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
-	resource_size_t paddr;
-	pgprot_t pgprot;
-	int rc;
-
-	if (!(src_vma->vm_flags & VM_PAT))
-		return 0;
-
-	/*
-	 * Duplicate the PAT information for the dst VMA based on the src
-	 * VMA.
-	 */
-	if (get_pat_info(src_vma, &paddr, &pgprot))
-		return -EINVAL;
-	rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
-	if (rc)
-		return rc;
-
-	/* Reservation for the destination VMA succeeded. */
-	vm_flags_set(dst_vma, VM_PAT);
-	*pfn = PHYS_PFN(paddr);
-	return 0;
-}
-
-void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
-{
-	untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
-	/*
-	 * Reservation was freed, any copied page tables will get cleaned
-	 * up later, but without getting PAT involved again.
-	 */
-}
-
-/*
- * prot is passed in as a parameter for the new mapping. If the vma has
- * a linear pfn mapping for the entire range, or no vma is provided,
- * reserve the entire pfn + size range with single reserve_pfn_range
- * call.
- */
-int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
-		unsigned long pfn, unsigned long addr, unsigned long size)
+int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot)
 {
 	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
 	enum page_cache_mode pcm;
 
-	/* reserve the whole chunk starting from paddr */
-	if (!vma || (addr == vma->vm_start
-				&& size == (vma->vm_end - vma->vm_start))) {
-		int ret;
-
-		ret = reserve_pfn_range(paddr, size, prot, 0);
-		if (ret == 0 && vma)
-			vm_flags_set(vma, VM_PAT);
-		return ret;
-	}
-
 	if (!pat_enabled())
 		return 0;
 
-	/*
-	 * For anything smaller than the vma size we set prot based on the
-	 * lookup.
-	 */
 	pcm = lookup_memtype(paddr);
 
 	/* Check memtype for the remaining pages */
@@ -1038,70 +923,35 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 			return -EINVAL;
 	}
 
-	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
-			 cachemode2protval(pcm));
-
+	pgprot_set_cachemode(prot, pcm);
 	return 0;
 }
 
-void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
+int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot)
 {
-	enum page_cache_mode pcm;
+	const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
 
-	if (!pat_enabled())
-		return;
-
-	/* Set prot based on lookup */
-	pcm = lookup_memtype(pfn_t_to_phys(pfn));
-	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
-			 cachemode2protval(pcm));
+	return reserve_pfn_range(paddr, size, prot);
 }
 
-/*
- * untrack_pfn is called while unmapping a pfnmap for a region.
- * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case pfn, size are zero).
- */
-void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
-		 unsigned long size, bool mm_wr_locked)
+void pfnmap_untrack(unsigned long pfn, unsigned long size)
 {
-	resource_size_t paddr;
-
-	if (vma && !(vma->vm_flags & VM_PAT))
-		return;
+	const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
 
-	/* free the chunk starting from pfn or the whole chunk */
-	paddr = (resource_size_t)pfn << PAGE_SHIFT;
-	if (!paddr && !size) {
-		if (get_pat_info(vma, &paddr, NULL))
-			return;
-		size = vma->vm_end - vma->vm_start;
-	}
 	free_pfn_range(paddr, size);
-	if (vma) {
-		if (mm_wr_locked)
-			vm_flags_clear(vma, VM_PAT);
-		else
-			__vm_flags_mod(vma, 0, VM_PAT);
-	}
-}
-
-void untrack_pfn_clear(struct vm_area_struct *vma)
-{
-	vm_flags_clear(vma, VM_PAT);
 }
 
 pgprot_t pgprot_writecombine(pgprot_t prot)
 {
-	return __pgprot(pgprot_val(prot) |
-			cachemode2protval(_PAGE_CACHE_MODE_WC));
+	pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WC);
+	return prot;
 }
 EXPORT_SYMBOL_GPL(pgprot_writecombine);
 
 pgprot_t pgprot_writethrough(pgprot_t prot)
 {
-	return __pgprot(pgprot_val(prot) |
-			cachemode2protval(_PAGE_CACHE_MODE_WT));
+	pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WT);
	return prot;
 }
 EXPORT_SYMBOL_GPL(pgprot_writethrough);
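With the VM_PAT flag handling and the follow_phys()/get_pat_info() VMA spelunking removed, the x86 side now exposes plain pfn/size primitives and leaves per-VMA bookkeeping to generic mm. An illustrative caller sketch (assumed usage, not part of this diff; the function name is invented):

/* Hypothetical sketch: reserve a memtype before mapping a raw PFN range. */
static int example_map_pfn_range(unsigned long pfn, unsigned long size,
				 pgprot_t *prot)
{
	/* may rewrite *prot's cachemode to the reserved type */
	int ret = pfnmap_track(pfn, size, prot);

	if (ret)
		return ret;

	/*
	 * ... install PTEs with *prot here; if that fails, undo the
	 * reservation with pfnmap_untrack(pfn, size) ...
	 */
	return 0;
}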
diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c
index 645613d59942..e5844ed1311e 100644
--- a/arch/x86/mm/pat/memtype_interval.c
+++ b/arch/x86/mm/pat/memtype_interval.c
@@ -49,32 +49,6 @@ INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
 
 static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
 
-enum {
-	MEMTYPE_EXACT_MATCH	= 0,
-	MEMTYPE_END_MATCH	= 1
-};
-
-static struct memtype *memtype_match(u64 start, u64 end, int match_type)
-{
-	struct memtype *entry_match;
-
-	entry_match = interval_iter_first(&memtype_rbroot, start, end-1);
-
-	while (entry_match != NULL && entry_match->start < end) {
-		if ((match_type == MEMTYPE_EXACT_MATCH) &&
-		    (entry_match->start == start) && (entry_match->end == end))
-			return entry_match;
-
-		if ((match_type == MEMTYPE_END_MATCH) &&
-		    (entry_match->start < start) && (entry_match->end == end))
-			return entry_match;
-
-		entry_match = interval_iter_next(entry_match, start, end-1);
-	}
-
-	return NULL; /* Returns NULL if there is no match */
-}
-
 static int memtype_check_conflict(u64 start, u64 end,
 				  enum page_cache_mode reqtype,
 				  enum page_cache_mode *newtype)
@@ -130,35 +104,16 @@ int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_ty
 
 struct memtype *memtype_erase(u64 start, u64 end)
 {
-	struct memtype *entry_old;
-
-	/*
-	 * Since the memtype_rbroot tree allows overlapping ranges,
-	 * memtype_erase() checks with EXACT_MATCH first, i.e. free
-	 * a whole node for the munmap case. If no such entry is found,
-	 * it then checks with END_MATCH, i.e. shrink the size of a node
-	 * from the end for the mremap case.
-	 */
-	entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
-	if (!entry_old) {
-		entry_old = memtype_match(start, end, MEMTYPE_END_MATCH);
-		if (!entry_old)
-			return ERR_PTR(-EINVAL);
+	struct memtype *entry = interval_iter_first(&memtype_rbroot, start, end - 1);
+
+	while (entry && entry->start < end) {
+		if (entry->start == start && entry->end == end) {
+			interval_remove(entry, &memtype_rbroot);
+			return entry;
+		}
+		entry = interval_iter_next(entry, start, end - 1);
 	}
-
-	if (entry_old->start == start) {
-		/* munmap: erase this node */
-		interval_remove(entry_old, &memtype_rbroot);
-	} else {
-		/* mremap: update the end value of this node */
-		interval_remove(entry_old, &memtype_rbroot);
-		entry_old->end = start;
-		interval_insert(entry_old, &memtype_rbroot);
-
-		return NULL;
-	}
-
-	return entry_old;
+
+	return ERR_PTR(-EINVAL);
 }
 
 struct memtype *memtype_lookup(u64 addr)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 30ab4aced761..46edc11726b7 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2148,6 +2148,19 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
 					CPA_PAGES_ARRAY, pages);
 }
 
+/*
+ * __set_memory_prot is an internal helper for callers that have been passed
+ * a pgprot_t value from upper layers and a reservation has already been taken.
+ * If you want to set the pgprot to a specific page protocol, use the
+ * set_memory_xx() functions.
+ */
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+	return change_page_attr_set_clr(&addr, numpages, prot,
+					__pgprot(~pgprot_val(prot)),
+					0, 0, NULL);
+}
+
 int _set_memory_uc(unsigned long addr, int numpages)
 {
 	/*
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 62777ba4de1a..ddf248c3ee7d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -189,7 +189,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
 		if (!ptdesc)
 			failed = true;
 
-		if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
+		if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
 			pagetable_free(ptdesc);
 			ptdesc = NULL;
 			failed = true;
@@ -751,14 +751,13 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 	for (i = 0; i < PTRS_PER_PMD; i++) {
 		if (!pmd_none(pmd_sv[i])) {
 			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
-			free_page((unsigned long)pte);
+			pte_free_kernel(&init_mm, pte);
 		}
 	}
 
 	free_page((unsigned long)pmd_sv);
 
-	pagetable_dtor(virt_to_ptdesc(pmd));
-	free_page((unsigned long)pmd);
+	pmd_free(&init_mm, pmd);
 
 	return 1;
 }
@@ -781,7 +780,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 	/* INVLPG to clear all paging-structure caches */
 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
 
-	free_page((unsigned long)pte);
+	pte_free_kernel(&init_mm, pte);
 
 	return 1;
 }
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index ed5c63c0b4e5..88be32026768 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -66,6 +66,8 @@ void __init reserve_real_mode(void)
 	 * setup_arch().
 	 */
 	memblock_reserve(0, SZ_1M);
+
+	memblock_clear_kho_scratch(0, SZ_1M);
 }
 
 static void __init sme_sev_setup_real_mode(struct trampoline_header *th)