summary refs log tree commit diff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kconfig 6
-rw-r--r-- arch/x86/boot/compressed/kaslr.c 50
-rw-r--r-- arch/x86/boot/header.S 10
-rw-r--r-- arch/x86/coco/tdx/tdx.c 50
-rw-r--r-- arch/x86/include/asm/pgtable.h 21
-rw-r--r-- arch/x86/include/asm/set_memory.h 2
-rw-r--r-- arch/x86/include/asm/setup.h 2
-rw-r--r-- arch/x86/include/asm/shared/tdx.h 1
-rw-r--r-- arch/x86/include/asm/syscall.h 43
-rw-r--r-- arch/x86/include/asm/tdx.h 2
-rw-r--r-- arch/x86/include/uapi/asm/setup_data.h 13
-rw-r--r-- arch/x86/kernel/cpu/sgx/driver.h 1
-rw-r--r-- arch/x86/kernel/cpu/sgx/ioctl.c 30
-rw-r--r-- arch/x86/kernel/cpu/sgx/main.c 2
-rw-r--r-- arch/x86/kernel/crash.c 26
-rw-r--r-- arch/x86/kernel/e820.c 18
-rw-r--r-- arch/x86/kernel/kexec-bzimage64.c 58
-rw-r--r-- arch/x86/kernel/machine_kexec_64.c 22
-rw-r--r-- arch/x86/kernel/setup.c 42
-rw-r--r-- arch/x86/kernel/smpboot.c 54
-rw-r--r-- arch/x86/mm/dump_pagetables.c 71
-rw-r--r-- arch/x86/mm/init_64.c 15
-rw-r--r-- arch/x86/mm/ioremap.c 7
-rw-r--r-- arch/x86/mm/pat/memtype.c 194
-rw-r--r-- arch/x86/mm/pat/memtype_interval.c 63
-rw-r--r-- arch/x86/mm/pat/set_memory.c 13
-rw-r--r-- arch/x86/mm/pgtable.c 9
-rw-r--r-- arch/x86/realmode/init.c 2
28 files changed, 507 insertions, 320 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0be4937203c7..340e5468980e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1858,8 +1858,7 @@ endchoice
config X86_SGX
bool "Software Guard eXtensions (SGX)"
depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
- depends on CRYPTO=y
- depends on CRYPTO_SHA256=y
+ select CRYPTO_LIB_SHA256
select MMU_NOTIFIER
select NUMA_KEEP_MEMINFO if NUMA
select XARRAY_MULTI
@@ -2006,6 +2005,9 @@ config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
config ARCH_SUPPORTS_KEXEC_JUMP
def_bool y
+config ARCH_SUPPORTS_KEXEC_HANDOVER
+ def_bool X86_64
+
config ARCH_SUPPORTS_CRASH_DUMP
def_bool X86_64 || (X86_32 && HIGHMEM)
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index f03d59ea6e40..3b0948ad449f 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -760,6 +760,49 @@ static void process_e820_entries(unsigned long minimum,
}
}
+/*
+ * If KHO is active, only process its scratch areas to ensure we are not
+ * stepping onto preserved memory.
+ */
+static bool process_kho_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct kho_scratch *kho_scratch;
+ struct setup_data *ptr;
+ struct kho_data *kho;
+ int i, nr_areas = 0;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return false;
+
+ ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
+ while (ptr) {
+ if (ptr->type == SETUP_KEXEC_KHO) {
+ kho = (struct kho_data *)(unsigned long)ptr->data;
+ kho_scratch = (void *)(unsigned long)kho->scratch_addr;
+ nr_areas = kho->scratch_size / sizeof(*kho_scratch);
+ break;
+ }
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
+ if (!nr_areas)
+ return false;
+
+ for (i = 0; i < nr_areas; i++) {
+ struct kho_scratch *area = &kho_scratch[i];
+ struct mem_vector region = {
+ .start = area->addr,
+ .size = area->size,
+ };
+
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+
+ return true;
+}
+
static unsigned long find_random_phys_addr(unsigned long minimum,
unsigned long image_size)
{
@@ -775,7 +818,12 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
return 0;
}
- if (!process_efi_entries(minimum, image_size))
+ /*
+ * During kexec handover only process KHO scratch areas that are known
+ * not to contain any data that must be preserved.
+ */
+ if (!process_kho_entries(minimum, image_size) &&
+ !process_efi_entries(minimum, image_size))
process_e820_entries(minimum, image_size);
phys_addr = slots_fetch_random();
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index e30649e44d8f..e1f4fd5bc8ee 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -43,7 +43,7 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
.section ".bstext", "ax"
#ifdef CONFIG_EFI_STUB
# "MZ", MS-DOS header
- .word MZ_MAGIC
+ .word IMAGE_DOS_SIGNATURE
.org 0x38
#
# Offset to the PE header.
@@ -51,16 +51,16 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
.long LINUX_PE_MAGIC
.long pe_header
pe_header:
- .long PE_MAGIC
+ .long IMAGE_NT_SIGNATURE
coff_header:
#ifdef CONFIG_X86_32
.set image_file_add_flags, IMAGE_FILE_32BIT_MACHINE
- .set pe_opt_magic, PE_OPT_MAGIC_PE32
+ .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR32_MAGIC
.word IMAGE_FILE_MACHINE_I386
#else
.set image_file_add_flags, 0
- .set pe_opt_magic, PE_OPT_MAGIC_PE32PLUS
+ .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR64_MAGIC
.word IMAGE_FILE_MACHINE_AMD64
#endif
.word section_count # nr_sections
@@ -111,7 +111,7 @@ extra_header_fields:
.long salign # SizeOfHeaders
.long 0 # CheckSum
.word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application)
- .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics
+ .word IMAGE_DLLCHARACTERISTICS_NX_COMPAT # DllCharacteristics
#ifdef CONFIG_X86_32
.long 0 # SizeOfStackReserve
.long 0 # SizeOfStackCommit
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index edab6d6049be..7b2833705d47 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -36,6 +36,7 @@
/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a) ((a) >> 32)
#define TDCALL_INVALID_OPERAND 0xc0000100
+#define TDCALL_OPERAND_BUSY 0x80000200
#define TDREPORT_SUBTYPE_0 0
@@ -109,12 +110,13 @@ static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
* REPORTDATA to be included into TDREPORT.
* @tdreport: Address of the output buffer to store TDREPORT.
*
- * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
- * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
+ * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.REPORT TDCALL.
+ *
* It is used in the TDX guest driver module to get the TDREPORT0.
*
- * Return 0 on success, -EINVAL for invalid operands, or -EIO on
- * other TDCALL failures.
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
*/
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
{
@@ -128,7 +130,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
ret = __tdcall(TDG_MR_REPORT, &args);
if (ret) {
if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
- return -EINVAL;
+ return -ENXIO;
+ else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+ return -EBUSY;
return -EIO;
}
@@ -137,6 +141,42 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
/**
+ * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using
+ * TDG.MR.RTMR.EXTEND TDCALL.
+ * @index: Index of RTMR register to be extended.
+ * @data: Address of the input buffer with RTMR register extend data.
+ *
+ * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0
+ * specification for more information on TDG.MR.RTMR.EXTEND TDCALL.
+ *
+ * It is used in the TDX guest driver module to allow the user to extend the
+ * RTMR registers.
+ *
+ * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation,
+ * or -EIO on other TDCALL failures.
+ */
+int tdx_mcall_extend_rtmr(u8 index, u8 *data)
+{
+ struct tdx_module_args args = {
+ .rcx = virt_to_phys(data),
+ .rdx = index,
+ };
+ u64 ret;
+
+ ret = __tdcall(TDG_MR_RTMR_EXTEND, &args);
+ if (ret) {
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
+ return -ENXIO;
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY)
+ return -EBUSY;
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr);
+
+/**
* tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
* hypercall.
* @buf: Address of the directly mapped shared kernel buffer which
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5ddba366d3b4..774430c3abff 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -777,6 +777,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot)
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ /* This bit combination is used to mark shadow stacks */
+ WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) ==
+ _PAGE_DIRTY);
pfn ^= protnone_mask(pgprot_val(pgprot));
pfn &= PTE_PFN_MASK;
return __pte(pfn | check_pgprot(pgprot));
@@ -1073,22 +1076,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
*/
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * (Currently stuck as a macro because of indirect forward reference
- * to linux/mm.h:page_to_nid())
- */
-#define mk_pte(page, pgprot) \
-({ \
- pgprot_t __pgprot = pgprot; \
- \
- WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
- _PAGE_DIRTY); \
- pfn_pte(page_to_pfn(page), __pgprot); \
-})
-
static inline int pmd_bad(pmd_t pmd)
{
return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
@@ -1353,8 +1340,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
-#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
-
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 8d9f1c9aaa4c..61f56cdaccb5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -4,6 +4,7 @@
#include <asm/page.h>
#include <asm-generic/set_memory.h>
+#include <asm/pgtable.h>
#define set_memory_rox set_memory_rox
int set_memory_rox(unsigned long addr, int numpages);
@@ -37,6 +38,7 @@ int set_memory_rox(unsigned long addr, int numpages);
* The caller is required to take care of these.
*/
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
int _set_memory_uc(unsigned long addr, int numpages);
int _set_memory_wc(unsigned long addr, int numpages);
int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 6324f4c6c545..692af46603a1 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -68,6 +68,8 @@ extern void x86_ce4100_early_setup(void);
static inline void x86_ce4100_early_setup(void) { }
#endif
+#include <linux/kexec_handover.h>
+
#ifndef _SETUP
#include <asm/espfix.h>
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index fd9209e996e7..2f3820342598 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -13,6 +13,7 @@
/* TDX module Call Leaf IDs */
#define TDG_VP_VMCALL 0
#define TDG_VP_INFO 1
+#define TDG_MR_RTMR_EXTEND 2
#define TDG_VP_VEINFO_GET 3
#define TDG_MR_REPORT 4
#define TDG_MEM_PAGE_ACCEPT 6
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 7c488ff0c764..c10dbb74cd00 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -38,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
return regs->orig_ax;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->orig_ax = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -90,6 +97,18 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[5] = regs->bp;
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ regs->bx = args[0];
+ regs->cx = args[1];
+ regs->dx = args[2];
+ regs->si = args[3];
+ regs->di = args[4];
+ regs->bp = args[5];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_I386;
@@ -121,6 +140,30 @@ static inline void syscall_get_arguments(struct task_struct *task,
}
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task->thread_info.status & TS_COMPAT) {
+ regs->bx = *args++;
+ regs->cx = *args++;
+ regs->dx = *args++;
+ regs->si = *args++;
+ regs->di = *args++;
+ regs->bp = *args;
+ } else
+# endif
+ {
+ regs->di = *args++;
+ regs->si = *args++;
+ regs->dx = *args++;
+ regs->r10 = *args++;
+ regs->r8 = *args++;
+ regs->r9 = *args;
+ }
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
/* x32 tasks should be considered AUDIT_ARCH_X86_64. */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 26ffc792e673..8b19294600c4 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -68,6 +68,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs);
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
+int tdx_mcall_extend_rtmr(u8 index, u8 *data);
+
u64 tdx_hcall_get_quote(u8 *buf, size_t size);
void __init tdx_dump_attributes(u64 td_attr);
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
index 50c45ead4e7c..2671c4e1b3a0 100644
--- a/arch/x86/include/uapi/asm/setup_data.h
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -13,7 +13,8 @@
#define SETUP_CC_BLOB 7
#define SETUP_IMA 8
#define SETUP_RNG_SEED 9
-#define SETUP_ENUM_MAX SETUP_RNG_SEED
+#define SETUP_KEXEC_KHO 10
+#define SETUP_ENUM_MAX SETUP_KEXEC_KHO
#define SETUP_INDIRECT (1<<31)
#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
@@ -78,6 +79,16 @@ struct ima_setup_data {
__u64 size;
} __attribute__((packed));
+/*
+ * Locations of kexec handover metadata
+ */
+struct kho_data {
+ __u64 fdt_addr;
+ __u64 fdt_size;
+ __u64 scratch_addr;
+ __u64 scratch_size;
+} __attribute__((packed));
+
#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
index 4eddb4d571ef..30f39f92c98f 100644
--- a/arch/x86/kernel/cpu/sgx/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -2,7 +2,6 @@
#ifndef __ARCH_SGX_DRIVER_H__
#define __ARCH_SGX_DRIVER_H__
-#include <crypto/hash.h>
#include <linux/kref.h>
#include <linux/mmu_notifier.h>
#include <linux/radix-tree.h>
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 776a20172867..66f1efa16fbb 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -3,6 +3,7 @@
#include <asm/mman.h>
#include <asm/sgx.h>
+#include <crypto/sha2.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -463,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
return ret;
}
-static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
- void *hash)
-{
- SHASH_DESC_ON_STACK(shash, tfm);
-
- shash->tfm = tfm;
-
- return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
-}
-
-static int sgx_get_key_hash(const void *modulus, void *hash)
-{
- struct crypto_shash *tfm;
- int ret;
-
- tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- ret = __sgx_get_key_hash(tfm, modulus, hash);
-
- crypto_free_shash(tfm);
- return ret;
-}
-
static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
@@ -523,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
sgx_xfrm_reserved_mask)
return -EINVAL;
- ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
- if (ret)
- return ret;
+ sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner);
mutex_lock(&encl->lock);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 6722b2fc82cf..2de01b379aa3 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -720,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags)
goto out;
}
+ sgx_unmark_page_reclaimable(page);
+
/*
* TBD: Add additional plumbing to enable pre-emptive
* action for asynchronous poison notification. Until
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 0be61c45400c..bcb534688dfe 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -278,6 +278,7 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
unsigned long long mend)
{
unsigned long start, end;
+ int ret;
cmem->ranges[0].start = mstart;
cmem->ranges[0].end = mend;
@@ -286,22 +287,43 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
/* Exclude elf header region */
start = image->elf_load_addr;
end = start + image->elf_headers_sz - 1;
- return crash_exclude_mem_range(cmem, start, end);
+ ret = crash_exclude_mem_range(cmem, start, end);
+
+ if (ret)
+ return ret;
+
+ /* Exclude dm crypt keys region */
+ if (image->dm_crypt_keys_addr) {
+ start = image->dm_crypt_keys_addr;
+ end = start + image->dm_crypt_keys_sz - 1;
+ return crash_exclude_mem_range(cmem, start, end);
+ }
+
+ return ret;
}
/* Prepare memory map for crash dump kernel */
int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
{
+ unsigned int nr_ranges = 0;
int i, ret = 0;
unsigned long flags;
struct e820_entry ei;
struct crash_memmap_data cmd;
struct crash_mem *cmem;
- cmem = vzalloc(struct_size(cmem, ranges, 1));
+ /*
+ * Using random kexec_buf for passing dm crypt keys may cause a range
+ * split. So use two slots here.
+ */
+ nr_ranges = 2;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return -ENOMEM;
+ cmem->max_nr_ranges = nr_ranges;
+ cmem->nr_ranges = 0;
+
memset(&cmd, 0, sizeof(struct crash_memmap_data));
cmd.params = params;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9920122018a0..c3acbd26408b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1300,6 +1300,24 @@ void __init e820__memblock_setup(void)
}
/*
+ * At this point memblock is only allowed to allocate from memory
+ * below 1M (aka ISA_END_ADDRESS) up until direct map is completely set
+ * up in init_mem_mapping().
+ *
+ * KHO kernels are special and use only scratch memory for memblock
+ * allocations, but memory below 1M is ignored by kernel after early
+ * boot and cannot be naturally marked as scratch.
+ *
+ * To allow allocation of the real-mode trampoline and a few (if any)
+ * other very early allocations from below 1M forcibly mark the memory
+ * below 1M as scratch.
+ *
+ * After real mode trampoline is allocated, we clear that scratch
+ * marking.
+ */
+ memblock_mark_kho_scratch(0, SZ_1M);
+
+ /*
* 32-bit systems are limited to 4BG of memory even with HIGHMEM and
* to even less without it.
* Discard memory after max_pfn - the actual limit detected at runtime.
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 68530fad05f7..24a41f0e0cf1 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -27,6 +27,8 @@
#include <asm/kexec-bzimage64.h>
#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+#define MAX_DMCRYPTKEYS_STR_LEN 31 /* dmcryptkeys=0x<64bit-value> */
+
/*
* Defines lowest physical address for various segments. Not sure where
@@ -76,6 +78,10 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
if (image->type == KEXEC_TYPE_CRASH) {
len = sprintf(cmdline_ptr,
"elfcorehdr=0x%lx ", image->elf_load_addr);
+
+ if (image->dm_crypt_keys_addr != 0)
+ len += sprintf(cmdline_ptr + len,
+ "dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr);
}
memcpy(cmdline_ptr + len, cmdline, cmdline_len);
cmdline_len += len;
@@ -233,6 +239,32 @@ setup_ima_state(const struct kimage *image, struct boot_params *params,
#endif /* CONFIG_IMA_KEXEC */
}
+static void setup_kho(const struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + setup_data_offset;
+ struct kho_data *kho = (void *)sd + sizeof(*sd);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return;
+
+ sd->type = SETUP_KEXEC_KHO;
+ sd->len = sizeof(struct kho_data);
+
+ /* Only add if we have all KHO images in place */
+ if (!image->kho.fdt || !image->kho.scratch)
+ return;
+
+ /* Add setup data */
+ kho->fdt_addr = image->kho.fdt;
+ kho->fdt_size = PAGE_SIZE;
+ kho->scratch_addr = image->kho.scratch->mem;
+ kho->scratch_size = image->kho.scratch->bufsz;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = params_load_addr + setup_data_offset;
+}
+
static int
setup_boot_parameters(struct kimage *image, struct boot_params *params,
unsigned long params_load_addr,
@@ -312,6 +344,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
sizeof(struct ima_setup_data);
}
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ /* Setup space to store preservation metadata */
+ setup_kho(image, params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+ }
+
/* Setup RNG seed */
setup_rng_seed(params, params_load_addr, setup_data_offset);
@@ -441,6 +480,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
ret = crash_load_segments(image);
if (ret)
return ERR_PTR(ret);
+ ret = crash_load_dm_crypt_keys(image);
+ if (ret == -ENOENT) {
+ kexec_dprintk("No dm crypt key to load\n");
+ } else if (ret) {
+ pr_err("Failed to load dm crypt keys\n");
+ return ERR_PTR(ret);
+ }
+ if (image->dm_crypt_keys_addr &&
+ cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN >
+ header->cmdline_size) {
+ pr_err("Appending dmcryptkeys=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
}
#endif
@@ -468,6 +520,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
efi_map_sz = efi_get_runtime_map_size();
params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
MAX_ELFCOREHDR_STR_LEN;
+ if (image->dm_crypt_keys_addr)
+ params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN;
params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
sizeof(struct setup_data) +
@@ -479,6 +533,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz += sizeof(struct setup_data) +
sizeof(struct ima_setup_data);
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ kbuf.bufsz += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+
params = kzalloc(kbuf.bufsz, GFP_KERNEL);
if (!params)
return ERR_PTR(-ENOMEM);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 949c9e4bfad2..697fb99406e6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -630,13 +630,35 @@ static void kexec_mark_crashkres(bool protect)
kexec_mark_range(control, crashk_res.end, protect);
}
+/* Make the memory storing dm crypt keys inaccessible (protect) or accessible again (unprotect) */
+static void kexec_mark_dm_crypt_keys(bool protect)
+{
+ unsigned long start_paddr, end_paddr;
+ unsigned int nr_pages;
+
+ if (kexec_crash_image->dm_crypt_keys_addr) {
+ start_paddr = kexec_crash_image->dm_crypt_keys_addr;
+ end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1;
+ nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE;
+ if (protect)
+ set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages);
+ else
+ __set_memory_prot(
+ (unsigned long)phys_to_virt(start_paddr),
+ nr_pages,
+ __pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW));
+ }
+}
+
void arch_kexec_protect_crashkres(void)
{
kexec_mark_crashkres(true);
+ kexec_mark_dm_crypt_keys(true);
}
void arch_kexec_unprotect_crashkres(void)
{
+ kexec_mark_dm_crypt_keys(false);
kexec_mark_crashkres(false);
}
#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 7d9ed79a93c0..fb27be697128 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -282,8 +282,8 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_reserve(__pa_symbol(_brk_start),
- _brk_end - _brk_start);
+ memblock_reserve_kern(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -356,7 +356,7 @@ static void __init early_reserve_initrd(void)
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+ memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image);
}
static void __init reserve_initrd(void)
@@ -409,7 +409,7 @@ static void __init add_early_ima_buffer(u64 phys_addr)
}
if (data->size) {
- memblock_reserve(data->addr, data->size);
+ memblock_reserve_kern(data->addr, data->size);
ima_kexec_buffer_phys = data->addr;
ima_kexec_buffer_size = data->size;
}
@@ -447,6 +447,29 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size)
}
#endif
+static void __init add_kho(u64 phys_addr, u32 data_len)
+{
+ struct kho_data *kho;
+ u64 addr = phys_addr + sizeof(struct setup_data);
+ u64 size = data_len - sizeof(struct setup_data);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n");
+ return;
+ }
+
+ kho = early_memremap(addr, size);
+ if (!kho) {
+ pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n",
+ addr, size);
+ return;
+ }
+
+ kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size);
+
+ early_memunmap(kho, size);
+}
+
static void __init parse_setup_data(void)
{
struct setup_data *data;
@@ -475,6 +498,9 @@ static void __init parse_setup_data(void)
case SETUP_IMA:
add_early_ima_buffer(pa_data);
break;
+ case SETUP_KEXEC_KHO:
+ add_kho(pa_data, data_len);
+ break;
case SETUP_RNG_SEED:
data = early_memremap(pa_data, data_len);
add_bootloader_randomness(data->data, data->len);
@@ -549,7 +575,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
len = sizeof(*data);
pa_next = data->next;
- memblock_reserve(pa_data, sizeof(*data) + data->len);
+ memblock_reserve_kern(pa_data, sizeof(*data) + data->len);
if (data->type == SETUP_INDIRECT) {
len += data->len;
@@ -563,7 +589,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
indirect = (struct setup_indirect *)data->data;
if (indirect->type != SETUP_INDIRECT)
- memblock_reserve(indirect->addr, indirect->len);
+ memblock_reserve_kern(indirect->addr, indirect->len);
}
pa_data = pa_next;
@@ -766,8 +792,8 @@ static void __init early_reserve_memory(void)
* __end_of_kernel_reserve symbol must be explicitly reserved with a
* separate memblock_reserve() or they will be discarded.
*/
- memblock_reserve(__pa_symbol(_text),
- (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+ memblock_reserve_kern(__pa_symbol(_text),
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
/*
* The first 4Kb of memory is a BIOS owned area, but generally it is
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b90d872aa0c8..1ba92ac9441d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1244,10 +1244,6 @@ void play_dead_common(void)
local_irq_disable();
}
-/*
- * We need to flush the caches before going to sleep, lest we have
- * dirty data in our caches when we come back up.
- */
void __noreturn mwait_play_dead(unsigned int eax_hint)
{
struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
@@ -1294,6 +1290,50 @@ void __noreturn mwait_play_dead(unsigned int eax_hint)
}
/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead_cpuid_hint(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int highest_cstate = 0;
+ unsigned int highest_subcstate = 0;
+ int i;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ return;
+ if (!this_cpu_has(X86_FEATURE_MWAIT))
+ return;
+ if (!this_cpu_has(X86_FEATURE_CLFLUSH))
+ return;
+
+ eax = CPUID_LEAF_MWAIT;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ /*
+ * eax will be 0 if EDX enumeration is not valid.
+ * Initialized below to cstate, sub_cstate value when EDX is valid.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+ eax = 0;
+ } else {
+ edx >>= MWAIT_SUBSTATE_SIZE;
+ for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+ if (edx & MWAIT_SUBSTATE_MASK) {
+ highest_cstate = i;
+ highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+ }
+ }
+ eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+ (highest_subcstate - 1);
+ }
+
+ mwait_play_dead(eax);
+}
+
+/*
* Kick all "offline" CPUs out of mwait on kexec(). See comment in
* mwait_play_dead().
*/
@@ -1343,9 +1383,9 @@ void native_play_dead(void)
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- /* Below returns only on error. */
- cpuidle_play_dead();
- hlt_play_dead();
+ mwait_play_dead_cpuid_hint();
+ if (cpuidle_play_dead())
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 89079ea73e65..a4700ef6eb64 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -266,6 +266,32 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
st->prot_levels[level] = effective;
}
+static void effective_prot_pte(struct ptdump_state *st, pte_t pte)
+{
+ effective_prot(st, 4, pte_val(pte));
+}
+
+static void effective_prot_pmd(struct ptdump_state *st, pmd_t pmd)
+{
+ effective_prot(st, 3, pmd_val(pmd));
+}
+
+static void effective_prot_pud(struct ptdump_state *st, pud_t pud)
+{
+ effective_prot(st, 2, pud_val(pud));
+}
+
+static void effective_prot_p4d(struct ptdump_state *st, p4d_t p4d)
+{
+ effective_prot(st, 1, p4d_val(p4d));
+}
+
+static void effective_prot_pgd(struct ptdump_state *st, pgd_t pgd)
+{
+ effective_prot(st, 0, pgd_val(pgd));
+}
+
+
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
@@ -362,6 +388,38 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
}
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
bool ptdump_walk_pgd_level_core(struct seq_file *m,
struct mm_struct *mm, pgd_t *pgd,
bool checkwx, bool dmesg)
@@ -378,8 +436,17 @@ bool ptdump_walk_pgd_level_core(struct seq_file *m,
struct pg_state st = {
.ptdump = {
- .note_page = note_page,
- .effective_prot = effective_prot,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
+ .effective_prot_pte = effective_prot_pte,
+ .effective_prot_pmd = effective_prot_pmd,
+ .effective_prot_pud = effective_prot_pud,
+ .effective_prot_p4d = effective_prot_p4d,
+ .effective_prot_pgd = effective_prot_pgd,
.range = ptdump_ranges
},
.level = -1,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 66330fe4e18c..ee66fae9ebcc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1467,16 +1467,21 @@ static unsigned long probe_memory_block_size(void)
}
/*
- * Use max block size to minimize overhead on bare metal, where
- * alignment for memory hotplug isn't a concern.
+ * When hotplug alignment is not a concern, maximize the block size
+ * to minimize overhead. Otherwise, align to the lesser of the advised
+ * alignment and the end-of-memory alignment.
*/
- if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ bz = memory_block_advised_max_size();
+ if (!bz) {
bz = MAX_BLOCK_SIZE;
- goto done;
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ goto done;
+ } else {
+ bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE);
}
/* Find the largest allowed block size that aligns to memory end */
- for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
+ for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
if (IS_ALIGNED(boot_mem_end, bz))
break;
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 331e101bf801..12c8180ca1ba 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -71,7 +71,7 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
static unsigned int __ioremap_check_ram(struct resource *res)
{
unsigned long start_pfn, stop_pfn;
- unsigned long i;
+ unsigned long pfn;
if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
return 0;
@@ -79,9 +79,8 @@ static unsigned int __ioremap_check_ram(struct resource *res)
start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
stop_pfn = (res->end + 1) >> PAGE_SHIFT;
if (stop_pfn > start_pfn) {
- for (i = 0; i < (stop_pfn - start_pfn); ++i)
- if (pfn_valid(start_pfn + i) &&
- !PageReserved(pfn_to_page(start_pfn + i)))
+ for_each_valid_pfn(pfn, start_pfn, stop_pfn)
+ if (!PageReserved(pfn_to_page(pfn)))
return IORES_MAP_SYSTEM_RAM;
}
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index c97b527c66fe..2e7923844afe 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -775,6 +775,12 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
return vma_prot;
}
+static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
+{
+ *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) | /* clear the bits under _PAGE_CACHE_MASK, keep all others ... */
+ cachemode2protval(pcm)); /* ... then OR in the protval bits for pcm; *prot is updated in place */
+}
+
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot)
{
@@ -789,8 +795,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
if (file->f_flags & O_DSYNC)
pcm = _PAGE_CACHE_MODE_UC_MINUS;
- *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
return 1;
}
@@ -831,8 +836,7 @@ int memtype_kernel_map_sync(u64 base, unsigned long size,
* Reserved non RAM regions only and after successful memtype_reserve,
* this func also keeps identity mapping (if any) in sync with this new prot.
*/
-static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
- int strict_prot)
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot)
{
int is_ram = 0;
int ret;
@@ -858,9 +862,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
(unsigned long long)paddr,
(unsigned long long)(paddr + size - 1),
cattr_name(pcm));
- *vma_prot = __pgprot((pgprot_val(*vma_prot) &
- (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
}
return 0;
}
@@ -870,8 +872,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
return ret;
if (pcm != want_pcm) {
- if (strict_prot ||
- !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
+ if (!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
memtype_free(paddr, paddr + size);
pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
@@ -881,13 +882,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
cattr_name(pcm));
return -EINVAL;
}
- /*
- * We allow returning different type than the one requested in
- * non strict case.
- */
- *vma_prot = __pgprot((pgprot_val(*vma_prot) &
- (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
}
if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
@@ -910,124 +905,14 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
-static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
- resource_size_t *phys)
-{
- struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
-
- if (follow_pfnmap_start(&args))
- return -EINVAL;
-
- /* Never return PFNs of anon folios in COW mappings. */
- if (!args.special) {
- follow_pfnmap_end(&args);
- return -EINVAL;
- }
-
- *prot = pgprot_val(args.pgprot);
- *phys = (resource_size_t)args.pfn << PAGE_SHIFT;
- follow_pfnmap_end(&args);
- return 0;
-}
-
-static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
- pgprot_t *pgprot)
-{
- unsigned long prot;
-
- VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
-
- /*
- * We need the starting PFN and cachemode used for track_pfn_remap()
- * that covered the whole VMA. For most mappings, we can obtain that
- * information from the page tables. For COW mappings, we might now
- * suddenly have anon folios mapped and follow_phys() will fail.
- *
- * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
- * detect the PFN. If we need the cachemode as well, we're out of luck
- * for now and have to fail fork().
- */
- if (!follow_phys(vma, &prot, paddr)) {
- if (pgprot)
- *pgprot = __pgprot(prot);
- return 0;
- }
- if (is_cow_mapping(vma->vm_flags)) {
- if (pgprot)
- return -EINVAL;
- *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
- return 0;
- }
- WARN_ON_ONCE(1);
- return -EINVAL;
-}
-
-int track_pfn_copy(struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma, unsigned long *pfn)
-{
- const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
- resource_size_t paddr;
- pgprot_t pgprot;
- int rc;
-
- if (!(src_vma->vm_flags & VM_PAT))
- return 0;
-
- /*
- * Duplicate the PAT information for the dst VMA based on the src
- * VMA.
- */
- if (get_pat_info(src_vma, &paddr, &pgprot))
- return -EINVAL;
- rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
- if (rc)
- return rc;
-
- /* Reservation for the destination VMA succeeded. */
- vm_flags_set(dst_vma, VM_PAT);
- *pfn = PHYS_PFN(paddr);
- return 0;
-}
-
-void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
-{
- untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
- /*
- * Reservation was freed, any copied page tables will get cleaned
- * up later, but without getting PAT involved again.
- */
-}
-
-/*
- * prot is passed in as a parameter for the new mapping. If the vma has
- * a linear pfn mapping for the entire range, or no vma is provided,
- * reserve the entire pfn + size range with single reserve_pfn_range
- * call.
- */
-int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn, unsigned long addr, unsigned long size)
+int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot)
{
resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
enum page_cache_mode pcm;
- /* reserve the whole chunk starting from paddr */
- if (!vma || (addr == vma->vm_start
- && size == (vma->vm_end - vma->vm_start))) {
- int ret;
-
- ret = reserve_pfn_range(paddr, size, prot, 0);
- if (ret == 0 && vma)
- vm_flags_set(vma, VM_PAT);
- return ret;
- }
-
if (!pat_enabled())
return 0;
- /*
- * For anything smaller than the vma size we set prot based on the
- * lookup.
- */
pcm = lookup_memtype(paddr);
/* Check memtype for the remaining pages */
@@ -1038,70 +923,35 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
return -EINVAL;
}
- *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
-
+ pgprot_set_cachemode(prot, pcm);
return 0;
}
-void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
+int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot)
{
- enum page_cache_mode pcm;
+ const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; /* PFN -> physical address */
- if (!pat_enabled())
- return;
-
- /* Set prot based on lookup */
- pcm = lookup_memtype(pfn_t_to_phys(pfn));
- *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ return reserve_pfn_range(paddr, size, prot); /* result of reserve_pfn_range() is passed through unchanged; *prot may be rewritten by it */
}
-/*
- * untrack_pfn is called while unmapping a pfnmap for a region.
- * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case pfn, size are zero).
- */
-void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size, bool mm_wr_locked)
+void pfnmap_untrack(unsigned long pfn, unsigned long size)
{
- resource_size_t paddr;
-
- if (vma && !(vma->vm_flags & VM_PAT))
- return;
+ const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; /* PFN -> physical address */
- /* free the chunk starting from pfn or the whole chunk */
- paddr = (resource_size_t)pfn << PAGE_SHIFT;
- if (!paddr && !size) {
- if (get_pat_info(vma, &paddr, NULL))
- return;
- size = vma->vm_end - vma->vm_start;
- }
free_pfn_range(paddr, size); /* the only remaining step: no VMA lookup or VM_PAT flag handling is done here any more */
- if (vma) {
- if (mm_wr_locked)
- vm_flags_clear(vma, VM_PAT);
- else
- __vm_flags_mod(vma, 0, VM_PAT);
- }
-}
-
-void untrack_pfn_clear(struct vm_area_struct *vma)
-{
- vm_flags_clear(vma, VM_PAT);
}
pgprot_t pgprot_writecombine(pgprot_t prot)
{
- return __pgprot(pgprot_val(prot) |
- cachemode2protval(_PAGE_CACHE_MODE_WC));
+ pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WC); /* replaces the cache-mode bits of the local copy; the removed code only OR-ed the WC bits in */
+ return prot;
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
pgprot_t pgprot_writethrough(pgprot_t prot)
{
- return __pgprot(pgprot_val(prot) |
- cachemode2protval(_PAGE_CACHE_MODE_WT));
+ pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WT); /* replaces the cache-mode bits of the local copy; the removed code only OR-ed the WT bits in */
+ return prot;
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);
diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c
index 645613d59942..e5844ed1311e 100644
--- a/arch/x86/mm/pat/memtype_interval.c
+++ b/arch/x86/mm/pat/memtype_interval.c
@@ -49,32 +49,6 @@ INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
-enum {
- MEMTYPE_EXACT_MATCH = 0,
- MEMTYPE_END_MATCH = 1
-};
-
-static struct memtype *memtype_match(u64 start, u64 end, int match_type)
-{
- struct memtype *entry_match;
-
- entry_match = interval_iter_first(&memtype_rbroot, start, end-1);
-
- while (entry_match != NULL && entry_match->start < end) {
- if ((match_type == MEMTYPE_EXACT_MATCH) &&
- (entry_match->start == start) && (entry_match->end == end))
- return entry_match;
-
- if ((match_type == MEMTYPE_END_MATCH) &&
- (entry_match->start < start) && (entry_match->end == end))
- return entry_match;
-
- entry_match = interval_iter_next(entry_match, start, end-1);
- }
-
- return NULL; /* Returns NULL if there is no match */
-}
-
static int memtype_check_conflict(u64 start, u64 end,
enum page_cache_mode reqtype,
enum page_cache_mode *newtype)
@@ -130,35 +104,16 @@ int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_ty
struct memtype *memtype_erase(u64 start, u64 end)
{
- struct memtype *entry_old;
-
- /*
- * Since the memtype_rbroot tree allows overlapping ranges,
- * memtype_erase() checks with EXACT_MATCH first, i.e. free
- * a whole node for the munmap case. If no such entry is found,
- * it then checks with END_MATCH, i.e. shrink the size of a node
- * from the end for the mremap case.
- */
- entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
- if (!entry_old) {
- entry_old = memtype_match(start, end, MEMTYPE_END_MATCH);
- if (!entry_old)
- return ERR_PTR(-EINVAL);
+ struct memtype *entry = interval_iter_first(&memtype_rbroot, start, end - 1); /* first tree entry overlapping [start, end - 1] */
+
+ while (entry && entry->start < end) {
+ if (entry->start == start && entry->end == end) { /* only an exact [start, end) match is erased; the old end-match shrink path is gone */
+ interval_remove(entry, &memtype_rbroot);
+ return entry; /* removed node is handed back to the caller */
+ }
+ entry = interval_iter_next(entry, start, end - 1);
}
-
- if (entry_old->start == start) {
- /* munmap: erase this node */
- interval_remove(entry_old, &memtype_rbroot);
- } else {
- /* mremap: update the end value of this node */
- interval_remove(entry_old, &memtype_rbroot);
- entry_old->end = start;
- interval_insert(entry_old, &memtype_rbroot);
-
- return NULL;
- }
-
- return entry_old;
+ return ERR_PTR(-EINVAL); /* no entry with exactly this start and end */
}
struct memtype *memtype_lookup(u64 addr)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 30ab4aced761..46edc11726b7 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2148,6 +2148,19 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
CPA_PAGES_ARRAY, pages);
}
+/*
+ * __set_memory_prot is an internal helper for callers that have been passed
+ * a pgprot_t value from upper layers and have already taken a reservation.
+ * If you want to set the pgprot to a specific page protection, use the
+ * set_memory_xx() functions.
+ */
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+ return change_page_attr_set_clr(&addr, numpages, prot, /* NOTE(review): presumably prot is the mask of bits to set ... */
+ __pgprot(~pgprot_val(prot)), 0, 0, /* ... and ~prot the mask of bits to clear - confirm against change_page_attr_set_clr() */
+ NULL);
+}
+
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 62777ba4de1a..ddf248c3ee7d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -189,7 +189,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
if (!ptdesc)
failed = true;
- if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
+ if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
ptdesc = NULL;
failed = true;
@@ -751,14 +751,13 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
for (i = 0; i < PTRS_PER_PMD; i++) {
if (!pmd_none(pmd_sv[i])) {
pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
- free_page((unsigned long)pte);
+ pte_free_kernel(&init_mm, pte);
}
}
free_page((unsigned long)pmd_sv);
- pagetable_dtor(virt_to_ptdesc(pmd));
- free_page((unsigned long)pmd);
+ pmd_free(&init_mm, pmd);
return 1;
}
@@ -781,7 +780,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
/* INVLPG to clear all paging-structure caches */
flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
- free_page((unsigned long)pte);
+ pte_free_kernel(&init_mm, pte);
return 1;
}
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index ed5c63c0b4e5..88be32026768 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -66,6 +66,8 @@ void __init reserve_real_mode(void)
* setup_arch().
*/
memblock_reserve(0, SZ_1M);
+
+ memblock_clear_kho_scratch(0, SZ_1M);
}
static void __init sme_sev_setup_real_mode(struct trampoline_header *th)