summaryrefslogtreecommitdiff
path: root/arch/x86/boot
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/boot')
-rw-r--r--arch/x86/boot/bioscall.S4
-rw-r--r--arch/x86/boot/boot.h6
-rw-r--r--arch/x86/boot/compressed/Makefile10
-rw-r--r--arch/x86/boot/compressed/head_64.S1
-rw-r--r--arch/x86/boot/compressed/misc.c1
-rw-r--r--arch/x86/boot/compressed/misc.h8
-rw-r--r--arch/x86/boot/compressed/pgtable.h18
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c12
-rw-r--r--arch/x86/boot/compressed/sev-handle-vc.c134
-rw-r--r--arch/x86/boot/compressed/sev.c156
-rw-r--r--arch/x86/boot/compressed/sev.h21
-rw-r--r--arch/x86/boot/compressed/string.c8
-rw-r--r--arch/x86/boot/copy.S8
-rw-r--r--arch/x86/boot/header.S6
-rw-r--r--arch/x86/boot/startup/Makefile30
-rw-r--r--arch/x86/boot/startup/efi-mixed.S253
-rw-r--r--arch/x86/boot/startup/gdt_idt.c71
-rw-r--r--arch/x86/boot/startup/la57toggle.S (renamed from arch/x86/boot/compressed/la57toggle.S)1
-rw-r--r--arch/x86/boot/startup/map_kernel.c217
-rw-r--r--arch/x86/boot/startup/sev-shared.c887
-rw-r--r--arch/x86/boot/startup/sev-startup.c368
-rw-r--r--arch/x86/boot/startup/sme.c575
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/boot/video.c2
24 files changed, 2602 insertions, 197 deletions
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index aa9b96457584..cf4a6155714e 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -32,7 +32,7 @@ intcall:
movw %dx, %si
movw %sp, %di
movw $11, %cx
- rep; movsl
+ rep movsl
/* Pop full state from the stack */
popal
@@ -67,7 +67,7 @@ intcall:
jz 4f
movw %sp, %si
movw $11, %cx
- rep; movsl
+ rep movsl
4: addw $44, %sp
/* Restore state and return */
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 38f17a1e1e36..60580836daf7 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -34,7 +34,7 @@
extern struct setup_header hdr;
extern struct boot_params boot_params;
-#define cpu_relax() asm volatile("rep; nop")
+#define cpu_relax() asm volatile("pause")
static inline void io_delay(void)
{
@@ -155,14 +155,14 @@ static inline void wrgs32(u32 v, addr_t addr)
static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len)
{
bool diff;
- asm volatile("fs; repe; cmpsb" CC_SET(nz)
+ asm volatile("fs repe cmpsb" CC_SET(nz)
: CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len)
{
bool diff;
- asm volatile("gs; repe; cmpsb" CC_SET(nz)
+ asm volatile("gs repe cmpsb" CC_SET(nz)
: CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fdbce022db55..f4f7b22d8113 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -44,10 +44,10 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
-# sev.c indirectly includes inat-table.h which is generated during
+# sev-decode-insn.c indirectly includes inat-table.c which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
# that the compiler finds it even with out-of-tree builds (make O=/some/path).
-CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/
+CFLAGS_sev-handle-vc.o += -I$(objtree)/arch/x86/lib/
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
@@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
quiet_cmd_voffset = VOFFSET $@
cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
@@ -96,8 +96,7 @@ ifdef CONFIG_X86_64
vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
- vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
- vmlinux-objs-y += $(obj)/la57toggle.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o $(obj)/sev-handle-vc.o
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
@@ -106,6 +105,7 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a
$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index eafd4f185e77..d9dab940ff62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -35,7 +35,6 @@
#include <asm/bootparam.h>
#include <asm/desc_defs.h>
#include <asm/trapnr.h>
-#include "pgtable.h"
/*
* Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 1cdcd4aaf395..94b5991da001 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -14,7 +14,6 @@
#include "misc.h"
#include "error.h"
-#include "pgtable.h"
#include "../string.h"
#include "../voffset.h"
#include <asm/bootparam_utils.h>
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index dd8d1a85f671..db1048621ea2 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -136,6 +136,9 @@ static inline void console_init(void)
#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
+struct es_em_ctxt;
+struct insn;
+
void sev_enable(struct boot_params *bp);
void snp_check_features(void);
void sev_es_shutdown_ghcb(void);
@@ -143,6 +146,11 @@ extern bool sev_es_check_ghcb_fault(unsigned long address);
void snp_set_page_private(unsigned long paddr);
void snp_set_page_shared(unsigned long paddr);
void sev_prep_identity_maps(unsigned long top_level_pgt);
+
+enum es_result vc_decode_insn(struct es_em_ctxt *ctxt);
+bool insn_has_rep_prefix(struct insn *insn);
+void sev_insn_decode_init(void);
+bool early_setup_ghcb(void);
#else
static inline void sev_enable(struct boot_params *bp)
{
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
deleted file mode 100644
index 6d595abe06b3..000000000000
--- a/arch/x86/boot/compressed/pgtable.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef BOOT_COMPRESSED_PAGETABLE_H
-#define BOOT_COMPRESSED_PAGETABLE_H
-
-#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
-
-#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0
-
-#ifndef __ASSEMBLER__
-
-extern unsigned long *trampoline_32bit;
-
-extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl);
-
-extern const u16 trampoline_ljmp_imm_offset;
-
-#endif /* __ASSEMBLER__ */
-#endif /* BOOT_COMPRESSED_PAGETABLE_H */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index d8c5de40669d..bdd26050dff7 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -4,19 +4,16 @@
#include <asm/bootparam_utils.h>
#include <asm/e820/types.h>
#include <asm/processor.h>
-#include "pgtable.h"
#include "../string.h"
#include "efi.h"
#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
-#ifdef CONFIG_X86_5LEVEL
/* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */
unsigned int __section(".data") __pgtable_l5_enabled;
unsigned int __section(".data") pgdir_shift = 39;
unsigned int __section(".data") ptrs_per_p4d = 1;
-#endif
/* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
@@ -115,18 +112,13 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
* Check if LA57 is desired and supported.
*
* There are several parts to the check:
- * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
* - if user asked to disable 5-level paging: no5lvl in cmdline
* - if the machine supports 5-level paging:
* + CPUID leaf 7 is supported
* + the leaf has the feature bit set
- *
- * That's substitute for boot_cpu_has() in early boot code.
*/
- if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
- !cmdline_find_option_bool("no5lvl") &&
- native_cpuid_eax(0) >= 7 &&
- (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
+ if (!cmdline_find_option_bool("no5lvl") &&
+ native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & BIT(16))) {
l5_required = true;
/* Initialize variables for 5-level paging */
diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c
new file mode 100644
index 000000000000..89dd02de2a0f
--- /dev/null
+++ b/arch/x86/boot/compressed/sev-handle-vc.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "sev.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <asm/insn.h>
+#include <asm/pgtable_types.h>
+#include <asm/ptrace.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+#include <asm/trap_pf.h>
+#include <asm/fpu/xcr.h>
+
+#define __BOOT_COMPRESSED
+
+/* Basic instruction decoding support needed */
+#include "../../lib/inat.c"
+#include "../../lib/insn.c"
+
+/*
+ * Copy a version of this function here - insn-eval.c can't be used in
+ * pre-decompression code.
+ */
+bool insn_has_rep_prefix(struct insn *insn)
+{
+ insn_byte_t p;
+ int i;
+
+ insn_get_prefixes(insn);
+
+ for_each_insn_prefix(insn, i, p) {
+ if (p == 0xf2 || p == 0xf3)
+ return true;
+ }
+
+ return false;
+}
+
+enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int ret;
+
+ memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+
+ return ES_OK;
+}
+
+extern void sev_insn_decode_init(void) __alias(inat_init_tables);
+
+/*
+ * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and
+ * doesn't use segments.
+ */
+static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ return 0UL;
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ void *dst, char *buf, size_t size)
+{
+ memcpy(dst, buf, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ void *src, char *buf, size_t size)
+{
+ memcpy(buf, src, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+ return ES_OK;
+}
+
+static bool fault_in_kernel_space(unsigned long address)
+{
+ return false;
+}
+
+#define sev_printk(fmt, ...)
+
+#include "../../coco/sev/vc-shared.c"
+
+void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
+{
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ if (!boot_ghcb && !early_setup_ghcb())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ vc_ghcb_invalidate(boot_ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ result = vc_check_opcode_bytes(&ctxt, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ switch (exit_code) {
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(boot_ghcb, &ctxt);
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(boot_ghcb, &ctxt);
+ break;
+ default:
+ result = ES_UNSUPPORTED;
+ break;
+ }
+
+finish:
+ if (result == ES_OK)
+ vc_finish_insn(&ctxt);
+ else if (result != ES_RETRY)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 0003e4416efd..fd1b67dfea22 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -21,99 +21,14 @@
#include <asm/fpu/xcr.h>
#include <asm/ptrace.h>
#include <asm/svm.h>
-#include <asm/cpuid.h>
+#include <asm/cpuid/api.h>
#include "error.h"
-#include "../msr.h"
+#include "sev.h"
static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
struct ghcb *boot_ghcb;
-/*
- * Copy a version of this function here - insn-eval.c can't be used in
- * pre-decompression code.
- */
-static bool insn_has_rep_prefix(struct insn *insn)
-{
- insn_byte_t p;
- int i;
-
- insn_get_prefixes(insn);
-
- for_each_insn_prefix(insn, i, p) {
- if (p == 0xf2 || p == 0xf3)
- return true;
- }
-
- return false;
-}
-
-/*
- * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and
- * doesn't use segments.
- */
-static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
-{
- return 0UL;
-}
-
-static inline u64 sev_es_rd_ghcb_msr(void)
-{
- struct msr m;
-
- boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
-
- return m.q;
-}
-
-static inline void sev_es_wr_ghcb_msr(u64 val)
-{
- struct msr m;
-
- m.q = val;
- boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
-}
-
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
-{
- char buffer[MAX_INSN_SIZE];
- int ret;
-
- memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
-
- ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
- if (ret < 0)
- return ES_DECODE_FAILED;
-
- return ES_OK;
-}
-
-static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
- void *dst, char *buf, size_t size)
-{
- memcpy(dst, buf, size);
-
- return ES_OK;
-}
-
-static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
- void *src, char *buf, size_t size)
-{
- memcpy(buf, src, size);
-
- return ES_OK;
-}
-
-static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
-{
- return ES_OK;
-}
-
-static bool fault_in_kernel_space(unsigned long address)
-{
- return false;
-}
-
#undef __init
#define __init
@@ -122,24 +37,27 @@ static bool fault_in_kernel_space(unsigned long address)
#define __BOOT_COMPRESSED
-/* Basic instruction decoding support needed */
-#include "../../lib/inat.c"
-#include "../../lib/insn.c"
+extern struct svsm_ca *boot_svsm_caa;
+extern u64 boot_svsm_caa_pa;
-/* Include code for early handlers */
-#include "../../coco/sev/shared.c"
-
-static struct svsm_ca *svsm_get_caa(void)
+struct svsm_ca *svsm_get_caa(void)
{
return boot_svsm_caa;
}
-static u64 svsm_get_caa_pa(void)
+u64 svsm_get_caa_pa(void)
{
return boot_svsm_caa_pa;
}
-static int svsm_perform_call_protocol(struct svsm_call *call)
+int svsm_perform_call_protocol(struct svsm_call *call);
+
+u8 snp_vmpl;
+
+/* Include code for early handlers */
+#include "../../boot/startup/sev-shared.c"
+
+int svsm_perform_call_protocol(struct svsm_call *call)
{
struct ghcb *ghcb;
int ret;
@@ -157,7 +75,7 @@ static int svsm_perform_call_protocol(struct svsm_call *call)
return ret;
}
-bool sev_snp_enabled(void)
+static bool sev_snp_enabled(void)
{
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
}
@@ -212,7 +130,7 @@ void snp_set_page_shared(unsigned long paddr)
__page_state_change(paddr, SNP_PAGE_STATE_SHARED);
}
-static bool early_setup_ghcb(void)
+bool early_setup_ghcb(void)
{
if (set_page_decrypted((unsigned long)&boot_ghcb_page))
return false;
@@ -223,7 +141,7 @@ static bool early_setup_ghcb(void)
boot_ghcb = &boot_ghcb_page;
/* Initialize lookup tables for the instruction decoder */
- inat_init_tables();
+ sev_insn_decode_init();
/* SNP guest requires the GHCB GPA must be registered */
if (sev_snp_enabled())
@@ -296,46 +214,6 @@ bool sev_es_check_ghcb_fault(unsigned long address)
return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page);
}
-void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
-{
- struct es_em_ctxt ctxt;
- enum es_result result;
-
- if (!boot_ghcb && !early_setup_ghcb())
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-
- vc_ghcb_invalidate(boot_ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
- if (result != ES_OK)
- goto finish;
-
- result = vc_check_opcode_bytes(&ctxt, exit_code);
- if (result != ES_OK)
- goto finish;
-
- switch (exit_code) {
- case SVM_EXIT_RDTSC:
- case SVM_EXIT_RDTSCP:
- result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
- break;
- case SVM_EXIT_IOIO:
- result = vc_handle_ioio(boot_ghcb, &ctxt);
- break;
- case SVM_EXIT_CPUID:
- result = vc_handle_cpuid(boot_ghcb, &ctxt);
- break;
- default:
- result = ES_UNSUPPORTED;
- break;
- }
-
-finish:
- if (result == ES_OK)
- vc_finish_insn(&ctxt);
- else if (result != ES_RETRY)
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-}
-
/*
* SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
* guest side implementation for proper functioning of the guest. If any
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
index d3900384b8ab..92f79c21939c 100644
--- a/arch/x86/boot/compressed/sev.h
+++ b/arch/x86/boot/compressed/sev.h
@@ -10,14 +10,31 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT
-bool sev_snp_enabled(void);
+#include "../msr.h"
+
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 sev_get_status(void);
bool early_is_sevsnp_guest(void);
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ struct msr m;
+
+ boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+
+ return m.q;
+}
+
+static inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ struct msr m;
+
+ m.q = val;
+ boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+}
+
#else
-static inline bool sev_snp_enabled(void) { return false; }
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
static inline u64 sev_get_status(void) { return 0; }
static inline bool early_is_sevsnp_guest(void) { return false; }
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 81fc1eaa3229..9af19d9614cb 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -15,9 +15,9 @@ static void *____memcpy(void *dest, const void *src, size_t n)
{
int d0, d1, d2;
asm volatile(
- "rep ; movsl\n\t"
+ "rep movsl\n\t"
"movl %4,%%ecx\n\t"
- "rep ; movsb\n\t"
+ "rep movsb"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
: "memory");
@@ -29,9 +29,9 @@ static void *____memcpy(void *dest, const void *src, size_t n)
{
long d0, d1, d2;
asm volatile(
- "rep ; movsq\n\t"
+ "rep movsq\n\t"
"movq %4,%%rcx\n\t"
- "rep ; movsb\n\t"
+ "rep movsb"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
: "memory");
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index 6afd05e819d2..3973a67cd04e 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -22,10 +22,10 @@ SYM_FUNC_START_NOALIGN(memcpy)
movw %dx, %si
pushw %cx
shrw $2, %cx
- rep; movsl
+ rep movsl
popw %cx
andw $3, %cx
- rep; movsb
+ rep movsb
popw %di
popw %si
retl
@@ -38,10 +38,10 @@ SYM_FUNC_START_NOALIGN(memset)
imull $0x01010101,%eax
pushw %cx
shrw $2, %cx
- rep; stosl
+ rep stosl
popw %cx
andw $3, %cx
- rep; stosb
+ rep stosb
popw %di
retl
SYM_FUNC_END(memset)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b5c79f43359b..e30649e44d8f 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -361,12 +361,8 @@ xloadflags:
#endif
#ifdef CONFIG_X86_64
-#ifdef CONFIG_X86_5LEVEL
#define XLF56 (XLF_5LEVEL|XLF_5LEVEL_ENABLED)
#else
-#define XLF56 XLF_5LEVEL
-#endif
-#else
#define XLF56 0
#endif
@@ -585,7 +581,7 @@ start_of_setup:
xorl %eax, %eax
subw %di, %cx
shrw $2, %cx
- rep; stosl
+ rep stosl
# Jump to C code (should not return)
calll main
diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile
new file mode 100644
index 000000000000..b514f7e81332
--- /dev/null
+++ b/arch/x86/boot/startup/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KBUILD_AFLAGS += -D__DISABLE_EXPORTS
+KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \
+ -Os -DDISABLE_BRANCH_PROFILING \
+ $(DISABLE_STACKLEAK_PLUGIN) \
+ -fno-stack-protector -D__NO_FORTIFY \
+ -fno-jump-tables \
+ -include $(srctree)/include/linux/hidden.h
+
+# disable ftrace hooks and LTO
+KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS))
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
+KMSAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o
+
+lib-$(CONFIG_X86_64) += la57toggle.o
+lib-$(CONFIG_EFI_MIXED) += efi-mixed.o
+
+#
+# Disable objtool validation for all library code, which is intended
+# to be linked into the decompressor or the EFI stub but not vmlinux
+#
+$(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y
diff --git a/arch/x86/boot/startup/efi-mixed.S b/arch/x86/boot/startup/efi-mixed.S
new file mode 100644
index 000000000000..e04ed99bc449
--- /dev/null
+++ b/arch/x86/boot/startup/efi-mixed.S
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
+ *
+ * Early support for invoking 32-bit EFI services from a 64-bit kernel.
+ *
+ * Because this thunking occurs before ExitBootServices() we have to
+ * restore the firmware's 32-bit GDT and IDT before we make EFI service
+ * calls.
+ *
+ * On the plus side, we don't have to worry about mangling 64-bit
+ * addresses into 32-bits because we're executing with an identity
+ * mapped pagetable and haven't transitioned to 64-bit virtual addresses
+ * yet.
+ */
+
+#include <linux/linkage.h>
+#include <asm/desc_defs.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+ .text
+ .code32
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+SYM_FUNC_START(efi32_stub_entry)
+ call 1f
+1: popl %ecx
+
+ /* Clear BSS */
+ xorl %eax, %eax
+ leal (_bss - 1b)(%ecx), %edi
+ leal (_ebss - 1b)(%ecx), %ecx
+ subl %edi, %ecx
+ shrl $2, %ecx
+ cld
+ rep stosl
+
+ add $0x4, %esp /* Discard return address */
+ movl 8(%esp), %ebx /* struct boot_params pointer */
+ jmp efi32_startup
+SYM_FUNC_END(efi32_stub_entry)
+#endif
+
+/*
+ * Called using a far call from __efi64_thunk() below, using the x86_64 SysV
+ * ABI (except for R8/R9 which are inaccessible to 32-bit code - EAX/EBX are
+ * used instead). EBP+16 points to the arguments passed via the stack.
+ *
+ * The first argument (EDI) is a pointer to the boot service or protocol, to
+ * which the remaining arguments are passed, each truncated to 32 bits.
+ */
+SYM_FUNC_START_LOCAL(efi_enter32)
+ /*
+ * Convert x86-64 SysV ABI params to i386 ABI
+ */
+ pushl 32(%ebp) /* Up to 3 args passed via the stack */
+ pushl 24(%ebp)
+ pushl 16(%ebp)
+ pushl %ebx /* R9 */
+ pushl %eax /* R8 */
+ pushl %ecx
+ pushl %edx
+ pushl %esi
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Disable long mode via EFER */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax
+ wrmsr
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ call efi32_enable_long_mode
+
+ addl $32, %esp
+ movl %edi, %eax
+ lret
+SYM_FUNC_END(efi_enter32)
+
+ .code64
+SYM_FUNC_START(__efi64_thunk)
+ push %rbp
+ movl %esp, %ebp
+ push %rbx
+
+ /* Move args #5 and #6 into 32-bit accessible registers */
+ movl %r8d, %eax
+ movl %r9d, %ebx
+
+ lcalll *efi32_call(%rip)
+
+ pop %rbx
+ pop %rbp
+ RET
+SYM_FUNC_END(__efi64_thunk)
+
+ .code32
+SYM_FUNC_START_LOCAL(efi32_enable_long_mode)
+ movl %cr4, %eax
+ btsl $(X86_CR4_PAE_BIT), %eax
+ movl %eax, %cr4
+
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Disable interrupts - the firmware's IDT does not work in long mode */
+ cli
+
+ /* Enable paging */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+ ret
+SYM_FUNC_END(efi32_enable_long_mode)
+
+/*
+ * This is the common EFI stub entry point for mixed mode. It sets up the GDT
+ * and page tables needed for 64-bit execution, after which it calls the
+ * common 64-bit EFI entrypoint efi_stub_entry().
+ *
+ * Arguments: 0(%esp) image handle
+ * 4(%esp) EFI system table pointer
+ * %ebx struct boot_params pointer (or NULL)
+ *
+ * Since this is the point of no return for ordinary execution, no registers
+ * are considered live except for the function parameters. [Note that the EFI
+ * stub may still exit and return to the firmware using the Exit() EFI boot
+ * service.]
+ */
+SYM_FUNC_START_LOCAL(efi32_startup)
+ movl %esp, %ebp
+
+ subl $8, %esp
+ sgdtl (%esp) /* Save GDT descriptor to the stack */
+ movl 2(%esp), %esi /* Existing GDT pointer */
+ movzwl (%esp), %ecx /* Existing GDT limit */
+ inc %ecx /* Existing GDT size */
+ andl $~7, %ecx /* Ensure size is multiple of 8 */
+
+ subl %ecx, %esp /* Allocate new GDT */
+ andl $~15, %esp /* Realign the stack */
+ movl %esp, %edi /* New GDT address */
+ leal 7(%ecx), %eax /* New GDT limit */
+ pushw %cx /* Push 64-bit CS (for LJMP below) */
+ pushl %edi /* Push new GDT address */
+ pushw %ax /* Push new GDT limit */
+
+ /* Copy GDT to the stack and add a 64-bit code segment at the end */
+ movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) & 0xffffffff, (%edi,%ecx)
+ movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) >> 32, 4(%edi,%ecx)
+ shrl $2, %ecx
+ cld
+ rep movsl /* Copy the firmware GDT */
+ lgdtl (%esp) /* Switch to the new GDT */
+
+ call 1f
+1: pop %edi
+
+ /* Record mixed mode entry */
+ movb $0x0, (efi_is64 - 1b)(%edi)
+
+ /* Set up indirect far call to re-enter 32-bit mode */
+ leal (efi32_call - 1b)(%edi), %eax
+ addl %eax, (%eax)
+ movw %cs, 4(%eax)
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Set up 1:1 mapping */
+ leal (pte - 1b)(%edi), %eax
+ movl $_PAGE_PRESENT | _PAGE_RW | _PAGE_PSE, %ecx
+ leal (_PAGE_PRESENT | _PAGE_RW)(%eax), %edx
+2: movl %ecx, (%eax)
+ addl $8, %eax
+ addl $PMD_SIZE, %ecx
+ jnc 2b
+
+ movl $PAGE_SIZE, %ecx
+ .irpc l, 0123
+ movl %edx, \l * 8(%eax)
+ addl %ecx, %edx
+ .endr
+ addl %ecx, %eax
+ movl %edx, (%eax)
+ movl %eax, %cr3
+
+ call efi32_enable_long_mode
+
+ /* Set up far jump to 64-bit mode (CS is already on the stack) */
+ leal (efi_stub_entry - 1b)(%edi), %eax
+ movl %eax, 2(%esp)
+
+ movl 0(%ebp), %edi
+ movl 4(%ebp), %esi
+ movl %ebx, %edx
+ ljmpl *2(%esp)
+SYM_FUNC_END(efi32_startup)
+
+/*
+ * efi_status_t efi32_pe_entry(efi_handle_t image_handle,
+ * efi_system_table_32_t *sys_table)
+ */
+SYM_FUNC_START(efi32_pe_entry)
+ pushl %ebx // save callee-save registers
+
+ /* Check whether the CPU supports long mode */
+ movl $0x80000001, %eax // assume extended info support
+ cpuid
+ btl $29, %edx // check long mode bit
+ jnc 1f
+ leal 8(%esp), %esp // preserve stack alignment
+ xor %ebx, %ebx // no struct boot_params pointer
+ jmp efi32_startup // only ESP and EBX remain live
+1: movl $0x80000003, %eax // EFI_UNSUPPORTED
+ popl %ebx
+ RET
+SYM_FUNC_END(efi32_pe_entry)
+
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+ .org efi32_stub_entry + 0x200
+ .code64
+SYM_FUNC_START_NOALIGN(efi64_stub_entry)
+ jmp efi_handover_entry
+SYM_FUNC_END(efi64_stub_entry)
+#endif
+
+ .data
+ .balign 8
+SYM_DATA_START_LOCAL(efi32_call)
+ .long efi_enter32 - .
+ .word 0x0
+SYM_DATA_END(efi32_call)
+SYM_DATA(efi_is64, .byte 1)
+
+ .bss
+ .balign PAGE_SIZE
+SYM_DATA_LOCAL(pte, .fill 6 * PAGE_SIZE, 1, 0)
diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c
new file mode 100644
index 000000000000..a3112a69b06a
--- /dev/null
+++ b/arch/x86/boot/startup/gdt_idt.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <linux/types.h>
+
+#include <asm/desc.h>
+#include <asm/init.h>
+#include <asm/setup.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+
+/*
+ * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is
+ * used until the idt_table takes over. On the boot CPU this happens in
+ * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases
+ * this happens in the functions called from head_64.S.
+ *
+ * The idt_table can't be used that early because all the code modifying it is
+ * in idt.c and can be instrumented by tracing or KASAN, which both don't work
+ * during early CPU bringup. Also the idt_table has the runtime vectors
+ * configured which require certain CPU state to be setup already (like TSS),
+ * which also hasn't happened yet in early CPU bringup.
+ */
+static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
+
+/* This may run while still in the direct mapping */
+void __head startup_64_load_idt(void *vc_handler)
+{
+ struct desc_ptr desc = {
+ .address = (unsigned long)rip_rel_ptr(bringup_idt_table),
+ .size = sizeof(bringup_idt_table) - 1,
+ };
+ struct idt_data data;
+ gate_desc idt_desc;
+
+ /* @vc_handler is set only for a VMM Communication Exception */
+ if (vc_handler) {
+ init_idt_data(&data, X86_TRAP_VC, vc_handler);
+ idt_init_desc(&idt_desc, &data);
+ native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc);
+ }
+
+ native_load_idt(&desc);
+}
+
+/*
+ * Setup boot CPU state needed before kernel switches to virtual addresses.
+ */
+void __head startup_64_setup_gdt_idt(void)
+{
+ struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page);
+ void *handler = NULL;
+
+ struct desc_ptr startup_gdt_descr = {
+ .address = (unsigned long)gp->gdt,
+ .size = GDT_SIZE - 1,
+ };
+
+ /* Load GDT */
+ native_load_gdt(&startup_gdt_descr);
+
+ /* New GDT is live - reload data segment registers */
+ asm volatile("movl %%eax, %%ds\n"
+ "movl %%eax, %%ss\n"
+ "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ handler = rip_rel_ptr(vc_no_ghcb);
+
+ startup_64_load_idt(handler);
+}
diff --git a/arch/x86/boot/compressed/la57toggle.S b/arch/x86/boot/startup/la57toggle.S
index 9ee002387eb1..370075b4d95b 100644
--- a/arch/x86/boot/compressed/la57toggle.S
+++ b/arch/x86/boot/startup/la57toggle.S
@@ -5,7 +5,6 @@
#include <asm/boot.h>
#include <asm/msr.h>
#include <asm/processor-flags.h>
-#include "pgtable.h"
/*
* This is the 32-bit trampoline that will be copied over to low memory. It
diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c
new file mode 100644
index 000000000000..332dbe6688c4
--- /dev/null
+++ b/arch/x86/boot/startup/map_kernel.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+#include <asm/init.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/sev.h>
+
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+extern unsigned int next_early_pgt;
+
+static inline bool check_la57_support(void)
+{
+ /*
+ * 5-level paging is detected and enabled at kernel decompression
+ * stage. Only check if it has been enabled there.
+ */
+ if (!(native_read_cr4() & X86_CR4_LA57))
+ return false;
+
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+
+ return true;
+}
+
+static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
+ pmdval_t *pmd,
+ unsigned long p2v_offset)
+{
+ unsigned long paddr, paddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
+ paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);
+
+ for (; paddr < paddr_end; paddr += PMD_SIZE) {
+ /*
+ * On SNP, transition the page to shared in the RMP table so that
+ * it is consistent with the page table attribute change.
+ *
+ * __start_bss_decrypted has a virtual address in the high range
+ * mapping (kernel .text). PVALIDATE, by way of
+ * early_snp_set_memory_shared(), requires a valid virtual
+ * address but the kernel is currently running off of the identity
+ * mapping so use the PA to get a *currently* valid virtual address.
+ */
+ early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
+
+ i = pmd_index(paddr - p2v_offset);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
+/*
+ * This code is compiled using PIC codegen because it will execute from the
+ * early 1:1 mapping of memory, which deviates from the mapping expected by the
+ * linker. Due to this deviation, taking the address of a global variable will
+ * produce an ambiguous result when using the plain & operator. Instead,
+ * rip_rel_ptr() must be used, which will return the RIP-relative address in
+ * the 1:1 mapping of memory. Kernel virtual addresses can be determined by
+ * subtracting p2v_offset from the RIP-relative address.
+ */
+unsigned long __head __startup_64(unsigned long p2v_offset,
+ struct boot_params *bp)
+{
+ pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
+ unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
+ unsigned long va_text, va_end;
+ unsigned long pgtable_flags;
+ unsigned long load_delta;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ bool la57;
+ int i;
+
+ la57 = check_la57_support();
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ phys_base = load_delta = __START_KERNEL_map + p2v_offset;
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_MASK)
+ for (;;);
+
+ va_text = physaddr - p2v_offset;
+ va_end = (unsigned long)rip_rel_ptr(_end) - p2v_offset;
+
+ /* Include the SME encryption mask in the fixup value */
+ load_delta += sme_get_me_mask();
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = rip_rel_ptr(early_top_pgt);
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+ if (la57) {
+ p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
+ p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+ pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
+ }
+
+ level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta;
+ level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta;
+
+ for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+ level2_fixmap_pgt[i].pmd += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ pud = &early_pgts[0]->pmd;
+ pmd = &early_pgts[1]->pmd;
+ next_early_pgt = 2;
+
+ pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
+ if (la57) {
+ p4d = &early_pgts[next_early_pgt++]->pmd;
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
+
+ i = physaddr >> P4D_SHIFT;
+ p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
+ }
+
+ i = physaddr >> PUD_SHIFT;
+ pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+ pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ pmd_entry += sme_get_me_mask();
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT);
+
+ pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ *
+ * Only the region occupied by the kernel image has so far
+ * been checked against the table of usable memory regions
+ * provided by the firmware, so invalidate pages outside that
+ * region. A page table entry that maps to a reserved area of
+ * memory would allow processor speculation into that area,
+ * and on some hardware (particularly the UV platform) even
+ * speculative access to some reserved areas is caught as an
+ * error, causing the BIOS to halt the system.
+ */
+
+ pmd = rip_rel_ptr(level2_kernel_pgt);
+
+ /* invalidate pages before the kernel image */
+ for (i = 0; i < pmd_index(va_text); i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ /* fixup pages that are part of the kernel image */
+ for (; i <= pmd_index(va_end); i++)
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+
+ /* invalidate pages after the kernel image */
+ for (; i < PTRS_PER_PMD; i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ return sme_postprocess_startup(bp, pmd, p2v_offset);
+}
diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c
new file mode 100644
index 000000000000..7a706db87b93
--- /dev/null
+++ b/arch/x86/boot/startup/sev-shared.c
@@ -0,0 +1,887 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This file is not compiled stand-alone. It contains code shared
+ * between the pre-decompression boot code and the running Linux kernel
+ * and is included directly into both code-bases.
+ */
+
+#include <asm/setup_data.h>
+
+#ifndef __BOOT_COMPRESSED
+#define error(v) pr_err(v)
+#define has_cpuflag(f) boot_cpu_has(f)
+#else
+#undef WARN
+#define WARN(condition, format...) (!!(condition))
+#undef vc_forward_exception
+#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n")
+#endif
+
+/*
+ * SVSM related information:
+ * During boot, the page tables are set up as identity mapped and later
+ * changed to use kernel virtual addresses. Maintain separate virtual and
+ * physical addresses for the CAA to allow SVSM functions to be used during
+ * early boot, both with identity mapped virtual addresses and proper kernel
+ * virtual addresses.
+ */
+struct svsm_ca *boot_svsm_caa __ro_after_init;
+u64 boot_svsm_caa_pa __ro_after_init;
+
+/*
+ * Since feature negotiation related variables are set early in the boot
+ * process they must reside in the .data section so as not to be zeroed
+ * out when the .bss section is later cleared.
+ *
+ * GHCB protocol version negotiated with the hypervisor.
+ */
+static u16 ghcb_version __ro_after_init;
+
+/* Copy of the SNP firmware's CPUID page. */
+static struct snp_cpuid_table cpuid_table_copy __ro_after_init;
+
+/*
+ * These will be initialized based on CPUID table so that non-present
+ * all-zero leaves (for sparse tables) can be differentiated from
+ * invalid/out-of-range leaves. This is needed since all-zero leaves
+ * still need to be post-processed.
+ */
+static u32 cpuid_std_range_max __ro_after_init;
+static u32 cpuid_hyp_range_max __ro_after_init;
+static u32 cpuid_ext_range_max __ro_after_init;
+
+bool __init sev_es_check_cpu_features(void)
+{
+ if (!has_cpuflag(X86_FEATURE_RDRAND)) {
+ error("RDRAND instruction not supported - no trusted source of randomness available\n");
+ return false;
+ }
+
+ return true;
+}
+
+void __head __noreturn
+sev_es_terminate(unsigned int set, unsigned int reason)
+{
+ u64 val = GHCB_MSR_TERM_REQ;
+
+ /* Tell the hypervisor what went wrong. */
+ val |= GHCB_SEV_TERM_REASON(set, reason);
+
+ /* Request Guest Termination from Hypervisor */
+ sev_es_wr_ghcb_msr(val);
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+/*
+ * The hypervisor features are available from GHCB version 2 onward.
+ */
+u64 get_hv_features(void)
+{
+ u64 val;
+
+ if (ghcb_version < 2)
+ return 0;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ);
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP)
+ return 0;
+
+ return GHCB_MSR_HV_FT_RESP_VAL(val);
+}
+
+void snp_register_ghcb_early(unsigned long paddr)
+{
+ unsigned long pfn = paddr >> PAGE_SHIFT;
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ /* If the response GPA is not ours then abort the guest */
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) ||
+ (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);
+}
+
+bool sev_es_negotiate_protocol(void)
+{
+ u64 val;
+
+ /* Do the GHCB protocol version negotiation */
+ sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+
+ if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
+ return false;
+
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
+ return false;
+
+ ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX);
+
+ return true;
+}
+
+static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ u32 ret;
+
+ ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0);
+ if (!ret)
+ return ES_OK;
+
+ if (ret == 1) {
+ u64 info = ghcb->save.sw_exit_info_2;
+ unsigned long v = info & SVM_EVTINJ_VEC_MASK;
+
+ /* Check if exception information from hypervisor is sane. */
+ if ((info & SVM_EVTINJ_VALID) &&
+ ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
+ ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
+ ctxt->fi.vector = v;
+
+ if (info & SVM_EVTINJ_VALID_ERR)
+ ctxt->fi.error_code = info >> 32;
+
+ return ES_EXCEPTION;
+ }
+ }
+
+ return ES_VMM_ERROR;
+}
+
+static inline int svsm_process_result_codes(struct svsm_call *call)
+{
+ switch (call->rax_out) {
+ case SVSM_SUCCESS:
+ return 0;
+ case SVSM_ERR_INCOMPLETE:
+ case SVSM_ERR_BUSY:
+ return -EAGAIN;
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * Issue a VMGEXIT to call the SVSM:
+ * - Load the SVSM register state (RAX, RCX, RDX, R8 and R9)
+ * - Set the CA call pending field to 1
+ * - Issue VMGEXIT
+ * - Save the SVSM return register state (RAX, RCX, RDX, R8 and R9)
+ * - Perform atomic exchange of the CA call pending field
+ *
+ * - See the "Secure VM Service Module for SEV-SNP Guests" specification for
+ * details on the calling convention.
+ * - The calling convention loosely follows the Microsoft X64 calling
+ * convention by putting arguments in RCX, RDX, R8 and R9.
+ * - RAX specifies the SVSM protocol/callid as input and the return code
+ * as output.
+ */
+static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending)
+{
+ register unsigned long rax asm("rax") = call->rax;
+ register unsigned long rcx asm("rcx") = call->rcx;
+ register unsigned long rdx asm("rdx") = call->rdx;
+ register unsigned long r8 asm("r8") = call->r8;
+ register unsigned long r9 asm("r9") = call->r9;
+
+ call->caa->call_pending = 1;
+
+ asm volatile("rep; vmmcall\n\t"
+ : "+r" (rax), "+r" (rcx), "+r" (rdx), "+r" (r8), "+r" (r9)
+ : : "memory");
+
+ *pending = xchg(&call->caa->call_pending, *pending);
+
+ call->rax_out = rax;
+ call->rcx_out = rcx;
+ call->rdx_out = rdx;
+ call->r8_out = r8;
+ call->r9_out = r9;
+}
+
+static int svsm_perform_msr_protocol(struct svsm_call *call)
+{
+ u8 pending = 0;
+ u64 val, resp;
+
+ /*
+ * When using the MSR protocol, be sure to save and restore
+ * the current MSR value.
+ */
+ val = sev_es_rd_ghcb_msr();
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_VMPL_REQ_LEVEL(0));
+
+ svsm_issue_call(call, &pending);
+
+ resp = sev_es_rd_ghcb_msr();
+
+ sev_es_wr_ghcb_msr(val);
+
+ if (pending)
+ return -EINVAL;
+
+ if (GHCB_RESP_CODE(resp) != GHCB_MSR_VMPL_RESP)
+ return -EINVAL;
+
+ if (GHCB_MSR_VMPL_RESP_VAL(resp))
+ return -EINVAL;
+
+ return svsm_process_result_codes(call);
+}
+
+static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call)
+{
+ struct es_em_ctxt ctxt;
+ u8 pending = 0;
+
+ vc_ghcb_invalidate(ghcb);
+
+ /*
+ * Fill in protocol and format specifiers. This can be called very early
+ * in the boot, so use rip-relative references as needed.
+ */
+ ghcb->protocol_version = ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+
+ svsm_issue_call(call, &pending);
+
+ if (pending)
+ return -EINVAL;
+
+ switch (verify_exception_info(ghcb, &ctxt)) {
+ case ES_OK:
+ break;
+ case ES_EXCEPTION:
+ vc_forward_exception(&ctxt);
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ return svsm_process_result_codes(call);
+}
+
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ return verify_exception_info(ghcb, ctxt);
+}
+
+static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
+{
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx));
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
+ return -EIO;
+
+ *reg = (val >> 32);
+
+ return 0;
+}
+
+static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf)
+{
+ int ret;
+
+ /*
+ * MSR protocol does not support fetching non-zero subfunctions, but is
+ * sufficient to handle current early-boot cases. Should that change,
+ * make sure to report an error rather than ignoring the index and
+ * grabbing random values. If this issue arises in the future, handling
+ * can be added here to use GHCB-page protocol for cases that occur late
+ * enough in boot that GHCB page is available.
+ */
+ if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn)
+ return -EINVAL;
+
+ ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx);
+
+ return ret;
+}
+
+static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ u32 cr4 = native_read_cr4();
+ int ret;
+
+ ghcb_set_rax(ghcb, leaf->fn);
+ ghcb_set_rcx(ghcb, leaf->subfn);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #UD - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ leaf->eax = ghcb->save.rax;
+ leaf->ebx = ghcb->save.rbx;
+ leaf->ecx = ghcb->save.rcx;
+ leaf->edx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf)
+ : __sev_cpuid_hv_msr(leaf);
+}
+
+/*
+ * This may be called early while still running on the initial identity
+ * mapping. Use RIP-relative addressing to obtain the correct address
+ * while running with the initial identity mapping as well as the
+ * switch-over to kernel virtual addresses later.
+ */
+const struct snp_cpuid_table *snp_cpuid_get_table(void)
+{
+ return rip_rel_ptr(&cpuid_table_copy);
+}
+
+/*
+ * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of
+ * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0
+ * and 1 based on the corresponding features enabled by a particular
+ * combination of XCR0 and XSS registers so that a guest can look up the
+ * version corresponding to the features currently enabled in its XCR0/XSS
+ * registers. The only values that differ between these versions/table
+ * entries is the enabled XSAVE area size advertised via EBX.
+ *
+ * While hypervisors may choose to make use of this support, it is more
+ * robust/secure for a guest to simply find the entry corresponding to the
+ * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the
+ * XSAVE area size using subfunctions 2 through 64, as documented in APM
+ * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here.
+ *
+ * Since base/legacy XSAVE area size is documented as 0x240, use that value
+ * directly rather than relying on the base size in the CPUID table.
+ *
+ * Return: XSAVE area size on success, 0 otherwise.
+ */
+static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ u64 xfeatures_found = 0;
+ u32 xsave_size = 0x240;
+ int i;
+
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+ if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64))
+ continue;
+ if (!(xfeatures_en & (BIT_ULL(e->ecx_in))))
+ continue;
+ if (xfeatures_found & (BIT_ULL(e->ecx_in)))
+ continue;
+
+ xfeatures_found |= (BIT_ULL(e->ecx_in));
+
+ if (compacted)
+ xsave_size += e->eax;
+ else
+ xsave_size = max(xsave_size, e->eax + e->ebx);
+ }
+
+ /*
+ * Either the guest set unsupported XCR0/XSS bits, or the corresponding
+ * entries in the CPUID table were not present. This is not a valid
+ * state to be in.
+ */
+ if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
+ return 0;
+
+ return xsave_size;
+}
+
+static bool __head
+snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ int i;
+
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+ if (e->eax_in != leaf->fn)
+ continue;
+
+ if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn)
+ continue;
+
+ /*
+ * For 0xD subfunctions 0 and 1, only use the entry corresponding
+ * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0).
+ * See the comments above snp_cpuid_calc_xsave_size() for more
+ * details.
+ */
+ if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1))
+ if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in)
+ continue;
+
+ leaf->eax = e->eax;
+ leaf->ebx = e->ebx;
+ leaf->ecx = e->ecx;
+ leaf->edx = e->edx;
+
+ return true;
+ }
+
+ return false;
+}
+
+static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ if (sev_cpuid_hv(ghcb, ctxt, leaf))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
+}
+
+static int __head
+snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ struct cpuid_leaf *leaf)
+{
+ struct cpuid_leaf leaf_hv = *leaf;
+
+ switch (leaf->fn) {
+ case 0x1:
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
+
+ /* initial APIC ID */
+ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
+ /* APIC enabled bit */
+ leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9));
+
+ /* OSXSAVE enabled bit */
+ if (native_read_cr4() & X86_CR4_OSXSAVE)
+ leaf->ecx |= BIT(27);
+ break;
+ case 0x7:
+ /* OSPKE enabled bit */
+ leaf->ecx &= ~BIT(4);
+ if (native_read_cr4() & X86_CR4_PKE)
+ leaf->ecx |= BIT(4);
+ break;
+ case 0xB:
+ leaf_hv.subfn = 0;
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
+
+ /* extended APIC ID */
+ leaf->edx = leaf_hv.edx;
+ break;
+ case 0xD: {
+ bool compacted = false;
+ u64 xcr0 = 1, xss = 0;
+ u32 xsave_size;
+
+ if (leaf->subfn != 0 && leaf->subfn != 1)
+ return 0;
+
+ if (native_read_cr4() & X86_CR4_OSXSAVE)
+ xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ if (leaf->subfn == 1) {
+ /* Get XSS value if XSAVES is enabled. */
+ if (leaf->eax & BIT(3)) {
+ unsigned long lo, hi;
+
+ asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
+ : "c" (MSR_IA32_XSS));
+ xss = (hi << 32) | lo;
+ }
+
+ /*
+ * The PPR and APM aren't clear on what size should be
+ * encoded in 0xD:0x1:EBX when compaction is not enabled
+ * by either XSAVEC (feature bit 1) or XSAVES (feature
+ * bit 3) since SNP-capable hardware has these feature
+ * bits fixed as 1. KVM sets it to 0 in this case, but
+ * to avoid this becoming an issue it's safer to simply
+ * treat this as unsupported for SNP guests.
+ */
+ if (!(leaf->eax & (BIT(1) | BIT(3))))
+ return -EINVAL;
+
+ compacted = true;
+ }
+
+ xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted);
+ if (!xsave_size)
+ return -EINVAL;
+
+ leaf->ebx = xsave_size;
+ }
+ break;
+ case 0x8000001E:
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
+
+ /* extended APIC ID */
+ leaf->eax = leaf_hv.eax;
+ /* compute ID */
+ leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0));
+ /* node ID */
+ leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0));
+ break;
+ default:
+ /* No fix-ups needed, use values as-is. */
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
+ * should be treated as fatal by caller.
+ */
+int __head
+snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+
+ if (!cpuid_table->count)
+ return -EOPNOTSUPP;
+
+ if (!snp_cpuid_get_validated_func(leaf)) {
+ /*
+ * Some hypervisors will avoid keeping track of CPUID entries
+ * where all values are zero, since they can be handled the
+ * same as out-of-range values (all-zero). This is useful here
+ * as well as it allows virtually all guest configurations to
+ * work using a single SNP CPUID table.
+ *
+ * To allow for this, there is a need to distinguish between
+ * out-of-range entries and in-range zero entries, since the
+ * CPUID table entries are only a template that may need to be
+ * augmented with additional values for things like
+ * CPU-specific information during post-processing. So if it's
+ * not in the table, set the values to zero. Then, if they are
+ * within a valid CPUID range, proceed with post-processing
+ * using zeros as the initial values. Otherwise, skip
+ * post-processing and just return zeros immediately.
+ */
+ leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
+
+ /* Skip post-processing for out-of-range zero leafs. */
+ if (!(leaf->fn <= cpuid_std_range_max ||
+ (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
+ (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
+ return 0;
+ }
+
+ return snp_cpuid_postprocess(ghcb, ctxt, leaf);
+}
+
+/*
+ * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
+ * page yet, so it only supports the MSR based communication with the
+ * hypervisor and only the CPUID exit-code.
+ */
+void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+{
+ unsigned int subfn = lower_bits(regs->cx, 32);
+ unsigned int fn = lower_bits(regs->ax, 32);
+ u16 opcode = *(unsigned short *)regs->ip;
+ struct cpuid_leaf leaf;
+ int ret;
+
+ /* Only CPUID is supported via MSR protocol */
+ if (exit_code != SVM_EXIT_CPUID)
+ goto fail;
+
+ /* Is it really a CPUID insn? */
+ if (opcode != 0xa20f)
+ goto fail;
+
+ leaf.fn = fn;
+ leaf.subfn = subfn;
+
+ ret = snp_cpuid(NULL, NULL, &leaf);
+ if (!ret)
+ goto cpuid_done;
+
+ if (ret != -EOPNOTSUPP)
+ goto fail;
+
+ if (__sev_cpuid_hv_msr(&leaf))
+ goto fail;
+
+cpuid_done:
+ regs->ax = leaf.eax;
+ regs->bx = leaf.ebx;
+ regs->cx = leaf.ecx;
+ regs->dx = leaf.edx;
+
+ /*
+ * This is a VC handler and the #VC is only raised when SEV-ES is
+ * active, which means SEV must be active too. Do sanity checks on the
+ * CPUID results to make sure the hypervisor does not trick the kernel
+ * into the no-sev path. This could map sensitive data unencrypted and
+ * make it accessible to the hypervisor.
+ *
+ * In particular, check for:
+ * - Availability of CPUID leaf 0x8000001f
+ * - SEV CPUID bit.
+ *
+ * The hypervisor might still report the wrong C-bit position, but this
+ * can't be checked here.
+ */
+
+ if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+ /* SEV leaf check */
+ goto fail;
+ else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
+ /* SEV bit */
+ goto fail;
+
+ /* Skip over the CPUID two-byte opcode */
+ regs->ip += 2;
+
+ return;
+
+fail:
+ /* Terminate the guest */
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
+struct cc_setup_data {
+ struct setup_data header;
+ u32 cc_blob_address;
+};
+
+/*
+ * Search for a Confidential Computing blob passed in as a setup_data entry
+ * via the Linux Boot Protocol.
+ */
+static __head
+struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
+{
+ struct cc_setup_data *sd = NULL;
+ struct setup_data *hdr;
+
+ hdr = (struct setup_data *)bp->hdr.setup_data;
+
+ while (hdr) {
+ if (hdr->type == SETUP_CC_BLOB) {
+ sd = (struct cc_setup_data *)hdr;
+ return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address;
+ }
+ hdr = (struct setup_data *)hdr->next;
+ }
+
+ return NULL;
+}
+
+/*
+ * Initialize the kernel's copy of the SNP CPUID table, and set up the
+ * pointer that will be used to access it.
+ *
+ * Maintaining a direct mapping of the SNP CPUID table used by firmware would
+ * be possible as an alternative, but the approach is brittle since the
+ * mapping needs to be updated in sync with all the changes to virtual memory
+ * layout and related mapping facilities throughout the boot process.
+ */
+static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+{
+ const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
+ int i;
+
+ if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
+
+ cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys;
+ if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
+
+ cpuid_table = snp_cpuid_get_table();
+ memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table));
+
+ /* Initialize CPUID ranges for range-checking. */
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ if (fn->eax_in == 0x0)
+ cpuid_std_range_max = fn->eax;
+ else if (fn->eax_in == 0x40000000)
+ cpuid_hyp_range_max = fn->eax;
+ else if (fn->eax_in == 0x80000000)
+ cpuid_ext_range_max = fn->eax;
+ }
+}
+
+static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
+{
+ struct svsm_pvalidate_call *pc;
+ struct svsm_call call = {};
+ unsigned long flags;
+ u64 pc_pa;
+ int ret;
+
+ /*
+ * This can be called very early in the boot, use native functions in
+ * order to avoid paravirt issues.
+ */
+ flags = native_local_irq_save();
+
+ call.caa = svsm_get_caa();
+
+ pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer;
+ pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
+
+ pc->num_entries = 1;
+ pc->cur_index = 0;
+ pc->entry[0].page_size = RMP_PG_SIZE_4K;
+ pc->entry[0].action = validate;
+ pc->entry[0].ignore_cf = 0;
+ pc->entry[0].pfn = paddr >> PAGE_SHIFT;
+
+ /* Protocol 0, Call ID 1 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
+ call.rcx = pc_pa;
+
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+
+ native_local_irq_restore(flags);
+}
+
+static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
+ bool validate)
+{
+ int ret;
+
+ if (snp_vmpl) {
+ svsm_pval_4k_page(paddr, validate);
+ } else {
+ ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
+ if (ret)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+ }
+}
+
+/*
+ * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM
+ * services needed when not running in VMPL0.
+ */
+static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info)
+{
+ struct snp_secrets_page *secrets_page;
+ struct snp_cpuid_table *cpuid_table;
+ unsigned int i;
+ u64 caa;
+
+ BUILD_BUG_ON(sizeof(*secrets_page) != PAGE_SIZE);
+
+ /*
+ * Check if running at VMPL0.
+ *
+ * Use RMPADJUST (see the rmpadjust() function for a description of what
+ * the instruction does) to update the VMPL1 permissions of a page. If
+ * the guest is running at VMPL0, this will succeed and implies there is
+ * no SVSM. If the guest is running at any other VMPL, this will fail.
+ * Linux SNP guests only ever run at a single VMPL level so permission mask
+ * changes of a lesser-privileged VMPL are a don't-care.
+ *
+ * Use a rip-relative reference to obtain the proper address, since this
+ * routine is running identity mapped when called, both by the decompressor
+ * code and the early kernel code.
+ */
+ if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1))
+ return false;
+
+ /*
+ * Not running at VMPL0, ensure everything has been properly supplied
+ * for running under an SVSM.
+ */
+ if (!cc_info || !cc_info->secrets_phys || cc_info->secrets_len != PAGE_SIZE)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECRETS_PAGE);
+
+ secrets_page = (struct snp_secrets_page *)cc_info->secrets_phys;
+ if (!secrets_page->svsm_size)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NO_SVSM);
+
+ if (!secrets_page->svsm_guest_vmpl)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_VMPL0);
+
+ snp_vmpl = secrets_page->svsm_guest_vmpl;
+
+ caa = secrets_page->svsm_caa;
+
+ /*
+ * An open-coded PAGE_ALIGNED() in order to avoid including
+ * kernel-proper headers into the decompressor.
+ */
+ if (caa & (PAGE_SIZE - 1))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA);
+
+ /*
+ * The CA is identity mapped when this routine is called, both by the
+ * decompressor code and the early kernel code.
+ */
+ boot_svsm_caa = (struct svsm_ca *)caa;
+ boot_svsm_caa_pa = caa;
+
+ /* Advertise the SVSM presence via CPUID. */
+ cpuid_table = (struct snp_cpuid_table *)snp_cpuid_get_table();
+ for (i = 0; i < cpuid_table->count; i++) {
+ struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ if (fn->eax_in == 0x8000001f)
+ fn->eax |= BIT(28);
+ }
+
+ return true;
+}
diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c
new file mode 100644
index 000000000000..0b7e3b950183
--- /dev/null
+++ b/arch/x86/boot/startup/sev-startup.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV: " fmt
+
+#include <linux/percpu-defs.h>
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/set_memory.h>
+#include <linux/memblock.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/efi.h>
+#include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <uapi/linux/sev-guest.h>
+
+#include <asm/init.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/stacktrace.h>
+#include <asm/sev.h>
+#include <asm/sev-internal.h>
+#include <asm/insn-eval.h>
+#include <asm/fpu/xcr.h>
+#include <asm/processor.h>
+#include <asm/realmode.h>
+#include <asm/setup.h>
+#include <asm/traps.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid/api.h>
+#include <asm/cmdline.h>
+
+/* For early boot hypervisor communication in SEV-ES enabled guests */
+struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
+
+/*
+ * Needs to be in the .data section because we need it NULL before bss is
+ * cleared
+ */
+struct ghcb *boot_ghcb __section(".data");
+
+/* Bitmap of SEV features supported by the hypervisor */
+u64 sev_hv_features __ro_after_init;
+
+/* Secrets page physical address from the CC blob */
+u64 sev_secrets_pa __ro_after_init;
+
+/* For early boot SVSM communication */
+struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE);
+
+DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
+DEFINE_PER_CPU(u64, svsm_caa_pa);
+
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (unlikely(data->ghcb_active)) {
+ /* GHCB is already in use - save its contents */
+
+ if (unlikely(data->backup_ghcb_active)) {
+ /*
+ * Backup-GHCB is also already in use. There is no way
+ * to continue here so just kill the machine. To make
+ * panic() work, mark GHCBs inactive so that messages
+ * can be printed out.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ instrumentation_begin();
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ instrumentation_end();
+ }
+
+ /* Mark backup_ghcb active before writing to it */
+ data->backup_ghcb_active = true;
+
+ state->ghcb = &data->backup_ghcb;
+
+ /* Backup GHCB content */
+ *state->ghcb = *ghcb;
+ } else {
+ state->ghcb = NULL;
+ data->ghcb_active = true;
+ }
+
+ return ghcb;
+}
+
+/* Include code shared with pre-decompression boot stage */
+#include "sev-shared.c"
+
+noinstr void __sev_put_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (state->ghcb) {
+ /* Restore GHCB from Backup */
+ *ghcb = *state->ghcb;
+ data->backup_ghcb_active = false;
+ state->ghcb = NULL;
+ } else {
+ /*
+ * Invalidate the GHCB so a VMGEXIT instruction issued
+ * from userspace won't appear to be valid.
+ */
+ vc_ghcb_invalidate(ghcb);
+ data->ghcb_active = false;
+ }
+}
+
+int svsm_perform_call_protocol(struct svsm_call *call)
+{
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret;
+
+ /*
+ * This can be called very early in the boot, use native functions in
+ * order to avoid paravirt issues.
+ */
+ flags = native_local_irq_save();
+
+ if (sev_cfg.ghcbs_initialized)
+ ghcb = __sev_get_ghcb(&state);
+ else if (boot_ghcb)
+ ghcb = boot_ghcb;
+ else
+ ghcb = NULL;
+
+ do {
+ ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
+ : svsm_perform_msr_protocol(call);
+ } while (ret == -EAGAIN);
+
+ if (sev_cfg.ghcbs_initialized)
+ __sev_put_ghcb(&state);
+
+ native_local_irq_restore(flags);
+
+ return ret;
+}
+
+void __head
+early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, enum psc_op op)
+{
+ unsigned long paddr_end;
+ u64 val;
+
+ vaddr = vaddr & PAGE_MASK;
+
+ paddr = paddr & PAGE_MASK;
+ paddr_end = paddr + (npages << PAGE_SHIFT);
+
+ while (paddr < paddr_end) {
+ /* Page validation must be rescinded before changing to shared */
+ if (op == SNP_PAGE_STATE_SHARED)
+ pvalidate_4k_page(vaddr, paddr, false);
+
+ /*
+ * Use the MSR protocol because this function can be called before
+ * the GHCB is established.
+ */
+ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
+ goto e_term;
+
+ if (GHCB_MSR_PSC_RESP_VAL(val))
+ goto e_term;
+
+ /* Page validation must be performed after changing to private */
+ if (op == SNP_PAGE_STATE_PRIVATE)
+ pvalidate_4k_page(vaddr, paddr, true);
+
+ vaddr += PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ }
+
+ return;
+
+e_term:
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+}
+
+void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages)
+{
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /*
+ * Ask the hypervisor to mark the memory pages as private in the RMP
+ * table.
+ */
+ early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
+}
+
+void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages)
+{
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /* Ask hypervisor to mark the memory pages shared in the RMP table. */
+ early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the kernel
+ * in the following ways, depending on how it is booted:
+ *
+ * - when booted via the boot/decompress kernel:
+ * - via boot_params
+ *
+ * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
+ * - via a setup_data entry, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ /* Boot kernel would have passed the CC blob via boot_params. */
+ if (bp->cc_blob_address) {
+ cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
+ goto found_cc_info;
+ }
+
+ /*
+ * If kernel was booted directly, without the use of the
+ * boot/decompression kernel, the CC blob may have been passed via
+ * setup_data instead.
+ */
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ snp_abort();
+
+ return cc_info;
+}
+
+static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
+{
+ struct svsm_call call = {};
+ int ret;
+ u64 pa;
+
+ /*
+ * Record the SVSM Calling Area address (CAA) if the guest is not
+ * running at VMPL0. The CA will be used to communicate with the
+ * SVSM to perform the SVSM services.
+ */
+ if (!svsm_setup_ca(cc_info))
+ return;
+
+ /*
+ * It is very early in the boot and the kernel is running identity
+ * mapped but without having adjusted the pagetables to where the
+ * kernel was loaded (physbase), so the get the CA address using
+ * RIP-relative addressing.
+ */
+ pa = (u64)rip_rel_ptr(&boot_svsm_ca_page);
+
+ /*
+ * Switch over to the boot SVSM CA while the current CA is still
+ * addressable. There is no GHCB at this point so use the MSR protocol.
+ *
+ * SVSM_CORE_REMAP_CA call:
+ * RAX = 0 (Protocol=0, CallID=0)
+ * RCX = New CA GPA
+ */
+ call.caa = svsm_get_caa();
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
+ call.rcx = pa;
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
+
+ boot_svsm_caa = (struct svsm_ca *)pa;
+ boot_svsm_caa_pa = pa;
+}
+
+bool __head snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE)
+ sev_secrets_pa = cc_info->secrets_phys;
+ else
+ return false;
+
+ setup_cpuid_table(cc_info);
+
+ svsm_setup(cc_info);
+
+ /*
+ * The CC blob will be used later to access the secrets page. Cache
+ * it here like the boot kernel does.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
+void __head __noreturn snp_abort(void)
+{
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+}
diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c
new file mode 100644
index 000000000000..70ea1748c0a7
--- /dev/null
+++ b/arch/x86/boot/startup/sme.c
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+/*
+ * Special hack: we have to be careful, because no indirections are
+ * allowed here, and paravirt_ops is a kind of one. As it will only run in
+ * baremetal anyway, we just keep it from happening. (This list needs to
+ * be extended when new paravirt and debugging variants are added.)
+ */
+#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_XXL
+#undef CONFIG_PARAVIRT_SPINLOCKS
+
+/*
+ * This code runs before CPU feature bits are set. By default, the
+ * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if
+ * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5
+ * is provided to handle this situation and, instead, use a variable that
+ * has been set by the early boot code.
+ */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
+
+#include <asm/init.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/coco.h>
+#include <asm/sev.h>
+
+#define PGD_FLAGS _KERNPG_TABLE_NOENC
+#define P4D_FLAGS _KERNPG_TABLE_NOENC
+#define PUD_FLAGS _KERNPG_TABLE_NOENC
+#define PMD_FLAGS _KERNPG_TABLE_NOENC
+
+#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
+#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
+ (_PAGE_PAT_LARGE | _PAGE_PWT))
+
+#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
+
+#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
+
+#define PTE_FLAGS_DEC PTE_FLAGS
+#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
+
+#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
+
+struct sme_populate_pgd_data {
+ void *pgtable_area;
+ pgd_t *pgd;
+
+ pmdval_t pmd_flags;
+ pteval_t pte_flags;
+ unsigned long paddr;
+
+ unsigned long vaddr;
+ unsigned long vaddr_end;
+};
+
+/*
+ * This work area lives in the .init.scratch section, which lives outside of
+ * the kernel proper. It is sized to hold the intermediate copy buffer and
+ * more than enough pagetable pages.
+ *
+ * By using this section, the kernel can be encrypted in place and it
+ * avoids any possibility of boot parameters or initramfs images being
+ * placed such that the in-place encryption logic overwrites them. This
+ * section is 2MB aligned to allow for simple pagetable setup using only
+ * PMD entries (see vmlinux.lds.S).
+ */
+static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
+
+static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+{
+ unsigned long pgd_start, pgd_end, pgd_size;
+ pgd_t *pgd_p;
+
+ pgd_start = ppd->vaddr & PGDIR_MASK;
+ pgd_end = ppd->vaddr_end & PGDIR_MASK;
+
+ pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
+
+ pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
+
+ memset(pgd_p, 0, pgd_size);
+}
+
+static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = ppd->pgd + pgd_index(ppd->vaddr);
+ if (pgd_none(*pgd)) {
+ p4d = ppd->pgtable_area;
+ memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
+ ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
+ set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
+ }
+
+ p4d = p4d_offset(pgd, ppd->vaddr);
+ if (p4d_none(*p4d)) {
+ pud = ppd->pgtable_area;
+ memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
+ ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
+ set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
+ }
+
+ pud = pud_offset(p4d, ppd->vaddr);
+ if (pud_none(*pud)) {
+ pmd = ppd->pgtable_area;
+ memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
+ ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
+ set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
+ }
+
+ if (pud_leaf(*pud))
+ return NULL;
+
+ return pud;
+}
+
+static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_leaf(*pmd))
+ return;
+
+ set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
+}
+
+static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_none(*pmd)) {
+ pte = ppd->pgtable_area;
+ memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE);
+ ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE;
+ set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
+ }
+
+ if (pmd_leaf(*pmd))
+ return;
+
+ pte = pte_offset_kernel(pmd, ppd->vaddr);
+ if (pte_none(*pte))
+ set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
+}
+
+static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd_large(ppd);
+
+ ppd->vaddr += PMD_SIZE;
+ ppd->paddr += PMD_SIZE;
+ }
+}
+
+static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd(ppd);
+
+ ppd->vaddr += PAGE_SIZE;
+ ppd->paddr += PAGE_SIZE;
+ }
+}
+
+static void __head __sme_map_range(struct sme_populate_pgd_data *ppd,
+ pmdval_t pmd_flags, pteval_t pte_flags)
+{
+ unsigned long vaddr_end;
+
+ ppd->pmd_flags = pmd_flags;
+ ppd->pte_flags = pte_flags;
+
+ /* Save original end value since we modify the struct value */
+ vaddr_end = ppd->vaddr_end;
+
+ /* If start is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE);
+ __sme_map_range_pte(ppd);
+
+ /* Create PMD entries */
+ ppd->vaddr_end = vaddr_end & PMD_MASK;
+ __sme_map_range_pmd(ppd);
+
+ /* If end is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = vaddr_end;
+ __sme_map_range_pte(ppd);
+}
+
+static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
+}
+
+static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
+}
+
+static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
+}
+
+static unsigned long __head sme_pgtable_calc(unsigned long len)
+{
+ unsigned long entries = 0, tables = 0;
+
+ /*
+ * Perform a relatively simplistic calculation of the pagetable
+ * entries that are needed. Those mappings will be covered mostly
+ * by 2MB PMD entries so we can conservatively calculate the required
+ * number of P4D, PUD and PMD structures needed to perform the
+ * mappings. For mappings that are not 2MB aligned, PTE mappings
+ * would be needed for the start and end portion of the address range
+ * that fall outside of the 2MB alignment. This results in, at most,
+ * two extra pages to hold PTE entries for each range that is mapped.
+ * Incrementing the count for each covers the case where the addresses
+ * cross entries.
+ */
+
+ /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */
+ if (PTRS_PER_P4D > 1)
+ entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D;
+ entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD;
+ entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD;
+ entries += 2 * sizeof(pte_t) * PTRS_PER_PTE;
+
+ /*
+ * Now calculate the added pagetable structures needed to populate
+ * the new pagetables.
+ */
+
+ if (PTRS_PER_P4D > 1)
+ tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D;
+ tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD;
+ tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD;
+
+ return entries + tables;
+}
+
+void __head sme_encrypt_kernel(struct boot_params *bp)
+{
+ unsigned long workarea_start, workarea_end, workarea_len;
+ unsigned long execute_start, execute_end, execute_len;
+ unsigned long kernel_start, kernel_end, kernel_len;
+ unsigned long initrd_start, initrd_end, initrd_len;
+ struct sme_populate_pgd_data ppd;
+ unsigned long pgtable_area_len;
+ unsigned long decrypted_base;
+
+ /*
+ * This is early code, use an open coded check for SME instead of
+ * using cc_platform_has(). This eliminates worries about removing
+ * instrumentation or checking boot_cpu_data in the cc_platform_has()
+ * function.
+ */
+ if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
+ return;
+
+ /*
+ * Prepare for encrypting the kernel and initrd by building new
+ * pagetables with the necessary attributes needed to encrypt the
+ * kernel in place.
+ *
+ * One range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as encrypted.
+ *
+ * Another range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as decrypted and write-protected.
+ *
+ * The use of write-protect attribute will prevent any of the
+ * memory from being cached.
+ */
+
+ kernel_start = (unsigned long)rip_rel_ptr(_text);
+ kernel_end = ALIGN((unsigned long)rip_rel_ptr(_end), PMD_SIZE);
+ kernel_len = kernel_end - kernel_start;
+
+ initrd_start = 0;
+ initrd_end = 0;
+ initrd_len = 0;
+#ifdef CONFIG_BLK_DEV_INITRD
+ initrd_len = (unsigned long)bp->hdr.ramdisk_size |
+ ((unsigned long)bp->ext_ramdisk_size << 32);
+ if (initrd_len) {
+ initrd_start = (unsigned long)bp->hdr.ramdisk_image |
+ ((unsigned long)bp->ext_ramdisk_image << 32);
+ initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
+ initrd_len = initrd_end - initrd_start;
+ }
+#endif
+
+ /*
+ * Calculate required number of workarea bytes needed:
+ * executable encryption area size:
+ * stack page (PAGE_SIZE)
+ * encryption routine page (PAGE_SIZE)
+ * intermediate copy buffer (PMD_SIZE)
+ * pagetable structures for the encryption of the kernel
+ * pagetable structures for workarea (in case not currently mapped)
+ */
+ execute_start = workarea_start = (unsigned long)rip_rel_ptr(sme_workarea);
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
+ execute_len = execute_end - execute_start;
+
+ /*
+ * One PGD for both encrypted and decrypted mappings and a set of
+ * PUDs and PMDs for each of the encrypted and decrypted mappings.
+ */
+ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+ pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+ if (initrd_len)
+ pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
+
+ /* PUDs and PMDs needed in the current pagetables for the workarea */
+ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+ /*
+ * The total workarea includes the executable encryption area and
+ * the pagetable area. The start of the workarea is already 2MB
+ * aligned, align the end of the workarea on a 2MB boundary so that
+ * we don't try to create/allocate PTE entries from the workarea
+ * before it is mapped.
+ */
+ workarea_len = execute_len + pgtable_area_len;
+ workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE);
+
+ /*
+ * Set the address to the start of where newly created pagetable
+ * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+ * structures are created when the workarea is added to the current
+ * pagetables and when the new encrypted and decrypted kernel
+ * mappings are populated.
+ */
+ ppd.pgtable_area = (void *)execute_end;
+
+ /*
+ * Make sure the current pagetable structure has entries for
+ * addressing the workarea.
+ */
+ ppd.pgd = (pgd_t *)native_read_cr3_pa();
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+
+ /*
+ * A new pagetable structure is being built to allow for the kernel
+ * and initrd to be encrypted. It starts with an empty PGD that will
+ * then be populated with new PUDs and PMDs as the encrypted and
+ * decrypted kernel mappings are created.
+ */
+ ppd.pgd = ppd.pgtable_area;
+ memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
+ ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
+
+ /*
+ * A different PGD index/entry must be used to get different
+ * pagetable entries for the decrypted mapping. Choose the next
+ * PGD index and convert it to a virtual address to be used as
+ * the base of the mapping.
+ */
+ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+ if (initrd_len) {
+ unsigned long check_base;
+
+ check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
+ decrypted_base = max(decrypted_base, check_base);
+ }
+ decrypted_base <<= PGDIR_SHIFT;
+
+ /* Add encrypted kernel (identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start;
+ ppd.vaddr_end = kernel_end;
+ sme_map_range_encrypted(&ppd);
+
+ /* Add decrypted, write-protected kernel (non-identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+
+ if (initrd_len) {
+ /* Add encrypted initrd (identity) mappings */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start;
+ ppd.vaddr_end = initrd_end;
+ sme_map_range_encrypted(&ppd);
+ /*
+ * Add decrypted, write-protected initrd (non-identity) mappings
+ */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+ }
+
+ /* Add decrypted workarea mappings to both kernel mappings */
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_map_range_decrypted(&ppd);
+
+ /* Perform the encryption */
+ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+ kernel_len, workarea_start, (unsigned long)ppd.pgd);
+
+ if (initrd_len)
+ sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
+ initrd_len, workarea_start,
+ (unsigned long)ppd.pgd);
+
+ /*
+ * At this point we are running encrypted. Remove the mappings for
+ * the decrypted areas - all that is needed for this is to remove
+ * the PGD entry/entries.
+ */
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ if (initrd_len) {
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+ }
+
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+}
+
+void __head sme_enable(struct boot_params *bp)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
+ unsigned long me_mask;
+ bool snp_en;
+ u64 msr;
+
+ snp_en = snp_init(bp);
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+#define AMD_SME_BIT BIT(0)
+#define AMD_SEV_BIT BIT(1)
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ /* Check whether SEV or SME is supported */
+ if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT)))
+ return;
+
+ me_mask = 1UL << (ebx & 0x3f);
+
+ /* Check the SEV MSR whether SEV or SME is enabled */
+ sev_status = msr = native_rdmsrq(MSR_AMD64_SEV);
+ feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /*
+ * Any discrepancies between the presence of a CC blob and SNP
+ * enablement abort the guest.
+ */
+ if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED))
+ snp_abort();
+
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION))
+ return;
+
+ /*
+ * No SME if Hypervisor bit is set. This check is here to
+ * prevent a guest from trying to enable SME. For running as a
+ * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there
+ * might be other hypervisors which emulate that MSR as non-zero
+ * or even pass it through to the guest.
+ * A malicious hypervisor can still trick a guest into this
+ * path, but there is no way to protect against that.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (ecx & BIT(31))
+ return;
+
+ /* For SME, check the SYSCFG MSR */
+ msr = native_rdmsrq(MSR_AMD64_SYSCFG);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ return;
+ }
+
+ sme_me_mask = me_mask;
+ physical_mask &= ~me_mask;
+ cc_vendor = CC_VENDOR_AMD;
+ cc_set_mask(me_mask);
+}
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+/* Local version for startup code, which never operates on user page tables */
+__weak
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 84f7a883ce1e..f35369bb14c5 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -32,7 +32,7 @@
int memcmp(const void *s1, const void *s2, size_t len)
{
bool diff;
- asm("repe; cmpsb" CC_SET(nz)
+ asm("repe cmpsb" CC_SET(nz)
: CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index f2e96905b3fe..0641c8c46aee 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -292,7 +292,7 @@ static void restore_screen(void)
"shrw %%cx ; "
"jnc 1f ; "
"stosw \n\t"
- "1: rep;stosl ; "
+ "1: rep stosl ; "
"popw %%es"
: "+D" (dst), "+c" (npad)
: "bdS" (video_segment),