summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/arch/x86/tdx.rst14
-rw-r--r--MAINTAINERS11
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/kexec.h12
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/tdx.h35
-rw-r--r--arch/x86/kernel/cpu/amd.c17
-rw-r--r--arch/x86/kernel/machine_kexec_64.c44
-rw-r--r--arch/x86/kernel/process.c24
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S36
-rw-r--r--arch/x86/kvm/vmx/tdx.c44
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c80
12 files changed, 214 insertions, 106 deletions
diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst
index 719043cd8b46..61670e7df2f7 100644
--- a/Documentation/arch/x86/tdx.rst
+++ b/Documentation/arch/x86/tdx.rst
@@ -142,13 +142,6 @@ but depends on the BIOS to behave correctly.
Note TDX works with CPU logical online/offline, thus the kernel still
allows to offline logical CPU and online it again.
-Kexec()
-~~~~~~~
-
-TDX host support currently lacks the ability to handle kexec. For
-simplicity only one of them can be enabled in the Kconfig. This will be
-fixed in the future.
-
Erratum
~~~~~~~
@@ -171,6 +164,13 @@ If the platform has such erratum, the kernel prints additional message in
machine check handler to tell user the machine check may be caused by
kernel bug on TDX private memory.
+Kexec
+~~~~~~~
+
+Currently kexec doesn't work on the TDX platforms with the aforementioned
+erratum. It fails when loading the kexec kernel image. Otherwise it
+works normally.
+
Interaction vs S3 and deeper states
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/MAINTAINERS b/MAINTAINERS
index 9b3db4f797c6..3b192f9a4785 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27723,17 +27723,14 @@ F: arch/x86/kernel/unwind_*.c
X86 TRUST DOMAIN EXTENSIONS (TDX)
M: Kirill A. Shutemov <kas@kernel.org>
R: Dave Hansen <dave.hansen@linux.intel.com>
+R: Rick Edgecombe <rick.p.edgecombe@intel.com>
L: x86@kernel.org
L: linux-coco@lists.linux.dev
+L: kvm@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/tdx
-F: Documentation/ABI/testing/sysfs-devices-virtual-misc-tdx_guest
-F: arch/x86/boot/compressed/tdx*
-F: arch/x86/coco/tdx/
-F: arch/x86/include/asm/shared/tdx.h
-F: arch/x86/include/asm/tdx.h
-F: arch/x86/virt/vmx/tdx/
-F: drivers/virt/coco/tdx-guest
+N: tdx
+K: \b(tdx)
X86 VDSO
M: Andy Lutomirski <luto@kernel.org>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e3cccf4256ca..9d034a987c6e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1902,7 +1902,6 @@ config INTEL_TDX_HOST
depends on X86_X2APIC
select ARCH_KEEP_MEMBLOCK
depends on CONTIG_ALLOC
- depends on !KEXEC_CORE
depends on X86_MCE
help
Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index f2ad77929d6e..5cfb27f26583 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -13,6 +13,15 @@
# define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */
#endif
+#ifdef CONFIG_X86_64
+
+#include <linux/bits.h>
+
+#define RELOC_KERNEL_PRESERVE_CONTEXT BIT(0)
+#define RELOC_KERNEL_CACHE_INCOHERENT BIT(1)
+
+#endif
+
# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
@@ -121,8 +130,7 @@ typedef unsigned long
relocate_kernel_fn(unsigned long indirection_page,
unsigned long pa_control_page,
unsigned long start_address,
- unsigned int preserve_context,
- unsigned int host_mem_enc_active);
+ unsigned int flags);
#endif
extern relocate_kernel_fn relocate_kernel;
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index bde58f6510ac..a24c7805acdb 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -731,6 +731,8 @@ void __noreturn stop_this_cpu(void *dummy);
void microcode_check(struct cpuinfo_x86 *prev_info);
void store_cpu_caps(struct cpuinfo_x86 *info);
+DECLARE_PER_CPU(bool, cache_state_incoherent);
+
enum l1tf_mitigations {
L1TF_MITIGATION_OFF,
L1TF_MITIGATION_AUTO,
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 7ddef3a69866..6b338d7f01b7 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -102,10 +102,31 @@ u64 __seamcall_ret(u64 fn, struct tdx_module_args *args);
u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args);
void tdx_init(void);
+#include <linux/preempt.h>
#include <asm/archrandom.h>
+#include <asm/processor.h>
typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
+static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn,
+ struct tdx_module_args *args)
+{
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * SEAMCALLs are made to the TDX module and can generate dirty
+ * cachelines of TDX private memory. Mark cache state incoherent
+ * so that the cache can be flushed during kexec.
+ *
+ * This needs to be done before actually making the SEAMCALL,
+ * because kexec-ing CPU could send NMI to stop remote CPUs,
+ * in which case even disabling IRQ won't help here.
+ */
+ this_cpu_write(cache_state_incoherent, true);
+
+ return func(fn, args);
+}
+
static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
struct tdx_module_args *args)
{
@@ -113,7 +134,9 @@ static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
u64 ret;
do {
- ret = func(fn, args);
+ preempt_disable();
+ ret = __seamcall_dirty_cache(func, fn, args);
+ preempt_enable();
} while (ret == TDX_RND_NO_ENTROPY && --retry);
return ret;
@@ -131,6 +154,8 @@ int tdx_guest_keyid_alloc(void);
u32 tdx_get_nr_guest_keyids(void);
void tdx_guest_keyid_free(unsigned int keyid);
+void tdx_quirk_reset_page(struct page *page);
+
struct tdx_td {
/* TD root structure: */
struct page *tdr_page;
@@ -146,6 +171,8 @@ struct tdx_td {
struct tdx_vp {
/* TDVP root page */
struct page *tdvpr_page;
+ /* precalculated page_to_phys(tdvpr_page) for use in noinstr code */
+ phys_addr_t tdvpr_pa;
/* TD vCPU control structure: */
struct page **tdcx_pages;
@@ -203,5 +230,11 @@ static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
#endif /* CONFIG_INTEL_TDX_HOST */
+#ifdef CONFIG_KEXEC_CORE
+void tdx_cpu_flush_cache_for_kexec(void);
+#else
+static inline void tdx_cpu_flush_cache_for_kexec(void) { }
+#endif
+
#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a6f88ca1a6b4..5398db4dedb4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -546,6 +546,23 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
u64 msr;
/*
+ * Mark using WBINVD is needed during kexec on processors that
+ * support SME. This provides support for performing a successful
+ * kexec when going from SME inactive to SME active (or vice-versa).
+ *
+ * The cache must be cleared so that if there are entries with the
+ * same physical address, both with and without the encryption bit,
+ * they don't race each other when flushed and potentially end up
+ * with the wrong entry being committed to memory.
+ *
+ * Test the CPUID bit directly because with mem_encrypt=off the
+ * BSP will clear the X86_FEATURE_SME bit and the APs will not
+ * see it set after that.
+ */
+ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+ __this_cpu_write(cache_state_incoherent, true);
+
+ /*
* BIOS support is required for SME and SEV.
* For SME: If BIOS has enabled SME then adjust x86_phys_bits by
* the SME physical address space reduction value.
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 697fb99406e6..15088d14904f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -29,6 +29,7 @@
#include <asm/set_memory.h>
#include <asm/cpu.h>
#include <asm/efi.h>
+#include <asm/processor.h>
#ifdef CONFIG_ACPI
/*
@@ -346,6 +347,22 @@ int machine_kexec_prepare(struct kimage *image)
unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
int result;
+ /*
+ * Some early TDX-capable platforms have an erratum. A kernel
+ * partial write (a write transaction of less than cacheline
+ * lands at memory controller) to TDX private memory poisons that
+ * memory, and a subsequent read triggers a machine check.
+ *
+ * On those platforms the old kernel must reset TDX private
+ * memory before jumping to the new kernel otherwise the new
+ * kernel may see unexpected machine check. For simplicity
+ * just fail kexec/kdump on those platforms.
+ */
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) {
+ pr_info_once("Not allowed on platform with tdx_pw_mce bug\n");
+ return -EOPNOTSUPP;
+ }
+
/* Setup the identity mapped 64bit page table */
result = init_pgtable(image, __pa(control_page));
if (result)
@@ -384,16 +401,10 @@ void __nocfi machine_kexec(struct kimage *image)
{
unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
relocate_kernel_fn *relocate_kernel_ptr;
- unsigned int host_mem_enc_active;
+ unsigned int relocate_kernel_flags;
int save_ftrace_enabled;
void *control_page;
- /*
- * This must be done before load_segments() since if call depth tracking
- * is used then GS must be valid to make any function calls.
- */
- host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT);
-
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
save_processor_state();
@@ -427,6 +438,17 @@ void __nocfi machine_kexec(struct kimage *image)
*/
relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start;
+ relocate_kernel_flags = 0;
+ if (image->preserve_context)
+ relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
+
+ /*
+ * This must be done before load_segments() since it resets
+ * GS to 0 and percpu data needs the correct GS to work.
+ */
+ if (this_cpu_read(cache_state_incoherent))
+ relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT;
+
/*
* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
@@ -436,6 +458,11 @@ void __nocfi machine_kexec(struct kimage *image)
*
* Take advantage of this here by force loading the segments,
* before the GDT is zapped with an invalid value.
+ *
+ * load_segments() resets GS to 0. Don't make any function call
+ * after here since call depth tracking uses percpu variables to
+ * operate (relocate_kernel() is explicitly ignored by call depth
+ * tracking).
*/
load_segments();
@@ -443,8 +470,7 @@ void __nocfi machine_kexec(struct kimage *image)
image->start = relocate_kernel_ptr((unsigned long)image->head,
virt_to_phys(control_page),
image->start,
- image->preserve_context,
- host_mem_enc_active);
+ relocate_kernel_flags);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e3a3987b0c4f..4c718f8adc59 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -89,6 +89,16 @@ DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
/*
+ * The cache may be in an incoherent state and needs flushing during kexec.
+ * E.g., on SME/TDX platforms, dirty cacheline aliases with and without
+ * encryption bit(s) can coexist and the cache needs to be flushed before
+ * booting to the new kernel to avoid the silent memory corruption due to
+ * dirty cachelines with different encryption property being written back
+ * to the memory.
+ */
+DEFINE_PER_CPU(bool, cache_state_incoherent);
+
+/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
*/
@@ -827,19 +837,7 @@ void __noreturn stop_this_cpu(void *dummy)
disable_local_APIC();
mcheck_cpu_clear(c);
- /*
- * Use wbinvd on processors that support SME. This provides support
- * for performing a successful kexec when going from SME inactive
- * to SME active (or vice-versa). The cache must be cleared so that
- * if there are entries with the same physical address, both with and
- * without the encryption bit, they don't race each other when flushed
- * and potentially end up with the wrong entry being committed to
- * memory.
- *
- * Test the CPUID bit directly because the machine might've cleared
- * X86_FEATURE_SME due to cmdline options.
- */
- if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+ if (this_cpu_read(cache_state_incoherent))
wbinvd();
/*
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index ea604f4d0b52..11e20bb13aca 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -66,8 +66,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
* %rdi indirection_page
* %rsi pa_control_page
* %rdx start address
- * %rcx preserve_context
- * %r8 host_mem_enc_active
+ * %rcx flags: RELOC_KERNEL_*
*/
/* Save the CPU context, used for jumping back */
@@ -111,7 +110,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
/* save indirection list for jumping back */
movq %rdi, pa_backup_pages_map(%rip)
- /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
+ /* Save the flags to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
@@ -129,9 +128,8 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/*
* %rdi indirection page
* %rdx start address
- * %r8 host_mem_enc_active
* %r9 page table page
- * %r11 preserve_context
+ * %r11 flags: RELOC_KERNEL_*
* %r13 original CR4 when relocate_kernel() was invoked
*/
@@ -200,14 +198,21 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %r9, %cr3
/*
+ * If the memory cache is in incoherent state, e.g., due to
+ * memory encryption, do WBINVD to flush cache.
+ *
* If SME is active, there could be old encrypted cache line
* entries that will conflict with the now unencrypted memory
* used by kexec. Flush the caches before copying the kernel.
+ *
+ * Note SME sets this flag to true when the platform supports
+ * SME, so the WBINVD is performed even SME is not activated
+ * by the kernel. But this has no harm.
*/
- testq %r8, %r8
- jz .Lsme_off
+ testb $RELOC_KERNEL_CACHE_INCOHERENT, %r11b
+ jz .Lnowbinvd
wbinvd
-.Lsme_off:
+.Lnowbinvd:
call swap_pages
@@ -220,7 +225,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %cr3, %rax
movq %rax, %cr3
- testq %r11, %r11 /* preserve_context */
+ testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
jnz .Lrelocate
/*
@@ -273,7 +278,13 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
ANNOTATE_NOENDBR
andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
- movl $1, %r11d /* Ensure preserve_context flag is set */
+ /*
+ * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
+ * swap_pages() can swap pages correctly. Note all other
+ * RELOC_KERNEL_* flags passed to relocate_kernel() are not
+ * restored.
+ */
+ movl $RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
call swap_pages
movq kexec_va_control_page(%rip), %rax
0: addq $virtual_mapped - 0b, %rax
@@ -321,7 +332,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
UNWIND_HINT_END_OF_STACK
/*
* %rdi indirection page
- * %r11 preserve_context
+ * %r11 flags: RELOC_KERNEL_*
*/
movq %rdi, %rcx /* Put the indirection_page in %rcx */
xorl %edi, %edi
@@ -357,7 +368,8 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movq %rdi, %rdx /* Save destination page to %rdx */
movq %rsi, %rax /* Save source page to %rax */
- testq %r11, %r11 /* Only actually swap for ::preserve_context */
+ /* Only actually swap for ::preserve_context */
+ testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
jz .Lnoswap
/* copy source page to swap page */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ca9c8ec7dd01..00f8bfd2330d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -281,25 +281,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
vcpu->cpu = -1;
}
-static void tdx_clear_page(struct page *page)
-{
- const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
- void *dest = page_to_virt(page);
- unsigned long i;
-
- /*
- * The page could have been poisoned. MOVDIR64B also clears
- * the poison bit so the kernel can safely use the page again.
- */
- for (i = 0; i < PAGE_SIZE; i += 64)
- movdir64b(dest + i, zero_page);
- /*
- * MOVDIR64B store uses WC buffer. Prevent following memory reads
- * from seeing potentially poisoned cache.
- */
- __mb();
-}
-
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -345,7 +326,7 @@ static int tdx_reclaim_page(struct page *page)
r = __tdx_reclaim_page(page);
if (!r)
- tdx_clear_page(page);
+ tdx_quirk_reset_page(page);
return r;
}
@@ -442,6 +423,16 @@ void tdx_disable_virtualization_cpu(void)
tdx_flush_vp(&arg);
}
local_irq_restore(flags);
+
+ /*
+ * Flush cache now if kexec is possible: this is necessary to avoid
+ * having dirty private memory cachelines when the new kernel boots,
+ * but WBINVD is a relatively expensive operation and doing it during
+ * kexec can exacerbate races in native_stop_other_cpus(). Do it
+ * now, since this is a safe moment and there is going to be no more
+ * TDX activity on this CPU from this point on.
+ */
+ tdx_cpu_flush_cache_for_kexec();
}
#define TDX_SEAMCALL_RETRIES 10000
@@ -593,7 +584,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
return;
}
- tdx_clear_page(kvm_tdx->td.tdr_page);
+ tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
__free_page(kvm_tdx->td.tdr_page);
kvm_tdx->td.tdr_page = NULL;
@@ -861,6 +852,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
if (tdx->vp.tdvpr_page) {
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_pa = 0;
}
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
@@ -1714,7 +1706,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
return -EIO;
}
- tdx_clear_page(page);
+ tdx_quirk_reset_page(page);
tdx_unpin(kvm, page);
return 0;
}
@@ -2940,6 +2932,13 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
return -ENOMEM;
tdx->vp.tdvpr_page = page;
+ /*
+ * page_to_phys() does not work in 'noinstr' code, like guest
+ * entry via tdh_vp_enter(). Precalculate and store it instead
+ * of doing it at runtime later.
+ */
+ tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
+
tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
GFP_KERNEL);
if (!tdx->vp.tdcx_pages) {
@@ -3002,6 +3001,7 @@ free_tdvpr:
if (tdx->vp.tdvpr_page)
__free_page(tdx->vp.tdvpr_page);
tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_pa = 0;
return ret;
}
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c7a9a087ccaf..eac403248462 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -633,15 +633,19 @@ err:
}
/*
- * Convert TDX private pages back to normal by using MOVDIR64B to
- * clear these pages. Note this function doesn't flush cache of
- * these TDX private pages. The caller should make sure of that.
+ * Convert TDX private pages back to normal by using MOVDIR64B to clear these
+ * pages. Typically, any write to the page will convert it from TDX private back
+ * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to
+ * do the conversion explicitly via MOVDIR64B.
*/
-static void reset_tdx_pages(unsigned long base, unsigned long size)
+static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
{
const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
unsigned long phys, end;
+ if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return;
+
end = base + size;
for (phys = base; phys < end; phys += 64)
movdir64b(__va(phys), zero_page);
@@ -654,17 +658,23 @@ static void reset_tdx_pages(unsigned long base, unsigned long size)
mb();
}
-static void tdmr_reset_pamt(struct tdmr_info *tdmr)
+void tdx_quirk_reset_page(struct page *page)
+{
+ tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
+}
+EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);
+
+static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
{
- tdmr_do_pamt_func(tdmr, reset_tdx_pages);
+ tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr);
}
-static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
+static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
int i;
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
- tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
+ tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i));
}
static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
@@ -1136,15 +1146,7 @@ err_reset_pamts:
* to the kernel.
*/
wbinvd_on_all_cpus();
- /*
- * According to the TDX hardware spec, if the platform
- * doesn't have the "partial write machine check"
- * erratum, any kernel read/write will never cause #MC
- * in kernel space, thus it's OK to not convert PAMTs
- * back to normal. But do the conversion anyway here
- * as suggested by the TDX spec.
- */
- tdmrs_reset_pamt_all(&tdx_tdmr_list);
+ tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
@@ -1266,7 +1268,7 @@ static bool paddr_is_tdx_private(unsigned long phys)
return false;
/* Get page type from the TDX module */
- sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
+ sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args);
/*
* The SEAMCALL will not return success unless there is a
@@ -1502,11 +1504,6 @@ static inline u64 tdx_tdr_pa(struct tdx_td *td)
return page_to_phys(td->tdr_page);
}
-static inline u64 tdx_tdvpr_pa(struct tdx_vp *td)
-{
- return page_to_phys(td->tdvpr_page);
-}
-
/*
* The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
* a CLFLUSH of pages is required before handing them to the TDX module.
@@ -1518,11 +1515,11 @@ static void tdx_clflush_page(struct page *page)
clflush_cache_range(page_to_virt(page), PAGE_SIZE);
}
-noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
+noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
{
- args->rcx = tdx_tdvpr_pa(td);
+ args->rcx = td->tdvpr_pa;
- return __seamcall_saved_ret(TDH_VP_ENTER, args);
+ return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
}
EXPORT_SYMBOL_GPL(tdh_vp_enter);
@@ -1581,7 +1578,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
{
struct tdx_module_args args = {
.rcx = page_to_phys(tdcx_page),
- .rdx = tdx_tdvpr_pa(vp),
+ .rdx = vp->tdvpr_pa,
};
tdx_clflush_page(tdcx_page);
@@ -1650,7 +1647,7 @@ EXPORT_SYMBOL_GPL(tdh_mng_create);
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
.rdx = tdx_tdr_pa(td),
};
@@ -1706,7 +1703,7 @@ EXPORT_SYMBOL_GPL(tdh_mr_finalize);
u64 tdh_vp_flush(struct tdx_vp *vp)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
};
return seamcall(TDH_VP_FLUSH, &args);
@@ -1752,7 +1749,7 @@ EXPORT_SYMBOL_GPL(tdh_mng_init);
u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
.rdx = field,
};
u64 ret;
@@ -1769,7 +1766,7 @@ EXPORT_SYMBOL_GPL(tdh_vp_rd);
u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
.rdx = field,
.r8 = data,
.r9 = mask,
@@ -1782,7 +1779,7 @@ EXPORT_SYMBOL_GPL(tdh_vp_wr);
u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
.rdx = initial_rcx,
.r8 = x2apicid,
};
@@ -1870,3 +1867,22 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
}
EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
+
+#ifdef CONFIG_KEXEC_CORE
+void tdx_cpu_flush_cache_for_kexec(void)
+{
+ lockdep_assert_preemption_disabled();
+
+ if (!this_cpu_read(cache_state_incoherent))
+ return;
+
+ /*
+ * Private memory cachelines need to be clean at the time of
+ * kexec. Write them back now, as the caller promises that
+ * there should be no more SEAMCALLs on this CPU.
+ */
+ wbinvd();
+ this_cpu_write(cache_state_incoherent, false);
+}
+EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec);
+#endif