summaryrefslogtreecommitdiff
path: root/arch/x86/include
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/include')
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h43
-rw-r--r--arch/x86/include/asm/msr.h4
-rw-r--r--arch/x86/include/asm/pgtable.h21
-rw-r--r--arch/x86/include/asm/posted_intr.h83
-rw-r--r--arch/x86/include/asm/set_memory.h2
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/shared/tdx.h10
-rw-r--r--arch/x86/include/asm/svm.h10
-rw-r--r--arch/x86/include/asm/syscall.h43
-rw-r--r--arch/x86/include/asm/tdx.h77
-rw-r--r--arch/x86/include/asm/tdx_global_metadata.h44
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/include/uapi/asm/kvm.h72
-rw-r--r--arch/x86/include/uapi/asm/setup_data.h13
-rw-r--r--arch/x86/include/uapi/asm/svm.h2
-rw-r--r--arch/x86/include/uapi/asm/vmx.h5
18 files changed, 398 insertions, 42 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 5b50e0e35129..ee176236c2be 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -336,6 +336,7 @@
#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */
+#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/
#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */
#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */
@@ -378,6 +379,7 @@
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */
+#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */
#define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@@ -446,6 +448,7 @@
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */
#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
+#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */
@@ -457,6 +460,7 @@
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
+#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */
#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 823c0434bbad..8d50e3e0a19b 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
KVM_X86_OP(vm_init)
KVM_X86_OP_OPTIONAL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_pre_destroy)
KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
KVM_X86_OP(vcpu_create)
KVM_X86_OP(vcpu_free)
@@ -115,6 +116,7 @@ KVM_X86_OP_OPTIONAL(pi_start_assignment)
KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_OPTIONAL(protected_apic_has_interrupt)
KVM_X86_OP_OPTIONAL(set_hv_timer)
KVM_X86_OP_OPTIONAL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
@@ -126,6 +128,7 @@ KVM_X86_OP(enable_smi_window)
#endif
KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9c971f846108..b4a391929cdb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -126,7 +126,8 @@
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
+#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \
+ KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -412,7 +413,6 @@ struct kvm_rmap_head {
};
struct kvm_pio_request {
- unsigned long linear_rip;
unsigned long count;
int in;
int port;
@@ -609,8 +609,15 @@ struct kvm_pmu {
struct kvm_pmu_ops;
enum {
- KVM_DEBUGREG_BP_ENABLED = 1,
- KVM_DEBUGREG_WONT_EXIT = 2,
+ KVM_DEBUGREG_BP_ENABLED = BIT(0),
+ KVM_DEBUGREG_WONT_EXIT = BIT(1),
+ /*
+ * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by
+ * hardware on exit from or enter to guest. KVM needn't switch them.
+ * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM
+ * exit, host values need to be restored.
+ */
+ KVM_DEBUGREG_AUTO_SWITCH = BIT(2),
};
struct kvm_mtrr {
@@ -911,6 +918,7 @@ struct kvm_vcpu_arch {
bool emulate_regs_need_sync_to_vcpu;
bool emulate_regs_need_sync_from_vcpu;
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
+ unsigned long cui_linear_rip;
gpa_t time;
s8 pvclock_tsc_shift;
@@ -1028,6 +1036,7 @@ struct kvm_vcpu_arch {
int pending_ioapic_eoi;
int pending_external_vector;
+ int highest_stale_pending_ioapic_eoi;
/* be preempted when it's in kernel-mode(cpl=0) */
bool preempted_in_kernel;
@@ -1571,6 +1580,13 @@ struct kvm_arch {
struct kvm_mmu_memory_cache split_desc_cache;
gfn_t gfn_direct_bits;
+
+ /*
+ * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A Zero
+ * value indicates CPU dirty logging is unsupported or disabled in
+ * current VM.
+ */
+ int cpu_dirty_log_size;
};
struct kvm_vm_stat {
@@ -1674,6 +1690,7 @@ struct kvm_x86_ops {
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
+ void (*vm_pre_destroy)(struct kvm *kvm);
/* Create, but do not attach this VCPU */
int (*vcpu_precreate)(struct kvm *kvm);
@@ -1823,11 +1840,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- /*
- * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
- * value indicates CPU dirty logging is unsupported or disabled.
- */
- int cpu_dirty_log_size;
void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
const struct kvm_x86_nested_ops *nested_ops;
@@ -1841,6 +1853,7 @@ struct kvm_x86_ops {
void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
+ bool (*protected_apic_has_interrupt)(struct kvm_vcpu *vcpu);
int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
bool *expired);
@@ -1857,6 +1870,7 @@ struct kvm_x86_ops {
int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
+ int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
@@ -1930,6 +1944,7 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
+extern bool __read_mostly enable_device_posted_irqs;
extern struct kvm_x86_ops kvm_x86_ops;
#define kvm_x86_call(func) static_call(kvm_x86_##func)
@@ -2333,6 +2348,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
int kvm_add_user_return_msr(u32 msr);
int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
+void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
static inline bool kvm_is_supported_user_return_msr(u32 msr)
{
@@ -2416,7 +2432,12 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
KVM_X86_QUIRK_SLOT_ZAP_ALL | \
- KVM_X86_QUIRK_STUFF_FEATURE_MSRS)
+ KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT)
+
+#define KVM_X86_CONDITIONAL_QUIRKS \
+ (KVM_X86_QUIRK_CD_NW_CLEARED | \
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT)
/*
* KVM previously used a u32 field in kvm_run to indicate the hypercall was
@@ -2427,7 +2448,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
static inline bool kvm_arch_has_irq_bypass(void)
{
- return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
+ return enable_device_posted_irqs;
}
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 4096b8af4ba7..9c2ea29e12a9 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -228,7 +228,7 @@ static __always_inline u64 rdpmc(int counter)
#endif /* !CONFIG_PARAVIRT_XXL */
/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
-#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
+#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */
static __always_inline void wrmsrns(u32 msr, u64 val)
@@ -237,7 +237,7 @@ static __always_inline void wrmsrns(u32 msr, u64 val)
* WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant
* DS prefix to avoid a trailing NOP.
*/
- asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS)
+ asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS)
"2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
: : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
}
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5ddba366d3b4..774430c3abff 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -777,6 +777,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot)
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ /* This bit combination is used to mark shadow stacks */
+ WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) ==
+ _PAGE_DIRTY);
pfn ^= protnone_mask(pgprot_val(pgprot));
pfn &= PTE_PFN_MASK;
return __pte(pfn | check_pgprot(pgprot));
@@ -1073,22 +1076,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
*/
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * (Currently stuck as a macro because of indirect forward reference
- * to linux/mm.h:page_to_nid())
- */
-#define mk_pte(page, pgprot) \
-({ \
- pgprot_t __pgprot = pgprot; \
- \
- WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
- _PAGE_DIRTY); \
- pfn_pte(page_to_pfn(page), __pgprot); \
-})
-
static inline int pmd_bad(pmd_t pmd)
{
return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
@@ -1353,8 +1340,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
-#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
-
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
index de788b400fba..a5f761fbf45b 100644
--- a/arch/x86/include/asm/posted_intr.h
+++ b/arch/x86/include/asm/posted_intr.h
@@ -1,19 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_POSTED_INTR_H
#define _X86_POSTED_INTR_H
+
+#include <asm/cmpxchg.h>
+#include <asm/rwonce.h>
#include <asm/irq_vectors.h>
+#include <linux/bitmap.h>
+
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
#define PID_TABLE_ENTRY_VALID 1
+#define NR_PIR_VECTORS 256
+#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG)
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
- union {
- u32 pir[8]; /* Posted interrupt requested */
- u64 pir64[4];
- };
+ unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */
union {
struct {
u16 notifications; /* Suppress and outstanding bits */
@@ -26,6 +31,65 @@ struct pi_desc {
u32 rsvd[6];
} __aligned(64);
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ * accessed by both CPU and IOMMU.
+ * 2.During software processing of posted interrupts, the CPU needs to do
+ * natural width read and xchg for checking and clearing posted interrupt
+ * request (PIR), a 256 bit field within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ * line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
+ * cache line. The cache line states after each operation are as follows,
+ * assuming a 64-bit kernel:
+ * CPU IOMMU PID Cache line state
+ * ---------------------------------------------------------------
+ *...read64 exclusive
+ *...lock xchg64 modified
+ *... post/atomic swap invalid
+ *...-------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * when processing posted interrupts in software, e.g. to dispatch interrupt
+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
+ * in KVM.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ * read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ * read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool pi_harvest_pir(unsigned long *pir,
+ unsigned long *pir_vals)
+{
+ unsigned long pending = 0;
+ int i;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ pir_vals[i] = READ_ONCE(pir[i]);
+ pending |= pir_vals[i];
+ }
+
+ if (!pending)
+ return false;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ if (!pir_vals[i])
+ continue;
+
+ pir_vals[i] = arch_xchg(&pir[i], 0);
+ }
+
+ return true;
+}
+
static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
@@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+ return test_and_set_bit(vector, pi_desc->pir);
}
static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+ return bitmap_empty(pi_desc->pir, NR_VECTORS);
}
static inline void pi_set_sn(struct pi_desc *pi_desc)
@@ -81,6 +145,11 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
}
+static inline bool pi_test_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
/* Non-atomic helpers */
static inline void __pi_set_sn(struct pi_desc *pi_desc)
{
@@ -105,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector)
if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
return false;
- return test_bit(vector, (unsigned long *)pid->pir);
+ return test_bit(vector, pid->pir);
}
extern void intel_posted_msi_init(void);
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 8d9f1c9aaa4c..61f56cdaccb5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -4,6 +4,7 @@
#include <asm/page.h>
#include <asm-generic/set_memory.h>
+#include <asm/pgtable.h>
#define set_memory_rox set_memory_rox
int set_memory_rox(unsigned long addr, int numpages);
@@ -37,6 +38,7 @@ int set_memory_rox(unsigned long addr, int numpages);
* The caller is required to take care of these.
*/
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
int _set_memory_uc(unsigned long addr, int numpages);
int _set_memory_wc(unsigned long addr, int numpages);
int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 6324f4c6c545..692af46603a1 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -68,6 +68,8 @@ extern void x86_ce4100_early_setup(void);
static inline void x86_ce4100_early_setup(void) { }
#endif
+#include <linux/kexec_handover.h>
+
#ifndef _SETUP
#include <asm/espfix.h>
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index a28ff6b14145..2f3820342598 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -13,6 +13,7 @@
/* TDX module Call Leaf IDs */
#define TDG_VP_VMCALL 0
#define TDG_VP_INFO 1
+#define TDG_MR_RTMR_EXTEND 2
#define TDG_VP_VEINFO_GET 3
#define TDG_MR_REPORT 4
#define TDG_MEM_PAGE_ACCEPT 6
@@ -67,11 +68,18 @@
#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT)
/* TDX hypercall Leaf IDs */
+#define TDVMCALL_GET_TD_VM_CALL_INFO 0x10000
#define TDVMCALL_MAP_GPA 0x10001
#define TDVMCALL_GET_QUOTE 0x10002
#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
-#define TDVMCALL_STATUS_RETRY 1
+/*
+ * TDG.VP.VMCALL Status Codes (returned in R10)
+ */
+#define TDVMCALL_STATUS_SUCCESS 0x0000000000000000ULL
+#define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL
+#define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL
+#define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL
/*
* Bitmasks of exposed registers (with VMM).
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 9b7fa99ae951..ad954a1a6656 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -116,6 +116,7 @@ enum {
INTERCEPT_INVPCID,
INTERCEPT_MCOMMIT,
INTERCEPT_TLBSYNC,
+ INTERCEPT_BUSLOCK,
INTERCEPT_IDLE_HLT = 166,
};
@@ -159,7 +160,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 avic_physical_id; /* Offset 0xf8 */
u8 reserved_7[8];
u64 vmsa_pa; /* Used for an SEV-ES guest */
- u8 reserved_8[720];
+ u8 reserved_8[16];
+ u16 bus_lock_counter; /* Offset 0x120 */
+ u8 reserved_9[22];
+ u64 allowed_sev_features; /* Offset 0x138 */
+ u64 guest_sev_features; /* Offset 0x140 */
+ u8 reserved_10[664];
/*
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
@@ -291,6 +297,8 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63)
+
struct vmcb_seg {
u16 selector;
u16 attrib;
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 7c488ff0c764..c10dbb74cd00 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -38,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
return regs->orig_ax;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->orig_ax = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -90,6 +97,18 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[5] = regs->bp;
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ regs->bx = args[0];
+ regs->cx = args[1];
+ regs->dx = args[2];
+ regs->si = args[3];
+ regs->di = args[4];
+ regs->bp = args[5];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_I386;
@@ -121,6 +140,30 @@ static inline void syscall_get_arguments(struct task_struct *task,
}
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task->thread_info.status & TS_COMPAT) {
+ regs->bx = *args++;
+ regs->cx = *args++;
+ regs->dx = *args++;
+ regs->si = *args++;
+ regs->di = *args++;
+ regs->bp = *args;
+ } else
+# endif
+ {
+ regs->di = *args++;
+ regs->si = *args++;
+ regs->dx = *args++;
+ regs->r10 = *args++;
+ regs->r8 = *args++;
+ regs->r9 = *args;
+ }
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
/* x32 tasks should be considered AUDIT_ARCH_X86_64. */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 4a1922ec80cf..8b19294600c4 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -5,6 +5,7 @@
#include <linux/init.h>
#include <linux/bits.h>
+#include <linux/mmzone.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
@@ -18,6 +19,7 @@
* TDX module.
*/
#define TDX_ERROR _BITUL(63)
+#define TDX_NON_RECOVERABLE _BITUL(62)
#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40))
#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000))
@@ -33,6 +35,8 @@
#ifndef __ASSEMBLER__
#include <uapi/asm/mce.h>
+#include <asm/tdx_global_metadata.h>
+#include <linux/pgtable.h>
/*
* Used by the #VE exception handler to gather the #VE exception
@@ -64,6 +68,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs);
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
+int tdx_mcall_extend_rtmr(u8 index, u8 *data);
+
u64 tdx_hcall_get_quote(u8 *buf, size_t size);
void __init tdx_dump_attributes(u64 td_attr);
@@ -119,11 +125,82 @@ static inline u64 sc_retry(sc_func_t func, u64 fn,
int tdx_cpu_enable(void);
int tdx_enable(void);
const char *tdx_dump_mce_info(struct mce *m);
+const struct tdx_sys_info *tdx_get_sysinfo(void);
+
+int tdx_guest_keyid_alloc(void);
+u32 tdx_get_nr_guest_keyids(void);
+void tdx_guest_keyid_free(unsigned int keyid);
+
+struct tdx_td {
+ /* TD root structure: */
+ struct page *tdr_page;
+
+ int tdcs_nr_pages;
+ /* TD control structure: */
+ struct page **tdcs_pages;
+
+ /* Size of `tdcx_pages` in struct tdx_vp */
+ int tdcx_nr_pages;
+};
+
+struct tdx_vp {
+ /* TDVP root page */
+ struct page *tdvpr_page;
+
+ /* TD vCPU control structure: */
+ struct page **tdcx_pages;
+};
+
+static inline u64 mk_keyed_paddr(u16 hkid, struct page *page)
+{
+ u64 ret;
+
+ ret = page_to_phys(page);
+ /* KeyID bits are just above the physical address bits: */
+ ret |= (u64)hkid << boot_cpu_data.x86_phys_bits;
+
+ return ret;
+}
+
+static inline int pg_level_to_tdx_sept_level(enum pg_level level)
+{
+ WARN_ON_ONCE(level == PG_LEVEL_NONE);
+ return level - 1;
+}
+
+u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
+u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
+u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page);
+u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mng_key_config(struct tdx_td *td);
+u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
+u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
+u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
+u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_mr_finalize(struct tdx_td *td);
+u64 tdh_vp_flush(struct tdx_vp *vp);
+u64 tdh_mng_vpflushdone(struct tdx_td *td);
+u64 tdh_mng_key_freeid(struct tdx_td *td);
+u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err);
+u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid);
+u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data);
+u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask);
+u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size);
+u64 tdh_mem_track(struct tdx_td *tdr);
+u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2);
+u64 tdh_phymem_cache_wb(bool resume);
+u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
+u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
#else
static inline void tdx_init(void) { }
static inline int tdx_cpu_enable(void) { return -ENODEV; }
static inline int tdx_enable(void) { return -ENODEV; }
+static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
+static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
#endif /* CONFIG_INTEL_TDX_HOST */
#endif /* !__ASSEMBLER__ */
diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h
new file mode 100644
index 000000000000..060a2ad744bf
--- /dev/null
+++ b/arch/x86/include/asm/tdx_global_metadata.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Automatically generated TDX global metadata structures. */
+#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H
+#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H
+
+#include <linux/types.h>
+
+struct tdx_sys_info_features {
+ u64 tdx_features0;
+};
+
+struct tdx_sys_info_tdmr {
+ u16 max_tdmrs;
+ u16 max_reserved_per_tdmr;
+ u16 pamt_4k_entry_size;
+ u16 pamt_2m_entry_size;
+ u16 pamt_1g_entry_size;
+};
+
+struct tdx_sys_info_td_ctrl {
+ u16 tdr_base_size;
+ u16 tdcs_base_size;
+ u16 tdvps_base_size;
+};
+
+struct tdx_sys_info_td_conf {
+ u64 attributes_fixed0;
+ u64 attributes_fixed1;
+ u64 xfam_fixed0;
+ u64 xfam_fixed1;
+ u16 num_cpuid_config;
+ u16 max_vcpus_per_td;
+ u64 cpuid_config_leaves[128];
+ u64 cpuid_config_values[128][2];
+};
+
+struct tdx_sys_info {
+ struct tdx_sys_info_features features;
+ struct tdx_sys_info_tdmr tdmr;
+ struct tdx_sys_info_td_ctrl td_ctrl;
+ struct tdx_sys_info_td_conf td_conf;
+};
+
+#endif
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 8707361b24da..cca7d6641287 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -256,6 +256,7 @@ enum vmcs_field {
TSC_MULTIPLIER_HIGH = 0x00002033,
TERTIARY_VM_EXEC_CONTROL = 0x00002034,
TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035,
+ SHARED_EPT_POINTER = 0x0000203C,
PID_POINTER_TABLE = 0x00002042,
PID_POINTER_TABLE_HIGH = 0x00002043,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
@@ -586,6 +587,7 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_PROT_READ BIT(3)
#define EPT_VIOLATION_PROT_WRITE BIT(4)
#define EPT_VIOLATION_PROT_EXEC BIT(5)
+#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6)
#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
EPT_VIOLATION_PROT_WRITE | \
EPT_VIOLATION_PROT_EXEC)
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 460306b35a4b..6f3499507c5e 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -441,6 +441,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8)
+#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
@@ -844,6 +845,7 @@ struct kvm_sev_snp_launch_start {
};
/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_PAGE_TYPE_INVALID 0x0
#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
@@ -930,4 +932,74 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_SNP_VM 4
#define KVM_X86_TDX_VM 5
+/* Trust Domain eXtension sub-ioctl() commands. */
+enum kvm_tdx_cmd_id {
+ KVM_TDX_CAPABILITIES = 0,
+ KVM_TDX_INIT_VM,
+ KVM_TDX_INIT_VCPU,
+ KVM_TDX_INIT_MEM_REGION,
+ KVM_TDX_FINALIZE_VM,
+ KVM_TDX_GET_CPUID,
+
+ KVM_TDX_CMD_NR_MAX,
+};
+
+struct kvm_tdx_cmd {
+ /* enum kvm_tdx_cmd_id */
+ __u32 id;
+ /* flags for sub-commend. If sub-command doesn't use this, set zero. */
+ __u32 flags;
+ /*
+ * data for each sub-command. An immediate or a pointer to the actual
+ * data in process virtual address. If sub-command doesn't use it,
+ * set zero.
+ */
+ __u64 data;
+ /*
+ * Auxiliary error code. The sub-command may return TDX SEAMCALL
+ * status code in addition to -Exxx.
+ */
+ __u64 hw_error;
+};
+
+struct kvm_tdx_capabilities {
+ __u64 supported_attrs;
+ __u64 supported_xfam;
+ __u64 reserved[254];
+
+ /* Configurable CPUID bits for userspace */
+ struct kvm_cpuid2 cpuid;
+};
+
+struct kvm_tdx_init_vm {
+ __u64 attributes;
+ __u64 xfam;
+ __u64 mrconfigid[6]; /* sha384 digest */
+ __u64 mrowner[6]; /* sha384 digest */
+ __u64 mrownerconfig[6]; /* sha384 digest */
+
+ /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
+ __u64 reserved[12];
+
+ /*
+ * Call KVM_TDX_INIT_VM before vcpu creation, thus before
+ * KVM_SET_CPUID2.
+ * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the
+ * TDX module directly virtualizes those CPUIDs without VMM. The user
+ * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with
+ * those values. If it doesn't, KVM may have wrong idea of vCPUIDs of
+ * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX
+ * module doesn't virtualize.
+ */
+ struct kvm_cpuid2 cpuid;
+};
+
+#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0)
+
+struct kvm_tdx_init_mem_region {
+ __u64 source_addr;
+ __u64 gpa;
+ __u64 nr_pages;
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
index 50c45ead4e7c..2671c4e1b3a0 100644
--- a/arch/x86/include/uapi/asm/setup_data.h
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -13,7 +13,8 @@
#define SETUP_CC_BLOB 7
#define SETUP_IMA 8
#define SETUP_RNG_SEED 9
-#define SETUP_ENUM_MAX SETUP_RNG_SEED
+#define SETUP_KEXEC_KHO 10
+#define SETUP_ENUM_MAX SETUP_KEXEC_KHO
#define SETUP_INDIRECT (1<<31)
#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
@@ -78,6 +79,16 @@ struct ima_setup_data {
__u64 size;
} __attribute__((packed));
+/*
+ * Locations of kexec handover metadata
+ */
+struct kho_data {
+ __u64 fdt_addr;
+ __u64 fdt_size;
+ __u64 scratch_addr;
+ __u64 scratch_size;
+} __attribute__((packed));
+
#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index ec1321248dac..9c640a521a67 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -95,6 +95,7 @@
#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
#define SVM_EXIT_INVPCID 0x0a2
+#define SVM_EXIT_BUS_LOCK 0x0a5
#define SVM_EXIT_IDLE_HLT 0x0a6
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
@@ -225,6 +226,7 @@
{ SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
{ SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
{ SVM_EXIT_INVPCID, "invpcid" }, \
+ { SVM_EXIT_BUS_LOCK, "buslock" }, \
{ SVM_EXIT_IDLE_HLT, "idle-halt" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index a5faf6d88f1b..f0f4a4cf84a7 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -34,6 +34,7 @@
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT_SIGNAL 3
#define EXIT_REASON_SIPI_SIGNAL 4
+#define EXIT_REASON_OTHER_SMI 6
#define EXIT_REASON_INTERRUPT_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
@@ -92,6 +93,7 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_TDCALL 77
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -155,7 +157,8 @@
{ EXIT_REASON_UMWAIT, "UMWAIT" }, \
{ EXIT_REASON_TPAUSE, "TPAUSE" }, \
{ EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \
- { EXIT_REASON_NOTIFY, "NOTIFY" }
+ { EXIT_REASON_NOTIFY, "NOTIFY" }, \
+ { EXIT_REASON_TDCALL, "TDCALL" }
#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }