Diffstat (limited to 'arch/x86/kvm/vmx/tdx.c')
-rw-r--r--  arch/x86/kvm/vmx/tdx.c | 805
1 file changed, 380 insertions(+), 425 deletions(-)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0a49c863c811..2d7a4d52ccfb 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -24,20 +24,33 @@
#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define pr_tdx_error(__fn, __err) \
- pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
+#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \
+({ \
+ struct kvm *_kvm = (__kvm); \
+ bool __ret = !!(__err); \
+ \
+ if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \
+ if (_kvm) \
+ kvm_vm_bugged(_kvm); \
+ pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
+ __err, __args); \
+ } \
+ unlikely(__ret); \
+})
+
+#define TDX_BUG_ON(__err, __fn, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")
+
+#define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)
+
+#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)
+
+#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx, " #a3 " 0x%llx", \
+ a1, a2, a3)
-#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
- pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
-
-#define pr_tdx_error_1(__fn, __err, __rcx) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
-
-#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
-
-#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
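For context, a minimal before/after sketch of the call-site pattern this conversion enables, taken from the TDH.MEM.SEPT.ADD hunk later in this patch (illustrative only, not part of the diff):

	/* Before: bug the VM and print the SEAMCALL error in two steps. */
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
		return -EIO;
	}

	/* After: one macro WARNs, bugs the VM (if non-NULL), and prints. */
	if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
		return -EIO;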
bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);
@@ -281,25 +294,34 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
vcpu->cpu = -1;
}
-static void tdx_no_vcpus_enter_start(struct kvm *kvm)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
-
- kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
-}
-
-static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
-}
+/*
+ * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
+ * retry (if necessary) after forcing vCPUs to exit and wait for the operation
+ * to complete. All flows that remove/block S-EPT entries run with mmu_lock
+ * held for write, i.e. are mutually exclusive with each other, but they aren't
+ * mutually exclusive with running vCPUs, and so can fail with "operand busy"
+ * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
+ *
+ * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
+ */
+#define tdh_do_no_vcpus(tdh_func, kvm, args...) \
+({ \
+ struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \
+ u64 __err; \
+ \
+ lockdep_assert_held_write(&kvm->mmu_lock); \
+ \
+ __err = tdh_func(args); \
+ if (unlikely(tdx_operand_busy(__err))) { \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \
+ \
+ __err = tdh_func(args); \
+ \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \
+ } \
+ __err; \
+})
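For context, a minimal usage sketch of the helper defined above; it mirrors the tdh_mem_range_block() conversion in tdx_sept_remove_private_spte() later in this patch (illustrative only, not part of the diff):

	/* Caller must hold kvm->mmu_lock for write; the macro asserts this. */
	err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
			      tdx_level, &entry, &level_state);
	if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
		return;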
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
@@ -313,10 +335,9 @@ static int __tdx_reclaim_page(struct page *page)
* before the HKID is released and control pages have also been
* released at this point, so there is no possibility of contention.
*/
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+ if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
return -EIO;
- }
+
return 0;
}
@@ -404,8 +425,8 @@ static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
return;
smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
- if (KVM_BUG_ON(arg.err, vcpu->kvm))
- pr_tdx_error(TDH_VP_FLUSH, arg.err);
+
+ TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
}
void tdx_disable_virtualization_cpu(void)
@@ -464,8 +485,7 @@ static void smp_func_do_phymem_cache_wb(void *unused)
}
out:
- if (WARN_ON_ONCE(err))
- pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+ TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
}
void tdx_mmu_release_hkid(struct kvm *kvm)
@@ -504,8 +524,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
err = tdh_mng_vpflushdone(&kvm_tdx->td);
if (err == TDX_FLUSHVP_NOT_DONE)
goto out;
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
+ if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
goto out;
@@ -528,8 +547,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
* tdh_mng_key_freeid() will fail.
*/
err = tdh_mng_key_freeid(&kvm_tdx->td);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
} else {
@@ -580,10 +598,9 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
* when it is reclaiming TDCS).
*/
err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
return;
- }
+
tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
__free_page(kvm_tdx->td.tdr_page);
@@ -606,11 +623,8 @@ static int tdx_do_tdh_mng_key_config(void *param)
/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
err = tdh_mng_key_config(&kvm_tdx->td);
-
- if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
- pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
return -EIO;
- }
return 0;
}
@@ -763,25 +777,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}
-/*
- * Compared to vmx_prepare_switch_to_guest(), there is not much to do
- * as SEAMCALL/SEAMRET calls take care of most of save and restore.
- */
-void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vt *vt = to_vt(vcpu);
-
- if (vt->guest_state_loaded)
- return;
-
- if (likely(is_64bit_mm(current->mm)))
- vt->msr_host_kernel_gs_base = current->thread.gsbase;
- else
- vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
-
- vt->guest_state_loaded = true;
-}
-
struct tdx_uret_msr {
u32 msr;
unsigned int slot;
@@ -795,19 +790,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = {
{.msr = MSR_TSC_AUX,},
};
-static void tdx_user_return_msr_update_cache(void)
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vt *vt = to_vt(vcpu);
int i;
+ if (vt->guest_state_loaded)
+ return;
+
+ if (likely(is_64bit_mm(current->mm)))
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
+ else
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+
+ vt->guest_state_loaded = true;
+
+ /*
+ * Explicitly set user-return MSRs that are clobbered by the TDX-Module
+ * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
+ * written by the TDX-Module. Don't rely on the TDX-Module to actually
+ * clobber the MSRs, as the contract is poorly defined and not upheld.
+ * E.g. the TDX-Module will synthesize an EPT Violation without doing
+ * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
+ * state.
+ */
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
- kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
- tdx_uret_msrs[i].defval);
+ kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
+ tdx_uret_msrs[i].defval, -1ull);
}
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
struct vcpu_vt *vt = to_vt(vcpu);
- struct vcpu_tdx *tdx = to_tdx(vcpu);
if (!vt->guest_state_loaded)
return;
@@ -815,11 +829,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
++vcpu->stat.host_state_reload;
wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
- if (tdx->guest_entered) {
- tdx_user_return_msr_update_cache();
- tdx->guest_entered = false;
- }
-
vt->guest_state_loaded = false;
}
@@ -829,19 +838,52 @@ void tdx_vcpu_put(struct kvm_vcpu *vcpu)
tdx_prepare_switch_to_host(vcpu);
}
+/*
+ * Life cycles for a TD and a vCPU:
+ * 1. KVM_CREATE_VM ioctl.
+ * TD state is TD_STATE_UNINITIALIZED.
+ * hkid is not assigned at this stage.
+ * 2. KVM_TDX_INIT_VM ioctl.
+ * TD transitions to TD_STATE_INITIALIZED.
+ * hkid is assigned after this stage.
+ * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
+ * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
+ * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
+ *    3.3 (conditional) if any error is encountered after kvm_arch_vcpu_create(),
+ * kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
+ * 4. KVM_TDX_INIT_VCPU ioctl.
+ * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
+ * vCPU control structures are allocated at this stage.
+ * 5. kvm_destroy_vm().
+ * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
+ * (2) puts hkid to !assigned state.
+ * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
+ * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
+ * 5.3 tdx_vm_destroy()
+ * transitions TD to TD_STATE_UNINITIALIZED state.
+ *
+ * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
+ * - If at 3.3, hkid is still assigned, but the vCPU must be in
+ * VCPU_TD_STATE_UNINITIALIZED state.
+ * - If at 5.2, hkid must be !assigned and all vCPUs must be in
+ *   VCPU_TD_STATE_INITIALIZED state and have been disassociated.
+ */
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
struct vcpu_tdx *tdx = to_tdx(vcpu);
int i;
+ if (vcpu->cpu != -1) {
+ KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
+ tdx_flush_vp_on_cpu(vcpu);
+ return;
+ }
+
/*
* It is not possible to reclaim pages while hkid is assigned. It might
- * be assigned if:
- * 1. the TD VM is being destroyed but freeing hkid failed, in which
- * case the pages are leaked
- * 2. TD VCPU creation failed and this on the error path, in which case
- * there is nothing to do anyway
+ * be assigned if the TD VM is being destroyed but freeing hkid failed,
+ * in which case the pages are leaked.
*/
if (is_hkid_assigned(kvm_tdx))
return;
@@ -856,7 +898,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
}
if (tdx->vp.tdvpr_page) {
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
- tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_page = NULL;
tdx->vp.tdvpr_pa = 0;
}
@@ -1059,7 +1101,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
update_debugctlmsr(vcpu->arch.host_debugctl);
tdx_load_host_xsave_state(vcpu);
- tdx->guest_entered = true;
vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
@@ -1069,9 +1110,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
return EXIT_FASTPATH_NONE;
- if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
- kvm_machine_check();
-
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(tdx_failed_vmentry(vcpu)))
@@ -1583,137 +1621,79 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
-static void tdx_unpin(struct kvm *kvm, struct page *page)
+static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ kvm_pfn_t pfn)
{
- put_page(page);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
+ gpa_t gpa = gfn_to_gpa(gfn);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
+ KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
+ return -EIO;
+
+ err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
+ kvm_tdx->page_add_src, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
}
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
+ enum pg_level level, kvm_pfn_t pfn)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
gpa_t gpa = gfn_to_gpa(gfn);
u64 entry, level_state;
u64 err;
err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
- if (unlikely(tdx_operand_busy(err))) {
- tdx_unpin(kvm, page);
+ if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- }
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
- tdx_unpin(kvm, page);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
return -EIO;
- }
-
- return 0;
-}
-
-/*
- * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
- * callback tdx_gmem_post_populate() then maps pages into private memory.
- * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
- * private EPT structures for the page to have been built before, which is
- * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
- * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
- * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
- * are no half-initialized shared EPT pages.
- */
-static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
- return -EINVAL;
- /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
- atomic64_inc(&kvm_tdx->nr_premapped);
return 0;
}
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+ enum pg_level level, u64 mirror_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- struct page *page = pfn_to_page(pfn);
+ kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return -EINVAL;
+ return -EIO;
- /*
- * Because guest_memfd doesn't support page migration with
- * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
- * migration. Until guest_memfd supports page migration, prevent page
- * migration.
- * TODO: Once guest_memfd introduces callback on page migration,
- * implement it and remove get_page/put_page().
- */
- get_page(page);
+ WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
+ (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
/*
- * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
- * barrier in tdx_td_finalize().
+ * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
+ * before kvm_tdx->state. Userspace must not be allowed to pre-fault
+ * arbitrary memory until the initial memory image is finalized. Pairs
+ * with the smp_wmb() in tdx_td_finalize().
*/
smp_rmb();
- if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
- return tdx_mem_page_aug(kvm, gfn, level, page);
-
- return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
-}
-
-static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- gpa_t gpa = gfn_to_gpa(gfn);
- u64 err, entry, level_state;
-
- /* TODO: handle large pages. */
- if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return -EINVAL;
-
- if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
- return -EINVAL;
/*
- * When zapping private page, write lock is held. So no race condition
- * with other vcpu sept operation.
- * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+ * If the TD isn't finalized/runnable, then userspace is initializing
+ * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
*/
- err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
- &level_state);
-
- if (unlikely(tdx_operand_busy(err))) {
- /*
- * The second retry is expected to succeed after kicking off all
- * other vCPUs and prevent them from invoking TDH.VP.ENTER.
- */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
- &level_state);
- tdx_no_vcpus_enter_stop(kvm);
- }
-
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
- return -EIO;
- }
-
- err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return tdx_mem_page_add(kvm, gfn, level, pfn);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
- return -EIO;
- }
- tdx_quirk_reset_page(page);
- tdx_unpin(kvm, page);
- return 0;
+ return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
@@ -1729,81 +1709,13 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
return -EIO;
- }
return 0;
}
/*
- * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
- * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
- * successfully.
- *
- * Since tdh_mem_sept_add() must have been invoked successfully before a
- * non-leaf entry present in the mirrored page table, the SEPT ZAP related
- * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
- * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
- * SEPT.
- *
- * Further check if the returned entry from SEPT walking is with RWX permissions
- * to filter out anything unexpected.
- *
- * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
- * level_state returned from a SEAMCALL error is the same as that passed into
- * the SEAMCALL.
- */
-static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
- u64 entry, int level)
-{
- if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
- return false;
-
- if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
- return false;
-
- if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
- return false;
-
- return true;
-}
-
-static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
- u64 err, entry, level_state;
-
- /* For now large page isn't supported yet. */
- WARN_ON_ONCE(level != PG_LEVEL_4K);
-
- err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
-
- if (unlikely(tdx_operand_busy(err))) {
- /* After no vCPUs enter, the second retry is expected to succeed */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
- tdx_no_vcpus_enter_stop(kvm);
- }
- if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
- !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
- atomic64_dec(&kvm_tdx->nr_premapped);
- tdx_unpin(kvm, page);
- return 0;
- }
-
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
- return -EIO;
- }
- return 1;
-}
-
-/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
* the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
@@ -1836,18 +1748,15 @@ static void tdx_track(struct kvm *kvm)
if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
return;
+ /*
+ * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
+ * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
+ * tracking epoch hasn't completed.
+ */
lockdep_assert_held_write(&kvm->mmu_lock);
- err = tdh_mem_track(&kvm_tdx->td);
- if (unlikely(tdx_operand_busy(err))) {
- /* After no vCPUs enter, the second retry is expected to succeed */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_track(&kvm_tdx->td);
- tdx_no_vcpus_enter_stop(kvm);
- }
-
- if (KVM_BUG_ON(err, kvm))
- pr_tdx_error(TDH_MEM_TRACK, err);
+ err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
+ TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
@@ -1866,7 +1775,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
* and slot move/deletion.
*/
if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
- return -EINVAL;
+ return -EIO;
/*
* The HKID assigned to this TD was already freed and cache was
@@ -1875,11 +1784,16 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
return tdx_reclaim_page(virt_to_page(private_spt));
}
-static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
{
- struct page *page = pfn_to_page(pfn);
- int ret;
+ struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
/*
* HKID is released after all private pages have been removed, and set
@@ -1887,11 +1801,16 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
* there can't be anything populated in the private EPT.
*/
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
- return -EINVAL;
+ return;
- ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
- if (ret <= 0)
- return ret;
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return;
+
+ err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
+ return;
/*
* TDX requires TLB tracking before dropping private page. Do
@@ -1899,7 +1818,21 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
*/
tdx_track(kvm);
- return tdx_sept_drop_private_spte(kvm, gfn, level, page);
+ /*
+ * When zapping a private page, mmu_lock is held for write, so there is
+ * no race with other vCPU S-EPT operations. The SEAMCALLs can still race
+ * with TDH.VP.ENTER (due to the 0-step mitigation) and with guest
+ * TDCALLs, hence the busy-retry via tdh_do_no_vcpus().
+ */
+ err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
+ return;
+
+ err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
+ return;
+
+ tdx_quirk_reset_page(page);
}
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
@@ -2145,11 +2078,7 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
}
unhandled_exit:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = vp_enter_ret;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
return 0;
}
@@ -2282,37 +2211,28 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
if (cmd->flags)
return -EINVAL;
- caps = kzalloc(sizeof(*caps) +
- sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
- GFP_KERNEL);
- if (!caps)
- return -ENOMEM;
-
user_caps = u64_to_user_ptr(cmd->data);
- if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
- ret = -EFAULT;
- goto out;
- }
+ if (get_user(nr_user_entries, &user_caps->cpuid.nent))
+ return -EFAULT;
- if (nr_user_entries < td_conf->num_cpuid_config) {
- ret = -E2BIG;
- goto out;
- }
+ if (nr_user_entries < td_conf->num_cpuid_config)
+ return -E2BIG;
+
+ caps = kzalloc(struct_size(caps, cpuid.entries,
+ td_conf->num_cpuid_config), GFP_KERNEL);
+ if (!caps)
+ return -ENOMEM;
ret = init_kvm_tdx_caps(td_conf, caps);
if (ret)
goto out;
- if (copy_to_user(user_caps, caps, sizeof(*caps))) {
+ if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
+ caps->cpuid.nent))) {
ret = -EFAULT;
goto out;
}
- if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
- caps->cpuid.nent *
- sizeof(caps->cpuid.entries[0])))
- ret = -EFAULT;
-
out:
/* kfree() accepts NULL. */
kfree(caps);
@@ -2537,8 +2457,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
goto free_packages;
}
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error(TDH_MNG_CREATE, err);
+ if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
ret = -EIO;
goto free_packages;
}
@@ -2579,8 +2498,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
ret = -EAGAIN;
goto teardown;
}
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error(TDH_MNG_ADDCX, err);
+ if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
ret = -EIO;
goto teardown;
}
@@ -2597,8 +2515,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
*seamcall_err = err;
ret = -EINVAL;
goto teardown;
- } else if (WARN_ON_ONCE(err)) {
- pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
+ } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
ret = -EIO;
goto teardown;
}
@@ -2642,7 +2559,7 @@ free_tdcs:
free_tdr:
if (tdr_page)
__free_page(tdr_page);
- kvm_tdx->td.tdr_page = 0;
+ kvm_tdx->td.tdr_page = NULL;
free_hkid:
tdx_hkid_free(kvm_tdx);
@@ -2747,11 +2664,53 @@ err_out:
return -EIO;
}
+typedef void *tdx_vm_state_guard_t;
+
+static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
+{
+ int r;
+
+ mutex_lock(&kvm->lock);
+
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
+ r = -EBUSY;
+ goto out_err;
+ }
+
+ r = kvm_lock_all_vcpus(kvm);
+ if (r)
+ goto out_err;
+
+ /*
+ * Note the unintuitive ordering! vcpu->mutex must be taken outside
+ * kvm->slots_lock!
+ */
+ mutex_lock(&kvm->slots_lock);
+ return kvm;
+
+out_err:
+ mutex_unlock(&kvm->lock);
+ return ERR_PTR(r);
+}
+
+static void tdx_release_vm_state_locks(struct kvm *kvm)
+{
+ mutex_unlock(&kvm->slots_lock);
+ kvm_unlock_all_vcpus(kvm);
+ mutex_unlock(&kvm->lock);
+}
+
+DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
+ if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
+ tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
+
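For context, a minimal usage sketch of the scope-based guard defined above; it mirrors the tdx_vm_ioctl() conversion later in this patch (illustrative only, not part of the diff):

	CLASS(tdx_vm_state_guard, guard)(kvm);
	if (IS_ERR(guard))
		return PTR_ERR(guard);

	/*
	 * kvm->lock, every vcpu->mutex, and kvm->slots_lock are held here and
	 * are released automatically when "guard" goes out of scope.
	 */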
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
+ struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct kvm_tdx_init_vm *init_vm;
struct td_params *td_params = NULL;
+ u32 nr_user_entries;
int ret;
BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
@@ -2763,28 +2722,16 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
if (cmd->flags)
return -EINVAL;
- init_vm = kmalloc(sizeof(*init_vm) +
- sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
- GFP_KERNEL);
- if (!init_vm)
- return -ENOMEM;
-
- if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
- ret = -EFAULT;
- goto out;
- }
+ if (get_user(nr_user_entries, &user_data->cpuid.nent))
+ return -EFAULT;
- if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
- ret = -E2BIG;
- goto out;
- }
+ if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
- if (copy_from_user(init_vm->cpuid.entries,
- u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
- flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
- ret = -EFAULT;
- goto out;
- }
+ init_vm = memdup_user(user_data,
+ struct_size(user_data, cpuid.entries, nr_user_entries));
+ if (IS_ERR(init_vm))
+ return PTR_ERR(init_vm);
if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
ret = -EINVAL;
@@ -2868,24 +2815,14 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- guard(mutex)(&kvm->slots_lock);
-
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
- /*
- * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
- * TDH.MEM.PAGE.ADD().
- */
- if (atomic64_read(&kvm_tdx->nr_premapped))
- return -EINVAL;
cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
if (tdx_operand_busy(cmd->hw_error))
return -EBUSY;
- if (KVM_BUG_ON(cmd->hw_error, kvm)) {
- pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
+ if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
return -EIO;
- }
kvm_tdx->state = TD_STATE_RUNNABLE;
/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
@@ -2894,27 +2831,38 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
return 0;
}
-int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
{
- struct kvm_tdx_cmd tdx_cmd;
- int r;
-
- if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
+ if (copy_from_user(cmd, argp, sizeof(*cmd)))
return -EFAULT;
/*
- * Userspace should never set hw_error. It is used to fill
- * hardware-defined error by the kernel.
+ * Userspace should never set hw_error. KVM writes hw_error to report
+ * hardware-defined error back to userspace.
*/
- if (tdx_cmd.hw_error)
+ if (cmd->hw_error)
return -EINVAL;
- mutex_lock(&kvm->lock);
+ return 0;
+}
+
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_tdx_cmd tdx_cmd;
+ int r;
+
+ r = tdx_get_cmd(argp, &tdx_cmd);
+ if (r)
+ return r;
+
+ if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
+ return tdx_get_capabilities(&tdx_cmd);
+
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
switch (tdx_cmd.id) {
- case KVM_TDX_CAPABILITIES:
- r = tdx_get_capabilities(&tdx_cmd);
- break;
case KVM_TDX_INIT_VM:
r = tdx_td_init(kvm, &tdx_cmd);
break;
@@ -2922,15 +2870,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
r = tdx_td_finalize(kvm, &tdx_cmd);
break;
default:
- r = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
- r = -EFAULT;
+ return -EFAULT;
-out:
- mutex_unlock(&kvm->lock);
return r;
}
@@ -2972,16 +2917,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
}
err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
+ if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
ret = -EIO;
- pr_tdx_error(TDH_VP_CREATE, err);
goto free_tdcx;
}
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
- pr_tdx_error(TDH_VP_ADDCX, err);
+ if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
/*
* Pages already added are reclaimed by the vcpu_free
* method, but the rest are freed here.
@@ -2994,10 +2937,19 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
}
}
- err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
- pr_tdx_error(TDH_VP_INIT, err);
- return -EIO;
+ /*
+ * tdh_vp_init() can take an exclusive lock of the TDR resource inside
+ * the TDX-Module. The TDR resource is also taken as shared in several
+ * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
+ * (TDX-Module locks are try-lock implementations with no slow path).
+ * Take mmu_lock for write to reflect the nature of the lock taken by
+ * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
+ * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
+ */
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+ err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+ if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
+ return -EIO;
}
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3016,7 +2968,7 @@ free_tdcx:
free_tdvpr:
if (tdx->vp.tdvpr_page)
__free_page(tdx->vp.tdvpr_page);
- tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_page = NULL;
tdx->vp.tdvpr_pa = 0;
return ret;
@@ -3054,7 +3006,8 @@ static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_i
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
- struct kvm_cpuid2 __user *output, *td_cpuid;
+ struct kvm_cpuid2 __user *output;
+ struct kvm_cpuid2 *td_cpuid;
int r = 0, i = 0, leaf;
u32 level;
@@ -3167,15 +3120,15 @@ struct tdx_gmem_post_populate_arg {
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
void __user *src, int order, void *_arg)
{
- u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct tdx_gmem_post_populate_arg *arg = _arg;
- struct kvm_vcpu *vcpu = arg->vcpu;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
gpa_t gpa = gfn_to_gpa(gfn);
- u8 level = PG_LEVEL_4K;
struct page *src_page;
int ret, i;
- u64 err, entry, level_state;
+
+ if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
+ return -EIO;
/*
* Get the source page if it has been faulted in. Return failure if the
@@ -3187,49 +3140,29 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
if (ret != 1)
return -ENOMEM;
- ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
- if (ret < 0)
- goto out;
-
- /*
- * The private mem cannot be zapped after kvm_tdp_map_page()
- * because all paths are covered by slots_lock and the
- * filemap invalidate lock. Check that they are indeed enough.
- */
- if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
- scoped_guard(read_lock, &kvm->mmu_lock) {
- if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
- ret = -EIO;
- goto out;
- }
- }
- }
+ kvm_tdx->page_add_src = src_page;
+ ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
+ kvm_tdx->page_add_src = NULL;
- ret = 0;
- err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
- src_page, &entry, &level_state);
- if (err) {
- ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
- goto out;
- }
+ put_page(src_page);
- if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
- atomic64_dec(&kvm_tdx->nr_premapped);
+ if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
+ return ret;
- if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
- for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
- err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
- &level_state);
- if (err) {
- ret = -EIO;
- break;
- }
- }
+ /*
+ * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
+ * between mapping the pfn and now, but slots_lock prevents memslot
+ * updates, filemap_invalidate_lock() prevents guest_memfd updates,
+ * mmu_notifier events can't reach S-EPT entries, and KVM's internal
+ * zapping flows are mutually exclusive with S-EPT mappings.
+ */
+ for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+ err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
+ return -EIO;
}
-out:
- put_page(src_page);
- return ret;
+ return 0;
}
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
@@ -3245,8 +3178,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
if (tdx->state != VCPU_TD_STATE_INITIALIZED)
return -EINVAL;
- guard(mutex)(&kvm->slots_lock);
-
/* Once TD is finalized, the initial guest memory is fixed. */
if (kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
@@ -3264,7 +3195,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
!vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
return -EINVAL;
- kvm_mmu_reload(vcpu);
ret = 0;
while (region.nr_pages) {
if (signal_pending(current)) {
@@ -3301,28 +3231,57 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
return ret;
}
-int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct kvm_tdx_cmd cmd;
- int ret;
+ int r;
- if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
- return -EINVAL;
+ r = tdx_get_cmd(argp, &cmd);
+ if (r)
+ return r;
- if (copy_from_user(&cmd, argp, sizeof(cmd)))
- return -EFAULT;
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
- if (cmd.hw_error)
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
+ vcpu_load(vcpu);
+
switch (cmd.id) {
+ case KVM_TDX_INIT_MEM_REGION:
+ r = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ break;
case KVM_TDX_INIT_VCPU:
- ret = tdx_vcpu_init(vcpu, &cmd);
+ r = tdx_vcpu_init(vcpu, &cmd);
break;
- case KVM_TDX_INIT_MEM_REGION:
- ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ default:
+ r = -ENOIOCTLCMD;
break;
+ }
+
+ vcpu_put(vcpu);
+
+ return r;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ ret = tdx_get_cmd(argp, &cmd);
+ if (ret)
+ return ret;
+
+ switch (cmd.id) {
case KVM_TDX_GET_CPUID:
ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
break;
@@ -3447,10 +3406,6 @@ static int __init __tdx_bringup(void)
/*
* Check if MSRs (tdx_uret_msrs) can be saved/restored
* before returning to user space.
- *
- * this_cpu_ptr(user_return_msrs)->registered isn't checked
- * because the registration is done at vcpu runtime by
- * tdx_user_return_msr_update_cache().
*/
tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
if (tdx_uret_msrs[i].slot == -1) {