Diffstat (limited to 'arch/x86/kvm/svm')
-rw-r--r--   arch/x86/kvm/svm/avic.c    |  71
-rw-r--r--   arch/x86/kvm/svm/nested.c  |  70
-rw-r--r--   arch/x86/kvm/svm/pmu.c     |   8
-rw-r--r--   arch/x86/kvm/svm/sev.c     | 680
-rw-r--r--   arch/x86/kvm/svm/svm.c     | 456
-rw-r--r--   arch/x86/kvm/svm/svm.h     |  78
-rw-r--r--   arch/x86/kvm/svm/vmenter.S |  10
7 files changed, 850 insertions, 523 deletions
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 65fd245a9953..067f8e3f5a0d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <asm/irq_remapping.h> +#include <asm/msr.h> #include "trace.h" #include "lapic.h" @@ -330,7 +331,7 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); } put_cpu(); @@ -796,12 +797,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; u64 entry; + if (WARN_ON_ONCE(!pi->ir_data)) + return -EINVAL; + /** * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { + if (pi->prev_ga_tag) { struct kvm *kvm = svm->vcpu.kvm; u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); @@ -820,7 +824,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -896,10 +900,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", @@ -933,6 +937,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -951,33 +957,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -993,6 +972,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. 
+ */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b708bdf7eaff..8427a48b8b7a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -111,7 +111,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) { - if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) + if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) return true; if (!nested_npt_enabled(svm)) @@ -594,7 +594,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DR); } - if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { /* * Reserved bits of DEBUGCTL are ignored. Be consistent with @@ -646,12 +646,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - if (guest_can_use(vcpu, X86_FEATURE_VGIF) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); else @@ -673,6 +678,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + /* + * Stash vmcb02's counter if the guest hasn't moved past the guilty + * instruction; otherwise, reset the counter to '0'. + * + * In order to detect if L2 has made forward progress or not, track the + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP + * is changed, guest has clearly made forward progress, bus_lock_counter + * still remained '1', so reset bus_lock_counter to '0'. Eg. In the + * scenario, where a buslock happened in L1 before VMRUN, the bus lock + * firmly happened on an instruction in the past. Even if vmcb01's + * counter is still '1', (because the guilty instruction got patched), + * the vCPU has clearly made forward progress and so KVM should reset + * vmcb02's counter to '0'. + * + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN + * to prevent the same guilty instruction from triggering a VM-Exit. Eg. + * if userspace rate-limits the vCPU, then it's entirely possible that + * L1's tick interrupt is pending by the time userspace re-runs the + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the + * entire cycle start over. 
+ */ + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + vmcb02->control.bus_lock_counter = 1; + else + vmcb02->control.bus_lock_counter = 0; + /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ @@ -689,7 +721,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); @@ -710,7 +742,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * what a nrips=0 CPU would do (L1 is responsible for advancing RIP * prior to injecting the event). */ - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb02->control.next_rip = svm->nested.ctl.next_rip; else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; @@ -720,7 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_injected = true; svm->soft_int_csbase = vmcb12_csbase; svm->soft_int_old_rip = vmcb12_rip; - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) svm->soft_int_next_rip = svm->nested.ctl.next_rip; else svm->soft_int_next_rip = vmcb12_rip; @@ -728,18 +760,18 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; - if (guest_can_use(vcpu, X86_FEATURE_LBRV)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV)) vmcb02->control.virt_ext |= (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) pause_count12 = svm->nested.ctl.pause_filter_count; else pause_count12 = 0; - if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) pause_thresh12 = svm->nested.ctl.pause_filter_thresh; else pause_thresh12 = 0; @@ -762,11 +794,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. @@ -994,7 +1021,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ - svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); /* Give the current vmcb to the guest */ @@ -1026,7 +1053,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; @@ -1039,8 +1066,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } + /* + * Invalidate bus_lock_rip unless KVM is still waiting for the guest + * to make forward progress before re-enabling bus lock detection. 
+ */ + if (!vmcb02->control.bus_lock_counter) + svm->nested.ctl.bus_lock_rip = INVALID_GPA; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + kvm_nested_vmexit_handle_ibrs(vcpu); + svm_switch_vmcb(svm, &svm->vmcb01); /* @@ -1065,7 +1101,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (!nested_exit_on_intr(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 22d5a65b410c..288f7f2a46f2 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -46,7 +46,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, switch (msr) { case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) return NULL; /* * Each PMU counter has a pair of CTL and CTR MSRs. CTLn @@ -109,7 +109,7 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: return pmu->version > 0; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + return guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: @@ -179,7 +179,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid_0x80000022_ebx ebx; pmu->version = 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFMON_V2)) { pmu->version = 2; /* * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. 
the guest @@ -189,7 +189,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; - } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { + } else if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 943bd074a5d3..5a69b657dae9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -26,6 +26,7 @@ #include <asm/fpu/xcr.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/msr.h> #include <asm/sev.h> #include "mmu.h" @@ -140,7 +141,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm) static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } @@ -226,9 +227,7 @@ e_uncharge: static unsigned int sev_get_asid(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->asid; + return to_kvm_sev_info(kvm)->asid; } static void sev_asid_free(struct kvm_sev_info *sev) @@ -403,7 +402,7 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_init *data, unsigned long vm_type) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_platform_init_args init_args = {0}; bool es_active = vm_type != KVM_X86_SEV_VM; u64 valid_vmsa_features = es_active ? 
sev_supported_vmsa_features : 0; @@ -500,10 +499,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_init data; - if (!sev->need_init) + if (!to_kvm_sev_info(kvm)->need_init) return -EINVAL; if (kvm->arch.vm_type != KVM_X86_SEV_VM && @@ -543,14 +541,14 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error) static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return __sev_issue_cmd(sev->fd, id, data, error); } static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_launch_start start; struct kvm_sev_launch_start params; void *dh_blob, *session_blob; @@ -563,6 +561,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -622,9 +622,9 @@ e_free_dh: static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long ulen, unsigned long *n, - int write) + unsigned int flags) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); unsigned long npages, size; int npinned; unsigned long locked, lock_limit; @@ -663,7 +663,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ - npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, flags, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); ret = -ENOMEM; @@ -686,11 +686,9 @@ err: static void sev_unpin_memory(struct kvm *kvm, struct page **pages, unsigned long npages) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unpin_user_pages(pages, npages); kvfree(pages); - sev->pages_locked -= npages; + to_kvm_sev_info(kvm)->pages_locked -= npages; } static void sev_clflush_pages(struct page *pages[], unsigned long npages) @@ -734,7 +732,6 @@ static unsigned long get_num_contig_pages(unsigned long idx, static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_launch_update_data params; struct sev_data_launch_update_data data; struct page **inpages; @@ -751,7 +748,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) vaddr_end = vaddr + size; /* Lock the user memory. 
*/ - inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); if (IS_ERR(inpages)) return PTR_ERR(inpages); @@ -762,7 +759,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_clflush_pages(inpages, npages); data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { int offset, len; @@ -802,7 +799,7 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); struct sev_es_save_area *save = svm->sev_es.vmsa; struct xregs_state *xsave; const u8 *s; @@ -972,7 +969,6 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *measure = u64_to_user_ptr(argp->data); - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_measure data; struct kvm_sev_launch_measure params; void __user *p = NULL; @@ -1005,7 +1001,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); /* @@ -1033,19 +1029,17 @@ e_free_blob: static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); } static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_guest_status params; struct sev_data_guest_status data; int ret; @@ -1055,7 +1049,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); if (ret) return ret; @@ -1074,11 +1068,10 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, unsigned long dst, int size, int *error, bool enc) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_dbg data; data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; data.dst_addr = dst; data.src_addr = src; data.len = size; @@ -1250,7 +1243,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (IS_ERR(src_p)) return PTR_ERR(src_p); - dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); return PTR_ERR(dst_p); @@ -1302,7 +1295,6 @@ err: static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_secret data; struct kvm_sev_launch_secret params; struct page **pages; @@ -1316,7 +1308,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - pages = sev_pin_memory(kvm, 
params.guest_uaddr, params.guest_len, &n, 1); + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); if (IS_ERR(pages)) return PTR_ERR(pages); @@ -1358,7 +1350,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) data.hdr_address = __psp_pa(hdr); data.hdr_len = params.hdr_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); kfree(hdr); @@ -1378,7 +1370,6 @@ e_unpin_memory: static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *report = u64_to_user_ptr(argp->data); - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_attestation_report data; struct kvm_sev_attestation_report params; void __user *p; @@ -1411,7 +1402,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); /* * If we query the session length, FW responded with expected data. @@ -1441,12 +1432,11 @@ static int __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_start *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); params->session_len = data.session_len; @@ -1459,7 +1449,6 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; struct kvm_sev_send_start params; void *amd_certs, *session_data; @@ -1520,7 +1509,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) data.amd_certs_len = params.amd_certs_len; data.session_address = __psp_pa(session_data); data.session_len = params.session_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); @@ -1552,12 +1541,11 @@ static int __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_update_data *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); params->hdr_len = data.hdr_len; @@ -1572,7 +1560,6 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; struct kvm_sev_send_update_data params; void *hdr, *trans_data; @@ -1608,11 +1595,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = 
kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; @@ -1626,7 +1613,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); @@ -1657,31 +1644,29 @@ e_unpin: static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); } static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_cancel data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); } static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_receive_start start; struct kvm_sev_receive_start params; int *error = &argp->error; @@ -1755,7 +1740,6 @@ e_free_pdh: static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_receive_update_data params; struct sev_data_receive_update_data data; void *hdr = NULL, *trans = NULL; @@ -1798,7 +1782,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 1); + PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; @@ -1815,7 +1799,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, &argp->error); @@ -1832,13 +1816,12 @@ e_free_hdr: static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_receive_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } @@ -1858,8 +1841,8 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); int r = -EBUSY; if (dst_kvm == src_kvm) @@ -1893,8 +1876,8 @@ release_dst: static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info 
*dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); mutex_unlock(&dst_kvm->lock); mutex_unlock(&src_kvm->lock); @@ -1902,74 +1885,10 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } -/* vCPU mutex subclasses. */ -enum sev_migration_role { - SEV_MIGRATION_SOURCE = 0, - SEV_MIGRATION_TARGET, - SEV_NR_MIGRATION_ROLES, -}; - -static int sev_lock_vcpus_for_migration(struct kvm *kvm, - enum sev_migration_role role) -{ - struct kvm_vcpu *vcpu; - unsigned long i, j; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable_nested(&vcpu->mutex, role)) - goto out_unlock; - -#ifdef CONFIG_PROVE_LOCKING - if (!i) - /* - * Reset the role to one that avoids colliding with - * the role used for the first vcpu mutex. - */ - role = SEV_NR_MIGRATION_ROLES; - else - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); -#endif - } - - return 0; - -out_unlock: - - kvm_for_each_vcpu(j, vcpu, kvm) { - if (i == j) - break; - -#ifdef CONFIG_PROVE_LOCKING - if (j) - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); -#endif - - mutex_unlock(&vcpu->mutex); - } - return -EINTR; -} - -static void sev_unlock_vcpus_for_migration(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - bool first = true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (first) - first = false; - else - mutex_acquire(&vcpu->mutex.dep_map, - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); - - mutex_unlock(&vcpu->mutex); - } -} - static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); struct kvm_vcpu *dst_vcpu, *src_vcpu; struct vcpu_svm *dst_svm, *src_svm; struct kvm_sev_info *mirror; @@ -2009,8 +1928,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * and add the new mirror to the list. 
*/ if (is_mirroring_enc_context(dst_kvm)) { - struct kvm_sev_info *owner_sev_info = - &to_kvm_svm(dst->enc_context_owner)->sev_info; + struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); list_del(&src->mirror_entry); list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); @@ -2069,7 +1987,7 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); struct kvm_sev_info *src_sev, *cg_cleanup_sev; CLASS(fd, f)(source_fd); struct kvm *source_kvm; @@ -2093,7 +2011,7 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) goto out_unlock; } - src_sev = &to_kvm_svm(source_kvm)->sev_info; + src_sev = to_kvm_sev_info(source_kvm); dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; @@ -2104,10 +2022,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); + ret = kvm_lock_all_vcpus(kvm); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); + ret = kvm_lock_all_vcpus(source_kvm); if (ret) goto out_dst_vcpu; @@ -2121,9 +2039,9 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) ret = 0; out_source_vcpu: - sev_unlock_vcpus_for_migration(source_kvm); + kvm_unlock_all_vcpus(source_kvm); out_dst_vcpu: - sev_unlock_vcpus_for_migration(kvm); + kvm_unlock_all_vcpus(kvm); out_dst_cgroup: /* Operates on the source on success, on the destination on failure. */ if (charged) @@ -2181,7 +2099,7 @@ static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) static int snp_bind_asid(struct kvm *kvm, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_activate data = {0}; data.gctx_paddr = __psp_pa(sev->snp_context); @@ -2191,7 +2109,7 @@ static int snp_bind_asid(struct kvm *kvm, int *error) static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_launch_start start = {0}; struct kvm_sev_snp_launch_start params; int rc; @@ -2220,6 +2138,8 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) return -EINVAL; + sev->policy = params.policy; + sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) return -ENOTTY; @@ -2260,7 +2180,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf void __user *src, int order, void *opaque) { struct sev_gmem_populate_args *sev_populate_args = opaque; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); int n_private = 0, ret, i; int npages = (1 << order); gfn_t gfn; @@ -2350,7 +2270,7 @@ err: static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_gmem_populate_args sev_populate_args = {0}; struct kvm_sev_snp_launch_update params; struct kvm_memory_slot *memslot; @@ -2434,7 +2354,7 @@ out: static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info 
*sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_launch_update data = {}; struct kvm_vcpu *vcpu; unsigned long i; @@ -2482,7 +2402,7 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct kvm_sev_snp_launch_finish params; struct sev_data_snp_launch_finish *data; void *id_block = NULL, *id_auth = NULL; @@ -2677,7 +2597,7 @@ out: int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct enc_region *region; int ret = 0; @@ -2696,7 +2616,8 @@ int sev_mem_enc_register_region(struct kvm *kvm, return -ENOMEM; mutex_lock(&kvm->lock); - region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, + FOLL_WRITE | FOLL_LONGTERM); if (IS_ERR(region->pages)) { ret = PTR_ERR(region->pages); mutex_unlock(&kvm->lock); @@ -2729,7 +2650,7 @@ e_free: static struct enc_region * find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct enc_region *i; @@ -2824,9 +2745,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ - source_sev = &to_kvm_svm(source_kvm)->sev_info; + source_sev = to_kvm_sev_info(source_kvm); kvm_get_kvm(source_kvm); - mirror_sev = &to_kvm_svm(kvm)->sev_info; + mirror_sev = to_kvm_sev_info(kvm); list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ @@ -2854,7 +2775,7 @@ e_unlock: static int snp_decommission_context(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_addr data = {}; int ret; @@ -2879,7 +2800,7 @@ static int snp_decommission_context(struct kvm *kvm) void sev_vm_destroy(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct list_head *pos, *q; @@ -2953,6 +2874,7 @@ void __init sev_set_cpu_caps(void) void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + struct sev_platform_init_args init_args = {0}; bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -2972,6 +2894,16 @@ void __init sev_hardware_setup(void) WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. 
+ */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); @@ -3051,11 +2983,11 @@ out: min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", - sev_es_supported ? "enabled" : "disabled", + str_enabled_disabled(sev_es_supported), min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); if (boot_cpu_has(X86_FEATURE_SEV_SNP)) pr_info("SEV-SNP %s (ASIDs %u - %u)\n", - sev_snp_supported ? "enabled" : "disabled", + str_enabled_disabled(sev_snp_supported), min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); sev_enabled = sev_supported; @@ -3069,6 +3001,15 @@ out: sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; + + if (!sev_enabled) + return; + + /* + * Do both SNP and SEV initialization at KVM module load. + */ + init_args.probe = true; + sev_platform_init(&init_args); } void sev_hardware_unsetup(void) @@ -3084,6 +3025,8 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + sev_platform_shutdown(); } int sev_cpu_init(struct svm_cpu_data *sd) @@ -3129,7 +3072,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) * back to WBINVD if this faults so as not to make any problems worse * by leaving stale encrypted data in the cache. */ - if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) goto do_wbinvd; return; @@ -3183,9 +3126,14 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -3194,18 +3142,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. 
+ */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -3261,7 +3215,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ @@ -3276,11 +3230,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3420,8 +3369,7 @@ vmgexit_err: dump_ghcb(svm); } - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason); + svm_vmgexit_bad_input(svm, reason); /* Resume the guest to "return" the error code. */ return 1; @@ -3462,10 +3410,19 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) svm->sev_es.ghcb = NULL; } -void pre_sev_run(struct vcpu_svm *svm, int cpu) +int pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - unsigned int asid = sev_get_asid(svm->vcpu.kvm); + struct kvm *kvm = svm->vcpu.kvm; + unsigned int asid = sev_get_asid(kvm); + + /* + * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid + * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP + * AP Destroy event. 
+ */ + if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) + return -EINVAL; /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -3478,11 +3435,12 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) */ if (sd->sev_vmcbs[asid] == svm->vmcb && svm->vcpu.arch.last_vmentry_cpu == cpu) - return; + return 0; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + return 0; } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) @@ -3564,8 +3522,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return 0; e_scratch: - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); return 1; } @@ -3627,13 +3584,20 @@ static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) return 1; /* resume guest */ } - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); return 1; /* resume guest */ } vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = 1; vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) @@ -3658,7 +3622,14 @@ static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) svm->sev_es.psc_inflight = 0; svm->sev_es.psc_idx = 0; svm->sev_es.psc_2m = false; - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret); + + /* + * PSC requests always get a "no action" response in SW_EXITINFO1, with + * a PSC-specific return code in SW_EXITINFO2 that provides the "real" + * return code. E.g. if the PSC request was interrupted, the need to + * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. + */ + svm_vmgexit_no_action(svm, psc_ret); } static void __snp_complete_one_psc(struct vcpu_svm *svm) @@ -3710,7 +3681,7 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) bool huge; u64 gfn; - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) { + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); return 1; } @@ -3797,6 +3768,13 @@ next_range: case VMGEXIT_PSC_OP_SHARED: vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 
+ */ + vcpu->run->hypercall.ret = 0; vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE @@ -3820,113 +3798,93 @@ next_range: goto next_range; } - unreachable(); + BUG(); } -static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. + */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_memory_slot *slot; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + + if (!sev_snp_guest(vcpu->kvm)) + return; + + guard(mutex)(&svm->sev_es.snp_vmsa_mutex); - WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex)); + if (!svm->sev_es.snp_ap_waiting_for_reset) + return; + + svm->sev_es.snp_ap_waiting_for_reset = false; /* Mark the vCPU as offline and not runnable */ vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); /* Clear use of the VMSA */ svm->vmcb->control.vmsa_pa = INVALID_PAGE; - if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { - gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); - struct kvm_memory_slot *slot; - struct page *page; - kvm_pfn_t pfn; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot) - return -EINVAL; - - /* - * The new VMSA will be private memory guest memory, so - * retrieve the PFN from the gmem backend. - */ - if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) - return -EINVAL; - - /* - * From this point forward, the VMSA will always be a - * guest-mapped page rather than the initial one allocated - * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa - * could be free'd and cleaned up here, but that involves - * cleanups like wbinvd_on_all_cpus() which would ideally - * be handled during teardown rather than guest boot. - * Deferring that also allows the existing logic for SEV-ES - * VMSAs to be re-used with minimal SNP-specific changes. - */ - svm->sev_es.snp_has_guest_vmsa = true; - - /* Use the new VMSA */ - svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); - - /* Mark the vCPU as runnable */ - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - - svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - - /* - * gmem pages aren't currently migratable, but if this ever - * changes then care should be taken to ensure - * svm->sev_es.vmsa is pinned through some other means. - */ - kvm_release_page_clean(page); - } - /* * When replacing the VMSA during SEV-SNP AP creation, * mark the VMCB dirty so that full state is always reloaded. */ vmcb_mark_all_dirty(svm->vmcb); - return 0; -} + if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) + return; -/* - * Invoked as part of svm_vcpu_reset() processing of an init event. - */ -void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int ret; + gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - if (!sev_snp_guest(vcpu->kvm)) + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) return; - mutex_lock(&svm->sev_es.snp_vmsa_mutex); + /* + * The new VMSA will be private memory guest memory, so retrieve the + * PFN from the gmem backend. 
+ */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) + return; - if (!svm->sev_es.snp_ap_waiting_for_reset) - goto unlock; + /* + * From this point forward, the VMSA will always be a guest-mapped page + * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In + * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but + * that involves cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. Deferring that + * also allows the existing logic for SEV-ES VMSAs to be re-used with + * minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; - svm->sev_es.snp_ap_waiting_for_reset = false; + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); - ret = __sev_snp_update_protected_guest_state(vcpu); - if (ret) - vcpu_unimpl(vcpu, "snp: AP state update on init failed\n"); + /* Mark the vCPU as runnable */ + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); -unlock: - mutex_unlock(&svm->sev_es.snp_vmsa_mutex); + /* + * gmem pages aren't currently migratable, but if this ever changes + * then care should be taken to ensure svm->sev_es.vmsa is pinned + * through some other means. + */ + kvm_release_page_clean(page); } static int sev_snp_ap_creation(struct vcpu_svm *svm) { - struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct kvm_vcpu *vcpu = &svm->vcpu; struct kvm_vcpu *target_vcpu; struct vcpu_svm *target_svm; unsigned int request; unsigned int apic_id; - bool kick; - int ret; request = lower_32_bits(svm->vmcb->control.exit_info_1); apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); @@ -3939,47 +3897,23 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) return -EINVAL; } - ret = 0; - target_svm = to_svm(target_vcpu); - /* - * The target vCPU is valid, so the vCPU will be kicked unless the - * request is for CREATE_ON_INIT. For any errors at this stage, the - * kick will place the vCPU in an non-runnable state. 
- */ - kick = true; - - mutex_lock(&target_svm->sev_es.snp_vmsa_mutex); - - target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; - target_svm->sev_es.snp_ap_waiting_for_reset = true; - - /* Interrupt injection mode shouldn't change for AP creation */ - if (request < SVM_VMGEXIT_AP_DESTROY) { - u64 sev_features; - - sev_features = vcpu->arch.regs[VCPU_REGS_RAX]; - sev_features ^= sev->vmsa_features; - - if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) { - vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n", - vcpu->arch.regs[VCPU_REGS_RAX]); - ret = -EINVAL; - goto out; - } - } + guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); switch (request) { case SVM_VMGEXIT_AP_CREATE_ON_INIT: - kick = false; - fallthrough; case SVM_VMGEXIT_AP_CREATE: + if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { + vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); + return -EINVAL; + } + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", svm->vmcb->control.exit_info_2); - ret = -EINVAL; - goto out; + return -EINVAL; } /* @@ -3993,30 +3927,30 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) vcpu_unimpl(vcpu, "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", svm->vmcb->control.exit_info_2); - ret = -EINVAL; - goto out; + return -EINVAL; } target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; break; case SVM_VMGEXIT_AP_DESTROY: + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; break; default: vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", request); - ret = -EINVAL; - break; + return -EINVAL; } -out: - if (kick) { - kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - kvm_vcpu_kick(target_vcpu); - } + target_svm->sev_es.snp_ap_waiting_for_reset = true; - mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex); + /* + * Unless Creation is deferred until INIT, signal the vCPU to update + * its state. + */ + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - return ret; + return 0; } static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) @@ -4055,7 +3989,8 @@ static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_ goto out_unlock; } - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err)); + /* No action is requested *from KVM* if there was a firmware error. 
*/ + svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); ret = 1; /* resume guest */ @@ -4111,8 +4046,7 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r return snp_handle_guest_req(svm, req_gpa, resp_gpa); request_invalid: - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); return 1; /* resume guest */ } @@ -4120,7 +4054,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); u64 ghcb_info; int ret = 1; @@ -4304,8 +4238,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) if (ret) return ret; - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0); + svm_vmgexit_success(svm, 0); exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { @@ -4340,7 +4273,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); switch (control->exit_info_1) { case 0: @@ -4349,21 +4282,19 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) break; case 1: /* Get AP jump table address */ - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table); + svm_vmgexit_success(svm, sev->ap_jump_table); break; default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; break; } case SVM_VMGEXIT_HV_FEATURES: - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED); - + svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); ret = 1; break; case SVM_VMGEXIT_TERM_REQUEST: @@ -4384,8 +4315,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) case SVM_VMGEXIT_AP_CREATION: ret = sev_snp_ap_creation(svm); if (ret) { - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; @@ -4435,8 +4365,8 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); } @@ -4445,16 +4375,15 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if * the host/guest supports its use. * - * guest_can_use() checks a number of requirements on the host/guest to - * ensure that MSR_IA32_XSS is available, but it might report true even - * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host - * MSR_IA32_XSS is always properly restored. 
For SEV-ES, it is better - * to further check that the guest CPUID actually supports - * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved - * guests will still get intercepted and caught in the normal - * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths. + * KVM treats the guest as being capable of using XSAVES even if XSAVES + * isn't enabled in guest CPUID as there is no intercept for XSAVES, + * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is + * exposed to the guest and XSAVES is supported in hardware. Condition + * full XSS passthrough on the guest being able to use XSAVES *and* + * XSAVES being exposed to the guest so that KVM can at least honor + * guest CPUID for RDMSR and WRMSR. */ - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); else @@ -4477,6 +4406,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; @@ -4492,6 +4422,10 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; + /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); svm_clr_intercept(svm, INTERCEPT_CR4_READ); @@ -4552,7 +4486,7 @@ void sev_init_vmcb(struct vcpu_svm *svm) void sev_es_vcpu_reset(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); /* * Set the GHCB MSR value as per the GHCB specification when emulating @@ -4567,6 +4501,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: @@ -4590,14 +4526,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both - * saves and loads debug registers (Type-A). + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. 
*/ - if (sev_vcpu_has_debug_swap(svm)) { - hostsa->dr0 = native_get_debugreg(0); - hostsa->dr1 = native_get_debugreg(1); - hostsa->dr2 = native_get_debugreg(2); - hostsa->dr3 = native_get_debugreg(3); + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); @@ -4622,7 +4566,7 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) * Return from an AP Reset Hold VMGEXIT, where the guest will * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. */ - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + svm_vmgexit_success(svm, 1); break; case AP_RESET_HOLD_MSR_PROTO: /* @@ -4820,7 +4764,7 @@ static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); kvm_pfn_t pfn_aligned; gfn_t gfn_aligned; int level, rc; @@ -4942,3 +4886,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return level; } + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. + */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. 
+ */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) + return; + + free_page((unsigned long)vmsa); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 21dacd312779..ab9b947dbf4f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -28,8 +28,11 @@ #include <linux/rwsem.h> #include <linux/cc_platform.h> #include <linux/smp.h> +#include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -230,6 +233,8 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -248,6 +253,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. @@ -284,8 +291,6 @@ u32 svm_msrpm_offset(u32 msr) return MSR_INVALID; } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); - static int get_npt_level(void) { #ifdef CONFIG_X86_64 @@ -476,24 +481,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -567,7 +566,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier) if (multiplier == __this_cpu_read(current_tsc_ratio)) return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); } @@ -580,15 +579,15 @@ static inline void kvm_cpu_svm_disable(void) { uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and * NMI aren't blocked. 
*/ stgi(); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); } } @@ -617,7 +616,7 @@ static int svm_enable_virtualization_cpu(void) uint64_t efer; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; @@ -627,9 +626,9 @@ static int svm_enable_virtualization_cpu(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -650,13 +649,12 @@ static int svm_enable_virtualization_cpu(void) * erratum is present everywhere). */ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -1049,7 +1047,7 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || - (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); if (enable_lbrv == current_enable_lbrv) @@ -1187,14 +1185,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, */ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { if (!npt_enabled || - !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) + !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) svm_set_intercept(svm, INTERCEPT_INVPCID); else svm_clr_intercept(svm, INTERCEPT_INVPCID); } if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) svm_clr_intercept(svm, INTERCEPT_RDTSCP); else svm_set_intercept(svm, INTERCEPT_RDTSCP); @@ -1298,8 +1296,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_HLT); + if (!kvm_hlt_in_guest(vcpu->kvm)) { + if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) + svm_set_intercept(svm, INTERCEPT_IDLE_HLT); + else + svm_set_intercept(svm, INTERCEPT_HLT); + } control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); @@ -1372,6 +1374,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); @@ -1481,25 +1486,10 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) -{ - int i; - - for_each_online_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); -} - static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. 
- */ - svm_clear_current_vmcb(svm->vmcb); - svm_leave_nested(vcpu); svm_free_nested(svm); @@ -1509,6 +1499,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. + */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1541,6 +1588,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -1551,18 +1603,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; - - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -1921,9 +1964,6 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = vcpu->arch.cr4; - if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb_current(vcpu); - vcpu->arch.cr4 = cr4; if (!npt_enabled) { cr4 |= X86_CR4_PAE; @@ -1936,7 +1976,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1995,11 +2035,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void 
svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -2142,14 +2182,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2160,17 +2199,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2224,6 +2257,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -2864,7 +2901,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && - !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2940,7 +2977,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; msr_info->data = svm->virt_spec_ctrl; @@ -2977,11 +3014,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, - X86_TRAP_GP | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + svm_vmgexit_inject_exception(svm, X86_TRAP_GP); return 1; } @@ -3024,7 +3057,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -3046,7 +3079,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); @@ -3091,7 +3124,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; if (data & ~SPEC_CTRL_SSBD) @@ -3169,6 +3202,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, 
data); break; } + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -3263,7 +3306,7 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) unsigned long type; gva_t gva; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -3276,9 +3319,51 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; + /* + * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the + * stack segment is used. The intercept takes priority over all + * #GP checks except CPL>0, but somehow still generates a linear + * address? The APM is sorely lacking. + */ + if (is_noncanonical_address(gva, vcpu, 0)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3346,7 +3431,9 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, + [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -3361,14 +3448,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? 
"SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3406,6 +3500,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3476,6 +3581,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3508,10 +3670,14 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) return intr_interception(vcpu); - else if (exit_code == SVM_EXIT_HLT) + else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) return 
kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -3533,6 +3699,21 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } +static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, + u32 *error_code) +{ + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + *intr_info = control->event_inj; + + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->event_inj_err; + else + *error_code = 0; + +} + static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3576,7 +3757,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, exit_code); } -static void pre_svm_run(struct kvm_vcpu *vcpu) +static int pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -3598,6 +3779,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) /* FIXME: handle wraparound of asid_generation */ if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); + + return 0; } static void svm_inject_nmi(struct kvm_vcpu *vcpu) @@ -4105,20 +4288,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; - case SVM_EXITINTINFO_TYPE_EXEPT: + case SVM_EXITINTINFO_TYPE_EXEPT: { + u32 error_code = 0; + /* * Never re-inject a #VC exception. */ if (vector == X86_TRAP_VC) break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(vcpu, vector, err); + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) + error_code = svm->vmcb->control.exit_int_info_err; - } else - kvm_requeue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector, + exitintinfo & SVM_EXITINTINFO_VALID_ERR, + error_code); break; + } case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; @@ -4178,6 +4364,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. 
+ */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) @@ -4186,6 +4384,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } @@ -4220,7 +4420,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (force_immediate_exit) smp_send_reschedule(vcpu->cpu); - pre_svm_run(vcpu); + if (pre_svm_run(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; + vcpu->run->fail_entry.cpu = vcpu->cpu; + return EXIT_FASTPATH_EXIT_USERSPACE; + } sync_lapic_to_cr8(vcpu); @@ -4236,14 +4441,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4271,6 +4484,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); @@ -4392,27 +4609,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give * the guest read/write access to the host's XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) - kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); + guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES, + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)); /* * Intercept VMLOAD if the vCPU model is Intel in order to emulate that * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing * SVM on Intel is bonkers and extremely unlikely to work). 
*/ - if (!guest_cpuid_is_intel_compatible(vcpu)) - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); + if (guest_cpuid_is_intel_compatible(vcpu)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); svm_recalc_instruction_intercepts(vcpu, svm); @@ -4422,7 +4629,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, - !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); @@ -4673,7 +4880,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) * responsible for ensuring nested SVM and SMIs are mutually exclusive. */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 1; smram->smram64.svm_guest_flag = 1; @@ -4720,14 +4927,14 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) const struct kvm_smram_state_64 *smram64 = &smram->smram64; - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ if (!smram64->svm_guest_flag) return 0; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return 1; if (!(smram64->efer & EFER_SVME)) @@ -4790,9 +4997,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { + struct vcpu_svm *svm = to_svm(vcpu); bool smep, smap, is_user; u64 error_code; + /* Check that emulation is possible during event vectoring */ + if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) return X86EMUL_CONTINUE; @@ -4889,7 +5102,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * In addition, don't apply the erratum workaround if the #NPF occurred * while translating guest page tables (see below). 
*/ - error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + error_code = svm->vmcb->control.exit_info_1; if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) goto resume_guest; @@ -4953,6 +5166,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -4978,6 +5193,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } @@ -5036,6 +5252,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, @@ -5077,6 +5294,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, .get_exit_info = svm_get_exit_info, + .get_entry_info = svm_get_entry_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -5147,7 +5365,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; @@ -5221,6 +5439,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) @@ -5328,7 +5549,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("Nested Paging %sabled\n", npt_enabled ? 
"en" : "dis"); + pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5416,6 +5637,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 43fa6a16eb19..e6f3c6a153a0 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -98,6 +98,7 @@ struct kvm_sev_info { unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long policy; unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ @@ -114,6 +115,9 @@ struct kvm_sev_info { struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ }; +#define SEV_POLICY_NODBG BIT_ULL(0) +#define SNP_POLICY_DEBUG BIT_ULL(19) + struct kvm_svm { struct kvm kvm; @@ -169,6 +173,7 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; + u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; @@ -335,11 +340,11 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; + bool bp_spec_reduce_set; + struct vmcb *save_area; unsigned long save_area_pa; - struct vmcb *current_vmcb; - /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; }; @@ -358,39 +363,30 @@ static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm) return &to_kvm_svm(kvm)->sev_info; } +#ifdef CONFIG_KVM_AMD_SEV static __always_inline bool sev_guest(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->active; -#else - return false; -#endif + return to_kvm_sev_info(kvm)->active; } - static __always_inline bool sev_es_guest(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return sev->es_active && !WARN_ON_ONCE(!sev->active); -#else - return false; -#endif } static __always_inline bool sev_snp_guest(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && !WARN_ON_ONCE(!sev_es_guest(kvm)); +} #else - return false; +#define sev_guest(kvm) false +#define sev_es_guest(kvm) false +#define sev_snp_guest(kvm) false #endif -} static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) { @@ -502,7 +498,7 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) && + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); } @@ -554,7 +550,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { - return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) && + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) && (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); } @@ -588,10 +584,39 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) return false; } +static inline void 
svm_vmgexit_set_return_code(struct vcpu_svm *svm, + u64 response, u64 data) +{ + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, response); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, data); +} + +static inline void svm_vmgexit_inject_exception(struct vcpu_svm *svm, u8 vector) +{ + u64 data = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | vector; + + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_ISSUE_EXCEPTION, data); +} + +static inline void svm_vmgexit_bad_input(struct vcpu_svm *svm, u64 suberror) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_MALFORMED_INPUT, suberror); +} + +static inline void svm_vmgexit_success(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + +static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) extern bool dump_invalid_vmcb; @@ -722,7 +747,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ -void pre_sev_run(struct vcpu_svm *svm, int cpu); +int pre_sev_run(struct vcpu_svm *svm, int cpu); void sev_init_vmcb(struct vcpu_svm *svm); void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); @@ -763,6 +788,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { @@ -794,6 +821,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return 0; } +static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + return NULL; +} +static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} #endif /* vmenter.S */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 2ed80aea3bb1..0c61153b275f 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ - sti - 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ - sti - 1: vmrun %rax - -2: cli - +2: /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT |
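
The svm.h hunk above replaces open-coded pairs of ghcb_set_sw_exit_info_1()/ghcb_set_sw_exit_info_2() calls with a single svm_vmgexit_set_return_code() helper plus thin semantic wrappers (svm_vmgexit_success(), svm_vmgexit_bad_input(), svm_vmgexit_no_action(), svm_vmgexit_inject_exception()). The sketch below is a self-contained userspace model of that pattern, not the kernel code: the struct layout and numeric response codes are placeholders chosen for illustration.

#include <stdint.h>
#include <stdio.h>

struct fake_ghcb {
	uint64_t sw_exit_info_1;	/* response class */
	uint64_t sw_exit_info_2;	/* class-specific payload */
};

enum {
	RESP_NO_ACTION       = 0,	/* placeholder for GHCB_HV_RESP_NO_ACTION */
	RESP_MALFORMED_INPUT = 2,	/* placeholder for GHCB_HV_RESP_MALFORMED_INPUT */
};

/* One generic setter... */
static void vmgexit_set_return_code(struct fake_ghcb *g, uint64_t resp, uint64_t data)
{
	g->sw_exit_info_1 = resp;
	g->sw_exit_info_2 = data;
}

/* ...and thin wrappers so call sites name the intent instead of two field writes. */
static void vmgexit_success(struct fake_ghcb *g, uint64_t data)
{
	vmgexit_set_return_code(g, RESP_NO_ACTION, data);
}

static void vmgexit_bad_input(struct fake_ghcb *g, uint64_t suberror)
{
	vmgexit_set_return_code(g, RESP_MALFORMED_INPUT, suberror);
}

int main(void)
{
	struct fake_ghcb g = { 0 };

	vmgexit_success(&g, 0);
	vmgexit_bad_input(&g, 2 /* arbitrary "invalid input" suberror */);
	printf("info1=%llu info2=%llu\n",
	       (unsigned long long)g.sw_exit_info_1,
	       (unsigned long long)g.sw_exit_info_2);
	return 0;
}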
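
sev_decrypt_vmsa() in the sev.c hunk only decrypts a guest VMSA for dump_vmcb() when the guest's policy permits debugging, and the check is inverted between the two flavors: SNP policy carries a positive "debug allowed" bit (SNP_POLICY_DEBUG), while legacy SEV policy carries a negative "no debug" bit (SEV_POLICY_NODBG). Below is a minimal sketch of just that gate, using the bit positions from the svm.h hunk; the helper name and everything else is illustrative.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define SEV_POLICY_NODBG	(1ULL << 0)	/* legacy SEV: set => debugging forbidden */
#define SNP_POLICY_DEBUG	(1ULL << 19)	/* SNP: set => debugging allowed */

static bool vmsa_debug_allowed(bool is_snp, uint64_t policy)
{
	if (is_snp)
		return policy & SNP_POLICY_DEBUG;

	return !(policy & SEV_POLICY_NODBG);
}

int main(void)
{
	assert(vmsa_debug_allowed(true, SNP_POLICY_DEBUG));
	assert(!vmsa_debug_allowed(true, 0));
	assert(vmsa_debug_allowed(false, 0));
	assert(!vmsa_debug_allowed(false, SEV_POLICY_NODBG));
	return 0;
}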
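
The svm_srso_vm_init()/svm_srso_vm_destroy() hunk keeps MSR_ZEN4_BP_CFG's BP_SPEC_REDUCE bit set while at least one VM exists, using a lock-free counter for the common case and a spinlock only around the 0 <-> 1 transitions so that creating the first VM cannot race with the IPI that clears the bit after the last VM is destroyed. The following is a userspace model of that handshake (C11 atomics plus a pthread mutex standing in for the spinlock, and a print standing in for on_each_cpu()); it sketches the locking shape, not the kernel implementation.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t transition_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int nr_vms;

/* Stand-in for the on_each_cpu() IPI that clears BP_SPEC_REDUCE. */
static void clear_bit_on_all_cpus(void)
{
	puts("clear BP_SPEC_REDUCE on every CPU");
}

/* Equivalent of atomic_inc_not_zero(): bump the count unless it is zero. */
static bool inc_not_zero(atomic_int *v)
{
	int old = atomic_load(v);

	while (old) {
		if (atomic_compare_exchange_weak(v, &old, old + 1))
			return true;
	}
	return false;
}

static void srso_vm_init(void)
{
	/* Fast path: another VM already holds the count above zero. */
	if (inc_not_zero(&nr_vms))
		return;

	/*
	 * 0 -> 1 transition: take the lock so a concurrent "last VM died"
	 * teardown fully completes before this VM proceeds.
	 */
	pthread_mutex_lock(&transition_lock);
	atomic_fetch_add(&nr_vms, 1);
	pthread_mutex_unlock(&transition_lock);
}

static void srso_vm_destroy(void)
{
	/* Fast path: other VMs remain, nothing to clear. */
	if (atomic_fetch_sub(&nr_vms, 1) != 1)
		return;

	pthread_mutex_lock(&transition_lock);
	/* A new VM may have raced in and raised the count again; re-check. */
	if (!atomic_load(&nr_vms))
		clear_bit_on_all_cpus();
	pthread_mutex_unlock(&transition_lock);
}

int main(void)
{
	srso_vm_init();		/* first VM */
	srso_vm_init();		/* second VM, lock-free */
	srso_vm_destroy();
	srso_vm_destroy();	/* last VM: bit gets cleared everywhere */
	return 0;
}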
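
bus_lock_exit() and complete_userspace_buslock() in the svm.c hunk implement the bus-lock-threshold flow: exit to userspace with KVM_EXIT_X86_BUS_LOCK, remember the faulting linear RIP, and on re-entry arm a one-shot bus_lock_counter only if userspace left RIP untouched, so the guest retires the offending instruction exactly once instead of exiting forever. The model below captures just that RIP-comparison logic; the struct and field names are stand-ins, not KVM's.

#include <stdint.h>
#include <stdio.h>

struct model_vcpu {
	uint64_t rip;			/* current guest linear RIP */
	uint64_t cui_linear_rip;	/* RIP recorded when the exit was reported */
	uint32_t bus_lock_counter;	/* one-shot threshold counter in the VMCB */
};

/* Report the bus lock to userspace and remember where it happened. */
static void model_bus_lock_exit(struct model_vcpu *vcpu)
{
	vcpu->cui_linear_rip = vcpu->rip;
	printf("KVM_EXIT_X86_BUS_LOCK at rip=%#llx\n", (unsigned long long)vcpu->rip);
}

/*
 * Completion callback run when userspace re-enters the vCPU: only arm the
 * counter if RIP is unchanged, i.e. the guest is about to re-execute the
 * same bus-locking instruction.
 */
static void model_complete_buslock(struct model_vcpu *vcpu)
{
	if (vcpu->rip == vcpu->cui_linear_rip)
		vcpu->bus_lock_counter = 1;
}

int main(void)
{
	struct model_vcpu vcpu = { .rip = 0x401000 };

	model_bus_lock_exit(&vcpu);
	model_complete_buslock(&vcpu);
	printf("bus_lock_counter=%u\n", vcpu.bus_lock_counter);
	return 0;
}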
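
Finally, the vcpu-run hunk notes that hardware only context-switches DEBUGCTL when LBR virtualization is enabled, so KVM now writes the guest value before VMRUN and restores the host value afterwards, skipping the MSR write whenever the two already match. A compact model of that conditional save/restore, with write_debugctl() standing in for the real WRMSR and the numeric values chosen arbitrarily:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t hw_debugctl;	/* value the "CPU" currently holds */

/* Stand-in for wrmsr(MSR_IA32_DEBUGCTLMSR, ...). */
static void write_debugctl(uint64_t val)
{
	hw_debugctl = val;
	printf("wrmsr DEBUGCTL <- %#llx\n", (unsigned long long)val);
}

static void model_vcpu_run(bool lbrv_enabled, uint64_t host_debugctl,
			   uint64_t guest_debugctl)
{
	/* Hardware swaps DEBUGCTL itself only when LBR virtualization is on. */
	if (!lbrv_enabled && host_debugctl != guest_debugctl)
		write_debugctl(guest_debugctl);

	/* ... VMRUN and VM-Exit would happen here ... */

	if (!lbrv_enabled && host_debugctl != guest_debugctl)
		write_debugctl(host_debugctl);
}

int main(void)
{
	hw_debugctl = 0x4;			/* arbitrary host DEBUGCTL bits */
	model_vcpu_run(false, hw_debugctl, 0);	/* guest wants a clean DEBUGCTL */
	model_vcpu_run(true, hw_debugctl, 0);	/* LBRV on: no manual writes */
	return 0;
}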