Diffstat (limited to 'arch/x86/kvm/svm/svm.c')
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 456 |
1 file changed, 339 insertions, 117 deletions
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 21dacd312779..ab9b947dbf4f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -28,8 +28,11 @@ #include <linux/rwsem.h> #include <linux/cc_platform.h> #include <linux/smp.h> +#include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -230,6 +233,8 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -248,6 +253,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. @@ -284,8 +291,6 @@ u32 svm_msrpm_offset(u32 msr) return MSR_INVALID; } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); - static int get_npt_level(void) { #ifdef CONFIG_X86_64 @@ -476,24 +481,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -567,7 +566,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier) if (multiplier == __this_cpu_read(current_tsc_ratio)) return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); } @@ -580,15 +579,15 @@ static inline void kvm_cpu_svm_disable(void) { uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and * NMI aren't blocked. */ stgi(); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); } } @@ -617,7 +616,7 @@ static int svm_enable_virtualization_cpu(void) uint64_t efer; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; @@ -627,9 +626,9 @@ static int svm_enable_virtualization_cpu(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -650,13 +649,12 @@ static int svm_enable_virtualization_cpu(void) * erratum is present everywhere). 
*/ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -1049,7 +1047,7 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || - (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); if (enable_lbrv == current_enable_lbrv) @@ -1187,14 +1185,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, */ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { if (!npt_enabled || - !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) + !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) svm_set_intercept(svm, INTERCEPT_INVPCID); else svm_clr_intercept(svm, INTERCEPT_INVPCID); } if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) svm_clr_intercept(svm, INTERCEPT_RDTSCP); else svm_set_intercept(svm, INTERCEPT_RDTSCP); @@ -1298,8 +1296,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_HLT); + if (!kvm_hlt_in_guest(vcpu->kvm)) { + if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) + svm_set_intercept(svm, INTERCEPT_IDLE_HLT); + else + svm_set_intercept(svm, INTERCEPT_HLT); + } control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); @@ -1372,6 +1374,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); @@ -1481,25 +1486,10 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) -{ - int i; - - for_each_online_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); -} - static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. 
- */ - svm_clear_current_vmcb(svm->vmcb); - svm_leave_nested(vcpu); svm_free_nested(svm); @@ -1509,6 +1499,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. + */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1541,6 +1588,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -1551,18 +1603,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; - - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -1921,9 +1964,6 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = vcpu->arch.cr4; - if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb_current(vcpu); - vcpu->arch.cr4 = cr4; if (!npt_enabled) { cr4 |= X86_CR4_PAE; @@ -1936,7 +1976,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1995,11 +2035,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void 
svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -2142,14 +2182,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2160,17 +2199,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2224,6 +2257,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -2864,7 +2901,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && - !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2940,7 +2977,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; msr_info->data = svm->virt_spec_ctrl; @@ -2977,11 +3014,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, - X86_TRAP_GP | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + svm_vmgexit_inject_exception(svm, X86_TRAP_GP); return 1; } @@ -3024,7 +3057,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -3046,7 +3079,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); @@ -3091,7 +3124,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; if (data & ~SPEC_CTRL_SSBD) @@ -3169,6 +3202,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, 
data); break; } + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -3263,7 +3306,7 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) unsigned long type; gva_t gva; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -3276,9 +3319,51 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; + /* + * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the + * stack segment is used. The intercept takes priority over all + * #GP checks except CPL>0, but somehow still generates a linear + * address? The APM is sorely lacking. + */ + if (is_noncanonical_address(gva, vcpu, 0)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3346,7 +3431,9 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, + [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -3361,14 +3448,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? 
"SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3406,6 +3500,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3476,6 +3581,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3508,10 +3670,14 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) return intr_interception(vcpu); - else if (exit_code == SVM_EXIT_HLT) + else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) return 
kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -3533,6 +3699,21 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } +static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, + u32 *error_code) +{ + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + *intr_info = control->event_inj; + + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->event_inj_err; + else + *error_code = 0; + +} + static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3576,7 +3757,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, exit_code); } -static void pre_svm_run(struct kvm_vcpu *vcpu) +static int pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -3598,6 +3779,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) /* FIXME: handle wraparound of asid_generation */ if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); + + return 0; } static void svm_inject_nmi(struct kvm_vcpu *vcpu) @@ -4105,20 +4288,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; - case SVM_EXITINTINFO_TYPE_EXEPT: + case SVM_EXITINTINFO_TYPE_EXEPT: { + u32 error_code = 0; + /* * Never re-inject a #VC exception. */ if (vector == X86_TRAP_VC) break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(vcpu, vector, err); + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) + error_code = svm->vmcb->control.exit_int_info_err; - } else - kvm_requeue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector, + exitintinfo & SVM_EXITINTINFO_VALID_ERR, + error_code); break; + } case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; @@ -4178,6 +4364,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. 
+ */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) @@ -4186,6 +4384,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } @@ -4220,7 +4420,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (force_immediate_exit) smp_send_reschedule(vcpu->cpu); - pre_svm_run(vcpu); + if (pre_svm_run(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; + vcpu->run->fail_entry.cpu = vcpu->cpu; + return EXIT_FASTPATH_EXIT_USERSPACE; + } sync_lapic_to_cr8(vcpu); @@ -4236,14 +4441,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4271,6 +4484,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); @@ -4392,27 +4609,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give * the guest read/write access to the host's XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) - kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); + guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES, + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)); /* * Intercept VMLOAD if the vCPU model is Intel in order to emulate that * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing * SVM on Intel is bonkers and extremely unlikely to work). 
*/ - if (!guest_cpuid_is_intel_compatible(vcpu)) - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); + if (guest_cpuid_is_intel_compatible(vcpu)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); svm_recalc_instruction_intercepts(vcpu, svm); @@ -4422,7 +4629,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, - !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); @@ -4673,7 +4880,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) * responsible for ensuring nested SVM and SMIs are mutually exclusive. */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 1; smram->smram64.svm_guest_flag = 1; @@ -4720,14 +4927,14 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) const struct kvm_smram_state_64 *smram64 = &smram->smram64; - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ if (!smram64->svm_guest_flag) return 0; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return 1; if (!(smram64->efer & EFER_SVME)) @@ -4790,9 +4997,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { + struct vcpu_svm *svm = to_svm(vcpu); bool smep, smap, is_user; u64 error_code; + /* Check that emulation is possible during event vectoring */ + if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) return X86EMUL_CONTINUE; @@ -4889,7 +5102,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * In addition, don't apply the erratum workaround if the #NPF occurred * while translating guest page tables (see below). 
*/ - error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + error_code = svm->vmcb->control.exit_info_1; if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) goto resume_guest; @@ -4953,6 +5166,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -4978,6 +5193,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } @@ -5036,6 +5252,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, @@ -5077,6 +5294,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, .get_exit_info = svm_get_exit_info, + .get_entry_info = svm_get_entry_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -5147,7 +5365,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; @@ -5221,6 +5439,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) @@ -5328,7 +5549,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5416,6 +5637,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: |
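A note on the MSR accessor churn in the hunks above: rdmsrl()/wrmsrl() become rdmsrq()/wrmsrq(), and the _safe variants change calling convention, now returning the error code and passing the full 64-bit value through a pointer (writes take a single u64 instead of split low/high halves). The sketch below mirrors that "return an error, fill a u64 via pointer" convention from user space against the /dev/cpu/N/msr chardev; it is only an analogue for illustration (root plus the msr module are required), and MSR_EFER's index 0xC0000080 and its SVME bit 12 come from the architecture manuals, not from this patch.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Mirrors the new in-kernel convention: return an error code, hand the
 * 64-bit value back through a pointer.  Uses the /dev/cpu/N/msr chardev
 * (msr.ko), where the MSR index is the file offset and each MSR is 8 bytes.
 */
static int rdmsr_on_cpu0(uint32_t msr, uint64_t *val)
{
	int fd = open("/dev/cpu/0/msr", O_RDONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;

	ret = pread(fd, val, sizeof(*val), msr);
	close(fd);
	return ret == (ssize_t)sizeof(*val) ? 0 : -1;
}

int main(void)
{
	uint64_t efer;

	if (rdmsr_on_cpu0(0xC0000080, &efer)) {		/* MSR_EFER */
		perror("rdmsr(MSR_EFER)");
		return 1;
	}
	printf("EFER = %#llx, SVME = %llu\n",
	       (unsigned long long)efer, (unsigned long long)((efer >> 12) & 1));
	return 0;
}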
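The SRSO_BP_SPEC_REDUCE hunks keep the BP_SPEC_REDUCE bit in MSR_ZEN4_BP_CFG set while at least one VM exists and clear it on every CPU only when the last VM goes away, using an atomic VM count plus a spinlock that is taken only on the 0 <-> 1 transitions (the bit itself is set lazily, per CPU, the first time a vCPU is loaded there). Below is a minimal user-space analogue of that boundary-locked refcount idiom, with C11 atomics and a pthread mutex standing in for atomic_t, spinlock_t and the MSR writes, and with the enable folded into the 0 -> 1 transition to keep the sketch self-contained; it shows the pattern, not the kernel code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t boundary_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int nr_users;
static bool mitigation_on;	/* stands in for the per-CPU BP_SPEC_REDUCE bit */

static bool inc_not_zero(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 0) {
		/* CAS failure reloads 'old'; retry unless it dropped to zero. */
		if (atomic_compare_exchange_weak(v, &old, old + 1))
			return true;
	}
	return false;
}

static void vm_init(void)
{
	/* Fast path: some other VM already turned the mitigation on. */
	if (inc_not_zero(&nr_users))
		return;

	/*
	 * 0 -> 1 transition: serialize against a racing last-VM teardown so
	 * its "clear on all CPUs" step cannot land after we start running.
	 */
	pthread_mutex_lock(&boundary_lock);
	if (atomic_fetch_add(&nr_users, 1) == 0 && !mitigation_on) {
		mitigation_on = true;
		puts("mitigation enabled");
	}
	pthread_mutex_unlock(&boundary_lock);
}

static void vm_destroy(void)
{
	if (atomic_fetch_sub(&nr_users, 1) != 1)
		return;

	/* Possible 1 -> 0 transition: recheck under the lock, as the patch does. */
	pthread_mutex_lock(&boundary_lock);
	if (atomic_load(&nr_users) == 0 && mitigation_on) {
		mitigation_on = false;
		puts("mitigation cleared");
	}
	pthread_mutex_unlock(&boundary_lock);
}

int main(void)
{
	vm_init();	/* first VM: prints "mitigation enabled" */
	vm_init();	/* second VM: fast path, no state change */
	vm_destroy();
	vm_destroy();	/* last VM: prints "mitigation cleared" */
	return 0;
}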
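The INVPCID hunk now injects #GP when the descriptor address is non-canonical, since the intercept fires before the CPU's own canonicality check would. As a standalone illustration of the test itself, the helper below sign-extends from the top implemented virtual-address bit and compares with the original address; 48 bits (4-level paging) is assumed here, whereas the kernel helper also copes with LA57 and per-call flags.

#include <stdint.h>
#include <stdio.h>

/* Canonical iff bits [63:vaddr_bits-1] are all copies of bit (vaddr_bits-1). */
static int is_canonical(uint64_t addr, unsigned int vaddr_bits)
{
	uint64_t shifted = addr << (64 - vaddr_bits);

	/* Arithmetic right shift sign-extends from the top implemented bit. */
	return (uint64_t)((int64_t)shifted >> (64 - vaddr_bits)) == addr;
}

int main(void)
{
	printf("%d\n", is_canonical(0x00007fffffffffffULL, 48));	/* 1: top of lower half */
	printf("%d\n", is_canonical(0xffff800000000000ULL, 48));	/* 1: bottom of upper half */
	printf("%d\n", is_canonical(0x0000800000000000ULL, 48));	/* 0: hole -> #GP in the hunk above */
	return 0;
}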
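With INTERCEPT_BUSLOCK wired up to bus_lock_exit(), a guest bus lock now reaches userspace as KVM_EXIT_X86_BUS_LOCK with KVM_RUN_X86_BUS_LOCK set in kvm_run->flags; if userspace resumes without moving RIP, complete_userspace_buslock() programs bus_lock_counter = 1 so the guest can execute the offending instruction exactly once. A rough sketch of the VMM side follows, assuming a vCPU fd and its mmap()ed struct kvm_run were created elsewhere and that KVM_CAP_X86_BUS_LOCK_EXIT was enabled with KVM_BUS_LOCK_DETECTION_EXIT; only the exit dispatch is shown, the usual KVM_CREATE_VM/KVM_CREATE_VCPU setup is omitted.

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/*
 * Userspace half of the bus-lock contract: observe the exit, optionally
 * rate-limit the vCPU, and resume.  Leaving RIP alone tells KVM to let the
 * guest step past the bus-locking instruction on the next KVM_RUN.
 */
static int vcpu_loop(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;

		switch (run->exit_reason) {
		case KVM_EXIT_X86_BUS_LOCK:
			if (run->flags & KVM_RUN_X86_BUS_LOCK)
				fprintf(stderr, "guest bus lock detected\n");
			continue;	/* resume; KVM steps past the instruction */
		case KVM_EXIT_HLT:
			return 0;
		default:
			fprintf(stderr, "unhandled exit %u\n", run->exit_reason);
			return -1;
		}
	}
}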