summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/svm/svm.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/svm/svm.c')
-rw-r--r--arch/x86/kvm/svm/svm.c456
1 files changed, 339 insertions, 117 deletions
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 21dacd312779..ab9b947dbf4f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -28,8 +28,11 @@
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
#include <linux/smp.h>
+#include <linux/string_choices.h>
+#include <linux/mutex.h>
#include <asm/apic.h>
+#include <asm/msr.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
@@ -230,6 +233,8 @@ module_param(tsc_scaling, int, 0444);
static bool avic;
module_param(avic, bool, 0444);
+module_param(enable_device_posted_irqs, bool, 0444);
+
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -248,6 +253,8 @@ static unsigned long iopm_base;
DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+static DEFINE_MUTEX(vmcb_dump_mutex);
+
/*
* Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
* the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
@@ -284,8 +291,6 @@ u32 svm_msrpm_offset(u32 msr)
return MSR_INVALID;
}
-static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -476,24 +481,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu)
static void svm_init_erratum_383(void)
{
- u32 low, high;
- int err;
u64 val;
if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
return;
/* Use _safe variants to not break nested virtualization */
- val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
- if (err)
+ if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val))
return;
val |= (1ULL << 47);
- low = lower_32_bits(val);
- high = upper_32_bits(val);
-
- native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+ native_write_msr_safe(MSR_AMD64_DC_CFG, val);
erratum_383_found = true;
}
@@ -567,7 +566,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier)
if (multiplier == __this_cpu_read(current_tsc_ratio))
return;
- wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+ wrmsrq(MSR_AMD64_TSC_RATIO, multiplier);
__this_cpu_write(current_tsc_ratio, multiplier);
}
@@ -580,15 +579,15 @@ static inline void kvm_cpu_svm_disable(void)
{
uint64_t efer;
- wrmsrl(MSR_VM_HSAVE_PA, 0);
- rdmsrl(MSR_EFER, efer);
+ wrmsrq(MSR_VM_HSAVE_PA, 0);
+ rdmsrq(MSR_EFER, efer);
if (efer & EFER_SVME) {
/*
* Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
* NMI aren't blocked.
*/
stgi();
- wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ wrmsrq(MSR_EFER, efer & ~EFER_SVME);
}
}
@@ -617,7 +616,7 @@ static int svm_enable_virtualization_cpu(void)
uint64_t efer;
int me = raw_smp_processor_id();
- rdmsrl(MSR_EFER, efer);
+ rdmsrq(MSR_EFER, efer);
if (efer & EFER_SVME)
return -EBUSY;
@@ -627,9 +626,9 @@ static int svm_enable_virtualization_cpu(void)
sd->next_asid = sd->max_asid + 1;
sd->min_asid = max_sev_asid + 1;
- wrmsrl(MSR_EFER, efer | EFER_SVME);
+ wrmsrq(MSR_EFER, efer | EFER_SVME);
- wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
+ wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa);
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
/*
@@ -650,13 +649,12 @@ static int svm_enable_virtualization_cpu(void)
* erratum is present everywhere).
*/
if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
- uint64_t len, status = 0;
+ u64 len, status = 0;
int err;
- len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+ err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len);
if (!err)
- status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
- &err);
+ err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status);
if (err)
osvw_status = osvw_len = 0;
@@ -1049,7 +1047,7 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
- (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
if (enable_lbrv == current_enable_lbrv)
@@ -1187,14 +1185,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
*/
if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
if (!npt_enabled ||
- !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
+ !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID))
svm_set_intercept(svm, INTERCEPT_INVPCID);
else
svm_clr_intercept(svm, INTERCEPT_INVPCID);
}
if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
- if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
svm_clr_intercept(svm, INTERCEPT_RDTSCP);
else
svm_set_intercept(svm, INTERCEPT_RDTSCP);
@@ -1298,8 +1296,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_set_intercept(svm, INTERCEPT_MWAIT);
}
- if (!kvm_hlt_in_guest(vcpu->kvm))
- svm_set_intercept(svm, INTERCEPT_HLT);
+ if (!kvm_hlt_in_guest(vcpu->kvm)) {
+ if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT))
+ svm_set_intercept(svm, INTERCEPT_IDLE_HLT);
+ else
+ svm_set_intercept(svm, INTERCEPT_HLT);
+ }
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
@@ -1372,6 +1374,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
+ if (vcpu->kvm->arch.bus_lock_detection_enabled)
+ svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+
if (sev_guest(vcpu->kvm))
sev_init_vmcb(svm);
@@ -1481,25 +1486,10 @@ out:
return err;
}
-static void svm_clear_current_vmcb(struct vmcb *vmcb)
-{
- int i;
-
- for_each_online_cpu(i)
- cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
-}
-
static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * The vmcb page can be recycled, causing a false negative in
- * svm_vcpu_load(). So, ensure that no logical CPU has this
- * vmcb page recorded as its current vmcb.
- */
- svm_clear_current_vmcb(svm->vmcb);
-
svm_leave_nested(vcpu);
svm_free_nested(svm);
@@ -1509,6 +1499,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_SPINLOCK(srso_lock);
+static atomic_t srso_nr_vms;
+
+static void svm_srso_clear_bp_spec_reduce(void *ign)
+{
+ struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
+
+ if (!sd->bp_spec_reduce_set)
+ return;
+
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ sd->bp_spec_reduce_set = false;
+}
+
+static void svm_srso_vm_destroy(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ if (atomic_dec_return(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ /*
+ * Verify a new VM didn't come along, acquire the lock, and increment
+ * the count before this task acquired the lock.
+ */
+ if (atomic_read(&srso_nr_vms))
+ return;
+
+ on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
+}
+
+static void svm_srso_vm_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ /*
+ * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
+ * transition, i.e. destroying the last VM, is fully complete, e.g. so
+ * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
+ */
+ if (atomic_inc_not_zero(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ atomic_inc(&srso_nr_vms);
+}
+#else
+static void svm_srso_vm_init(void) { }
+static void svm_srso_vm_destroy(void) { }
+#endif
+
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1541,6 +1588,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
+ !sd->bp_spec_reduce_set) {
+ sd->bp_spec_reduce_set = true;
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ }
svm->guest_state_loaded = true;
}
@@ -1551,18 +1603,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
-
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
- if (sd->current_vmcb != svm->vmcb) {
- sd->current_vmcb = svm->vmcb;
-
- if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
- indirect_branch_prediction_barrier();
- }
if (kvm_vcpu_apicv_active(vcpu))
avic_vcpu_load(vcpu, cpu);
}
@@ -1921,9 +1964,6 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = vcpu->arch.cr4;
- if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- svm_flush_tlb_current(vcpu);
-
vcpu->arch.cr4 = cr4;
if (!npt_enabled) {
cr4 |= X86_CR4_PAE;
@@ -1936,7 +1976,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1995,11 +2035,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -2142,14 +2182,13 @@ static int ac_interception(struct kvm_vcpu *vcpu)
static bool is_erratum_383(void)
{
- int err, i;
+ int i;
u64 value;
if (!erratum_383_found)
return false;
- value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
- if (err)
+ if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value))
return false;
/* Bit 62 may or may not be set for this mce */
@@ -2160,17 +2199,11 @@ static bool is_erratum_383(void)
/* Clear MCi_STATUS registers */
for (i = 0; i < 6; ++i)
- native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
-
- value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
- if (!err) {
- u32 low, high;
+ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0);
+ if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) {
value &= ~(1ULL << 2);
- low = lower_32_bits(value);
- high = upper_32_bits(value);
-
- native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+ native_write_msr_safe(MSR_IA32_MCG_STATUS, value);
}
/* Flush tlb to evict multi-match entries */
@@ -2224,6 +2257,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
*/
if (!sev_es_guest(vcpu->kvm)) {
clear_page(svm->vmcb);
+#ifdef CONFIG_KVM_SMM
+ if (is_smm(vcpu))
+ kvm_smm_changed(vcpu, false);
+#endif
kvm_vcpu_reset(vcpu, true);
}
@@ -2864,7 +2901,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_AMD64_TSC_RATIO:
if (!msr_info->host_initiated &&
- !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR))
return 1;
msr_info->data = svm->tsc_ratio_msr;
break;
@@ -2940,7 +2977,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
return 1;
msr_info->data = svm->virt_spec_ctrl;
@@ -2977,11 +3014,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
return kvm_complete_insn_gp(vcpu, err);
- ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
- X86_TRAP_GP |
- SVM_EVTINJ_TYPE_EXEPT |
- SVM_EVTINJ_VALID);
+ svm_vmgexit_inject_exception(svm, X86_TRAP_GP);
return 1;
}
@@ -3024,7 +3057,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
- if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) {
if (!msr->host_initiated)
return 1;
@@ -3046,7 +3079,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->tsc_ratio_msr = data;
- if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
is_guest_mode(vcpu))
nested_svm_update_tsc_ratio_msr(vcpu);
@@ -3091,7 +3124,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
return 1;
if (data & ~SPEC_CTRL_SSBD)
@@ -3169,6 +3202,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
}
+
+ /*
+ * Suppress BTF as KVM doesn't virtualize BTF, but there's no
+ * way to communicate lack of support to the guest.
+ */
+ if (data & DEBUGCTLMSR_BTF) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ data &= ~DEBUGCTLMSR_BTF;
+ }
+
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
@@ -3263,7 +3306,7 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
unsigned long type;
gva_t gva;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -3276,9 +3319,51 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
type = svm->vmcb->control.exit_info_2;
gva = svm->vmcb->control.exit_info_1;
+ /*
+ * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the
+ * stack segment is used. The intercept takes priority over all
+ * #GP checks except CPL>0, but somehow still generates a linear
+ * address? The APM is sorely lacking.
+ */
+ if (is_noncanonical_address(gva, vcpu, 0)) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+
return kvm_handle_invpcid(vcpu, type, gva);
}
+static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
+ * execute the bus-locking instruction. Set the bus lock counter to '1'
+ * to effectively step past the bus lock.
+ */
+ if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
+ svm->vmcb->control.bus_lock_counter = 1;
+
+ return 1;
+}
+
+static int bus_lock_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_userspace_buslock;
+
+ if (is_guest_mode(vcpu))
+ svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;
+
+ return 0;
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3346,7 +3431,9 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
[SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
[SVM_EXIT_INVPCID] = invpcid_interception,
+ [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
[SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_BUS_LOCK] = bus_lock_exit,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -3361,14 +3448,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
+ char *vm_type;
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
return;
}
- pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
- svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
+ guard(mutex)(&vmcb_dump_mutex);
+
+ vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
+ sev_es_guest(vcpu->kvm) ? "SEV-ES" :
+ sev_guest(vcpu->kvm) ? "SEV" : "SVM";
+
+ pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
+ vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@@ -3406,6 +3500,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
+ pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
+ pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ save = sev_decrypt_vmsa(vcpu);
+ if (!save)
+ goto no_vmsa;
+
+ save01 = save;
+ }
+
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
@@ -3476,6 +3581,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-15s %016llx %-13s %016llx\n",
"excp_from:", save->last_excp_from,
"excp_to:", save->last_excp_to);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;
+
+ pr_err("%-15s %016llx\n",
+ "sev_features", vmsa->sev_features);
+
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rax:", vmsa->rax, "rbx:", vmsa->rbx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r8:", vmsa->r8, "r9:", vmsa->r9);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r10:", vmsa->r10, "r11:", vmsa->r11);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r12:", vmsa->r12, "r13:", vmsa->r13);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r14:", vmsa->r14, "r15:", vmsa->r15);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
+ } else {
+ pr_err("%-15s %016llx %-13s %016lx\n",
+ "rax:", save->rax, "rbx:",
+ vcpu->arch.regs[VCPU_REGS_RBX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
+ "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
+ "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
+ pr_err("%-15s %016lx %-13s %016llx\n",
+ "rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
+ "rsp:", save->rsp);
+#ifdef CONFIG_X86_64
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r8:", vcpu->arch.regs[VCPU_REGS_R8],
+ "r9:", vcpu->arch.regs[VCPU_REGS_R9]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r10:", vcpu->arch.regs[VCPU_REGS_R10],
+ "r11:", vcpu->arch.regs[VCPU_REGS_R11]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r12:", vcpu->arch.regs[VCPU_REGS_R12],
+ "r13:", vcpu->arch.regs[VCPU_REGS_R13]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r14:", vcpu->arch.regs[VCPU_REGS_R14],
+ "r15:", vcpu->arch.regs[VCPU_REGS_R15]);
+#endif
+ }
+
+no_vmsa:
+ if (sev_es_guest(vcpu->kvm))
+ sev_free_decrypted_vmsa(vcpu, save);
}
static bool svm_check_exit_valid(u64 exit_code)
@@ -3508,10 +3670,14 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
return interrupt_window_interception(vcpu);
else if (exit_code == SVM_EXIT_INTR)
return intr_interception(vcpu);
- else if (exit_code == SVM_EXIT_HLT)
+ else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT)
return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
return npf_interception(vcpu);
+#ifdef CONFIG_KVM_AMD_SEV
+ else if (exit_code == SVM_EXIT_VMGEXIT)
+ return sev_handle_vmgexit(vcpu);
+#endif
#endif
return svm_exit_handlers[exit_code](vcpu);
}
@@ -3533,6 +3699,21 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
*error_code = 0;
}
+static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info,
+ u32 *error_code)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *intr_info = control->event_inj;
+
+ if ((*intr_info & SVM_EXITINTINFO_VALID) &&
+ (*intr_info & SVM_EXITINTINFO_VALID_ERR))
+ *error_code = control->event_inj_err;
+ else
+ *error_code = 0;
+
+}
+
static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3576,7 +3757,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return svm_invoke_exit_handler(vcpu, exit_code);
}
-static void pre_svm_run(struct kvm_vcpu *vcpu)
+static int pre_svm_run(struct kvm_vcpu *vcpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3598,6 +3779,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu)
/* FIXME: handle wraparound of asid_generation */
if (svm->current_vmcb->asid_generation != sd->asid_generation)
new_asid(svm, sd);
+
+ return 0;
}
static void svm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4105,20 +4288,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
vcpu->arch.nmi_injected = true;
svm->nmi_l1_to_l2 = nmi_l1_to_l2;
break;
- case SVM_EXITINTINFO_TYPE_EXEPT:
+ case SVM_EXITINTINFO_TYPE_EXEPT: {
+ u32 error_code = 0;
+
/*
* Never re-inject a #VC exception.
*/
if (vector == X86_TRAP_VC)
break;
- if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
- u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_requeue_exception_e(vcpu, vector, err);
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR)
+ error_code = svm->vmcb->control.exit_int_info_err;
- } else
- kvm_requeue_exception(vcpu, vector);
+ kvm_requeue_exception(vcpu, vector,
+ exitintinfo & SVM_EXITINTINFO_VALID_ERR,
+ error_code);
break;
+ }
case SVM_EXITINTINFO_TYPE_INTR:
kvm_queue_interrupt(vcpu, vector, false);
break;
@@ -4178,6 +4364,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_enter_irqoff();
+ /*
+ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+ * VMRUN controls whether or not physical IRQs are masked (KVM always
+ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
+ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+ * into guest state if delivery of an event during VMRUN triggers a
+ * #VMEXIT, and the guest_state transitions already tell lockdep that
+ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
+ * this path, so IRQs aren't actually unmasked while running host code.
+ */
+ raw_local_irq_enable();
+
amd_clear_divider();
if (sev_es_guest(vcpu->kvm))
@@ -4186,6 +4384,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
else
__svm_vcpu_run(svm, spec_ctrl_intercepted);
+ raw_local_irq_disable();
+
guest_state_exit_irqoff();
}
@@ -4220,7 +4420,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
if (force_immediate_exit)
smp_send_reschedule(vcpu->cpu);
- pre_svm_run(vcpu);
+ if (pre_svm_run(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR;
+ vcpu->run->fail_entry.cpu = vcpu->cpu;
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+ }
sync_lapic_to_cr8(vcpu);
@@ -4236,14 +4441,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
+ /*
+ * Hardware only context switches DEBUGCTL if LBR virtualization is
+ * enabled. Manually load DEBUGCTL if necessary (and restore it after
+ * VM-Exit), as running with the host's DEBUGCTL can negatively affect
+ * guest state and can even be fatal, e.g. due to Bus Lock Detect.
+ */
+ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+ update_debugctlmsr(svm->vmcb->save.dbgctl);
+
kvm_wait_lapic_expire(vcpu);
/*
@@ -4271,6 +4484,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
+
kvm_load_host_xsave_state(vcpu);
stgi();
@@ -4392,27 +4609,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
* the guest read/write access to the host's XSS.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVES) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
- kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
-
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
+ guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES,
+ boot_cpu_has(X86_FEATURE_XSAVES) &&
+ guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE));
/*
* Intercept VMLOAD if the vCPU model is Intel in order to emulate that
* VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
* SVM on Intel is bonkers and extremely unlikely to work).
*/
- if (!guest_cpuid_is_intel_compatible(vcpu))
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
-
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
+ if (guest_cpuid_is_intel_compatible(vcpu))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
svm_recalc_instruction_intercepts(vcpu, svm);
@@ -4422,7 +4629,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
- !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+ !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
if (sev_guest(vcpu->kvm))
sev_vcpu_after_set_cpuid(svm);
@@ -4673,7 +4880,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
* responsible for ensuring nested SVM and SMIs are mutually exclusive.
*/
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return 1;
smram->smram64.svm_guest_flag = 1;
@@ -4720,14 +4927,14 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
const struct kvm_smram_state_64 *smram64 = &smram->smram64;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return 0;
/* Non-zero if SMI arrived while vCPU was in guest mode. */
if (!smram64->svm_guest_flag)
return 0;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
return 1;
if (!(smram64->efer & EFER_SVME))
@@ -4790,9 +4997,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
bool smep, smap, is_user;
u64 error_code;
+ /* Check that emulation is possible during event vectoring */
+ if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
/* Emulation is always possible when KVM has access to all guest state. */
if (!sev_guest(vcpu->kvm))
return X86EMUL_CONTINUE;
@@ -4889,7 +5102,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* In addition, don't apply the erratum workaround if the #NPF occurred
* while translating guest page tables (see below).
*/
- error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+ error_code = svm->vmcb->control.exit_info_1;
if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
goto resume_guest;
@@ -4953,6 +5166,8 @@ static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
sev_vm_destroy(kvm);
+
+ svm_srso_vm_destroy();
}
static int svm_vm_init(struct kvm *kvm)
@@ -4978,6 +5193,7 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ svm_srso_vm_init();
return 0;
}
@@ -5036,6 +5252,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
+ .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
@@ -5077,6 +5294,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
.get_exit_info = svm_get_exit_info,
+ .get_entry_info = svm_get_entry_info,
.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
@@ -5147,7 +5365,7 @@ static __init void svm_adjust_mmio_mask(void)
return;
/* If memory encryption is not enabled, use existing mask */
- rdmsrl(MSR_AMD64_SYSCFG, msr);
+ rdmsrq(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
@@ -5221,6 +5439,9 @@ static __init void svm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
+ if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
+ kvm_caps.has_bus_lock_exit = true;
+
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
@@ -5328,7 +5549,7 @@ static __init int svm_hardware_setup(void)
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
- pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled));
/* Setup shadow_me_value and shadow_me_mask */
kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
@@ -5416,6 +5637,7 @@ static __init int svm_hardware_setup(void)
*/
allow_smaller_maxphyaddr = !npt_enabled;
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;
return 0;
err: