Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--	arch/x86/kvm/x86.c	135
1 file changed, 100 insertions(+), 35 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5bdb5b854924..b58a74c1722d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -90,7 +90,6 @@
#include "trace.h"
#define MAX_IO_MSRS 256
-#define KVM_MAX_MCE_BANKS 32
/*
* Note, kvm_caps fields should *never* have default values, all fields must be
@@ -227,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
+bool __read_mostly enable_device_posted_irqs = true;
+EXPORT_SYMBOL_GPL(enable_device_posted_irqs);
+
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -636,6 +638,15 @@ static void kvm_user_return_msr_cpu_online(void)
}
}
+static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
+{
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
+ }
+}
+
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
@@ -649,15 +660,20 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
return 1;
msrs->values[slot].curr = value;
- if (!msrs->registered) {
- msrs->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&msrs->urn);
- msrs->registered = true;
- }
+ kvm_user_return_register_notifier(msrs);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
+void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
+{
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+
+ msrs->values[slot].curr = value;
+ kvm_user_return_register_notifier(msrs);
+}
+EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache);
+
static void drop_user_return_notifiers(void)
{
struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
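The hunks above split notifier registration into kvm_user_return_register_notifier() so the new kvm_user_return_msr_update_cache() can refresh the cached value of a user-return MSR without issuing a WRMSR, for the case where the MSR has already been changed on this CPU outside of KVM and only the restore-on-return bookkeeping is needed. A minimal, hypothetical caller (example_slot and the surrounding setup are illustrative assumptions, not part of this patch) could look like:

	/*
	 * Hypothetical vendor-side sketch: something outside KVM already wrote
	 * new_value into the MSR tracked at example_slot (reserved earlier via
	 * kvm_add_user_return_msr()), so only update the cache and arm the
	 * user-return notifier to restore the host value later.
	 */
	static void example_note_msr_clobbered(unsigned int example_slot, u64 new_value)
	{
		kvm_user_return_msr_update_cache(example_slot, new_value);
	}

By contrast, kvm_set_user_return_msr() still performs the MSR write itself and bails out (returns 1) before updating the cache if that write fails.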
@@ -4739,6 +4755,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
+ if (kvm)
+ r = kvm->max_vcpus;
break;
case KVM_CAP_MAX_VCPU_ID:
r = KVM_MAX_VCPU_IDS;
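With the change above, a VM-scoped KVM_CHECK_EXTENSION now reports the per-VM limit kvm->max_vcpus rather than the architectural KVM_MAX_VCPUS, which matters for VM types that cap the vCPU count below the global maximum. A userspace sketch (vm_fd is an assumed, already-created VM descriptor):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Query the effective vCPU limit of this specific VM. */
	static int query_max_vcpus(int vm_fd)
	{
		return ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
	}

The system-scoped query on /dev/kvm (kvm == NULL in the switch above) is unchanged and still returns KVM_MAX_VCPUS.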
@@ -4794,7 +4812,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
break;
case KVM_CAP_DISABLE_QUIRKS2:
- r = KVM_X86_VALID_QUIRKS;
+ r = kvm_caps.supported_quirks;
break;
case KVM_CAP_X86_NOTIFY_VMEXIT:
r = kvm_caps.has_notify_vmexit;
@@ -4975,6 +4993,8 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
+static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu);
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -4997,6 +5017,19 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_call(vcpu_load)(vcpu, cpu);
+ if (vcpu != per_cpu(last_vcpu, cpu)) {
+ /*
+ * Flush the branch predictor when switching vCPUs on the same
+ * physical CPU, as each vCPU needs its own branch prediction
+ * domain. No IBPB is needed when switching between L1 and L2
+ * on the same vCPU unless IBRS is advertised to the vCPU; that
+ * is handled on the nested VM-Exit path.
+ */
+ if (static_branch_likely(&switch_vcpu_ibpb))
+ indirect_branch_prediction_barrier();
+ per_cpu(last_vcpu, cpu) = vcpu;
+ }
+
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
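The vcpu_load() hunk above introduces a per-CPU last_vcpu cache so an IBPB is issued only when a physical CPU switches from one vCPU to another (and only while the switch_vcpu_ibpb static key is enabled), keeping the common case of reloading the same vCPU barrier-free. A standalone model of that pattern, with all names invented for illustration:

	/* Not kernel code: each slot remembers the last identity it ran and the
	 * expensive barrier fires only when that identity changes. */
	struct example_cpu_slot {
		const void *last;
	};

	static void example_load(struct example_cpu_slot *slot, const void *vcpu,
				 void (*prediction_barrier)(void))
	{
		if (slot->last != vcpu) {
			prediction_barrier();	/* IBPB in the real code */
			slot->last = vcpu;
		}
	}

As the comment in the patch notes, L1/L2 transitions on the same vCPU are deliberately excluded here; any IBPB needed for that case is handled on the nested VM-Exit path.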
@@ -5117,6 +5150,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
+ if (vcpu->arch.apic->guest_apic_protected)
+ return -EINVAL;
+
kvm_x86_call(sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
@@ -5127,6 +5163,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
{
int r;
+ if (vcpu->arch.apic->guest_apic_protected)
+ return -EINVAL;
+
r = kvm_apic_set_state(vcpu, s);
if (r)
return r;
@@ -6304,6 +6343,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_DEVICE_ATTR:
r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
break;
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
+ goto out;
+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
+ break;
default:
r = -EINVAL;
}
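This hunk wires up KVM_MEMORY_ENCRYPT_OP as a vCPU ioctl in addition to the existing VM-scoped one, returning -ENOTTY when the vendor module provides no vcpu_mem_enc_ioctl hook. A userspace sketch (vcpu_fd and the opaque vendor_cmd pointer are assumptions; the actual argument layout is defined by the vendor module):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Issue a vendor-specific memory-encryption operation on one vCPU.
	 * Fails with errno == ENOTTY if there is no per-vCPU hook. */
	static int vcpu_mem_enc_op(int vcpu_fd, void *vendor_cmd)
	{
		return ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, vendor_cmd);
	}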
@@ -6491,7 +6536,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
struct kvm_vcpu *vcpu;
unsigned long i;
- if (!kvm_x86_ops.cpu_dirty_log_size)
+ if (!kvm->arch.cpu_dirty_log_size)
return;
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -6521,11 +6566,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
switch (cap->cap) {
case KVM_CAP_DISABLE_QUIRKS2:
r = -EINVAL;
- if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+ if (cap->args[0] & ~kvm_caps.supported_quirks)
break;
fallthrough;
case KVM_CAP_DISABLE_QUIRKS:
- kvm->arch.disabled_quirks = cap->args[0];
+ kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
r = 0;
break;
case KVM_CAP_SPLIT_IRQCHIP: {
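The KVM_CAP_DISABLE_QUIRKS2 handling above now validates the requested mask against kvm_caps.supported_quirks and ORs it into kvm->arch.disabled_quirks instead of overwriting it, so quirks already disabled by default for a VM type are preserved. A userspace sketch (vm_fd and quirk_mask are assumptions):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Disable additional quirks for a VM; any bit outside the supported set
	 * makes the ioctl fail with EINVAL. */
	static int disable_quirks(int vm_fd, __u64 quirk_mask)
	{
		struct kvm_enable_cap cap;

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_DISABLE_QUIRKS2;
		cap.args[0] = quirk_mask;
		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}

The supported mask itself is advertised by KVM_CHECK_EXTENSION(KVM_CAP_DISABLE_QUIRKS2), which the earlier hunk switches from KVM_X86_VALID_QUIRKS to kvm_caps.supported_quirks.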
@@ -7299,14 +7344,13 @@ set_pit2_out:
r = READ_ONCE(kvm->arch.default_tsc_khz);
goto out;
}
- case KVM_MEMORY_ENCRYPT_OP: {
+ case KVM_MEMORY_ENCRYPT_OP:
r = -ENOTTY;
if (!kvm_x86_ops.mem_enc_ioctl)
goto out;
r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
break;
- }
case KVM_MEMORY_ENCRYPT_REG_REGION: {
struct kvm_enc_region region;
@@ -8000,7 +8044,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
return rc;
if (!vcpu->mmio_nr_fragments)
- return rc;
+ return X86EMUL_CONTINUE;
gpa = vcpu->mmio_fragments[0].gpa;
@@ -9338,7 +9382,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
return 1;
return kvm_skip_emulated_instruction(vcpu);
@@ -9363,7 +9407,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
complete_fast_pio_out_port_0x7e;
kvm_skip_emulated_instruction(vcpu);
} else {
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
return 0;
@@ -9376,7 +9420,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
vcpu->arch.pio.count = 0;
return 1;
}
@@ -9405,7 +9449,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -9771,6 +9815,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
}
+ kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS;
+ kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS;
rdmsrq_safe(MSR_EFER, &kvm_host.efer);
@@ -9786,6 +9832,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (r != 0)
goto out_mmu_exit;
+ enable_device_posted_irqs &= enable_apicv &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+
kvm_ops_update(ops);
for_each_online_cpu(cpu) {
@@ -9815,6 +9864,10 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
+ /* KVM always ignores guest PAT for shadow paging. */
+ if (!tdp_enabled)
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
@@ -10023,13 +10076,16 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
- unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3,
- int op_64_bit, int cpl,
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
int (*complete_hypercall)(struct kvm_vcpu *))
{
unsigned long ret;
+ unsigned long nr = kvm_rax_read(vcpu);
+ unsigned long a0 = kvm_rbx_read(vcpu);
+ unsigned long a1 = kvm_rcx_read(vcpu);
+ unsigned long a2 = kvm_rdx_read(vcpu);
+ unsigned long a3 = kvm_rsi_read(vcpu);
+ int op_64_bit = is_64_bit_hypercall(vcpu);
++vcpu->stat.hypercalls;
@@ -10132,9 +10188,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
if (kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
- return __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi,
- is_64_bit_hypercall(vcpu),
- kvm_x86_call(get_cpl)(vcpu),
+ return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu),
complete_hypercall_exit);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
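After this refactor, ____kvm_emulate_hypercall() reads the hypercall number and arguments from the vCPU's GPRs itself (RAX for the number, RBX/RCX/RDX/RSI for a0..a3) instead of having every caller pass them in. For reference, a guest-side sketch of that calling convention (a one-argument variant; Intel VMCALL is hardcoded here, whereas real guests patch in VMMCALL on AMD):

	/* Guest code, not part of this patch: nr in RAX, first argument in RBX,
	 * return value comes back in RAX. */
	static inline long example_kvm_hypercall1(unsigned long nr, unsigned long a0)
	{
		long ret;

		asm volatile("vmcall"
			     : "=a"(ret)
			     : "a"(nr), "b"(a0)
			     : "memory");
		return ret;
	}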
@@ -10664,6 +10718,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
return;
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+ vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
kvm_x86_call(sync_pir_to_irr)(vcpu);
@@ -10978,7 +11033,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_fpu.xfd_err)
wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
- if (unlikely(vcpu->arch.switch_db_regs)) {
+ if (unlikely(vcpu->arch.switch_db_regs &&
+ !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -11030,6 +11086,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
kvm_x86_call(sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
@@ -11134,7 +11191,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
!vcpu->arch.apf.halted);
}
-static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
if (!list_empty_careful(&vcpu->async_pf.done))
return true;
@@ -11143,9 +11200,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_apic_init_sipi_allowed(vcpu))
return true;
- if (vcpu->arch.pv.pv_unhalted)
- return true;
-
if (kvm_is_exception_pending(vcpu))
return true;
@@ -11183,10 +11237,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
return false;
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_has_events);
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+ return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
+ kvm_vcpu_has_events(vcpu);
}
/* Called within kvm->srcu read side. */
@@ -11320,7 +11376,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
*/
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
- if (kvm_vcpu_has_events(vcpu))
+ if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
state = KVM_MP_STATE_RUNNABLE;
kvm_set_mp_state(vcpu, state);
return 1;
@@ -12388,13 +12444,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- int idx;
+ int idx, cpu;
kvm_clear_async_pf_completion_queue(vcpu);
kvm_mmu_unload(vcpu);
kvmclock_reset(vcpu);
+ for_each_possible_cpu(cpu)
+ cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL);
+
kvm_x86_call(vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
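kvm_arch_vcpu_destroy() now also scrubs the per-CPU last_vcpu cache added earlier: cmpxchg() clears a slot only if it still points at the vCPU being freed, so slots caching other vCPUs are left alone and no stale pointer survives the free. A standalone model of that clear-if-still-cached pattern using C11 atomics, with invented names:

	#include <stdatomic.h>
	#include <stddef.h>

	/* Not kernel code: clear every cache slot that still refers to the dying
	 * object, leaving unrelated slots untouched. */
	static void example_forget(_Atomic(void *) *slots, int nr_slots, void *dying)
	{
		for (int i = 0; i < nr_slots; i++) {
			void *expected = dying;

			atomic_compare_exchange_strong(&slots[i], &expected, NULL);
		}
	}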
@@ -12694,6 +12753,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
{
@@ -12723,6 +12783,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* Decided by the vendor code for other VM types. */
kvm->arch.pre_fault_allowed =
type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
+ kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;
ret = kvm_page_track_init(kvm);
if (ret)
@@ -12876,6 +12937,7 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
kvm_free_pit(kvm);
kvm_mmu_pre_destroy_vm(kvm);
+ static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -13073,7 +13135,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
int nr_slots;
- if (!kvm_x86_ops.cpu_dirty_log_size)
+ if (!kvm->arch.cpu_dirty_log_size)
return;
nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
@@ -13145,7 +13207,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (READ_ONCE(eager_page_split))
kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
- if (kvm_x86_ops.cpu_dirty_log_size) {
+ if (kvm->arch.cpu_dirty_log_size) {
kvm_mmu_slot_leaf_clear_dirty(kvm, new);
kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
} else {
@@ -13534,8 +13596,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
* due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
* (or last) non-coherent device is (un)registered to so that new SPTEs
* with the correct "ignore guest PAT" setting are created.
+ *
+ * If KVM always honors guest PAT, however, there is nothing to do.
*/
- if (kvm_mmu_may_ignore_guest_pat())
+ if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT))
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}
@@ -14012,6 +14076,7 @@ EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);