author     Paolo Bonzini <pbonzini@redhat.com>   2025-09-30 13:36:41 -0400
committer  Paolo Bonzini <pbonzini@redhat.com>   2025-09-30 13:36:41 -0400
commit     d05ca6b7931e15d882af9ca4a609957cae7aac96 (patch)
tree       3f351bc1e1f1c51487ca706f93f5f5312b3b84d1
parent     10ef74c06bb1f51e706018f2939722e881192eff (diff)
parent     86bcd23df9cec9c2df520ae0982033e301d3c184 (diff)
Merge tag 'kvm-x86-misc-6.18' of https://github.com/kvm-x86/linux into HEAD
KVM x86 changes for 6.18
- Don't (re)check L1 intercepts when completing userspace I/O to fix a flaw
where a misbehaving userspace (a.k.a. syzkaller) could swizzle L1's
intercepts and trigger a variety of WARNs in KVM.
- Emulate PERF_CNTR_GLOBAL_STATUS_SET for PerfMonV2 guests, as the MSR is
supposed to exist for v2 PMUs (write-to-set semantics sketched after this list).
- Allow Centaur CPU leaves (base 0xC000_0000) for Zhaoxin CPUs.
- Clean up KVM's vector hashing code for delivering lowest priority IRQs (the
nth-set-bit selection it relies on is sketched after this list).
- Clean up the fastpath handler code to only handle IPIs and WRMSRs that are
actually "fast", as opposed to handling those that KVM _hopes_ are fast, and
in the process of doing so add fastpath support for TSC_DEADLINE writes on
AMD CPUs.
- Clean up a pile of PMU code in anticipation of adding support for mediated
vPMUs.
- Add support for the immediate forms of RDMSR and WRMSRNS, sans full
emulator support (KVM should never need to emulate the MSRs outside of
forced emulation and other contrived testing scenarios).
- Clean up the MSR APIs in preparation for CET and FRED virtualization, as
well as mediated vPMU support.
- Reject a fully in-kernel IRQCHIP if EOIs are protected, i.e. for TDX VMs,
as KVM can't faithfully emulate an I/O APIC for such guests.
- Rework KVM_REQ_MSR_FILTER_CHANGED into a generic KVM_REQ_RECALC_INTERCEPTS in
preparation for mediated vPMU support, as KVM will need to recalculate MSR
intercepts in response to PMU refreshes for guests with mediated vPMUs.
- Misc cleanups and minor fixes.
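
The PERF_CNTR_GLOBAL_STATUS_SET item above amounts to giving the guest a
write-1-to-set view of its PMU overflow status word. Below is a minimal,
self-contained sketch of those semantics; the struct and helper names are
illustrative stand-ins for the kvm_pmu fields touched by the arch/x86/kvm/pmu.c
hunk further down, not KVM's actual code.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the relevant struct kvm_pmu fields. */
struct pmu_state {
	uint64_t global_status;      /* overflow bits the guest can read */
	uint64_t global_status_rsvd; /* bits with no backing counter */
};

/*
 * GLOBAL_STATUS_SET is write-1-to-set: each '1' written by the guest latches
 * the matching overflow bit in GLOBAL_STATUS (GLOBAL_STATUS_CLR is the
 * write-1-to-clear counterpart).  Reserved bits are silently masked off,
 * mirroring the "global_status |= data & ~global_status_rsvd" hunk in pmu.c.
 */
static void wrmsr_global_status_set(struct pmu_state *pmu, uint64_t data)
{
	pmu->global_status |= data & ~pmu->global_status_rsvd;
}

int main(void)
{
	struct pmu_state pmu = { .global_status_rsvd = ~0x3full }; /* 6 GP counters */

	wrmsr_global_status_set(&pmu, (1ull << 1) | (1ull << 63)); /* bit 63 reserved */
	printf("GLOBAL_STATUS = %#llx\n", (unsigned long long)pmu.global_status); /* 0x2 */
	return 0;
}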
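
For the vector-hashing item above: the cleanup (visible in the lapic.c hunk
below) replaces an open-coded find_next_bit() loop in kvm_vector_to_index()
with find_nth_bit(), i.e. the target is the (vector % dest_vcpus)-th set bit of
the destination bitmap. A hedged userspace re-implementation of that selection,
using a plain 64-bit mask instead of the kernel's bitmap API:

#include <stdint.h>
#include <stdio.h>

/*
 * Return the position of the n-th (0-based) set bit in 'mask', or -1 if the
 * mask has fewer than n+1 set bits.  The kernel uses find_nth_bit() for this;
 * the explicit loop is only for illustration.
 */
static int nth_set_bit(uint64_t mask, unsigned int n)
{
	for (int bit = 0; bit < 64; bit++) {
		if (!(mask & (1ull << bit)))
			continue;
		if (n-- == 0)
			return bit;
	}
	return -1;
}

int main(void)
{
	uint64_t dest_vcpu_bitmap = 0xb1; /* candidate vCPUs 0, 4, 5 and 7 */
	unsigned int dest_vcpus = 4;      /* number of set bits above */
	uint32_t vector = 0x31;

	/* Vector hashing: pick the (vector % dest_vcpus)-th candidate vCPU. */
	printf("vector %#x -> vCPU %d\n", vector,
	       nth_set_bit(dest_vcpu_bitmap, vector % dest_vcpus));
	return 0;
}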
34 files changed, 711 insertions, 516 deletions
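
The first documentation hunk below adds a tip about KVM_SET_PIT2 and the Intel
8254 treatment of a zero count. A hedged userspace sketch of what that means
through the ioctl interface; it assumes a VM fd on which KVM_CREATE_PIT2 has
already succeeded, and only the struct and ioctl names are taken from the KVM
UAPI:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/*
 * Program PIT channel 0 with a count of 0.  Per the 8254 spec (and the tip
 * added below), KVM interprets a count of 0 as 65536, the maximum count,
 * rather than as an immediately-expiring timer.
 */
static int pit_channel0_max_count(int vm_fd)
{
	struct kvm_pit_state2 pit;

	memset(&pit, 0, sizeof(pit));
	if (ioctl(vm_fd, KVM_GET_PIT2, &pit) < 0)
		return -1;

	pit.channels[0].count = 0;	/* emulated as 65536 ticks */
	return ioctl(vm_fd, KVM_SET_PIT2, &pit);
}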
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index c17a87a0a5ac..ffc350b649ad 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3075,6 +3075,12 @@ This IOCTL replaces the obsolete KVM_GET_PIT. Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. See KVM_GET_PIT2 for details on struct kvm_pit_state2. +.. Tip:: + ``KVM_SET_PIT2`` strictly adheres to the spec of Intel 8254 PIT. For example, + a ``count`` value of 0 in ``struct kvm_pit_channel_state`` is interpreted as + 65536, which is the maximum count value. Refer to `Intel 8254 programmable + interval timer <https://www.scs.stanford.edu/10wi-cs140/pintos/specs/8254.pdf>`_. + This IOCTL replaces the obsolete KVM_SET_PIT. diff --git a/Documentation/virt/kvm/x86/hypercalls.rst b/Documentation/virt/kvm/x86/hypercalls.rst index 10db7924720f..521ecf9a8a36 100644 --- a/Documentation/virt/kvm/x86/hypercalls.rst +++ b/Documentation/virt/kvm/x86/hypercalls.rst @@ -137,7 +137,7 @@ compute the CLOCK_REALTIME for its clock, at the same instant. Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. -6. KVM_HC_SEND_IPI +7. KVM_HC_SEND_IPI ------------------ :Architecture: x86 @@ -158,7 +158,7 @@ corresponds to the APIC ID a2+1, and so on. Returns the number of CPUs to which the IPIs were delivered successfully. -7. KVM_HC_SCHED_YIELD +8. KVM_HC_SCHED_YIELD --------------------- :Architecture: x86 @@ -170,7 +170,7 @@ a0: destination APIC ID :Usage example: When sending a call-function IPI-many to vCPUs, yield if any of the IPI target vCPUs was preempted. -8. KVM_HC_MAP_GPA_RANGE +9. KVM_HC_MAP_GPA_RANGE ------------------------- :Architecture: x86 :Status: active diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ec68aba2fa7e..f1a9f40622cd 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -497,6 +497,7 @@ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ +#define X86_FEATURE_MSR_IMM (21*32+15) /* MSR immediate form instructions */ /* * BUG word(s) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 62c3e4de3303..fdf178443f85 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -138,7 +138,7 @@ KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) -KVM_X86_OP(recalc_msr_intercepts) +KVM_X86_OP(recalc_intercepts) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0fb3ecf51166..9e611901d310 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -120,7 +120,7 @@ #define KVM_REQ_TLB_FLUSH_GUEST \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) -#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) +#define KVM_REQ_RECALC_INTERCEPTS KVM_ARCH_REQ(29) #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define 
KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ @@ -545,10 +545,10 @@ struct kvm_pmc { #define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \ KVM_MAX_NR_AMD_GP_COUNTERS) -#define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3 -#define KVM_MAX_NR_AMD_FIXED_COUTNERS 0 -#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \ - KVM_MAX_NR_AMD_FIXED_COUTNERS) +#define KVM_MAX_NR_INTEL_FIXED_COUNTERS 3 +#define KVM_MAX_NR_AMD_FIXED_COUNTERS 0 +#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUNTERS, \ + KVM_MAX_NR_AMD_FIXED_COUNTERS) struct kvm_pmu { u8 version; @@ -579,6 +579,9 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX); + DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX); + u64 ds_area; u64 pebs_enable; u64 pebs_enable_rsvd; @@ -771,6 +774,7 @@ enum kvm_only_cpuid_leafs { CPUID_7_2_EDX, CPUID_24_0_EBX, CPUID_8000_0021_ECX, + CPUID_7_1_ECX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -926,6 +930,7 @@ struct kvm_vcpu_arch { bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); unsigned long cui_linear_rip; + int cui_rdmsr_imm_reg; gpa_t time; s8 pvclock_tsc_shift; @@ -1381,6 +1386,7 @@ struct kvm_arch { u8 vm_type; bool has_private_mem; bool has_protected_state; + bool has_protected_eoi; bool pre_fault_allowed; struct hlist_head *mmu_page_hash; struct list_head active_mmu_pages; @@ -1921,7 +1927,7 @@ struct kvm_x86_ops { int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); - void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu); + void (*recalc_intercepts)(struct kvm_vcpu *vcpu); int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); @@ -2162,13 +2168,16 @@ void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data); -int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data); -int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); int kvm_emulate_invd(struct kvm_vcpu *vcpu); int kvm_emulate_mwait(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..717baeba6db3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -315,12 +315,15 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP 
BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) + +#define PERF_CAP_LBR_FMT 0x3f +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_FW_WRITES BIT_ULL(13) +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -733,6 +736,7 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET 0xc0000303 /* AMD Hardware Feedback Support MSRs */ #define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f0f4a4cf84a7..9792e329343e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -94,6 +94,8 @@ #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 #define EXIT_REASON_TDCALL 77 +#define EXIT_REASON_MSR_READ_IMM 84 +#define EXIT_REASON_MSR_WRITE_IMM 85 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -158,7 +160,9 @@ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ { EXIT_REASON_NOTIFY, "NOTIFY" }, \ - { EXIT_REASON_TDCALL, "TDCALL" } + { EXIT_REASON_TDCALL, "TDCALL" }, \ + { EXIT_REASON_MSR_READ_IMM, "MSR_READ_IMM" }, \ + { EXIT_REASON_MSR_WRITE_IMM, "MSR_WRITE_IMM" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 6b868afb26c3..cf4ae822bcc0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_MSR_IMM, CPUID_ECX, 5, 0x00000007, 1 }, { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e2836a255b16..efee08fad72e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -448,6 +448,8 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * adjustments to the reserved GPA bits. 
*/ kvm_mmu_after_set_cpuid(vcpu); + + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); } int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) @@ -985,6 +987,10 @@ void kvm_set_cpu_caps(void) F(LAM), ); + kvm_cpu_cap_init(CPUID_7_1_ECX, + SCATTERED_F(MSR_IMM), + ); + kvm_cpu_cap_init(CPUID_7_1_EDX, F(AVX_VNNI_INT8), F(AVX_NE_CONVERT), @@ -1411,9 +1417,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; cpuid_entry_override(entry, CPUID_7_1_EAX); + cpuid_entry_override(entry, CPUID_7_1_ECX); cpuid_entry_override(entry, CPUID_7_1_EDX); entry->ebx = 0; - entry->ecx = 0; } if (max_idx >= 2) { entry = do_host_cpuid(array, function, 2); @@ -1820,7 +1826,8 @@ static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, int r; if (func == CENTAUR_CPUID_SIGNATURE && - boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 0; r = do_cpuid_func(array, func, type); @@ -2001,7 +2008,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, if (function == 7 && index == 0) { u64 data; if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && - !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + !kvm_msr_read(vcpu, MSR_IA32_TSX_CTRL, &data) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); } else if (function == 0x80000007) { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1349e278cd2a..23929151a5b8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4330,8 +4330,8 @@ static const struct opcode opcode_table[256] = { I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter), - I(Stack | IsBranch, em_leave), + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), + I(Stack, em_leave), I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), I(ImplicitOps | IsBranch, em_ret_far), D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), @@ -5107,12 +5107,11 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt) ctxt->mem_read.end = 0; } -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts) { const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; - bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt); ctxt->mem_read.pos = 0; @@ -5160,7 +5159,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(&ctxt->dst); } - if (unlikely(is_guest_mode) && ctxt->intercept) { + if (unlikely(check_intercepts) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5189,7 +5188,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5243,7 +5242,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 72b19a88a776..a471900c7325 100644 --- 
a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1168,15 +1168,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); - mutex_lock(&hv->hv_lock); + guard(mutex)(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) - goto out_unlock; + return; if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - goto out_unlock; + return; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* @@ -1192,7 +1192,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; - goto out_unlock; + return; } /* @@ -1228,12 +1228,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; - goto out_unlock; + return; out_err: hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; -out_unlock: - mutex_unlock(&hv->hv_lock); } void kvm_hv_request_tsc_page_update(struct kvm *kvm) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2b5d389bca5f..2c2783296aed 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later /* * Copyright (C) 2001 MandrakeSoft S.A. * Copyright 2010 Red Hat, Inc. and/or its affiliates. @@ -8,20 +9,6 @@ * http://www.linux-mandrake.com/ * http://www.mandrakesoft.com/ * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Yunhong Jiang <yunhong.jiang@intel.com> * Yaozu (Eddie) Dong <eddie.dong@intel.com> * Based on Xen 3.1 code. 
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 16da89259011..a6b122f732be 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -195,63 +195,6 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, struct dest_map *dest_map) -{ - int r = -1; - struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; - unsigned int dest_vcpus = 0; - - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - - if (irq->dest_mode == APIC_DEST_PHYSICAL && - irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - pr_info("apic: phys broadcast and lowest prio\n"); - irq->delivery_mode = APIC_DM_FIXED; - } - - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_apic_present(vcpu)) - continue; - - if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, - irq->dest_id, irq->dest_mode)) - continue; - - if (!kvm_lowest_prio_delivery(irq)) { - if (r < 0) - r = 0; - r += kvm_apic_set_irq(vcpu, irq, dest_map); - } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { - if (!kvm_vector_hashing_enabled()) { - if (!lowest) - lowest = vcpu; - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) - lowest = vcpu; - } else { - __set_bit(i, dest_vcpu_bitmap); - dest_vcpus++; - } - } - } - - if (dest_vcpus != 0) { - int idx = kvm_vector_to_index(irq->vector, dest_vcpus, - dest_vcpu_bitmap, KVM_MAX_VCPUS); - - lowest = kvm_get_vcpu(kvm, idx); - } - - if (lowest) - r = kvm_apic_set_irq(lowest, irq, dest_map); - - return r; -} - static void kvm_msi_to_lapic_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 5e62c1f79ce6..34f4a78a7a01 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -121,8 +121,4 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, - struct dest_map *dest_map); - #endif diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index c1df5acfacaf..7b5ddb787a25 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -235,7 +235,6 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); bool (*is_smm)(struct x86_emulate_ctxt *ctxt); - bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); @@ -521,7 +520,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 void init_decode_cache(struct x86_emulate_ctxt *ctxt); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5fc437341e03..3b76192b24e9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -74,6 +74,10 @@ module_param(lapic_timer_advance, bool, 0444); #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define 
LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 + +static bool __read_mostly vector_hashing_enabled = true; +module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444); + static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); @@ -130,7 +134,7 @@ static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); } -bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) +static bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) { return kvm_x86_ops.set_hv_timer && !(kvm_mwait_in_guest(vcpu->kvm) || @@ -1063,19 +1067,12 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, } EXPORT_SYMBOL_GPL(kvm_apic_match_dest); -int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size) +static int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size) { - u32 mod; - int i, idx = -1; - - mod = vector % dest_vcpus; - - for (i = 0; i <= mod; i++) { - idx = find_next_bit(bitmap, bitmap_size, idx + 1); - BUG_ON(idx == bitmap_size); - } + int idx = find_nth_bit(bitmap, bitmap_size, vector % dest_vcpus); + BUG_ON(idx >= bitmap_size); return idx; } @@ -1106,6 +1103,16 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, return false; } +static bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) +{ + return (irq->delivery_mode == APIC_DM_LOWEST || irq->msi_redir_hint); +} + +static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) +{ + return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. 
@@ -1149,7 +1156,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (!kvm_lowest_prio_delivery(irq)) return true; - if (!kvm_vector_hashing_enabled()) { + if (!vector_hashing_enabled) { lowest = -1; for_each_set_bit(i, bitmap, 16) { if (!(*dst)[i]) @@ -1258,6 +1265,63 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, return ret; } +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, struct dest_map *dest_map) +{ + int r = -1; + struct kvm_vcpu *vcpu, *lowest = NULL; + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned int dest_vcpus = 0; + + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + + if (irq->dest_mode == APIC_DEST_PHYSICAL && + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { + pr_info("apic: phys broadcast and lowest prio\n"); + irq->delivery_mode = APIC_DM_FIXED; + } + + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (!kvm_lowest_prio_delivery(irq)) { + if (r < 0) + r = 0; + r += kvm_apic_set_irq(vcpu, irq, dest_map); + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { + if (!vector_hashing_enabled) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } else { + __set_bit(i, dest_vcpu_bitmap); + dest_vcpus++; + } + } + } + + if (dest_vcpus != 0) { + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, + dest_vcpu_bitmap, KVM_MAX_VCPUS); + + lowest = kvm_get_vcpu(kvm, idx); + } + + if (lowest) + r = kvm_apic_set_irq(lowest, irq, dest_map); + + return r; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. @@ -1401,11 +1465,6 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, rcu_read_unlock(); } -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) -{ - return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; -} - static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) { return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors); @@ -1483,24 +1542,30 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) } EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); -void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) +static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low, + u32 icr_high, struct kvm_lapic_irq *irq) { - struct kvm_lapic_irq irq; - /* KVM has no delay and should always clear the BUSY/PENDING flag. 
*/ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); - irq.vector = icr_low & APIC_VECTOR_MASK; - irq.delivery_mode = icr_low & APIC_MODE_MASK; - irq.dest_mode = icr_low & APIC_DEST_MASK; - irq.level = (icr_low & APIC_INT_ASSERT) != 0; - irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; - irq.shorthand = icr_low & APIC_SHORT_MASK; - irq.msi_redir_hint = false; + irq->vector = icr_low & APIC_VECTOR_MASK; + irq->delivery_mode = icr_low & APIC_MODE_MASK; + irq->dest_mode = icr_low & APIC_DEST_MASK; + irq->level = (icr_low & APIC_INT_ASSERT) != 0; + irq->trig_mode = icr_low & APIC_INT_LEVELTRIG; + irq->shorthand = icr_low & APIC_SHORT_MASK; + irq->msi_redir_hint = false; if (apic_x2apic_mode(apic)) - irq.dest_id = icr_high; + irq->dest_id = icr_high; else - irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); + irq->dest_id = GET_XAPIC_DEST_FIELD(icr_high); +} + +void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) +{ + struct kvm_lapic_irq irq; + + kvm_icr_to_lapic_irq(apic, icr_low, icr_high, &irq); trace_kvm_apic_ipi(icr_low, irq.dest_id); @@ -2435,7 +2500,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) +static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast) { if (data & X2APIC_ICR_RESERVED_BITS) return 1; @@ -2450,7 +2515,20 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) */ data &= ~APIC_ICR_BUSY; - kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + if (fast) { + struct kvm_lapic_irq irq; + int ignored; + + kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq); + + if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq, + &ignored, NULL)) + return -EWOULDBLOCK; + + trace_kvm_apic_ipi((u32)data, irq.dest_id); + } else { + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + } if (kvm_x86_ops.x2apic_icr_is_split) { kvm_lapic_set_reg(apic, APIC_ICR, data); kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); @@ -2461,6 +2539,16 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) return 0; } +static int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) +{ + return __kvm_x2apic_icr_write(apic, data, false); +} + +int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data) +{ + return __kvm_x2apic_icr_write(apic, data, true); +} + static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) { if (kvm_x86_ops.x2apic_icr_is_split) @@ -2661,24 +2749,21 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) int kvm_alloc_apic_access_page(struct kvm *kvm) { void __user *hva; - int ret = 0; - mutex_lock(&kvm->slots_lock); + guard(mutex)(&kvm->slots_lock); + if (kvm->arch.apic_access_memslot_enabled || kvm->arch.apic_access_memslot_inhibited) - goto out; + return 0; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (IS_ERR(hva)) { - ret = PTR_ERR(hva); - goto out; - } + if (IS_ERR(hva)) + return PTR_ERR(hva); kvm->arch.apic_access_memslot_enabled = true; -out: - mutex_unlock(&kvm->slots_lock); - return ret; + + return 0; } EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 72de14527698..50123fe7f58f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -105,7 +105,6 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, 
unsigned int dest, int dest_mode); -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); @@ -119,6 +118,9 @@ void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); @@ -137,7 +139,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data); +int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); @@ -222,12 +224,6 @@ static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu) !kvm_x86_call(apic_init_signal_blocked)(vcpu); } -static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) -{ - return (irq->delivery_mode == APIC_DM_LOWEST || - irq->msi_redir_hint); -} - static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) { return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); @@ -242,14 +238,11 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -int kvm_vector_to_index(u32 vector, u32 dest_vcpus, - const unsigned long *bitmap, u32 bitmap_size); void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); -bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu); static inline enum lapic_mode kvm_apic_mode(u64 apic_base) { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 75e9cfc689f8..b7dc5bd981ba 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -26,11 +26,18 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +/* Unadultered PMU capabilities of the host, i.e. of hardware. */ +static struct x86_pmu_capability __read_mostly kvm_host_pmu; + +/* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. 
*/ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); -struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; -EXPORT_SYMBOL_GPL(kvm_pmu_eventsel); +struct kvm_pmu_emulated_event_selectors { + u64 INSTRUCTIONS_RETIRED; + u64 BRANCH_INSTRUCTIONS_RETIRED; +}; +static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { @@ -96,6 +103,54 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } +void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; + + perf_get_x86_pmu_capability(&kvm_host_pmu); + + /* + * Hybrid PMUs don't play nice with virtualization without careful + * configuration by userspace, and KVM's APIs for reporting supported + * vPMU features do not account for hybrid PMUs. Disable vPMU support + * for hybrid PMUs until KVM gains a way to let userspace opt-in. + */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + enable_pmu = false; + + if (enable_pmu) { + /* + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. + */ + if (!kvm_host_pmu.num_counters_gp || + WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_host_pmu.version) + enable_pmu = false; + } + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu)); + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_MAX_NR_FIXED_COUNTERS); + + kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); + kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); +} + static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -426,7 +481,7 @@ static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, return true; } -static bool check_pmu_event_filter(struct kvm_pmc *pmc) +static bool pmc_is_event_allowed(struct kvm_pmc *pmc) { struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; @@ -441,12 +496,6 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } -static bool pmc_event_is_allowed(struct kvm_pmc *pmc) -{ - return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && - check_pmu_event_filter(pmc); -} - static int reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -457,7 +506,8 @@ static int reprogram_counter(struct kvm_pmc *pmc) emulate_overflow = pmc_pause_counter(pmc); - if (!pmc_event_is_allowed(pmc)) + if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + !pmc_is_event_allowed(pmc)) return 0; if (emulate_overflow) @@ -492,6 +542,47 @@ static int reprogram_counter(struct kvm_pmc *pmc) eventsel & ARCH_PERFMON_EVENTSEL_INT); } +static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel) +{ + /* + * Ignore checks 
for edge detect (all events currently emulated by KVM + * are always rising edges), pin control (unsupported by modern CPUs), + * and counter mask and its invert flag (KVM doesn't emulate multiple + * events in a single clock cycle). + * + * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit + * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34). + * Checking the "in HLE/RTM transaction" flags is correct as the vCPU + * can't be in a transaction if KVM is emulating an instruction. + * + * Checking the reserved bits might be wrong if they are defined in the + * future, but so could ignoring them, so do the simple thing for now. + */ + return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB); +} + +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) +{ + bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1); + bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1); + + /* + * Do NOT consult the PMU event filters, as the filters must be checked + * at the time of emulation to ensure KVM uses fresh information, e.g. + * omitting a PMC from a bitmap could result in a missed event if the + * filter is changed to allow counting the event. + */ + if (!pmc_is_locally_enabled(pmc)) + return; + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1); + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1); +} +EXPORT_SYMBOL_GPL(kvm_pmu_recalc_pmc_emulation); + void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); @@ -527,6 +618,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) */ if (unlikely(pmu->need_cleanup)) kvm_pmu_cleanup(vcpu); + + kvm_for_each_pmc(pmu, pmc, bit, bitmap) + kvm_pmu_recalc_pmc_emulation(pmu, pmc); } int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) @@ -650,6 +744,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = pmu->global_ctrl; break; case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; break; @@ -711,6 +806,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) pmu->global_status &= ~data; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: + if (!msr_info->host_initiated) + pmu->global_status |= data & ~pmu->global_status_rsvd; + break; default: kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); return kvm_pmu_call(set_msr)(vcpu, msr_info); @@ -789,6 +888,10 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) */ if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); + + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); + bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, + pmu->nr_arch_fixed_counters); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -813,7 +916,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); kvm_for_each_pmc(pmu, pmc, i, bitmask) { - if (pmc->perf_event && !pmc_speculative_in_use(pmc)) + if (pmc->perf_event && !pmc_is_locally_enabled(pmc)) pmc_stop_counter(pmc); } @@ -860,44 +963,46 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user; } -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) +static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, 
+ const unsigned long *event_pmcs) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - int i; + int i, idx; BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); + if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX)) + return; + if (!kvm_pmu_has_perf_global_ctrl(pmu)) - bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); - else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx, + bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX); + else if (!bitmap_and(bitmap, event_pmcs, (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) return; + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_for_each_pmc(pmu, pmc, i, bitmap) { - /* - * Ignore checks for edge detect (all events currently emulated - * but KVM are always rising edges), pin control (unsupported - * by modern CPUs), and counter mask and its invert flag (KVM - * doesn't emulate multiple events in a single clock cycle). - * - * Note, the uppermost nibble of AMD's mask overlaps Intel's - * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved - * bits (bits 35:34). Checking the "in HLE/RTM transaction" - * flags is correct as the vCPU can't be in a transaction if - * KVM is emulating an instruction. Checking the reserved bits - * might be wrong if they are defined in the future, but so - * could ignoring them, so do the simple thing for now. - */ - if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) || - !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) + if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); +} + +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions); +} +EXPORT_SYMBOL_GPL(kvm_pmu_instruction_retired); + +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches); } -EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); +EXPORT_SYMBOL_GPL(kvm_pmu_branch_retired); static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ad89d0bd6005..08ae644db00e 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -23,11 +23,6 @@ #define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED -struct kvm_pmu_emulated_event_selectors { - u64 INSTRUCTIONS_RETIRED; - u64 BRANCH_INSTRUCTIONS_RETIRED; -}; - struct kvm_pmu_ops { struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -165,7 +160,7 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) return NULL; } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -178,57 +173,15 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) } extern struct x86_pmu_capability kvm_pmu_cap; -extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; -static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) -{ - bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; - int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; +void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); - /* - * Hybrid PMUs don't play nice with virtualization without careful - * configuration by userspace, and KVM's APIs for reporting supported - * vPMU features do not account for hybrid PMUs. 
Disable vPMU support - * for hybrid PMUs until KVM gains a way to let userspace opt-in. - */ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - enable_pmu = false; - - if (enable_pmu) { - perf_get_x86_pmu_capability(&kvm_pmu_cap); - - /* - * WARN if perf did NOT disable hardware PMU if the number of - * architecturally required GP counters aren't present, i.e. if - * there are a non-zero number of counters, but fewer than what - * is architecturally required. - */ - if (!kvm_pmu_cap.num_counters_gp || - WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) - enable_pmu = false; - else if (is_intel && !kvm_pmu_cap.version) - enable_pmu = false; - } - - if (!enable_pmu) { - memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); - return; - } - - kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); - kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, - pmu_ops->MAX_NR_GP_COUNTERS); - kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, - KVM_MAX_NR_FIXED_COUNTERS); - - kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = - perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); - kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = - perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); -} +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { + kvm_pmu_recalc_pmc_emulation(pmc_to_pmu(pmc), pmc); + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } @@ -272,7 +225,8 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel); +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index c53b92379e6e..743ab25ba787 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -25,6 +25,9 @@ #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) #define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11) +/* Intel-defined sub-features, CPUID level 0x00000007:1 (ECX) */ +#define KVM_X86_FEATURE_MSR_IMM KVM_X86_FEATURE(CPUID_7_1_ECX, 5) + /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) @@ -87,6 +90,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, + [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, }; /* @@ -128,6 +132,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO); KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO); + KVM_X86_TRANSLATE_FEATURE(MSR_IMM); default: return x86_feature; } diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 9864c057187d..5dd8a1646800 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -529,7 +529,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, vcpu->arch.smbase = smstate->smbase; - if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) + if (__kvm_emulate_msr_write(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) return 
X86EMUL_UNHANDLEABLE; rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR); @@ -620,7 +620,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) /* And finally go back to 32-bit mode. */ efer = 0; - kvm_set_msr(vcpu, MSR_EFER, efer); + __kvm_emulate_msr_write(vcpu, MSR_EFER, efer); } #endif diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 288f7f2a46f2..bc062285fbf5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -41,7 +41,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); unsigned int idx; - if (!vcpu->kvm->arch.enable_pmu) + if (!pmu->version) return NULL; switch (msr) { @@ -113,6 +113,7 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: return pmu->version > 1; default: if (msr > MSR_F15H_PERF_CTR5 && @@ -199,17 +200,16 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); if (pmu->version > 1) { - pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_ctrl_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1); pmu->global_status_rsvd = pmu->global_ctrl_rsvd; } - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; + pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } static void amd_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b8504f1fe4ef..81db26b2a1bd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1008,7 +1008,7 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) } } -static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) +static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) { svm_recalc_instruction_intercepts(vcpu); svm_recalc_msr_intercepts(vcpu); @@ -1156,7 +1156,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) svm_hv_init_vmcb(vmcb); - svm_recalc_intercepts_after_set_cpuid(vcpu); + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); vmcb_mark_all_dirty(vmcb); @@ -4093,17 +4093,27 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + /* + * Next RIP must be provided as IRQs are disabled, and accessing guest + * memory to decode the instruction might fault, i.e. might sleep. 
+ */ + if (!nrips || !control->next_rip) + return EXIT_FASTPATH_NONE; if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - switch (svm->vmcb->control.exit_code) { + switch (control->exit_code) { case SVM_EXIT_MSR: - if (!svm->vmcb->control.exit_info_1) + if (!control->exit_info_1) break; - return handle_fastpath_set_msr_irqoff(vcpu); + return handle_fastpath_wrmsr(vcpu); case SVM_EXIT_HLT: return handle_fastpath_hlt(vcpu); + case SVM_EXIT_INVD: + return handle_fastpath_invd(vcpu); default: break; } @@ -4380,8 +4390,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); - - svm_recalc_intercepts_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -5083,7 +5091,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .apic_init_signal_blocked = svm_apic_init_signal_blocked, - .recalc_msr_intercepts = svm_recalc_msr_intercepts, + .recalc_intercepts = svm_recalc_intercepts, .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, @@ -5213,8 +5221,12 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); - /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ + /* + * Clear capabilities that are automatically configured by common code, + * but that require explicit SVM support (that isn't yet implemented). + */ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); + kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 5316c27f6099..f614428dbeda 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -20,9 +20,6 @@ extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 -#define PMU_CAP_FW_WRITES (1ULL << 13) -#define PMU_CAP_LBR_FMT 0x3f - struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index bb5f182f6788..0eb2773b2ae2 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -188,18 +188,18 @@ static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return vmx_get_msr(vcpu, msr_info); } -static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu) +static void vt_recalc_intercepts(struct kvm_vcpu *vcpu) { /* - * TDX doesn't allow VMM to configure interception of MSR accesses. - * TDX guest requests MSR accesses by calling TDVMCALL. The MSR - * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR - * if the userspace has set any. + * TDX doesn't allow VMM to configure interception of instructions or + * MSR accesses. TDX guest requests MSR accesses by calling TDVMCALL. + * The MSR filters will be applied when handling the TDVMCALL for + * RDMSR/WRMSR if the userspace has set any. 
*/ if (is_td_vcpu(vcpu)) return; - vmx_recalc_msr_intercepts(vcpu); + vmx_recalc_intercepts(vcpu); } static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) @@ -996,7 +996,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .apic_init_signal_blocked = vt_op(apic_init_signal_blocked), .migrate_timers = vmx_migrate_timers, - .recalc_msr_intercepts = vt_op(recalc_msr_intercepts), + .recalc_intercepts = vt_op(recalc_intercepts), .complete_emulated_msr = vt_op(complete_emulated_msr), .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b8ea1969113d..2156c9a854f4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -997,7 +997,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { + if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -1033,7 +1033,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, } } - if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { + if (kvm_emulate_msr_read(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; @@ -2770,8 +2770,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl))) { + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -3690,7 +3690,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_branch_retired(vcpu); if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); @@ -4758,8 +4758,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl)); + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ @@ -4937,7 +4937,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { + if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); @@ -6216,19 +6216,26 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, union vmx_exit_reason exit_reason) { - u32 msr_index = kvm_rcx_read(vcpu); + u32 msr_index; gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return true; + if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) + msr_index = vmx_get_exit_qual(vcpu); + else + msr_index = kvm_rcx_read(vcpu); + /* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, * for the four combinations of 
read/write and low/high MSR numbers. * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; - if (exit_reason.basic == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE || + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; @@ -6527,6 +6534,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: + case EXIT_REASON_MSR_READ_IMM: + case EXIT_REASON_MSR_WRITE_IMM: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: return true; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 0b173602821b..de1d9785c01f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -138,7 +138,7 @@ static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) { - return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0; + return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; } static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) @@ -478,8 +478,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) }; u64 eventsel; - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS); - BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS); + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUNTERS); + BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUNTERS); /* * Yell if perf reports support for a fixed counter but perf doesn't @@ -536,29 +536,44 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); eax.split.bit_width = min_t(int, eax.split.bit_width, kvm_pmu_cap.bit_width_gp); - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(eax.split.bit_width) - 1; eax.split.mask_length = min_t(int, eax.split.mask_length, kvm_pmu_cap.events_mask_len); - pmu->available_event_types = ~entry->ebx & - ((1ull << eax.split.mask_length) - 1); - - if (pmu->version == 1) { - pmu->nr_arch_fixed_counters = 0; - } else { - pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - kvm_pmu_cap.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, - kvm_pmu_cap.bit_width_fixed); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << edx.split.bit_width_fixed) - 1; + pmu->available_event_types = ~entry->ebx & (BIT_ULL(eax.split.mask_length) - 1); + + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { + pmu->reserved_bits ^= HSW_IN_TX; + pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); } + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (intel_pmu_lbr_is_compatible(vcpu) && + (perf_capabilities & PERF_CAP_LBR_FMT)) + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); + else + lbr_desc->records.nr = 0; + + if (lbr_desc->records.nr) + bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + if (pmu->version == 1) + return; + + pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, + kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + 
kvm_pmu_cap.bit_width_fixed); + pmu->counter_bitmask[KVM_PMC_FIXED] = BIT_ULL(edx.split.bit_width_fixed) - 1; + intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | INTEL_FIXED_0_ENABLE_PMI); - counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); + counter_rsvd = ~((BIT_ULL(pmu->nr_arch_gp_counters) - 1) | + ((BIT_ULL(pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_rsvd = counter_rsvd; /* @@ -573,29 +588,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_status_rsvd &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); - if (entry && - (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { - pmu->reserved_bits ^= HSW_IN_TX; - pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); - } - - bitmap_set(pmu->all_valid_pmc_idx, - 0, pmu->nr_arch_gp_counters); - bitmap_set(pmu->all_valid_pmc_idx, - INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - - perf_capabilities = vcpu_get_perf_capabilities(vcpu); - if (intel_pmu_lbr_is_compatible(vcpu) && - (perf_capabilities & PMU_CAP_LBR_FMT)) - memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); - else - lbr_desc->records.nr = 0; - - if (lbr_desc->records.nr) - bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); - if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_rsvd = counter_rsvd; @@ -603,8 +595,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_data_cfg_rsvd = ~0xff00000full; intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE); } else { - pmu->pebs_enable_rsvd = - ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->pebs_enable_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1); } } } @@ -625,7 +616,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].current_config = 0; } - for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) { + for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUNTERS; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; @@ -762,7 +753,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) int bit, hw_idx; kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) { - if (!pmc_speculative_in_use(pmc) || + if (!pmc_is_locally_enabled(pmc) || !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index be3516eaa513..097304bf1e1d 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -629,6 +629,11 @@ int tdx_vm_init(struct kvm *kvm) struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); kvm->arch.has_protected_state = true; + /* + * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap, + * i.e. all EOIs are accelerated and never trigger exits. 
+ */ + kvm->arch.has_protected_eoi = true; kvm->arch.has_private_mem = true; kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d5ddf33be8c0..089d38c047a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2140,7 +2140,7 @@ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && + if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; @@ -2425,9 +2425,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->pt_desc.guest.addr_a[index / 2] = data; break; case MSR_IA32_PERF_CAPABILITIES: - if (data & PMU_CAP_LBR_FMT) { - if ((data & PMU_CAP_LBR_FMT) != - (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) + if (data & PERF_CAP_LBR_FMT) { + if ((data & PERF_CAP_LBR_FMT) != + (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; @@ -4081,7 +4081,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) +static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { if (!cpu_has_vmx_msr_bitmap()) return; @@ -4134,6 +4134,11 @@ void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) */ } +void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) +{ + vmx_recalc_msr_intercepts(vcpu); +} + static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { @@ -4317,7 +4322,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) return pin_based_exec_ctrl; } -static u32 vmx_vmentry_ctrl(void) +static u32 vmx_get_initial_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; @@ -4334,7 +4339,7 @@ static u32 vmx_vmentry_ctrl(void) return vmentry_ctrl; } -static u32 vmx_vmexit_ctrl(void) +static u32 vmx_get_initial_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; @@ -4364,19 +4369,13 @@ void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (kvm_vcpu_apicv_active(vcpu)) { - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - if (enable_ipiv) - tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); - } else { - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - if (enable_ipiv) - tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); - } + secondary_exec_controls_changebit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY, + kvm_vcpu_apicv_active(vcpu)); + if (enable_ipiv) + tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT, + kvm_vcpu_apicv_active(vcpu)); vmx_update_msr_bitmap_x2apic(vcpu); } @@ -4699,10 +4698,10 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); + vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ - vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); + vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl()); vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmcs_writel(CR0_GUEST_HOST_MASK, 
~vmx->vcpu.arch.cr0_guest_owned_bits); @@ -6023,6 +6022,23 @@ static int handle_notify(struct kvm_vcpu *vcpu) return 1; } +static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu) +{ + return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO)); +} + +static int handle_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); +} + +static int handle_wrmsr_imm(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6081,6 +6097,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, [EXIT_REASON_NOTIFY] = handle_notify, + [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, + [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, }; static const int kvm_vmx_max_exit_handlers = @@ -6515,6 +6533,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); + else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) + return handle_wrmsr_imm(vcpu); else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) @@ -7190,11 +7210,16 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: - return handle_fastpath_set_msr_irqoff(vcpu); + return handle_fastpath_wrmsr(vcpu); + case EXIT_REASON_MSR_WRITE_IMM: + return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), + vmx_get_msr_imm_reg(vcpu)); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); case EXIT_REASON_HLT: return handle_fastpath_hlt(vcpu); + case EXIT_REASON_INVD: + return handle_fastpath_invd(vcpu); default: return EXIT_FASTPATH_NONE; } @@ -7795,16 +7820,13 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_LC_ENABLED; - /* Recalc MSR interception to account for feature changes. */ - vmx_recalc_msr_intercepts(vcpu); - /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } static __init u64 vmx_get_perf_capabilities(void) { - u64 perf_cap = PMU_CAP_FW_WRITES; + u64 perf_cap = PERF_CAP_FW_WRITES; u64 host_perf_cap = 0; if (!enable_pmu) @@ -7824,7 +7846,7 @@ static __init u64 vmx_get_perf_capabilities(void) if (!vmx_lbr_caps.has_callstack) memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); else if (vmx_lbr_caps.nr) - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT; } if (vmx_pebs_supported()) { @@ -8353,8 +8375,6 @@ __init int vmx_hardware_setup(void) vmx_setup_user_return_msrs(); - if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) - return -EIO; if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -8580,11 +8600,18 @@ int __init vmx_init(void) return -EOPNOTSUPP; /* - * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing - * to unwind if a later step fails. + * Note, VMCS and eVMCS configuration only touch VMX knobs/variables, + * i.e. there's nothing to unwind if a later step fails. 
*/ hv_init_evmcs(); + /* + * Parse the VMCS config and VMX capabilities before anything else, so + * that the information is available to all setup flows. + */ + if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) + return -EIO; + r = kvm_x86_vendor_init(&vt_init_ops); if (r) return r; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d3389baf3ab3..23d6e89b96f2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -608,6 +608,14 @@ static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##b { \ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +} \ +static __always_inline void lname##_controls_changebit(struct vcpu_vmx *vmx, u##bits val, \ + bool set) \ +{ \ + if (set) \ + lname##_controls_setbit(vmx, val); \ + else \ + lname##_controls_clearbit(vmx, val); \ } BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) @@ -706,6 +714,11 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(struct kvm_vcpu *vcpu); +static inline int vmx_get_instr_info_reg(u32 vmx_instr_info) +{ + return (vmx_instr_info >> 3) & 0xf; +} + static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) { return (vmx_instr_info >> 28) & 0xf; diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 4c70f56c57c8..9697368d65b3 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -52,7 +52,7 @@ void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); -void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu); +void vmx_recalc_intercepts(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); int vmx_get_feature_msr(u32 msr, u64 *data); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 77bfee41e273..c34981d28e9c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -164,9 +164,6 @@ module_param(kvmclock_periodic_sync, bool, 0444); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); -static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, 0444); - bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, 0444); EXPORT_SYMBOL_GPL(enable_vmware_backdoor); @@ -367,6 +364,7 @@ static const u32 msrs_to_save_pmu[] = { MSR_AMD64_PERF_CNTR_GLOBAL_CTL, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + @@ -1579,10 +1577,10 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); + u32 pmc = kvm_rcx_read(vcpu); u64 data; - if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { + if (kvm_pmu_rdpmc(vcpu, pmc, &data)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -1905,8 +1903,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. 
*/ -int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; int ret; @@ -1932,6 +1930,16 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return ret; } +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return __kvm_set_msr(vcpu, index, data, true); +} + +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} + static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -1939,33 +1947,36 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, __kvm_get_msr); } -int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return KVM_MSR_RET_FILTERED; return kvm_get_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); +EXPORT_SYMBOL_GPL(__kvm_emulate_msr_read); -int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return KVM_MSR_RET_FILTERED; return kvm_set_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); +EXPORT_SYMBOL_GPL(__kvm_emulate_msr_write); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - return kvm_get_msr_ignored_check(vcpu, index, data, false); + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_read(vcpu, index, data); } -EXPORT_SYMBOL_GPL(kvm_get_msr); +EXPORT_SYMBOL_GPL(kvm_emulate_msr_read); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { - return kvm_set_msr_ignored_check(vcpu, index, data, false); + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_write(vcpu, index, data); } -EXPORT_SYMBOL_GPL(kvm_set_msr); +EXPORT_SYMBOL_GPL(kvm_emulate_msr_write); + static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) { @@ -1997,6 +2008,15 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) return complete_fast_msr_access(vcpu); } +static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) + kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, + vcpu->run->msr.data); + + return complete_fast_msr_access(vcpu); +} + static u64 kvm_msr_reason(int r) { switch (r) { @@ -2031,56 +2051,83 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, return 1; } -int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, + int (*complete_rdmsr)(struct kvm_vcpu *)) { - u32 ecx = kvm_rcx_read(vcpu); u64 data; int r; - r = kvm_get_msr_with_filter(vcpu, ecx, &data); + r = kvm_emulate_msr_read(vcpu, msr, &data); if (!r) { - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(msr, data); - kvm_rax_write(vcpu, data & -1u); - kvm_rdx_write(vcpu, (data >> 32) & -1u); + if (reg < 0) { + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } else { + kvm_register_write(vcpu, reg, data); + } } else { /* 
MSR read failed? See if we should ask user space */ - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, - complete_fast_rdmsr, r)) + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, + complete_rdmsr, r)) return 0; - trace_kvm_msr_read_ex(ecx); + trace_kvm_msr_read_ex(msr); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); } + +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1, + complete_fast_rdmsr); +} EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); -int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) { - u32 ecx = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); - int r; + vcpu->arch.cui_rdmsr_imm_reg = reg; + + return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm); +} +EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr_imm); - r = kvm_set_msr_with_filter(vcpu, ecx, data); +static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int r; + r = kvm_emulate_msr_write(vcpu, msr, data); if (!r) { - trace_kvm_msr_write(ecx, data); + trace_kvm_msr_write(msr, data); } else { /* MSR write failed? See if we should ask user space */ - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data, complete_fast_msr_access, r)) return 0; /* Signal all other negative errors to userspace */ if (r < 0) return r; - trace_kvm_msr_write_ex(ecx, data); + trace_kvm_msr_write_ex(msr, data); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); } + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr_imm); + int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { return kvm_skip_emulated_instruction(vcpu); @@ -2093,6 +2140,15 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) +{ + if (!kvm_emulate_invd(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; +} +EXPORT_SYMBOL_GPL(handle_fastpath_invd); + int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -2140,74 +2196,41 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -/* - * The fast path for frequent and performance sensitive wrmsr emulation, - * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces - * the latency of virtual IPI by avoiding the expensive bits of transitioning - * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the - * other cases which must be called after interrupts are enabled on the host. 
- */ -static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) -{ - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) - return 1; - - if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && - ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && - ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && - ((u32)(data >> 32) != X2APIC_BROADCAST)) - return kvm_x2apic_icr_write(vcpu->arch.apic, data); - - return 1; -} - -static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) -{ - if (!kvm_can_use_hv_timer(vcpu)) - return 1; - - kvm_set_lapic_tscdeadline_msr(vcpu, data); - return 0; -} - -fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - u32 msr = kvm_rcx_read(vcpu); - u64 data; - fastpath_t ret; - bool handled; - - kvm_vcpu_srcu_read_lock(vcpu); - switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): - data = kvm_read_edx_eax(vcpu); - handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || + kvm_x2apic_icr_write_fast(vcpu->arch.apic, data)) + return EXIT_FASTPATH_NONE; break; case MSR_IA32_TSC_DEADLINE: - data = kvm_read_edx_eax(vcpu); - handled = !handle_fastpath_set_tscdeadline(vcpu, data); + kvm_set_lapic_tscdeadline_msr(vcpu, data); break; default: - handled = false; - break; + return EXIT_FASTPATH_NONE; } - if (handled) { - if (!kvm_skip_emulated_instruction(vcpu)) - ret = EXIT_FASTPATH_EXIT_USERSPACE; - else - ret = EXIT_FASTPATH_REENTER_GUEST; - trace_kvm_msr_write(msr, data); - } else { - ret = EXIT_FASTPATH_NONE; - } + trace_kvm_msr_write(msr, data); - kvm_vcpu_srcu_read_unlock(vcpu); + if (!kvm_skip_emulated_instruction(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; - return ret; + return EXIT_FASTPATH_REENTER_GUEST; } -EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); + +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) +{ + return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr); + +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr_imm); /* * Adapt set_msr() to msr_io()'s calling convention @@ -6778,7 +6801,11 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, kvm_free_msr_filter(old_filter); - kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + /* + * Recalc MSR intercepts as userspace may want to intercept accesses to + * MSRs that KVM would otherwise pass through to the guest. + */ + kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS); return 0; } @@ -6973,6 +7000,15 @@ set_identity_unlock: if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + /* + * Disallow an in-kernel I/O APIC if the VM has protected EOIs, + * i.e. if KVM can't intercept EOIs and thus can't properly + * emulate level-triggered interrupts. 
+ */ + r = -ENOTTY; + if (kvm->arch.has_protected_eoi) + goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; @@ -7360,6 +7396,7 @@ static void kvm_probe_msr_to_save(u32 msr_index) case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) return; break; @@ -8360,7 +8397,7 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); + r = kvm_emulate_msr_read(vcpu, msr_index, pdata); if (r < 0) return X86EMUL_UNHANDLEABLE; @@ -8383,7 +8420,7 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_set_msr_with_filter(vcpu, msr_index, data); + r = kvm_emulate_msr_write(vcpu, msr_index, data); if (r < 0) return X86EMUL_UNHANDLEABLE; @@ -8403,7 +8440,7 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); } static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) @@ -8477,11 +8514,6 @@ static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) return is_smm(emul_to_vcpu(ctxt)); } -static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) -{ - return is_guest_mode(emul_to_vcpu(ctxt)); -} - #ifndef CONFIG_KVM_SMM static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { @@ -8565,7 +8597,6 @@ static const struct x86_emulate_ops emulate_ops = { .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible, .set_nmi_mask = emulator_set_nmi_mask, .is_smm = emulator_is_smm, - .is_guest_mode = emulator_is_guest_mode, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, @@ -8871,7 +8902,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_instruction_retired(vcpu); /* * rflags is the old, "raw" value of the flags. The new value has @@ -9150,7 +9181,14 @@ restart: ctxt->exception.address = 0; } - r = x86_emulate_insn(ctxt); + /* + * Check L1's instruction intercepts when emulating instructions for + * L2, unless KVM is re-emulating a previously decoded instruction, + * e.g. to complete userspace I/O, in which case KVM has already + * checked the intercepts. 
+ */ + r = x86_emulate_insn(ctxt, is_guest_mode(vcpu) && + !(emulation_type & EMULTYPE_NO_DECODE)); if (r == EMULATION_INTERCEPTED) return 1; @@ -9205,9 +9243,9 @@ writeback: */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_instruction_retired(vcpu); if (ctxt->is_branch) - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_branch_retired(vcpu); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); @@ -10803,13 +10841,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); - /* - * Recalc MSR intercepts as userspace may want to intercept - * accesses to MSRs that KVM would otherwise pass through to - * the guest. - */ - if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - kvm_x86_call(recalc_msr_intercepts)(vcpu); + if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu)) + kvm_x86_call(recalc_intercepts)(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) kvm_x86_call(update_cpu_dirty_logging)(vcpu); @@ -11310,13 +11343,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_halt); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) { - int ret; - - kvm_vcpu_srcu_read_lock(vcpu); - ret = kvm_emulate_halt(vcpu); - kvm_vcpu_srcu_read_unlock(vcpu); - - if (!ret) + if (!kvm_emulate_halt(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; if (kvm_vcpu_running(vcpu)) @@ -12395,6 +12422,41 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvfree(vcpu->arch.cpuid_entries); } +static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; + u64 xfeatures_mask; + int i; + + /* + * Guest FPU state is zero allocated and so doesn't need to be manually + * cleared on RESET, i.e. during vCPU creation. + */ + if (!init_event || !fpstate) + return; + + /* + * On INIT, only select XSTATE components are zeroed, most components + * are unchanged. Currently, the only components that are zeroed and + * supported by KVM are MPX related. + */ + xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + if (!xfeatures_mask) + return; + + BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); + + /* + * All paths that lead to INIT are required to load the guest's FPU + * state (because most paths are buried in KVM_RUN). + */ + kvm_put_guest_fpu(vcpu); + for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) + fpstate_clear_xstate_component(fpstate, i); + kvm_load_guest_fpu(vcpu); +} + void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_cpuid_entry2 *cpuid_0x1; @@ -12452,22 +12514,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { - struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; - - /* - * All paths that lead to INIT are required to load the guest's - * FPU state (because most paths are buried in KVM_RUN). 
- */ - if (init_event) - kvm_put_guest_fpu(vcpu); - - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); - - if (init_event) - kvm_load_guest_fpu(vcpu); - } + kvm_xstate_reset(vcpu, init_event); if (!init_event) { vcpu->arch.smbase = 0x30000; @@ -12479,7 +12526,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); - __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); + kvm_msr_write(vcpu, MSR_IA32_XSS, 0); } /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ @@ -13526,11 +13573,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -bool kvm_vector_hashing_enabled(void) -{ - return vector_hashing; -} - bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { return (vcpu->arch.msr_kvm_poll_control & 1) == 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index bcfd9b719ada..786e36fcd0fb 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -431,14 +431,15 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -bool kvm_vector_hashing_enabled(void); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, void *insn, int insn_len); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); -fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c index bb215230cc8a..3eaa216b96c0 100644 --- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c @@ -14,10 +14,10 @@ #define NUM_BRANCH_INSNS_RETIRED (NUM_LOOPS) /* - * Number of instructions in each loop. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, - * 1 LOOP. + * Number of instructions in each loop. 1 ENTER, 1 CLFLUSH/CLFLUSHOPT/NOP, + * 1 MFENCE, 1 MOV, 1 LEAVE, 1 LOOP. */ -#define NUM_INSNS_PER_LOOP 4 +#define NUM_INSNS_PER_LOOP 6 /* * Number of "extra" instructions that will be counted, i.e. the number of @@ -226,9 +226,11 @@ do { \ __asm__ __volatile__("wrmsr\n\t" \ " mov $" __stringify(NUM_LOOPS) ", %%ecx\n\t" \ "1:\n\t" \ + FEP "enter $0, $0\n\t" \ clflush "\n\t" \ "mfence\n\t" \ "mov %[m], %%eax\n\t" \ + FEP "leave\n\t" \ FEP "loop 1b\n\t" \ FEP "mov %%edi, %%ecx\n\t" \ FEP "xor %%eax, %%eax\n\t" \ |
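
A note on the nested MSR-bitmap lookup touched above: nested_vmx_exit_handled_msr() now also handles the immediate-form exit reasons, taking the MSR index from the exit qualification instead of RCX, but the bitmap walk itself is unchanged. As a refresher on the layout that walk relies on, here is a small standalone C sketch; it is illustrative only, not KVM's code, and the helper name plus the flat byte-array view of the 4 KiB bitmap are assumptions made for the example.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Decide whether an L2 MSR access should exit to L1, given L1's 4 KiB MSR
 * bitmap.  Layout per the comment in the hunk above: four 1024-byte bitmaps,
 * one bit per MSR, covering {read, write} x {low 0x0000_0000..0x0000_1fff,
 * high 0xc000_0000..0xc000_1fff}; the write bitmaps start at offset 2048
 * (hence "bitmap += 2048" for WRMSR exits).
 */
static bool msr_bitmap_wants_exit(const uint8_t bitmap[4096],
				  uint32_t msr_index, bool is_write)
{
	size_t base = is_write ? 2048 : 0;

	if (msr_index >= 0xc0000000) {
		msr_index -= 0xc0000000;
		base += 1024;
	}

	/* MSRs outside the two covered ranges always exit. */
	if (msr_index > 0x1fff)
		return true;

	return bitmap[base + msr_index / 8] & (1u << (msr_index % 8));
}

For example, msr_bitmap_wants_exit(bitmap, 0xc0000080 /* MSR_EFER */, true) tests bit index 0x80 of the write/high region that starts at byte offset 3072.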
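
Similarly, the intel_pmu_refresh() hunks replace the open-coded "((u64)1 << n) - 1" expressions with BIT_ULL(n) - 1 when building the per-counter value masks and the reserved-bit mask for PERF_GLOBAL_CTRL. A minimal standalone example of that arithmetic follows; the counter counts and bit width are made-up illustrative values rather than anything read from CPUID, and KVM_FIXED_PMC_BASE_IDX is reproduced here as 32, the bit position where fixed counters begin in GLOBAL_CTRL.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)		(1ULL << (n))
#define KVM_FIXED_PMC_BASE_IDX	32	/* fixed counters start at bit 32 of GLOBAL_CTRL */

int main(void)
{
	unsigned int nr_gp = 8, nr_fixed = 3, gp_bit_width = 48;

	/* All-ones mask for a single 48-bit GP counter value. */
	uint64_t counter_mask = BIT_ULL(gp_bit_width) - 1;

	/* GLOBAL_CTRL bits that do not correspond to an existing counter. */
	uint64_t counter_rsvd = ~((BIT_ULL(nr_gp) - 1) |
				  ((BIT_ULL(nr_fixed) - 1) << KVM_FIXED_PMC_BASE_IDX));

	printf("counter value mask:   0x%016" PRIx64 "\n", counter_mask);
	printf("GLOBAL_CTRL reserved: 0x%016" PRIx64 "\n", counter_rsvd);
	return 0;
}

With 8 GP and 3 fixed counters this prints 0x0000ffffffffffff and 0xfffffff8ffffff00, i.e. the values pmu->counter_bitmask[KVM_PMC_GP] and pmu->global_ctrl_rsvd would hold for such a configuration.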