diff options
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
| -rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 323 |
1 files changed, 173 insertions, 150 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 91b6f2f3edc2..4cbe8c84b636 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -203,6 +203,7 @@ module_param(pt_mode, int, S_IRUGO); struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; +#ifdef CONFIG_CPU_MITIGATIONS static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -225,7 +226,7 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; -static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; unsigned int i; @@ -302,6 +303,26 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } +static int vmx_setup_l1d_flush(void) +{ + /* + * Hand the parameter mitigation value in which was stored in the pre + * module init parser. If no parameter was given, it will contain + * 'auto' which will be turned into the default 'cond' mitigation mode. + */ + return __vmx_setup_l1d_flush(vmentry_l1d_flush_param); +} + +static void vmx_cleanup_l1d_flush(void) +{ + if (vmx_l1d_flush_pages) { + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); + vmx_l1d_flush_pages = NULL; + } + /* Restore state so sysfs ignores VMX */ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} + static int vmentry_l1d_flush_parse(const char *s) { unsigned int i; @@ -339,7 +360,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) } mutex_lock(&vmx_l1d_flush_mutex); - ret = vmx_setup_l1d_flush(l1tf); + ret = __vmx_setup_l1d_flush(l1tf); mutex_unlock(&vmx_l1d_flush_mutex); return ret; } @@ -352,6 +373,101 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + if (!static_branch_unlikely(&vmx_l1d_should_flush)) + return; + + /* + * This code is only executed when the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + /* + * Clear the per-cpu flush bit, it gets set again if the vCPU + * is reloaded, i.e. if the vCPU is scheduled out or if KVM + * exits to userspace, or if KVM reaches one of the unsafe + * VMEXIT handlers, e.g. if KVM calls into the emulator, + * or from the interrupt handlers. + */ + if (!kvm_get_cpu_l1tf_flush_l1d()) + return; + kvm_clear_cpu_l1tf_flush_l1d(); + } + + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); +} + +#else /* CONFIG_CPU_MITIGATIONS*/ +static int vmx_setup_l1d_flush(void) +{ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER; + return 0; +} +static void vmx_cleanup_l1d_flush(void) +{ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} +static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + +} +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n"); + return 0; +} +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + return sysfs_emit(s, "never\n"); +} +#endif + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) { u64 msr; @@ -404,12 +520,6 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) vmx->disable_fb_clear = false; } -static const struct kernel_param_ops vmentry_l1d_flush_ops = { - .set = vmentry_l1d_flush_set, - .get = vmentry_l1d_flush_get, -}; -module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); - static u32 vmx_segment_access_rights(struct kvm_segment *var); void vmx_vmexit(void); @@ -752,7 +862,7 @@ static void __loaded_vmcs_clear(void *arg) loaded_vmcs->launched = 0; } -void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) +static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { int cpu = loaded_vmcs->cpu; @@ -903,7 +1013,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; - if (static_branch_unlikely(&cpu_buf_vm_clear) && + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; @@ -3219,6 +3329,40 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->vpid; } +static u64 construct_eptp(hpa_t root_hpa) +{ + u64 eptp = root_hpa | VMX_EPTP_MT_WB; + struct kvm_mmu_page *root; + + if (kvm_mmu_is_dummy_root(root_hpa)) + return eptp | VMX_EPTP_PWL_4; + + /* + * EPT roots should always have an associated MMU page. Return a "bad" + * EPTP to induce VM-Fail instead of continuing on in a unknown state. + */ + root = root_to_sp(root_hpa); + if (WARN_ON_ONCE(!root)) + return INVALID_PAGE; + + eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + + if (enable_ept_ad_bits && !root->role.ad_disabled) + eptp |= VMX_EPTP_AD_ENABLE_BIT; + + return eptp; +} + +static void vmx_flush_tlb_ept_root(hpa_t root_hpa) +{ + u64 eptp = construct_eptp(root_hpa); + + if (VALID_PAGE(eptp)) + ept_sync_context(eptp); + else + ept_sync_global(); +} + void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -3229,8 +3373,7 @@ void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) return; if (enable_ept) - ept_sync_context(construct_eptp(vcpu, root_hpa, - mmu->root_role.level)); + vmx_flush_tlb_ept_root(root_hpa); else vpid_sync_context(vmx_get_current_vpid(vcpu)); } @@ -3396,30 +3539,16 @@ static int vmx_get_max_ept_level(void) return 4; } -u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) -{ - u64 eptp = VMX_EPTP_MT_WB; - - eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; - - if (enable_ept_ad_bits && - (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) - eptp |= VMX_EPTP_AD_ENABLE_BIT; - eptp |= root_hpa; - - return eptp; -} - void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; unsigned long guest_cr3; - u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, root_hpa, root_level); - vmcs_write64(EPT_POINTER, eptp); + KVM_MMU_WARN_ON(root_to_sp(root_hpa) && + root_level != root_to_sp(root_hpa)->role.level); + vmcs_write64(EPT_POINTER, construct_eptp(root_hpa)); hv_track_root_tdp(vcpu, root_hpa); @@ -6631,15 +6760,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return kvm_vmx_exit_handlers[exit_handler_index](vcpu); unexpected_vmexit: - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason.full); dump_vmcs(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_reason.full; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full); return 0; } @@ -6661,77 +6783,6 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return ret; } -/* - * Software based L1D cache flush which is used when microcode providing - * the cache control MSR is not loaded. - * - * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to - * flush it is required to read in 64 KiB because the replacement algorithm - * is not exactly LRU. This could be sized at runtime via topology - * information but as all relevant affected CPUs have 32KiB L1D cache size - * there is no point in doing so. - */ -static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) -{ - int size = PAGE_SIZE << L1D_CACHE_ORDER; - - /* - * This code is only executed when the flush mode is 'cond' or - * 'always' - */ - if (static_branch_likely(&vmx_l1d_flush_cond)) { - bool flush_l1d; - - /* - * Clear the per-vcpu flush bit, it gets set again if the vCPU - * is reloaded, i.e. if the vCPU is scheduled out or if KVM - * exits to userspace, or if KVM reaches one of the unsafe - * VMEXIT handlers, e.g. if KVM calls into the emulator. - */ - flush_l1d = vcpu->arch.l1tf_flush_l1d; - vcpu->arch.l1tf_flush_l1d = false; - - /* - * Clear the per-cpu flush bit, it gets set again from - * the interrupt handlers. - */ - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); - kvm_clear_cpu_l1tf_flush_l1d(); - - if (!flush_l1d) - return; - } - - vcpu->stat.l1d_flush++; - - if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); - return; - } - - asm volatile( - /* First ensure the pages are in the TLB */ - "xorl %%eax, %%eax\n" - ".Lpopulate_tlb:\n\t" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $4096, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lpopulate_tlb\n\t" - "xorl %%eax, %%eax\n\t" - "cpuid\n\t" - /* Now fill the cache */ - "xorl %%eax, %%eax\n" - ".Lfill_cache:\n" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $64, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lfill_cache\n\t" - "lfence\n" - :: [flush_pages] "r" (vmx_l1d_flush_pages), - [size] "r" (size) - : "eax", "ebx", "ecx", "edx"); -} - void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -7050,10 +7101,19 @@ void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) if (to_vt(vcpu)->emulation_required) return; - if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) + switch (vmx_get_exit_reason(vcpu).basic) { + case EXIT_REASON_EXTERNAL_INTERRUPT: handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); - else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) + break; + case EXIT_REASON_EXCEPTION_NMI: handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); + break; + case EXIT_REASON_MCE_DURING_VMENTRY: + kvm_machine_check(); + break; + default: + break; + } } /* @@ -7328,21 +7388,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_enter_irqoff(); - /* - * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW - * mitigation for MDS is done late in VMentry and is still - * executed in spite of L1D Flush. This is because an extra VERW - * should not matter much after the big hammer L1D Flush. - * - * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, - * and is affected by MMIO Stale Data. In such cases mitigation in only - * needed against an MMIO capable guest. - */ - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&cpu_buf_vm_clear) && - (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) - x86_clear_cpu_buffers(); + vmx_l1d_flush(vcpu); vmx_disable_fb_clear(vmx); @@ -7454,8 +7500,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - kvm_load_guest_xsave_state(vcpu); - pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); @@ -7499,8 +7543,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) pt_guest_exit(vmx); - kvm_load_host_xsave_state(vcpu); - if (is_guest_mode(vcpu)) { /* * Track VMLAUNCH/VMRESUME that have made past guest state @@ -7516,9 +7558,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) - kvm_machine_check(); - trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) @@ -8679,16 +8718,6 @@ __init int vmx_hardware_setup(void) return r; } -static void vmx_cleanup_l1d_flush(void) -{ - if (vmx_l1d_flush_pages) { - free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); - vmx_l1d_flush_pages = NULL; - } - /* Restore state so sysfs ignores VMX */ - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; -} - void vmx_exit(void) { allow_smaller_maxphyaddr = false; @@ -8724,14 +8753,8 @@ int __init vmx_init(void) if (r) return r; - /* - * Must be called after common x86 init so enable_ept is properly set - * up. Hand the parameter mitigation value in which was stored in - * the pre module init parser. If no parameter was given, it will - * contain 'auto' which will be turned into the default 'cond' - * mitigation mode. - */ - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + /* Must be called after common x86 init so enable_ept is setup. */ + r = vmx_setup_l1d_flush(); if (r) goto err_l1d_flush; |
