summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c58
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h1
-rw-r--r--arch/x86/kernel/head32.c4
-rw-r--r--arch/x86/kernel/vmlinux.lds.S10
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/mmu/mmu.c70
-rw-r--r--arch/x86/kvm/smm.c1
-rw-r--r--arch/x86/kvm/svm/sev.c32
-rw-r--r--arch/x86/kvm/svm/svm.c75
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/x86.c4
-rw-r--r--arch/x86/mm/tlb.c22
16 files changed, 220 insertions, 73 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b9f378e05f6..5873c9e39919 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2368,6 +2368,7 @@ config STRICT_SIGALTSTACK_SIZE
config CFI_AUTO_DEFAULT
bool "Attempt to use FineIBT by default at boot time"
depends on FINEIBT
+ depends on !RUST || RUSTC_VERSION >= 108800
default y
help
Attempt to use FineIBT by default at boot time. If enabled,
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 695e569159c1..be7cddc414e4 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -17,10 +17,12 @@ struct ucode_cpu_info {
void load_ucode_bsp(void);
void load_ucode_ap(void);
void microcode_bsp_resume(void);
+bool __init microcode_loader_disabled(void);
#else
static inline void load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void microcode_bsp_resume(void) { }
+static inline bool __init microcode_loader_disabled(void) { return false; }
#endif
extern unsigned long initrd_start_early;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 4a10d35e70aa..96cb992d50ef 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -1098,15 +1098,17 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
static int __init save_microcode_in_initrd(void)
{
- unsigned int cpuid_1_eax = native_cpuid_eax(1);
struct cpuinfo_x86 *c = &boot_cpu_data;
struct cont_desc desc = { 0 };
+ unsigned int cpuid_1_eax;
enum ucode_state ret;
struct cpio_data cp;
- if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
return 0;
+ cpuid_1_eax = native_cpuid_eax(1);
+
if (!find_blobs_in_containers(&cp))
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b3658d11e7b6..079f046ee26d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -41,8 +41,8 @@
#include "internal.h"
-static struct microcode_ops *microcode_ops;
-bool dis_ucode_ldr = true;
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr = false;
bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
@@ -84,6 +84,9 @@ static bool amd_check_current_patch_level(void)
u32 lvl, dummy, i;
u32 *levels;
+ if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+ return false;
+
native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
levels = final_levels;
@@ -95,27 +98,29 @@ static bool amd_check_current_patch_level(void)
return false;
}
-static bool __init check_loader_disabled_bsp(void)
+bool __init microcode_loader_disabled(void)
{
- static const char *__dis_opt_str = "dis_ucode_ldr";
- const char *cmdline = boot_command_line;
- const char *option = __dis_opt_str;
+ if (dis_ucode_ldr)
+ return true;
/*
- * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
- * completely accurate as xen pv guests don't see that CPUID bit set but
- * that's good enough as they don't land on the BSP path anyway.
+ * Disable when:
+ *
+ * 1) The CPU does not support CPUID.
+ *
+ * 2) Bit 31 in CPUID[1]:ECX is clear
+ * The bit is reserved for hypervisor use. This is still not
+ * completely accurate as XEN PV guests don't see that CPUID bit
+ * set, but that's good enough as they don't land on the BSP
+ * path anyway.
+ *
+ * 3) Certain AMD patch levels are not allowed to be
+ * overwritten.
*/
- if (native_cpuid_ecx(1) & BIT(31))
- return true;
-
- if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
- if (amd_check_current_patch_level())
- return true;
- }
-
- if (cmdline_find_option_bool(cmdline, option) <= 0)
- dis_ucode_ldr = false;
+ if (!have_cpuid_p() ||
+ native_cpuid_ecx(1) & BIT(31) ||
+ amd_check_current_patch_level())
+ dis_ucode_ldr = true;
return dis_ucode_ldr;
}
@@ -125,7 +130,10 @@ void __init load_ucode_bsp(void)
unsigned int cpuid_1_eax;
bool intel = true;
- if (!have_cpuid_p())
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+
+ if (microcode_loader_disabled())
return;
cpuid_1_eax = native_cpuid_eax(1);
@@ -146,9 +154,6 @@ void __init load_ucode_bsp(void)
return;
}
- if (check_loader_disabled_bsp())
- return;
-
if (intel)
load_ucode_intel_bsp(&early_data);
else
@@ -159,6 +164,11 @@ void load_ucode_ap(void)
{
unsigned int cpuid_1_eax;
+ /*
+ * Can't use microcode_loader_disabled() here - .init section
+ * hell. It doesn't have to either - the BSP variant must've
+ * parsed cmdline already anyway.
+ */
if (dis_ucode_ldr)
return;
@@ -810,7 +820,7 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
- if (dis_ucode_ldr)
+ if (microcode_loader_disabled())
return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 819199bc0119..2a397da43923 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -389,7 +389,7 @@ static int __init save_builtin_microcode(void)
if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
return 0;
- if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
uci.mc = get_microcode_blob(&uci, true);
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index 5df621752fef..50a9702ae4e2 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -94,7 +94,6 @@ static inline unsigned int x86_cpuid_family(void)
return x86_family(eax);
}
-extern bool dis_ucode_ldr;
extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index de001b2146ab..375f2d7f1762 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void)
*ptr = (unsigned long)ptep + PAGE_OFFSET;
#ifdef CONFIG_MICROCODE_INITRD32
- /* Running on a hypervisor? */
- if (native_cpuid_ecx(1) & BIT(31))
- return;
-
params = (struct boot_params *)__pa_nodebug(&boot_params);
if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
return;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ccdc45e5b759..aa4d0221583c 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -466,10 +466,18 @@ SECTIONS
}
/*
- * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause
+ * this. Let's assume that nobody will be running a COMPILE_TEST kernel and
+ * let's assert that fuller build coverage is more valuable than being able to
+ * run a COMPILE_TEST kernel.
+ */
+#ifndef CONFIG_COMPILE_TEST
+/*
+ * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility:
*/
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
+#endif
/* needed for Clang - see arch/x86/entry/entry.S */
PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 050a0e229a4d..f2b36d32ef40 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
+
/*
* Checking root.hpa is sufficient even when KVM has mirror root.
* We can have either:
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 63bb77ee1bb1..8d1b632e33d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots);
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
int *bytes)
@@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range)
{
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
/*
* Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
* supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
@@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;
+ if (WARN_ON_ONCE(range->end <= range->start))
+ return false;
+
+ /*
+ * If the head and tail pages of the range currently allow a hugepage,
+ * i.e. reside fully in the slot and don't have mixed attributes, then
+ * add each corresponding hugepage range to the ongoing invalidation,
+ * e.g. to prevent KVM from creating a hugepage in response to a fault
+ * for a gfn whose attributes aren't changing. Note, only the range
+ * of gfns whose attributes are being modified needs to be explicitly
+ * unmapped, as that will unmap any existing hugepages.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t start = gfn_round_for_level(range->start, level);
+ gfn_t end = gfn_round_for_level(range->end - 1, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+
+ if ((start != range->start || start + nr_pages > range->end) &&
+ start >= slot->base_gfn &&
+ start + nr_pages <= slot->base_gfn + slot->npages &&
+ !hugepage_test_mixed(slot, start, level))
+ kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages);
+
+ if (end == start)
+ continue;
+
+ if ((end + nr_pages) > range->end &&
+ (end + nr_pages) <= (slot->base_gfn + slot->npages) &&
+ !hugepage_test_mixed(slot, end, level))
+ kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages);
+ }
+
/* Unmap the old attribute page. */
if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
range->attr_filter = KVM_FILTER_SHARED;
@@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
return kvm_unmap_gfn_range(kvm, range);
}
-static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
-}
-
-static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
-}
-static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
-}
static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, int level, unsigned long attrs)
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index 699e551ec93b..9864c057187d 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
kvm_mmu_reset_context(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_smm_changed);
void process_smi(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0bc708ee2788..a7a7dc507336 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3173,9 +3173,14 @@ skip_vmsa_free:
kvfree(svm->sev_es.ghcb_sa);
}
+static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
+{
+ return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
static void dump_ghcb(struct vcpu_svm *svm)
{
- struct ghcb *ghcb = svm->sev_es.ghcb;
+ struct vmcb_control_area *control = &svm->vmcb->control;
unsigned int nbits;
/* Re-use the dump_invalid_vmcb module parameter */
@@ -3184,18 +3189,24 @@ static void dump_ghcb(struct vcpu_svm *svm)
return;
}
- nbits = sizeof(ghcb->save.valid_bitmap) * 8;
+ nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
- pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa);
+ /*
+ * Print KVM's snapshot of the GHCB values that were (unsuccessfully)
+ * used to handle the exit. If the guest has since modified the GHCB
+ * itself, dumping the raw GHCB won't help debug why KVM was unable to
+ * handle the VMGEXIT that KVM observed.
+ */
+ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
- ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb));
+ kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
- ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb));
+ control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
- ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb));
+ control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
- ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb));
- pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);
+ svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
+ pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
}
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
@@ -3266,11 +3277,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
-{
- return (((u64)control->exit_code_hi) << 32) | control->exit_code;
-}
-
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d5d0c5c3300b..a89c271a1951 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
kvm_cpu_svm_disable();
amd_pmu_disable_virt();
-
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
static int svm_enable_virtualization_cpu(void)
@@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
-
return 0;
}
@@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_SPINLOCK(srso_lock);
+static atomic_t srso_nr_vms;
+
+static void svm_srso_clear_bp_spec_reduce(void *ign)
+{
+ struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
+
+ if (!sd->bp_spec_reduce_set)
+ return;
+
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ sd->bp_spec_reduce_set = false;
+}
+
+static void svm_srso_vm_destroy(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ if (atomic_dec_return(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ /*
+ * Verify a new VM didn't come along, acquire the lock, and increment
+ * the count before this task acquired the lock.
+ */
+ if (atomic_read(&srso_nr_vms))
+ return;
+
+ on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
+}
+
+static void svm_srso_vm_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ /*
+ * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
+ * transition, i.e. destroying the last VM, is fully complete, e.g. so
+ * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
+ */
+ if (atomic_inc_not_zero(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ atomic_inc(&srso_nr_vms);
+}
+#else
+static void svm_srso_vm_init(void) { }
+static void svm_srso_vm_destroy(void) { }
+#endif
+
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
+ !sd->bp_spec_reduce_set) {
+ sd->bp_spec_reduce_set = true;
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ }
svm->guest_state_loaded = true;
}
@@ -2231,6 +2287,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
*/
if (!sev_es_guest(vcpu->kvm)) {
clear_page(svm->vmcb);
+#ifdef CONFIG_KVM_SMM
+ if (is_smm(vcpu))
+ kvm_smm_changed(vcpu, false);
+#endif
kvm_vcpu_reset(vcpu, true);
}
@@ -5036,6 +5096,8 @@ static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
sev_vm_destroy(kvm);
+
+ svm_srso_vm_destroy();
}
static int svm_vm_init(struct kvm *kvm)
@@ -5061,6 +5123,7 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ svm_srso_vm_init();
return 0;
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d4490eaed55d..f16b068c4228 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -335,6 +335,8 @@ struct svm_cpu_data {
u32 next_asid;
u32 min_asid;
+ bool bp_spec_reduce_set;
+
struct vmcb *save_area;
unsigned long save_area_pa;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index df5b99ea1f18..9896fd574bfc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4597,7 +4597,7 @@ static bool kvm_is_vm_type_supported(unsigned long type)
return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}
-static inline u32 kvm_sync_valid_fields(struct kvm *kvm)
+static inline u64 kvm_sync_valid_fields(struct kvm *kvm)
{
return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
}
@@ -11493,7 +11493,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct kvm_run *kvm_run = vcpu->run;
- u32 sync_valid_fields;
+ u64 sync_valid_fields;
int r;
r = kvm_mmu_post_init_vm(vcpu->kvm);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index eb83348f9305..b6d6750e4bd1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -899,8 +899,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
cond_mitigation(tsk);
/*
- * Let nmi_uaccess_okay() and finish_asid_transition()
- * know that CR3 is changing.
+ * Indicate that CR3 is about to change. nmi_uaccess_okay()
+ * and others are sensitive to the window where mm_cpumask(),
+ * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
*/
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
@@ -1204,8 +1205,16 @@ done:
static bool should_flush_tlb(int cpu, void *data)
{
+ struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
struct flush_tlb_info *info = data;
+ /*
+ * Order the 'loaded_mm' and 'is_lazy' against their
+ * write ordering in switch_mm_irqs_off(). Ensure
+ * 'is_lazy' is at least as new as 'loaded_mm'.
+ */
+ smp_rmb();
+
/* Lazy TLB will get flushed at the next context switch. */
if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
return false;
@@ -1214,8 +1223,15 @@ static bool should_flush_tlb(int cpu, void *data)
if (!info->mm)
return true;
+ /*
+ * While switching, the remote CPU could have state from
+ * either the prev or next mm. Assume the worst and flush.
+ */
+ if (loaded_mm == LOADED_MM_SWITCHING)
+ return true;
+
/* The target mm is loaded, and the CPU is not lazy. */
- if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ if (loaded_mm == info->mm)
return true;
/* In cpumask, but not the loaded mm? Periodically remove by flushing. */