summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-04-08 13:47:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-04-08 13:47:55 -0700
commit0e8863244ef5b7d4391816062fcc07ff49aa7dcf (patch)
tree9aaa7beb04bd670f930940d30c177284196b0628 /arch
parentbec7dcbc242c6c087cede1a6fdfaeb5d6eaf25bf (diff)
parentc478032df0789250afe861bff5306d0dc4a8f9e5 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini: "ARM: - Rework heuristics for resolving the fault IPA (HPFAR_EL2 v. re-walk stage-1 page tables) to align with the architecture. This avoids possibly taking an SEA at EL2 on the page table walk or using an architecturally UNKNOWN fault IPA - Use acquire/release semantics in the KVM FF-A proxy to avoid reading a stale value for the FF-A version - Fix KVM guest driver to match PV CPUID hypercall ABI - Use Inner Shareable Normal Write-Back mappings at stage-1 in KVM selftests, which is the only memory type for which atomic instructions are architecturally guaranteed to work s390: - Don't use %pK for debug printing and tracepoints x86: - Use a separate subclass when acquiring KVM's per-CPU posted interrupts wakeup lock in the scheduled out path, i.e. when adding a vCPU on the list of vCPUs to wake, to workaround a false positive deadlock. The schedule out code runs with a scheduler lock that the wakeup handler takes in the opposite order; but it does so with IRQs disabled and cannot run concurrently with a wakeup - Explicitly zero-initialize on-stack CPUID unions - Allow building irqbypass.ko as as module when kvm.ko is a module - Wrap relatively expensive sanity check with KVM_PROVE_MMU - Acquire SRCU in KVM_GET_MP_STATE to protect guest memory accesses selftests: - Add more scenarios to the MONITOR/MWAIT test - Add option to rseq test to override /dev/cpu_dma_latency - Bring list of exit reasons up to date - Cleanup Makefile to list once tests that are valid on all architectures Other: - Documentation fixes" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (26 commits) KVM: arm64: Use acquire/release to communicate FF-A version negotiation KVM: arm64: selftests: Explicitly set the page attrs to Inner-Shareable KVM: arm64: selftests: Introduce and use hardware-definition macros KVM: VMX: Use separate subclasses for PI wakeup lock to squash false positive KVM: VMX: Assert that IRQs are disabled when putting vCPU on PI wakeup list KVM: x86: Explicitly zero-initialize on-stack CPUID unions KVM: Allow building irqbypass.ko as as module when kvm.ko is a module KVM: x86/mmu: Wrap sanity check on number of TDP MMU pages with KVM_PROVE_MMU KVM: selftests: Add option to rseq test to override /dev/cpu_dma_latency KVM: x86: Acquire SRCU in KVM_GET_MP_STATE to protect guest memory accesses Documentation: kvm: remove KVM_CAP_MIPS_TE Documentation: kvm: organize capabilities in the right section Documentation: kvm: fix some definition lists Documentation: kvm: drop "Capability" heading from capabilities Documentation: kvm: give correct name for KVM_CAP_SPAPR_MULTITCE Documentation: KVM: KVM_GET_SUPPORTED_CPUID now exposes TSC_DEADLINE selftests: kvm: list once tests that are valid on all architectures selftests: kvm: bring list of exit reasons up to date selftests: kvm: revamp MONITOR/MWAIT tests KVM: arm64: Don't translate FAR if invalid/unsafe ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm64/include/asm/esr.h44
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h7
-rw-r--r--arch/arm64/include/asm/kvm_ras.h2
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/fault.h70
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c9
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c9
-rw-r--r--arch/arm64/kvm/mmu.c31
-rw-r--r--arch/arm64/tools/sysreg7
-rw-r--r--arch/s390/kvm/intercept.c2
-rw-r--r--arch/s390/kvm/interrupt.c8
-rw-r--r--arch/s390/kvm/kvm-s390.c10
-rw-r--r--arch/s390/kvm/trace-s390.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h7
-rw-r--r--arch/x86/kvm/cpuid.c8
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c8
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c37
-rw-r--r--arch/x86/kvm/x86.c4
17 files changed, 198 insertions, 69 deletions
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index d1b1a33f9a8b..e4f77757937e 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -121,6 +121,15 @@
#define ESR_ELx_FSC_SEA_TTW(n) (0x14 + (n))
#define ESR_ELx_FSC_SECC (0x18)
#define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n))
+#define ESR_ELx_FSC_ADDRSZ (0x00)
+
+/*
+ * Annoyingly, the negative levels for Address size faults aren't laid out
+ * contiguously (or in the desired order)
+ */
+#define ESR_ELx_FSC_ADDRSZ_nL(n) ((n) == -1 ? 0x25 : 0x2C)
+#define ESR_ELx_FSC_ADDRSZ_L(n) ((n) < 0 ? ESR_ELx_FSC_ADDRSZ_nL(n) : \
+ (ESR_ELx_FSC_ADDRSZ + (n)))
/* Status codes for individual page table levels */
#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n))
@@ -161,8 +170,6 @@
#define ESR_ELx_Xs_MASK (GENMASK_ULL(4, 0))
/* ISS field definitions for exceptions taken in to Hyp */
-#define ESR_ELx_FSC_ADDRSZ (0x00)
-#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n))
#define ESR_ELx_CV (UL(1) << 24)
#define ESR_ELx_COND_SHIFT (20)
#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT)
@@ -464,6 +471,39 @@ static inline bool esr_fsc_is_access_flag_fault(unsigned long esr)
(esr == ESR_ELx_FSC_ACCESS_L(0));
}
+static inline bool esr_fsc_is_addr_sz_fault(unsigned long esr)
+{
+ esr &= ESR_ELx_FSC;
+
+ return (esr == ESR_ELx_FSC_ADDRSZ_L(3)) ||
+ (esr == ESR_ELx_FSC_ADDRSZ_L(2)) ||
+ (esr == ESR_ELx_FSC_ADDRSZ_L(1)) ||
+ (esr == ESR_ELx_FSC_ADDRSZ_L(0)) ||
+ (esr == ESR_ELx_FSC_ADDRSZ_L(-1));
+}
+
+static inline bool esr_fsc_is_sea_ttw(unsigned long esr)
+{
+ esr = esr & ESR_ELx_FSC;
+
+ return (esr == ESR_ELx_FSC_SEA_TTW(3)) ||
+ (esr == ESR_ELx_FSC_SEA_TTW(2)) ||
+ (esr == ESR_ELx_FSC_SEA_TTW(1)) ||
+ (esr == ESR_ELx_FSC_SEA_TTW(0)) ||
+ (esr == ESR_ELx_FSC_SEA_TTW(-1));
+}
+
+static inline bool esr_fsc_is_secc_ttw(unsigned long esr)
+{
+ esr = esr & ESR_ELx_FSC;
+
+ return (esr == ESR_ELx_FSC_SECC_TTW(3)) ||
+ (esr == ESR_ELx_FSC_SECC_TTW(2)) ||
+ (esr == ESR_ELx_FSC_SECC_TTW(1)) ||
+ (esr == ESR_ELx_FSC_SECC_TTW(0)) ||
+ (esr == ESR_ELx_FSC_SECC_TTW(-1));
+}
+
/* Indicate whether ESR.EC==0x1A is for an ERETAx instruction */
static inline bool esr_iss_is_eretax(unsigned long esr)
{
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index d7cf66573aca..bd020fc28aa9 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -305,7 +305,12 @@ static __always_inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vc
static __always_inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu)
{
- return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8;
+ u64 hpfar = vcpu->arch.fault.hpfar_el2;
+
+ if (unlikely(!(hpfar & HPFAR_EL2_NS)))
+ return INVALID_GPA;
+
+ return FIELD_GET(HPFAR_EL2_FIPA, hpfar) << 12;
}
static inline u64 kvm_vcpu_get_disr(const struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_ras.h b/arch/arm64/include/asm/kvm_ras.h
index 87e10d9a635b..9398ade632aa 100644
--- a/arch/arm64/include/asm/kvm_ras.h
+++ b/arch/arm64/include/asm/kvm_ras.h
@@ -14,7 +14,7 @@
* Was this synchronous external abort a RAS notification?
* Returns '0' for errors handled by some RAS subsystem, or -ENOENT.
*/
-static inline int kvm_handle_guest_sea(phys_addr_t addr, u64 esr)
+static inline int kvm_handle_guest_sea(void)
{
/* apei_claim_sea(NULL) expects to mask interrupts itself */
lockdep_assert_irqs_enabled();
diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h
index 17df94570f03..fc573fc767b0 100644
--- a/arch/arm64/kvm/hyp/include/hyp/fault.h
+++ b/arch/arm64/kvm/hyp/include/hyp/fault.h
@@ -12,6 +12,16 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+static inline bool __fault_safe_to_translate(u64 esr)
+{
+ u64 fsc = esr & ESR_ELx_FSC;
+
+ if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr))
+ return false;
+
+ return !(fsc == ESR_ELx_FSC_EXTABT && (esr & ESR_ELx_FnV));
+}
+
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
int ret;
@@ -44,34 +54,50 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
return true;
}
-static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
+/*
+ * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR.
+ */
+static inline bool __hpfar_valid(u64 esr)
{
- u64 hpfar, far;
-
- far = read_sysreg_el2(SYS_FAR);
-
/*
- * The HPFAR can be invalid if the stage 2 fault did not
- * happen during a stage 1 page table walk (the ESR_EL2.S1PTW
- * bit is clear) and one of the two following cases are true:
- * 1. The fault was due to a permission fault
- * 2. The processor carries errata 834220
+ * CPUs affected by ARM erratum #834220 may incorrectly report a
+ * stage-2 translation fault when a stage-1 permission fault occurs.
*
- * Therefore, for all non S1PTW faults where we either have a
- * permission fault or the errata workaround is enabled, we
- * resolve the IPA using the AT instruction.
+ * Re-walk the page tables to determine if a stage-1 fault actually
+ * occurred.
*/
- if (!(esr & ESR_ELx_S1PTW) &&
- (cpus_have_final_cap(ARM64_WORKAROUND_834220) ||
- esr_fsc_is_permission_fault(esr))) {
- if (!__translate_far_to_hpfar(far, &hpfar))
- return false;
- } else {
+ if (cpus_have_final_cap(ARM64_WORKAROUND_834220) &&
+ esr_fsc_is_translation_fault(esr))
+ return false;
+
+ if (esr_fsc_is_translation_fault(esr) || esr_fsc_is_access_flag_fault(esr))
+ return true;
+
+ if ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr))
+ return true;
+
+ return esr_fsc_is_addr_sz_fault(esr);
+}
+
+static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
+{
+ u64 hpfar;
+
+ fault->far_el2 = read_sysreg_el2(SYS_FAR);
+ fault->hpfar_el2 = 0;
+
+ if (__hpfar_valid(esr))
hpfar = read_sysreg(hpfar_el2);
- }
+ else if (unlikely(!__fault_safe_to_translate(esr)))
+ return true;
+ else if (!__translate_far_to_hpfar(fault->far_el2, &hpfar))
+ return false;
- fault->far_el2 = far;
- fault->hpfar_el2 = hpfar;
+ /*
+ * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid
+ * HPFAR value.
+ */
+ fault->hpfar_el2 = hpfar | HPFAR_EL2_NS;
return true;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index e433dfab882a..3369dd0c4009 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -730,10 +730,10 @@ static void do_ffa_version(struct arm_smccc_res *res,
hyp_ffa_version = ffa_req_version;
}
- if (hyp_ffa_post_init())
+ if (hyp_ffa_post_init()) {
res->a0 = FFA_RET_NOT_SUPPORTED;
- else {
- has_version_negotiated = true;
+ } else {
+ smp_store_release(&has_version_negotiated, true);
res->a0 = hyp_ffa_version;
}
unlock:
@@ -809,7 +809,8 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
if (!is_ffa_call(func_id))
return false;
- if (!has_version_negotiated && func_id != FFA_VERSION) {
+ if (func_id != FFA_VERSION &&
+ !smp_load_acquire(&has_version_negotiated)) {
ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS);
goto out_handled;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index f34f11c720d7..2a5284f749b4 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -578,7 +578,14 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
return;
}
- addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
+
+ /*
+ * Yikes, we couldn't resolve the fault IPA. This should reinject an
+ * abort into the host when we figure out how to do that.
+ */
+ BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS));
+ addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12;
+
ret = host_stage2_idmap(addr);
BUG_ON(ret && ret != -EAGAIN);
}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2feb6c6b63af..754f2fe0cc67 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1794,9 +1794,28 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
gfn_t gfn;
int ret, idx;
+ /* Synchronous External Abort? */
+ if (kvm_vcpu_abt_issea(vcpu)) {
+ /*
+ * For RAS the host kernel may handle this abort.
+ * There is no need to pass the error into the guest.
+ */
+ if (kvm_handle_guest_sea())
+ kvm_inject_vabt(vcpu);
+
+ return 1;
+ }
+
esr = kvm_vcpu_get_esr(vcpu);
+ /*
+ * The fault IPA should be reliable at this point as we're not dealing
+ * with an SEA.
+ */
ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
+ return -EFAULT;
+
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
if (esr_fsc_is_translation_fault(esr)) {
@@ -1818,18 +1837,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
}
- /* Synchronous External Abort? */
- if (kvm_vcpu_abt_issea(vcpu)) {
- /*
- * For RAS the host kernel may handle this abort.
- * There is no need to pass the error into the guest.
- */
- if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
- kvm_inject_vabt(vcpu);
-
- return 1;
- }
-
trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index f9476848a2ed..bdf044c5d11b 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -3536,3 +3536,10 @@ Field 5 F
Field 4 P
Field 3:0 Align
EndSysreg
+
+Sysreg HPFAR_EL2 3 4 6 0 4
+Field 63 NS
+Res0 62:48
+Field 47:4 FIPA
+Res0 3:0
+EndSysreg
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 610dd44a948b..a06a000f196c 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -95,7 +95,7 @@ static int handle_validity(struct kvm_vcpu *vcpu)
vcpu->stat.exit_validity++;
trace_kvm_s390_intercept_validity(vcpu, viwhy);
- KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy,
+ KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%p)", viwhy,
current->pid, vcpu->kvm);
/* do not warn on invalid runtime instrumentation mode */
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 2811a6c093b8..60c360c18690 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -3161,7 +3161,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm)
if (!gi->origin)
return;
gisa_clear_ipm(gi->origin);
- VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin);
+ VM_EVENT(kvm, 3, "gisa 0x%p cleared", gi->origin);
}
void kvm_s390_gisa_init(struct kvm *kvm)
@@ -3177,7 +3177,7 @@ void kvm_s390_gisa_init(struct kvm *kvm)
hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
memset(gi->origin, 0, sizeof(struct kvm_s390_gisa));
gi->origin->next_alert = (u32)virt_to_phys(gi->origin);
- VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin);
+ VM_EVENT(kvm, 3, "gisa 0x%p initialized", gi->origin);
}
void kvm_s390_gisa_enable(struct kvm *kvm)
@@ -3218,7 +3218,7 @@ void kvm_s390_gisa_destroy(struct kvm *kvm)
process_gib_alert_list();
hrtimer_cancel(&gi->timer);
gi->origin = NULL;
- VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa);
+ VM_EVENT(kvm, 3, "gisa 0x%p destroyed", gisa);
}
void kvm_s390_gisa_disable(struct kvm *kvm)
@@ -3467,7 +3467,7 @@ int __init kvm_s390_gib_init(u8 nisc)
}
}
- KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
+ KVM_EVENT(3, "gib 0x%p (nisc=%d) initialized", gib, gib->nisc);
goto out;
out_unreg_gal:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fff863734975..3f3175193fd7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1022,7 +1022,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
}
mutex_unlock(&kvm->lock);
VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit);
- VM_EVENT(kvm, 3, "New guest asce: 0x%pK",
+ VM_EVENT(kvm, 3, "New guest asce: 0x%p",
(void *) kvm->arch.gmap->asce);
break;
}
@@ -3466,7 +3466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_s390_gisa_init(kvm);
INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup);
kvm->arch.pv.set_aside = NULL;
- KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
+ KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid);
return 0;
out_err:
@@ -3529,7 +3529,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
kvm_s390_vsie_destroy(kvm);
- KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
+ KVM_EVENT(3, "vm 0x%p destroyed", kvm);
}
/* Section: vcpu related */
@@ -3650,7 +3650,7 @@ static int sca_switch_to_extended(struct kvm *kvm)
free_page((unsigned long)old_sca);
- VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)",
+ VM_EVENT(kvm, 2, "Switched to ESCA (0x%p -> 0x%p)",
old_sca, kvm->arch.sca);
return 0;
}
@@ -4027,7 +4027,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto out_free_sie_block;
}
- VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK",
+ VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%p, sie block at 0x%p",
vcpu->vcpu_id, vcpu, vcpu->arch.sie_block);
trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block);
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 9ac92dbf680d..9e28f165c114 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -56,7 +56,7 @@ TRACE_EVENT(kvm_s390_create_vcpu,
__entry->sie_block = sie_block;
),
- TP_printk("create cpu %d at 0x%pK, sie block at 0x%pK",
+ TP_printk("create cpu %d at 0x%p, sie block at 0x%p",
__entry->id, __entry->vcpu, __entry->sie_block)
);
@@ -255,7 +255,7 @@ TRACE_EVENT(kvm_s390_enable_css,
__entry->kvm = kvm;
),
- TP_printk("enabling channel I/O support (kvm @ %pK)\n",
+ TP_printk("enabling channel I/O support (kvm @ %p)\n",
__entry->kvm)
);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a884ab544335..3bdae454a959 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1472,8 +1472,13 @@ struct kvm_arch {
struct once nx_once;
#ifdef CONFIG_X86_64
- /* The number of TDP MMU pages across all roots. */
+#ifdef CONFIG_KVM_PROVE_MMU
+ /*
+ * The number of TDP MMU pages across all roots. Used only to sanity
+ * check that KVM isn't leaking TDP MMU pages.
+ */
atomic64_t tdp_mmu_pages;
+#endif
/*
* List of struct kvm_mmu_pages being used as roots.
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 5e4d4934c0d3..571c906ffcbf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1427,8 +1427,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
}
break;
case 0xa: { /* Architectural Performance Monitoring */
- union cpuid10_eax eax;
- union cpuid10_edx edx;
+ union cpuid10_eax eax = { };
+ union cpuid10_edx edx = { };
if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
@@ -1444,8 +1444,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
if (kvm_pmu_cap.version)
edx.split.anythread_deprecated = 1;
- edx.split.reserved1 = 0;
- edx.split.reserved2 = 0;
entry->eax = eax.full;
entry->ebx = kvm_pmu_cap.events_mask;
@@ -1763,7 +1761,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
/* AMD Extended Performance Monitoring and Debug */
case 0x80000022: {
- union cpuid_0x80000022_ebx ebx;
+ union cpuid_0x80000022_ebx ebx = { };
entry->ecx = entry->edx = 0;
if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7cc0564f5f97..21a3b8166242 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -40,7 +40,9 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
- WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+#ifdef CONFIG_KVM_PROVE_MMU
+ KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+#endif
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
@@ -325,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
+#ifdef CONFIG_KVM_PROVE_MMU
atomic64_inc(&kvm->arch.tdp_mmu_pages);
+#endif
}
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, -1);
+#ifdef CONFIG_KVM_PROVE_MMU
atomic64_dec(&kvm->arch.tdp_mmu_pages);
+#endif
}
/**
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index ec08fa3caf43..51116fe69a50 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
*/
static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
+#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
+
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
return &(to_vmx(vcpu)->pi_desc);
@@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* current pCPU if the task was migrated.
*/
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
- raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu);
+
+ /*
+ * In addition to taking the wakeup lock for the regular/IRQ
+ * context, tell lockdep it is being taken for the "sched out"
+ * context as well. vCPU loads happens in task context, and
+ * this is taking the lock of the *previous* CPU, i.e. can race
+ * with both the scheduler and the wakeup handler.
+ */
+ raw_spin_lock(spinlock);
+ spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
list_del(&vmx->pi_wakeup_list);
- raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ spin_release(&spinlock->dep_map, _RET_IP_);
+ raw_spin_unlock(spinlock);
}
dest = cpu_physical_id(cpu);
@@ -148,11 +161,23 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
- unsigned long flags;
- local_irq_save(flags);
+ lockdep_assert_irqs_disabled();
- raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ /*
+ * Acquire the wakeup lock using the "sched out" context to workaround
+ * a lockdep false positive. When this is called, schedule() holds
+ * various per-CPU scheduler locks. When the wakeup handler runs, it
+ * holds this CPU's wakeup lock while calling try_to_wake_up(), which
+ * can eventually take the aforementioned scheduler locks, which causes
+ * lockdep to assume there is deadlock.
+ *
+ * Deadlock can't actually occur because IRQs are disabled for the
+ * entirety of the sched_out critical section, i.e. the wakeup handler
+ * can't run while the scheduler locks are held.
+ */
+ raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
+ PI_LOCK_SCHED_OUT);
list_add_tail(&vmx->pi_wakeup_list,
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
@@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
*/
if (pi_test_on(&new))
__apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
-
- local_irq_restore(flags);
}
static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c841817a914a..3712dde0bf9d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11786,6 +11786,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
if (kvm_mpx_supported())
kvm_load_guest_fpu(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+
r = kvm_apic_accept_events(vcpu);
if (r < 0)
goto out;
@@ -11799,6 +11801,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
mp_state->mp_state = vcpu->arch.mp_state;
out:
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
if (kvm_mpx_supported())
kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);