diff options
authorMarcelo Tosatti <>2021-05-26 14:20:14 -0300
committerPaolo Bonzini <>2021-05-27 07:58:23 -0400
commita2486020a82eefad686993695eb42d1b64f3f2fd (patch)
parent084071d5e9226add45a6031928bf10e6afc855fd (diff)
KVM: VMX: update vcpu posted-interrupt descriptor when assigning device
For VMX, when a vcpu enters HLT emulation, pi_post_block will: 1) Add vcpu to per-cpu list of blocked vcpus. 2) Program the posted-interrupt descriptor "notification vector" to POSTED_INTR_WAKEUP_VECTOR With interrupt remapping, an interrupt will set the PIR bit for the vector programmed for the device on the CPU, test-and-set the ON bit on the posted interrupt descriptor, and if the ON bit is clear generate an interrupt for the notification vector. This way, the target CPU wakes upon a device interrupt and wakes up the target vcpu. Problem is that pi_post_block only programs the notification vector if kvm_arch_has_assigned_device() is true. Its possible for the following to happen: 1) vcpu V HLTs on pcpu P, kvm_arch_has_assigned_device is false, notification vector is not programmed 2) device is assigned to VM 3) device interrupts vcpu V, sets ON bit (notification vector not programmed, so pcpu P remains in idle) 4) vcpu 0 IPIs vcpu V (in guest), but since pi descriptor ON bit is set, kvm_vcpu_kick is skipped 5) vcpu 0 busy spins on vcpu V's response for several seconds, until RCU watchdog NMIs all vCPUs. To fix this, use the start_assignment kvm_x86_ops callback to kick vcpus out of the halt loop, so the notification vector is properly reprogrammed to the wakeup vector. Reported-by: Pei Zhang <> Signed-off-by: Marcelo Tosatti <> Message-Id: <20210526172014.GA29007@fuller.cnet> Signed-off-by: Paolo Bonzini <>
4 files changed, 17 insertions, 0 deletions
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 459748680daf..5f81ef092bd4 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -238,6 +238,20 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
+ * Bail out of the block loop if the VM has an assigned
+ * device, but the blocking vCPU didn't reconfigure the
+ * PI.NV to the wakeup vector, i.e. the assigned device
+ * came along after the initial check in pi_pre_block().
+ */
+void vmx_pi_start_assignment(struct kvm *kvm)
+ if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
* pi_update_irte - set IRTE for Posted-Interrupts
* @kvm: kvm
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 0bdc41391c5b..7f7b2326caf5 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -95,5 +95,6 @@ void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
bool set);
+void vmx_pi_start_assignment(struct kvm *kvm);
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4bceb5ca3a89..639ec3eba9b8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7721,6 +7721,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.nested_ops = &vmx_nested_ops,
.update_pi_irte = pi_update_irte,
+ .start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
.set_hv_timer = vmx_set_hv_timer,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 37a2d500a148..6a6bc7af0e28 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -307,6 +307,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
return kvm_make_all_cpus_request_except(kvm, req, NULL);
void kvm_flush_remote_tlbs(struct kvm *kvm)