-rw-r--r--   arch/x86/include/asm/posted_intr.h   64
-rw-r--r--   arch/x86/kernel/irq.c                 50
-rw-r--r--   arch/x86/kvm/lapic.c                  16
3 files changed, 69 insertions, 61 deletions
diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
index d86aa4badcee..a5f761fbf45b 100644
--- a/arch/x86/include/asm/posted_intr.h
+++ b/arch/x86/include/asm/posted_intr.h
@@ -1,8 +1,13 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _X86_POSTED_INTR_H
 #define _X86_POSTED_INTR_H
+
+#include <asm/cmpxchg.h>
+#include <asm/rwonce.h>
 #include <asm/irq_vectors.h>
 
+#include <linux/bitmap.h>
+
 #define POSTED_INTR_ON		0
 #define POSTED_INTR_SN		1
 
@@ -26,6 +31,65 @@ struct pi_desc {
 	u32 rsvd[6];
 } __aligned(64);
 
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ *   accessed by both CPU and IOMMU.
+ * 2.During software processing of posted interrupts, the CPU needs to do
+ *   natural width read and xchg for checking and clearing posted interrupt
+ *   request (PIR), a 256 bit field within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ *   line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
+ *   cache line. The cache line states after each operation are as follows,
+ *   assuming a 64-bit kernel:
+ * CPU		IOMMU		PID Cache line state
+ * ---------------------------------------------------------------
+ * read64					exclusive
+ * lock xchg64					modified
+ *		post/atomic swap		invalid
+ * ---------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * when processing posted interrupts in software, e.g. to dispatch interrupt
+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
+ * in KVM.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ *     read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ *     read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool pi_harvest_pir(unsigned long *pir,
+					   unsigned long *pir_vals)
+{
+	unsigned long pending = 0;
+	int i;
+
+	for (i = 0; i < NR_PIR_WORDS; i++) {
+		pir_vals[i] = READ_ONCE(pir[i]);
+		pending |= pir_vals[i];
+	}
+
+	if (!pending)
+		return false;
+
+	for (i = 0; i < NR_PIR_WORDS; i++) {
+		if (!pir_vals[i])
+			continue;
+
+		pir_vals[i] = arch_xchg(&pir[i], 0);
+	}
+
+	return true;
+}
+
 static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
 {
 	return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c8cf7c61fc57..9ed29ff10e59 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -380,58 +380,14 @@ void intel_posted_msi_init(void)
 	this_cpu_write(posted_msi_pi_desc.ndst, destination);
 }
 
-/*
- * De-multiplexing posted interrupts is on the performance path, the code
- * below is written to optimize the cache performance based on the following
- * considerations:
- * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
- *   accessed by both CPU and IOMMU.
- * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
- *   for checking and clearing posted interrupt request (PIR), a 256 bit field
- *   within the PID.
- * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
- *   line when posting interrupts and setting control bits.
- * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
- * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
- *   cache line. The cache line states after each operation are as follows:
- * CPU		IOMMU		PID Cache line state
- * ---------------------------------------------------------------
- * read64					exclusive
- * lock xchg64					modified
- *		post/atomic swap		invalid
- * ---------------------------------------------------------------
- *
- * To reduce L1 data cache miss, it is important to avoid contention with
- * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
- * to dispatch interrupt handlers.
- *
- * In addition, the code is trying to keep the cache line state consistent
- * as much as possible. e.g. when making a copy and clearing the PIR
- * (assuming non-zero PIR bits are present in the entire PIR), it does:
- *     read, read, read, read, xchg, xchg, xchg, xchg
- * instead of:
- *     read, xchg, read, xchg, read, xchg, read, xchg
- */
 static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
 {
-	unsigned long pir_copy[NR_PIR_WORDS], pending = 0;
-	int i, vec = FIRST_EXTERNAL_VECTOR;
-
-	for (i = 0; i < NR_PIR_WORDS; i++) {
-		pir_copy[i] = READ_ONCE(pir[i]);
-		pending |= pir_copy[i];
-	}
+	unsigned long pir_copy[NR_PIR_WORDS];
+	int vec = FIRST_EXTERNAL_VECTOR;
 
-	if (!pending)
+	if (!pi_harvest_pir(pir, pir_copy))
 		return false;
 
-	for (i = 0; i < NR_PIR_WORDS; i++) {
-		if (!pir_copy[i])
-			continue;
-
-		pir_copy[i] = arch_xchg(&pir[i], 0);
-	}
-
 	for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
 		call_irq_handler(vec, regs);
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 70ad5f0942e2..2c325b45b415 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -657,7 +657,7 @@ static u8 count_vectors(void *bitmap)
 
 bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
 {
-	unsigned long pir_vals[NR_PIR_WORDS], pending = 0;
+	unsigned long pir_vals[NR_PIR_WORDS];
 	u32 *__pir = (void *)pir_vals;
 	u32 i, vec;
 	u32 irr_val, prev_irr_val;
@@ -666,21 +666,9 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
 	max_updated_irr = -1;
 	*max_irr = -1;
 
-	for (i = 0; i < NR_PIR_WORDS; i++) {
-		pir_vals[i] = READ_ONCE(pir[i]);
-		pending |= pir_vals[i];
-	}
-
-	if (!pending)
+	if (!pi_harvest_pir(pir, pir_vals))
 		return false;
 
-	for (i = 0; i < NR_PIR_WORDS; i++) {
-		if (!pir_vals[i])
-			continue;
-
-		pir_vals[i] = arch_xchg(&pir[i], 0);
-	}
-
 	for (i = vec = 0; i <= 7; i++, vec += 32) {
 		u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
 
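The cache-line comment moved into posted_intr.h above is the rationale for pi_harvest_pir()'s two-pass structure: read every PIR word first, and only then arch_xchg() the words that actually had bits set. For readers outside the kernel tree, a minimal user-space sketch of the same harvesting pattern follows; it uses C11 atomics in place of READ_ONCE()/arch_xchg(), and every name in it (harvest_pir, PIR_WORDS, and so on) is invented for illustration rather than taken from the kernel.

/*
 * Stand-alone sketch of the pi_harvest_pir() pattern. C11 atomics stand in
 * for the kernel's READ_ONCE()/arch_xchg(); names are illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PIR_BITS   256
#define PIR_WORDS  (PIR_BITS / 64)

static bool harvest_pir(_Atomic unsigned long long pir[PIR_WORDS],
                        unsigned long long out[PIR_WORDS])
{
        unsigned long long pending = 0;
        int i;

        /* First pass: plain loads of every word; a clean PIR issues no stores. */
        for (i = 0; i < PIR_WORDS; i++) {
                out[i] = atomic_load_explicit(&pir[i], memory_order_relaxed);
                pending |= out[i];
        }

        if (!pending)
                return false;

        /* Second pass: atomically clear only the words that had bits pending. */
        for (i = 0; i < PIR_WORDS; i++) {
                if (!out[i])
                        continue;
                out[i] = atomic_exchange(&pir[i], 0);
        }

        return true;
}

int main(void)
{
        _Atomic unsigned long long pir[PIR_WORDS] = { 0 };
        unsigned long long copy[PIR_WORDS];

        atomic_store(&pir[1], 1ULL << 5);       /* pretend bit 69 was posted */

        if (harvest_pir(pir, copy))
                printf("word 1 of harvested copy: %#llx\n", copy[1]);

        return 0;
}

The point of the store-free first pass is the same one the comment makes: when nothing is pending, the CPU never dirties the descriptor cache line that the IOMMU keeps writing to, and when something is pending, the reads and exchanges are batched rather than interleaved.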
