diff options
author | Sean Christopherson <seanjc@google.com> | 2025-02-12 12:30:12 -0800 |
---|---|---|
committer | Sean Christopherson <seanjc@google.com> | 2025-02-14 07:17:17 -0800 |
commit | b146a9b34aed3cfa6edc058830c906a2b718fba5 (patch) | |
tree | a58ef2632d8583bec7bfe38af83e5cc69a3e0be1 | |
parent | 928c54b1c4caf981afbf060a1417d4255f758513 (diff) |
KVM: x86/mmu: Age TDP MMU SPTEs without holding mmu_lock
Walk the TDP MMU in an RCU read-side critical section without holding
mmu_lock when harvesting and potentially updating age information on
TDP MMU SPTEs. Add a new macro to do RCU-safe walking of TDP MMU roots,
and do all SPTE aging with atomic updates; while clobbering Accessed
information is ok, KVM must not corrupt other bits, e.g. must not drop
a Dirty or Writable bit when making a SPTE young..
If updating a SPTE to mark it for access tracking fails, leave it as is
and treat it as if it were young. If the spte is being actively modified,
it is most likely young.
Acquire and release mmu_lock for write when harvesting age information
from the shadow MMU, as the shadow MMU doesn't yet support aging outside
of mmu_lock.
Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20250204004038.1680123-5-jthoughton@google.com
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 36 |
4 files changed, 35 insertions, 13 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b15cde0a9b5c..5a07bdcc3e4f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1478,6 +1478,7 @@ struct kvm_arch { * tdp_mmu_page set. * * For reads, this list is protected by: + * RCU alone or * the MMU lock in read mode + RCU or * the MMU lock in write mode * diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..fe8ea8c097de 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM_X86 select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER select KVM_ELIDE_TLB_FLUSH_IF_YOUNG + select KVM_MMU_LOCKLESS_AGING select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5b9ef3535f50..b73b3c12f76f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1592,8 +1592,11 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); young = kvm_rmap_age_gfn_range(kvm, range, false); + write_unlock(&kvm->mmu_lock); + } if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1605,8 +1608,11 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); young = kvm_rmap_age_gfn_range(kvm, range, true); + write_unlock(&kvm->mmu_lock); + } if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 046b6ba31197..c9778c3e6ecd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, !tdp_mmu_root_match((_root), (_types)))) { \ } else +/* + * Iterate over all TDP MMU roots in an RCU read-side critical section. + * It is safe to iterate over the SPTEs under the root, but their values will + * be unstable, so all writes must be atomic. As this routine is meant to be + * used without holding the mmu_lock at all, any bits that are flipped must + * be reflected in kvm_tdp_mmu_spte_need_atomic_write(). + */ +#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + !tdp_mmu_root_match((_root), (_types))) { \ + } else + #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS) @@ -1332,21 +1345,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) +static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter) { u64 new_spte; if (spte_ad_enabled(iter->old_spte)) { - iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, - iter->old_spte, - shadow_accessed_mask, - iter->level); + iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep, + shadow_accessed_mask); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { new_spte = mark_spte_for_access_track(iter->old_spte); - iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, - iter->old_spte, new_spte, - iter->level); + /* + * It is safe for the following cmpxchg to fail. Leave the + * Accessed bit set, as the spte is most likely young anyway. + */ + if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) + return; } trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, @@ -1371,9 +1385,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, * valid roots! */ WARN_ON(types & ~KVM_VALID_ROOTS); - __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) { - guard(rcu)(); + guard(rcu)(); + for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) { tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) { if (!is_accessed_spte(iter.old_spte)) continue; @@ -1382,7 +1396,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, return true; ret = true; - kvm_tdp_mmu_age_spte(&iter); + kvm_tdp_mmu_age_spte(kvm, &iter); } } |