summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h31
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/kvm/mmu.h31
-rw-r--r--arch/x86/kvm/mmu/mmu.c60
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h76
-rw-r--r--arch/x86/kvm/mmu/spte.h5
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c10
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h21
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c325
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h51
-rw-r--r--arch/x86/kvm/x86.c3
-rw-r--r--include/linux/kvm_host.h13
-rw-r--r--virt/kvm/guest_memfd.c36
-rw-r--r--virt/kvm/kvm_main.c14
15 files changed, 554 insertions, 127 deletions
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 7b4536ff3834..c35550581da0 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -93,6 +93,10 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP_OPTIONAL(link_external_spt)
+KVM_X86_OP_OPTIONAL(set_external_spte)
+KVM_X86_OP_OPTIONAL(free_external_spt)
+KVM_X86_OP_OPTIONAL(remove_external_spte)
KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9f8c319fe67e..2f442701dc75 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -313,10 +313,11 @@ struct kvm_kernel_irq_routing_entry;
* the number of unique SPs that can theoretically be created is 2^n, where n
* is the number of bits that are used to compute the role.
*
- * But, even though there are 19 bits in the mask below, not all combinations
+ * But, even though there are 20 bits in the mask below, not all combinations
* of modes and flags are possible:
*
- * - invalid shadow pages are not accounted, so the bits are effectively 18
+ * - invalid shadow pages are not accounted, mirror pages are not shadowed,
+ * so the bits are effectively 18.
*
* - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
* execonly and ad_disabled are only used for nested EPT which has
@@ -349,7 +350,8 @@ union kvm_mmu_page_role {
unsigned ad_disabled:1;
unsigned guest_mode:1;
unsigned passthrough:1;
- unsigned :5;
+ unsigned is_mirror:1;
+ unsigned :4;
/*
* This is left at the top of the word so that
@@ -457,6 +459,7 @@ struct kvm_mmu {
int (*sync_spte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, int i);
struct kvm_mmu_root_info root;
+ hpa_t mirror_root_hpa;
union kvm_cpu_role cpu_role;
union kvm_mmu_page_role root_role;
@@ -830,6 +833,11 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_shadow_page_cache;
struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
+ /*
+ * This cache is to allocate external page table. E.g. private EPT used
+ * by the TDX module.
+ */
+ struct kvm_mmu_memory_cache mmu_external_spt_cache;
/*
* QEMU userspace and the guest each have their own FPU state.
@@ -1549,6 +1557,8 @@ struct kvm_arch {
*/
#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
struct kvm_mmu_memory_cache split_desc_cache;
+
+ gfn_t gfn_direct_bits;
};
struct kvm_vm_stat {
@@ -1761,6 +1771,21 @@ struct kvm_x86_ops {
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level);
+ /* Update external mapping with page table link. */
+ int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ void *external_spt);
+ /* Update the external page table from spte getting set. */
+ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ kvm_pfn_t pfn_for_gfn);
+
+ /* Update external page tables for page table about to be freed. */
+ int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ void *external_spt);
+
+ /* Update external page table from spte getting removed, and flush TLB. */
+ int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ kvm_pfn_t pfn_for_gfn);
+
bool (*has_wbinvd_exit)(void);
u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 88585c1de416..9e75da97bce0 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -925,5 +925,6 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_SEV_VM 2
#define KVM_X86_SEV_ES_VM 3
#define KVM_X86_SNP_VM 4
+#define KVM_X86_TDX_VM 5
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index caec3d11638d..050a0e229a4d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -104,6 +104,15 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
+ /*
+ * Checking root.hpa is sufficient even when KVM has mirror root.
+ * We can have either:
+ * (1) mirror_root_hpa = INVALID_PAGE, root.hpa = INVALID_PAGE
+ * (2) mirror_root_hpa = root, root.hpa = INVALID_PAGE
+ * (3) mirror_root_hpa = root1, root.hpa = root2
+ * We don't ever have:
+ * mirror_root_hpa = INVALID_PAGE, root.hpa = root
+ */
if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
return 0;
@@ -287,4 +296,26 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
return gpa;
return translate_nested_gpa(vcpu, gpa, access, exception);
}
+
+static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_TDX_VM;
+}
+
+static inline gfn_t kvm_gfn_direct_bits(const struct kvm *kvm)
+{
+ return kvm->arch.gfn_direct_bits;
+}
+
+static inline bool kvm_is_addr_direct(struct kvm *kvm, gpa_t gpa)
+{
+ gpa_t gpa_direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(kvm));
+
+ return !gpa_direct_bits || (gpa & gpa_direct_bits);
+}
+
+static inline bool kvm_is_gfn_alias(struct kvm *kvm, gfn_t gfn)
+{
+ return gfn & kvm_gfn_direct_bits(kvm);
+}
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 74fa38ebddbf..26b4ba7e7cb5 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -599,6 +599,12 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
if (r)
return r;
+ if (kvm_has_mirrored_tdp(vcpu->kvm)) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
PT64_ROOT_MAX_LEVEL);
if (r)
@@ -618,6 +624,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache);
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
@@ -3656,8 +3663,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
unsigned i;
int r;
- if (tdp_mmu_enabled)
- return kvm_tdp_mmu_alloc_root(vcpu);
+ if (tdp_mmu_enabled) {
+ if (kvm_has_mirrored_tdp(vcpu->kvm) &&
+ !VALID_PAGE(mmu->mirror_root_hpa))
+ kvm_tdp_mmu_alloc_root(vcpu, true);
+ kvm_tdp_mmu_alloc_root(vcpu, false);
+ return 0;
+ }
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
@@ -4379,8 +4391,12 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, unsigned int access)
{
struct kvm_memory_slot *slot = fault->slot;
+ struct kvm *kvm = vcpu->kvm;
int ret;
+ if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm))
+ return -EFAULT;
+
/*
* Note that the mmu_invalidate_seq also serves to detect a concurrent
* change in attributes. is_page_fault_stale() will detect an
@@ -4394,7 +4410,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* Now that we have a snapshot of mmu_invalidate_seq we can check for a
* private vs. shared mismatch.
*/
- if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}
@@ -4456,7 +4472,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
* to detect retry guarantees the worst case latency for the vCPU.
*/
- if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
ret = __kvm_mmu_faultin_pfn(vcpu, fault);
@@ -4476,7 +4492,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
* overall cost of failing to detect the invalidation until after
* mmu_lock is acquired.
*/
- if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) {
kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY);
return RET_PF_RETRY;
}
@@ -6095,8 +6111,16 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
else if (r == RET_PF_SPURIOUS)
vcpu->stat.pf_spurious++;
+ /*
+ * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or
+ * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE.
+ * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to
+ * indicate continuing the page fault handling until to the final
+ * page table mapping phase.
+ */
+ WARN_ON_ONCE(r == RET_PF_CONTINUE);
if (r != RET_PF_EMULATE)
- return 1;
+ return r;
emulate:
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
@@ -6272,6 +6296,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
mmu->root.hpa = INVALID_PAGE;
mmu->root.pgd = 0;
+ mmu->mirror_root_hpa = INVALID_PAGE;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -6441,8 +6466,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* write and in the same critical section as making the reload request,
* e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
- if (tdp_mmu_enabled)
- kvm_tdp_mmu_invalidate_all_roots(kvm);
+ if (tdp_mmu_enabled) {
+ /*
+ * External page tables don't support fast zapping, therefore
+ * their mirrors must be invalidated separately by the caller.
+ */
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS);
+ }
/*
* Notify all vcpus to reload its shadow page table and flush TLB.
@@ -6467,7 +6497,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* lead to use-after-free.
*/
if (tdp_mmu_enabled)
- kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
void kvm_mmu_init_vm(struct kvm *kvm)
@@ -7220,6 +7250,12 @@ out:
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
+ if (tdp_mmu_enabled) {
+ read_lock(&vcpu->kvm->mmu_lock);
+ mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa,
+ NULL);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ }
free_mmu_pages(&vcpu->arch.root_mmu);
free_mmu_pages(&vcpu->arch.guest_mmu);
mmu_free_memory_caches(vcpu);
@@ -7452,6 +7488,12 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;
+ /* Unmap the old attribute page. */
+ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ range->attr_filter = KVM_FILTER_SHARED;
+ else
+ range->attr_filter = KVM_FILTER_PRIVATE;
+
return kvm_unmap_gfn_range(kvm, range);
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 72de71f4bc8b..75f00598289d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -6,6 +6,8 @@
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>
+#include "mmu.h"
+
#ifdef CONFIG_KVM_PROVE_MMU
#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
#else
@@ -101,7 +103,22 @@ struct kvm_mmu_page {
int root_count;
refcount_t tdp_mmu_root_count;
};
- unsigned int unsync_children;
+ union {
+ /* These two members aren't used for TDP MMU */
+ struct {
+ unsigned int unsync_children;
+ /*
+ * Number of writes since the last time traversal
+ * visited this page.
+ */
+ atomic_t write_flooding_count;
+ };
+ /*
+ * Page table page of external PT.
+ * Passed to TDX module, not accessed by KVM.
+ */
+ void *external_spt;
+ };
union {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
tdp_ptep_t ptep;
@@ -124,9 +141,6 @@ struct kvm_mmu_page {
int clear_spte_count;
#endif
- /* Number of writes since the last time traversal visited this page. */
- atomic_t write_flooding_count;
-
#ifdef CONFIG_X86_64
/* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
@@ -145,6 +159,34 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
return kvm_mmu_role_as_id(sp->role);
}
+static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
+{
+ return sp->role.is_mirror;
+}
+
+static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ /*
+ * external_spt is allocated for TDX module to hold private EPT mappings,
+ * TDX module will initialize the page by itself.
+ * Therefore, KVM does not need to initialize or access external_spt.
+ * KVM only interacts with sp->spt for private EPT operations.
+ */
+ sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
+}
+
+static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
+{
+ /*
+ * Since mirror SPs are used only for TDX, which maps private memory
+ * at its "natural" GFN, no mask needs to be applied to them - and, dually,
+ * we expect that the bits is only used for the shared PT.
+ */
+ if (is_mirror_sp(root))
+ return 0;
+ return kvm_gfn_direct_bits(kvm);
+}
+
static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
{
/*
@@ -229,7 +271,12 @@ struct kvm_page_fault {
*/
u8 goal_level;
- /* Shifted addr, or result of guest page table walk if addr is a gva. */
+ /*
+ * Shifted addr, or result of guest page table walk if addr is a gva. In
+ * the case of VM where memslot's can be mapped at multiple GPA aliases
+ * (i.e. TDX), the gfn field does not contain the bit that selects between
+ * the aliases (i.e. the shared bit for TDX).
+ */
gfn_t gfn;
/* The memslot containing gfn. May be NULL. */
@@ -268,9 +315,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
* tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
*
* Note, all values must be greater than or equal to zero so as not to encroach
- * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which
- * will allow for efficient machine code when checking for CONTINUE, e.g.
- * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
+ * on -errno return values.
*/
enum {
RET_PF_CONTINUE = 0,
@@ -282,6 +327,14 @@ enum {
RET_PF_SPURIOUS,
};
+/*
+ * Define RET_PF_CONTINUE as 0 to allow for
+ * - efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero,
+ * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value.
+ */
+static_assert(RET_PF_CONTINUE == 0);
+
static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
@@ -317,7 +370,12 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int r;
if (vcpu->arch.mmu->root_role.direct) {
- fault.gfn = fault.addr >> PAGE_SHIFT;
+ /*
+ * Things like memslots don't understand the concept of a shared
+ * bit. Strip it so that the GFN can be used like normal, and the
+ * fault.addr can be used when the shared bit is needed.
+ */
+ fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index af10bc0380a3..59746854c0af 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -276,6 +276,11 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
return spte_to_child_sp(root);
}
+static inline bool is_mirror_sptep(tdp_ptep_t sptep)
+{
+ return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
+}
+
static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 04c247bfe318..9e17bfa80901 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
- SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
+ SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level);
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
@@ -37,15 +37,17 @@ void tdp_iter_restart(struct tdp_iter *iter)
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
- int min_level, gfn_t next_last_level_gfn)
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits)
{
if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
- (root->role.level > PT64_ROOT_MAX_LEVEL))) {
+ (root->role.level > PT64_ROOT_MAX_LEVEL) ||
+ (gfn_bits && next_last_level_gfn >= gfn_bits))) {
iter->valid = false;
return;
}
iter->next_last_level_gfn = next_last_level_gfn;
+ iter->gfn_bits = gfn_bits;
iter->root_level = root->role.level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
@@ -113,7 +115,7 @@ static bool try_step_side(struct tdp_iter *iter)
* Check if the iterator is already at the end of the current page
* table.
*/
- if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
+ if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) ==
(SPTE_ENT_PER_PAGE - 1))
return false;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 2880fd392e0c..047b78333653 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -93,8 +93,10 @@ struct tdp_iter {
tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
tdp_ptep_t sptep;
- /* The lowest GFN mapped by the current SPTE */
+ /* The lowest GFN (mask bits excluded) mapped by the current SPTE */
gfn_t gfn;
+ /* Mask applied to convert the GFN to the mapping GPA */
+ gfn_t gfn_bits;
/* The level of the root page given to the iterator */
int root_level;
/* The lowest level the iterator should traverse to */
@@ -122,18 +124,23 @@ struct tdp_iter {
* Iterates over every SPTE mapping the GFN range [start, end) in a
* preorder traversal.
*/
-#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
- for (tdp_iter_start(&iter, root, min_level, start); \
- iter.valid && iter.gfn < end; \
+#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \
+ iter.valid && iter.gfn < end; \
tdp_iter_next(&iter))
-#define for_each_tdp_pte(iter, root, start, end) \
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
+#define for_each_tdp_pte_min_level_all(iter, root, min_level) \
+ for (tdp_iter_start(&iter, root, min_level, 0, 0); \
+ iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte(iter, kvm, root, start, end) \
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end)
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
- int min_level, gfn_t next_last_level_gfn);
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 2f15e0e33903..046b6ba31197 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -37,8 +37,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
* for zapping and thus puts the TDP MMU's reference to each root, i.e.
* ultimately frees all roots.
*/
- kvm_tdp_mmu_invalidate_all_roots(kvm);
- kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
@@ -53,6 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
+ free_page((unsigned long)sp->external_spt);
free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -91,19 +92,33 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
+static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
+ enum kvm_tdp_mmu_root_types types)
+{
+ if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
+ return false;
+
+ if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
+ return false;
+
+ if (likely(!is_mirror_sp(root)))
+ return types & KVM_DIRECT_ROOTS;
+ return types & KVM_MIRROR_ROOTS;
+}
+
/*
* Returns the next root after @prev_root (or the first root if @prev_root is
- * NULL). A reference to the returned root is acquired, and the reference to
- * @prev_root is released (the caller obviously must hold a reference to
- * @prev_root if it's non-NULL).
+ * NULL) that matches with @types. A reference to the returned root is
+ * acquired, and the reference to @prev_root is released (the caller obviously
+ * must hold a reference to @prev_root if it's non-NULL).
*
- * If @only_valid is true, invalid roots are skipped.
+ * Roots that doesn't match with @types are skipped.
*
* Returns NULL if the end of tdp_mmu_roots was reached.
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
- bool only_valid)
+ enum kvm_tdp_mmu_root_types types)
{
struct kvm_mmu_page *next_root;
@@ -124,7 +139,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
typeof(*next_root), link);
while (next_root) {
- if ((!only_valid || !next_root->role.invalid) &&
+ if (tdp_mmu_root_match(next_root, types) &&
kvm_tdp_mmu_get_root(next_root))
break;
@@ -149,20 +164,20 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* If shared is set, this function is operating under the MMU lock in read
* mode.
*/
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
+ _root = tdp_mmu_next_root(_kvm, _root, _types)) \
if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, false))
+ _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
/*
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
@@ -171,18 +186,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* Holding mmu_lock for write obviates the need for RCU protection as the list
* is guaranteed to be stable.
*/
-#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
- ((_only_valid) && (_root)->role.invalid))) { \
+ !tdp_mmu_root_match((_root), (_types)))) { \
} else
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
-
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
@@ -223,7 +235,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
-int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
union kvm_mmu_page_role role = mmu->root_role;
@@ -231,6 +243,9 @@ int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
+ if (mirror)
+ role.is_mirror = true;
+
/*
* Check for an existing root before acquiring the pages lock to avoid
* unnecessary serialization if multiple vCPUs are loading a new root.
@@ -282,9 +297,12 @@ out_read_unlock:
* and actually consuming the root if it's invalidated after dropping
* mmu_lock, and the root can't be freed as this vCPU holds a reference.
*/
- mmu->root.hpa = __pa(root->spt);
- mmu->root.pgd = 0;
- return 0;
+ if (mirror) {
+ mmu->mirror_root_hpa = __pa(root->spt);
+ } else {
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ }
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -322,6 +340,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
+static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ int level)
+{
+ kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
+ int ret;
+
+ /*
+ * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
+ * PTs are removed in a special order, involving free_external_spt().
+ * But remove_external_spte() will be called on non-leaf PTEs via
+ * __tdp_mmu_zap_root(), so avoid the error the former would return
+ * in this case.
+ */
+ if (!is_last_spte(old_spte, level))
+ return;
+
+ /* Zapping leaf spte is allowed only when write lock is held. */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ /* Because write lock is held, operation should success. */
+ ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
+ KVM_BUG_ON(ret, kvm);
+}
+
/**
* handle_removed_pt() - handle a page table removed from the TDP structure
*
@@ -417,11 +458,81 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
old_spte, FROZEN_SPTE, level, shared);
+
+ if (is_mirror_sp(sp)) {
+ KVM_BUG_ON(shared, kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+ }
+
+ if (is_mirror_sp(sp) &&
+ WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
+ /*
+ * Failed to free page table page in mirror page table and
+ * there is nothing to do further.
+ * Intentionally leak the page to prevent the kernel from
+ * accessing the encrypted page.
+ */
+ sp->external_spt = NULL;
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
+static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
+{
+ if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ WARN_ON_ONCE(sp->role.level + 1 != level);
+ WARN_ON_ONCE(sp->gfn != gfn);
+ return sp->external_spt;
+ }
+
+ return NULL;
+}
+
+static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
+ gfn_t gfn, u64 old_spte,
+ u64 new_spte, int level)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
+ int ret = 0;
+
+ KVM_BUG_ON(was_present, kvm);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ /*
+ * We need to lock out other updates to the SPTE until the external
+ * page table has been modified. Use FROZEN_SPTE similar to
+ * the zapping case.
+ */
+ if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
+ return -EBUSY;
+
+ /*
+ * Use different call to either set up middle level
+ * external page table, or leaf.
+ */
+ if (is_leaf) {
+ ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
+ } else {
+ void *external_spt = get_external_spt(gfn, new_spte, level);
+
+ KVM_BUG_ON(!external_spt, kvm);
+ ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
+ }
+ if (ret)
+ __kvm_tdp_mmu_write_spte(sptep, old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return ret;
+}
+
/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
@@ -522,11 +633,10 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
-static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
u64 new_spte)
{
- u64 *sptep = rcu_dereference(iter->sptep);
-
/*
* The caller is responsible for ensuring the old SPTE is not a FROZEN
* SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
@@ -535,15 +645,34 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
*/
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
- /*
- * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
- * does not hold the mmu_lock. On failure, i.e. if a different logical
- * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
- * the current value, so the caller operates on fresh data, e.g. if it
- * retries tdp_mmu_set_spte_atomic()
- */
- if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
- return -EBUSY;
+ if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
+ int ret;
+
+ /*
+ * Users of atomic zapping don't operate on mirror roots,
+ * so don't handle it and bug the VM if it's seen.
+ */
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
+ return -EBUSY;
+
+ ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ if (ret)
+ return ret;
+ } else {
+ u64 *sptep = rcu_dereference(iter->sptep);
+
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
+ * and does not hold the mmu_lock. On failure, i.e. if a
+ * different logical CPU modified the SPTE, try_cmpxchg64()
+ * updates iter->old_spte with the current value, so the caller
+ * operates on fresh data, e.g. if it retries
+ * tdp_mmu_set_spte_atomic()
+ */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+ }
return 0;
}
@@ -573,7 +702,7 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- ret = __tdp_mmu_set_spte_atomic(iter, new_spte);
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
if (ret)
return ret;
@@ -613,6 +742,16 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
+
+ /*
+ * Users that do non-atomic setting of PTEs don't operate on mirror
+ * roots, so don't handle it and bug the VM if it's seen.
+ */
+ if (is_mirror_sptep(sptep)) {
+ KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+
return old_spte;
}
@@ -625,18 +764,18 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
iter->gfn, iter->level);
}
-#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
- for_each_tdp_pte(_iter, _root, _start, _end)
+#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
-#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
- tdp_root_for_each_pte(_iter, _root, _start, _end) \
+#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
+ tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
if (!is_shadow_present_pte(_iter.old_spte) || \
!is_last_spte(_iter.old_spte, _iter.level)) \
continue; \
else
-#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
+#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
@@ -705,10 +844,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
- gfn_t end = tdp_mmu_max_gfn_exclusive();
- gfn_t start = 0;
-
- for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
+ for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
@@ -812,7 +948,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false;
@@ -863,19 +999,21 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
struct kvm_mmu_page *root;
/*
- * Zap all roots, including invalid roots, as all SPTEs must be dropped
- * before returning to the caller. Zap directly even if the root is
- * also being zapped by a worker. Walking zapped top-level SPTEs isn't
- * all that expensive and mmu_lock is already held, which means the
- * worker has yielded, i.e. flushing the work instead of zapping here
- * isn't guaranteed to be any faster.
+ * Zap all direct roots, including invalid direct roots, as all direct
+ * SPTEs must be dropped before returning to the caller. For TDX, mirror
+ * roots don't need handling in response to the mmu notifier (the caller).
+ *
+ * Zap directly even if the root is also being zapped by a concurrent
+ * "fast zap". Walking zapped top-level SPTEs isn't all that expensive
+ * and mmu_lock is already held, which means the other thread has yielded.
*
* A TLB flush is unnecessary, KVM zaps everything if and only the VM
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
*/
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root)
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
+ KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
tdp_mmu_zap_root(kvm, root, false);
}
@@ -883,11 +1021,14 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
* zap" completes.
*/
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
struct kvm_mmu_page *root;
- read_lock(&kvm->mmu_lock);
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root) {
if (!root->tdp_mmu_scheduled_root_to_zap)
@@ -905,7 +1046,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* that may be zapped, as such entries are associated with the
* ASID on both VMX and SVM.
*/
- tdp_mmu_zap_root(kvm, root, true);
+ tdp_mmu_zap_root(kvm, root, shared);
/*
* The referenced needs to be put *after* zapping the root, as
@@ -915,7 +1056,10 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
kvm_tdp_mmu_put_root(kvm, root);
}
- read_unlock(&kvm->mmu_lock);
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
}
/*
@@ -928,11 +1072,19 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
* See kvm_tdp_mmu_alloc_root().
*/
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types)
{
struct kvm_mmu_page *root;
/*
+ * Invalidating invalid roots doesn't make sense, prevent developers from
+ * having to think about it.
+ */
+ if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
+ root_types &= ~KVM_INVALID_ROOTS;
+
+ /*
* mmu_lock must be held for write to ensure that a root doesn't become
* invalid while there are active readers (invalidating a root while
* there are active readers may or may not be problematic in practice,
@@ -953,6 +1105,9 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
* or get/put references to roots.
*/
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!tdp_mmu_root_match(root, root_types))
+ continue;
+
/*
* Note, invalid roots can outlive a memslot update! Invalid
* roots must be *zapped* before the memslot update completes,
@@ -1068,7 +1223,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
*/
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
@@ -1080,7 +1235,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;
if (fault->nx_huge_page_workaround_enabled)
@@ -1107,13 +1262,18 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
*/
sp = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_child_sp(sp, &iter);
+ if (is_mirror_sp(sp))
+ kvm_mmu_alloc_external_spt(vcpu, sp);
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
- if (is_shadow_present_pte(iter.old_spte))
+ if (is_shadow_present_pte(iter.old_spte)) {
+ /* Don't support large page for mirrored roots (TDX) */
+ KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
- else
+ } else {
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+ }
/*
* Force the guest to retry if installing an upper level SPTE
@@ -1148,12 +1308,16 @@ retry:
return ret;
}
+/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
+ enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
- __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
range->may_block, flush);
@@ -1193,20 +1357,24 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range,
bool test_only)
{
+ enum kvm_tdp_mmu_root_types types;
struct kvm_mmu_page *root;
struct tdp_iter iter;
bool ret = false;
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
+
/*
* Don't support rescheduling, none of the MMU notifiers that funnel
* into this helper allow blocking; it'd be dead, wasteful code. Note,
* this helper must NOT be used to unmap GFNs, as it processes only
* valid roots!
*/
- for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ WARN_ON(types & ~KVM_VALID_ROOTS);
+ __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
guard(rcu)();
- tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
+ tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
@@ -1247,7 +1415,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
- for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1366,7 +1534,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
* level above the target level (e.g. splitting a 1GB to 512 2MB pages,
* and then splitting each of those to 512 4KB pages).
*/
- for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
continue;
@@ -1464,7 +1632,7 @@ static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- tdp_root_for_each_pte(iter, root, start, end) {
+ tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1512,7 +1680,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
+ tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
break;
@@ -1566,7 +1734,7 @@ static int tdp_mmu_make_huge_spte(struct kvm *kvm,
gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
struct tdp_iter iter;
- tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
/*
* Use the parent iterator when checking for forward progress so
* that KVM doesn't get stuck continuously trying to yield (i.e.
@@ -1600,7 +1768,7 @@ static void recover_huge_pages_range(struct kvm *kvm,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
flush = false;
@@ -1681,7 +1849,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
- for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -1729,14 +1897,14 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level)
{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
struct tdp_iter iter;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
*root_level = vcpu->arch.mmu->root_role.level;
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
@@ -1758,11 +1926,12 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte)
{
+ /* Fast pf is not supported for mirrored roots */
+ struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
struct tdp_iter iter;
- struct kvm_mmu *mmu = vcpu->arch.mmu;
tdp_ptep_t sptep = NULL;
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index f03ca0dd13d9..52acf99d40a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,7 +10,7 @@
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
@@ -19,11 +19,56 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+enum kvm_tdp_mmu_root_types {
+ KVM_INVALID_ROOTS = BIT(0),
+ KVM_DIRECT_ROOTS = BIT(1),
+ KVM_MIRROR_ROOTS = BIT(2),
+ KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS,
+ KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS,
+};
+
+static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm,
+ enum kvm_gfn_range_filter process)
+{
+ enum kvm_tdp_mmu_root_types ret = 0;
+
+ if (!kvm_has_mirrored_tdp(kvm))
+ return KVM_DIRECT_ROOTS;
+
+ if (process & KVM_FILTER_PRIVATE)
+ ret |= KVM_MIRROR_ROOTS;
+ if (process & KVM_FILTER_SHARED)
+ ret |= KVM_DIRECT_ROOTS;
+
+ WARN_ON_ONCE(!ret);
+
+ return ret;
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr)))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
+ enum kvm_tdp_mmu_root_types type)
+{
+ if (unlikely(type == KVM_MIRROR_ROOTS))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
-void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
-void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f56d9ccc54e1..6e248152fa13 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13034,6 +13034,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
+ if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1))
+ return -EINVAL;
+
return kvm_alloc_memslot_metadata(kvm, new);
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9df590e8f3da..3cb9a32a6330 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -255,11 +255,17 @@ union kvm_mmu_notifier_arg {
unsigned long attributes;
};
+enum kvm_gfn_range_filter {
+ KVM_FILTER_SHARED = BIT(0),
+ KVM_FILTER_PRIVATE = BIT(1),
+};
+
struct kvm_gfn_range {
struct kvm_memory_slot *slot;
gfn_t start;
gfn_t end;
union kvm_mmu_notifier_arg arg;
+ enum kvm_gfn_range_filter attr_filter;
bool may_block;
};
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
@@ -596,7 +602,12 @@ struct kvm_memory_slot {
#ifdef CONFIG_KVM_PRIVATE_MEM
struct {
- struct file __rcu *file;
+ /*
+ * Writes protected by kvm->slots_lock. Acquiring a
+ * reference via kvm_gmem_get_file() is protected by
+ * either kvm->slots_lock or kvm->srcu.
+ */
+ struct file *file;
pgoff_t pgoff;
} gmem;
#endif
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 47a9f68f7b24..b2aa6bf24d3a 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -118,6 +118,8 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
.slot = slot,
.may_block = true,
+ /* guest memfd is relevant to only private mappings. */
+ .attr_filter = KVM_FILTER_PRIVATE,
};
if (!found_memslot) {
@@ -259,15 +261,19 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
* dereferencing the slot for existing bindings needs to be protected
* against memslot updates, specifically so that unbind doesn't race
* and free the memslot (kvm_gmem_get_file() will return NULL).
+ *
+ * Since .release is called only when the reference count is zero,
+ * after which file_ref_get() and get_file_active() fail,
+ * kvm_gmem_get_pfn() cannot be using the file concurrently.
+ * file_ref_put() provides a full barrier, and get_file_active() the
+ * matching acquire barrier.
*/
mutex_lock(&kvm->slots_lock);
filemap_invalidate_lock(inode->i_mapping);
xa_for_each(&gmem->bindings, index, slot)
- rcu_assign_pointer(slot->gmem.file, NULL);
-
- synchronize_rcu();
+ WRITE_ONCE(slot->gmem.file, NULL);
/*
* All in-flight operations are gone and new bindings can be created.
@@ -296,8 +302,7 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
/*
* Do not return slot->gmem.file if it has already been closed;
* there might be some time between the last fput() and when
- * kvm_gmem_release() clears slot->gmem.file, and you do not
- * want to spin in the meanwhile.
+ * kvm_gmem_release() clears slot->gmem.file.
*/
return get_file_active(&slot->gmem.file);
}
@@ -508,11 +513,11 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
}
/*
- * No synchronize_rcu() needed, any in-flight readers are guaranteed to
- * be see either a NULL file or this new file, no need for them to go
- * away.
+ * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
+ * kvm_gmem_bind() must occur on a new memslot. Because the memslot
+ * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
*/
- rcu_assign_pointer(slot->gmem.file, file);
+ WRITE_ONCE(slot->gmem.file, file);
slot->gmem.pgoff = start;
xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
@@ -548,8 +553,12 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
filemap_invalidate_lock(file->f_mapping);
xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
- rcu_assign_pointer(slot->gmem.file, NULL);
- synchronize_rcu();
+
+ /*
+ * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
+ * cannot see this memslot.
+ */
+ WRITE_ONCE(slot->gmem.file, NULL);
filemap_invalidate_unlock(file->f_mapping);
fput(file);
@@ -561,11 +570,12 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
pgoff_t index, kvm_pfn_t *pfn,
bool *is_prepared, int *max_order)
{
+ struct file *gmem_file = READ_ONCE(slot->gmem.file);
struct kvm_gmem *gmem = file->private_data;
struct folio *folio;
- if (file != slot->gmem.file) {
- WARN_ON_ONCE(slot->gmem.file);
+ if (file != gmem_file) {
+ WARN_ON_ONCE(gmem_file);
return ERR_PTR(-EFAULT);
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e0b9d6dd6a85..faf10671eed2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -594,6 +594,11 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
*/
gfn_range.arg = range->arg;
gfn_range.may_block = range->may_block;
+ /*
+ * HVA-based notifications aren't relevant to private
+ * mappings as they don't have a userspace mapping.
+ */
+ gfn_range.attr_filter = KVM_FILTER_SHARED;
/*
* {gfn(page) | page intersects with [hva_start, hva_end)} =
@@ -2403,6 +2408,14 @@ static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
gfn_range.arg = range->arg;
gfn_range.may_block = range->may_block;
+ /*
+ * If/when KVM supports more attributes beyond private .vs shared, this
+ * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target
+ * range already has the desired private vs. shared state (it's unclear
+ * if that is a net win). For now, KVM reaches this point if and only
+ * if the private flag is being toggled, i.e. all mappings are in play.
+ */
+
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
@@ -2459,6 +2472,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
struct kvm_mmu_notifier_range pre_set_range = {
.start = start,
.end = end,
+ .arg.attributes = attributes,
.handler = kvm_pre_set_memory_attributes,
.on_lock = kvm_mmu_invalidate_begin,
.flush_on_ret = true,