diff options
Diffstat (limited to 'arch/arm64/kvm/hyp')
35 files changed, 2400 insertions, 1714 deletions
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 4433a234aa9b..9f4e8d68ab50 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN alternative_else_nop_endif mrs x1, isr_el1 cbz x1, 1f + + // Ensure that __guest_enter() always provides a context + // synchronization event so that callers don't need ISBs for anything + // that would usually be synchonized by the ERET. + isb mov x0, #ARM_EXCEPTION_IRQ ret diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index d00093699aaf..502a5b73ee70 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -88,15 +88,26 @@ default: write_debug(ptr[0], reg, 0); \ } +static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.debug_owner) { + case VCPU_DEBUG_FREE: + WARN_ON_ONCE(1); + fallthrough; + case VCPU_DEBUG_GUEST_OWNED: + return &vcpu->arch.vcpu_debug_state; + case VCPU_DEBUG_HOST_OWNED: + return &vcpu->arch.external_debug_state; + } + + return NULL; +} + static void __debug_save_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); save_debug(dbg->dbg_bcr, dbgbcr, brps); save_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -109,13 +120,8 @@ static void __debug_save_state(struct kvm_guest_debug_arch *dbg, static void __debug_restore_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); restore_debug(dbg->dbg_bcr, dbgbcr, brps); restore_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -132,13 +138,13 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; host_dbg = host_data_ptr(host_debug_state.regs); - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(host_dbg, host_ctxt); __debug_restore_state(guest_dbg, guest_ctxt); @@ -151,18 +157,16 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; host_dbg = host_data_ptr(host_debug_state.regs); - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - - vcpu_clear_flag(vcpu, DEBUG_DIRTY); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 17df94570f03..fc573fc767b0 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -12,6 +12,16 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +static inline bool __fault_safe_to_translate(u64 esr) +{ + u64 fsc = esr & ESR_ELx_FSC; + + if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr)) + return false; + + return !(fsc == ESR_ELx_FSC_EXTABT && (esr & ESR_ELx_FnV)); +} + static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { int ret; @@ -44,34 +54,50 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) return true; } -static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +/* + * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR. + */ +static inline bool __hpfar_valid(u64 esr) { - u64 hpfar, far; - - far = read_sysreg_el2(SYS_FAR); - /* - * The HPFAR can be invalid if the stage 2 fault did not - * happen during a stage 1 page table walk (the ESR_EL2.S1PTW - * bit is clear) and one of the two following cases are true: - * 1. The fault was due to a permission fault - * 2. The processor carries errata 834220 + * CPUs affected by ARM erratum #834220 may incorrectly report a + * stage-2 translation fault when a stage-1 permission fault occurs. * - * Therefore, for all non S1PTW faults where we either have a - * permission fault or the errata workaround is enabled, we - * resolve the IPA using the AT instruction. + * Re-walk the page tables to determine if a stage-1 fault actually + * occurred. */ - if (!(esr & ESR_ELx_S1PTW) && - (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - esr_fsc_is_permission_fault(esr))) { - if (!__translate_far_to_hpfar(far, &hpfar)) - return false; - } else { + if (cpus_have_final_cap(ARM64_WORKAROUND_834220) && + esr_fsc_is_translation_fault(esr)) + return false; + + if (esr_fsc_is_translation_fault(esr) || esr_fsc_is_access_flag_fault(esr)) + return true; + + if ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr)) + return true; + + return esr_fsc_is_addr_sz_fault(esr); +} + +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +{ + u64 hpfar; + + fault->far_el2 = read_sysreg_el2(SYS_FAR); + fault->hpfar_el2 = 0; + + if (__hpfar_valid(esr)) hpfar = read_sysreg(hpfar_el2); - } + else if (unlikely(!__fault_safe_to_translate(esr))) + return true; + else if (!__translate_far_to_hpfar(fault->far_el2, &hpfar)) + return false; - fault->far_el2 = far; - fault->hpfar_el2 = hpfar; + /* + * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid + * HPFAR value. + */ + fault->hpfar_el2 = hpfar | HPFAR_EL2_NS; return true; } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 34f53707892d..bb9f2eecfb67 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -65,12 +65,56 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +#define reg_to_fgt_masks(reg) \ + ({ \ + struct fgt_masks *m; \ + switch(reg) { \ + case HFGRTR_EL2: \ + m = &hfgrtr_masks; \ + break; \ + case HFGWTR_EL2: \ + m = &hfgwtr_masks; \ + break; \ + case HFGITR_EL2: \ + m = &hfgitr_masks; \ + break; \ + case HDFGRTR_EL2: \ + m = &hdfgrtr_masks; \ + break; \ + case HDFGWTR_EL2: \ + m = &hdfgwtr_masks; \ + break; \ + case HAFGRTR_EL2: \ + m = &hafgrtr_masks; \ + break; \ + case HFGRTR2_EL2: \ + m = &hfgrtr2_masks; \ + break; \ + case HFGWTR2_EL2: \ + m = &hfgwtr2_masks; \ + break; \ + case HFGITR2_EL2: \ + m = &hfgitr2_masks; \ + break; \ + case HDFGRTR2_EL2: \ + m = &hdfgrtr2_masks; \ + break; \ + case HDFGWTR2_EL2: \ + m = &hdfgwtr2_masks; \ + break; \ + default: \ + BUILD_BUG_ON(1); \ + } \ + \ + m; \ + }) + #define compute_clr_set(vcpu, reg, clr, set) \ do { \ - u64 hfg; \ - hfg = __vcpu_sys_reg(vcpu, reg) & ~__ ## reg ## _RES0; \ - set |= hfg & __ ## reg ## _MASK; \ - clr |= ~hfg & __ ## reg ## _nMASK; \ + u64 hfg = __vcpu_sys_reg(vcpu, reg); \ + struct fgt_masks *m = reg_to_fgt_masks(reg); \ + set |= hfg & m->mask; \ + clr |= ~hfg & m->nmask; \ } while(0) #define reg_to_fgt_group_id(reg) \ @@ -79,7 +123,7 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) switch(reg) { \ case HFGRTR_EL2: \ case HFGWTR_EL2: \ - id = HFGxTR_GROUP; \ + id = HFGRTR_GROUP; \ break; \ case HFGITR_EL2: \ id = HFGITR_GROUP; \ @@ -91,6 +135,17 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) case HAFGRTR_EL2: \ id = HAFGRTR_GROUP; \ break; \ + case HFGRTR2_EL2: \ + case HFGWTR2_EL2: \ + id = HFGRTR2_GROUP; \ + break; \ + case HFGITR2_EL2: \ + id = HFGITR2_GROUP; \ + break; \ + case HDFGRTR2_EL2: \ + case HDFGWTR2_EL2: \ + id = HDFGRTR2_GROUP; \ + break; \ default: \ BUILD_BUG_ON(1); \ } \ @@ -101,13 +156,16 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) #define compute_undef_clr_set(vcpu, kvm, reg, clr, set) \ do { \ u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)]; \ - set |= hfg & __ ## reg ## _MASK; \ - clr |= hfg & __ ## reg ## _nMASK; \ + struct fgt_masks *m = reg_to_fgt_masks(reg); \ + set |= hfg & m->mask; \ + clr |= hfg & m->nmask; \ } while(0) #define update_fgt_traps_cs(hctxt, vcpu, kvm, reg, clr, set) \ do { \ - u64 c = 0, s = 0; \ + struct fgt_masks *m = reg_to_fgt_masks(reg); \ + u64 c = clr, s = set; \ + u64 val; \ \ ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \ @@ -115,30 +173,15 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) \ compute_undef_clr_set(vcpu, kvm, reg, c, s); \ \ - s |= set; \ - c |= clr; \ - if (c || s) { \ - u64 val = __ ## reg ## _nMASK; \ - val |= s; \ - val &= ~c; \ - write_sysreg_s(val, SYS_ ## reg); \ - } \ + val = m->nmask; \ + val |= s; \ + val &= ~c; \ + write_sysreg_s(val, SYS_ ## reg); \ } while(0) #define update_fgt_traps(hctxt, vcpu, kvm, reg) \ update_fgt_traps_cs(hctxt, vcpu, kvm, reg, 0, 0) -/* - * Validate the fine grain trap masks. - * Check that the masks do not overlap and that all bits are accounted for. - */ -#define CHECK_FGT_MASKS(reg) \ - do { \ - BUILD_BUG_ON((__ ## reg ## _MASK) & (__ ## reg ## _nMASK)); \ - BUILD_BUG_ON(~((__ ## reg ## _RES0) ^ (__ ## reg ## _MASK) ^ \ - (__ ## reg ## _nMASK))); \ - } while(0) - static inline bool cpu_has_amu(void) { u64 pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); @@ -152,56 +195,60 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); struct kvm *kvm = kern_hyp_va(vcpu->kvm); - CHECK_FGT_MASKS(HFGRTR_EL2); - CHECK_FGT_MASKS(HFGWTR_EL2); - CHECK_FGT_MASKS(HFGITR_EL2); - CHECK_FGT_MASKS(HDFGRTR_EL2); - CHECK_FGT_MASKS(HDFGWTR_EL2); - CHECK_FGT_MASKS(HAFGRTR_EL2); - CHECK_FGT_MASKS(HCRX_EL2); - if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; update_fgt_traps(hctxt, vcpu, kvm, HFGRTR_EL2); update_fgt_traps_cs(hctxt, vcpu, kvm, HFGWTR_EL2, 0, cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) ? - HFGxTR_EL2_TCR_EL1_MASK : 0); + HFGWTR_EL2_TCR_EL1_MASK : 0); update_fgt_traps(hctxt, vcpu, kvm, HFGITR_EL2); update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR_EL2); update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR_EL2); if (cpu_has_amu()) update_fgt_traps(hctxt, vcpu, kvm, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + update_fgt_traps(hctxt, vcpu, kvm, HFGRTR2_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HFGWTR2_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HFGITR2_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR2_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR2_EL2); } -#define __deactivate_fgt(htcxt, vcpu, kvm, reg) \ +#define __deactivate_fgt(htcxt, vcpu, reg) \ do { \ - if ((vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) || \ - kvm->arch.fgu[reg_to_fgt_group_id(reg)]) \ - write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ - SYS_ ## reg); \ + write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ + SYS_ ## reg); \ } while(0) static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); - struct kvm *kvm = kern_hyp_va(vcpu->kvm); if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; - __deactivate_fgt(hctxt, vcpu, kvm, HFGRTR_EL2); - if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) - write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2); - else - __deactivate_fgt(hctxt, vcpu, kvm, HFGWTR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HFGITR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HDFGRTR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HDFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR_EL2); if (cpu_has_amu()) - __deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __deactivate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2); } static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) @@ -235,6 +282,8 @@ static inline void __deactivate_traps_mpam(void) static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); @@ -244,12 +293,9 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * counter, which could make a PMXEVCNTR_EL0 access UNDEF at * EL1 instead of being trapped to EL2. */ - if (kvm_arm_support_pmu_v3()) { - struct kvm_cpu_context *hctxt; - + if (system_supports_pmuv3()) { write_sysreg(0, pmselr_el0); - hctxt = host_data_ptr(host_ctxt); ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); @@ -261,14 +307,12 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) if (cpus_have_final_cap(ARM64_HAS_HCX)) { u64 hcrx = vcpu->arch.hcrx_el2; if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { - u64 clr = 0, set = 0; - - compute_clr_set(vcpu, HCRX_EL2, clr, set); - - hcrx |= set; - hcrx &= ~clr; + u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2); + hcrx |= val & __HCRX_EL2_MASK; + hcrx &= ~(~val & __HCRX_EL2_nMASK); } + ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2); write_sysreg_s(hcrx, SYS_HCRX_EL2); } @@ -278,19 +322,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); write_sysreg(0, hstr_el2); - if (kvm_arm_support_pmu_v3()) { - struct kvm_cpu_context *hctxt; - - hctxt = host_data_ptr(host_ctxt); + if (system_supports_pmuv3()) { write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); } if (cpus_have_final_cap(ARM64_HAS_HCX)) - write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2); + write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); __deactivate_traps_mpam(); @@ -301,7 +344,7 @@ static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM)) hcr |= HCR_TVM; - write_sysreg(hcr, hcr_el2); + write_sysreg_hcr(hcr); if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2); @@ -326,7 +369,7 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } -static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) { *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); @@ -375,7 +418,87 @@ static inline void __hyp_sve_save_host(void) true); } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); +static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + if (vcpu_has_sve(vcpu)) { + /* A guest hypervisor may restrict the effective max VL. */ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) + zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); + else + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); + write_sysreg_el1(zcr_el1, SYS_ZCR); + } +} + +static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + /* + * When the guest owns the FP regs, we know that guest+hyp traps for + * any FPSIMD/SVE/SME features exposed to the guest have been disabled + * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * prior to __guest_entry(). As __guest_entry() guarantees a context + * synchronization event, we don't need an ISB here to avoid taking + * traps for anything that was exposed to the guest. + */ + if (vcpu_has_sve(vcpu)) { + zcr_el1 = read_sysreg_el1(SYS_ZCR); + __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; + + /* + * The guest's state is always saved using the guest's max VL. + * Ensure that the host has the guest's max VL active such that + * the host can save the guest's state lazily, but don't + * artificially restrict the host to the guest's max VL. + */ + if (has_vhe()) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + } else { + zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el1(zcr_el1, SYS_ZCR); + } + } +} + +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (system_supports_sve()) { + __hyp_sve_save_host(); + + /* Re-enable SVE traps if not supported for the guest vcpu. */ + if (!vcpu_has_sve(vcpu)) + cpacr_clear_set(CPACR_EL1_ZEN, 0); + + } else { + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); + } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); +} + /* * We trap the first access to the FP/SIMD to save the host context and @@ -383,7 +506,7 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); * If FP/SIMD is not implemented, handle the trap and inject an undefined * instruction exception to the guest. Similarly for trapped SVE accesses. */ -static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; @@ -419,13 +542,13 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* First disable enough traps to allow us to update the registers */ if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) - cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); else - cpacr_clear_set(0, CPACR_ELx_FPEN); + cpacr_clear_set(0, CPACR_EL1_FPEN); isb(); /* Write out the host state if it's in the registers */ - if (host_owns_fp_regs()) + if (is_protected_kvm_enabled() && host_owns_fp_regs()) kvm_hyp_save_fpsimd_host(vcpu); /* Restore the guest state */ @@ -501,7 +624,25 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } -static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) +/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ +static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) +{ + u64 offset = 0; + + if (ctxt->offset.vm_offset) + offset += *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); + + return offset; +} + +static inline u64 compute_counter_value(struct arch_timer_context *ctxt) +{ + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); +} + +static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) { struct arch_timer_context *ctxt; u32 sysreg; @@ -511,18 +652,19 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) * We only get here for 64bit guests, 32bit guests will hit * the long and winding road all the way to the standard * handling. Yes, it sucks to be irrelevant. + * + * Also, we only deal with non-hypervisor context here (either + * an EL1 guest, or a non-HYP context of an EL2 guest). */ + if (is_hyp_ctxt(vcpu)) + return false; + sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); switch (sysreg) { case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: if (vcpu_has_nv(vcpu)) { - if (is_hyp_ctxt(vcpu)) { - ctxt = vcpu_hptimer(vcpu); - break; - } - /* Check for guest hypervisor trapping */ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) @@ -534,16 +676,23 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) ctxt = vcpu_ptimer(vcpu); break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (val & CNTHCTL_EL1TVCT) + return false; + } + + ctxt = vcpu_vtimer(vcpu); + break; default: return false; } - val = arch_timer_read_cntpct_el0(); - - if (ctxt->offset.vm_offset) - val -= *kern_hyp_va(ctxt->offset.vm_offset); - if (ctxt->offset.vcpu_offset) - val -= *kern_hyp_va(ctxt->offset.vcpu_offset); + val = compute_counter_value(ctxt); vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); __kvm_skip_instr(vcpu); @@ -574,7 +723,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) return true; } -static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && handle_tx2_tvm(vcpu)) @@ -588,13 +737,13 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) __vgic_v3_perform_cpuif_access(vcpu) == 1) return true; - if (kvm_hyp_handle_cntpct(vcpu)) + if (kvm_handle_cntxct(vcpu)) return true; return false; } -static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) { if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) @@ -603,19 +752,18 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, + u64 *exit_code) { if (!__populate_fault_info(vcpu)) return true; return false; } -static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); -static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); +#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault +#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault -static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) { if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) return true; @@ -645,23 +793,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); - -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); - /* * Allow the hypervisor to handle the exit with an exit handler if it has one. * * Returns true if the hypervisor handled the exit, and control should go back * to the guest, or false if it hasn't. */ -static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); - exit_handler_fn fn; - - fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; - + exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; if (fn) return fn(vcpu, exit_code); @@ -691,20 +832,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code * the guest, false when we should restore the host state and return to the * main run loop. */ -static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - /* - * Save PSTATE early so that we can evaluate the vcpu mode - * early on. - */ - synchronize_vcpu_pstate(vcpu, exit_code); - - /* - * Check whether we want to repaint the state one way or - * another. - */ - early_exit_filter(vcpu, exit_code); - if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); @@ -734,7 +864,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) goto exit; /* Check if there's an exit handler and allow it to handle the exit. */ - if (kvm_hyp_handle_exit(vcpu, exit_code)) + if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) goto guest; exit: /* Return to the host kernel and handle the exit */ diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index a651c43ad679..b9cff893bbe0 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -18,9 +18,45 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); +static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; + + if (!vcpu) + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); + + return vcpu; +} + +static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt) +{ + return host_data_ptr(host_ctxt) != ctxt; +} + +static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); + + if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu)) + return &vcpu->arch.external_mdscr_el1; + + return &ctxt_sys_reg(ctxt, MDSCR_EL1); +} + +static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm); + + if (!(ctxt_is_guest(ctxt) && + test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))) + return read_cpuid_id(); + + return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1); +} + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { - ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); + *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); // POR_EL0 can affect uaccess, so must be saved/restored early. if (ctxt_has_s1poe(ctxt)) @@ -33,16 +69,6 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); } -static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) -{ - struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; - - if (!vcpu) - vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); - - return vcpu; -} - static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); @@ -139,7 +165,7 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { - write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1); // POR_EL0 can affect uaccess, so must be saved/restored early. if (ctxt_has_s1poe(ctxt)) @@ -153,8 +179,9 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) } static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, - u64 mpidr) + u64 midr, u64 mpidr) { + write_sysreg(midr, vpidr_el2); write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || @@ -283,7 +310,7 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); } @@ -300,7 +327,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h deleted file mode 100644 index f957890c7e38..000000000000 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ /dev/null @@ -1,223 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Google LLC - * Author: Fuad Tabba <tabba@google.com> - */ - -#ifndef __ARM64_KVM_FIXED_CONFIG_H__ -#define __ARM64_KVM_FIXED_CONFIG_H__ - -#include <asm/sysreg.h> - -/* - * This file contains definitions for features to be allowed or restricted for - * guest virtual machines, depending on the mode KVM is running in and on the - * type of guest that is running. - * - * The ALLOW masks represent a bitmask of feature fields that are allowed - * without any restrictions as long as they are supported by the system. - * - * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for - * features that are restricted to support at most the specified feature. - * - * If a feature field is not present in either, than it is not supported. - * - * The approach taken for protected VMs is to allow features that are: - * - Needed by common Linux distributions (e.g., floating point) - * - Trivial to support, e.g., supporting the feature does not introduce or - * require tracking of additional state in KVM - * - Cannot be trapped or prevent the guest from using anyway - */ - -/* - * Allow for protected VMs: - * - Floating-point and Advanced SIMD - * - Data Independent Timing - * - Spectre/Meltdown Mitigation - */ -#define PVM_ID_AA64PFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - AArch64 guests only (no support for AArch32 guests): - * AArch32 adds complexity in trap handling, emulation, condition codes, - * etc... - * - RAS (v1) - * Supported by KVM - */ -#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, RAS, IMP) \ - ) - -/* - * Allow for protected VMs: - * - Branch Target Identification - * - Speculative Store Bypassing - */ -#define PVM_ID_AA64PFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ - ) - -#define PVM_ID_AA64PFR2_ALLOW 0ULL - -/* - * Allow for protected VMs: - * - Mixed-endian - * - Distinction between Secure and Non-secure Memory - * - Mixed-endian at EL0 only - * - Non-context synchronizing exception entry and exit - */ -#define PVM_ID_AA64MMFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - 40-bit IPA - * - 16-bit ASID - */ -#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ - ) - -/* - * Allow for protected VMs: - * - Hardware translation table updates to Access flag and Dirty state - * - Number of VMID bits from CPU - * - Hierarchical Permission Disables - * - Privileged Access Never - * - SError interrupt exceptions from speculative reads - * - Enhanced Translation Synchronization - * - Control for cache maintenance permission - */ -#define PVM_ID_AA64MMFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_CMOW) \ - ) - -/* - * Allow for protected VMs: - * - Common not Private translations - * - User Access Override - * - IESB bit in the SCTLR_ELx registers - * - Unaligned single-copy atomicity and atomic functions - * - ESR_ELx.EC value on an exception by read access to feature ID space - * - TTL field in address operations. - * - Break-before-make sequences when changing translation block size - * - E0PDx mechanism - */ -#define PVM_ID_AA64MMFR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ - ) - -#define PVM_ID_AA64MMFR3_ALLOW (0ULL) - -/* - * No support for Scalable Vectors for protected VMs: - * Requires additional support from KVM, e.g., context-switching and - * trapping at EL2 - */ -#define PVM_ID_AA64ZFR0_ALLOW (0ULL) - -/* - * No support for debug, including breakpoints, and watchpoints for protected - * VMs: - * The Arm architecture mandates support for at least the Armv8 debug - * architecture, which would include at least 2 hardware breakpoints and - * watchpoints. Providing that support to protected guests adds - * considerable state and complexity. Therefore, the reserved value of 0 is - * used for debug-related fields. - */ -#define PVM_ID_AA64DFR0_ALLOW (0ULL) -#define PVM_ID_AA64DFR1_ALLOW (0ULL) - -/* - * No support for implementation defined features. - */ -#define PVM_ID_AA64AFR0_ALLOW (0ULL) -#define PVM_ID_AA64AFR1_ALLOW (0ULL) - -/* - * No restrictions on instructions implemented in AArch64. - */ -#define PVM_ID_AA64ISAR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ - ) - -/* Restrict pointer authentication to the basic version. */ -#define PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ - ) - -#define PVM_ID_AA64ISAR2_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ - ) - -#define PVM_ID_AA64ISAR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ - ) - -#define PVM_ID_AA64ISAR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_ATS1A)| \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) \ - ) - -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); -bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); -bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); -int kvm_check_pvm_sysreg_table(void); - -#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 97c527ef53c2..3766333bace9 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -7,7 +7,7 @@ #include <nvhe/memory.h> #include <nvhe/spinlock.h> -#define HYP_NO_ORDER USHRT_MAX +#define HYP_NO_ORDER ((u8)(~0)) struct hyp_pool { /* @@ -19,11 +19,11 @@ struct hyp_pool { struct list_head free_area[NR_PAGE_ORDERS]; phys_addr_t range_start; phys_addr_t range_end; - unsigned short max_order; + u8 max_order; }; /* Allocation */ -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order); void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0972faccc2af..5f9d56754e39 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -11,40 +11,10 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> #include <asm/virt.h> +#include <nvhe/memory.h> #include <nvhe/pkvm.h> #include <nvhe/spinlock.h> -/* - * SW bits 0-1 are reserved to track the memory ownership state of each page: - * 00: The page is owned exclusively by the page-table owner. - * 01: The page is owned by the page-table owner, but is shared - * with another entity. - * 10: The page is shared with, but not owned by the page-table owner. - * 11: Reserved for future use (lending). - */ -enum pkvm_page_state { - PKVM_PAGE_OWNED = 0ULL, - PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, - PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, - __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 | - KVM_PGTABLE_PROT_SW1, - - /* Meta-states which aren't encoded directly in the PTE's SW bits */ - PKVM_NOPAGE, -}; - -#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) -static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, - enum pkvm_page_state state) -{ - return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; -} - -static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) -{ - return prot & PKVM_PAGE_STATE_PROT_MASK; -} - struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; @@ -69,6 +39,13 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot); +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm); +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); @@ -79,7 +56,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); @@ -90,4 +67,10 @@ static __always_inline void __load_host_stage2(void) else write_sysreg(0, vttbr_el2); } + +#ifdef CONFIG_NVHE_EL2_DEBUG +void pkvm_ownership_selftest(void *base); +#else +static inline void pkvm_ownership_selftest(void *base) { } +#endif #endif /* __KVM_NVHE_MEM_PROTECT__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index ab205c4d6774..dee1a406b0c2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -7,9 +7,61 @@ #include <linux/types.h> +/* + * Bits 0-1 are used to encode the memory ownership state of each page from the + * point of view of a pKVM "component" (host, hyp, guest, ... see enum + * pkvm_component_id): + * 00: The page is owned and exclusively accessible by the component; + * 01: The page is owned and accessible by the component, but is also + * accessible by another component; + * 10: The page is accessible but not owned by the component; + * The storage of this state depends on the component: either in the + * hyp_vmemmap for the host and hyp states or in PTE software bits for guests. + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = BIT(0), + PKVM_PAGE_SHARED_BORROWED = BIT(1), + + /* + * 'Meta-states' are not stored directly in PTE SW bits for guest + * states, but inferred from the context (e.g. invalid PTE entries). + * For the host and hyp, meta-states are stored directly in the + * struct hyp_page. + */ + PKVM_NOPAGE = BIT(0) | BIT(1), +}; +#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1)) + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + prot &= ~PKVM_PAGE_STATE_PROT_MASK; + prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state); + return prot; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot); +} + struct hyp_page { - unsigned short refcount; - unsigned short order; + u16 refcount; + u8 order; + + /* Host state. Guarded by the host stage-2 lock. */ + unsigned __host_state : 4; + + /* + * Complement of the hyp state. Guarded by the hyp stage-1 lock. We use + * the complement so that the initial 0 in __hyp_state_comp (due to the + * entire vmemmap starting off zeroed) encodes PKVM_NOPAGE. + */ + unsigned __hyp_state_comp : 4; + + u32 host_share_guest_count; }; extern u64 __hyp_vmemmap; @@ -29,7 +81,13 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) #define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) -#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) + +static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) +{ + BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u64)); + return &hyp_vmemmap[hyp_phys_to_pfn(phys)]; +} + #define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) #define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt)) @@ -38,6 +96,26 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +static inline enum pkvm_page_state get_host_state(struct hyp_page *p) +{ + return p->__host_state; +} + +static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__host_state = state; +} + +static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p) +{ + return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK; +} + +static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK; +} + /* * Refcounting for 'struct hyp_page'. * hyp_pool::lock must be held if atomic access to the refcount is required. diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 230e4f2527de..6e83ce35c2f2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -13,9 +13,11 @@ extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; -int hyp_create_pcpu_fixmap(void); +int hyp_create_fixmap(void); void *hyp_fixmap_map(phys_addr_t phys); void hyp_fixmap_unmap(void); +void *hyp_fixblock_map(phys_addr_t phys, size_t *size); +void hyp_fixblock_unmap(void); int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 24a9a8330d19..ce31d3b73603 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -20,6 +20,12 @@ struct pkvm_hyp_vcpu { /* Backpointer to the host's (untrusted) vCPU instance. */ struct kvm_vcpu *host_vcpu; + + /* + * If this hyp vCPU is loaded, then this is a backpointer to the + * per-cpu pointer tracking us. Otherwise, NULL if not loaded. + */ + struct pkvm_hyp_vcpu **loaded_hyp_vcpu; }; /* @@ -37,16 +43,12 @@ struct pkvm_hyp_vm { struct hyp_pool pool; hyp_spinlock_t lock; - /* - * The number of vcpus initialized and ready to run. - * Modifying this is protected by 'vm_table_lock'. - */ - unsigned int nr_vcpus; - /* Array of the hyp vCPU structures for this VM. */ struct pkvm_hyp_vcpu *vcpus[]; }; +extern hyp_spinlock_t vm_table_lock; + static inline struct pkvm_hyp_vm * pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) { @@ -58,6 +60,11 @@ static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) return vcpu_is_protected(&hyp_vcpu->vcpu); } +static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) +{ + return kvm_vm_is_protected(&hyp_vm->kvm); +} + void pkvm_hyp_vm_table_init(void *tbl); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, @@ -69,5 +76,15 @@ int __pkvm_teardown_vm(pkvm_handle_t handle); struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle); +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); + +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); +int kvm_check_pvm_sysreg_table(void); #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index b43426a493df..a76522d63c3e 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -99,3 +99,9 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAG # causes a build failure. Remove profile optimization flags. KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables + +ifeq ($(CONFIG_UBSAN_KVM_EL2),y) +UBSAN_SANITIZE := y +# Always use brk and not hooks +ccflags-y += $(CFLAGS_UBSAN_TRAP) +endif diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 53efda0235cf..2f4a4f5036bb 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -51,42 +51,55 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_el1(pmscr_el1, SYS_PMSCR); } -static void __debug_save_trace(u64 *trfcr_el1) +static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) { - *trfcr_el1 = 0; + *saved_trfcr = read_sysreg_el1(SYS_TRFCR); + write_sysreg_el1(new_trfcr, SYS_TRFCR); +} - /* Check if the TRBE is enabled */ - if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E)) - return; - /* - * Prohibit trace generation while we are in guest. - * Since access to TRFCR_EL1 is trapped, the guest can't - * modify the filtering set by the host. - */ - *trfcr_el1 = read_sysreg_el1(SYS_TRFCR); - write_sysreg_el1(0, SYS_TRFCR); - isb(); - /* Drain the trace buffer to memory */ - tsb_csync(); +static bool __trace_needs_drain(void) +{ + if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE)) + return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E; + + return host_data_test_flag(TRBE_ENABLED); } -static void __debug_restore_trace(u64 trfcr_el1) +static bool __trace_needs_switch(void) { - if (!trfcr_el1) - return; + return host_data_test_flag(TRBE_ENABLED) || + host_data_test_flag(EL1_TRACING_CONFIGURED); +} - /* Restore trace filter controls */ - write_sysreg_el1(trfcr_el1, SYS_TRFCR); +static void __trace_switch_to_guest(void) +{ + /* Unsupported with TRBE so disable */ + if (host_data_test_flag(TRBE_ENABLED)) + *host_data_ptr(trfcr_while_in_guest) = 0; + + __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1), + *host_data_ptr(trfcr_while_in_guest)); + + if (__trace_needs_drain()) { + isb(); + tsb_csync(); + } +} + +static void __trace_switch_to_host(void) +{ + __trace_do_switch(host_data_ptr(trfcr_while_in_guest), + *host_data_ptr(host_debug_state.trfcr_el1)); } void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) + if (host_data_test_flag(HAS_SPE)) __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); - /* Disable and flush Self-Hosted Trace generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) - __debug_save_trace(host_data_ptr(host_debug_state.trfcr_el1)); + + if (__trace_needs_switch()) + __trace_switch_to_guest(); } void __debug_switch_to_guest(struct kvm_vcpu *vcpu) @@ -96,18 +109,13 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) + if (host_data_test_flag(HAS_SPE)) __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) - __debug_restore_trace(*host_data_ptr(host_debug_state.trfcr_el1)); + if (__trace_needs_switch()) + __trace_switch_to_host(); } void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index e433dfab882a..3369dd0c4009 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -730,10 +730,10 @@ static void do_ffa_version(struct arm_smccc_res *res, hyp_ffa_version = ffa_req_version; } - if (hyp_ffa_post_init()) + if (hyp_ffa_post_init()) { res->a0 = FFA_RET_NOT_SUPPORTED; - else { - has_version_negotiated = true; + } else { + smp_store_release(&has_version_negotiated, true); res->a0 = hyp_ffa_version; } unlock: @@ -809,7 +809,8 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id) if (!is_ffa_call(func_id)) return false; - if (!has_version_negotiated && func_id != FFA_VERSION) { + if (func_id != FFA_VERSION && + !smp_load_acquire(&has_version_negotiated)) { ffa_to_smccc_error(&res, FFA_RET_INVALID_PARAMETERS); goto out_handled; } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 3d610fc51f4d..eef15b374abb 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -124,7 +124,7 @@ SYM_FUNC_START(__hyp_do_panic) /* Ensure host stage-2 is disabled */ mrs x0, hcr_el2 bic x0, x0, #HCR_VM - msr hcr_el2, x0 + msr_hcr_el2 x0 isb tlbi vmalls12e1 dsb nsh @@ -188,12 +188,12 @@ SYM_FUNC_END(__host_hvc) /* * Test whether the SP has overflowed, without corrupting a GPR. - * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit + * nVHE hypervisor stacks are aligned so that the NVHE_STACK_SHIFT bit * of SP should always be 1. */ add sp, sp, x0 // sp' = sp + x0 sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp - tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@ + tbz x0, #NVHE_STACK_SHIFT, .L__hyp_sp_overflow\@ sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index fc1866226067..aada42522e7b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -73,8 +73,12 @@ __do_hyp_init: eret SYM_CODE_END(__kvm_hyp_init) +/* + * Initialize EL2 CPU state to sane values. + * + * HCR_EL2.E2H must have been initialized already. + */ SYM_CODE_START_LOCAL(__kvm_init_el2_state) - /* Initialize EL2 CPU state to sane values. */ init_el2_state // Clobbers x0..x2 finalise_el2_state ret @@ -96,7 +100,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) msr mair_el2, x1 ldr x1, [x0, #NVHE_INIT_HCR_EL2] - msr hcr_el2, x1 + msr_hcr_el2 x1 mov x2, #HCR_E2H and x2, x1, x2 @@ -206,9 +210,9 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) 2: msr SPsel, #1 // We want to use SP_EL{1,2} - bl __kvm_init_el2_state + init_el2_hcr 0 - __init_el2_nvhe_prepare_eret + bl __kvm_init_el2_state /* Enable MMU, set vectors and stack. */ mov x0, x28 @@ -258,7 +262,7 @@ reset: alternative_if ARM64_KVM_PROTECTED_MODE mov_q x5, HCR_HOST_NVHE_FLAGS - msr hcr_el2, x5 + msr_hcr_el2 x5 alternative_else_nop_endif /* Install stub vectors */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 6aa0b13d86e5..8e8848de4d47 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -5,6 +5,7 @@ */ #include <hyp/adjust_pc.h> +#include <hyp/switch.h> #include <asm/pgtable-types.h> #include <asm/kvm_asm.h> @@ -68,7 +69,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) return; - cpacr_clear_set(0, CPACR_ELx_FPEN | CPACR_ELx_ZEN); + cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); isb(); if (vcpu_has_sve(vcpu)) @@ -83,7 +84,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (system_supports_sve()) __hyp_sve_restore_host(); else - __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); if (has_fpmr) write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); @@ -91,20 +92,37 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; } +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; fpsimd_sve_flush(); + flush_debug_state(hyp_vcpu); hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; - hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - /* Limit guest vector length to the maximum supported by the host. */ - hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); - - hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; - hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & @@ -112,8 +130,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; - hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); - hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; @@ -127,6 +143,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) unsigned int i; fpsimd_sve_sync(&hyp_vcpu->vcpu); + sync_debug_state(hyp_vcpu); host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; @@ -141,16 +158,46 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } +static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); + DECLARE_REG(u64, hcr_el2, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); + if (!hyp_vcpu) + return; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + /* Propagate WFx trapping flags */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI); + hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } +} + +static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu) + pkvm_put_hyp_vcpu(hyp_vcpu); +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); int ret; - host_vcpu = kern_hyp_va(host_vcpu); - if (unlikely(is_protected_kvm_enabled())) { - struct pkvm_hyp_vcpu *hyp_vcpu; - struct kvm *host_kvm; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); /* * KVM (and pKVM) doesn't support SME guests for now, and @@ -163,9 +210,6 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) goto out; } - host_kvm = kern_hyp_va(host_vcpu->kvm); - hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, - host_vcpu->vcpu_idx); if (!hyp_vcpu) { ret = -EINVAL; goto out; @@ -176,12 +220,149 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); sync_hyp_vcpu(hyp_vcpu); - pkvm_put_hyp_vcpu(hyp_vcpu); } else { + struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); + /* The host is fully trusted, run its vCPU directly. */ - ret = __kvm_vcpu_run(host_vcpu); + fpsimd_lazy_switch_to_guest(vcpu); + ret = __kvm_vcpu_run(vcpu); + fpsimd_lazy_switch_to_host(vcpu); } +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, + host_vcpu->arch.pkvm_memcache.nr_pages, + &host_vcpu->arch.pkvm_memcache); +} + +static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out; + + ret = __pkvm_host_share_guest(pfn, gfn, nr_pages, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + ret = __pkvm_host_unshare_guest(gfn, nr_pages, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 2); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_relax_perms_guest(gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_wrprotect_guest(gfn, nr_pages, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(bool, mkold, host_ctxt, 4); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_test_clear_young_guest(gfn, nr_pages, mkold, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_mkyoung_guest(gfn, hyp_vcpu); out: cpu_reg(host_ctxt, 1) = ret; } @@ -233,6 +414,22 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } +static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + struct pkvm_hyp_vm *hyp_vm; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + return; + + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + put_pkvm_hyp_vm(hyp_vm); +} + static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -264,11 +461,6 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) __vgic_v3_init_lrs(); } -static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2(); -} - static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); @@ -384,7 +576,6 @@ typedef void (*hcall_t)(struct kvm_cpu_context *); static const hcall_t host_hcall[] = { /* ___kvm_hyp_init */ - HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_cpu_set_vector), @@ -395,6 +586,12 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_share_guest), + HANDLE_FUNC(__pkvm_host_unshare_guest), + HANDLE_FUNC(__pkvm_host_relax_perms_guest), + HANDLE_FUNC(__pkvm_host_wrprotect_guest), + HANDLE_FUNC(__pkvm_host_test_clear_young_guest), + HANDLE_FUNC(__pkvm_host_mkyoung_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), @@ -409,6 +606,9 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_load), + HANDLE_FUNC(__pkvm_vcpu_put), + HANDLE_FUNC(__pkvm_tlb_flush_vmid), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) @@ -480,12 +680,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; - case ESR_ELx_EC_SVE: - cpacr_clear_set(0, CPACR_ELx_ZEN); - isb(); - sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, - SYS_ZCR_EL2); - break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: handle_host_mem_abort(host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index f4562f417d3f..d724f6d69302 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -25,5 +25,7 @@ SECTIONS { BEGIN_HYP_SECTION(.data..percpu) PERCPU_INPUT(L1_CACHE_BYTES) END_HYP_SECTION + HYP_SECTION(.bss) + HYP_SECTION(.data) } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index e75374d682f4..95d7534c9679 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -60,6 +60,11 @@ static void hyp_unlock_component(void) hyp_spin_unlock(&pkvm_pgd_lock); } +#define for_each_hyp_page(__p, __st, __sz) \ + for (struct hyp_page *__p = hyp_phys_to_page(__st), \ + *__e = __p + ((__sz) >> PAGE_SHIFT); \ + __p < __e; __p++) + static void *host_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); @@ -161,12 +166,6 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) return 0; } -static bool guest_stage2_force_pte_cb(u64 addr, u64 end, - enum kvm_pgtable_prot prot) -{ - return true; -} - static void *guest_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); @@ -201,8 +200,8 @@ static void *guest_s2_zalloc_page(void *mc) memset(addr, 0, PAGE_SIZE); p = hyp_virt_to_page(addr); - memset(p, 0, sizeof(*p)); p->refcount = 1; + p->order = 0; return addr; } @@ -217,16 +216,42 @@ static void guest_s2_put_page(void *addr) hyp_put_page(¤t_vm->pool, addr); } +static void __apply_guest_page(void *va, size_t size, + void (*func)(void *addr, size_t size)) +{ + size += va - PTR_ALIGN_DOWN(va, PAGE_SIZE); + va = PTR_ALIGN_DOWN(va, PAGE_SIZE); + size = PAGE_ALIGN(size); + + while (size) { + size_t map_size = PAGE_SIZE; + void *map; + + if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE) + map = hyp_fixblock_map(__hyp_pa(va), &map_size); + else + map = hyp_fixmap_map(__hyp_pa(va)); + + func(map, map_size); + + if (map_size == PMD_SIZE) + hyp_fixblock_unmap(); + else + hyp_fixmap_unmap(); + + size -= map_size; + va += map_size; + } +} + static void clean_dcache_guest_page(void *va, size_t size) { - __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); - hyp_fixmap_unmap(); + __apply_guest_page(va, size, __clean_dcache_guest_page); } static void invalidate_icache_guest_page(void *va, size_t size) { - __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); - hyp_fixmap_unmap(); + __apply_guest_page(va, size, __invalidate_icache_guest_page); } int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) @@ -255,8 +280,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) }; guest_lock_component(vm); - ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, - guest_stage2_force_pte_cb); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, NULL); guest_unlock_component(vm); if (ret) return ret; @@ -266,8 +290,9 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } -void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) +void reclaim_pgtable_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { + struct hyp_page *page; void *addr; /* Dump all pgtable pages in the hyp_pool */ @@ -279,7 +304,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) /* Drain the hyp_pool into the memcache */ addr = hyp_alloc_pages(&vm->pool, 0); while (addr) { - memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + page = hyp_virt_to_page(addr); + page->refcount = 0; + page->order = 0; push_hyp_memcache(mc, addr, hyp_virt_to_phys); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); addr = hyp_alloc_pages(&vm->pool, 0); @@ -306,7 +333,7 @@ int __pkvm_prot_finalize(void) */ kvm_flush_dcache_to_poc(params, sizeof(*params)); - write_sysreg(params->hcr_el2, hcr_el2); + write_sysreg_hcr(params->hcr_el2); __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); /* @@ -382,19 +409,28 @@ bool addr_is_memory(phys_addr_t phys) return !!find_mem_range(phys, &range); } -static bool addr_is_allowed_memory(phys_addr_t phys) +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + +static int check_range_allowed_memory(u64 start, u64 end) { struct memblock_region *reg; struct kvm_mem_range range; - reg = find_mem_range(phys, &range); + /* + * Callers can't check the state of a range that overlaps memory and + * MMIO regions, so ensure [start, end[ is in the same kvm_mem_range. + */ + reg = find_mem_range(start, &range); + if (!is_in_mem_range(end - 1, &range)) + return -EINVAL; - return reg && !(reg->flags & MEMBLOCK_NOMAP); -} + if (!reg || reg->flags & MEMBLOCK_NOMAP) + return -EPERM; -static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) -{ - return range->start <= addr && addr < range->end; + return 0; } static bool range_is_memory(u64 start, u64 end) @@ -454,8 +490,11 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) if (kvm_pte_valid(pte)) return -EAGAIN; - if (pte) + if (pte) { + WARN_ON(addr_is_memory(addr) && + get_host_state(hyp_phys_to_page(addr)) != PKVM_NOPAGE); return -EPERM; + } do { u64 granule = kvm_granule_size(level); @@ -477,10 +516,31 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); } +static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state) +{ + for_each_hyp_page(page, addr, size) + set_host_state(page, state); +} + int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, - addr, size, &host_s2_pool, owner_id); + int ret; + + if (!range_is_memory(addr, addr + size)) + return -EPERM; + + ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, + addr, size, &host_s2_pool, owner_id); + if (ret) + return ret; + + /* Don't forget to update the vmemmap tracking for the host */ + if (owner_id == PKVM_ID_HOST) + __host_update_page_state(addr, size, PKVM_PAGE_OWNED); + else + __host_update_page_state(addr, size, PKVM_NOPAGE); + + return 0; } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) @@ -541,44 +601,18 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) return; } - addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; + + /* + * Yikes, we couldn't resolve the fault IPA. This should reinject an + * abort into the host when we figure out how to do that. + */ + BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS)); + addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12; + ret = host_stage2_idmap(addr); BUG_ON(ret && ret != -EAGAIN); } -struct pkvm_mem_transition { - u64 nr_pages; - - struct { - enum pkvm_component_id id; - /* Address in the initiator's address space */ - u64 addr; - - union { - struct { - /* Address in the completer's address space */ - u64 completer_addr; - } host; - struct { - u64 completer_addr; - } hyp; - }; - } initiator; - - struct { - enum pkvm_component_id id; - } completer; -}; - -struct pkvm_mem_share { - const struct pkvm_mem_transition tx; - const enum kvm_pgtable_prot completer_prot; -}; - -struct pkvm_mem_donation { - const struct pkvm_mem_transition tx; -}; - struct check_walk_data { enum pkvm_page_state desired; enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); @@ -604,705 +638,724 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, return kvm_pgtable_walk(pgt, addr, size, &walker); } -static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr) -{ - if (!addr_is_allowed_memory(addr)) - return PKVM_NOPAGE; - - if (!kvm_pte_valid(pte) && pte) - return PKVM_NOPAGE; - - return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); -} - static int __host_check_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - struct check_walk_data d = { - .desired = state, - .get_page_state = host_get_page_state, - }; - - hyp_assert_lock_held(&host_mmu.lock); - return check_page_state_range(&host_mmu.pgt, addr, size, &d); -} - -static int __host_set_page_state_range(u64 addr, u64 size, - enum pkvm_page_state state) -{ - enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state); - - return host_stage2_idmap_locked(addr, size, prot); -} - -static int host_request_owned_transition(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - -static int host_request_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); -} - -static int host_initiate_share(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); -} + int ret; -static int host_initiate_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; + ret = check_range_allowed_memory(addr, addr + size); + if (ret) + return ret; - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); -} + hyp_assert_lock_held(&host_mmu.lock); -static int host_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u8 owner_id = tx->completer.id; - u64 size = tx->nr_pages * PAGE_SIZE; + for_each_hyp_page(page, addr, size) { + if (get_host_state(page) != state) + return -EPERM; + } - *completer_addr = tx->initiator.host.completer_addr; - return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); + return 0; } -static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +static int __host_set_page_state_range(u64 addr, u64 size, + enum pkvm_page_state state) { - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HYP); -} + if (get_host_state(hyp_phys_to_page(addr)) == PKVM_NOPAGE) { + int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); -static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, - enum pkvm_page_state state) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + if (ret) + return ret; + } - if (__host_ack_skip_pgtable_check(tx)) - return 0; + __host_update_page_state(addr, size, state); - return __host_check_page_state_range(addr, size, state); + return 0; } -static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +static void __hyp_set_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) { - return __host_ack_transition(addr, tx, PKVM_NOPAGE); + for_each_hyp_page(page, phys, size) + set_hyp_state(page, state); } -static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) +static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) { - u64 size = tx->nr_pages * PAGE_SIZE; - u8 host_id = tx->completer.id; + for_each_hyp_page(page, phys, size) { + if (get_hyp_state(page) != state) + return -EPERM; + } - return host_stage2_set_owner_locked(addr, size, host_id); + return 0; } -static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) +static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) { if (!kvm_pte_valid(pte)) return PKVM_NOPAGE; - return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); } -static int __hyp_check_page_state_range(u64 addr, u64 size, - enum pkvm_page_state state) +static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr, + u64 size, enum pkvm_page_state state) { struct check_walk_data d = { .desired = state, - .get_page_state = hyp_get_page_state, + .get_page_state = guest_get_page_state, }; - hyp_assert_lock_held(&pkvm_pgd_lock); - return check_page_state_range(&pkvm_pgtable, addr, size, &d); + hyp_assert_lock_held(&vm->lock); + return check_page_state_range(&vm->pgt, addr, size, &d); } -static int hyp_request_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.hyp.completer_addr; - return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - -static int hyp_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) +int __pkvm_host_share_hyp(u64 pfn) { - u64 size = tx->nr_pages * PAGE_SIZE; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE; int ret; - *completer_addr = tx->initiator.hyp.completer_addr; - ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); - return (ret != size) ? -EFAULT : 0; -} - -static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) -{ - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HOST); -} + host_lock_component(); + hyp_lock_component(); -static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; - if (perms != PAGE_HYP) - return -EPERM; + __hyp_set_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED)); - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); + return ret; } -static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) +int __pkvm_host_unshare_hyp(u64 pfn) { - u64 size = tx->nr_pages * PAGE_SIZE; + u64 phys = hyp_pfn_to_phys(pfn); + u64 virt = (u64)__hyp_va(phys); + u64 size = PAGE_SIZE; + int ret; - if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) - return -EBUSY; + host_lock_component(); + hyp_lock_component(); - return __hyp_check_page_state_range(addr, size, - PKVM_PAGE_SHARED_BORROWED); -} + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + if (hyp_page_count((void *)virt)) { + ret = -EBUSY; + goto unlock; + } -static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; +unlock: + hyp_unlock_component(); + host_unlock_component(); - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); + return ret; } -static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) { - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); - enum kvm_pgtable_prot prot; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + void *virt = __hyp_va(phys); + int ret; - prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); - return pkvm_create_mappings_locked(start, end, prot); -} + host_lock_component(); + hyp_lock_component(); -static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; - return (ret != size) ? -EFAULT : 0; -} + __hyp_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, PAGE_HYP)); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP)); -static int hyp_complete_donation(u64 addr, - const struct pkvm_mem_transition *tx) -{ - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); - enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); +unlock: + hyp_unlock_component(); + host_unlock_component(); - return pkvm_create_mappings_locked(start, end, prot); + return ret; } -static int check_share(struct pkvm_mem_share *share) +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + u64 virt = (u64)__hyp_va(phys); int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + hyp_lock_component(); + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_OWNED); if (ret) - return ret; + goto unlock; + ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_share(completer_addr, tx, share->completer_prot); - break; - case PKVM_ID_FFA: - /* - * We only check the host; the secure side will check the other - * end when we forward the FFA call. - */ - ret = 0; - break; - default: - ret = -EINVAL; - } + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); return ret; } -static int __do_share(struct pkvm_mem_share *share) +int hyp_pin_shared_mem(void *from, void *to) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 phys = __hyp_pa(start); + u64 size = end - start; + struct hyp_page *p; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_share(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + hyp_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); if (ret) - return ret; + goto unlock; - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_share(completer_addr, tx, share->completer_prot); - break; - case PKVM_ID_FFA: - /* - * We're not responsible for any secure page-tables, so there's - * nothing to do here. - */ - ret = 0; - break; - default: - ret = -EINVAL; + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + hyp_page_ref_inc(p); + if (p->refcount == 1) + WARN_ON(pkvm_create_mappings_locked((void *)cur, + (void *)cur + PAGE_SIZE, + PAGE_HYP)); } +unlock: + hyp_unlock_component(); + host_unlock_component(); + return ret; } -/* - * do_share(): - * - * The page owner grants access to another component with a given set - * of permissions. - * - * Initiator: OWNED => SHARED_OWNED - * Completer: NOPAGE => SHARED_BORROWED - */ -static int do_share(struct pkvm_mem_share *share) +void hyp_unpin_shared_mem(void *from, void *to) { - int ret; + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + struct hyp_page *p; - ret = check_share(share); - if (ret) - return ret; + host_lock_component(); + hyp_lock_component(); + + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + if (p->refcount == 1) + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, cur, PAGE_SIZE) != PAGE_SIZE); + hyp_page_ref_dec(p); + } - return WARN_ON(__do_share(share)); + hyp_unlock_component(); + host_unlock_component(); } -static int check_unshare(struct pkvm_mem_share *share) +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + host_unlock_component(); - if (ret) - return ret; + return ret; +} - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_unshare(completer_addr, tx); - break; - case PKVM_ID_FFA: - /* See check_share() */ - ret = 0; - break; - default: - ret = -EINVAL; - } +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) +{ + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + int ret; + + host_lock_component(); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + host_unlock_component(); return ret; } -static int __do_unshare(struct pkvm_mem_share *share) +static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *size) { - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; + size_t block_size; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; + if (nr_pages == 1) { + *size = PAGE_SIZE; + return 0; } - if (ret) - return ret; + /* We solely support second to last level huge mapping */ + block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1); - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_unshare(completer_addr, tx); - break; - case PKVM_ID_FFA: - /* See __do_share() */ - ret = 0; - break; - default: - ret = -EINVAL; - } + if (nr_pages != block_size >> PAGE_SHIFT) + return -EINVAL; - return ret; + if (!IS_ALIGNED(phys | ipa, block_size)) + return -EINVAL; + + *size = block_size; + return 0; } -/* - * do_unshare(): - * - * The page owner revokes access from another component for a range of - * pages which were previously shared using do_share(). - * - * Initiator: SHARED_OWNED => OWNED - * Completer: SHARED_BORROWED => NOPAGE - */ -static int do_unshare(struct pkvm_mem_share *share) +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot) { + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 size; int ret; - ret = check_unshare(share); + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + + ret = __guest_check_transition_size(phys, ipa, nr_pages, &size); if (ret) return ret; - return WARN_ON(__do_unshare(share)); -} - -static int check_donation(struct pkvm_mem_donation *donation) -{ - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; - int ret; + ret = check_range_allowed_memory(phys, phys + size); + if (ret) + return ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_request_donation(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } + host_lock_component(); + guest_lock_component(vm); + ret = __guest_check_page_state_range(vm, ipa, size, PKVM_NOPAGE); if (ret) - return ret; + goto unlock; + + for_each_hyp_page(page, phys, size) { + switch (get_host_state(page)) { + case PKVM_PAGE_OWNED: + continue; + case PKVM_PAGE_SHARED_OWNED: + if (page->host_share_guest_count == U32_MAX) { + ret = -EBUSY; + goto unlock; + } + + /* Only host to np-guest multi-sharing is tolerated */ + if (page->host_share_guest_count) + continue; + + fallthrough; + default: + ret = -EPERM; + goto unlock; + } + } - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_ack_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_ack_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; + for_each_hyp_page(page, phys, size) { + set_host_state(page, PKVM_PAGE_SHARED_OWNED); + page->host_share_guest_count++; } + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, size, phys, + pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + return ret; } -static int __do_donate(struct pkvm_mem_donation *donation) +static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa, u64 size) { - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; + enum pkvm_page_state state; + kvm_pte_t pte; + u64 phys; + s8 level; int ret; - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_donation(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_initiate_donation(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); if (ret) return ret; + if (!kvm_pte_valid(pte)) + return -ENOENT; + if (kvm_granule_size(level) != size) + return -E2BIG; + + state = guest_get_page_state(pte, ipa); + if (state != PKVM_PAGE_SHARED_BORROWED) + return -EPERM; + + phys = kvm_pte_to_phys(pte); + ret = check_range_allowed_memory(phys, phys + size); + if (WARN_ON(ret)) + return ret; - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_complete_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_complete_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; + for_each_hyp_page(page, phys, size) { + if (get_host_state(page) != PKVM_PAGE_SHARED_OWNED) + return -EPERM; + if (WARN_ON(!page->host_share_guest_count)) + return -EINVAL; } - return ret; + *__phys = phys; + + return 0; } -/* - * do_donate(): - * - * The page owner transfers ownership to another component, losing access - * as a consequence. - * - * Initiator: OWNED => NOPAGE - * Completer: NOPAGE => OWNED - */ -static int do_donate(struct pkvm_mem_donation *donation) +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) { + u64 ipa = hyp_pfn_to_phys(gfn); + u64 size, phys; int ret; - ret = check_donation(donation); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); if (ret) return ret; - return WARN_ON(__do_donate(donation)); -} + host_lock_component(); + guest_lock_component(vm); -int __pkvm_host_share_hyp(u64 pfn) -{ - int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; + ret = __check_host_shared_guest(vm, &phys, ipa, size); + if (ret) + goto unlock; - host_lock_component(); - hyp_lock_component(); + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, size); + if (ret) + goto unlock; - ret = do_share(&share); + for_each_hyp_page(page, phys, size) { + /* __check_host_shared_guest() protects against underflow */ + page->host_share_guest_count--; + if (!page->host_share_guest_count) + set_host_state(page, PKVM_PAGE_OWNED); + } - hyp_unlock_component(); +unlock: + guest_unlock_component(vm); host_unlock_component(); return ret; } -int __pkvm_host_unshare_hyp(u64 pfn) +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa, u64 size) { + u64 phys; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; + + if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) + return; host_lock_component(); - hyp_lock_component(); + guest_lock_component(vm); - ret = do_unshare(&share); + ret = __check_host_shared_guest(vm, &phys, ipa, size); - hyp_unlock_component(); + guest_unlock_component(vm); host_unlock_component(); - return ret; + WARN_ON(ret && ret != -ENOENT); } -int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - }; - host_lock_component(); - hyp_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = do_donate(&donation); + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; - hyp_unlock_component(); - host_unlock_component(); + assert_host_shared_guest(vm, ipa, PAGE_SIZE); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); + guest_unlock_component(vm); return ret; } -int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) { + u64 size, ipa = hyp_pfn_to_phys(gfn); int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HYP, - .addr = hyp_addr, - .hyp = { - .completer_addr = host_addr, - }, - }, - .completer = { - .id = PKVM_ID_HOST, - }, - }, - }; - host_lock_component(); - hyp_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = do_donate(&donation); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; - hyp_unlock_component(); - host_unlock_component(); + assert_host_shared_guest(vm, ipa, size); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, size); + guest_unlock_component(vm); return ret; } -int hyp_pin_shared_mem(void *from, void *to) +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm) { - u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); - u64 end = PAGE_ALIGN((u64)to); - u64 size = end - start; + u64 size, ipa = hyp_pfn_to_phys(gfn); int ret; - host_lock_component(); - hyp_lock_component(); - - ret = __host_check_page_state_range(__hyp_pa(start), size, - PKVM_PAGE_SHARED_OWNED); - if (ret) - goto unlock; + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = __hyp_check_page_state_range(start, size, - PKVM_PAGE_SHARED_BORROWED); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); if (ret) - goto unlock; - - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_inc(hyp_virt_to_page(cur)); + return ret; -unlock: - hyp_unlock_component(); - host_unlock_component(); + assert_host_shared_guest(vm, ipa, size); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, size, mkold); + guest_unlock_component(vm); return ret; } -void hyp_unpin_shared_mem(void *from, void *to) +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { - u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); - u64 end = PAGE_ALIGN((u64)to); + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); - host_lock_component(); - hyp_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_dec(hyp_virt_to_page(cur)); + assert_host_shared_guest(vm, ipa, PAGE_SIZE); + guest_lock_component(vm); + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); + guest_unlock_component(vm); - hyp_unlock_component(); - host_unlock_component(); + return 0; } -int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) -{ - int ret; - struct pkvm_mem_share share = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = hyp_pfn_to_phys(pfn), - }, - .completer = { - .id = PKVM_ID_FFA, +#ifdef CONFIG_NVHE_EL2_DEBUG +struct pkvm_expected_state { + enum pkvm_page_state host; + enum pkvm_page_state hyp; + enum pkvm_page_state guest[2]; /* [ gfn, gfn + 1 ] */ +}; + +static struct pkvm_expected_state selftest_state; +static struct hyp_page *selftest_page; + +static struct pkvm_hyp_vm selftest_vm = { + .kvm = { + .arch = { + .mmu = { + .arch = &selftest_vm.kvm.arch, + .pgt = &selftest_vm.pgt, }, }, - }; + }, +}; - host_lock_component(); - ret = do_share(&share); - host_unlock_component(); +static struct pkvm_hyp_vcpu selftest_vcpu = { + .vcpu = { + .arch = { + .hw_mmu = &selftest_vm.kvm.arch.mmu, + }, + .kvm = &selftest_vm.kvm, + }, +}; - return ret; +static void init_selftest_vm(void *virt) +{ + struct hyp_page *p = hyp_virt_to_page(virt); + int i; + + selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + + for (i = 0; i < pkvm_selftest_pages(); i++) { + if (p[i].refcount) + continue; + p[i].refcount = 1; + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } } -int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) +static u64 selftest_ipa(void) { - int ret; - struct pkvm_mem_share share = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = hyp_pfn_to_phys(pfn), - }, - .completer = { - .id = PKVM_ID_FFA, - }, - }, - }; + return BIT(selftest_vm.pgt.ia_bits - 1); +} + +static void assert_page_state(void) +{ + void *virt = hyp_page_to_virt(selftest_page); + u64 size = PAGE_SIZE << selftest_page->order; + struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + u64 phys = hyp_virt_to_phys(virt); + u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE }; + struct pkvm_hyp_vm *vm; + + vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); host_lock_component(); - ret = do_unshare(&share); + WARN_ON(__host_check_page_state_range(phys, size, selftest_state.host)); host_unlock_component(); - return ret; -} + hyp_lock_component(); + WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp)); + hyp_unlock_component(); + + guest_lock_component(&selftest_vm); + WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0])); + WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1])); + guest_unlock_component(&selftest_vm); +} + +#define assert_transition_res(res, fn, ...) \ + do { \ + WARN_ON(fn(__VA_ARGS__) != res); \ + assert_page_state(); \ + } while (0) + +void pkvm_ownership_selftest(void *base) +{ + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX; + void *virt = hyp_alloc_pages(&host_s2_pool, 0); + struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + struct pkvm_hyp_vm *vm = &selftest_vm; + u64 phys, size, pfn, gfn; + + WARN_ON(!virt); + selftest_page = hyp_virt_to_page(virt); + selftest_page->refcount = 0; + init_selftest_vm(base); + + size = PAGE_SIZE << selftest_page->order; + phys = hyp_virt_to_phys(virt); + pfn = hyp_phys_to_pfn(phys); + gfn = hyp_phys_to_pfn(selftest_ipa()); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + selftest_state.guest[0] = selftest_state.guest[1] = PKVM_NOPAGE; + assert_page_state(); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + hyp_unpin_shared_mem(virt, virt + size); + WARN_ON(hyp_page_count(virt) != 1); + assert_transition_res(-EBUSY, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + hyp_unpin_shared_mem(virt, virt + size); + assert_page_state(); + WARN_ON(hyp_page_count(virt)); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_hyp, pfn); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.guest[0] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + WARN_ON(hyp_virt_to_page(virt)->host_share_guest_count != 2); + + selftest_state.guest[0] = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.guest[1] = PKVM_NOPAGE; + selftest_state.host = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1); + + selftest_page->refcount = 1; + hyp_put_page(&host_s2_pool, virt); +} +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 8850b591d775..ae8391baebc3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -229,9 +229,8 @@ int hyp_map_vectors(void) return 0; } -void *hyp_fixmap_map(phys_addr_t phys) +static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys) { - struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots); kvm_pte_t pte, *ptep = slot->ptep; pte = *ptep; @@ -243,10 +242,21 @@ void *hyp_fixmap_map(phys_addr_t phys) return (void *)slot->addr; } +void *hyp_fixmap_map(phys_addr_t phys) +{ + return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys); +} + static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) { kvm_pte_t *ptep = slot->ptep; u64 addr = slot->addr; + u32 level; + + if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE) + level = KVM_PGTABLE_LAST_LEVEL; + else + level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */ WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); @@ -260,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); dsb(ish); isb(); } @@ -273,9 +283,9 @@ void hyp_fixmap_unmap(void) static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg); + struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg; - if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL) + if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level)) return -EINVAL; slot->addr = ctx->addr; @@ -296,13 +306,84 @@ static int create_fixmap_slot(u64 addr, u64 cpu) struct kvm_pgtable_walker walker = { .cb = __create_fixmap_slot_cb, .flags = KVM_PGTABLE_WALK_LEAF, - .arg = (void *)cpu, + .arg = per_cpu_ptr(&fixmap_slots, cpu), }; return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); } -int hyp_create_pcpu_fixmap(void) +#if PAGE_SHIFT < 16 +#define HAS_FIXBLOCK +static struct hyp_fixmap_slot hyp_fixblock_slot; +static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock); +#endif + +static int create_fixblock(void) +{ +#ifdef HAS_FIXBLOCK + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &hyp_fixblock_slot, + }; + unsigned long addr; + phys_addr_t phys; + int ret, i; + + /* Find a RAM phys address, PMD aligned */ + for (i = 0; i < hyp_memblock_nr; i++) { + phys = ALIGN(hyp_memory[i].base, PMD_SIZE); + if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size)) + break; + } + + if (i >= hyp_memblock_nr) + return -EINVAL; + + hyp_spin_lock(&pkvm_pgd_lock); + addr = ALIGN(__io_map_base, PMD_SIZE); + ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE); + if (ret) + goto unlock; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP); + if (ret) + goto unlock; + + ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +#else + return 0; +#endif +} + +void *hyp_fixblock_map(phys_addr_t phys, size_t *size) +{ +#ifdef HAS_FIXBLOCK + *size = PMD_SIZE; + hyp_spin_lock(&hyp_fixblock_lock); + return fixmap_map_slot(&hyp_fixblock_slot, phys); +#else + *size = PAGE_SIZE; + return hyp_fixmap_map(phys); +#endif +} + +void hyp_fixblock_unmap(void) +{ +#ifdef HAS_FIXBLOCK + fixmap_clear_slot(&hyp_fixblock_slot); + hyp_spin_unlock(&hyp_fixblock_lock); +#else + hyp_fixmap_unmap(); +#endif +} + +int hyp_create_fixmap(void) { unsigned long addr, i; int ret; @@ -322,7 +403,7 @@ int hyp_create_pcpu_fixmap(void) return ret; } - return 0; + return create_fixblock(); } int hyp_create_idmap(u32 hyp_va_bits) @@ -360,10 +441,10 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) prev_base = __io_map_base; /* - * Efficient stack verification using the PAGE_SHIFT bit implies + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ - size = PAGE_SIZE * 2; + size = NVHE_STACK_SIZE * 2; addr = ALIGN(__io_map_base, size); ret = __pkvm_alloc_private_va_range(addr, size); @@ -373,12 +454,12 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) * at the higher address and leave the lower guard page * unbacked. * - * Any valid stack address now has the PAGE_SHIFT bit as 1 + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ - ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + PAGE_SIZE, - PAGE_SIZE, phys, PAGE_HYP); + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + NVHE_STACK_SIZE, + NVHE_STACK_SIZE, phys, PAGE_HYP); if (ret) __io_map_base = prev_base; } diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index e691290d3765..a1eb27a1a747 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -32,7 +32,7 @@ u64 __hyp_vmemmap; */ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { phys_addr_t addr = hyp_page_to_phys(p); @@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, /* Find a buddy page currently available for allocation */ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); @@ -94,7 +94,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { phys_addr_t phys = hyp_page_to_phys(p); - unsigned short order = p->order; + u8 order = p->order; struct hyp_page *buddy; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); @@ -129,7 +129,7 @@ insert: static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy; @@ -183,7 +183,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) void hyp_split_page(struct hyp_page *p) { - unsigned short order = p->order; + u8 order = p->order; unsigned int i; p->order = 0; @@ -195,10 +195,10 @@ void hyp_split_page(struct hyp_page *p) } } -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order) { - unsigned short i = order; struct hyp_page *p; + u8 i = order; hyp_spin_lock(&pool->lock); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 071993c16de8..338505cb0171 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -9,7 +9,6 @@ #include <asm/kvm_emulate.h> -#include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> #include <nvhe/memory.h> #include <nvhe/pkvm.h> @@ -24,232 +23,167 @@ unsigned int kvm_arm_vmid_bits; unsigned int kvm_host_sve_max_vl; /* - * Set trap register values based on features in ID_AA64PFR0. + * The currently loaded hyp vCPU for each physical CPU. Used only when + * protected KVM is enabled, but for both protected and non-protected VMs. */ -static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - u64 hcr_set = HCR_RW; - u64 hcr_clear = 0; - u64 cptr_set = 0; - u64 cptr_clear = 0; - - /* Protected KVM does not support AArch32 guests. */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_EL0_IMP); - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_EL1_IMP); +static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); - /* - * Linux guests assume support for floating-point and Advanced SIMD. Do - * not change the trapping behavior for these from the KVM default. - */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), - PVM_ID_AA64PFR0_ALLOW)); +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; if (has_hvhe()) - hcr_set |= HCR_E2H; + vcpu->arch.hcr_el2 |= HCR_E2H; - /* Trap RAS unless all current versions are supported */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < - ID_AA64PFR0_EL1_RAS_V1P1) { - hcr_set |= HCR_TERR | HCR_TEA; - hcr_clear |= HCR_FIEN; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) { - hcr_clear |= HCR_AMVOFFEN; - cptr_set |= CPTR_EL2_TAM; - } + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; - /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { - if (has_hvhe()) - cptr_clear |= CPACR_ELx_ZEN; - else - cptr_set |= CPTR_EL2_TZ; - } + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) && + kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0) == read_cpuid(CTR_EL0)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; - vcpu->arch.cptr_el2 &= ~cptr_clear; + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; } -/* - * Set trap register values based on features in ID_AA64PFR1. - */ -static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - u64 hcr_set = 0; - u64 hcr_clear = 0; + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.hcr_el2; - /* Memory Tagging: Trap and Treat as Untagged if not supported. */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) { - hcr_set |= HCR_TID5; - hcr_clear |= HCR_DCT | HCR_ATA; + /* No support for AArch32. */ + val |= HCR_RW; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; + + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { + val |= HCR_TERR | HCR_TEA; + val &= ~(HCR_FIEN); } - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; -} + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) + val &= ~(HCR_AMVOFFEN); -/* - * Set trap register values based on features in ID_AA64DFR0. - */ -static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); - u64 mdcr_set = 0; - u64 mdcr_clear = 0; - u64 cptr_set = 0; - - /* Trap/constrain PMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; - mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | - MDCR_EL2_HPMN_MASK; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { + val |= HCR_TID5; + val &= ~(HCR_DCT | HCR_ATA); } - /* Trap Debug */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids)) - mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) + val |= HCR_TLOR; + + vcpu->arch.hcr_el2 = val; +} - /* Trap OS Double Lock */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids)) - mdcr_set |= MDCR_EL2_TDOSA; +static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 val = vcpu->arch.mdcr_el2; - /* Trap SPE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPMS; - mdcr_clear |= MDCR_EL2_E2PB_MASK; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) { + val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); } - /* Trap Trace Filter */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids)) - mdcr_set |= MDCR_EL2_TTRF; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP)) + val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; - /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { - if (has_hvhe()) - cptr_set |= CPACR_EL1_TTA; - else - cptr_set |= CPTR_EL2_TTA; - } + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) + val |= MDCR_EL2_TDOSA; - /* Trap External Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) - mdcr_clear |= MDCR_EL2_E2TB_MASK; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) { + val |= MDCR_EL2_TPMS; + val &= ~MDCR_EL2_E2PB_MASK; + } - vcpu->arch.mdcr_el2 |= mdcr_set; - vcpu->arch.mdcr_el2 &= ~mdcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; -} + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + val |= MDCR_EL2_TTRF; -/* - * Set trap register values based on features in ID_AA64MMFR0. - */ -static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); - u64 mdcr_set = 0; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) + val |= MDCR_EL2_E2TB_MASK; /* Trap Debug Communications Channel registers */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids)) - mdcr_set |= MDCR_EL2_TDCC; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + val |= MDCR_EL2_TDCC; - vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.mdcr_el2 = val; } /* - * Set trap register values based on features in ID_AA64MMFR1. + * Check that cpu features that are neither trapped nor supported are not + * enabled for protected VMs. */ -static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) +static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); - u64 hcr_set = 0; - - /* Trap LOR */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids)) - hcr_set |= HCR_TLOR; - - vcpu->arch.hcr_el2 |= hcr_set; -} + struct kvm *kvm = vcpu->kvm; -/* - * Set baseline trap register values. - */ -static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) -{ - const u64 hcr_trap_feat_regs = HCR_TID3; - const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; + /* Protected KVM does not support AArch32 guests. */ + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) + return -EINVAL; /* - * Always trap: - * - Feature id registers: to control features exposed to guests - * - Implementation-defined features + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. */ - vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; - - /* Clear res0 and set res1 bits to trap potential new features. */ - vcpu->arch.hcr_el2 &= ~(HCR_RES0); - vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - if (!has_hvhe()) { - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); - } -} - -static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; - - if (has_hvhe()) - vcpu->arch.hcr_el2 |= HCR_E2H; - - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { - /* route synchronous external abort exceptions to EL2 */ - vcpu->arch.hcr_el2 |= HCR_TEA; - /* trap error record accesses */ - vcpu->arch.hcr_el2 |= HCR_TERR; - } - - if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) - vcpu->arch.hcr_el2 |= HCR_FWB; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) || + !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP)) + return -EINVAL; - if (cpus_have_final_cap(ARM64_HAS_EVT) && - !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) - vcpu->arch.hcr_el2 |= HCR_TID4; - else - vcpu->arch.hcr_el2 |= HCR_TID2; + /* No SME support in KVM right now. Check to catch if it changes. */ + if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) + return -EINVAL; - if (vcpu_has_ptrauth(vcpu)) - vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + return 0; } /* * Initialize trap register values in protected mode. */ -static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) { - vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret; + vcpu->arch.mdcr_el2 = 0; pkvm_vcpu_reset_hcr(vcpu); - if ((!vcpu_is_protected(vcpu))) - return; + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* Trust the host for non-protected vcpu features. */ + vcpu->arch.hcrx_el2 = host_vcpu->arch.hcrx_el2; + return 0; + } + + ret = pkvm_check_pvm_cpu_features(vcpu); + if (ret) + return ret; + + pvm_init_traps_hcr(vcpu); + pvm_init_traps_mdcr(vcpu); + vcpu_set_hcrx(vcpu); - pvm_init_trap_regs(vcpu); - pvm_init_traps_aa64pfr0(vcpu); - pvm_init_traps_aa64pfr1(vcpu); - pvm_init_traps_aa64dfr0(vcpu); - pvm_init_traps_aa64mmfr0(vcpu); - pvm_init_traps_aa64mmfr1(vcpu); + return 0; } /* @@ -270,10 +204,10 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) /* * Spinlock for protecting state related to the VM table. Protects writes - * to 'vm_table' and 'nr_table_entries' as well as reads and writes to - * 'last_hyp_vcpu_lookup'. + * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization. + * Also protects reads and writes to 'last_hyp_vcpu_lookup'. */ -static DEFINE_HYP_SPINLOCK(vm_table_lock); +DEFINE_HYP_SPINLOCK(vm_table_lock); /* * The table of VM entries for protected VMs in hyp. @@ -306,15 +240,32 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, struct pkvm_hyp_vcpu *hyp_vcpu = NULL; struct pkvm_hyp_vm *hyp_vm; + /* Cannot load a new vcpu without putting the old one first. */ + if (__this_cpu_read(loaded_hyp_vcpu)) + return NULL; + hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); - if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) + if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx) goto unlock; hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + if (!hyp_vcpu) + goto unlock; + + /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ + if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { + hyp_vcpu = NULL; + goto unlock; + } + + hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); unlock: hyp_spin_unlock(&vm_table_lock); + + if (hyp_vcpu) + __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); return hyp_vcpu; } @@ -323,82 +274,130 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); hyp_spin_lock(&vm_table_lock); + hyp_vcpu->loaded_hyp_vcpu = NULL; + __this_cpu_write(loaded_hyp_vcpu, NULL); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) +{ + return __this_cpu_read(loaded_hyp_vcpu); + +} + +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (hyp_vm) + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm; +} + +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm) +{ + hyp_spin_lock(&vm_table_lock); hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); hyp_spin_unlock(&vm_table_lock); } +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); + + if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) { + put_pkvm_hyp_vm(hyp_vm); + hyp_vm = NULL; + } + + return hyp_vm; +} + static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) { struct kvm *kvm = &hyp_vm->kvm; + unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + /* CTR_EL0 is always under host control, even for protected VMs. */ + hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) + set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); + /* No restrictions for non-protected VMs. */ if (!kvm_vm_is_protected(kvm)) { + hyp_vm->kvm.arch.flags = host_arch_flags; + bitmap_copy(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); + + if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &host_arch_flags)) + hyp_vm->kvm.arch.midr_el1 = host_kvm->arch.midr_el1; + return; } bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); - /* - * For protected VMs, always allow: - * - CPU starting in poweroff state - * - PSCI v0.2 - */ - set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); - /* - * Check if remaining features are allowed: - * - Performance Monitoring - * - Scalable Vectors - * - Pointer Authentication - */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) - set_bit(KVM_ARM_VCPU_SVE, allowed_features); - - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) { + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); + } + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, allowed_features, KVM_VCPU_MAX_FEATURES); } -static void pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) -{ - struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; - - if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || - vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) { - kvm_vcpu_enable_ptrauth(vcpu); - } else { - vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); - } -} - static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); } +static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + void *sve_state; + + if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) + return; + + sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + hyp_unpin_shared_mem(sve_state, + sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); +} + static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], unsigned int nr_vcpus) { int i; - for (i = 0; i < nr_vcpus; i++) - unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); + for (i = 0; i < nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; + + if (!hyp_vcpu) + continue; + + unpin_host_vcpu(hyp_vcpu->host_vcpu); + unpin_host_sve_state(hyp_vcpu); + } } static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, @@ -408,47 +407,73 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + hyp_vm->kvm.arch.flags = 0; pkvm_init_features_from_host(hyp_vm, host_kvm); } -static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) { struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + unsigned int sve_max_vl; + size_t sve_state_size; + void *sve_state; + int ret = 0; if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { - vcpu_clear_flag(vcpu, GUEST_HAS_SVE); vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + return 0; + } + + /* Limit guest vector length to the maximum supported by the host. */ + sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl); + sve_state_size = sve_state_size_from_vl(sve_max_vl); + sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state)); + + if (!sve_state || !sve_state_size) { + ret = -EINVAL; + goto err; } + + ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size); + if (ret) + goto err; + + vcpu->arch.sve_state = sve_state; + vcpu->arch.sve_max_vl = sve_max_vl; + + return 0; +err: + clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features); + return ret; } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, struct pkvm_hyp_vm *hyp_vm, - struct kvm_vcpu *host_vcpu, - unsigned int vcpu_idx) + struct kvm_vcpu *host_vcpu) { int ret = 0; if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) return -EBUSY; - if (host_vcpu->vcpu_idx != vcpu_idx) { - ret = -EINVAL; - goto done; - } - hyp_vcpu->host_vcpu = host_vcpu; hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); - hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + hyp_vcpu->vcpu.vcpu_idx = READ_ONCE(host_vcpu->vcpu_idx); hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; - pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); - pkvm_vcpu_init_ptrauth(hyp_vcpu); - pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + + ret = pkvm_vcpu_init_traps(hyp_vcpu); + if (ret) + goto done; + + ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); @@ -673,29 +698,28 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, goto unlock; } - idx = hyp_vm->nr_vcpus; + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu); + if (ret) + goto unlock; + + idx = hyp_vcpu->vcpu.vcpu_idx; if (idx >= hyp_vm->kvm.created_vcpus) { ret = -EINVAL; goto unlock; } - ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); - if (ret) + if (hyp_vm->vcpus[idx]) { + ret = -EINVAL; goto unlock; + } hyp_vm->vcpus[idx] = hyp_vcpu; - hyp_vm->nr_vcpus++; unlock: hyp_spin_unlock(&vm_table_lock); - if (ret) { + if (ret) unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); - return ret; - } - - hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu); - - return 0; + return ret; } static void @@ -712,7 +736,7 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) int __pkvm_teardown_vm(pkvm_handle_t handle) { - struct kvm_hyp_memcache *mc; + struct kvm_hyp_memcache *mc, *stage2_mc; struct pkvm_hyp_vm *hyp_vm; struct kvm *host_kvm; unsigned int idx; @@ -740,12 +764,26 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) /* Reclaim guest pages (including page-table pages) */ mc = &host_kvm->arch.pkvm.teardown_mc; - reclaim_guest_pages(hyp_vm, mc); - unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + stage2_mc = &host_kvm->arch.pkvm.stage2_teardown_mc; + reclaim_pgtable_pages(hyp_vm, stage2_mc); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->kvm.created_vcpus); /* Push the metadata pages to the teardown memcache */ - for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { + for (idx = 0; idx < hyp_vm->kvm.created_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + struct kvm_hyp_memcache *vcpu_mc; + + if (!hyp_vcpu) + continue; + + vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + + while (vcpu_mc->nr_pages) { + void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); + + push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); + unmap_donated_memory_noclear(addr, PAGE_SIZE); + } teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 9c2ce1e0e99a..c3e196fb8b18 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -218,6 +218,9 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) if (is_cpu_on) release_boot_args(boot_args); + write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); + write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); + __host_enter(host_ctxt); } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index cbdd18cd3f98..a48d3f5a5afb 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -12,7 +12,6 @@ #include <nvhe/early_alloc.h> #include <nvhe/ffa.h> -#include <nvhe/fixed_config.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> @@ -29,6 +28,7 @@ static void *vmemmap_base; static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; +static void *selftest_base; static void *ffa_proxy_pages; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static struct hyp_pool hpool; @@ -39,6 +39,11 @@ static int divide_memory_pool(void *virt, unsigned long size) hyp_early_alloc_init(virt, size); + nr_pages = pkvm_selftest_pages(); + selftest_base = hyp_early_alloc_contig(nr_pages); + if (nr_pages && !selftest_base) + return -ENOMEM; + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); vmemmap_base = hyp_early_alloc_contig(nr_pages); if (!vmemmap_base) @@ -120,6 +125,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP); + if (ret) + return ret; + ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -180,8 +189,8 @@ static void hpool_put_page(void *addr) static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - enum kvm_pgtable_prot prot; enum pkvm_page_state state; + struct hyp_page *page; phys_addr_t phys; if (!kvm_pte_valid(ctx->old)) @@ -194,25 +203,31 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!addr_is_memory(phys)) return -EINVAL; + page = hyp_phys_to_page(phys); + /* * Adjust the host stage-2 mappings to match the ownership attributes - * configured in the hypervisor stage-1. + * configured in the hypervisor stage-1, and make sure to propagate them + * to the hyp_vmemmap state. */ state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old)); switch (state) { case PKVM_PAGE_OWNED: + set_hyp_state(page, PKVM_PAGE_OWNED); return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + set_hyp_state(page, PKVM_PAGE_SHARED_OWNED); + set_host_state(page, PKVM_PAGE_SHARED_BORROWED); break; case PKVM_PAGE_SHARED_BORROWED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + set_hyp_state(page, PKVM_PAGE_SHARED_BORROWED); + set_host_state(page, PKVM_PAGE_SHARED_OWNED); break; default: return -EINVAL; } - return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); + return 0; } static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -297,7 +312,7 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; - ret = hyp_create_pcpu_fixmap(); + ret = hyp_create_fixmap(); if (ret) goto out; @@ -306,6 +321,8 @@ void __noreturn __pkvm_init_finalise(void) goto out; pkvm_hyp_vm_table_init(vm_table_base); + + pkvm_ownership_selftest(selftest_base); out: /* * We tail-called to here from handle___pkvm_init() and will not return, diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index ed6b58b19cfa..5b6eeab1a774 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -28,7 +28,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); - stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - NVHE_STACK_SIZE); stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); stacktrace_info->fp = fp; stacktrace_info->pc = pc; @@ -54,7 +54,7 @@ static struct stack_info stackinfo_get_hyp(void) { struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); unsigned long high = params->stack_hyp_va; - unsigned long low = high - PAGE_SIZE; + unsigned long low = high - NVHE_STACK_SIZE; return (struct stack_info) { .low = low, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index cc69106734ca..73affe1333a4 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -26,7 +26,6 @@ #include <asm/debug-monitors.h> #include <asm/processor.h> -#include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> /* Non-VHE specific context */ @@ -34,35 +33,85 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +struct fgt_masks hfgrtr_masks; +struct fgt_masks hfgwtr_masks; +struct fgt_masks hfgitr_masks; +struct fgt_masks hdfgrtr_masks; +struct fgt_masks hdfgwtr_masks; +struct fgt_masks hafgrtr_masks; +struct fgt_masks hfgrtr2_masks; +struct fgt_masks hfgwtr2_masks; +struct fgt_masks hfgitr2_masks; +struct fgt_masks hdfgrtr2_masks; +struct fgt_masks hdfgwtr2_masks; + extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); -static void __activate_traps(struct kvm_vcpu *vcpu) +static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - ___activate_traps(vcpu, vcpu->arch.hcr_el2); - __activate_traps_common(vcpu); + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); - val = vcpu->arch.cptr_el2; - val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; - if (cpus_have_final_cap(ARM64_SME)) { - if (has_hvhe()) - val &= ~CPACR_ELx_SMEN; - else - val |= CPTR_EL2_TSM; + if (has_hvhe()) { + val |= CPACR_EL1_TTA; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + + write_sysreg(val, cpacr_el1); + } else { + val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); } +} - if (!guest_owns_fp_regs()) { - if (has_hvhe()) - val &= ~(CPACR_ELx_FPEN | CPACR_ELx_ZEN); - else - val |= CPTR_EL2_TFP | CPTR_EL2_TZ; +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_hvhe()) { + u64 val = CPACR_EL1_FPEN; - __activate_traps_fpsimd32(vcpu); + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); + } else { + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); } +} + +static void __activate_traps(struct kvm_vcpu *vcpu) +{ + ___activate_traps(vcpu, vcpu->arch.hcr_el2); + __activate_traps_common(vcpu); + __activate_cptr_traps(vcpu); - kvm_write_cptr_el2(val); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { @@ -105,9 +154,9 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) __deactivate_traps_common(vcpu); - write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); + write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } @@ -180,34 +229,6 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_handle_pvm_sysreg(vcpu, exit_code)); } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) -{ - /* - * Non-protected kvm relies on the host restoring its sve state. - * Protected kvm restores the host's sve state as not to reveal that - * fpsimd was used by a guest nor leak upper sve bits. - */ - if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) { - __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. */ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_ELx_ZEN, 0); - - } else { - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); - } - - if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { - u64 val = read_sysreg_s(SYS_FPMR); - - if (unlikely(is_protected_kvm_enabled())) - *host_data_ptr(fpmr) = val; - else - **host_data_ptr(fpmr_ptr) = val; - } -} - static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, @@ -239,19 +260,21 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) return hyp_exit_handlers; } -/* - * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. - * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a - * guest from dropping to AArch32 EL0 if implemented by the CPU. If the - * hypervisor spots a guest in such a state ensure it is handled, and don't - * trust the host to spot or fix it. The check below is based on the one in - * kvm_arch_vcpu_ioctl_run(). - * - * Returns false if the guest ran in AArch32 when it shouldn't have, and - * thus should exit to the host, or true if a the guest run loop can continue. - */ -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + + synchronize_vcpu_pstate(vcpu, exit_code); + + /* + * Some guests (e.g., protected VMs) are not be allowed to run in + * AArch32. The ARMv8 architecture does not give the hypervisor a + * mechanism to prevent a guest from dropping to AArch32 EL0 if + * implemented by the CPU. If the hypervisor spots a guest in such a + * state ensure it is handled, and don't trust the host to spot or fix + * it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + */ if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { /* * As we have caught the guest red-handed, decide that it isn't @@ -264,6 +287,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); *exit_code |= ARM_EXCEPTION_IL; } + + return __fixup_guest_exit(vcpu, exit_code, handlers); } /* Switch to the guest for legacy non-VHE systems */ diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 2860548d4250..1ddd9ed3cbb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -11,7 +11,7 @@ #include <hyp/adjust_pc.h> -#include <nvhe/fixed_config.h> +#include <nvhe/pkvm.h> #include "../../sys_regs.h" @@ -28,222 +28,255 @@ u64 id_aa64mmfr1_el1_sys_val; u64 id_aa64mmfr2_el1_sys_val; u64 id_aa64smfr0_el1_sys_val; -/* - * Inject an unknown/undefined exception to an AArch64 guest while most of its - * sysregs are live. - */ -static void inject_undef64(struct kvm_vcpu *vcpu) -{ - u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - - *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); - *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); - - __kvm_adjust_pc(vcpu); - - write_sysreg_el1(esr, SYS_ESR); - write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); - write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); - write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); -} - -/* - * Returns the restricted features values of the feature register based on the - * limitations in restrict_fields. - * A feature id field value of 0b0000 does not impose any restrictions. - * Note: Use only for unsigned feature field values. - */ -static u64 get_restricted_features_unsigned(u64 sys_reg_val, - u64 restrict_fields) -{ - u64 value = 0UL; - u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); +struct pvm_ftr_bits { + bool sign; + u8 shift; + u8 width; + u8 max_val; + bool (*vm_supported)(const struct kvm *kvm); +}; - /* - * According to the Arm Architecture Reference Manual, feature fields - * use increasing values to indicate increases in functionality. - * Iterate over the restricted feature fields and calculate the minimum - * unsigned value between the one supported by the system, and what the - * value is being restricted to. - */ - while (sys_reg_val && restrict_fields) { - value |= min(sys_reg_val & mask, restrict_fields & mask); - sys_reg_val &= ~mask; - restrict_fields &= ~mask; - mask <<= ARM64_FEATURE_FIELD_BITS; +#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \ + { \ + .sign = sgn, \ + .shift = id##_##fld##_SHIFT, \ + .width = id##_##fld##_WIDTH, \ + .max_val = id##_##fld##_##max, \ + .vm_supported = func, \ } - return value; -} - -/* - * Functions that return the value of feature id registers for protected VMs - * based on allowed features, system features, and KVM support. - */ +#define MAX_FEAT_FUNC(id, fld, max, func) \ + __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED) -static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask = 0; - u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; +#define MAX_FEAT(id, fld, max) \ + MAX_FEAT_FUNC(id, fld, max, NULL) - set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); +#define MAX_FEAT_ENUM(id, fld, max) \ + __MAX_FEAT_FUNC(id, fld, max, NULL, false) - return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; -} +#define FEAT_END { .width = 0, } -static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) +static bool vm_has_ptrauth(const struct kvm *kvm) { - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); - u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; - - if (!kvm_has_mte(kvm)) - allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - - return id_aa64pfr1_el1_sys_val & allow_mask; -} - -static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for Scalable Vectors, therefore, hyp has no sanitized - * copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, including breakpoints, and watchpoints, - * therefore, pKVM has no sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, therefore, hyp has no sanitized copy of the - * feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); - return 0; -} + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) + return false; -static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); - return 0; + return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || + cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC); } -static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +static bool vm_has_sve(const struct kvm *kvm) { - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); - return 0; + return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE); } -static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) -{ - return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; -} +/* + * Definitions for features to be allowed or restricted for protected guests. + * + * Each field in the masks represents the highest supported value for the + * feature. If a feature field is not present, it is not supported. Moreover, + * these are used to generate the guest's view of the feature registers. + * + * The approach for protected VMs is to at least support features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ -static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64pfr0[] = { + MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP), + MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve), + MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); +static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { + MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP), + MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2), + MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI), + FEAT_END +}; - return id_aa64isar1_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = { + MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM), + MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16), + MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2), + MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3), + MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); +static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { + MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP), + MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18), + MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2), + MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP), + FEAT_END +}; - return id_aa64isar2_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64isar1[] = { + MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3), + MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX), + MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16), + MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask; +static const struct pvm_ftr_bits pvmid_aa64isar2[] = { + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP), + FEAT_END +}; - set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, - PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); +/* + * None of the features in ID_AA64DFR0_EL1 nor ID_AA64MMFR4_EL1 are supported. + * However, both have Not-Implemented values that are non-zero. Define them + * so they can be used when getting the value of these registers. + */ +#define ID_AA64DFR0_EL1_NONZERO_NI \ +( \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \ +) - return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; -} +#define ID_AA64MMFR4_EL1_NONZERO_NI \ + SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI) -static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +/* + * Returns the value of the feature registers based on the system register + * value, the vcpu support for the revelant features, and the additional + * restrictions for protected VMs. + */ +static u64 get_restricted_features(const struct kvm_vcpu *vcpu, + u64 sys_reg_val, + const struct pvm_ftr_bits restrictions[]) { - return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; -} + u64 val = 0UL; + int i; + + for (i = 0; restrictions[i].width != 0; i++) { + bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported; + bool sign = restrictions[i].sign; + int shift = restrictions[i].shift; + int width = restrictions[i].width; + u64 min_signed = (1UL << width) - 1UL; + u64 sign_bit = 1UL << (width - 1); + u64 mask = GENMASK_ULL(width + shift - 1, shift); + u64 sys_val = (sys_reg_val & mask) >> shift; + u64 pvm_max = restrictions[i].max_val; + + if (vm_supported && !vm_supported(vcpu->kvm)) + val |= (sign ? min_signed : 0) << shift; + else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit)) + val |= max(sys_val, pvm_max) << shift; + else + val |= min(sys_val, pvm_max) << shift; + } -static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) -{ - return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; + return val; } -/* Read a sanitized cpufeature ID register by its encoding */ -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) { switch (id) { case SYS_ID_AA64PFR0_EL1: - return get_pvm_id_aa64pfr0(vcpu); + return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); case SYS_ID_AA64PFR1_EL1: - return get_pvm_id_aa64pfr1(vcpu); - case SYS_ID_AA64ZFR0_EL1: - return get_pvm_id_aa64zfr0(vcpu); - case SYS_ID_AA64DFR0_EL1: - return get_pvm_id_aa64dfr0(vcpu); - case SYS_ID_AA64DFR1_EL1: - return get_pvm_id_aa64dfr1(vcpu); - case SYS_ID_AA64AFR0_EL1: - return get_pvm_id_aa64afr0(vcpu); - case SYS_ID_AA64AFR1_EL1: - return get_pvm_id_aa64afr1(vcpu); + return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); case SYS_ID_AA64ISAR0_EL1: - return get_pvm_id_aa64isar0(vcpu); + return id_aa64isar0_el1_sys_val; case SYS_ID_AA64ISAR1_EL1: - return get_pvm_id_aa64isar1(vcpu); + return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1); case SYS_ID_AA64ISAR2_EL1: - return get_pvm_id_aa64isar2(vcpu); + return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2); case SYS_ID_AA64MMFR0_EL1: - return get_pvm_id_aa64mmfr0(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0); case SYS_ID_AA64MMFR1_EL1: - return get_pvm_id_aa64mmfr1(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1); case SYS_ID_AA64MMFR2_EL1: - return get_pvm_id_aa64mmfr2(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2); + case SYS_ID_AA64DFR0_EL1: + return ID_AA64DFR0_EL1_NONZERO_NI; + case SYS_ID_AA64MMFR4_EL1: + return ID_AA64MMFR4_EL1_NONZERO_NI; default: /* Unhandled ID register, RAZ */ return 0; } } +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { - return pvm_read_id_reg(vcpu, reg_to_encoding(r)); + struct kvm *kvm = vcpu->kvm; + u32 reg = reg_to_encoding(r); + + if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))) + return 0; + + if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7)) + return kvm->arch.id_regs[IDREG_IDX(reg)]; + + return 0; } /* Handler to RAZ/WI sysregs */ @@ -271,13 +304,6 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, return false; } - /* - * No support for AArch32 guests, therefore, pKVM has no sanitized copy - * of AArch32 feature id registers. - */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_EL1_IMP); - return pvm_access_raz_wi(vcpu, p, r); } @@ -449,6 +475,30 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { }; /* + * Initializes feature registers for protected vms. + */ +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch *ka = &kvm->arch; + u32 r; + + hyp_assert_lock_held(&vm_table_lock); + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + /* + * Initialize only AArch64 id registers since AArch32 isn't supported + * for protected VMs. + */ + for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1)) + ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r); + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); +} + +/* * Checks that the sysreg table is unique and in-order. * * Returns 0 if the table is consistent, or 1 otherwise. diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index dba101565de3..3cc613cce5f5 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -28,7 +28,9 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) { - __sysreg_restore_el1_state(ctxt, ctxt_sys_reg(ctxt, MPIDR_EL1)); + u64 midr = ctxt_midr_el1(ctxt); + + __sysreg_restore_el1_state(ctxt, midr, ctxt_sys_reg(ctxt, MPIDR_EL1)); __sysreg_restore_common_state(ctxt); __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index 3aaab20ae5b4..ff176f4ce7de 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -22,15 +22,16 @@ void __kvm_timer_set_cntvoff(u64 cntvoff) */ void __timer_disable_traps(struct kvm_vcpu *vcpu) { - u64 val, shift = 0; + u64 set, clr, shift = 0; if (has_hvhe()) shift = 10; /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; - write_sysreg(val, cnthctl_el2); + set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; + clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); } /* @@ -58,5 +59,12 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu) set <<= 10; } + /* + * Trap the virtual counter/timer if we have a broken cntvoff + * implementation. + */ + if (has_broken_cntvoff()) + set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + sysreg_clear_set(cnthctl_el2, clr, set); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 40bd55966540..c351b4abd5db 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -11,12 +11,6 @@ #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> - -#define KVM_PTE_TYPE BIT(1) -#define KVM_PTE_TYPE_BLOCK 0 -#define KVM_PTE_TYPE_PAGE 1 -#define KVM_PTE_TYPE_TABLE 1 - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; @@ -35,14 +29,6 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); } -static bool kvm_phys_is_valid(u64 phys) -{ - u64 parange_max = kvm_get_parange_max(); - u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max); - - return phys < BIT(shift); -} - static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) { u64 granule = kvm_granule_size(ctx->level); @@ -53,7 +39,7 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, if (granule > (ctx->end - ctx->addr)) return false; - if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) + if (!IS_ALIGNED(phys, granule)) return false; return IS_ALIGNED(ctx->addr, granule); @@ -587,6 +573,9 @@ struct stage2_map_data { /* Force mappings to page granularity */ bool force_pte; + + /* Walk should update owner_id only */ + bool annotation; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) @@ -885,18 +874,7 @@ static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, { u64 phys = data->phys; - /* - * Stage-2 walks to update ownership data are communicated to the map - * walker using an invalid PA. Avoid offsetting an already invalid PA, - * which could overflow and make the address valid again. - */ - if (!kvm_phys_is_valid(phys)) - return phys; - - /* - * Otherwise, work out the correct PA based on how far the walk has - * gotten. - */ + /* Work out the correct PA based on how far the walk has gotten */ return phys + (ctx->addr - ctx->start); } @@ -908,6 +886,9 @@ static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; + if (data->annotation) + return true; + return kvm_block_mapping_supported(ctx, phys); } @@ -923,7 +904,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; - if (kvm_phys_is_valid(phys)) + if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id); @@ -1085,11 +1066,11 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, { int ret; struct stage2_map_data map_data = { - .phys = KVM_PHYS_INVALID, .mmu = pgt->mmu, .memcache = mc, .owner_id = owner_id, .force_pte = true, + .annotation = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -1245,14 +1226,13 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) NULL, NULL, 0); } -void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) { int ret; ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - NULL, NULL, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + NULL, NULL, flags); if (!ret) dsb(ishst); } @@ -1308,7 +1288,7 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, } int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { int ret; s8 level; @@ -1326,9 +1306,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 3f9741e51d41..f162b0df5cae 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -18,7 +18,7 @@ #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) -static u64 __gic_v3_get_lr(unsigned int lr) +u64 __gic_v3_get_lr(unsigned int lr) { switch (lr & 0xf) { case 0: @@ -218,7 +218,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) elrsr = read_gicreg(ICH_ELRSR_EL2); - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); + write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) @@ -274,7 +274,7 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 * so that the trap bits can take effect. Yes, we *loves* the GIC. */ - if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) { write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); isb(); } else if (!cpu_if->vgic_sre) { @@ -429,23 +429,27 @@ u64 __vgic_v3_get_gic_config(void) /* * To check whether we have a MMIO-based (GICv2 compatible) * CPU interface, we need to disable the system register - * view. To do that safely, we have to prevent any interrupt - * from firing (which would be deadly). + * view. * - * Note that this only makes sense on VHE, as interrupts are - * already masked for nVHE as part of the exception entry to - * EL2. - */ - if (has_vhe()) - flags = local_daif_save(); - - /* * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates * that to be able to set ICC_SRE_EL1.SRE to 0, all the * interrupt overrides must be set. You've got to love this. + * + * As we always run VHE with HCR_xMO set, no extra xMO + * manipulation is required in that case. + * + * To safely disable SRE, we have to prevent any interrupt + * from firing (which would be deadly). This only makes sense + * on VHE, as interrupts are already masked for nVHE as part + * of the exception entry to EL2. */ - sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); - isb(); + if (has_vhe()) { + flags = local_daif_save(); + } else { + sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); + } + write_gicreg(0, ICC_SRE_EL1); isb(); @@ -453,11 +457,13 @@ u64 __vgic_v3_get_gic_config(void) write_gicreg(sre, ICC_SRE_EL1); isb(); - sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); - isb(); - if (has_vhe()) + if (has_vhe()) { local_daif_restore(flags); + } else { + sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); + } val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); val |= read_gicreg(ICH_VTR_EL2); @@ -752,7 +758,7 @@ static void __vgic_v3_bump_eoicount(void) u32 hcr; hcr = read_gicreg(ICH_HCR_EL2); - hcr += 1 << ICH_HCR_EOIcount_SHIFT; + hcr += 1 << ICH_HCR_EL2_EOIcount_SHIFT; write_gicreg(hcr, ICH_HCR_EL2); } @@ -1052,11 +1058,11 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, switch (sysreg) { case SYS_ICC_IGRPEN0_EL1: if (is_read && - (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && - (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; @@ -1069,15 +1075,15 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_EOIR0_EL1: case SYS_ICC_HPPIR0_EL1: case SYS_ICC_IAR0_EL1: - return ich_hcr & ICH_HCR_TALL0; + return ich_hcr & ICH_HCR_EL2_TALL0; case SYS_ICC_IGRPEN1_EL1: if (is_read && - (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && - (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; @@ -1090,10 +1096,10 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_EOIR1_EL1: case SYS_ICC_HPPIR1_EL1: case SYS_ICC_IAR1_EL1: - return ich_hcr & ICH_HCR_TALL1; + return ich_hcr & ICH_HCR_EL2_TALL1; case SYS_ICC_DIR_EL1: - if (ich_hcr & ICH_HCR_TDIR) + if (ich_hcr & ICH_HCR_EL2_TDIR) return true; fallthrough; @@ -1101,7 +1107,7 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_RPR_EL1: case SYS_ICC_CTLR_EL1: case SYS_ICC_PMR_EL1: - return ich_hcr & ICH_HCR_TC; + return ich_hcr & ICH_HCR_EL2_TC; default: return false; diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index 289689b2682d..0100339b09e0 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -19,8 +19,3 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 80581b1c3995..c9b330dc2066 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -48,21 +48,46 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); static u64 __compute_hcr(struct kvm_vcpu *vcpu) { + u64 guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2); u64 hcr = vcpu->arch.hcr_el2; if (!vcpu_has_nv(vcpu)) return hcr; + /* + * We rely on the invariant that a vcpu entered from HYP + * context must also exit in the same context, as only an ERET + * instruction can kick us out of it, and we obviously trap + * that sucker. PSTATE.M will get fixed-up on exit. + */ if (is_hyp_ctxt(vcpu)) { + host_data_set_flag(VCPU_IN_HYP_CONTEXT); + hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; if (!vcpu_el2_e2h_is_set(vcpu)) hcr |= HCR_NV1; write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); + } else { + host_data_clear_flag(VCPU_IN_HYP_CONTEXT); + + if (guest_hcr & HCR_NV) { + u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id())); + + /* Inherit the low bits from the actual register */ + va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0); + write_sysreg_s(va, SYS_VNCR_EL2); + + /* Force NV2 in case the guest is forgetful... */ + guest_hcr |= HCR_NV2; + } } - return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE); + BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && + host_data_test_flag(L1_VNCR_MAPPED)); + + return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } static void __activate_cptr_traps(struct kvm_vcpu *vcpu) @@ -77,12 +102,12 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM * shift value for trapping the AMU accesses. */ - u64 val = CPACR_ELx_TTA | CPTR_EL2_TAM; + u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; if (guest_owns_fp_regs()) { - val |= CPACR_ELx_FPEN; + val |= CPACR_EL1_FPEN; if (vcpu_has_sve(vcpu)) - val |= CPACR_ELx_ZEN; + val |= CPACR_EL1_ZEN; } else { __activate_traps_fpsimd32(vcpu); } @@ -122,13 +147,13 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) * hypervisor has traps enabled to dispel any illusion of something more * complicated taking place. */ - if (!(SYS_FIELD_GET(CPACR_ELx, FPEN, cptr) & BIT(0))) - val &= ~CPACR_ELx_FPEN; - if (!(SYS_FIELD_GET(CPACR_ELx, ZEN, cptr) & BIT(0))) - val &= ~CPACR_ELx_ZEN; + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - val |= cptr & CPACR_ELx_E0POE; + val |= cptr & CPACR_EL1_E0POE; val |= cptr & CPTR_EL2_TCPAC; @@ -136,6 +161,16 @@ write: write_sysreg(val, cpacr_el1); } +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; + + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN_EL1EN; + + write_sysreg(val, cpacr_el1); +} + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -174,7 +209,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) ___deactivate_traps(vcpu); - write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); if (has_cntpoff()) { struct timer_map map; @@ -207,7 +242,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); @@ -256,6 +291,110 @@ void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; } +static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + unsigned long ctl; + u64 cval, cnt; + bool stat; + + switch (reg) { + case CNTP_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + cnt = compute_counter_value(vcpu_ptimer(vcpu)); + break; + case CNTV_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); + cnt = compute_counter_value(vcpu_vtimer(vcpu)); + break; + default: + BUG(); + } + + stat = cval <= cnt; + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); + + return ctl; +} + +static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr, val; + + /* + * Having FEAT_ECV allows for a better quality of timer emulation. + * However, this comes at a huge cost in terms of traps. Try and + * satisfy the reads from guest's hypervisor context without + * returning to the kernel if we can. + */ + if (!is_hyp_ctxt(vcpu)) + return false; + + esr = kvm_vcpu_get_esr(vcpu); + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) + return false; + + switch (esr_sys64_to_sysreg(esr)) { + case SYS_CNTP_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTP_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + break; + case SYS_CNTP_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el0(SYS_CNTP_CVAL); + + if (!has_cntpoff()) + val -= timer_get_offset(vcpu_hptimer(vcpu)); + } else { + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + } + break; + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + val = compute_counter_value(vcpu_hptimer(vcpu)); + break; + case SYS_CNTV_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTV_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CVAL); + else + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + val = compute_counter_value(vcpu_hvtimer(vcpu)); + break; + default: + return false; + } + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + + return true; +} + static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr = kvm_vcpu_get_esr(vcpu); @@ -309,14 +448,6 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) return true; } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) -{ - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); - - if (kvm_has_fpmr(vcpu->kvm)) - **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); -} - static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) { int ret = -EINVAL; @@ -353,6 +484,14 @@ static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) if (ret) return false; + /* + * If we have to check for any VNCR mapping being invalidated, + * go back to the slow path for further processing. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) && + atomic_read(&vcpu->kvm->arch.vncr_map_count)) + return false; + __kvm_skip_instr(vcpu); return true; @@ -409,6 +548,9 @@ static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) return true; + if (kvm_hyp_handle_timer(vcpu, exit_code)) + return true; + if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) return true; @@ -418,6 +560,25 @@ static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) return kvm_hyp_handle_sysreg(vcpu, exit_code); } +static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 iss; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return false; + + /* + * Compute a synthetic ESR for a sysreg trap. Conveniently, AFSR1_EL2 + * is populated with a correct ISS for a sysreg trap. These fruity + * parts are 64bit only, so unconditionally set IL. + */ + iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2)); + vcpu->arch.fault.esr_el2 = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64) | + FIELD_PREP(ESR_ELx_ISS_MASK, iss) | + ESR_ELx_IL; + return false; +} + static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, @@ -429,20 +590,23 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_WATCHPT_LOW] = kvm_hyp_handle_watchpt_low, [ESR_ELx_EC_ERET] = kvm_hyp_handle_eret, [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, + + /* Apple shenanigans */ + [0x3F] = kvm_hyp_handle_impdef, }; -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - return hyp_exit_handlers; -} + synchronize_vcpu_pstate(vcpu, exit_code); -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) -{ /* * If we were in HYP context on entry, adjust the PSTATE view - * so that the usual helpers work correctly. + * so that the usual helpers work correctly. This enforces our + * invariant that the guest's HYP context status is preserved + * across a run. */ - if (vcpu_has_nv(vcpu) && (read_sysreg(hcr_el2) & HCR_NV)) { + if (vcpu_has_nv(vcpu) && + unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) { u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { @@ -457,6 +621,12 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); *vcpu_cpsr(vcpu) |= mode; } + + /* Apply extreme paranoia! */ + BUG_ON(vcpu_has_nv(vcpu) && + !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu)); + + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } /* Switch to the guest for VHE systems running in EL2 */ @@ -471,6 +641,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_save_host_state_vhe(host_ctxt); + fpsimd_lazy_switch_to_guest(vcpu); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -495,6 +667,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); + fpsimd_lazy_switch_to_host(vcpu); + sysreg_restore_host_state_vhe(host_ctxt); if (guest_owns_fp_regs()) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 5f78a39053a7..3814b0b2c937 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -87,11 +87,12 @@ static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); - write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); - write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); - write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2); + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); if (vcpu_el2_e2h_is_set(vcpu)) { /* @@ -191,7 +192,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - u64 mpidr; + u64 midr, mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); @@ -216,27 +217,22 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); - if (unlikely(__is_hyp_ctxt(guest_ctxt))) { + if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); } else { if (vcpu_has_nv(vcpu)) { /* - * Use the guest hypervisor's VPIDR_EL2 when in a - * nested state. The hardware value of MIDR_EL1 gets - * restored on put. - */ - write_sysreg(ctxt_sys_reg(guest_ctxt, VPIDR_EL2), vpidr_el2); - - /* * As we're restoring a nested guest, set the value * provided by the guest hypervisor. */ + midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2); mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); } else { + midr = ctxt_midr_el1(guest_ctxt); mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); } - __sysreg_restore_el1_state(guest_ctxt, mpidr); + __sysreg_restore_el1_state(guest_ctxt, midr, mpidr); } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); @@ -260,7 +256,7 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); - if (unlikely(__is_hyp_ctxt(guest_ctxt))) + if (unlikely(is_hyp_ctxt(vcpu))) __sysreg_save_vel2_state(vcpu); else __sysreg_save_el1_state(guest_ctxt); @@ -271,9 +267,5 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); - /* If leaving a nesting guest, restore MIDR_EL1 default view */ - if (vcpu_has_nv(vcpu)) - write_sysreg(read_cpuid_id(), vpidr_el2); - vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 3d50a1bd2bdb..ec2569818629 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -63,7 +63,7 @@ static void enter_vmid_context(struct kvm_s2_mmu *mmu, __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; - write_sysreg(val, hcr_el2); + write_sysreg_hcr(val); isb(); } @@ -73,7 +73,7 @@ static void exit_vmid_context(struct tlb_inv_context *cxt) * We're done with the TLB operation, let's restore the host's * view of HCR_EL2. */ - write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); isb(); /* ... and the stage-2 MMU context that we switched away from */ |