Diffstat (limited to 'arch')
189 files changed, 13150 insertions, 2957 deletions
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 3df5f2dd4c0f..8f1f18adcdb5 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -150,6 +150,8 @@
 #define SO_RCVPRIORITY 82
+#define SO_PASSRIGHTS 83
+
 #if !defined(__KERNEL__)
 #if __BITS_PER_LONG == 64
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 25ed6f1a7c7a..3072731fe09c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1380,8 +1380,7 @@ config CC_HAVE_STACKPROTECTOR_TLS
 config STACKPROTECTOR_PER_TASK
 	bool "Use a unique stack canary value for each task"
 	depends on STACKPROTECTOR && CURRENT_POINTER_IN_TPIDRURO && !XIP_DEFLATED_DATA
-	depends on GCC_PLUGINS || CC_HAVE_STACKPROTECTOR_TLS
-	select GCC_PLUGIN_ARM_SSP_PER_TASK if !CC_HAVE_STACKPROTECTOR_TLS
+	depends on CC_HAVE_STACKPROTECTOR_TLS
 	default y
 	help
 	  Due to the fact that GCC uses an ordinary symbol reference from
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 945b5975fce2..d61369b1eabe 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -96,7 +96,7 @@ KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin \
 	-I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
-	-I$(obj) $(DISABLE_ARM_SSP_PER_TASK_PLUGIN)
+	-I$(obj)
 ccflags-remove-$(CONFIG_FUNCTION_TRACER) += -pg
 asflags-y := -DZIMAGE
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a182295e6f08..c314eb429b9f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -42,6 +42,7 @@ config ARM64
 	select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT
+	select ARCH_HAS_PREEMPT_LAZY
 	select ARCH_HAS_PTDUMP
 	select ARCH_HAS_PTE_DEVMAP
 	select ARCH_HAS_PTE_SPECIAL
@@ -134,7 +135,6 @@ config ARM64
 	select COMMON_CLK
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select CPUMASK_OFFSTACK if NR_CPUS > 256
-	select CRC32
 	select DCACHE_WORD_ACCESS
 	select DYNAMIC_FTRACE if FUNCTION_TRACER
 	select DMA_BOUNCE_UNALIGNED_KMALLOC
@@ -333,9 +333,9 @@ config ARCH_MMAP_RND_BITS_MAX
 	default 24 if ARM64_VA_BITS=39
 	default 27 if ARM64_VA_BITS=42
 	default 30 if ARM64_VA_BITS=47
-	default 29 if ARM64_VA_BITS=48 && ARM64_64K_PAGES
-	default 31 if ARM64_VA_BITS=48 && ARM64_16K_PAGES
-	default 33 if ARM64_VA_BITS=48
+	default 29 if (ARM64_VA_BITS=48 || ARM64_VA_BITS=52) && ARM64_64K_PAGES
+	default 31 if (ARM64_VA_BITS=48 || ARM64_VA_BITS=52) && ARM64_16K_PAGES
+	default 33 if (ARM64_VA_BITS=48 || ARM64_VA_BITS=52)
 	default 14 if ARM64_64K_PAGES
 	default 16 if ARM64_16K_PAGES
 	default 18
@@ -464,6 +464,23 @@ config AMPERE_ERRATUM_AC03_CPU_38
 	  If unsure, say Y.
+config AMPERE_ERRATUM_AC04_CPU_23
+	bool "AmpereOne: AC04_CPU_23: Failure to synchronize writes to HCR_EL2 may corrupt address translations."
+	default y
+	help
+	  This option adds an alternative code sequence to work around Ampere
+	  errata AC04_CPU_23 on AmpereOne.
+
+	  Updates to HCR_EL2 can rarely corrupt simultaneous translations for
+	  data addresses initiated by load/store instructions. Only
+	  instruction initiated translations are vulnerable, not translations
+	  from prefetches for example. A DSB before the store to HCR_EL2 is
+	  sufficient to prevent older instructions from hitting the window
+	  for corruption, and an ISB after is sufficient to prevent younger
+	  instructions from hitting the window for corruption.
+
+	  If unsure, say Y.
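The help text above describes the same DSB/ISB bracketing that the write_sysreg_hcr() fallback added to arch/arm64/include/asm/sysreg.h later in this diff emits. A minimal sketch of that sequence, with a hypothetical wrapper name that is not part of the patch:

/*
 * Sketch only: bracket the HCR_EL2 update as the erratum text describes.
 * The asm string mirrors the write_sysreg_hcr() path below; the wrapper
 * name is invented for illustration.
 */
static inline void hcr_el2_write_erratum_safe(u64 val)
{
	/* DSB before the store, ISB after, per AC04_CPU_23 */
	asm volatile("dsb nsh; msr hcr_el2, %x0; isb" : : "rZ" (val));
}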
+ config ARM64_WORKAROUND_CLEAN_CACHE bool @@ -2285,7 +2302,6 @@ config ARM64_SME bool "ARM Scalable Matrix Extension support" default y depends on ARM64_SVE - depends on BROKEN help The Scalable Matrix Extension (SME) is an extension to the AArch64 execution state which utilises a substantial subset of the SVE diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 4936fa5b98ff..8eddf0c96098 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -3752,60 +3752,83 @@ }; gpu_opp_table: opp-table { - compatible = "operating-points-v2"; + compatible = "operating-points-v2-adreno", "operating-points-v2"; + + opp-1250000000 { + opp-hz = /bits/ 64 <1250000000>; + opp-level = <RPMH_REGULATOR_LEVEL_TURBO_L3>; + opp-peak-kBps = <16500000>; + qcom,opp-acd-level = <0xa82a5ffd>; + }; + + opp-1175000000 { + opp-hz = /bits/ 64 <1175000000>; + opp-level = <RPMH_REGULATOR_LEVEL_TURBO_L2>; + opp-peak-kBps = <14398438>; + qcom,opp-acd-level = <0xa82a5ffd>; + }; opp-1100000000 { opp-hz = /bits/ 64 <1100000000>; opp-level = <RPMH_REGULATOR_LEVEL_TURBO_L1>; - opp-peak-kBps = <16500000>; + opp-peak-kBps = <14398438>; + qcom,opp-acd-level = <0xa82a5ffd>; }; opp-1000000000 { opp-hz = /bits/ 64 <1000000000>; opp-level = <RPMH_REGULATOR_LEVEL_TURBO>; opp-peak-kBps = <14398438>; + qcom,opp-acd-level = <0xa82b5ffd>; }; opp-925000000 { opp-hz = /bits/ 64 <925000000>; opp-level = <RPMH_REGULATOR_LEVEL_NOM_L1>; opp-peak-kBps = <14398438>; + qcom,opp-acd-level = <0xa82b5ffd>; }; opp-800000000 { opp-hz = /bits/ 64 <800000000>; opp-level = <RPMH_REGULATOR_LEVEL_NOM>; opp-peak-kBps = <12449219>; + qcom,opp-acd-level = <0xa82c5ffd>; }; opp-744000000 { opp-hz = /bits/ 64 <744000000>; opp-level = <RPMH_REGULATOR_LEVEL_SVS_L2>; opp-peak-kBps = <10687500>; + qcom,opp-acd-level = <0x882e5ffd>; }; opp-687000000 { opp-hz = /bits/ 64 <687000000>; opp-level = <RPMH_REGULATOR_LEVEL_SVS_L1>; opp-peak-kBps = <8171875>; + qcom,opp-acd-level = <0x882e5ffd>; }; opp-550000000 { opp-hz = /bits/ 64 <550000000>; opp-level = <RPMH_REGULATOR_LEVEL_SVS>; opp-peak-kBps = <6074219>; + qcom,opp-acd-level = <0xc0285ffd>; }; opp-390000000 { opp-hz = /bits/ 64 <390000000>; opp-level = <RPMH_REGULATOR_LEVEL_LOW_SVS>; opp-peak-kBps = <3000000>; + qcom,opp-acd-level = <0xc0285ffd>; }; opp-300000000 { opp-hz = /bits/ 64 <300000000>; opp-level = <RPMH_REGULATOR_LEVEL_LOW_SVS_D1>; opp-peak-kBps = <2136719>; + qcom,opp-acd-level = <0xc02b5ffd>; }; }; }; diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 81e4157f92b7..71493b760b83 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -44,6 +44,7 @@ struct cpuinfo_arm64 { u64 reg_dczid; u64 reg_midr; u64 reg_revidr; + u64 reg_aidr; u64 reg_gmid; u64 reg_smidr; u64 reg_mpamidr; diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index dffff6763812..661735616787 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -134,6 +134,7 @@ #define HISI_CPU_PART_TSV110 0xD01 #define HISI_CPU_PART_HIP09 0xD02 +#define HISI_CPU_PART_HIP12 0xD06 #define APPLE_CPU_PART_M1_ICESTORM 0x022 #define APPLE_CPU_PART_M1_FIRESTORM 0x023 @@ -222,6 +223,7 @@ #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09) +#define MIDR_HISI_HIP12 
MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP12) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) #define MIDR_APPLE_M1_ICESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_PRO) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index d40e427ddad9..1e7c7475e43f 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -38,7 +38,7 @@ orr x0, x0, #HCR_E2H .LnVHE_\@: - msr hcr_el2, x0 + msr_hcr_el2 x0 isb .endm @@ -204,26 +204,28 @@ orr x0, x0, #(1 << 62) .Lskip_spe_fgt_\@: + +.Lset_debug_fgt_\@: msr_s SYS_HDFGRTR_EL2, x0 msr_s SYS_HDFGWTR_EL2, x0 mov x0, xzr mrs x1, id_aa64pfr1_el1 ubfx x1, x1, #ID_AA64PFR1_EL1_SME_SHIFT, #4 - cbz x1, .Lskip_debug_fgt_\@ + cbz x1, .Lskip_sme_fgt_\@ /* Disable nVHE traps of TPIDR2 and SMPRI */ - orr x0, x0, #HFGxTR_EL2_nSMPRI_EL1_MASK - orr x0, x0, #HFGxTR_EL2_nTPIDR2_EL0_MASK + orr x0, x0, #HFGRTR_EL2_nSMPRI_EL1_MASK + orr x0, x0, #HFGRTR_EL2_nTPIDR2_EL0_MASK -.Lskip_debug_fgt_\@: +.Lskip_sme_fgt_\@: mrs_s x1, SYS_ID_AA64MMFR3_EL1 ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 cbz x1, .Lskip_pie_fgt_\@ /* Disable trapping of PIR_EL1 / PIRE0_EL1 */ - orr x0, x0, #HFGxTR_EL2_nPIR_EL1 - orr x0, x0, #HFGxTR_EL2_nPIRE0_EL1 + orr x0, x0, #HFGRTR_EL2_nPIR_EL1 + orr x0, x0, #HFGRTR_EL2_nPIRE0_EL1 .Lskip_pie_fgt_\@: mrs_s x1, SYS_ID_AA64MMFR3_EL1 @@ -231,17 +233,19 @@ cbz x1, .Lskip_poe_fgt_\@ /* Disable trapping of POR_EL0 */ - orr x0, x0, #HFGxTR_EL2_nPOR_EL0 + orr x0, x0, #HFGRTR_EL2_nPOR_EL0 .Lskip_poe_fgt_\@: /* GCS depends on PIE so we don't check it if PIE is absent */ mrs_s x1, SYS_ID_AA64PFR1_EL1 ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 - cbz x1, .Lset_fgt_\@ + cbz x1, .Lskip_gce_fgt_\@ /* Disable traps of access to GCS registers at EL0 and EL1 */ - orr x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK - orr x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK + orr x0, x0, #HFGRTR_EL2_nGCS_EL1_MASK + orr x0, x0, #HFGRTR_EL2_nGCS_EL0_MASK + +.Lskip_gce_fgt_\@: .Lset_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index e4f77757937e..e1deed824464 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -20,7 +20,8 @@ #define ESR_ELx_EC_FP_ASIMD UL(0x07) #define ESR_ELx_EC_CP10_ID UL(0x08) /* EL2 only */ #define ESR_ELx_EC_PAC UL(0x09) /* EL2 and above */ -/* Unallocated EC: 0x0A - 0x0B */ +#define ESR_ELx_EC_OTHER UL(0x0A) +/* Unallocated EC: 0x0B */ #define ESR_ELx_EC_CP14_64 UL(0x0C) #define ESR_ELx_EC_BTI UL(0x0D) #define ESR_ELx_EC_ILL UL(0x0E) @@ -99,6 +100,8 @@ #define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) /* Shared ISS field definitions for Data/Instruction aborts */ +#define ESR_ELx_VNCR_SHIFT (13) +#define ESR_ELx_VNCR (UL(1) << ESR_ELx_VNCR_SHIFT) #define ESR_ELx_SET_SHIFT (11) #define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) #define ESR_ELx_FnV_SHIFT (10) @@ -181,6 +184,13 @@ #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) #define ESR_ELx_xVC_IMM_MASK ((UL(1) << 16) - 1) +/* ISS definitions for LD64B/ST64B/{T,P}SBCSYNC instructions */ +#define ESR_ELx_ISS_OTHER_ST64BV (0) +#define ESR_ELx_ISS_OTHER_ST64BV0 (1) +#define ESR_ELx_ISS_OTHER_LDST64B (2) +#define ESR_ELx_ISS_OTHER_TSBCSYNC (3) +#define ESR_ELx_ISS_OTHER_PSBCSYNC (4) + #define DISR_EL1_IDS (UL(1) << 24) /* * DISR_EL1 and ESR_ELx share the bottom 13 bits, but the RES0 bits may mean @@ -378,12 +388,14 @@ /* * 
ISS values for SME traps */ +#define ESR_ELx_SME_ISS_SMTC_MASK GENMASK(2, 0) +#define ESR_ELx_SME_ISS_SMTC(esr) ((esr) & ESR_ELx_SME_ISS_SMTC_MASK) -#define ESR_ELx_SME_ISS_SME_DISABLED 0 -#define ESR_ELx_SME_ISS_ILL 1 -#define ESR_ELx_SME_ISS_SM_DISABLED 2 -#define ESR_ELx_SME_ISS_ZA_DISABLED 3 -#define ESR_ELx_SME_ISS_ZT_DISABLED 4 +#define ESR_ELx_SME_ISS_SMTC_SME_DISABLED 0 +#define ESR_ELx_SME_ISS_SMTC_ILL 1 +#define ESR_ELx_SME_ISS_SMTC_SM_DISABLED 2 +#define ESR_ELx_SME_ISS_SMTC_ZA_DISABLED 3 +#define ESR_ELx_SME_ISS_SMTC_ZT_DISABLED 4 /* ISS field definitions for MOPS exceptions */ #define ESR_ELx_MOPS_ISS_MEM_INST (UL(1) << 24) @@ -440,6 +452,11 @@ static inline bool esr_is_cfi_brk(unsigned long esr) (esr_brk_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE; } +static inline bool esr_is_ubsan_brk(unsigned long esr) +{ + return (esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM; +} + static inline bool esr_fsc_is_translation_fault(unsigned long esr) { esr = esr & ESR_ELx_FSC; diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 87e307804b99..635a43c4ec85 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -48,6 +48,12 @@ enum fixed_addresses { FIX_EARLYCON_MEM_BASE, FIX_TEXT_POKE0, +#ifdef CONFIG_KVM + /* One slot per CPU, mapping the guest's VNCR page at EL2. */ + FIX_VNCR_END, + FIX_VNCR = FIX_VNCR_END + NR_CPUS, +#endif + #ifdef CONFIG_ACPI_APEI_GHES /* Used for GHES mapping from assorted contexts */ FIX_APEI_GHES_IRQ, diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 564bc09b3e06..b8cf0ea43cc0 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -6,6 +6,7 @@ #define __ASM_FP_H #include <asm/errno.h> +#include <asm/percpu.h> #include <asm/ptrace.h> #include <asm/processor.h> #include <asm/sigcontext.h> @@ -76,7 +77,6 @@ extern void fpsimd_load_state(struct user_fpsimd_state *state); extern void fpsimd_thread_switch(struct task_struct *next); extern void fpsimd_flush_thread(void); -extern void fpsimd_signal_preserve_current_state(void); extern void fpsimd_preserve_current_state(void); extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); @@ -93,9 +93,12 @@ struct cpu_fp_state { enum fp_type to_save; }; +DECLARE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); + extern void fpsimd_bind_state_to_cpu(struct cpu_fp_state *fp_state); extern void fpsimd_flush_task_state(struct task_struct *target); +extern void fpsimd_save_and_flush_current_state(void); extern void fpsimd_save_and_flush_cpu_state(void); static inline bool thread_sm_enabled(struct thread_struct *thread) @@ -108,6 +111,8 @@ static inline bool thread_za_enabled(struct thread_struct *thread) return system_supports_sme() && (thread->svcr & SVCR_ZA_MASK); } +extern void task_smstop_sm(struct task_struct *task); + /* Maximum VL that SVE/SME VL-agnostic software can transparently support */ #define VL_ARCH_MAX 0x100 @@ -195,10 +200,8 @@ struct vl_info { extern void sve_alloc(struct task_struct *task, bool flush); extern void fpsimd_release_task(struct task_struct *task); -extern void fpsimd_sync_to_sve(struct task_struct *task); -extern void fpsimd_force_sync_to_sve(struct task_struct *task); -extern void sve_sync_to_fpsimd(struct task_struct *task); -extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task); +extern void fpsimd_sync_from_effective_state(struct task_struct *task); +extern void 
fpsimd_sync_to_effective_state_zeropad(struct task_struct *task); extern int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags); @@ -292,14 +295,29 @@ static inline bool sve_vq_available(unsigned int vq) return vq_available(ARM64_VEC_SVE, vq); } -size_t sve_state_size(struct task_struct const *task); +static inline size_t __sve_state_size(unsigned int sve_vl, unsigned int sme_vl) +{ + unsigned int vl = max(sve_vl, sme_vl); + return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +} + +/* + * Return how many bytes of memory are required to store the full SVE + * state for task, given task's currently configured vector length. + */ +static inline size_t sve_state_size(struct task_struct const *task) +{ + unsigned int sve_vl = task_get_sve_vl(task); + unsigned int sme_vl = task_get_sme_vl(task); + return __sve_state_size(sve_vl, sme_vl); +} #else /* ! CONFIG_ARM64_SVE */ static inline void sve_alloc(struct task_struct *task, bool flush) { } static inline void fpsimd_release_task(struct task_struct *task) { } -static inline void sve_sync_to_fpsimd(struct task_struct *task) { } -static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { } +static inline void fpsimd_sync_from_effective_state(struct task_struct *task) { } +static inline void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { } static inline int sve_max_virtualisable_vl(void) { @@ -333,6 +351,11 @@ static inline void vec_update_vq_map(enum vec_type t) { } static inline int vec_verify_vq_map(enum vec_type t) { return 0; } static inline void sve_setup(void) { } +static inline size_t __sve_state_size(unsigned int sve_vl, unsigned int sme_vl) +{ + return 0; +} + static inline size_t sve_state_size(struct task_struct const *task) { return 0; @@ -385,6 +408,16 @@ extern int sme_set_current_vl(unsigned long arg); extern int sme_get_current_vl(void); extern void sme_suspend_exit(void); +static inline size_t __sme_state_size(unsigned int sme_vl) +{ + size_t size = ZA_SIG_REGS_SIZE(sve_vq_from_vl(sme_vl)); + + if (system_supports_sme2()) + size += ZT_SIG_REG_SIZE; + + return size; +} + /* * Return how many bytes of memory are required to store the full SME * specific state for task, given task's currently configured vector @@ -392,15 +425,7 @@ extern void sme_suspend_exit(void); */ static inline size_t sme_state_size(struct task_struct const *task) { - unsigned int vl = task_get_sme_vl(task); - size_t size; - - size = ZA_SIG_REGS_SIZE(sve_vq_from_vl(vl)); - - if (system_supports_sme2()) - size += ZT_SIG_REG_SIZE; - - return size; + return __sme_state_size(task_get_sme_vl(task)); } #else @@ -421,6 +446,11 @@ static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; } static inline int sme_get_current_vl(void) { return -EINVAL; } static inline void sme_suspend_exit(void) { } +static inline size_t __sme_state_size(unsigned int sme_vl) +{ + return 0; +} + static inline size_t sme_state_size(struct task_struct const *task) { return 0; diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index cbfa7b6f2e09..77d6b8c63d4e 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -41,7 +41,7 @@ do { \ \ ___hcr = read_sysreg(hcr_el2); \ if (!(___hcr & HCR_TGE)) { \ - write_sysreg(___hcr | HCR_TGE, hcr_el2); \ + write_sysreg_hcr(___hcr | HCR_TGE); \ isb(); \ } \ /* \ @@ -82,7 +82,7 @@ do { \ */ \ barrier(); \ if (!___ctx->cnt && !(___hcr & HCR_TGE)) \ - write_sysreg(___hcr, hcr_el2); \ + 
write_sysreg_hcr(___hcr); \ } while (0) static inline void ack_bad_irq(unsigned int irq) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 07fbf5bf85a7..2a8155c4a882 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -69,29 +69,38 @@ extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #include <asm-generic/hugetlb.h> -#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE -static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, - unsigned long start, - unsigned long end) +static inline void __flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long stride, + bool last_level) { - unsigned long stride = huge_page_size(hstate_vma(vma)); - switch (stride) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - __flush_tlb_range(vma, start, end, PUD_SIZE, false, 1); + __flush_tlb_range(vma, start, end, PUD_SIZE, last_level, 1); break; #endif case CONT_PMD_SIZE: case PMD_SIZE: - __flush_tlb_range(vma, start, end, PMD_SIZE, false, 2); + __flush_tlb_range(vma, start, end, PMD_SIZE, last_level, 2); break; case CONT_PTE_SIZE: - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 3); + __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, 3); break; default: - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); + __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, TLBI_TTL_UNKNOWN); } } +#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + unsigned long stride = huge_page_size(hstate_vma(vma)); + + __flush_hugetlb_tlb_range(vma, start, end, stride, false); +} + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index e9c8a581e16f..1da290aeedce 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -12,67 +12,70 @@ #include <asm/sysreg.h> #include <asm/types.h> -/* Hyp Configuration Register (HCR) bits */ - -#define HCR_TID5 (UL(1) << 58) -#define HCR_DCT (UL(1) << 57) -#define HCR_ATA_SHIFT 56 -#define HCR_ATA (UL(1) << HCR_ATA_SHIFT) -#define HCR_TTLBOS (UL(1) << 55) -#define HCR_TTLBIS (UL(1) << 54) -#define HCR_ENSCXT (UL(1) << 53) -#define HCR_TOCU (UL(1) << 52) -#define HCR_AMVOFFEN (UL(1) << 51) -#define HCR_TICAB (UL(1) << 50) -#define HCR_TID4 (UL(1) << 49) -#define HCR_FIEN (UL(1) << 47) -#define HCR_FWB (UL(1) << 46) -#define HCR_NV2 (UL(1) << 45) -#define HCR_AT (UL(1) << 44) -#define HCR_NV1 (UL(1) << 43) -#define HCR_NV (UL(1) << 42) -#define HCR_API (UL(1) << 41) -#define HCR_APK (UL(1) << 40) -#define HCR_TEA (UL(1) << 37) -#define HCR_TERR (UL(1) << 36) -#define HCR_TLOR (UL(1) << 35) -#define HCR_E2H (UL(1) << 34) -#define HCR_ID (UL(1) << 33) -#define HCR_CD (UL(1) << 32) -#define HCR_RW_SHIFT 31 -#define HCR_RW (UL(1) << HCR_RW_SHIFT) -#define HCR_TRVM (UL(1) << 30) -#define HCR_HCD (UL(1) << 29) -#define HCR_TDZ (UL(1) << 28) -#define HCR_TGE (UL(1) << 27) -#define HCR_TVM (UL(1) << 26) -#define HCR_TTLB (UL(1) << 25) -#define HCR_TPU (UL(1) << 24) -#define HCR_TPC (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */ -#define HCR_TSW (UL(1) << 22) -#define HCR_TACR (UL(1) << 21) -#define HCR_TIDCP (UL(1) << 20) -#define HCR_TSC (UL(1) << 19) -#define HCR_TID3 (UL(1) << 18) -#define HCR_TID2 (UL(1) << 17) -#define HCR_TID1 (UL(1) << 16) -#define HCR_TID0 (UL(1) << 15) -#define HCR_TWE (UL(1) << 14) -#define HCR_TWI (UL(1) 
<< 13) -#define HCR_DC (UL(1) << 12) -#define HCR_BSU (3 << 10) -#define HCR_BSU_IS (UL(1) << 10) -#define HCR_FB (UL(1) << 9) -#define HCR_VSE (UL(1) << 8) -#define HCR_VI (UL(1) << 7) -#define HCR_VF (UL(1) << 6) -#define HCR_AMO (UL(1) << 5) -#define HCR_IMO (UL(1) << 4) -#define HCR_FMO (UL(1) << 3) -#define HCR_PTW (UL(1) << 2) -#define HCR_SWIO (UL(1) << 1) -#define HCR_VM (UL(1) << 0) -#define HCR_RES0 ((UL(1) << 48) | (UL(1) << 39)) +/* + * Because I'm terribly lazy and that repainting the whole of the KVM + * code with the proper names is a pain, use a helper to map the names + * inherited from AArch32 with the new fancy nomenclature. One day... + */ +#define __HCR(x) HCR_EL2_##x + +#define HCR_TID5 __HCR(TID5) +#define HCR_DCT __HCR(DCT) +#define HCR_ATA_SHIFT __HCR(ATA_SHIFT) +#define HCR_ATA __HCR(ATA) +#define HCR_TTLBOS __HCR(TTLBOS) +#define HCR_TTLBIS __HCR(TTLBIS) +#define HCR_ENSCXT __HCR(EnSCXT) +#define HCR_TOCU __HCR(TOCU) +#define HCR_AMVOFFEN __HCR(AMVOFFEN) +#define HCR_TICAB __HCR(TICAB) +#define HCR_TID4 __HCR(TID4) +#define HCR_FIEN __HCR(FIEN) +#define HCR_FWB __HCR(FWB) +#define HCR_NV2 __HCR(NV2) +#define HCR_AT __HCR(AT) +#define HCR_NV1 __HCR(NV1) +#define HCR_NV __HCR(NV) +#define HCR_API __HCR(API) +#define HCR_APK __HCR(APK) +#define HCR_TEA __HCR(TEA) +#define HCR_TERR __HCR(TERR) +#define HCR_TLOR __HCR(TLOR) +#define HCR_E2H __HCR(E2H) +#define HCR_ID __HCR(ID) +#define HCR_CD __HCR(CD) +#define HCR_RW __HCR(RW) +#define HCR_TRVM __HCR(TRVM) +#define HCR_HCD __HCR(HCD) +#define HCR_TDZ __HCR(TDZ) +#define HCR_TGE __HCR(TGE) +#define HCR_TVM __HCR(TVM) +#define HCR_TTLB __HCR(TTLB) +#define HCR_TPU __HCR(TPU) +#define HCR_TPC __HCR(TPCP) +#define HCR_TSW __HCR(TSW) +#define HCR_TACR __HCR(TACR) +#define HCR_TIDCP __HCR(TIDCP) +#define HCR_TSC __HCR(TSC) +#define HCR_TID3 __HCR(TID3) +#define HCR_TID2 __HCR(TID2) +#define HCR_TID1 __HCR(TID1) +#define HCR_TID0 __HCR(TID0) +#define HCR_TWE __HCR(TWE) +#define HCR_TWI __HCR(TWI) +#define HCR_DC __HCR(DC) +#define HCR_BSU __HCR(BSU) +#define HCR_BSU_IS __HCR(BSU_IS) +#define HCR_FB __HCR(FB) +#define HCR_VSE __HCR(VSE) +#define HCR_VI __HCR(VI) +#define HCR_VF __HCR(VF) +#define HCR_AMO __HCR(AMO) +#define HCR_IMO __HCR(IMO) +#define HCR_FMO __HCR(FMO) +#define HCR_PTW __HCR(PTW) +#define HCR_SWIO __HCR(SWIO) +#define HCR_VM __HCR(VM) /* * The bits we set in HCR: @@ -312,56 +315,19 @@ GENMASK(15, 0)) /* - * FGT register definitions - * - * RES0 and polarity masks as of DDI0487J.a, to be updated as needed. - * We're not using the generated masks as they are usually ahead of - * the published ARM ARM, which we use as a reference. - * - * Once we get to a point where the two describe the same thing, we'll - * merge the definitions. One day. - */ -#define __HFGRTR_EL2_RES0 HFGxTR_EL2_RES0 -#define __HFGRTR_EL2_MASK GENMASK(49, 0) -#define __HFGRTR_EL2_nMASK ~(__HFGRTR_EL2_RES0 | __HFGRTR_EL2_MASK) - -/* - * The HFGWTR bits are a subset of HFGRTR bits. To ensure we don't miss any - * future additions, define __HFGWTR* macros relative to __HFGRTR* ones. + * Polarity masks for HCRX_EL2, limited to the bits that we know about + * at this point in time. It doesn't mean that we actually *handle* + * them, but that at least those that are not advertised to a guest + * will be RES0 for that guest. 
*/ -#define __HFGRTR_ONLY_MASK (BIT(46) | BIT(42) | BIT(40) | BIT(28) | \ - GENMASK(26, 25) | BIT(21) | BIT(18) | \ - GENMASK(15, 14) | GENMASK(10, 9) | BIT(2)) -#define __HFGWTR_EL2_RES0 (__HFGRTR_EL2_RES0 | __HFGRTR_ONLY_MASK) -#define __HFGWTR_EL2_MASK (__HFGRTR_EL2_MASK & ~__HFGRTR_ONLY_MASK) -#define __HFGWTR_EL2_nMASK ~(__HFGWTR_EL2_RES0 | __HFGWTR_EL2_MASK) - -#define __HFGITR_EL2_RES0 HFGITR_EL2_RES0 -#define __HFGITR_EL2_MASK (BIT(62) | BIT(60) | GENMASK(54, 0)) -#define __HFGITR_EL2_nMASK ~(__HFGITR_EL2_RES0 | __HFGITR_EL2_MASK) - -#define __HDFGRTR_EL2_RES0 HDFGRTR_EL2_RES0 -#define __HDFGRTR_EL2_MASK (BIT(63) | GENMASK(58, 50) | GENMASK(48, 43) | \ - GENMASK(41, 40) | GENMASK(37, 22) | \ - GENMASK(19, 9) | GENMASK(7, 0)) -#define __HDFGRTR_EL2_nMASK ~(__HDFGRTR_EL2_RES0 | __HDFGRTR_EL2_MASK) - -#define __HDFGWTR_EL2_RES0 HDFGWTR_EL2_RES0 -#define __HDFGWTR_EL2_MASK (GENMASK(57, 52) | GENMASK(50, 48) | \ - GENMASK(46, 44) | GENMASK(42, 41) | \ - GENMASK(37, 35) | GENMASK(33, 31) | \ - GENMASK(29, 23) | GENMASK(21, 10) | \ - GENMASK(8, 7) | GENMASK(5, 0)) -#define __HDFGWTR_EL2_nMASK ~(__HDFGWTR_EL2_RES0 | __HDFGWTR_EL2_MASK) - -#define __HAFGRTR_EL2_RES0 HAFGRTR_EL2_RES0 -#define __HAFGRTR_EL2_MASK (GENMASK(49, 17) | GENMASK(4, 0)) -#define __HAFGRTR_EL2_nMASK ~(__HAFGRTR_EL2_RES0 | __HAFGRTR_EL2_MASK) - -/* Similar definitions for HCRX_EL2 */ -#define __HCRX_EL2_RES0 HCRX_EL2_RES0 -#define __HCRX_EL2_MASK (BIT(6)) -#define __HCRX_EL2_nMASK ~(__HCRX_EL2_RES0 | __HCRX_EL2_MASK) +#define __HCRX_EL2_MASK (BIT_ULL(6)) +#define __HCRX_EL2_nMASK (GENMASK_ULL(24, 14) | \ + GENMASK_ULL(11, 7) | \ + GENMASK_ULL(5, 0)) +#define __HCRX_EL2_RES0 ~(__HCRX_EL2_nMASK | __HCRX_EL2_MASK) +#define __HCRX_EL2_RES1 ~(__HCRX_EL2_nMASK | \ + __HCRX_EL2_MASK | \ + __HCRX_EL2_RES0) /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 08ba91e6fb03..d941abc6b5ee 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -39,7 +39,7 @@ #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS -#define KVM_VCPU_MAX_FEATURES 7 +#define KVM_VCPU_MAX_FEATURES 9 #define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1) #define KVM_REQ_SLEEP \ @@ -53,6 +53,7 @@ #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) #define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9) +#define KVM_REQ_MAP_L1_VNCR_EL2 KVM_ARCH_REQ(10) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -273,11 +274,17 @@ struct kvm_sysreg_masks; enum fgt_group_id { __NO_FGT_GROUP__, - HFGxTR_GROUP, + HFGRTR_GROUP, + HFGWTR_GROUP = HFGRTR_GROUP, HDFGRTR_GROUP, HDFGWTR_GROUP = HDFGRTR_GROUP, HFGITR_GROUP, HAFGRTR_GROUP, + HFGRTR2_GROUP, + HFGWTR2_GROUP = HFGRTR2_GROUP, + HDFGRTR2_GROUP, + HDFGWTR2_GROUP = HDFGRTR2_GROUP, + HFGITR2_GROUP, /* Must be last */ __NR_FGT_GROUP_IDS__ @@ -359,8 +366,8 @@ struct kvm_arch { cpumask_var_t supported_cpus; - /* PMCR_EL0.N value for the guest */ - u8 pmcr_n; + /* Maximum number of counters for the guest */ + u8 nr_pmu_counters; /* Iterator for idreg debugfs */ u8 idreg_debugfs_iter; @@ -389,6 +396,9 @@ struct kvm_arch { /* Masks for VNCR-backed and general EL2 sysregs */ struct kvm_sysreg_masks *sysreg_masks; + /* Count the number of VNCR_EL2 currently mapped */ + atomic_t vncr_map_count; + /* * For an untrusted host VM, 'pkvm.handle' is used to lookup * the associated pKVM 
instance in the hypervisor. @@ -561,6 +571,13 @@ enum vcpu_sysreg { VNCR(HDFGRTR_EL2), VNCR(HDFGWTR_EL2), VNCR(HAFGRTR_EL2), + VNCR(HFGRTR2_EL2), + VNCR(HFGWTR2_EL2), + VNCR(HFGITR2_EL2), + VNCR(HDFGRTR2_EL2), + VNCR(HDFGWTR2_EL2), + + VNCR(VNCR_EL2), VNCR(CNTVOFF_EL2), VNCR(CNTV_CVAL_EL0), @@ -606,6 +623,37 @@ struct kvm_sysreg_masks { } mask[NR_SYS_REGS - __SANITISED_REG_START__]; }; +struct fgt_masks { + const char *str; + u64 mask; + u64 nmask; + u64 res0; +}; + +extern struct fgt_masks hfgrtr_masks; +extern struct fgt_masks hfgwtr_masks; +extern struct fgt_masks hfgitr_masks; +extern struct fgt_masks hdfgrtr_masks; +extern struct fgt_masks hdfgwtr_masks; +extern struct fgt_masks hafgrtr_masks; +extern struct fgt_masks hfgrtr2_masks; +extern struct fgt_masks hfgwtr2_masks; +extern struct fgt_masks hfgitr2_masks; +extern struct fgt_masks hdfgrtr2_masks; +extern struct fgt_masks hdfgwtr2_masks; + +extern struct fgt_masks kvm_nvhe_sym(hfgrtr_masks); +extern struct fgt_masks kvm_nvhe_sym(hfgwtr_masks); +extern struct fgt_masks kvm_nvhe_sym(hfgitr_masks); +extern struct fgt_masks kvm_nvhe_sym(hdfgrtr_masks); +extern struct fgt_masks kvm_nvhe_sym(hdfgwtr_masks); +extern struct fgt_masks kvm_nvhe_sym(hafgrtr_masks); +extern struct fgt_masks kvm_nvhe_sym(hfgrtr2_masks); +extern struct fgt_masks kvm_nvhe_sym(hfgwtr2_masks); +extern struct fgt_masks kvm_nvhe_sym(hfgitr2_masks); +extern struct fgt_masks kvm_nvhe_sym(hdfgrtr2_masks); +extern struct fgt_masks kvm_nvhe_sym(hdfgwtr2_masks); + struct kvm_cpu_context { struct user_pt_regs regs; /* sp = sp_el0 */ @@ -654,6 +702,8 @@ struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 +#define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT 6 +#define KVM_HOST_DATA_FLAG_L1_VNCR_MAPPED 7 unsigned long flags; struct kvm_cpu_context host_ctxt; @@ -730,6 +780,8 @@ struct vcpu_reset_state { bool reset; }; +struct vncr_tlb; + struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; @@ -824,6 +876,9 @@ struct kvm_vcpu_arch { /* Per-vcpu CCSIDR override or NULL */ u32 *ccsidr; + + /* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */ + struct vncr_tlb *vncr_tlb; }; /* @@ -971,20 +1026,22 @@ struct kvm_vcpu_arch { #define vcpu_sve_zcr_elx(vcpu) \ (unlikely(is_hyp_ctxt(vcpu)) ? ZCR_EL2 : ZCR_EL1) -#define vcpu_sve_state_size(vcpu) ({ \ +#define sve_state_size_from_vl(sve_max_vl) ({ \ size_t __size_ret; \ - unsigned int __vcpu_vq; \ + unsigned int __vq; \ \ - if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) { \ + if (WARN_ON(!sve_vl_valid(sve_max_vl))) { \ __size_ret = 0; \ } else { \ - __vcpu_vq = vcpu_sve_max_vq(vcpu); \ - __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq); \ + __vq = sve_vq_from_vl(sve_max_vl); \ + __size_ret = SVE_SIG_REGS_SIZE(__vq); \ } \ \ __size_ret; \ }) +#define vcpu_sve_state_size(vcpu) sve_state_size_from_vl((vcpu)->arch.sve_max_vl) + #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_USE_HW | \ @@ -1550,12 +1607,16 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); kvm_cmp_feat_signed(kvm, id, fld, op, limit) : \ kvm_cmp_feat_unsigned(kvm, id, fld, op, limit)) -#define kvm_has_feat(kvm, id, fld, limit) \ +#define __kvm_has_feat(kvm, id, fld, limit) \ kvm_cmp_feat(kvm, id, fld, >=, limit) -#define kvm_has_feat_enum(kvm, id, fld, val) \ +#define kvm_has_feat(kvm, ...) 
__kvm_has_feat(kvm, __VA_ARGS__) + +#define __kvm_has_feat_enum(kvm, id, fld, val) \ kvm_cmp_feat_unsigned(kvm, id, fld, ==, val) +#define kvm_has_feat_enum(kvm, ...) __kvm_has_feat_enum(kvm, __VA_ARGS__) + #define kvm_has_feat_range(kvm, id, fld, min, max) \ (kvm_cmp_feat(kvm, id, fld, >=, min) && \ kvm_cmp_feat(kvm, id, fld, <=, max)) @@ -1593,4 +1654,9 @@ static inline bool kvm_arch_has_irq_bypass(void) return true; } +void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt); +void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1); +void check_feature_map(void); + + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 692f403c1896..0bd07ea068a1 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -231,6 +231,38 @@ static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans) shift; \ }) +static inline u64 decode_range_tlbi(u64 val, u64 *range, u16 *asid) +{ + u64 base, tg, num, scale; + int shift; + + tg = FIELD_GET(GENMASK(47, 46), val); + + switch(tg) { + case 1: + shift = 12; + break; + case 2: + shift = 14; + break; + case 3: + default: /* IMPDEF: handle tg==0 as 64k */ + shift = 16; + break; + } + + base = (val & GENMASK(36, 0)) << shift; + + if (asid) + *asid = FIELD_GET(TLBIR_ASID_MASK, val); + + scale = FIELD_GET(GENMASK(45, 44), val); + num = FIELD_GET(GENMASK(43, 39), val); + *range = __TLBI_RANGE_PAGES(num, scale) << shift; + + return base; +} + static inline unsigned int ps_to_output_size(unsigned int ps) { switch (ps) { @@ -245,4 +277,72 @@ static inline unsigned int ps_to_output_size(unsigned int ps) } } +enum trans_regime { + TR_EL10, + TR_EL20, + TR_EL2, +}; + +struct s1_walk_info { + u64 baddr; + enum trans_regime regime; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int txsz; + int sl; + bool as_el0; + bool hpd; + bool e0poe; + bool poe; + bool pan; + bool be; + bool s2; +}; + +struct s1_walk_result { + union { + struct { + u64 desc; + u64 pa; + s8 level; + u8 APTable; + bool nG; + u16 asid; + bool UXNTable; + bool PXNTable; + bool uwxn; + bool uov; + bool ur; + bool uw; + bool ux; + bool pwxn; + bool pov; + bool pr; + bool pw; + bool px; + }; + struct { + u8 fst; + bool ptw; + bool s2; + }; + }; + bool failed; +}; + +int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va); + +/* VNCR management */ +int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); +int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu); +void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); + +#define vncr_fixmap(c) \ + ({ \ + u32 __c = (c); \ + BUG_ON(__c >= NR_CPUS); \ + (FIX_VNCR - __c); \ + }) + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 6b9d274052c7..2888b5d03757 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -59,6 +59,11 @@ typedef u64 kvm_pte_t; #define KVM_PHYS_INVALID (-1ULL) +#define KVM_PTE_TYPE BIT(1) +#define KVM_PTE_TYPE_BLOCK 0 +#define KVM_PTE_TYPE_PAGE 1 +#define KVM_PTE_TYPE_TABLE 1 + #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) @@ -413,7 +418,7 @@ static inline bool kvm_pgtable_walk_lock_held(void) */ struct kvm_pgtable { union { - struct rb_root pkvm_mappings; + struct rb_root_cached pkvm_mappings; struct { u32 ia_bits; s8 start_level; diff --git 
a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index abd693ce5b93..ea58282f59bb 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -135,6 +135,12 @@ static inline unsigned long host_s2_pgtable_pages(void) return res; } +#ifdef CONFIG_NVHE_EL2_DEBUG +static inline unsigned long pkvm_selftest_pages(void) { return 32; } +#else +static inline unsigned long pkvm_selftest_pages(void) { return 0; } +#endif + #define KVM_FFA_MBOX_NR_PAGES 1 static inline unsigned long hyp_ffa_proxy_pages(void) @@ -167,6 +173,8 @@ struct pkvm_mapping { struct rb_node node; u64 gfn; u64 pfn; + u64 nr_pages; + u64 __subtree_last; /* Internal member for interval tree */ }; int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index a2a1eeb36d4b..314b2b52025f 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -4,6 +4,8 @@ #include <asm/rsi.h> +struct device; + struct arm64_mem_crypt_ops { int (*encrypt)(unsigned long addr, int numpages); int (*decrypt)(unsigned long addr, int numpages); diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d3b538be1500..5285757ee0c1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -40,6 +40,85 @@ #include <linux/sched.h> #include <linux/page_table_check.h> +static inline void emit_pte_barriers(void) +{ + /* + * These barriers are emitted under certain conditions after a pte entry + * was modified (see e.g. __set_pte_complete()). The dsb makes the store + * visible to the table walker. The isb ensures that any previous + * speculative "invalid translation" marker that is in the CPU's + * pipeline gets cleared, so that any access to that address after + * setting the pte to valid won't cause a spurious fault. If the thread + * gets preempted after storing to the pgtable but before emitting these + * barriers, __switch_to() emits a dsb which ensure the walker gets to + * see the store. There is no guarantee of an isb being issued though. + * This is safe because it will still get issued (albeit on a + * potentially different CPU) when the thread starts running again, + * before any access to the address. + */ + dsb(ishst); + isb(); +} + +static inline void queue_pte_barriers(void) +{ + unsigned long flags; + + if (in_interrupt()) { + emit_pte_barriers(); + return; + } + + flags = read_thread_flags(); + + if (flags & BIT(TIF_LAZY_MMU)) { + /* Avoid the atomic op if already set. */ + if (!(flags & BIT(TIF_LAZY_MMU_PENDING))) + set_thread_flag(TIF_LAZY_MMU_PENDING); + } else { + emit_pte_barriers(); + } +} + +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE +static inline void arch_enter_lazy_mmu_mode(void) +{ + /* + * lazy_mmu_mode is not supposed to permit nesting. But in practice this + * does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation + * inside a lazy_mmu_mode section (such as zap_pte_range()) will change + * permissions on the linear map with apply_to_page_range(), which + * re-enters lazy_mmu_mode. So we tolerate nesting in our + * implementation. The first call to arch_leave_lazy_mmu_mode() will + * flush and clear the flag such that the remainder of the work in the + * outer nest behaves as if outside of lazy mmu mode. This is safe and + * keeps tracking simple. 
+ */ + + if (in_interrupt()) + return; + + set_thread_flag(TIF_LAZY_MMU); +} + +static inline void arch_flush_lazy_mmu_mode(void) +{ + if (in_interrupt()) + return; + + if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING)) + emit_pte_barriers(); +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + if (in_interrupt()) + return; + + arch_flush_lazy_mmu_mode(); + clear_thread_flag(TIF_LAZY_MMU); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE @@ -320,18 +399,20 @@ static inline void __set_pte_nosync(pte_t *ptep, pte_t pte) WRITE_ONCE(*ptep, pte); } -static inline void __set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte_complete(pte_t pte) { - __set_pte_nosync(ptep, pte); - /* * Only if the new pte is valid and kernel, otherwise TLB maintenance - * or update_mmu_cache() have the necessary barriers. + * has the necessary barriers. */ - if (pte_valid_not_user(pte)) { - dsb(ishst); - isb(); - } + if (pte_valid_not_user(pte)) + queue_pte_barriers(); +} + +static inline void __set_pte(pte_t *ptep, pte_t pte) +{ + __set_pte_nosync(ptep, pte); + __set_pte_complete(pte); } static inline pte_t __ptep_get(pte_t *ptep) @@ -423,23 +504,6 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } -static inline void __set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) -{ - page_table_check_ptes_set(mm, ptep, pte, nr); - __sync_cache_and_tags(pte, nr); - - for (;;) { - __check_safe_pte_update(mm, ptep, pte); - __set_pte(ptep, pte); - if (--nr == 0) - break; - ptep++; - pte = pte_advance_pfn(pte, 1); - } -} - /* * Hugetlb definitions. */ @@ -649,30 +713,64 @@ static inline pgprot_t pud_pgprot(pud_t pud) return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); } -static inline void __set_pte_at(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) +static inline void __set_ptes_anysz(struct mm_struct *mm, pte_t *ptep, + pte_t pte, unsigned int nr, + unsigned long pgsize) { - __sync_cache_and_tags(pte, nr); - __check_safe_pte_update(mm, ptep, pte); - __set_pte(ptep, pte); + unsigned long stride = pgsize >> PAGE_SHIFT; + + switch (pgsize) { + case PAGE_SIZE: + page_table_check_ptes_set(mm, ptep, pte, nr); + break; + case PMD_SIZE: + page_table_check_pmds_set(mm, (pmd_t *)ptep, pte_pmd(pte), nr); + break; +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + page_table_check_puds_set(mm, (pud_t *)ptep, pte_pud(pte), nr); + break; +#endif + default: + VM_WARN_ON(1); + } + + __sync_cache_and_tags(pte, nr * stride); + + for (;;) { + __check_safe_pte_update(mm, ptep, pte); + __set_pte_nosync(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = pte_advance_pfn(pte, stride); + } + + __set_pte_complete(pte); } -static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) { - page_table_check_pmd_set(mm, pmdp, pmd); - return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd), - PMD_SIZE >> PAGE_SHIFT); + __set_ptes_anysz(mm, ptep, pte, nr, PAGE_SIZE); } -static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) +static inline void __set_pmds(struct mm_struct *mm, + unsigned long __always_unused addr, + pmd_t *pmdp, pmd_t pmd, unsigned int nr) +{ + __set_ptes_anysz(mm, (pte_t 
*)pmdp, pmd_pte(pmd), nr, PMD_SIZE); +} +#define set_pmd_at(mm, addr, pmdp, pmd) __set_pmds(mm, addr, pmdp, pmd, 1) + +static inline void __set_puds(struct mm_struct *mm, + unsigned long __always_unused addr, + pud_t *pudp, pud_t pud, unsigned int nr) { - page_table_check_pud_set(mm, pudp, pud); - return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud), - PUD_SIZE >> PAGE_SHIFT); + __set_ptes_anysz(mm, (pte_t *)pudp, pud_pte(pud), nr, PUD_SIZE); } +#define set_pud_at(mm, addr, pudp, pud) __set_puds(mm, addr, pudp, pud, 1) #define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) #define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) @@ -739,8 +837,7 @@ static inline int pmd_trans_huge(pmd_t pmd) * If pmd is present-invalid, pmd_table() won't detect it * as a table, so force the valid bit for the comparison. */ - return pmd_val(pmd) && pmd_present(pmd) && - !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID)); + return pmd_present(pmd) && !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -754,8 +851,6 @@ static inline bool pud_table(pud_t pud) { return true; } PUD_TYPE_TABLE) #endif -extern pgd_t init_pg_dir[]; -extern pgd_t init_pg_end[]; extern pgd_t swapper_pg_dir[]; extern pgd_t idmap_pg_dir[]; extern pgd_t tramp_pg_dir[]; @@ -780,10 +875,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) WRITE_ONCE(*pmdp, pmd); - if (pmd_valid(pmd)) { - dsb(ishst); - isb(); - } + if (pmd_valid(pmd)) + queue_pte_barriers(); } static inline void pmd_clear(pmd_t *pmdp) @@ -848,10 +941,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud) WRITE_ONCE(*pudp, pud); - if (pud_valid(pud)) { - dsb(ishst); - isb(); - } + if (pud_valid(pud)) + queue_pte_barriers(); } static inline void pud_clear(pud_t *pudp) @@ -930,8 +1021,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) } WRITE_ONCE(*p4dp, p4d); - dsb(ishst); - isb(); + queue_pte_barriers(); } static inline void p4d_clear(p4d_t *p4dp) @@ -1059,8 +1149,7 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) } WRITE_ONCE(*pgdp, pgd); - dsb(ishst); - isb(); + queue_pte_barriers(); } static inline void pgd_clear(pgd_t *pgdp) @@ -1301,16 +1390,37 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ -static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, - unsigned long address, pte_t *ptep) +static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, + pte_t *ptep, + unsigned long pgsize) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); - page_table_check_pte_clear(mm, pte); + switch (pgsize) { + case PAGE_SIZE: + page_table_check_pte_clear(mm, pte); + break; + case PMD_SIZE: + page_table_check_pmd_clear(mm, pte_pmd(pte)); + break; +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + page_table_check_pud_clear(mm, pte_pud(pte)); + break; +#endif + default: + VM_WARN_ON(1); + } return pte; } +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + return __ptep_get_and_clear_anysz(mm, ptep, PAGE_SIZE); +} + static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { @@ -1347,11 +1457,7 @@ static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0)); - - page_table_check_pmd_clear(mm, pmd); - - return pmd; + return 
pte_pmd(__ptep_get_and_clear_anysz(mm, (pte_t *)pmdp, PMD_SIZE)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index e6a211001bd3..2c8763876dfb 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -7,6 +7,8 @@ #define __ASM_RSI_CMDS_H #include <linux/arm-smccc.h> +#include <linux/string.h> +#include <asm/memory.h> #include <asm/rsi_smc.h> diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 40971ac1303f..51b0d594239e 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -11,6 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[]; extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; extern char __hyp_text_start[], __hyp_text_end[]; +extern char __hyp_data_start[], __hyp_data_end[]; extern char __hyp_rodata_start[], __hyp_rodata_end[]; extern char __hyp_reloc_begin[], __hyp_reloc_end[]; extern char __hyp_bss_start[], __hyp_bss_end[]; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 2639d3633073..cd853801a8f7 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -117,6 +117,7 @@ #define SB_BARRIER_INSN __SYS_BARRIER_INSN(0, 7, 31) +/* Data cache zero operations */ #define SYS_DC_ISW sys_insn(1, 0, 7, 6, 2) #define SYS_DC_IGSW sys_insn(1, 0, 7, 6, 4) #define SYS_DC_IGDSW sys_insn(1, 0, 7, 6, 6) @@ -153,11 +154,13 @@ #define SYS_DC_CIGVAC sys_insn(1, 3, 7, 14, 3) #define SYS_DC_CIGDVAC sys_insn(1, 3, 7, 14, 5) -/* Data cache zero operations */ #define SYS_DC_ZVA sys_insn(1, 3, 7, 4, 1) #define SYS_DC_GVA sys_insn(1, 3, 7, 4, 3) #define SYS_DC_GZVA sys_insn(1, 3, 7, 4, 4) +#define SYS_DC_CIVAPS sys_insn(1, 0, 7, 15, 1) +#define SYS_DC_CIGDVAPS sys_insn(1, 0, 7, 15, 5) + /* * Automatically generated definitions for system registers, the * manual encodings below are in the process of being converted to @@ -497,12 +500,22 @@ #define __PMEV_op2(n) ((n) & 0x7) #define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3)) +#define SYS_PMEVCNTSVRn_EL1(n) sys_reg(2, 0, 14, __CNTR_CRm(n), __PMEV_op2(n)) #define SYS_PMEVCNTRn_EL0(n) sys_reg(3, 3, 14, __CNTR_CRm(n), __PMEV_op2(n)) #define __TYPER_CRm(n) (0xc | (((n) >> 3) & 0x3)) #define SYS_PMEVTYPERn_EL0(n) sys_reg(3, 3, 14, __TYPER_CRm(n), __PMEV_op2(n)) #define SYS_PMCCFILTR_EL0 sys_reg(3, 3, 14, 15, 7) +#define SYS_SPMCGCRn_EL1(n) sys_reg(2, 0, 9, 13, ((n) & 1)) + +#define __SPMEV_op2(n) ((n) & 0x7) +#define __SPMEV_crm(p, n) ((((p) & 7) << 1) | (((n) >> 3) & 1)) +#define SYS_SPMEVCNTRn_EL0(n) sys_reg(2, 3, 14, __SPMEV_crm(0b000, n), __SPMEV_op2(n)) +#define SYS_SPMEVFILT2Rn_EL0(n) sys_reg(2, 3, 14, __SPMEV_crm(0b011, n), __SPMEV_op2(n)) +#define SYS_SPMEVFILTRn_EL0(n) sys_reg(2, 3, 14, __SPMEV_crm(0b010, n), __SPMEV_op2(n)) +#define SYS_SPMEVTYPERn_EL0(n) sys_reg(2, 3, 14, __SPMEV_crm(0b001, n), __SPMEV_op2(n)) + #define SYS_VPIDR_EL2 sys_reg(3, 4, 0, 0, 0) #define SYS_VMPIDR_EL2 sys_reg(3, 4, 0, 0, 5) @@ -521,7 +534,6 @@ #define SYS_VTTBR_EL2 sys_reg(3, 4, 2, 1, 0) #define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2) -#define SYS_VNCR_EL2 sys_reg(3, 4, 2, 2, 0) #define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6) #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) #define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1) @@ -608,28 +620,18 @@ /* VHE encodings for architectural EL0/1 system registers */ #define SYS_BRBCR_EL12 sys_reg(2, 5, 9, 0, 0) 
-#define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0) -#define SYS_CPACR_EL12 sys_reg(3, 5, 1, 0, 2) -#define SYS_SCTLR2_EL12 sys_reg(3, 5, 1, 0, 3) -#define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0) -#define SYS_TRFCR_EL12 sys_reg(3, 5, 1, 2, 1) -#define SYS_SMCR_EL12 sys_reg(3, 5, 1, 2, 6) #define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0) #define SYS_TTBR1_EL12 sys_reg(3, 5, 2, 0, 1) -#define SYS_TCR_EL12 sys_reg(3, 5, 2, 0, 2) -#define SYS_TCR2_EL12 sys_reg(3, 5, 2, 0, 3) #define SYS_SPSR_EL12 sys_reg(3, 5, 4, 0, 0) #define SYS_ELR_EL12 sys_reg(3, 5, 4, 0, 1) #define SYS_AFSR0_EL12 sys_reg(3, 5, 5, 1, 0) #define SYS_AFSR1_EL12 sys_reg(3, 5, 5, 1, 1) #define SYS_ESR_EL12 sys_reg(3, 5, 5, 2, 0) #define SYS_TFSR_EL12 sys_reg(3, 5, 5, 6, 0) -#define SYS_FAR_EL12 sys_reg(3, 5, 6, 0, 0) #define SYS_PMSCR_EL12 sys_reg(3, 5, 9, 9, 0) #define SYS_MAIR_EL12 sys_reg(3, 5, 10, 2, 0) #define SYS_AMAIR_EL12 sys_reg(3, 5, 10, 3, 0) #define SYS_VBAR_EL12 sys_reg(3, 5, 12, 0, 0) -#define SYS_CONTEXTIDR_EL12 sys_reg(3, 5, 13, 0, 1) #define SYS_SCXTNUM_EL12 sys_reg(3, 5, 13, 0, 7) #define SYS_CNTKCTL_EL12 sys_reg(3, 5, 14, 1, 0) #define SYS_CNTP_TVAL_EL02 sys_reg(3, 5, 14, 2, 0) @@ -1091,6 +1093,15 @@ __emit_inst(0xd5000000|(\sreg)|(.L__gpr_num_\rt)) .endm + .macro msr_hcr_el2, reg +#if IS_ENABLED(CONFIG_AMPERE_ERRATUM_AC04_CPU_23) + dsb nsh + msr hcr_el2, \reg + isb +#else + msr hcr_el2, \reg +#endif + .endm #else #include <linux/bitfield.h> @@ -1178,6 +1189,13 @@ write_sysreg(__scs_new, sysreg); \ } while (0) +#define sysreg_clear_set_hcr(clear, set) do { \ + u64 __scs_val = read_sysreg(hcr_el2); \ + u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set); \ + if (__scs_new != __scs_val) \ + write_sysreg_hcr(__scs_new); \ +} while (0) + #define sysreg_clear_set_s(sysreg, clear, set) do { \ u64 __scs_val = read_sysreg_s(sysreg); \ u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set); \ @@ -1185,6 +1203,17 @@ write_sysreg_s(__scs_new, sysreg); \ } while (0) +#define write_sysreg_hcr(__val) do { \ + if (IS_ENABLED(CONFIG_AMPERE_ERRATUM_AC04_CPU_23) && \ + (!system_capabilities_finalized() || \ + alternative_has_cap_unlikely(ARM64_WORKAROUND_AMPERE_AC04_CPU_23))) \ + asm volatile("dsb nsh; msr hcr_el2, %x0; isb" \ + : : "rZ" (__val)); \ + else \ + asm volatile("msr hcr_el2, %x0" \ + : : "rZ" (__val)); \ +} while (0) + #define read_sysreg_par() ({ \ u64 par; \ asm(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412)); \ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 1114c1c3300a..1269c2487574 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -59,11 +59,12 @@ void arch_setup_new_exec(void); #define TIF_SIGPENDING 0 /* signal pending */ #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ -#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ -#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ -#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ -#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ -#define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 2 /* Lazy rescheduling needed */ +#define TIF_NOTIFY_RESUME 3 /* callback before returning to user */ +#define TIF_FOREIGN_FPSTATE 4 /* CPU's FP state is not current's */ +#define TIF_UPROBE 5 /* uprobe breakpoint or singlestep */ +#define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_SYSCALL_TRACE 8 /* 
syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -82,9 +83,12 @@ void arch_setup_new_exec(void); #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */ #define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */ +#define TIF_LAZY_MMU 31 /* Task in lazy mmu mode */ +#define TIF_LAZY_MMU_PENDING 32 /* Ops pending for lazy mmu mode exit */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) @@ -100,10 +104,10 @@ void arch_setup_new_exec(void); #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_TSC_SIGSEGV (1 << TIF_TSC_SIGSEGV) -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ - _TIF_NOTIFY_SIGNAL) + _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index 3322c7047d84..da1ab8759592 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -8,6 +8,7 @@ #ifndef __ASSEMBLY__ #include <asm/alternative.h> +#include <asm/arch_timer.h> #include <asm/barrier.h> #include <asm/unistd.h> #include <asm/sysreg.h> @@ -69,8 +70,6 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_time_data *vd) { - u64 res; - /* * Core checks for mode already, so this raced against a concurrent * update. Return something. Core will do another round and then @@ -79,24 +78,7 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, if (clock_mode == VDSO_CLOCKMODE_NONE) return 0; - /* - * If FEAT_ECV is available, use the self-synchronizing counter. - * Otherwise the isb is required to prevent that the counter value - * is speculated. - */ - asm volatile( - ALTERNATIVE("isb\n" - "mrs %0, cntvct_el0", - "nop\n" - __mrs_s("%0", SYS_CNTVCTSS_EL0), - ARM64_HAS_ECV) - : "=r" (res) - : - : "memory"); - - arch_counter_enforce_ordering(res); - - return res; + return __arch_counter_get_cntvct(); } #if IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index ebf4a9f943ed..aa280f356b96 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -67,7 +67,8 @@ * __boot_cpu_mode records what mode CPUs were booted in. * A correctly-implemented bootloader must start all CPUs in the same mode: * In this case, both 32bit halves of __boot_cpu_mode will contain the - * same value (either 0 if booted in EL1, BOOT_CPU_MODE_EL2 if booted in EL2). + * same value (either BOOT_CPU_MODE_EL1 if booted in EL1, BOOT_CPU_MODE_EL2 if + * booted in EL2). * * Should the bootloader fail to do this, the two values will be different. * This allows the kernel to flag an error when the secondaries have come up. 
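The TIF_LAZY_MMU and TIF_LAZY_MMU_PENDING flags added above back the lazy MMU mode hooks introduced in arch/arm64/include/asm/pgtable.h earlier in this diff; the hooks are normally driven by generic mm code such as apply_to_page_range(). A hedged sketch (hypothetical helper, not part of the patch) of how a batch of kernel PTE updates collapses to a single barrier pair:

/*
 * Illustrative only: inside the lazy section, __set_pte() defers its
 * dsb(ishst)/isb() via queue_pte_barriers() and TIF_LAZY_MMU_PENDING;
 * arch_leave_lazy_mmu_mode() then emits the barriers once for the batch.
 */
static void set_kernel_ptes_batched(pte_t *ptep, pte_t pte, unsigned int nr)
{
	unsigned int i;

	arch_enter_lazy_mmu_mode();
	for (i = 0; i < nr; i++)
		__set_pte(ptep + i, pte_advance_pfn(pte, i));
	arch_leave_lazy_mmu_mode();
}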
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 38fafffe699f..12f534e8f3ed 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -23,6 +23,51 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); } +#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size +static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, + unsigned long end, u64 pfn, + unsigned int max_page_shift) +{ + /* + * If the block is at least CONT_PTE_SIZE in size, and is naturally + * aligned in both virtual and physical space, then we can pte-map the + * block using the PTE_CONT bit for more efficient use of the TLB. + */ + if (max_page_shift < CONT_PTE_SHIFT) + return PAGE_SIZE; + + if (end - addr < CONT_PTE_SIZE) + return PAGE_SIZE; + + if (!IS_ALIGNED(addr, CONT_PTE_SIZE)) + return PAGE_SIZE; + + if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE)) + return PAGE_SIZE; + + return CONT_PTE_SIZE; +} + +#define arch_vmap_pte_range_unmap_size arch_vmap_pte_range_unmap_size +static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr, + pte_t *ptep) +{ + /* + * The caller handles alignment so it's sufficient just to check + * PTE_CONT. + */ + return pte_valid_cont(__ptep_get(ptep)) ? CONT_PTE_SIZE : PAGE_SIZE; +} + +#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift +static inline int arch_vmap_pte_supported_shift(unsigned long size) +{ + if (size >= CONT_PTE_SIZE) + return CONT_PTE_SHIFT; + + return PAGE_SHIFT; +} + #endif #define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index 4f9bbd4d6c26..6f556e993644 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -35,6 +35,8 @@ #define VNCR_CNTP_CTL_EL0 0x180 #define VNCR_SCXTNUM_EL1 0x188 #define VNCR_TFSR_EL1 0x190 +#define VNCR_HDFGRTR2_EL2 0x1A0 +#define VNCR_HDFGWTR2_EL2 0x1B0 #define VNCR_HFGRTR_EL2 0x1B8 #define VNCR_HFGWTR_EL2 0x1C0 #define VNCR_HFGITR_EL2 0x1C8 @@ -52,6 +54,9 @@ #define VNCR_PIRE0_EL1 0x290 #define VNCR_PIR_EL1 0x2A0 #define VNCR_POR_EL1 0x2A8 +#define VNCR_HFGRTR2_EL2 0x2C0 +#define VNCR_HFGWTR2_EL2 0x2C8 +#define VNCR_HFGITR2_EL2 0x310 #define VNCR_ICH_LR0_EL2 0x400 #define VNCR_ICH_LR1_EL2 0x408 #define VNCR_ICH_LR2_EL2 0x410 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index af9d9acaf997..ed5f3892674c 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -431,10 +431,11 @@ enum { /* Device Control API on vcpu fd */ #define KVM_ARM_VCPU_PMU_V3_CTRL 0 -#define KVM_ARM_VCPU_PMU_V3_IRQ 0 -#define KVM_ARM_VCPU_PMU_V3_INIT 1 -#define KVM_ARM_VCPU_PMU_V3_FILTER 2 -#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3 +#define KVM_ARM_VCPU_PMU_V3_IRQ 0 +#define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_PMU_V3_FILTER 2 +#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3 +#define KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS 4 #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index eb1a840e4110..30d4bbe68661 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -182,5 +182,7 @@ int main(void) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); #endif + 
DEFINE(PIE_E0_ASM, PIE_E0); + DEFINE(PIE_E1_ASM, PIE_E1); return 0; } diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 6b0ad5070d3e..59d723c9ab8f 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -557,6 +557,13 @@ static const struct midr_range erratum_ac03_cpu_38_list[] = { }; #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 +static const struct midr_range erratum_ac04_cpu_23_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -876,6 +883,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif +#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23 + { + .desc = "AmpereOne erratum AC04_CPU_23", + .capability = ARM64_WORKAROUND_AMPERE_AC04_CPU_23, + ERRATA_MIDR_RANGE_LIST(erratum_ac04_cpu_23_list), + }, +#endif { .desc = "Broken CNTVOFF_EL2", .capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF, diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4c46d80aa64b..45ea79cacf46 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -305,6 +305,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -765,17 +766,17 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) -struct arm64_ftr_override id_aa64mmfr0_override; -struct arm64_ftr_override id_aa64mmfr1_override; -struct arm64_ftr_override id_aa64mmfr2_override; -struct arm64_ftr_override id_aa64pfr0_override; -struct arm64_ftr_override id_aa64pfr1_override; -struct arm64_ftr_override id_aa64zfr0_override; -struct arm64_ftr_override id_aa64smfr0_override; -struct arm64_ftr_override id_aa64isar1_override; -struct arm64_ftr_override id_aa64isar2_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr0_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr1_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr2_override; +struct arm64_ftr_override __read_mostly id_aa64pfr0_override; +struct arm64_ftr_override __read_mostly id_aa64pfr1_override; +struct arm64_ftr_override __read_mostly id_aa64zfr0_override; +struct arm64_ftr_override __read_mostly id_aa64smfr0_override; +struct arm64_ftr_override __read_mostly id_aa64isar1_override; +struct arm64_ftr_override __read_mostly id_aa64isar2_override; -struct arm64_ftr_override arm64_sw_feature_override; +struct arm64_ftr_override __read_mostly arm64_sw_feature_override; static const struct __ftr_reg_entry { u32 sys_id; @@ -1410,6 +1411,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu, info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR4_EL1, cpu, + info->reg_id_aa64mmfr4, boot->reg_id_aa64mmfr4); taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, 
info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); @@ -2883,6 +2886,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP) }, + { + .desc = "Fine Grained Traps 2", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FGT2, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2) + }, #ifdef CONFIG_ARM64_SME { .desc = "Scalable Matrix Extension", diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 285d7d538342..94525abd1c22 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -209,80 +209,79 @@ static const char *const compat_hwcap2_str[] = { static int c_show(struct seq_file *m, void *v) { - int i, j; + int j; + int cpu = m->index; bool compat = personality(current->personality) == PER_LINUX32; + struct cpuinfo_arm64 *cpuinfo = v; + u32 midr = cpuinfo->reg_midr; - for_each_online_cpu(i) { - struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i); - u32 midr = cpuinfo->reg_midr; - - /* - * glibc reads /proc/cpuinfo to determine the number of - * online processors, looking for lines beginning with - * "processor". Give glibc what it expects. - */ - seq_printf(m, "processor\t: %d\n", i); - if (compat) - seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", - MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); + /* + * glibc reads /proc/cpuinfo to determine the number of + * online processors, looking for lines beginning with + * "processor". Give glibc what it expects. + */ + seq_printf(m, "processor\t: %d\n", cpu); + if (compat) + seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", + MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); - seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", - loops_per_jiffy / (500000UL/HZ), - loops_per_jiffy / (5000UL/HZ) % 100); + seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", + loops_per_jiffy / (500000UL/HZ), + loops_per_jiffy / (5000UL/HZ) % 100); - /* - * Dump out the common processor features in a single line. - * Userspace should read the hwcaps with getauxval(AT_HWCAP) - * rather than attempting to parse this, but there's a body of - * software which does already (at least for 32-bit). - */ - seq_puts(m, "Features\t:"); - if (compat) { + /* + * Dump out the common processor features in a single line. + * Userspace should read the hwcaps with getauxval(AT_HWCAP) + * rather than attempting to parse this, but there's a body of + * software which does already (at least for 32-bit). + */ + seq_puts(m, "Features\t:"); + if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { - if (compat_elf_hwcap & (1 << j)) { - /* - * Warn once if any feature should not - * have been present on arm64 platform. - */ - if (WARN_ON_ONCE(!compat_hwcap_str[j])) - continue; - - seq_printf(m, " %s", compat_hwcap_str[j]); - } + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. 
+ */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; + + seq_printf(m, " %s", compat_hwcap_str[j]); } + } - for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) - if (compat_elf_hwcap2 & (1 << j)) - seq_printf(m, " %s", compat_hwcap2_str[j]); + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) + if (compat_elf_hwcap2 & (1 << j)) + seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ - } else { - for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) - if (cpu_have_feature(j)) - seq_printf(m, " %s", hwcap_str[j]); - } - seq_puts(m, "\n"); - - seq_printf(m, "CPU implementer\t: 0x%02x\n", - MIDR_IMPLEMENTOR(midr)); - seq_printf(m, "CPU architecture: 8\n"); - seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); - seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); - seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); + } else { + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) + if (cpu_have_feature(j)) + seq_printf(m, " %s", hwcap_str[j]); } + seq_puts(m, "\n"); + + seq_printf(m, "CPU implementer\t: 0x%02x\n", + MIDR_IMPLEMENTOR(midr)); + seq_puts(m, "CPU architecture: 8\n"); + seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); + seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); + seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < 1 ? (void *)1 : NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); + return *pos < nr_cpu_ids ? &per_cpu(cpu_data, *pos) : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { ++*pos; - return NULL; + return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) @@ -328,11 +327,13 @@ static const struct kobj_type cpuregs_kobj_type = { CPUREGS_ATTR_RO(midr_el1, midr); CPUREGS_ATTR_RO(revidr_el1, revidr); +CPUREGS_ATTR_RO(aidr_el1, aidr); CPUREGS_ATTR_RO(smidr_el1, smidr); static struct attribute *cpuregs_id_attrs[] = { &cpuregs_attr_midr_el1.attr, &cpuregs_attr_revidr_el1.attr, + &cpuregs_attr_aidr_el1.attr, NULL }; @@ -469,6 +470,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_dczid = read_cpuid(DCZID_EL0); info->reg_midr = read_cpuid_id(); info->reg_revidr = read_cpuid(REVIDR_EL1); + info->reg_aidr = read_cpuid(AIDR_EL1); info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 1d25d8899dbf..250e9d7c08a7 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -169,14 +169,14 @@ static DEFINE_RAW_SPINLOCK(efi_rt_lock); void arch_efi_call_virt_setup(void) { efi_virtmap_load(); - __efi_fpsimd_begin(); raw_spin_lock(&efi_rt_lock); + __efi_fpsimd_begin(); } void arch_efi_call_virt_teardown(void) { - raw_spin_unlock(&efi_rt_lock); __efi_fpsimd_end(); + raw_spin_unlock(&efi_rt_lock); efi_virtmap_unload(); } diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index b260ddc4d3e9..7c1970b341b8 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -132,7 +132,7 @@ static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) do { local_irq_enable(); - if (thread_flags & _TIF_NEED_RESCHED) + if (thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (thread_flags & _TIF_UPROBE) @@ -393,20 +393,16 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) * As per the ABI exit SME streaming mode and clear the SVE state not * 
shared with FPSIMD on syscall entry. */ -static inline void fp_user_discard(void) +static inline void fpsimd_syscall_enter(void) { - /* - * If SME is active then exit streaming mode. If ZA is active - * then flush the SVE registers but leave userspace access to - * both SVE and SME enabled, otherwise disable SME for the - * task and fall through to disabling SVE too. This means - * that after a syscall we never have any streaming mode - * register state to track, if this changes the KVM code will - * need updating. - */ + /* Ensure PSTATE.SM is clear, but leave PSTATE.ZA as-is. */ if (system_supports_sme()) sme_smstop_sm(); + /* + * The CPU is not in streaming mode. If non-streaming SVE is not + * supported, there is no SVE state that needs to be discarded. + */ if (!system_supports_sve()) return; @@ -416,6 +412,33 @@ static inline void fp_user_discard(void) sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; sve_flush_live(true, sve_vq_minus_one); } + + /* + * Any live non-FPSIMD SVE state has been zeroed. Allow + * fpsimd_save_user_state() to lazily discard SVE state until either + * the live state is unbound or fpsimd_syscall_exit() is called. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_FPSIMD); +} + +static __always_inline void fpsimd_syscall_exit(void) +{ + if (!system_supports_sve()) + return; + + /* + * The current task's user FPSIMD/SVE/SME state is now bound to this + * CPU. The fpsimd_last_state.to_save value is either: + * + * - FP_STATE_FPSIMD, if the state has not been reloaded on this CPU + * since fpsimd_syscall_enter(). + * + * - FP_STATE_CURRENT, if the state has been reloaded on this CPU at + * any point. + * + * Reset this to FP_STATE_CURRENT to stop lazy discarding. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_CURRENT); } UNHANDLED(el1t, 64, sync) @@ -739,10 +762,11 @@ static void noinstr el0_svc(struct pt_regs *regs) { enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); - fp_user_discard(); + fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); exit_to_user_mode(regs); + fpsimd_syscall_exit(); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 8370d55f0353..c37f02d7194e 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -119,7 +119,7 @@ * whatever is in the FPSIMD registers is not saved to memory, but discarded. */ -static DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); +DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { #ifdef CONFIG_ARM64_SVE @@ -180,12 +180,12 @@ static inline void set_sve_default_vl(int val) set_default_vl(ARM64_VEC_SVE, val); } -static void __percpu *efi_sve_state; +static u8 *efi_sve_state; #else /* ! CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ -extern void __percpu *efi_sve_state; +extern u8 *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ @@ -359,20 +359,15 @@ static void task_fpsimd_load(void) WARN_ON(preemptible()); WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); - if (system_supports_fpmr()) - write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); - if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { case FP_STATE_FPSIMD: /* Stop tracking SVE for this task until next use. 
*/ - if (test_and_clear_thread_flag(TIF_SVE)) - sve_user_disable(); + clear_thread_flag(TIF_SVE); break; case FP_STATE_SVE: - if (!thread_sm_enabled(¤t->thread) && - !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE))) - sve_user_enable(); + if (!thread_sm_enabled(¤t->thread)) + WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)); if (test_thread_flag(TIF_SVE)) sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); @@ -413,6 +408,9 @@ static void task_fpsimd_load(void) restore_ffr = system_supports_fa64(); } + if (system_supports_fpmr()) + write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); + if (restore_sve_regs) { WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(¤t->thread), @@ -453,12 +451,15 @@ static void fpsimd_save_user_state(void) *(last->fpmr) = read_sysreg_s(SYS_FPMR); /* - * If a task is in a syscall the ABI allows us to only - * preserve the state shared with FPSIMD so don't bother - * saving the full SVE state in that case. + * Save SVE state if it is live. + * + * The syscall ABI discards live SVE state at syscall entry. When + * entering a syscall, fpsimd_syscall_enter() sets to_save to + * FP_STATE_FPSIMD to allow the SVE state to be lazily discarded until + * either new SVE state is loaded+bound or fpsimd_syscall_exit() is + * called prior to a return to userspace. */ - if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) && - !in_syscall(current_pt_regs())) || + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) || last->to_save == FP_STATE_SVE) { save_sve_regs = true; save_ffr = true; @@ -651,7 +652,7 @@ static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst, * task->thread.uw.fpsimd_state must be up to date before calling this * function. */ -static void fpsimd_to_sve(struct task_struct *task) +static inline void fpsimd_to_sve(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; @@ -675,7 +676,7 @@ static void fpsimd_to_sve(struct task_struct *task) * bytes of allocated kernel memory. * task->thread.sve_state must be up to date before calling this function. */ -static void sve_to_fpsimd(struct task_struct *task) +static inline void sve_to_fpsimd(struct task_struct *task) { unsigned int vq, vl; void const *sst = task->thread.sve_state; @@ -694,44 +695,39 @@ static void sve_to_fpsimd(struct task_struct *task) } } -void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) +static inline void __fpsimd_zero_vregs(struct user_fpsimd_state *fpsimd) { - write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, - SYS_SCTLR_EL1); + memset(&fpsimd->vregs, 0, sizeof(fpsimd->vregs)); } -#ifdef CONFIG_ARM64_SVE /* - * Call __sve_free() directly only if you know task can't be scheduled - * or preempted. + * Simulate the effects of an SMSTOP SM instruction. 
*/ -static void __sve_free(struct task_struct *task) +void task_smstop_sm(struct task_struct *task) { - kfree(task->thread.sve_state); - task->thread.sve_state = NULL; -} + if (!thread_sm_enabled(&task->thread)) + return; -static void sve_free(struct task_struct *task) -{ - WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + __fpsimd_zero_vregs(&task->thread.uw.fpsimd_state); + task->thread.uw.fpsimd_state.fpsr = 0x0800009f; + if (system_supports_fpmr()) + task->thread.uw.fpmr = 0; - __sve_free(task); + task->thread.svcr &= ~SVCR_SM_MASK; + task->thread.fp_type = FP_STATE_FPSIMD; } -/* - * Return how many bytes of memory are required to store the full SVE - * state for task, given task's currently configured vector length. - */ -size_t sve_state_size(struct task_struct const *task) +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) { - unsigned int vl = 0; - - if (system_supports_sve()) - vl = task_get_sve_vl(task); - if (system_supports_sme()) - vl = max(vl, task_get_sme_vl(task)); + write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, + SYS_SCTLR_EL1); +} - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +#ifdef CONFIG_ARM64_SVE +static void sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; } /* @@ -758,69 +754,34 @@ void sve_alloc(struct task_struct *task, bool flush) kzalloc(sve_state_size(task), GFP_KERNEL); } - -/* - * Force the FPSIMD state shared with SVE to be updated in the SVE state - * even if the SVE state is the current active state. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void fpsimd_force_sync_to_sve(struct task_struct *task) -{ - fpsimd_to_sve(task); -} - -/* - * Ensure that task->thread.sve_state is up to date with respect to - * the user task, irrespective of when SVE is in use or not. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void fpsimd_sync_to_sve(struct task_struct *task) -{ - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) - fpsimd_to_sve(task); -} - /* - * Ensure that task->thread.uw.fpsimd_state is up to date with respect to - * the user task, irrespective of whether SVE is in use or not. + * Ensure that task->thread.uw.fpsimd_state is up to date with respect to the + * task's currently effective FPSIMD/SVE state. * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_to_fpsimd(struct task_struct *task) +void fpsimd_sync_from_effective_state(struct task_struct *task) { if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } /* - * Ensure that task->thread.sve_state is up to date with respect to - * the task->thread.uw.fpsimd_state. + * Ensure that the task's currently effective FPSIMD/SVE state is up to date + * with respect to task->thread.uw.fpsimd_state, zeroing any effective + * non-FPSIMD (S)SVE state. * - * This should only be called by ptrace to merge new FPSIMD register - * values into a task for which SVE is currently active. - * task must be non-runnable. 
- * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - * task->thread.uw.fpsimd_state must already have been initialised with - * the new FPSIMD register values to be merged in. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_from_fpsimd_zeropad(struct task_struct *task) +void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) + if (task->thread.fp_type != FP_STATE_SVE) return; vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); @@ -829,10 +790,73 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) __fpsimd_to_sve(sst, fst, vq); } +static int change_live_vector_length(struct task_struct *task, + enum vec_type type, + unsigned long vl) +{ + unsigned int sve_vl = task_get_sve_vl(task); + unsigned int sme_vl = task_get_sme_vl(task); + void *sve_state = NULL, *sme_state = NULL; + + if (type == ARM64_VEC_SME) + sme_vl = vl; + else + sve_vl = vl; + + /* + * Allocate the new sve_state and sme_state before freeing the old + * copies so that allocation failure can be handled without needing to + * mutate the task's state in any way. + * + * Changes to the SVE vector length must not discard live ZA state or + * clear PSTATE.ZA, as userspace code which is unaware of the AAPCS64 + * ZA lazy saving scheme may attempt to change the SVE vector length + * while unsaved/dormant ZA state exists. + */ + sve_state = kzalloc(__sve_state_size(sve_vl, sme_vl), GFP_KERNEL); + if (!sve_state) + goto out_mem; + + if (type == ARM64_VEC_SME) { + sme_state = kzalloc(__sme_state_size(sme_vl), GFP_KERNEL); + if (!sme_state) + goto out_mem; + } + + if (task == current) + fpsimd_save_and_flush_current_state(); + else + fpsimd_flush_task_state(task); + + /* + * Always preserve PSTATE.SM and the effective FPSIMD state, zeroing + * other SVE state. 
+ */ + fpsimd_sync_from_effective_state(task); + task_set_vl(task, type, vl); + kfree(task->thread.sve_state); + task->thread.sve_state = sve_state; + fpsimd_sync_to_effective_state_zeropad(task); + + if (type == ARM64_VEC_SME) { + task->thread.svcr &= ~SVCR_ZA_MASK; + kfree(task->thread.sme_state); + task->thread.sme_state = sme_state; + } + + return 0; + +out_mem: + kfree(sve_state); + kfree(sme_state); + return -ENOMEM; +} + int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags) { - bool free_sme = false; + bool onexec = flags & PR_SVE_SET_VL_ONEXEC; + bool inherit = flags & PR_SVE_VL_INHERIT; if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | PR_SVE_SET_VL_ONEXEC)) @@ -852,71 +876,17 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, vl = find_supported_vector_length(type, vl); - if (flags & (PR_SVE_VL_INHERIT | - PR_SVE_SET_VL_ONEXEC)) + if (!onexec && vl != task_get_vl(task, type)) { + if (change_live_vector_length(task, type, vl)) + return -ENOMEM; + } + + if (onexec || inherit) task_set_vl_onexec(task, type, vl); else /* Reset VL to system default on next exec: */ task_set_vl_onexec(task, type, 0); - /* Only actually set the VL if not deferred: */ - if (flags & PR_SVE_SET_VL_ONEXEC) - goto out; - - if (vl == task_get_vl(task, type)) - goto out; - - /* - * To ensure the FPSIMD bits of the SVE vector registers are preserved, - * write any live register state back to task_struct, and convert to a - * regular FPSIMD thread. - */ - if (task == current) { - get_cpu_fpsimd_context(); - - fpsimd_save_user_state(); - } - - fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) { - sve_to_fpsimd(task); - task->thread.fp_type = FP_STATE_FPSIMD; - } - - if (system_supports_sme()) { - if (type == ARM64_VEC_SME || - !(task->thread.svcr & (SVCR_SM_MASK | SVCR_ZA_MASK))) { - /* - * We are changing the SME VL or weren't using - * SME anyway, discard the state and force a - * reallocation. - */ - task->thread.svcr &= ~(SVCR_SM_MASK | - SVCR_ZA_MASK); - clear_tsk_thread_flag(task, TIF_SME); - free_sme = true; - } - } - - if (task == current) - put_cpu_fpsimd_context(); - - task_set_vl(task, type, vl); - - /* - * Free the changed states if they are not in use, SME will be - * reallocated to the correct size on next use and we just - * allocate SVE now in case it is needed for use in streaming - * mode. 
- */ - sve_free(task); - sve_alloc(task, true); - - if (free_sme) - sme_free(task); - -out: update_tsk_thread_flag(task, vec_vl_inherit_flag(type), flags & PR_SVE_VL_INHERIT); @@ -1131,15 +1101,15 @@ static void __init sve_efi_setup(void) if (!sve_vl_valid(max_vl)) goto fail; - efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); + efi_sve_state = kmalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), + GFP_KERNEL); if (!efi_sve_state) goto fail; return; fail: - panic("Cannot allocate percpu memory for EFI SVE save/restore"); + panic("Cannot allocate memory for EFI SVE save/restore"); } void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) @@ -1212,7 +1182,7 @@ void __init sve_setup(void) */ void fpsimd_release_task(struct task_struct *dead_task) { - __sve_free(dead_task); + sve_free(dead_task); sme_free(dead_task); } @@ -1436,7 +1406,7 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) * If this not a trap due to SME being disabled then something * is being used in the wrong mode, report as SIGILL. */ - if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + if (ESR_ELx_SME_ISS_SMTC(esr) != ESR_ELx_SME_ISS_SMTC_SME_DISABLED) { force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } @@ -1460,6 +1430,8 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) sme_set_vq(vq_minus_one); fpsimd_bind_task_to_cpu(); + } else { + fpsimd_flush_task_state(current); } put_cpu_fpsimd_context(); @@ -1573,8 +1545,8 @@ void fpsimd_thread_switch(struct task_struct *next) fpsimd_save_user_state(); if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { - fpsimd_load_kernel_state(next); fpsimd_flush_cpu_state(); + fpsimd_load_kernel_state(next); } else { /* * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's @@ -1661,6 +1633,9 @@ void fpsimd_flush_thread(void) current->thread.svcr = 0; } + if (system_supports_fpmr()) + current->thread.uw.fpmr = 0; + current->thread.fp_type = FP_STATE_FPSIMD; put_cpu_fpsimd_context(); @@ -1683,18 +1658,6 @@ void fpsimd_preserve_current_state(void) } /* - * Like fpsimd_preserve_current_state(), but ensure that - * current->thread.uw.fpsimd_state is updated so that it can be copied to - * the signal frame. - */ -void fpsimd_signal_preserve_current_state(void) -{ - fpsimd_preserve_current_state(); - if (current->thread.fp_type == FP_STATE_SVE) - sve_to_fpsimd(current); -} - -/* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling * this function. @@ -1786,30 +1749,14 @@ void fpsimd_restore_current_state(void) put_cpu_fpsimd_context(); } -/* - * Load an updated userland FPSIMD state for 'current' from memory and set the - * flag that indicates that the FPSIMD register contents are the most recent - * FPSIMD state of 'current'. This is used by the signal code to restore the - * register state when returning from a signal handler in FPSIMD only cases, - * any SVE context will be discarded. 
- */ void fpsimd_update_current_state(struct user_fpsimd_state const *state) { if (WARN_ON(!system_supports_fpsimd())) return; - get_cpu_fpsimd_context(); - current->thread.uw.fpsimd_state = *state; - if (test_thread_flag(TIF_SVE)) + if (current->thread.fp_type == FP_STATE_SVE) fpsimd_to_sve(current); - - task_fpsimd_load(); - fpsimd_bind_task_to_cpu(); - - clear_thread_flag(TIF_FOREIGN_FPSTATE); - - put_cpu_fpsimd_context(); } /* @@ -1839,6 +1786,17 @@ void fpsimd_flush_task_state(struct task_struct *t) barrier(); } +void fpsimd_save_and_flush_current_state(void) +{ + if (!system_supports_fpsimd()) + return; + + get_cpu_fpsimd_context(); + fpsimd_save_user_state(); + fpsimd_flush_task_state(current); + put_cpu_fpsimd_context(); +} + /* * Save the FPSIMD state to memory and invalidate cpu view. * This function must be called with preemption disabled. @@ -1948,10 +1906,10 @@ EXPORT_SYMBOL_GPL(kernel_neon_end); #ifdef CONFIG_EFI -static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); -static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); -static DEFINE_PER_CPU(bool, efi_sve_state_used); -static DEFINE_PER_CPU(bool, efi_sm_state); +static struct user_fpsimd_state efi_fpsimd_state; +static bool efi_fpsimd_state_used; +static bool efi_sve_state_used; +static bool efi_sm_state; /* * EFI runtime services support functions @@ -1984,18 +1942,16 @@ void __efi_fpsimd_begin(void) * If !efi_sve_state, SVE can't be in use yet and doesn't need * preserving: */ - if (system_supports_sve() && likely(efi_sve_state)) { - char *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state != NULL) { bool ffr = true; u64 svcr; - __this_cpu_write(efi_sve_state_used, true); + efi_sve_state_used = true; if (system_supports_sme()) { svcr = read_sysreg_s(SYS_SVCR); - __this_cpu_write(efi_sm_state, - svcr & SVCR_SM_MASK); + efi_sm_state = svcr & SVCR_SM_MASK; /* * Unless we have FA64 FFR does not @@ -2005,19 +1961,18 @@ void __efi_fpsimd_begin(void) ffr = !(svcr & SVCR_SM_MASK); } - sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); + sve_save_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); if (system_supports_sme()) sysreg_clear_set_s(SYS_SVCR, SVCR_SM_MASK, 0); } else { - fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_save_state(&efi_fpsimd_state); } - __this_cpu_write(efi_fpsimd_state_used, true); + efi_fpsimd_state_used = true; } } @@ -2029,12 +1984,10 @@ void __efi_fpsimd_end(void) if (!system_supports_fpsimd()) return; - if (!__this_cpu_xchg(efi_fpsimd_state_used, false)) { + if (!efi_fpsimd_state_used) { kernel_neon_end(); } else { - if (system_supports_sve() && - likely(__this_cpu_read(efi_sve_state_used))) { - char const *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state_used) { bool ffr = true; /* @@ -2043,7 +1996,7 @@ void __efi_fpsimd_end(void) * streaming mode. 
*/ if (system_supports_sme()) { - if (__this_cpu_read(efi_sm_state)) { + if (efi_sm_state) { sysreg_clear_set_s(SYS_SVCR, 0, SVCR_SM_MASK); @@ -2057,14 +2010,15 @@ void __efi_fpsimd_end(void) } } - sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); + sve_load_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); - __this_cpu_write(efi_sve_state_used, false); + efi_sve_state_used = false; } else { - fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_load_state(&efi_fpsimd_state); } + + efi_fpsimd_state_used = false; } } diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 2ce73525de2c..ca04b338cb0d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -89,7 +89,7 @@ SYM_CODE_START(primary_entry) adrp x1, early_init_stack mov sp, x1 mov x29, xzr - adrp x0, init_idmap_pg_dir + adrp x0, __pi_init_idmap_pg_dir mov x1, xzr bl __pi_create_init_idmap @@ -101,7 +101,7 @@ SYM_CODE_START(primary_entry) cbnz x19, 0f dmb sy mov x1, x0 // end of used region - adrp x0, init_idmap_pg_dir + adrp x0, __pi_init_idmap_pg_dir adr_l x2, dcache_inval_poc blr x2 b 1f @@ -507,7 +507,7 @@ SYM_FUNC_END(__no_granule_support) SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir - adrp x2, init_idmap_pg_dir + adrp x2, __pi_init_idmap_pg_dir bl __enable_mmu adrp x1, early_init_stack diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index ae990da1eae5..36e2d26b54f5 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -97,7 +97,7 @@ SYM_CODE_START_LOCAL(__finalise_el2) 2: // Engage the VHE magic! mov_q x0, HCR_HOST_VHE_FLAGS - msr hcr_el2, x0 + msr_hcr_el2 x0 isb // Use the EL1 allocated stack, per-cpu offset diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 2004b4f41ade..5a69b6eb4090 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,6 +10,12 @@ #error This file should only be included in vmlinux.lds.S #endif +#define PI_EXPORT_SYM(sym) \ + __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) +#define __PI_EXPORT_SYM(sym, pisym, msg)\ + PROVIDE(pisym = sym); \ + ASSERT((sym - KIMAGE_VADDR) < (__bss_start - KIMAGE_VADDR), #msg) + PROVIDE(__efistub_primary_entry = primary_entry); /* @@ -36,37 +42,30 @@ PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); -PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); -PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); -PROVIDE(__pi_id_aa64mmfr0_override = id_aa64mmfr0_override); -PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); -PROVIDE(__pi_id_aa64mmfr2_override = id_aa64mmfr2_override); -PROVIDE(__pi_id_aa64pfr0_override = id_aa64pfr0_override); -PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); -PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); -PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); -PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); -PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); -PROVIDE(__pi__ctype = _ctype); -PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); - -PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); -PROVIDE(__pi_init_idmap_pg_end = init_idmap_pg_end); -PROVIDE(__pi_init_pg_dir = init_pg_dir); -PROVIDE(__pi_init_pg_end = init_pg_end); -PROVIDE(__pi_swapper_pg_dir = swapper_pg_dir); - -PROVIDE(__pi__text = 
_text); -PROVIDE(__pi__stext = _stext); -PROVIDE(__pi__etext = _etext); -PROVIDE(__pi___start_rodata = __start_rodata); -PROVIDE(__pi___inittext_begin = __inittext_begin); -PROVIDE(__pi___inittext_end = __inittext_end); -PROVIDE(__pi___initdata_begin = __initdata_begin); -PROVIDE(__pi___initdata_end = __initdata_end); -PROVIDE(__pi__data = _data); -PROVIDE(__pi___bss_start = __bss_start); -PROVIDE(__pi__end = _end); +PI_EXPORT_SYM(id_aa64isar1_override); +PI_EXPORT_SYM(id_aa64isar2_override); +PI_EXPORT_SYM(id_aa64mmfr0_override); +PI_EXPORT_SYM(id_aa64mmfr1_override); +PI_EXPORT_SYM(id_aa64mmfr2_override); +PI_EXPORT_SYM(id_aa64pfr0_override); +PI_EXPORT_SYM(id_aa64pfr1_override); +PI_EXPORT_SYM(id_aa64smfr0_override); +PI_EXPORT_SYM(id_aa64zfr0_override); +PI_EXPORT_SYM(arm64_sw_feature_override); +PI_EXPORT_SYM(arm64_use_ng_mappings); +PI_EXPORT_SYM(_ctype); + +PI_EXPORT_SYM(swapper_pg_dir); + +PI_EXPORT_SYM(_text); +PI_EXPORT_SYM(_stext); +PI_EXPORT_SYM(_etext); +PI_EXPORT_SYM(__start_rodata); +PI_EXPORT_SYM(__inittext_begin); +PI_EXPORT_SYM(__inittext_end); +PI_EXPORT_SYM(__initdata_begin); +PI_EXPORT_SYM(__initdata_end); +PI_EXPORT_SYM(_data); #ifdef CONFIG_KVM @@ -127,6 +126,8 @@ KVM_NVHE_ALIAS(__hyp_text_start); KVM_NVHE_ALIAS(__hyp_text_end); KVM_NVHE_ALIAS(__hyp_bss_start); KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_data_start); +KVM_NVHE_ALIAS(__hyp_data_end); KVM_NVHE_ALIAS(__hyp_rodata_start); KVM_NVHE_ALIAS(__hyp_rodata_end); diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 1da3e25f9d9e..c9503ed45a6c 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -10,8 +10,6 @@ #include <asm/cpufeature.h> #include <asm/memory.h> -u16 __initdata memstart_offset_seed; - bool __ro_after_init __kaslr_is_enabled = false; void __init kaslr_init(void) diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index 0257b43819db..e0e018046a46 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -18,8 +18,6 @@ #include "pi.h" -extern u16 memstart_offset_seed; - static u64 __init get_kaslr_seed(void *fdt, int node) { static char const seed_str[] __initconst = "kaslr-seed"; @@ -53,8 +51,6 @@ u64 __init kaslr_early_init(void *fdt, int chosen) return 0; } - memstart_offset_seed = seed & U16_MAX; - /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. Let's place the kernel in the diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index c91e5e965cd3..1f4731a4e17e 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -22,6 +22,7 @@ static inline void *prel64_to_pointer(const prel64_t *offset) extern bool dynamic_scs_is_enabled; extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; +extern pgd_t init_pg_dir[], init_pg_end[]; void init_feature_override(u64 boot_status, const void *fdt, int chosen); u64 kaslr_early_init(void *fdt, int chosen); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 42faebb7b712..a5ca15daeb8a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -344,50 +344,34 @@ void arch_release_task_struct(struct task_struct *tsk) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - if (current->mm) - fpsimd_preserve_current_state(); + /* + * The current/src task's FPSIMD state may or may not be live, and may + * have been altered by ptrace after entry to the kernel. 
Save the + * effective FPSIMD state so that this will be copied into dst. + */ + fpsimd_save_and_flush_current_state(); + fpsimd_sync_from_effective_state(src); + *dst = *src; /* - * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's copies - * will be allocated on demand later on if dst uses SVE. - * For consistency, also clear TIF_SVE here: this could be done - * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF flags and buffers in - * an inconsistent state, even temporarily. + * Drop stale reference to src's sve_state and convert dst to + * non-streaming FPSIMD mode. */ + dst->thread.fp_type = FP_STATE_FPSIMD; dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + task_smstop_sm(dst); /* - * In the unlikely event that we create a new thread with ZA - * enabled we should retain the ZA and ZT state so duplicate - * it here. This may be shortly freed if we exec() or if - * CLONE_SETTLS but it's simpler to do it here. To avoid - * confusing the rest of the code ensure that we have a - * sve_state allocated whenever sme_state is allocated. + * Drop stale reference to src's sme_state and ensure dst has ZA + * disabled. + * + * When necessary, ZA will be inherited later in copy_thread_za(). */ - if (thread_za_enabled(&src->thread)) { - dst->thread.sve_state = kzalloc(sve_state_size(src), - GFP_KERNEL); - if (!dst->thread.sve_state) - return -ENOMEM; - - dst->thread.sme_state = kmemdup(src->thread.sme_state, - sme_state_size(src), - GFP_KERNEL); - if (!dst->thread.sme_state) { - kfree(dst->thread.sve_state); - dst->thread.sve_state = NULL; - return -ENOMEM; - } - } else { - dst->thread.sme_state = NULL; - clear_tsk_thread_flag(dst, TIF_SME); - } - - dst->thread.fp_type = FP_STATE_FPSIMD; + dst->thread.sme_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr &= ~SVCR_ZA_MASK; /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@ -395,6 +379,31 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } +static int copy_thread_za(struct task_struct *dst, struct task_struct *src) +{ + if (!thread_za_enabled(&src->thread)) + return 0; + + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + + dst->thread.sme_state = kmemdup(src->thread.sme_state, + sme_state_size(src), + GFP_KERNEL); + if (!dst->thread.sme_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + + set_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} + asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) @@ -427,8 +436,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); - if (system_supports_tpidr2()) - p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (system_supports_poe()) p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); @@ -441,13 +448,39 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) } /* + * Due to the AAPCS64 "ZA lazy saving scheme", PSTATE.ZA and + * TPIDR2 need to be manipulated as a pair, and either both + * need to be inherited or both need to be reset. 
+ * + * Within a process, child threads must not inherit their + * parent's TPIDR2 value or they may clobber their parent's + * stack at some later point. + * + * When a process is fork()'d, the child must inherit ZA and + * TPIDR2 from its parent in case there was dormant ZA state. + * + * Use CLONE_VM to determine when the child will share the + * address space with the parent, and cannot safely inherit the + * state. + */ + if (system_supports_sme()) { + if (!(clone_flags & CLONE_VM)) { + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + ret = copy_thread_za(p, current); + if (ret) + return ret; + } else { + p->thread.tpidr2_el0 = 0; + WARN_ON_ONCE(p->thread.svcr & SVCR_ZA_MASK); + } + } + + /* * If a TLS pointer was passed to clone, use it for the new - * thread. We also reset TPIDR2 if it's in use. + * thread. */ - if (clone_flags & CLONE_SETTLS) { + if (clone_flags & CLONE_SETTLS) p->thread.uw.tp_value = tls; - p->thread.tpidr2_el0 = 0; - } ret = copy_thread_gcs(p, args); if (ret != 0) @@ -680,10 +713,11 @@ struct task_struct *__switch_to(struct task_struct *prev, gcs_thread_switch(next); /* - * Complete any pending TLB or cache maintenance on this CPU in case - * the thread migrates to a different CPU. - * This full barrier is also required by the membarrier system - * call. + * Complete any pending TLB or cache maintenance on this CPU in case the + * thread migrates to a different CPU. This full barrier is also + * required by the membarrier system call. Additionally it makes any + * in-progress pgtable writes visible to the table walker; See + * emit_pte_barriers(). */ dsb(ish); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index f79b0d5f71ac..a360e52db02f 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -594,7 +594,7 @@ static int __fpr_get(struct task_struct *target, { struct user_fpsimd_state *uregs; - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); uregs = &target->thread.uw.fpsimd_state; @@ -626,7 +626,7 @@ static int __fpr_set(struct task_struct *target, * Ensure target->thread.uw.fpsimd_state is up to date, so that a * short copyin can't resurrect stale data. 
*/ - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); newstate = target->thread.uw.fpsimd_state; @@ -653,7 +653,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - sve_sync_from_fpsimd_zeropad(target); + fpsimd_sync_to_effective_state_zeropad(target); fpsimd_flush_task_state(target); return ret; @@ -775,6 +775,11 @@ static void sve_init_header_from_task(struct user_sve_header *header, task_type = ARM64_VEC_SVE; active = (task_type == type); + if (active && target->thread.fp_type == FP_STATE_SVE) + header->flags = SVE_PT_REGS_SVE; + else + header->flags = SVE_PT_REGS_FPSIMD; + switch (type) { case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) @@ -789,19 +794,14 @@ static void sve_init_header_from_task(struct user_sve_header *header, return; } - if (active) { - if (target->thread.fp_type == FP_STATE_FPSIMD) { - header->flags |= SVE_PT_REGS_FPSIMD; - } else { - header->flags |= SVE_PT_REGS_SVE; - } - } - header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); header->max_vl = vec_max_vl(type); - header->size = SVE_PT_SIZE(vq, header->flags); + if (active) + header->size = SVE_PT_SIZE(vq, header->flags); + else + header->size = sizeof(header); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); } @@ -820,18 +820,25 @@ static int sve_get_common(struct task_struct *target, unsigned int vq; unsigned long start, end; + if (target == current) + fpsimd_preserve_current_state(); + /* Header */ sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); - if (target == current) - fpsimd_preserve_current_state(); - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + /* + * When the requested vector type is not active, do not present data + * from the other mode to userspace. + */ + if (header.size == sizeof(header)) + return 0; + switch ((header.flags & SVE_PT_REGS_MASK)) { case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); @@ -859,7 +866,7 @@ static int sve_get_common(struct task_struct *target, return membuf_zero(&to, end - start); default: - return 0; + BUILD_BUG(); } } @@ -883,6 +890,9 @@ static int sve_set_common(struct task_struct *target, struct user_sve_header header; unsigned int vq; unsigned long start, end; + bool fpsimd; + + fpsimd_flush_task_state(target); /* Header */ if (count < sizeof(header)) @@ -890,7 +900,16 @@ static int sve_set_common(struct task_struct *target, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) - goto out; + return ret; + + /* + * Streaming SVE data is always stored and presented in SVE format. + * Require the user to provide SVE formatted data for consistency, and + * to avoid the risk that we configure the task into an invalid state. 
+ */ + fpsimd = (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD; + if (fpsimd && type == ARM64_VEC_SME) + return -EINVAL; /* * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by @@ -899,7 +918,21 @@ static int sve_set_common(struct task_struct *target, ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) - goto out; + return ret; + + /* Allocate SME storage if necessary, preserving any existing ZA/ZT state */ + if (type == ARM64_VEC_SME) { + sme_alloc(target, false); + if (!target->thread.sme_state) + return -ENOMEM; + } + + /* Allocate SVE storage if necessary, zeroing any existing SVE state */ + if (!fpsimd) { + sve_alloc(target, true); + if (!target->thread.sve_state) + return -ENOMEM; + } /* * Actual VL set may be different from what the user asked @@ -910,81 +943,47 @@ static int sve_set_common(struct task_struct *target, /* Enter/exit streaming mode */ if (system_supports_sme()) { - u64 old_svcr = target->thread.svcr; - switch (type) { case ARM64_VEC_SVE: target->thread.svcr &= ~SVCR_SM_MASK; + set_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: target->thread.svcr |= SVCR_SM_MASK; - - /* - * Disable traps and ensure there is SME storage but - * preserve any currently set values in ZA/ZT. - */ - sme_alloc(target, false); set_tsk_thread_flag(target, TIF_SME); break; default: WARN_ON_ONCE(1); - ret = -EINVAL; - goto out; + return -EINVAL; } - - /* - * If we switched then invalidate any existing SVE - * state and ensure there's storage. - */ - if (target->thread.svcr != old_svcr) - sve_alloc(target, true); } + /* Always zero V regs, FPSR, and FPCR */ + memset(¤t->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); + /* Registers: FPSIMD-only case */ BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) { - ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, - SVE_PT_FPSIMD_OFFSET); + if (fpsimd) { clear_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_FPSIMD; - goto out; + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, + SVE_PT_FPSIMD_OFFSET); + return ret; } - /* - * Otherwise: no registers or full SVE case. For backwards - * compatibility reasons we treat empty flags as SVE registers. - */ + /* Otherwise: no registers or full SVE case. */ + + target->thread.fp_type = FP_STATE_SVE; /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ - if (count && vq != sve_vq_from_vl(header.vl)) { - ret = -EIO; - goto out; - } - - sve_alloc(target, true); - if (!target->thread.sve_state) { - ret = -ENOMEM; - clear_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_FPSIMD; - goto out; - } - - /* - * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing - * registers unmodified. Only enable SVE if we are - * configuring normal SVE, a system with streaming SVE may not - * have normal SVE. 
- */ - fpsimd_sync_to_sve(target); - if (type == ARM64_VEC_SVE) - set_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_SVE; + if (count && vq != sve_vq_from_vl(header.vl)) + return -EIO; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; @@ -993,7 +992,7 @@ static int sve_set_common(struct task_struct *target, target->thread.sve_state, start, end); if (ret) - goto out; + return ret; start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); @@ -1009,8 +1008,6 @@ static int sve_set_common(struct task_struct *target, &target->thread.uw.fpsimd_state.fpsr, start, end); -out: - fpsimd_flush_task_state(target); return ret; } diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 85104587f849..77c7926a4df6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -169,7 +169,7 @@ static void __init smp_build_mpidr_hash(void) static void __init setup_machine_fdt(phys_addr_t dt_phys) { - int size; + int size = 0; void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; @@ -182,10 +182,10 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys) */ if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" - "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" - "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" - "\nPlease check your bootloader.", - &dt_phys, dt_virt); + "Error: invalid device tree blob: PA=%pa, VA=%px, size=%d bytes\n" + "The dtb must be 8-byte aligned and must not exceed 2 MB in size.\n" + "\nPlease check your bootloader.\n", + &dt_phys, dt_virt, size); /* * Note that in this _really_ early stage we cannot even BUG() diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index a7c37afb4ebe..417140cd399b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -250,6 +250,8 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) ¤t->thread.uw.fpsimd_state; int err; + fpsimd_sync_from_effective_state(current); + /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); @@ -262,37 +264,46 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) return err ? -EFAULT : 0; } -static int restore_fpsimd_context(struct user_ctxs *user) +static int read_fpsimd_context(struct user_fpsimd_state *fpsimd, + struct user_ctxs *user) { - struct user_fpsimd_state fpsimd; - int err = 0; + int err; /* check the size information */ if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ - err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs), - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err); - __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err); + err = __copy_from_user(fpsimd->vregs, &(user->fpsimd->vregs), + sizeof(fpsimd->vregs)); + __get_user_error(fpsimd->fpsr, &(user->fpsimd->fpsr), err); + __get_user_error(fpsimd->fpcr, &(user->fpsimd->fpcr), err); + + return err ? 
-EFAULT : 0; +} + +static int restore_fpsimd_context(struct user_ctxs *user) +{ + struct user_fpsimd_state fpsimd; + int err; + + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); - - return err ? -EFAULT : 0; + fpsimd_update_current_state(&fpsimd); + return 0; } static int preserve_fpmr_context(struct fpmr_context __user *ctx) { int err = 0; - current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR); - __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); @@ -310,7 +321,7 @@ static int restore_fpmr_context(struct user_ctxs *user) __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) - write_sysreg_s(fpmr, SYS_FPMR); + current->thread.uw.fpmr = fpmr; return err; } @@ -372,11 +383,6 @@ static int preserve_sve_context(struct sve_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the SVE state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); @@ -391,6 +397,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) unsigned int vl, vq; struct user_fpsimd_state fpsimd; u16 user_vl, flags; + bool sm; if (user->sve_size < sizeof(*user->sve)) return -EINVAL; @@ -400,7 +407,8 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return err; - if (flags & SVE_SIG_FLAG_SM) { + sm = flags & SVE_SIG_FLAG_SM; + if (sm) { if (!system_supports_sme()) return -EINVAL; @@ -420,28 +428,23 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (user_vl != vl) return -EINVAL; - if (user->sve_size == sizeof(*user->sve)) { - clear_thread_flag(TIF_SVE); - current->thread.svcr &= ~SVCR_SM_MASK; - current->thread.fp_type = FP_STATE_FPSIMD; - goto fpsimd_only; - } + /* + * Non-streaming SVE state may be preserved without an SVE payload, in + * which case the SVE context only has a header with VL==0, and all + * state can be restored from the FPSIMD context. + * + * Streaming SVE state is always preserved with an SVE payload. For + * consistency and robustness, reject restoring streaming SVE state + * without an SVE payload. + */ + if (!sm && user->sve_size == sizeof(*user->sve)) + return restore_fpsimd_context(user); vq = sve_vq_from_vl(vl); if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sve_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sve_alloc(current, true); if (!current->thread.sve_state) { clear_thread_flag(TIF_SVE); @@ -461,19 +464,14 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) set_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_SVE; -fpsimd_only: - /* copy the FP and status/control registers */ - /* restore_sigframe() already checked that user->fpsimd != NULL. 
*/ - err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); - __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; - /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); + /* Merge the FPSIMD registers into the SVE state */ + fpsimd_update_current_state(&fpsimd); - return err ? -EFAULT : 0; + return 0; } #else /* ! CONFIG_ARM64_SVE */ @@ -493,13 +491,12 @@ extern int preserve_sve_context(void __user *ctx); static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) { + u64 tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); int err = 0; - current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); - __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); - __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err); + __put_user_error(tpidr2_el0, &ctx->tpidr2, err); return err; } @@ -541,11 +538,6 @@ static int preserve_za_context(struct za_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the ZA state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, current->thread.sme_state, ZA_SIG_REGS_SIZE(vq)); @@ -580,16 +572,6 @@ static int restore_za_context(struct user_ctxs *user) if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sme_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sme_alloc(current, true); if (!current->thread.sme_state) { current->thread.svcr &= ~SVCR_ZA_MASK; @@ -627,11 +609,6 @@ static int preserve_zt_context(struct zt_context __user *ctx) BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); - /* - * This assumes that the ZT state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, thread_zt_state(¤t->thread), ZT_SIG_REGS_SIZE(1)); @@ -657,16 +634,6 @@ static int restore_zt_context(struct user_ctxs *user) if (nregs != 1) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.zt_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. 
- */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch ZT in thread state */ - err = __copy_from_user(thread_zt_state(¤t->thread), (char __user const *)user->zt + ZT_SIG_REGS_OFFSET, @@ -1017,6 +984,8 @@ static int restore_sigframe(struct pt_regs *regs, */ forget_syscall(regs); + fpsimd_save_and_flush_current_state(); + err |= !valid_user_regs(®s->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); @@ -1507,21 +1476,9 @@ static int setup_return(struct pt_regs *regs, struct ksignal *ksig, /* Signal handlers are invoked with ZA and streaming mode disabled */ if (system_supports_sme()) { - /* - * If we were in streaming mode the saved register - * state was SVE but we will exit SM and use the - * FPSIMD register state - flush the saved FPSIMD - * register state in case it gets loaded. - */ - if (current->thread.svcr & SVCR_SM_MASK) { - memset(¤t->thread.uw.fpsimd_state, 0, - sizeof(current->thread.uw.fpsimd_state)); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - current->thread.svcr &= ~(SVCR_ZA_MASK | - SVCR_SM_MASK); - sme_smstop(); + task_smstop_sm(current); + current->thread.svcr &= ~SVCR_ZA_MASK; + write_sysreg_s(0, SYS_TPIDR2_EL0); } return 0; @@ -1535,7 +1492,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct user_access_state ua_state; int err = 0; - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 81e798b6dada..bb3b526ff43f 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -103,7 +103,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) * Note that this also saves V16-31, which aren't visible * in AArch32. */ - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); /* Place structure header on the stack */ __put_user_error(magic, &frame->magic, err); @@ -169,14 +169,17 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) fpsimd.fpsr = fpscr & VFP_FPSCR_STAT_MASK; fpsimd.fpcr = fpscr & VFP_FPSCR_CTRL_MASK; + if (err) + return -EFAULT; + /* * We don't need to touch the exception register, so * reload the hardware state. */ - if (!err) - fpsimd_update_current_state(&fpsimd); + fpsimd_save_and_flush_current_state(); + current->thread.uw.fpsimd_state = fpsimd; - return err ? 
-EFAULT : 0; + return 0; } static int compat_restore_sigframe(struct pt_regs *regs, diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 529cff825531..9bfa5c944379 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1118,7 +1118,7 @@ static struct break_hook kasan_break_hook = { #ifdef CONFIG_UBSAN_TRAP static int ubsan_handler(struct pt_regs *regs, unsigned long esr) { - die(report_ubsan_failure(regs, esr & UBSAN_BRK_MASK), regs, esr); + die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr); return DBG_HOOK_HANDLED; } @@ -1145,7 +1145,7 @@ int __init early_brk64(unsigned long addr, unsigned long esr, return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif #ifdef CONFIG_UBSAN_TRAP - if ((esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM) + if (esr_is_ubsan_brk(esr)) return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif return bug_handler(regs, esr) != DBG_HOOK_HANDLED; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index e73326bd3ff7..ad6133b89e7a 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,7 +13,7 @@ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; -#define HYPERVISOR_DATA_SECTIONS \ +#define HYPERVISOR_RODATA_SECTIONS \ HYP_SECTION_NAME(.rodata) : { \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_start = .; \ @@ -23,6 +23,15 @@ __hyp_rodata_end = .; \ } +#define HYPERVISOR_DATA_SECTION \ + HYP_SECTION_NAME(.data) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_start = .; \ + *(HYP_SECTION_NAME(.data)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_end = .; \ + } + #define HYPERVISOR_PERCPU_SECTION \ . = ALIGN(PAGE_SIZE); \ HYP_SECTION_NAME(.data..percpu) : { \ @@ -51,7 +60,8 @@ #define SBSS_ALIGN PAGE_SIZE #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE -#define HYPERVISOR_DATA_SECTIONS +#define HYPERVISOR_RODATA_SECTIONS +#define HYPERVISOR_DATA_SECTION #define HYPERVISOR_PERCPU_SECTION #define HYPERVISOR_RELOC_SECTION #define SBSS_ALIGN 0 @@ -190,7 +200,7 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) - HYPERVISOR_DATA_SECTIONS + HYPERVISOR_RODATA_SECTIONS .got : { *(.got) } /* @@ -249,9 +259,9 @@ SECTIONS __inittext_end = .; __initdata_begin = .; - init_idmap_pg_dir = .; + __pi_init_idmap_pg_dir = .; . += INIT_IDMAP_DIR_SIZE; - init_idmap_pg_end = .; + __pi_init_idmap_pg_end = .; .init.data : { INIT_DATA @@ -295,6 +305,8 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + HYPERVISOR_DATA_SECTION + /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback @@ -319,11 +331,12 @@ SECTIONS /* start of zero-init region */ BSS_SECTION(SBSS_ALIGN, 0, 0) + __pi___bss_start = __bss_start; . = ALIGN(PAGE_SIZE); - init_pg_dir = .; + __pi_init_pg_dir = .; . += INIT_DIR_SIZE; - init_pg_end = .; + __pi_init_pg_end = .; /* end of zero-init region */ . += SZ_4K; /* stack for the early C runtime */ @@ -332,6 +345,7 @@ SECTIONS . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. 
- __initdata_begin); _end = .; + __pi__end = .; STABS_DEBUG DWARF_DEBUG diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 209bc76263f1..7c329e01c557 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -14,7 +14,7 @@ CFLAGS_sys_regs.o += -Wno-override-init CFLAGS_handle_exit.o += -Wno-override-init kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ - inject_fault.o va_layout.o handle_exit.o \ + inject_fault.o va_layout.o handle_exit.o config.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 19ca57def629..36cfcffb40d8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -368,6 +368,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_EL1_32BIT: r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1); break; + case KVM_CAP_ARM_EL2: + r = cpus_have_final_cap(ARM64_HAS_NESTED_VIRT); + break; + case KVM_CAP_ARM_EL2_E2H0: + r = cpus_have_final_cap(ARM64_HAS_HCR_NV1); + break; case KVM_CAP_GUEST_DEBUG_HW_BPS: r = get_num_brps(); break; @@ -843,6 +849,10 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; if (vcpu_has_nv(vcpu)) { + ret = kvm_vcpu_allocate_vncr_tlb(vcpu); + if (ret) + return ret; + ret = kvm_vgic_vcpu_nv_init(vcpu); if (ret) return ret; @@ -2450,6 +2460,19 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + /* Propagate the FGT state to the the nVHE side */ + kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks; + kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks; + kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks; + kvm_nvhe_sym(hdfgrtr_masks) = hdfgrtr_masks; + kvm_nvhe_sym(hdfgwtr_masks) = hdfgwtr_masks; + kvm_nvhe_sym(hafgrtr_masks) = hafgrtr_masks; + kvm_nvhe_sym(hfgrtr2_masks) = hfgrtr2_masks; + kvm_nvhe_sym(hfgwtr2_masks) = hfgwtr2_masks; + kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks; + kvm_nvhe_sym(hdfgrtr2_masks)= hdfgrtr2_masks; + kvm_nvhe_sym(hdfgwtr2_masks)= hdfgwtr2_masks; + /* * Flush entire BSS since part of its data containing init symbols is read * while the MMU is off. 
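The kvm_hyp_init_symbols() hunk above propagates the per-register fine-grained trap mask tables (hfgrtr_masks and friends) to nVHE-visible symbols, so the EL2 object reuses values the host has already computed instead of recomputing them with data it cannot see. A minimal sketch of the underlying idea follows, assuming a mask structure with res0/res1 fields like the one the patch dereferences; the struct layout and sanitise_fgt_val() are illustrative names for this sketch, not the kernel's API:

struct fgt_masks_sketch {
	u64 res0;	/* bits with no backing feature: treated as zero */
	u64 res1;	/* bits that must be treated as one */
};

/* Both the host and the nVHE hypervisor apply the same rule to a value. */
static inline u64 sanitise_fgt_val(const struct fgt_masks_sketch *m, u64 val)
{
	return (val & ~m->res0) | m->res1;
}

Sharing one table keeps the two worlds from ever disagreeing on which trap bits are meaningful for a given guest.
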
@@ -2604,6 +2627,13 @@ static int __init init_hyp_mode(void) goto out_err; } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start), + kvm_ksym_ref(__hyp_data_end), PAGE_HYP); + if (err) { + kvm_err("Cannot map .hyp.data section\n"); + goto out_err; + } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); if (err) { diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index f74a66ce3064..a25be111cd8f 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -10,61 +10,11 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> -enum trans_regime { - TR_EL10, - TR_EL20, - TR_EL2, -}; - -struct s1_walk_info { - u64 baddr; - enum trans_regime regime; - unsigned int max_oa_bits; - unsigned int pgshift; - unsigned int txsz; - int sl; - bool hpd; - bool e0poe; - bool poe; - bool pan; - bool be; - bool s2; -}; - -struct s1_walk_result { - union { - struct { - u64 desc; - u64 pa; - s8 level; - u8 APTable; - bool UXNTable; - bool PXNTable; - bool uwxn; - bool uov; - bool ur; - bool uw; - bool ux; - bool pwxn; - bool pov; - bool pr; - bool pw; - bool px; - }; - struct { - u8 fst; - bool ptw; - bool s2; - }; - }; - bool failed; -}; - -static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) { wr->fst = fst; - wr->ptw = ptw; - wr->s2 = s2; + wr->ptw = s1ptw; + wr->s2 = s1ptw; wr->failed = true; } @@ -145,20 +95,15 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) } } -static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; unsigned int stride, x; - bool va55, tbi, lva, as_el0; + bool va55, tbi, lva; hcr = __vcpu_sys_reg(vcpu, HCR_EL2); - wi->regime = compute_translation_regime(vcpu, op); - as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); - wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && - (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); - va55 = va & BIT(55); if (wi->regime == TR_EL2 && va55) @@ -319,7 +264,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, /* R_BNDVG and following statements */ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && - as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + wi->as_el0 && (tcr & (va55 ? 
TCR_E0PD1 : TCR_E0PD0))) goto transfault_l0; /* AArch64.S1StartLevel() */ @@ -345,11 +290,11 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, return 0; addrsz: /* Address Size Fault level 0 */ - fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); return -EFAULT; transfault_l0: /* Translation Fault level 0 */ - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false); return -EFAULT; } @@ -380,13 +325,13 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (ret) { fail_s1_walk(wr, (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, - true, true); + true); return ret; } if (!kvm_s2_trans_readable(&s2_trans)) { fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), - true, true); + true); return -EPERM; } @@ -396,8 +341,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); if (ret) { - fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), - true, false); + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); return ret; } @@ -457,6 +401,11 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (check_output_size(desc & GENMASK(47, va_bottom), wi)) goto addrsz; + if (!(desc & PTE_AF)) { + fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); + return -EACCES; + } + va_bottom += contiguous_bit_shift(desc, wi, level); wr->failed = false; @@ -465,13 +414,40 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->pa = desc & GENMASK(47, va_bottom); wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); + if (wr->nG) { + u64 asid_ttbr, tcr; + + switch (wi->regime) { + case TR_EL10: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL20: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + wr->asid &= GENMASK(7, 0); + } + return 0; addrsz: - fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false); return -EINVAL; transfault: - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false); return -ENOENT; } @@ -488,7 +464,6 @@ struct mmu_config { u64 sctlr; u64 vttbr; u64 vtcr; - u64 hcr; }; static void __mmu_config_save(struct mmu_config *config) @@ -511,13 +486,10 @@ static void __mmu_config_save(struct mmu_config *config) config->sctlr = read_sysreg_el1(SYS_SCTLR); config->vttbr = read_sysreg(vttbr_el2); config->vtcr = read_sysreg(vtcr_el2); - config->hcr = read_sysreg(hcr_el2); } static void __mmu_config_restore(struct mmu_config *config) { - write_sysreg(config->hcr, hcr_el2); - /* * ARM errata 1165522 and 1530923 require TGE to be 1 before * we update the guest state. 
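The walk_s1() hunk above now records whether a translation is non-global (nG) and, if so, which ASID tags it: TCR_ELx.A1 selects whether the ASID comes from TTBR1 or TTBR0, and the value is truncated to 8 bits unless both the CPU (ID_AA64MMFR0_EL1.ASIDBITS == 16) and TCR.AS opt in to 16-bit ASIDs. A standalone sketch of that selection, using plain C types and an invented helper name; the bit positions follow the Arm ARM:

#include <stdbool.h>
#include <stdint.h>

#define TCR_A1	(1ULL << 22)	/* ASID comes from TTBR1 when set */
#define TCR_AS	(1ULL << 36)	/* 16-bit ASIDs when set */

static uint16_t asid_for_walk(uint64_t tcr, uint64_t ttbr0, uint64_t ttbr1,
			      bool cpu_has_16bit_asid)
{
	/* TTBRn_ELx[63:48] holds the ASID used for this walk. */
	uint64_t ttbr = (tcr & TCR_A1) ? ttbr1 : ttbr0;
	uint16_t asid = ttbr >> 48;

	/* Keep 16 bits only if both the CPU and TCR ask for them. */
	if (!cpu_has_16bit_asid || !(tcr & TCR_AS))
		asid &= 0xff;

	return asid;
}
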
@@ -1155,7 +1127,12 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) bool perm_fail = false; int ret, idx; - ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + wi.regime = compute_translation_regime(vcpu, op); + wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + + ret = setup_s1_walk(vcpu, &wi, &wr, vaddr); if (ret) goto compute_par; @@ -1198,7 +1175,7 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) } if (perm_fail) - fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); compute_par: return compute_par_s1(vcpu, &wr, wi.regime); @@ -1210,7 +1187,8 @@ compute_par: * If the translation is unsuccessful, the value may only contain * PAR_EL1.F, and cannot be taken at face value. It isn't an * indication of the translation having failed, only that the fast - * path did not succeed, *unless* it indicates a S1 permission fault. + * path did not succeed, *unless* it indicates a S1 permission or + * access fault. */ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { @@ -1266,8 +1244,8 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) __load_stage2(mmu, mmu->arch); skip_mmu_switch: - /* Clear TGE, enable S2 translation, we're rolling */ - write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + /* Temporarily switch back to guest context */ + write_sysreg_hcr(vcpu->arch.hcr_el2); isb(); switch (op) { @@ -1299,6 +1277,8 @@ skip_mmu_switch: if (!fail) par = read_sysreg_par(); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) __mmu_config_restore(&config); @@ -1313,19 +1293,29 @@ static bool par_check_s1_perm_fault(u64 par) !(par & SYS_PAR_EL1_S)); } +static bool par_check_s1_access_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS && + !(par & SYS_PAR_EL1_S)); +} + void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); /* - * If PAR_EL1 reports that AT failed on a S1 permission fault, we - * know for sure that the PTW was able to walk the S1 tables and - * there's nothing else to do. + * If PAR_EL1 reports that AT failed on a S1 permission or access + * fault, we know for sure that the PTW was able to walk the S1 + * tables and there's nothing else to do. * * If AT failed for any other reason, then we must walk the guest S1 * to emulate the instruction. */ - if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + if ((par & SYS_PAR_EL1_F) && + !par_check_s1_perm_fault(par) && + !par_check_s1_access_fault(par)) par = handle_at_slow(vcpu, op, vaddr); vcpu_write_sys_reg(vcpu, par, PAR_EL1); @@ -1350,7 +1340,7 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) if (!vcpu_el2_e2h_is_set(vcpu)) val |= HCR_NV | HCR_NV1; - write_sysreg(val, hcr_el2); + write_sysreg_hcr(val); isb(); par = SYS_PAR_EL1_F; @@ -1375,7 +1365,7 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) if (!fail) par = read_sysreg_par(); - write_sysreg(hcr, hcr_el2); + write_sysreg_hcr(hcr); isb(); } @@ -1444,3 +1434,31 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) par = compute_par_s12(vcpu, par, &out); vcpu_write_sys_reg(vcpu, par, PAR_EL1); } + +/* + * Translate a VA for a given EL in a given translation regime, with + * or without PAN. 
This requires wi->{regime, as_el0, pan} to be + * set. The rest of the wi and wr should be 0-initialised. + */ +int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + int ret; + + ret = setup_s1_walk(vcpu, wi, wr, va); + if (ret) + return ret; + + if (wr->level == S1_MMU_DISABLED) { + wr->ur = wr->uw = wr->ux = true; + wr->pr = wr->pw = wr->px = true; + } else { + ret = walk_s1(vcpu, wi, wr, va); + if (ret) + return ret; + + compute_s1_permissions(vcpu, wi, wr); + } + + return 0; +} diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c new file mode 100644 index 000000000000..54911a93b001 --- /dev/null +++ b/arch/arm64/kvm/config.c @@ -0,0 +1,1085 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Marc Zyngier <maz@kernel.org> + */ + +#include <linux/kvm_host.h> +#include <asm/sysreg.h> + +struct reg_bits_to_feat_map { + u64 bits; + +#define NEVER_FGU BIT(0) /* Can trap, but never UNDEF */ +#define CALL_FUNC BIT(1) /* Needs to evaluate tons of crap */ +#define FIXED_VALUE BIT(2) /* RAZ/WI or RAO/WI in KVM */ + unsigned long flags; + + union { + struct { + u8 regidx; + u8 shift; + u8 width; + bool sign; + s8 lo_lim; + }; + bool (*match)(struct kvm *); + bool (*fval)(struct kvm *, u64 *); + }; +}; + +#define __NEEDS_FEAT_3(m, f, id, fld, lim) \ + { \ + .bits = (m), \ + .flags = (f), \ + .regidx = IDREG_IDX(SYS_ ## id), \ + .shift = id ##_## fld ## _SHIFT, \ + .width = id ##_## fld ## _WIDTH, \ + .sign = id ##_## fld ## _SIGNED, \ + .lo_lim = id ##_## fld ##_## lim \ + } + +#define __NEEDS_FEAT_2(m, f, fun, dummy) \ + { \ + .bits = (m), \ + .flags = (f) | CALL_FUNC, \ + .fval = (fun), \ + } + +#define __NEEDS_FEAT_1(m, f, fun) \ + { \ + .bits = (m), \ + .flags = (f) | CALL_FUNC, \ + .match = (fun), \ + } + +#define NEEDS_FEAT_FLAG(m, f, ...) \ + CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, __VA_ARGS__) + +#define NEEDS_FEAT_FIXED(m, ...) \ + NEEDS_FEAT_FLAG(m, FIXED_VALUE, __VA_ARGS__, 0) + +#define NEEDS_FEAT(m, ...) 
NEEDS_FEAT_FLAG(m, 0, __VA_ARGS__) + +#define FEAT_SPE ID_AA64DFR0_EL1, PMSVer, IMP +#define FEAT_SPE_FnE ID_AA64DFR0_EL1, PMSVer, V1P2 +#define FEAT_BRBE ID_AA64DFR0_EL1, BRBE, IMP +#define FEAT_TRC_SR ID_AA64DFR0_EL1, TraceVer, IMP +#define FEAT_PMUv3 ID_AA64DFR0_EL1, PMUVer, IMP +#define FEAT_PMUv3p9 ID_AA64DFR0_EL1, PMUVer, V3P9 +#define FEAT_TRBE ID_AA64DFR0_EL1, TraceBuffer, IMP +#define FEAT_TRBEv1p1 ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1 +#define FEAT_DoubleLock ID_AA64DFR0_EL1, DoubleLock, IMP +#define FEAT_TRF ID_AA64DFR0_EL1, TraceFilt, IMP +#define FEAT_AA32EL0 ID_AA64PFR0_EL1, EL0, AARCH32 +#define FEAT_AA32EL1 ID_AA64PFR0_EL1, EL1, AARCH32 +#define FEAT_AA64EL1 ID_AA64PFR0_EL1, EL1, IMP +#define FEAT_AA64EL3 ID_AA64PFR0_EL1, EL3, IMP +#define FEAT_AIE ID_AA64MMFR3_EL1, AIE, IMP +#define FEAT_S2POE ID_AA64MMFR3_EL1, S2POE, IMP +#define FEAT_S1POE ID_AA64MMFR3_EL1, S1POE, IMP +#define FEAT_S1PIE ID_AA64MMFR3_EL1, S1PIE, IMP +#define FEAT_THE ID_AA64PFR1_EL1, THE, IMP +#define FEAT_SME ID_AA64PFR1_EL1, SME, IMP +#define FEAT_GCS ID_AA64PFR1_EL1, GCS, IMP +#define FEAT_LS64 ID_AA64ISAR1_EL1, LS64, LS64 +#define FEAT_LS64_V ID_AA64ISAR1_EL1, LS64, LS64_V +#define FEAT_LS64_ACCDATA ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA +#define FEAT_RAS ID_AA64PFR0_EL1, RAS, IMP +#define FEAT_RASv2 ID_AA64PFR0_EL1, RAS, V2 +#define FEAT_GICv3 ID_AA64PFR0_EL1, GIC, IMP +#define FEAT_LOR ID_AA64MMFR1_EL1, LO, IMP +#define FEAT_SPEv1p4 ID_AA64DFR0_EL1, PMSVer, V1P4 +#define FEAT_SPEv1p5 ID_AA64DFR0_EL1, PMSVer, V1P5 +#define FEAT_ATS1A ID_AA64ISAR2_EL1, ATS1A, IMP +#define FEAT_SPECRES2 ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX +#define FEAT_SPECRES ID_AA64ISAR1_EL1, SPECRES, IMP +#define FEAT_TLBIRANGE ID_AA64ISAR0_EL1, TLB, RANGE +#define FEAT_TLBIOS ID_AA64ISAR0_EL1, TLB, OS +#define FEAT_PAN2 ID_AA64MMFR1_EL1, PAN, PAN2 +#define FEAT_DPB2 ID_AA64ISAR1_EL1, DPB, DPB2 +#define FEAT_AMUv1 ID_AA64PFR0_EL1, AMU, IMP +#define FEAT_AMUv1p1 ID_AA64PFR0_EL1, AMU, V1P1 +#define FEAT_CMOW ID_AA64MMFR1_EL1, CMOW, IMP +#define FEAT_D128 ID_AA64MMFR3_EL1, D128, IMP +#define FEAT_DoubleFault2 ID_AA64PFR1_EL1, DF2, IMP +#define FEAT_FPMR ID_AA64PFR2_EL1, FPMR, IMP +#define FEAT_MOPS ID_AA64ISAR2_EL1, MOPS, IMP +#define FEAT_NMI ID_AA64PFR1_EL1, NMI, IMP +#define FEAT_SCTLR2 ID_AA64MMFR3_EL1, SCTLRX, IMP +#define FEAT_SYSREG128 ID_AA64ISAR2_EL1, SYSREG_128, IMP +#define FEAT_TCR2 ID_AA64MMFR3_EL1, TCRX, IMP +#define FEAT_XS ID_AA64ISAR1_EL1, XS, IMP +#define FEAT_EVT ID_AA64MMFR2_EL1, EVT, IMP +#define FEAT_EVT_TTLBxS ID_AA64MMFR2_EL1, EVT, TTLBxS +#define FEAT_MTE2 ID_AA64PFR1_EL1, MTE, MTE2 +#define FEAT_RME ID_AA64PFR0_EL1, RME, IMP +#define FEAT_MPAM ID_AA64PFR0_EL1, MPAM, 1 +#define FEAT_S2FWB ID_AA64MMFR2_EL1, FWB, IMP +#define FEAT_TME ID_AA64ISAR0_EL1, TME, IMP +#define FEAT_TWED ID_AA64MMFR1_EL1, TWED, IMP +#define FEAT_E2H0 ID_AA64MMFR4_EL1, E2H0, IMP +#define FEAT_SRMASK ID_AA64MMFR4_EL1, SRMASK, IMP +#define FEAT_PoPS ID_AA64MMFR4_EL1, PoPS, IMP +#define FEAT_PFAR ID_AA64PFR1_EL1, PFAR, IMP +#define FEAT_Debugv8p9 ID_AA64DFR0_EL1, PMUVer, V3P9 +#define FEAT_PMUv3_SS ID_AA64DFR0_EL1, PMSS, IMP +#define FEAT_SEBEP ID_AA64DFR0_EL1, SEBEP, IMP +#define FEAT_EBEP ID_AA64DFR1_EL1, EBEP, IMP +#define FEAT_ITE ID_AA64DFR1_EL1, ITE, IMP +#define FEAT_PMUv3_ICNTR ID_AA64DFR1_EL1, PMICNTR, IMP +#define FEAT_SPMU ID_AA64DFR1_EL1, SPMU, IMP +#define FEAT_SPE_nVM ID_AA64DFR2_EL1, SPE_nVM, IMP +#define FEAT_STEP2 ID_AA64DFR2_EL1, STEP, IMP + +static bool not_feat_aa64el3(struct kvm *kvm) +{ + return !kvm_has_feat(kvm, 
FEAT_AA64EL3); +} + +static bool feat_nv2(struct kvm *kvm) +{ + return ((kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) && + kvm_has_feat_enum(kvm, ID_AA64MMFR2_EL1, NV, NI)) || + kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2)); +} + +static bool feat_nv2_e2h0_ni(struct kvm *kvm) +{ + return feat_nv2(kvm) && !kvm_has_feat(kvm, FEAT_E2H0); +} + +static bool feat_rasv1p1(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) || + (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) && + kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1))); +} + +static bool feat_csv2_2_csv2_1p2(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) || + (kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2) && + kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, CSV2, IMP))); +} + +static bool feat_pauth(struct kvm *kvm) +{ + return kvm_has_pauth(kvm, PAuth); +} + +static bool feat_pauth_lr(struct kvm *kvm) +{ + return kvm_has_pauth(kvm, PAuth_LR); +} + +static bool feat_aderr(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, FEAT_ADERR) && + kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SDERR, FEAT_ADERR)); +} + +static bool feat_anerr(struct kvm *kvm) +{ + return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ANERR, FEAT_ANERR) && + kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SNERR, FEAT_ANERR)); +} + +static bool feat_sme_smps(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports SME -- this really should + * look at the guest's view of SMIDR_EL1. Funnily enough, this + * is not captured in the JSON file, but only as a note in the + * ARM ARM. + */ + return (kvm_has_feat(kvm, FEAT_SME) && + (read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS)); +} + +static bool feat_spe_fds(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports SPE -- this really should + * look at the guest's view of PMSIDR_EL1. + */ + return (kvm_has_feat(kvm, FEAT_SPEv1p4) && + (read_sysreg_s(SYS_PMSIDR_EL1) & PMSIDR_EL1_FDS)); +} + +static bool feat_trbe_mpam(struct kvm *kvm) +{ + /* + * Revists this if KVM ever supports both MPAM and TRBE -- + * this really should look at the guest's view of TRBIDR_EL1. 
+ */ + return (kvm_has_feat(kvm, FEAT_TRBE) && + kvm_has_feat(kvm, FEAT_MPAM) && + (read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_MPAM)); +} + +static bool feat_ebep_pmuv3_ss(struct kvm *kvm) +{ + return kvm_has_feat(kvm, FEAT_EBEP) || kvm_has_feat(kvm, FEAT_PMUv3_SS); +} + +static bool compute_hcr_rw(struct kvm *kvm, u64 *bits) +{ + /* This is purely academic: AArch32 and NV are mutually exclusive */ + if (bits) { + if (kvm_has_feat(kvm, FEAT_AA32EL1)) + *bits &= ~HCR_EL2_RW; + else + *bits |= HCR_EL2_RW; + } + + return true; +} + +static bool compute_hcr_e2h(struct kvm *kvm, u64 *bits) +{ + if (bits) { + if (kvm_has_feat(kvm, FEAT_E2H0)) + *bits &= ~HCR_EL2_E2H; + else + *bits |= HCR_EL2_E2H; + } + + return true; +} + +static const struct reg_bits_to_feat_map hfgrtr_feat_map[] = { + NEEDS_FEAT(HFGRTR_EL2_nAMAIR2_EL1 | + HFGRTR_EL2_nMAIR2_EL1, + FEAT_AIE), + NEEDS_FEAT(HFGRTR_EL2_nS2POR_EL1, FEAT_S2POE), + NEEDS_FEAT(HFGRTR_EL2_nPOR_EL1 | + HFGRTR_EL2_nPOR_EL0, + FEAT_S1POE), + NEEDS_FEAT(HFGRTR_EL2_nPIR_EL1 | + HFGRTR_EL2_nPIRE0_EL1, + FEAT_S1PIE), + NEEDS_FEAT(HFGRTR_EL2_nRCWMASK_EL1, FEAT_THE), + NEEDS_FEAT(HFGRTR_EL2_nTPIDR2_EL0 | + HFGRTR_EL2_nSMPRI_EL1, + FEAT_SME), + NEEDS_FEAT(HFGRTR_EL2_nGCS_EL1 | + HFGRTR_EL2_nGCS_EL0, + FEAT_GCS), + NEEDS_FEAT(HFGRTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA), + NEEDS_FEAT(HFGRTR_EL2_ERXADDR_EL1 | + HFGRTR_EL2_ERXMISCn_EL1 | + HFGRTR_EL2_ERXSTATUS_EL1 | + HFGRTR_EL2_ERXCTLR_EL1 | + HFGRTR_EL2_ERXFR_EL1 | + HFGRTR_EL2_ERRSELR_EL1 | + HFGRTR_EL2_ERRIDR_EL1, + FEAT_RAS), + NEEDS_FEAT(HFGRTR_EL2_ERXPFGCDN_EL1 | + HFGRTR_EL2_ERXPFGCTL_EL1 | + HFGRTR_EL2_ERXPFGF_EL1, + feat_rasv1p1), + NEEDS_FEAT(HFGRTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3), + NEEDS_FEAT(HFGRTR_EL2_SCXTNUM_EL0 | + HFGRTR_EL2_SCXTNUM_EL1, + feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HFGRTR_EL2_LORSA_EL1 | + HFGRTR_EL2_LORN_EL1 | + HFGRTR_EL2_LORID_EL1 | + HFGRTR_EL2_LOREA_EL1 | + HFGRTR_EL2_LORC_EL1, + FEAT_LOR), + NEEDS_FEAT(HFGRTR_EL2_APIBKey | + HFGRTR_EL2_APIAKey | + HFGRTR_EL2_APGAKey | + HFGRTR_EL2_APDBKey | + HFGRTR_EL2_APDAKey, + feat_pauth), + NEEDS_FEAT_FLAG(HFGRTR_EL2_VBAR_EL1 | + HFGRTR_EL2_TTBR1_EL1 | + HFGRTR_EL2_TTBR0_EL1 | + HFGRTR_EL2_TPIDR_EL0 | + HFGRTR_EL2_TPIDRRO_EL0 | + HFGRTR_EL2_TPIDR_EL1 | + HFGRTR_EL2_TCR_EL1 | + HFGRTR_EL2_SCTLR_EL1 | + HFGRTR_EL2_REVIDR_EL1 | + HFGRTR_EL2_PAR_EL1 | + HFGRTR_EL2_MPIDR_EL1 | + HFGRTR_EL2_MIDR_EL1 | + HFGRTR_EL2_MAIR_EL1 | + HFGRTR_EL2_ISR_EL1 | + HFGRTR_EL2_FAR_EL1 | + HFGRTR_EL2_ESR_EL1 | + HFGRTR_EL2_DCZID_EL0 | + HFGRTR_EL2_CTR_EL0 | + HFGRTR_EL2_CSSELR_EL1 | + HFGRTR_EL2_CPACR_EL1 | + HFGRTR_EL2_CONTEXTIDR_EL1| + HFGRTR_EL2_CLIDR_EL1 | + HFGRTR_EL2_CCSIDR_EL1 | + HFGRTR_EL2_AMAIR_EL1 | + HFGRTR_EL2_AIDR_EL1 | + HFGRTR_EL2_AFSR1_EL1 | + HFGRTR_EL2_AFSR0_EL1, + NEVER_FGU, FEAT_AA64EL1), +}; + +static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = { + NEEDS_FEAT(HFGWTR_EL2_nAMAIR2_EL1 | + HFGWTR_EL2_nMAIR2_EL1, + FEAT_AIE), + NEEDS_FEAT(HFGWTR_EL2_nS2POR_EL1, FEAT_S2POE), + NEEDS_FEAT(HFGWTR_EL2_nPOR_EL1 | + HFGWTR_EL2_nPOR_EL0, + FEAT_S1POE), + NEEDS_FEAT(HFGWTR_EL2_nPIR_EL1 | + HFGWTR_EL2_nPIRE0_EL1, + FEAT_S1PIE), + NEEDS_FEAT(HFGWTR_EL2_nRCWMASK_EL1, FEAT_THE), + NEEDS_FEAT(HFGWTR_EL2_nTPIDR2_EL0 | + HFGWTR_EL2_nSMPRI_EL1, + FEAT_SME), + NEEDS_FEAT(HFGWTR_EL2_nGCS_EL1 | + HFGWTR_EL2_nGCS_EL0, + FEAT_GCS), + NEEDS_FEAT(HFGWTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA), + NEEDS_FEAT(HFGWTR_EL2_ERXADDR_EL1 | + HFGWTR_EL2_ERXMISCn_EL1 | + HFGWTR_EL2_ERXSTATUS_EL1 | + HFGWTR_EL2_ERXCTLR_EL1 | + HFGWTR_EL2_ERRSELR_EL1, + FEAT_RAS), + 
NEEDS_FEAT(HFGWTR_EL2_ERXPFGCDN_EL1 | + HFGWTR_EL2_ERXPFGCTL_EL1, + feat_rasv1p1), + NEEDS_FEAT(HFGWTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3), + NEEDS_FEAT(HFGWTR_EL2_SCXTNUM_EL0 | + HFGWTR_EL2_SCXTNUM_EL1, + feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HFGWTR_EL2_LORSA_EL1 | + HFGWTR_EL2_LORN_EL1 | + HFGWTR_EL2_LOREA_EL1 | + HFGWTR_EL2_LORC_EL1, + FEAT_LOR), + NEEDS_FEAT(HFGWTR_EL2_APIBKey | + HFGWTR_EL2_APIAKey | + HFGWTR_EL2_APGAKey | + HFGWTR_EL2_APDBKey | + HFGWTR_EL2_APDAKey, + feat_pauth), + NEEDS_FEAT_FLAG(HFGWTR_EL2_VBAR_EL1 | + HFGWTR_EL2_TTBR1_EL1 | + HFGWTR_EL2_TTBR0_EL1 | + HFGWTR_EL2_TPIDR_EL0 | + HFGWTR_EL2_TPIDRRO_EL0 | + HFGWTR_EL2_TPIDR_EL1 | + HFGWTR_EL2_TCR_EL1 | + HFGWTR_EL2_SCTLR_EL1 | + HFGWTR_EL2_PAR_EL1 | + HFGWTR_EL2_MAIR_EL1 | + HFGWTR_EL2_FAR_EL1 | + HFGWTR_EL2_ESR_EL1 | + HFGWTR_EL2_CSSELR_EL1 | + HFGWTR_EL2_CPACR_EL1 | + HFGWTR_EL2_CONTEXTIDR_EL1| + HFGWTR_EL2_AMAIR_EL1 | + HFGWTR_EL2_AFSR1_EL1 | + HFGWTR_EL2_AFSR0_EL1, + NEVER_FGU, FEAT_AA64EL1), +}; + +static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = { + NEEDS_FEAT(HDFGRTR_EL2_PMBIDR_EL1 | + HDFGRTR_EL2_PMSLATFR_EL1 | + HDFGRTR_EL2_PMSIRR_EL1 | + HDFGRTR_EL2_PMSIDR_EL1 | + HDFGRTR_EL2_PMSICR_EL1 | + HDFGRTR_EL2_PMSFCR_EL1 | + HDFGRTR_EL2_PMSEVFR_EL1 | + HDFGRTR_EL2_PMSCR_EL1 | + HDFGRTR_EL2_PMBSR_EL1 | + HDFGRTR_EL2_PMBPTR_EL1 | + HDFGRTR_EL2_PMBLIMITR_EL1, + FEAT_SPE), + NEEDS_FEAT(HDFGRTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE), + NEEDS_FEAT(HDFGRTR_EL2_nBRBDATA | + HDFGRTR_EL2_nBRBCTL | + HDFGRTR_EL2_nBRBIDR, + FEAT_BRBE), + NEEDS_FEAT(HDFGRTR_EL2_TRCVICTLR | + HDFGRTR_EL2_TRCSTATR | + HDFGRTR_EL2_TRCSSCSRn | + HDFGRTR_EL2_TRCSEQSTR | + HDFGRTR_EL2_TRCPRGCTLR | + HDFGRTR_EL2_TRCOSLSR | + HDFGRTR_EL2_TRCIMSPECn | + HDFGRTR_EL2_TRCID | + HDFGRTR_EL2_TRCCNTVRn | + HDFGRTR_EL2_TRCCLAIM | + HDFGRTR_EL2_TRCAUXCTLR | + HDFGRTR_EL2_TRCAUTHSTATUS | + HDFGRTR_EL2_TRC, + FEAT_TRC_SR), + NEEDS_FEAT(HDFGRTR_EL2_PMCEIDn_EL0 | + HDFGRTR_EL2_PMUSERENR_EL0 | + HDFGRTR_EL2_PMMIR_EL1 | + HDFGRTR_EL2_PMSELR_EL0 | + HDFGRTR_EL2_PMOVS | + HDFGRTR_EL2_PMINTEN | + HDFGRTR_EL2_PMCNTEN | + HDFGRTR_EL2_PMCCNTR_EL0 | + HDFGRTR_EL2_PMCCFILTR_EL0 | + HDFGRTR_EL2_PMEVTYPERn_EL0 | + HDFGRTR_EL2_PMEVCNTRn_EL0, + FEAT_PMUv3), + NEEDS_FEAT(HDFGRTR_EL2_TRBTRG_EL1 | + HDFGRTR_EL2_TRBSR_EL1 | + HDFGRTR_EL2_TRBPTR_EL1 | + HDFGRTR_EL2_TRBMAR_EL1 | + HDFGRTR_EL2_TRBLIMITR_EL1 | + HDFGRTR_EL2_TRBIDR_EL1 | + HDFGRTR_EL2_TRBBASER_EL1, + FEAT_TRBE), + NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSDLR_EL1, NEVER_FGU, + FEAT_DoubleLock), + NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSECCR_EL1 | + HDFGRTR_EL2_OSLSR_EL1 | + HDFGRTR_EL2_DBGPRCR_EL1 | + HDFGRTR_EL2_DBGAUTHSTATUS_EL1| + HDFGRTR_EL2_DBGCLAIM | + HDFGRTR_EL2_MDSCR_EL1 | + HDFGRTR_EL2_DBGWVRn_EL1 | + HDFGRTR_EL2_DBGWCRn_EL1 | + HDFGRTR_EL2_DBGBVRn_EL1 | + HDFGRTR_EL2_DBGBCRn_EL1, + NEVER_FGU, FEAT_AA64EL1) +}; + +static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = { + NEEDS_FEAT(HDFGWTR_EL2_PMSLATFR_EL1 | + HDFGWTR_EL2_PMSIRR_EL1 | + HDFGWTR_EL2_PMSICR_EL1 | + HDFGWTR_EL2_PMSFCR_EL1 | + HDFGWTR_EL2_PMSEVFR_EL1 | + HDFGWTR_EL2_PMSCR_EL1 | + HDFGWTR_EL2_PMBSR_EL1 | + HDFGWTR_EL2_PMBPTR_EL1 | + HDFGWTR_EL2_PMBLIMITR_EL1, + FEAT_SPE), + NEEDS_FEAT(HDFGWTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE), + NEEDS_FEAT(HDFGWTR_EL2_nBRBDATA | + HDFGWTR_EL2_nBRBCTL, + FEAT_BRBE), + NEEDS_FEAT(HDFGWTR_EL2_TRCVICTLR | + HDFGWTR_EL2_TRCSSCSRn | + HDFGWTR_EL2_TRCSEQSTR | + HDFGWTR_EL2_TRCPRGCTLR | + HDFGWTR_EL2_TRCOSLAR | + HDFGWTR_EL2_TRCIMSPECn | + HDFGWTR_EL2_TRCCNTVRn | + HDFGWTR_EL2_TRCCLAIM | + HDFGWTR_EL2_TRCAUXCTLR | + HDFGWTR_EL2_TRC, + 
FEAT_TRC_SR), + NEEDS_FEAT(HDFGWTR_EL2_PMUSERENR_EL0 | + HDFGWTR_EL2_PMCR_EL0 | + HDFGWTR_EL2_PMSWINC_EL0 | + HDFGWTR_EL2_PMSELR_EL0 | + HDFGWTR_EL2_PMOVS | + HDFGWTR_EL2_PMINTEN | + HDFGWTR_EL2_PMCNTEN | + HDFGWTR_EL2_PMCCNTR_EL0 | + HDFGWTR_EL2_PMCCFILTR_EL0 | + HDFGWTR_EL2_PMEVTYPERn_EL0 | + HDFGWTR_EL2_PMEVCNTRn_EL0, + FEAT_PMUv3), + NEEDS_FEAT(HDFGWTR_EL2_TRBTRG_EL1 | + HDFGWTR_EL2_TRBSR_EL1 | + HDFGWTR_EL2_TRBPTR_EL1 | + HDFGWTR_EL2_TRBMAR_EL1 | + HDFGWTR_EL2_TRBLIMITR_EL1 | + HDFGWTR_EL2_TRBBASER_EL1, + FEAT_TRBE), + NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSDLR_EL1, + NEVER_FGU, FEAT_DoubleLock), + NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSECCR_EL1 | + HDFGWTR_EL2_OSLAR_EL1 | + HDFGWTR_EL2_DBGPRCR_EL1 | + HDFGWTR_EL2_DBGCLAIM | + HDFGWTR_EL2_MDSCR_EL1 | + HDFGWTR_EL2_DBGWVRn_EL1 | + HDFGWTR_EL2_DBGWCRn_EL1 | + HDFGWTR_EL2_DBGBVRn_EL1 | + HDFGWTR_EL2_DBGBCRn_EL1, + NEVER_FGU, FEAT_AA64EL1), + NEEDS_FEAT(HDFGWTR_EL2_TRFCR_EL1, FEAT_TRF), +}; + + +static const struct reg_bits_to_feat_map hfgitr_feat_map[] = { + NEEDS_FEAT(HFGITR_EL2_PSBCSYNC, FEAT_SPEv1p5), + NEEDS_FEAT(HFGITR_EL2_ATS1E1A, FEAT_ATS1A), + NEEDS_FEAT(HFGITR_EL2_COSPRCTX, FEAT_SPECRES2), + NEEDS_FEAT(HFGITR_EL2_nGCSEPP | + HFGITR_EL2_nGCSSTR_EL1 | + HFGITR_EL2_nGCSPUSHM_EL1, + FEAT_GCS), + NEEDS_FEAT(HFGITR_EL2_nBRBIALL | + HFGITR_EL2_nBRBINJ, + FEAT_BRBE), + NEEDS_FEAT(HFGITR_EL2_CPPRCTX | + HFGITR_EL2_DVPRCTX | + HFGITR_EL2_CFPRCTX, + FEAT_SPECRES), + NEEDS_FEAT(HFGITR_EL2_TLBIRVAALE1 | + HFGITR_EL2_TLBIRVALE1 | + HFGITR_EL2_TLBIRVAAE1 | + HFGITR_EL2_TLBIRVAE1 | + HFGITR_EL2_TLBIRVAALE1IS | + HFGITR_EL2_TLBIRVALE1IS | + HFGITR_EL2_TLBIRVAAE1IS | + HFGITR_EL2_TLBIRVAE1IS | + HFGITR_EL2_TLBIRVAALE1OS | + HFGITR_EL2_TLBIRVALE1OS | + HFGITR_EL2_TLBIRVAAE1OS | + HFGITR_EL2_TLBIRVAE1OS, + FEAT_TLBIRANGE), + NEEDS_FEAT(HFGITR_EL2_TLBIVAALE1OS | + HFGITR_EL2_TLBIVALE1OS | + HFGITR_EL2_TLBIVAAE1OS | + HFGITR_EL2_TLBIASIDE1OS | + HFGITR_EL2_TLBIVAE1OS | + HFGITR_EL2_TLBIVMALLE1OS, + FEAT_TLBIOS), + NEEDS_FEAT(HFGITR_EL2_ATS1E1WP | + HFGITR_EL2_ATS1E1RP, + FEAT_PAN2), + NEEDS_FEAT(HFGITR_EL2_DCCVADP, FEAT_DPB2), + NEEDS_FEAT_FLAG(HFGITR_EL2_DCCVAC | + HFGITR_EL2_SVC_EL1 | + HFGITR_EL2_SVC_EL0 | + HFGITR_EL2_ERET | + HFGITR_EL2_TLBIVAALE1 | + HFGITR_EL2_TLBIVALE1 | + HFGITR_EL2_TLBIVAAE1 | + HFGITR_EL2_TLBIASIDE1 | + HFGITR_EL2_TLBIVAE1 | + HFGITR_EL2_TLBIVMALLE1 | + HFGITR_EL2_TLBIVAALE1IS | + HFGITR_EL2_TLBIVALE1IS | + HFGITR_EL2_TLBIVAAE1IS | + HFGITR_EL2_TLBIASIDE1IS | + HFGITR_EL2_TLBIVAE1IS | + HFGITR_EL2_TLBIVMALLE1IS| + HFGITR_EL2_ATS1E0W | + HFGITR_EL2_ATS1E0R | + HFGITR_EL2_ATS1E1W | + HFGITR_EL2_ATS1E1R | + HFGITR_EL2_DCZVA | + HFGITR_EL2_DCCIVAC | + HFGITR_EL2_DCCVAP | + HFGITR_EL2_DCCVAU | + HFGITR_EL2_DCCISW | + HFGITR_EL2_DCCSW | + HFGITR_EL2_DCISW | + HFGITR_EL2_DCIVAC | + HFGITR_EL2_ICIVAU | + HFGITR_EL2_ICIALLU | + HFGITR_EL2_ICIALLUIS, + NEVER_FGU, FEAT_AA64EL1), +}; + +static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = { + NEEDS_FEAT(HAFGRTR_EL2_AMEVTYPER115_EL0 | + HAFGRTR_EL2_AMEVTYPER114_EL0 | + HAFGRTR_EL2_AMEVTYPER113_EL0 | + HAFGRTR_EL2_AMEVTYPER112_EL0 | + HAFGRTR_EL2_AMEVTYPER111_EL0 | + HAFGRTR_EL2_AMEVTYPER110_EL0 | + HAFGRTR_EL2_AMEVTYPER19_EL0 | + HAFGRTR_EL2_AMEVTYPER18_EL0 | + HAFGRTR_EL2_AMEVTYPER17_EL0 | + HAFGRTR_EL2_AMEVTYPER16_EL0 | + HAFGRTR_EL2_AMEVTYPER15_EL0 | + HAFGRTR_EL2_AMEVTYPER14_EL0 | + HAFGRTR_EL2_AMEVTYPER13_EL0 | + HAFGRTR_EL2_AMEVTYPER12_EL0 | + HAFGRTR_EL2_AMEVTYPER11_EL0 | + HAFGRTR_EL2_AMEVTYPER10_EL0 | + HAFGRTR_EL2_AMEVCNTR115_EL0 | + HAFGRTR_EL2_AMEVCNTR114_EL0 | + 
HAFGRTR_EL2_AMEVCNTR113_EL0 | + HAFGRTR_EL2_AMEVCNTR112_EL0 | + HAFGRTR_EL2_AMEVCNTR111_EL0 | + HAFGRTR_EL2_AMEVCNTR110_EL0 | + HAFGRTR_EL2_AMEVCNTR19_EL0 | + HAFGRTR_EL2_AMEVCNTR18_EL0 | + HAFGRTR_EL2_AMEVCNTR17_EL0 | + HAFGRTR_EL2_AMEVCNTR16_EL0 | + HAFGRTR_EL2_AMEVCNTR15_EL0 | + HAFGRTR_EL2_AMEVCNTR14_EL0 | + HAFGRTR_EL2_AMEVCNTR13_EL0 | + HAFGRTR_EL2_AMEVCNTR12_EL0 | + HAFGRTR_EL2_AMEVCNTR11_EL0 | + HAFGRTR_EL2_AMEVCNTR10_EL0 | + HAFGRTR_EL2_AMCNTEN1 | + HAFGRTR_EL2_AMCNTEN0 | + HAFGRTR_EL2_AMEVCNTR03_EL0 | + HAFGRTR_EL2_AMEVCNTR02_EL0 | + HAFGRTR_EL2_AMEVCNTR01_EL0 | + HAFGRTR_EL2_AMEVCNTR00_EL0, + FEAT_AMUv1), +}; + +static const struct reg_bits_to_feat_map hfgitr2_feat_map[] = { + NEEDS_FEAT(HFGITR2_EL2_nDCCIVAPS, FEAT_PoPS), + NEEDS_FEAT(HFGITR2_EL2_TSBCSYNC, FEAT_TRBEv1p1) +}; + +static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = { + NEEDS_FEAT(HFGRTR2_EL2_nPFAR_EL1, FEAT_PFAR), + NEEDS_FEAT(HFGRTR2_EL2_nERXGSR_EL1, FEAT_RASv2), + NEEDS_FEAT(HFGRTR2_EL2_nACTLRALIAS_EL1 | + HFGRTR2_EL2_nACTLRMASK_EL1 | + HFGRTR2_EL2_nCPACRALIAS_EL1 | + HFGRTR2_EL2_nCPACRMASK_EL1 | + HFGRTR2_EL2_nSCTLR2MASK_EL1 | + HFGRTR2_EL2_nSCTLRALIAS2_EL1 | + HFGRTR2_EL2_nSCTLRALIAS_EL1 | + HFGRTR2_EL2_nSCTLRMASK_EL1 | + HFGRTR2_EL2_nTCR2ALIAS_EL1 | + HFGRTR2_EL2_nTCR2MASK_EL1 | + HFGRTR2_EL2_nTCRALIAS_EL1 | + HFGRTR2_EL2_nTCRMASK_EL1, + FEAT_SRMASK), + NEEDS_FEAT(HFGRTR2_EL2_nRCWSMASK_EL1, FEAT_THE), +}; + +static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = { + NEEDS_FEAT(HFGWTR2_EL2_nPFAR_EL1, FEAT_PFAR), + NEEDS_FEAT(HFGWTR2_EL2_nACTLRALIAS_EL1 | + HFGWTR2_EL2_nACTLRMASK_EL1 | + HFGWTR2_EL2_nCPACRALIAS_EL1 | + HFGWTR2_EL2_nCPACRMASK_EL1 | + HFGWTR2_EL2_nSCTLR2MASK_EL1 | + HFGWTR2_EL2_nSCTLRALIAS2_EL1 | + HFGWTR2_EL2_nSCTLRALIAS_EL1 | + HFGWTR2_EL2_nSCTLRMASK_EL1 | + HFGWTR2_EL2_nTCR2ALIAS_EL1 | + HFGWTR2_EL2_nTCR2MASK_EL1 | + HFGWTR2_EL2_nTCRALIAS_EL1 | + HFGWTR2_EL2_nTCRMASK_EL1, + FEAT_SRMASK), + NEEDS_FEAT(HFGWTR2_EL2_nRCWSMASK_EL1, FEAT_THE), +}; + +static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = { + NEEDS_FEAT(HDFGRTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9), + NEEDS_FEAT(HDFGRTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss), + NEEDS_FEAT(HDFGRTR2_EL2_nTRCITECR_EL1, FEAT_ITE), + NEEDS_FEAT(HDFGRTR2_EL2_nPMICFILTR_EL0 | + HDFGRTR2_EL2_nPMICNTR_EL0, + FEAT_PMUv3_ICNTR), + NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, FEAT_PMUv3p9), + NEEDS_FEAT(HDFGRTR2_EL2_nPMSSCR_EL1 | + HDFGRTR2_EL2_nPMSSDATA, + FEAT_PMUv3_SS), + NEEDS_FEAT(HDFGRTR2_EL2_nPMIAR_EL1, FEAT_SEBEP), + NEEDS_FEAT(HDFGRTR2_EL2_nPMSDSFR_EL1, feat_spe_fds), + NEEDS_FEAT(HDFGRTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM), + NEEDS_FEAT(HDFGRTR2_EL2_nSPMACCESSR_EL1 | + HDFGRTR2_EL2_nSPMCNTEN | + HDFGRTR2_EL2_nSPMCR_EL0 | + HDFGRTR2_EL2_nSPMDEVAFF_EL1 | + HDFGRTR2_EL2_nSPMEVCNTRn_EL0 | + HDFGRTR2_EL2_nSPMEVTYPERn_EL0| + HDFGRTR2_EL2_nSPMID | + HDFGRTR2_EL2_nSPMINTEN | + HDFGRTR2_EL2_nSPMOVS | + HDFGRTR2_EL2_nSPMSCR_EL1 | + HDFGRTR2_EL2_nSPMSELR_EL0, + FEAT_SPMU), + NEEDS_FEAT(HDFGRTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2), + NEEDS_FEAT(HDFGRTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam), +}; + +static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = { + NEEDS_FEAT(HDFGWTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9), + NEEDS_FEAT(HDFGWTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss), + NEEDS_FEAT(HDFGWTR2_EL2_nTRCITECR_EL1, FEAT_ITE), + NEEDS_FEAT(HDFGWTR2_EL2_nPMICFILTR_EL0 | + HDFGWTR2_EL2_nPMICNTR_EL0, + FEAT_PMUv3_ICNTR), + NEEDS_FEAT(HDFGWTR2_EL2_nPMUACR_EL1 | + HDFGWTR2_EL2_nPMZR_EL0, + FEAT_PMUv3p9), + NEEDS_FEAT(HDFGWTR2_EL2_nPMSSCR_EL1, FEAT_PMUv3_SS), + 
NEEDS_FEAT(HDFGWTR2_EL2_nPMIAR_EL1, FEAT_SEBEP), + NEEDS_FEAT(HDFGWTR2_EL2_nPMSDSFR_EL1, feat_spe_fds), + NEEDS_FEAT(HDFGWTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM), + NEEDS_FEAT(HDFGWTR2_EL2_nSPMACCESSR_EL1 | + HDFGWTR2_EL2_nSPMCNTEN | + HDFGWTR2_EL2_nSPMCR_EL0 | + HDFGWTR2_EL2_nSPMEVCNTRn_EL0 | + HDFGWTR2_EL2_nSPMEVTYPERn_EL0| + HDFGWTR2_EL2_nSPMINTEN | + HDFGWTR2_EL2_nSPMOVS | + HDFGWTR2_EL2_nSPMSCR_EL1 | + HDFGWTR2_EL2_nSPMSELR_EL0, + FEAT_SPMU), + NEEDS_FEAT(HDFGWTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2), + NEEDS_FEAT(HDFGWTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam), +}; + +static const struct reg_bits_to_feat_map hcrx_feat_map[] = { + NEEDS_FEAT(HCRX_EL2_PACMEn, feat_pauth_lr), + NEEDS_FEAT(HCRX_EL2_EnFPM, FEAT_FPMR), + NEEDS_FEAT(HCRX_EL2_GCSEn, FEAT_GCS), + NEEDS_FEAT(HCRX_EL2_EnIDCP128, FEAT_SYSREG128), + NEEDS_FEAT(HCRX_EL2_EnSDERR, feat_aderr), + NEEDS_FEAT(HCRX_EL2_TMEA, FEAT_DoubleFault2), + NEEDS_FEAT(HCRX_EL2_EnSNERR, feat_anerr), + NEEDS_FEAT(HCRX_EL2_D128En, FEAT_D128), + NEEDS_FEAT(HCRX_EL2_PTTWI, FEAT_THE), + NEEDS_FEAT(HCRX_EL2_SCTLR2En, FEAT_SCTLR2), + NEEDS_FEAT(HCRX_EL2_TCR2En, FEAT_TCR2), + NEEDS_FEAT(HCRX_EL2_MSCEn | + HCRX_EL2_MCE2, + FEAT_MOPS), + NEEDS_FEAT(HCRX_EL2_CMOW, FEAT_CMOW), + NEEDS_FEAT(HCRX_EL2_VFNMI | + HCRX_EL2_VINMI | + HCRX_EL2_TALLINT, + FEAT_NMI), + NEEDS_FEAT(HCRX_EL2_SMPME, feat_sme_smps), + NEEDS_FEAT(HCRX_EL2_FGTnXS | + HCRX_EL2_FnXS, + FEAT_XS), + NEEDS_FEAT(HCRX_EL2_EnASR, FEAT_LS64_V), + NEEDS_FEAT(HCRX_EL2_EnALS, FEAT_LS64), + NEEDS_FEAT(HCRX_EL2_EnAS0, FEAT_LS64_ACCDATA), +}; + +static const struct reg_bits_to_feat_map hcr_feat_map[] = { + NEEDS_FEAT(HCR_EL2_TID0, FEAT_AA32EL0), + NEEDS_FEAT_FIXED(HCR_EL2_RW, compute_hcr_rw), + NEEDS_FEAT(HCR_EL2_HCD, not_feat_aa64el3), + NEEDS_FEAT(HCR_EL2_AMO | + HCR_EL2_BSU | + HCR_EL2_CD | + HCR_EL2_DC | + HCR_EL2_FB | + HCR_EL2_FMO | + HCR_EL2_ID | + HCR_EL2_IMO | + HCR_EL2_MIOCNCE | + HCR_EL2_PTW | + HCR_EL2_SWIO | + HCR_EL2_TACR | + HCR_EL2_TDZ | + HCR_EL2_TGE | + HCR_EL2_TID1 | + HCR_EL2_TID2 | + HCR_EL2_TID3 | + HCR_EL2_TIDCP | + HCR_EL2_TPCP | + HCR_EL2_TPU | + HCR_EL2_TRVM | + HCR_EL2_TSC | + HCR_EL2_TSW | + HCR_EL2_TTLB | + HCR_EL2_TVM | + HCR_EL2_TWE | + HCR_EL2_TWI | + HCR_EL2_VF | + HCR_EL2_VI | + HCR_EL2_VM | + HCR_EL2_VSE, + FEAT_AA64EL1), + NEEDS_FEAT(HCR_EL2_AMVOFFEN, FEAT_AMUv1p1), + NEEDS_FEAT(HCR_EL2_EnSCXT, feat_csv2_2_csv2_1p2), + NEEDS_FEAT(HCR_EL2_TICAB | + HCR_EL2_TID4 | + HCR_EL2_TOCU, + FEAT_EVT), + NEEDS_FEAT(HCR_EL2_TTLBIS | + HCR_EL2_TTLBOS, + FEAT_EVT_TTLBxS), + NEEDS_FEAT(HCR_EL2_TLOR, FEAT_LOR), + NEEDS_FEAT(HCR_EL2_ATA | + HCR_EL2_DCT | + HCR_EL2_TID5, + FEAT_MTE2), + NEEDS_FEAT(HCR_EL2_AT | /* Ignore the original FEAT_NV */ + HCR_EL2_NV2 | + HCR_EL2_NV, + feat_nv2), + NEEDS_FEAT(HCR_EL2_NV1, feat_nv2_e2h0_ni), /* Missing from JSON */ + NEEDS_FEAT(HCR_EL2_API | + HCR_EL2_APK, + feat_pauth), + NEEDS_FEAT(HCR_EL2_TEA | + HCR_EL2_TERR, + FEAT_RAS), + NEEDS_FEAT(HCR_EL2_FIEN, feat_rasv1p1), + NEEDS_FEAT(HCR_EL2_GPF, FEAT_RME), + NEEDS_FEAT(HCR_EL2_FWB, FEAT_S2FWB), + NEEDS_FEAT(HCR_EL2_TME, FEAT_TME), + NEEDS_FEAT(HCR_EL2_TWEDEL | + HCR_EL2_TWEDEn, + FEAT_TWED), + NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h), +}; + +static void __init check_feat_map(const struct reg_bits_to_feat_map *map, + int map_size, u64 res0, const char *str) +{ + u64 mask = 0; + + for (int i = 0; i < map_size; i++) + mask |= map[i].bits; + + if (mask != ~res0) + kvm_err("Undefined %s behaviour, bits %016llx\n", + str, mask ^ ~res0); +} + +void __init check_feature_map(void) +{ + check_feat_map(hfgrtr_feat_map, 
ARRAY_SIZE(hfgrtr_feat_map), + hfgrtr_masks.res0, hfgrtr_masks.str); + check_feat_map(hfgwtr_feat_map, ARRAY_SIZE(hfgwtr_feat_map), + hfgwtr_masks.res0, hfgwtr_masks.str); + check_feat_map(hfgitr_feat_map, ARRAY_SIZE(hfgitr_feat_map), + hfgitr_masks.res0, hfgitr_masks.str); + check_feat_map(hdfgrtr_feat_map, ARRAY_SIZE(hdfgrtr_feat_map), + hdfgrtr_masks.res0, hdfgrtr_masks.str); + check_feat_map(hdfgwtr_feat_map, ARRAY_SIZE(hdfgwtr_feat_map), + hdfgwtr_masks.res0, hdfgwtr_masks.str); + check_feat_map(hafgrtr_feat_map, ARRAY_SIZE(hafgrtr_feat_map), + hafgrtr_masks.res0, hafgrtr_masks.str); + check_feat_map(hcrx_feat_map, ARRAY_SIZE(hcrx_feat_map), + __HCRX_EL2_RES0, "HCRX_EL2"); + check_feat_map(hcr_feat_map, ARRAY_SIZE(hcr_feat_map), + HCR_EL2_RES0, "HCR_EL2"); +} + +static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map) +{ + u64 regval = kvm->arch.id_regs[map->regidx]; + u64 regfld = (regval >> map->shift) & GENMASK(map->width - 1, 0); + + if (map->sign) { + s64 sfld = sign_extend64(regfld, map->width - 1); + s64 slim = sign_extend64(map->lo_lim, map->width - 1); + return sfld >= slim; + } else { + return regfld >= map->lo_lim; + } +} + +static u64 __compute_fixed_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + u64 *fixed_bits, + unsigned long require, + unsigned long exclude) +{ + u64 val = 0; + + for (int i = 0; i < map_size; i++) { + bool match; + + if ((map[i].flags & require) != require) + continue; + + if (map[i].flags & exclude) + continue; + + if (map[i].flags & CALL_FUNC) + match = (map[i].flags & FIXED_VALUE) ? + map[i].fval(kvm, fixed_bits) : + map[i].match(kvm); + else + match = idreg_feat_match(kvm, &map[i]); + + if (!match || (map[i].flags & FIXED_VALUE)) + val |= map[i].bits; + } + + return val; +} + +static u64 compute_res0_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + unsigned long require, + unsigned long exclude) +{ + return __compute_fixed_bits(kvm, map, map_size, NULL, + require, exclude | FIXED_VALUE); +} + +static u64 compute_fixed_bits(struct kvm *kvm, + const struct reg_bits_to_feat_map *map, + int map_size, + u64 *fixed_bits, + unsigned long require, + unsigned long exclude) +{ + return __compute_fixed_bits(kvm, map, map_size, fixed_bits, + require | FIXED_VALUE, exclude); +} + +void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) +{ + u64 val = 0; + + switch (fgt) { + case HFGRTR_GROUP: + val |= compute_res0_bits(kvm, hfgrtr_feat_map, + ARRAY_SIZE(hfgrtr_feat_map), + 0, NEVER_FGU); + val |= compute_res0_bits(kvm, hfgwtr_feat_map, + ARRAY_SIZE(hfgwtr_feat_map), + 0, NEVER_FGU); + break; + case HFGITR_GROUP: + val |= compute_res0_bits(kvm, hfgitr_feat_map, + ARRAY_SIZE(hfgitr_feat_map), + 0, NEVER_FGU); + break; + case HDFGRTR_GROUP: + val |= compute_res0_bits(kvm, hdfgrtr_feat_map, + ARRAY_SIZE(hdfgrtr_feat_map), + 0, NEVER_FGU); + val |= compute_res0_bits(kvm, hdfgwtr_feat_map, + ARRAY_SIZE(hdfgwtr_feat_map), + 0, NEVER_FGU); + break; + case HAFGRTR_GROUP: + val |= compute_res0_bits(kvm, hafgrtr_feat_map, + ARRAY_SIZE(hafgrtr_feat_map), + 0, NEVER_FGU); + break; + case HFGRTR2_GROUP: + val |= compute_res0_bits(kvm, hfgrtr2_feat_map, + ARRAY_SIZE(hfgrtr2_feat_map), + 0, NEVER_FGU); + val |= compute_res0_bits(kvm, hfgwtr2_feat_map, + ARRAY_SIZE(hfgwtr2_feat_map), + 0, NEVER_FGU); + break; + case HFGITR2_GROUP: + val |= compute_res0_bits(kvm, hfgitr2_feat_map, + ARRAY_SIZE(hfgitr2_feat_map), + 0, NEVER_FGU); + break; + case HDFGRTR2_GROUP: + val |= 
compute_res0_bits(kvm, hdfgrtr2_feat_map, + ARRAY_SIZE(hdfgrtr2_feat_map), + 0, NEVER_FGU); + val |= compute_res0_bits(kvm, hdfgwtr2_feat_map, + ARRAY_SIZE(hdfgwtr2_feat_map), + 0, NEVER_FGU); + break; + default: + BUG(); + } + + kvm->arch.fgu[fgt] = val; +} + +void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1) +{ + u64 fixed = 0, mask; + + switch (reg) { + case HFGRTR_EL2: + *res0 = compute_res0_bits(kvm, hfgrtr_feat_map, + ARRAY_SIZE(hfgrtr_feat_map), 0, 0); + *res0 |= hfgrtr_masks.res0; + *res1 = HFGRTR_EL2_RES1; + break; + case HFGWTR_EL2: + *res0 = compute_res0_bits(kvm, hfgwtr_feat_map, + ARRAY_SIZE(hfgwtr_feat_map), 0, 0); + *res0 |= hfgwtr_masks.res0; + *res1 = HFGWTR_EL2_RES1; + break; + case HFGITR_EL2: + *res0 = compute_res0_bits(kvm, hfgitr_feat_map, + ARRAY_SIZE(hfgitr_feat_map), 0, 0); + *res0 |= hfgitr_masks.res0; + *res1 = HFGITR_EL2_RES1; + break; + case HDFGRTR_EL2: + *res0 = compute_res0_bits(kvm, hdfgrtr_feat_map, + ARRAY_SIZE(hdfgrtr_feat_map), 0, 0); + *res0 |= hdfgrtr_masks.res0; + *res1 = HDFGRTR_EL2_RES1; + break; + case HDFGWTR_EL2: + *res0 = compute_res0_bits(kvm, hdfgwtr_feat_map, + ARRAY_SIZE(hdfgwtr_feat_map), 0, 0); + *res0 |= hdfgwtr_masks.res0; + *res1 = HDFGWTR_EL2_RES1; + break; + case HAFGRTR_EL2: + *res0 = compute_res0_bits(kvm, hafgrtr_feat_map, + ARRAY_SIZE(hafgrtr_feat_map), 0, 0); + *res0 |= hafgrtr_masks.res0; + *res1 = HAFGRTR_EL2_RES1; + break; + case HFGRTR2_EL2: + *res0 = compute_res0_bits(kvm, hfgrtr2_feat_map, + ARRAY_SIZE(hfgrtr2_feat_map), 0, 0); + *res0 |= hfgrtr2_masks.res0; + *res1 = HFGRTR2_EL2_RES1; + break; + case HFGWTR2_EL2: + *res0 = compute_res0_bits(kvm, hfgwtr2_feat_map, + ARRAY_SIZE(hfgwtr2_feat_map), 0, 0); + *res0 |= hfgwtr2_masks.res0; + *res1 = HFGWTR2_EL2_RES1; + break; + case HFGITR2_EL2: + *res0 = compute_res0_bits(kvm, hfgitr2_feat_map, + ARRAY_SIZE(hfgitr2_feat_map), 0, 0); + *res0 |= hfgitr2_masks.res0; + *res1 = HFGITR2_EL2_RES1; + break; + case HDFGRTR2_EL2: + *res0 = compute_res0_bits(kvm, hdfgrtr2_feat_map, + ARRAY_SIZE(hdfgrtr2_feat_map), 0, 0); + *res0 |= hdfgrtr2_masks.res0; + *res1 = HDFGRTR2_EL2_RES1; + break; + case HDFGWTR2_EL2: + *res0 = compute_res0_bits(kvm, hdfgwtr2_feat_map, + ARRAY_SIZE(hdfgwtr2_feat_map), 0, 0); + *res0 |= hdfgwtr2_masks.res0; + *res1 = HDFGWTR2_EL2_RES1; + break; + case HCRX_EL2: + *res0 = compute_res0_bits(kvm, hcrx_feat_map, + ARRAY_SIZE(hcrx_feat_map), 0, 0); + *res0 |= __HCRX_EL2_RES0; + *res1 = __HCRX_EL2_RES1; + break; + case HCR_EL2: + mask = compute_fixed_bits(kvm, hcr_feat_map, + ARRAY_SIZE(hcr_feat_map), &fixed, + 0, 0); + *res0 = compute_res0_bits(kvm, hcr_feat_map, + ARRAY_SIZE(hcr_feat_map), 0, 0); + *res0 |= HCR_EL2_RES0 | (mask & ~fixed); + *res1 = HCR_EL2_RES1 | (mask & fixed); + break; + default: + WARN_ON_ONCE(1); + *res0 = *res1 = 0; + break; + } +} diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 0fcfcc0478f9..3a384e9660b8 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -622,6 +622,11 @@ struct encoding_to_trap_config { const unsigned int line; }; +/* + * WARNING: using ranges is a treacherous endeavour, as sysregs that + * are part of an architectural range are not necessarily contiguous + * in the [Op0,Op1,CRn,CRm,Ops] space. Tread carefully. 
+ */ #define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \ { \ .encoding = sr_start, \ @@ -1279,98 +1284,128 @@ enum fg_filter_id { __NR_FG_FILTER_IDS__ }; -#define SR_FGF(sr, g, b, p, f) \ - { \ - .encoding = sr, \ - .end = sr, \ - .tc = { \ +#define __FGT(g, b, p, f) \ + { \ .fgt = g ## _GROUP, \ .bit = g ## _EL2_ ## b ## _SHIFT, \ .pol = p, \ .fgf = f, \ - }, \ + } + +#define FGT(g, b, p) __FGT(g, b, p, __NO_FGF__) + +/* + * See the warning next to SR_RANGE_TRAP(), and apply the same + * level of caution. + */ +#define SR_FGF_RANGE(sr, e, g, b, p, f) \ + { \ + .encoding = sr, \ + .end = e, \ + .tc = __FGT(g, b, p, f), \ .line = __LINE__, \ } -#define SR_FGT(sr, g, b, p) SR_FGF(sr, g, b, p, __NO_FGF__) +#define SR_FGF(sr, g, b, p, f) SR_FGF_RANGE(sr, sr, g, b, p, f) +#define SR_FGT(sr, g, b, p) SR_FGF_RANGE(sr, sr, g, b, p, __NO_FGF__) +#define SR_FGT_RANGE(sr, end, g, b, p) \ + SR_FGF_RANGE(sr, end, g, b, p, __NO_FGF__) static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { /* HFGRTR_EL2, HFGWTR_EL2 */ - SR_FGT(SYS_AMAIR2_EL1, HFGxTR, nAMAIR2_EL1, 0), - SR_FGT(SYS_MAIR2_EL1, HFGxTR, nMAIR2_EL1, 0), - SR_FGT(SYS_S2POR_EL1, HFGxTR, nS2POR_EL1, 0), - SR_FGT(SYS_POR_EL1, HFGxTR, nPOR_EL1, 0), - SR_FGT(SYS_POR_EL0, HFGxTR, nPOR_EL0, 0), - SR_FGT(SYS_PIR_EL1, HFGxTR, nPIR_EL1, 0), - SR_FGT(SYS_PIRE0_EL1, HFGxTR, nPIRE0_EL1, 0), - SR_FGT(SYS_RCWMASK_EL1, HFGxTR, nRCWMASK_EL1, 0), - SR_FGT(SYS_TPIDR2_EL0, HFGxTR, nTPIDR2_EL0, 0), - SR_FGT(SYS_SMPRI_EL1, HFGxTR, nSMPRI_EL1, 0), - SR_FGT(SYS_GCSCR_EL1, HFGxTR, nGCS_EL1, 0), - SR_FGT(SYS_GCSPR_EL1, HFGxTR, nGCS_EL1, 0), - SR_FGT(SYS_GCSCRE0_EL1, HFGxTR, nGCS_EL0, 0), - SR_FGT(SYS_GCSPR_EL0, HFGxTR, nGCS_EL0, 0), - SR_FGT(SYS_ACCDATA_EL1, HFGxTR, nACCDATA_EL1, 0), - SR_FGT(SYS_ERXADDR_EL1, HFGxTR, ERXADDR_EL1, 1), - SR_FGT(SYS_ERXPFGCDN_EL1, HFGxTR, ERXPFGCDN_EL1, 1), - SR_FGT(SYS_ERXPFGCTL_EL1, HFGxTR, ERXPFGCTL_EL1, 1), - SR_FGT(SYS_ERXPFGF_EL1, HFGxTR, ERXPFGF_EL1, 1), - SR_FGT(SYS_ERXMISC0_EL1, HFGxTR, ERXMISCn_EL1, 1), - SR_FGT(SYS_ERXMISC1_EL1, HFGxTR, ERXMISCn_EL1, 1), - SR_FGT(SYS_ERXMISC2_EL1, HFGxTR, ERXMISCn_EL1, 1), - SR_FGT(SYS_ERXMISC3_EL1, HFGxTR, ERXMISCn_EL1, 1), - SR_FGT(SYS_ERXSTATUS_EL1, HFGxTR, ERXSTATUS_EL1, 1), - SR_FGT(SYS_ERXCTLR_EL1, HFGxTR, ERXCTLR_EL1, 1), - SR_FGT(SYS_ERXFR_EL1, HFGxTR, ERXFR_EL1, 1), - SR_FGT(SYS_ERRSELR_EL1, HFGxTR, ERRSELR_EL1, 1), - SR_FGT(SYS_ERRIDR_EL1, HFGxTR, ERRIDR_EL1, 1), - SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGxTR, ICC_IGRPENn_EL1, 1), - SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGxTR, ICC_IGRPENn_EL1, 1), - SR_FGT(SYS_VBAR_EL1, HFGxTR, VBAR_EL1, 1), - SR_FGT(SYS_TTBR1_EL1, HFGxTR, TTBR1_EL1, 1), - SR_FGT(SYS_TTBR0_EL1, HFGxTR, TTBR0_EL1, 1), - SR_FGT(SYS_TPIDR_EL0, HFGxTR, TPIDR_EL0, 1), - SR_FGT(SYS_TPIDRRO_EL0, HFGxTR, TPIDRRO_EL0, 1), - SR_FGT(SYS_TPIDR_EL1, HFGxTR, TPIDR_EL1, 1), - SR_FGT(SYS_TCR_EL1, HFGxTR, TCR_EL1, 1), - SR_FGT(SYS_TCR2_EL1, HFGxTR, TCR_EL1, 1), - SR_FGT(SYS_SCXTNUM_EL0, HFGxTR, SCXTNUM_EL0, 1), - SR_FGT(SYS_SCXTNUM_EL1, HFGxTR, SCXTNUM_EL1, 1), - SR_FGT(SYS_SCTLR_EL1, HFGxTR, SCTLR_EL1, 1), - SR_FGT(SYS_REVIDR_EL1, HFGxTR, REVIDR_EL1, 1), - SR_FGT(SYS_PAR_EL1, HFGxTR, PAR_EL1, 1), - SR_FGT(SYS_MPIDR_EL1, HFGxTR, MPIDR_EL1, 1), - SR_FGT(SYS_MIDR_EL1, HFGxTR, MIDR_EL1, 1), - SR_FGT(SYS_MAIR_EL1, HFGxTR, MAIR_EL1, 1), - SR_FGT(SYS_LORSA_EL1, HFGxTR, LORSA_EL1, 1), - SR_FGT(SYS_LORN_EL1, HFGxTR, LORN_EL1, 1), - SR_FGT(SYS_LORID_EL1, HFGxTR, LORID_EL1, 1), - SR_FGT(SYS_LOREA_EL1, HFGxTR, LOREA_EL1, 1), - SR_FGT(SYS_LORC_EL1, HFGxTR, LORC_EL1, 1), - SR_FGT(SYS_ISR_EL1, 
HFGxTR, ISR_EL1, 1), - SR_FGT(SYS_FAR_EL1, HFGxTR, FAR_EL1, 1), - SR_FGT(SYS_ESR_EL1, HFGxTR, ESR_EL1, 1), - SR_FGT(SYS_DCZID_EL0, HFGxTR, DCZID_EL0, 1), - SR_FGT(SYS_CTR_EL0, HFGxTR, CTR_EL0, 1), - SR_FGT(SYS_CSSELR_EL1, HFGxTR, CSSELR_EL1, 1), - SR_FGT(SYS_CPACR_EL1, HFGxTR, CPACR_EL1, 1), - SR_FGT(SYS_CONTEXTIDR_EL1, HFGxTR, CONTEXTIDR_EL1, 1), - SR_FGT(SYS_CLIDR_EL1, HFGxTR, CLIDR_EL1, 1), - SR_FGT(SYS_CCSIDR_EL1, HFGxTR, CCSIDR_EL1, 1), - SR_FGT(SYS_APIBKEYLO_EL1, HFGxTR, APIBKey, 1), - SR_FGT(SYS_APIBKEYHI_EL1, HFGxTR, APIBKey, 1), - SR_FGT(SYS_APIAKEYLO_EL1, HFGxTR, APIAKey, 1), - SR_FGT(SYS_APIAKEYHI_EL1, HFGxTR, APIAKey, 1), - SR_FGT(SYS_APGAKEYLO_EL1, HFGxTR, APGAKey, 1), - SR_FGT(SYS_APGAKEYHI_EL1, HFGxTR, APGAKey, 1), - SR_FGT(SYS_APDBKEYLO_EL1, HFGxTR, APDBKey, 1), - SR_FGT(SYS_APDBKEYHI_EL1, HFGxTR, APDBKey, 1), - SR_FGT(SYS_APDAKEYLO_EL1, HFGxTR, APDAKey, 1), - SR_FGT(SYS_APDAKEYHI_EL1, HFGxTR, APDAKey, 1), - SR_FGT(SYS_AMAIR_EL1, HFGxTR, AMAIR_EL1, 1), - SR_FGT(SYS_AIDR_EL1, HFGxTR, AIDR_EL1, 1), - SR_FGT(SYS_AFSR1_EL1, HFGxTR, AFSR1_EL1, 1), - SR_FGT(SYS_AFSR0_EL1, HFGxTR, AFSR0_EL1, 1), + SR_FGT(SYS_AMAIR2_EL1, HFGRTR, nAMAIR2_EL1, 0), + SR_FGT(SYS_MAIR2_EL1, HFGRTR, nMAIR2_EL1, 0), + SR_FGT(SYS_S2POR_EL1, HFGRTR, nS2POR_EL1, 0), + SR_FGT(SYS_POR_EL1, HFGRTR, nPOR_EL1, 0), + SR_FGT(SYS_POR_EL0, HFGRTR, nPOR_EL0, 0), + SR_FGT(SYS_PIR_EL1, HFGRTR, nPIR_EL1, 0), + SR_FGT(SYS_PIRE0_EL1, HFGRTR, nPIRE0_EL1, 0), + SR_FGT(SYS_RCWMASK_EL1, HFGRTR, nRCWMASK_EL1, 0), + SR_FGT(SYS_TPIDR2_EL0, HFGRTR, nTPIDR2_EL0, 0), + SR_FGT(SYS_SMPRI_EL1, HFGRTR, nSMPRI_EL1, 0), + SR_FGT(SYS_GCSCR_EL1, HFGRTR, nGCS_EL1, 0), + SR_FGT(SYS_GCSPR_EL1, HFGRTR, nGCS_EL1, 0), + SR_FGT(SYS_GCSCRE0_EL1, HFGRTR, nGCS_EL0, 0), + SR_FGT(SYS_GCSPR_EL0, HFGRTR, nGCS_EL0, 0), + SR_FGT(SYS_ACCDATA_EL1, HFGRTR, nACCDATA_EL1, 0), + SR_FGT(SYS_ERXADDR_EL1, HFGRTR, ERXADDR_EL1, 1), + SR_FGT(SYS_ERXPFGCDN_EL1, HFGRTR, ERXPFGCDN_EL1, 1), + SR_FGT(SYS_ERXPFGCTL_EL1, HFGRTR, ERXPFGCTL_EL1, 1), + SR_FGT(SYS_ERXPFGF_EL1, HFGRTR, ERXPFGF_EL1, 1), + SR_FGT(SYS_ERXMISC0_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC1_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC2_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXMISC3_EL1, HFGRTR, ERXMISCn_EL1, 1), + SR_FGT(SYS_ERXSTATUS_EL1, HFGRTR, ERXSTATUS_EL1, 1), + SR_FGT(SYS_ERXCTLR_EL1, HFGRTR, ERXCTLR_EL1, 1), + SR_FGT(SYS_ERXFR_EL1, HFGRTR, ERXFR_EL1, 1), + SR_FGT(SYS_ERRSELR_EL1, HFGRTR, ERRSELR_EL1, 1), + SR_FGT(SYS_ERRIDR_EL1, HFGRTR, ERRIDR_EL1, 1), + SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGRTR, ICC_IGRPENn_EL1, 1), + SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGRTR, ICC_IGRPENn_EL1, 1), + SR_FGT(SYS_VBAR_EL1, HFGRTR, VBAR_EL1, 1), + SR_FGT(SYS_TTBR1_EL1, HFGRTR, TTBR1_EL1, 1), + SR_FGT(SYS_TTBR0_EL1, HFGRTR, TTBR0_EL1, 1), + SR_FGT(SYS_TPIDR_EL0, HFGRTR, TPIDR_EL0, 1), + SR_FGT(SYS_TPIDRRO_EL0, HFGRTR, TPIDRRO_EL0, 1), + SR_FGT(SYS_TPIDR_EL1, HFGRTR, TPIDR_EL1, 1), + SR_FGT(SYS_TCR_EL1, HFGRTR, TCR_EL1, 1), + SR_FGT(SYS_TCR2_EL1, HFGRTR, TCR_EL1, 1), + SR_FGT(SYS_SCXTNUM_EL0, HFGRTR, SCXTNUM_EL0, 1), + SR_FGT(SYS_SCXTNUM_EL1, HFGRTR, SCXTNUM_EL1, 1), + SR_FGT(SYS_SCTLR_EL1, HFGRTR, SCTLR_EL1, 1), + SR_FGT(SYS_REVIDR_EL1, HFGRTR, REVIDR_EL1, 1), + SR_FGT(SYS_PAR_EL1, HFGRTR, PAR_EL1, 1), + SR_FGT(SYS_MPIDR_EL1, HFGRTR, MPIDR_EL1, 1), + SR_FGT(SYS_MIDR_EL1, HFGRTR, MIDR_EL1, 1), + SR_FGT(SYS_MAIR_EL1, HFGRTR, MAIR_EL1, 1), + SR_FGT(SYS_LORSA_EL1, HFGRTR, LORSA_EL1, 1), + SR_FGT(SYS_LORN_EL1, HFGRTR, LORN_EL1, 1), + SR_FGT(SYS_LORID_EL1, HFGRTR, LORID_EL1, 1), + SR_FGT(SYS_LOREA_EL1, HFGRTR, 
LOREA_EL1, 1), + SR_FGT(SYS_LORC_EL1, HFGRTR, LORC_EL1, 1), + SR_FGT(SYS_ISR_EL1, HFGRTR, ISR_EL1, 1), + SR_FGT(SYS_FAR_EL1, HFGRTR, FAR_EL1, 1), + SR_FGT(SYS_ESR_EL1, HFGRTR, ESR_EL1, 1), + SR_FGT(SYS_DCZID_EL0, HFGRTR, DCZID_EL0, 1), + SR_FGT(SYS_CTR_EL0, HFGRTR, CTR_EL0, 1), + SR_FGT(SYS_CSSELR_EL1, HFGRTR, CSSELR_EL1, 1), + SR_FGT(SYS_CPACR_EL1, HFGRTR, CPACR_EL1, 1), + SR_FGT(SYS_CONTEXTIDR_EL1, HFGRTR, CONTEXTIDR_EL1, 1), + SR_FGT(SYS_CLIDR_EL1, HFGRTR, CLIDR_EL1, 1), + SR_FGT(SYS_CCSIDR_EL1, HFGRTR, CCSIDR_EL1, 1), + SR_FGT(SYS_APIBKEYLO_EL1, HFGRTR, APIBKey, 1), + SR_FGT(SYS_APIBKEYHI_EL1, HFGRTR, APIBKey, 1), + SR_FGT(SYS_APIAKEYLO_EL1, HFGRTR, APIAKey, 1), + SR_FGT(SYS_APIAKEYHI_EL1, HFGRTR, APIAKey, 1), + SR_FGT(SYS_APGAKEYLO_EL1, HFGRTR, APGAKey, 1), + SR_FGT(SYS_APGAKEYHI_EL1, HFGRTR, APGAKey, 1), + SR_FGT(SYS_APDBKEYLO_EL1, HFGRTR, APDBKey, 1), + SR_FGT(SYS_APDBKEYHI_EL1, HFGRTR, APDBKey, 1), + SR_FGT(SYS_APDAKEYLO_EL1, HFGRTR, APDAKey, 1), + SR_FGT(SYS_APDAKEYHI_EL1, HFGRTR, APDAKey, 1), + SR_FGT(SYS_AMAIR_EL1, HFGRTR, AMAIR_EL1, 1), + SR_FGT(SYS_AIDR_EL1, HFGRTR, AIDR_EL1, 1), + SR_FGT(SYS_AFSR1_EL1, HFGRTR, AFSR1_EL1, 1), + SR_FGT(SYS_AFSR0_EL1, HFGRTR, AFSR0_EL1, 1), + + /* HFGRTR2_EL2, HFGWTR2_EL2 */ + SR_FGT(SYS_ACTLRALIAS_EL1, HFGRTR2, nACTLRALIAS_EL1, 0), + SR_FGT(SYS_ACTLRMASK_EL1, HFGRTR2, nACTLRMASK_EL1, 0), + SR_FGT(SYS_CPACRALIAS_EL1, HFGRTR2, nCPACRALIAS_EL1, 0), + SR_FGT(SYS_CPACRMASK_EL1, HFGRTR2, nCPACRMASK_EL1, 0), + SR_FGT(SYS_PFAR_EL1, HFGRTR2, nPFAR_EL1, 0), + SR_FGT(SYS_RCWSMASK_EL1, HFGRTR2, nRCWSMASK_EL1, 0), + SR_FGT(SYS_SCTLR2ALIAS_EL1, HFGRTR2, nSCTLRALIAS2_EL1, 0), + SR_FGT(SYS_SCTLR2MASK_EL1, HFGRTR2, nSCTLR2MASK_EL1, 0), + SR_FGT(SYS_SCTLRALIAS_EL1, HFGRTR2, nSCTLRALIAS_EL1, 0), + SR_FGT(SYS_SCTLRMASK_EL1, HFGRTR2, nSCTLRMASK_EL1, 0), + SR_FGT(SYS_TCR2ALIAS_EL1, HFGRTR2, nTCR2ALIAS_EL1, 0), + SR_FGT(SYS_TCR2MASK_EL1, HFGRTR2, nTCR2MASK_EL1, 0), + SR_FGT(SYS_TCRALIAS_EL1, HFGRTR2, nTCRALIAS_EL1, 0), + SR_FGT(SYS_TCRMASK_EL1, HFGRTR2, nTCRMASK_EL1, 0), + SR_FGT(SYS_ERXGSR_EL1, HFGRTR2, nERXGSR_EL1, 0), + /* HFGITR_EL2 */ SR_FGT(OP_AT_S1E1A, HFGITR, ATS1E1A, 1), SR_FGT(OP_COSP_RCTX, HFGITR, COSPRCTX, 1), @@ -1480,6 +1515,11 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { SR_FGT(SYS_IC_IVAU, HFGITR, ICIVAU, 1), SR_FGT(SYS_IC_IALLU, HFGITR, ICIALLU, 1), SR_FGT(SYS_IC_IALLUIS, HFGITR, ICIALLUIS, 1), + + /* HFGITR2_EL2 */ + SR_FGT(SYS_DC_CIGDVAPS, HFGITR2, nDCCIVAPS, 0), + SR_FGT(SYS_DC_CIVAPS, HFGITR2, nDCCIVAPS, 0), + /* HDFGRTR_EL2 */ SR_FGT(SYS_PMBIDR_EL1, HDFGRTR, PMBIDR_EL1, 1), SR_FGT(SYS_PMSNEVFR_EL1, HDFGRTR, nPMSNEVFR_EL1, 0), @@ -1789,68 +1829,12 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { SR_FGT(SYS_PMCNTENSET_EL0, HDFGRTR, PMCNTEN, 1), SR_FGT(SYS_PMCCNTR_EL0, HDFGRTR, PMCCNTR_EL0, 1), SR_FGT(SYS_PMCCFILTR_EL0, HDFGRTR, PMCCFILTR_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(0), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(1), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(2), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(3), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(4), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(5), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(6), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(7), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(8), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(9), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(10), 
HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(11), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(12), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(13), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(14), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(15), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(16), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(17), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(18), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(19), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(20), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(21), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(22), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(23), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(24), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(25), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(26), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(27), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(28), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(29), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVTYPERn_EL0(30), HDFGRTR, PMEVTYPERn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(0), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(1), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(2), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(3), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(4), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(5), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(6), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(7), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(8), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(9), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(10), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(11), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(12), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(13), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(14), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(15), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(16), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(17), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(18), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(19), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(20), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(21), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(22), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(23), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(24), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(25), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(26), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(27), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(28), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(29), HDFGRTR, PMEVCNTRn_EL0, 1), - SR_FGT(SYS_PMEVCNTRn_EL0(30), HDFGRTR, PMEVCNTRn_EL0, 1), + SR_FGT_RANGE(SYS_PMEVTYPERn_EL0(0), + SYS_PMEVTYPERn_EL0(30), + HDFGRTR, PMEVTYPERn_EL0, 1), + SR_FGT_RANGE(SYS_PMEVCNTRn_EL0(0), + SYS_PMEVCNTRn_EL0(30), + HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_OSDLR_EL1, HDFGRTR, OSDLR_EL1, 1), SR_FGT(SYS_OSECCR_EL1, HDFGRTR, OSECCR_EL1, 1), SR_FGT(SYS_OSLSR_EL1, HDFGRTR, OSLSR_EL1, 1), @@ -1928,6 +1912,59 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { 
SR_FGT(SYS_DBGBCRn_EL1(13), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(14), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(15), HDFGRTR, DBGBCRn_EL1, 1), + + /* HDFGRTR2_EL2 */ + SR_FGT(SYS_MDSELR_EL1, HDFGRTR2, nMDSELR_EL1, 0), + SR_FGT(SYS_MDSTEPOP_EL1, HDFGRTR2, nMDSTEPOP_EL1, 0), + SR_FGT(SYS_PMCCNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0), + SR_FGT_RANGE(SYS_PMEVCNTSVRn_EL1(0), + SYS_PMEVCNTSVRn_EL1(30), + HDFGRTR2, nPMSSDATA, 0), + SR_FGT(SYS_PMICNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0), + SR_FGT(SYS_PMECR_EL1, HDFGRTR2, nPMECR_EL1, 0), + SR_FGT(SYS_PMIAR_EL1, HDFGRTR2, nPMIAR_EL1, 0), + SR_FGT(SYS_PMICFILTR_EL0, HDFGRTR2, nPMICFILTR_EL0, 0), + SR_FGT(SYS_PMICNTR_EL0, HDFGRTR2, nPMICNTR_EL0, 0), + SR_FGT(SYS_PMSSCR_EL1, HDFGRTR2, nPMSSCR_EL1, 0), + SR_FGT(SYS_PMUACR_EL1, HDFGRTR2, nPMUACR_EL1, 0), + SR_FGT(SYS_SPMACCESSR_EL1, HDFGRTR2, nSPMACCESSR_EL1, 0), + SR_FGT(SYS_SPMCFGR_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMDEVARCH_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCGCRn_EL1(0), HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCGCRn_EL1(1), HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMIIDR_EL1, HDFGRTR2, nSPMID, 0), + SR_FGT(SYS_SPMCNTENCLR_EL0, HDFGRTR2, nSPMCNTEN, 0), + SR_FGT(SYS_SPMCNTENSET_EL0, HDFGRTR2, nSPMCNTEN, 0), + SR_FGT(SYS_SPMCR_EL0, HDFGRTR2, nSPMCR_EL0, 0), + SR_FGT(SYS_SPMDEVAFF_EL1, HDFGRTR2, nSPMDEVAFF_EL1, 0), + /* + * We have up to 64 of these registers in ranges of 16, banked via + * SPMSELR_EL0.BANK. We're only concerned with the accessors here, + * not the architectural registers. + */ + SR_FGT_RANGE(SYS_SPMEVCNTRn_EL0(0), + SYS_SPMEVCNTRn_EL0(15), + HDFGRTR2, nSPMEVCNTRn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVFILT2Rn_EL0(0), + SYS_SPMEVFILT2Rn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVFILTRn_EL0(0), + SYS_SPMEVFILTRn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT_RANGE(SYS_SPMEVTYPERn_EL0(0), + SYS_SPMEVTYPERn_EL0(15), + HDFGRTR2, nSPMEVTYPERn_EL0, 0), + SR_FGT(SYS_SPMINTENCLR_EL1, HDFGRTR2, nSPMINTEN, 0), + SR_FGT(SYS_SPMINTENSET_EL1, HDFGRTR2, nSPMINTEN, 0), + SR_FGT(SYS_SPMOVSCLR_EL0, HDFGRTR2, nSPMOVS, 0), + SR_FGT(SYS_SPMOVSSET_EL0, HDFGRTR2, nSPMOVS, 0), + SR_FGT(SYS_SPMSCR_EL1, HDFGRTR2, nSPMSCR_EL1, 0), + SR_FGT(SYS_SPMSELR_EL0, HDFGRTR2, nSPMSELR_EL0, 0), + SR_FGT(SYS_TRCITECR_EL1, HDFGRTR2, nTRCITECR_EL1, 0), + SR_FGT(SYS_PMBMAR_EL1, HDFGRTR2, nPMBMAR_EL1, 0), + SR_FGT(SYS_PMSDSFR_EL1, HDFGRTR2, nPMSDSFR_EL1, 0), + SR_FGT(SYS_TRBMPAM_EL1, HDFGRTR2, nTRBMPAM_EL1, 0), + /* * HDFGWTR_EL2 * @@ -1938,12 +1975,19 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { * read-side mappings, and only the write-side mappings that * differ from the read side, and the trap handler will pick * the correct shadow register based on the access type. + * + * Same model applies to the FEAT_FGT2 registers. */ SR_FGT(SYS_TRFCR_EL1, HDFGWTR, TRFCR_EL1, 1), SR_FGT(SYS_TRCOSLAR, HDFGWTR, TRCOSLAR, 1), SR_FGT(SYS_PMCR_EL0, HDFGWTR, PMCR_EL0, 1), SR_FGT(SYS_PMSWINC_EL0, HDFGWTR, PMSWINC_EL0, 1), SR_FGT(SYS_OSLAR_EL1, HDFGWTR, OSLAR_EL1, 1), + + /* HDFGWTR2_EL2 */ + SR_FGT(SYS_PMZR_EL0, HDFGWTR2, nPMZR_EL0, 0), + SR_FGT(SYS_SPMZR_EL0, HDFGWTR2, nSPMEVCNTRn_EL0, 0), + /* * HAFGRTR_EL2 */ @@ -1989,6 +2033,20 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1), }; +/* + * Additional FGTs that do not fire with ESR_EL2.EC==0x18. This table + * isn't used for exception routing, but only as a promise that the + * trap is handled somewhere else. 
+ */ +static const union trap_config non_0x18_fgt[] __initconst = { + FGT(HFGITR, PSBCSYNC, 1), + FGT(HFGITR, nGCSSTR_EL1, 0), + FGT(HFGITR, SVC_EL1, 1), + FGT(HFGITR, SVC_EL0, 1), + FGT(HFGITR, ERET, 1), + FGT(HFGITR2, TSBCSYNC, 1), +}; + static union trap_config get_trap_config(u32 sysreg) { return (union trap_config) { @@ -2033,6 +2091,130 @@ static u32 encoding_next(u32 encoding) return sys_reg(op0 + 1, 0, 0, 0, 0); } +#define FGT_MASKS(__n, __m) \ + struct fgt_masks __n = { .str = #__m, .res0 = __m, } + +FGT_MASKS(hfgrtr_masks, HFGRTR_EL2_RES0); +FGT_MASKS(hfgwtr_masks, HFGWTR_EL2_RES0); +FGT_MASKS(hfgitr_masks, HFGITR_EL2_RES0); +FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2_RES0); +FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2_RES0); +FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2_RES0); +FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2_RES0); +FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2_RES0); +FGT_MASKS(hfgitr2_masks, HFGITR2_EL2_RES0); +FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2_RES0); +FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2_RES0); + +static __init bool aggregate_fgt(union trap_config tc) +{ + struct fgt_masks *rmasks, *wmasks; + + switch (tc.fgt) { + case HFGRTR_GROUP: + rmasks = &hfgrtr_masks; + wmasks = &hfgwtr_masks; + break; + case HDFGRTR_GROUP: + rmasks = &hdfgrtr_masks; + wmasks = &hdfgwtr_masks; + break; + case HAFGRTR_GROUP: + rmasks = &hafgrtr_masks; + wmasks = NULL; + break; + case HFGITR_GROUP: + rmasks = &hfgitr_masks; + wmasks = NULL; + break; + case HFGRTR2_GROUP: + rmasks = &hfgrtr2_masks; + wmasks = &hfgwtr2_masks; + break; + case HDFGRTR2_GROUP: + rmasks = &hdfgrtr2_masks; + wmasks = &hdfgwtr2_masks; + break; + case HFGITR2_GROUP: + rmasks = &hfgitr2_masks; + wmasks = NULL; + break; + } + + /* + * A bit can be reserved in either the R or W register, but + * not both. + */ + if ((BIT(tc.bit) & rmasks->res0) && + (!wmasks || (BIT(tc.bit) & wmasks->res0))) + return false; + + if (tc.pol) + rmasks->mask |= BIT(tc.bit) & ~rmasks->res0; + else + rmasks->nmask |= BIT(tc.bit) & ~rmasks->res0; + + if (wmasks) { + if (tc.pol) + wmasks->mask |= BIT(tc.bit) & ~wmasks->res0; + else + wmasks->nmask |= BIT(tc.bit) & ~wmasks->res0; + } + + return true; +} + +static __init int check_fgt_masks(struct fgt_masks *masks) +{ + unsigned long duplicate = masks->mask & masks->nmask; + u64 res0 = masks->res0; + int ret = 0; + + if (duplicate) { + int i; + + for_each_set_bit(i, &duplicate, 64) { + kvm_err("%s[%d] bit has both polarities\n", + masks->str, i); + } + + ret = -EINVAL; + } + + masks->res0 = ~(masks->mask | masks->nmask); + if (masks->res0 != res0) + kvm_info("Implicit %s = %016llx, expecting %016llx\n", + masks->str, masks->res0, res0); + + return ret; +} + +static __init int check_all_fgt_masks(int ret) +{ + static struct fgt_masks * const masks[] __initconst = { + &hfgrtr_masks, + &hfgwtr_masks, + &hfgitr_masks, + &hdfgrtr_masks, + &hdfgwtr_masks, + &hafgrtr_masks, + &hfgrtr2_masks, + &hfgwtr2_masks, + &hfgitr2_masks, + &hdfgrtr2_masks, + &hdfgwtr2_masks, + }; + int err = 0; + + for (int i = 0; i < ARRAY_SIZE(masks); i++) + err |= check_fgt_masks(masks[i]); + + return ret ?: err; +} + +#define for_each_encoding_in(__x, __s, __e) \ + for (u32 __x = __s; __x <= __e; __x = encoding_next(__x)) + int __init populate_nv_trap_config(void) { int ret = 0; @@ -2041,6 +2223,7 @@ int __init populate_nv_trap_config(void) BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS)); BUILD_BUG_ON(__NR_FGT_GROUP_IDS__ > BIT(TC_FGT_BITS)); BUILD_BUG_ON(__NR_FG_FILTER_IDS__ > BIT(TC_FGF_BITS)); + BUILD_BUG_ON(__HCRX_EL2_MASK & __HCRX_EL2_nMASK); 
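(Aside, not part of the patch above: the fine-grained-trap bookkeeping introduced in this hunk gives every trap bit a polarity — positive-polarity bits trap when set, negative-polarity "n"-prefixed bits trap when clear — and aggregate_fgt()/check_fgt_masks() fold those into per-register mask/nmask/res0 values while rejecting any bit claimed with both polarities. The sketch below is a minimal standalone illustration of that idea only; the struct, function names and bit numbers are made up, and the real check additionally consults the register's sanitised RES0 value, which is omitted here.)

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define BIT(n)	(1ULL << (n))

	/* Simplified stand-in for the kernel's fgt_masks bookkeeping (illustrative only). */
	struct demo_fgt_masks {
		uint64_t mask;	/* positive polarity: trap when the bit is set   */
		uint64_t nmask;	/* negative polarity: trap when the bit is clear */
		uint64_t res0;	/* bits described by neither polarity            */
	};

	/* Fold one trap description (bit + polarity) into the per-register masks. */
	static bool demo_aggregate_fgt(struct demo_fgt_masks *m, unsigned int bit, bool pol)
	{
		/* A bit may be claimed with one polarity only. */
		if ((m->mask | m->nmask) & BIT(bit))
			return false;

		if (pol)
			m->mask |= BIT(bit);
		else
			m->nmask |= BIT(bit);

		m->res0 = ~(m->mask | m->nmask);
		return true;
	}

	/* Decide whether a given FGT register value traps a given bit. */
	static bool demo_fgt_bit_traps(uint64_t reg_val, unsigned int bit, bool pol)
	{
		if (pol)
			return reg_val & BIT(bit);	/* set   => trap */
		return !(reg_val & BIT(bit));		/* clear => trap */
	}

	int main(void)
	{
		struct demo_fgt_masks m = { .res0 = ~0ULL };

		demo_aggregate_fgt(&m, 3, true);	/* a positive-polarity bit  */
		demo_aggregate_fgt(&m, 7, false);	/* an "n"-style negative bit */

		printf("mask=%#llx nmask=%#llx res0=%#llx\n",
		       (unsigned long long)m.mask, (unsigned long long)m.nmask,
		       (unsigned long long)m.res0);
		printf("bit 7 traps with register value 0: %d\n",
		       demo_fgt_bit_traps(0, 7, false));
		return 0;
	}

(With bit 3 positive and bit 7 negative, the example reports bit 7 as trapping when the register value is 0, mirroring why the "n" bits default to trapping unless the guest explicitly sets them.)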
for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) { const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i]; @@ -2051,7 +2234,7 @@ int __init populate_nv_trap_config(void) ret = -EINVAL; } - for (u32 enc = cgt->encoding; enc <= cgt->end; enc = encoding_next(enc)) { + for_each_encoding_in(enc, cgt->encoding, cgt->end) { prev = xa_store(&sr_forward_xa, enc, xa_mk_value(cgt->tc.val), GFP_KERNEL); if (prev && !xa_is_err(prev)) { @@ -2066,6 +2249,10 @@ int __init populate_nv_trap_config(void) } } + if (__HCRX_EL2_RES0 != HCRX_EL2_RES0) + kvm_info("Sanitised HCR_EL2_RES0 = %016llx, expecting %016llx\n", + __HCRX_EL2_RES0, HCRX_EL2_RES0); + kvm_info("nv: %ld coarse grained trap handlers\n", ARRAY_SIZE(encoding_to_cgt)); @@ -2082,23 +2269,39 @@ int __init populate_nv_trap_config(void) print_nv_trap_error(fgt, "Invalid FGT", ret); } - tc = get_trap_config(fgt->encoding); + for_each_encoding_in(enc, fgt->encoding, fgt->end) { + tc = get_trap_config(enc); - if (tc.fgt) { - ret = -EINVAL; - print_nv_trap_error(fgt, "Duplicate FGT", ret); - } + if (tc.fgt) { + ret = -EINVAL; + print_nv_trap_error(fgt, "Duplicate FGT", ret); + } + + tc.val |= fgt->tc.val; + prev = xa_store(&sr_forward_xa, enc, + xa_mk_value(tc.val), GFP_KERNEL); + + if (xa_is_err(prev)) { + ret = xa_err(prev); + print_nv_trap_error(fgt, "Failed FGT insertion", ret); + } - tc.val |= fgt->tc.val; - prev = xa_store(&sr_forward_xa, fgt->encoding, - xa_mk_value(tc.val), GFP_KERNEL); + if (!aggregate_fgt(tc)) { + ret = -EINVAL; + print_nv_trap_error(fgt, "FGT bit is reserved", ret); + } + } + } - if (xa_is_err(prev)) { - ret = xa_err(prev); - print_nv_trap_error(fgt, "Failed FGT insertion", ret); + for (int i = 0; i < ARRAY_SIZE(non_0x18_fgt); i++) { + if (!aggregate_fgt(non_0x18_fgt[i])) { + ret = -EINVAL; + kvm_err("non_0x18_fgt[%d] is reserved\n", i); } } + ret = check_all_fgt_masks(ret); + kvm_info("nv: %ld fine grained trap handlers\n", ARRAY_SIZE(encoding_to_fgt)); @@ -2215,11 +2418,11 @@ static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) return masks->mask[sr - __VNCR_START__].res0; } -static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read, - u64 val, const union trap_config tc) +static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr, + const union trap_config tc) { struct kvm *kvm = vcpu->kvm; - enum vcpu_sysreg sr; + u64 val; /* * KVM doesn't know about any FGTs that apply to the host, and hopefully @@ -2228,6 +2431,8 @@ static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read, if (is_hyp_ctxt(vcpu)) return false; + val = __vcpu_sys_reg(vcpu, sr); + if (tc.pol) return (val & BIT(tc.bit)); @@ -2242,38 +2447,17 @@ static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read, if (val & BIT(tc.bit)) return false; - switch ((enum fgt_group_id)tc.fgt) { - case HFGxTR_GROUP: - sr = is_read ? HFGRTR_EL2 : HFGWTR_EL2; - break; - - case HDFGRTR_GROUP: - sr = is_read ? 
HDFGRTR_EL2 : HDFGWTR_EL2; - break; - - case HAFGRTR_GROUP: - sr = HAFGRTR_EL2; - break; - - case HFGITR_GROUP: - sr = HFGITR_EL2; - break; - - default: - WARN_ONCE(1, "Unhandled FGT group"); - return false; - } - return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit)); } bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) { + enum vcpu_sysreg fgtreg; union trap_config tc; enum trap_behaviour b; bool is_read; u32 sysreg; - u64 esr, val; + u64 esr; esr = kvm_vcpu_get_esr(vcpu); sysreg = esr_sys64_to_sysreg(esr); @@ -2319,26 +2503,20 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) case __NO_FGT_GROUP__: break; - case HFGxTR_GROUP: - if (is_read) - val = __vcpu_sys_reg(vcpu, HFGRTR_EL2); - else - val = __vcpu_sys_reg(vcpu, HFGWTR_EL2); + case HFGRTR_GROUP: + fgtreg = is_read ? HFGRTR_EL2 : HFGWTR_EL2; break; case HDFGRTR_GROUP: - if (is_read) - val = __vcpu_sys_reg(vcpu, HDFGRTR_EL2); - else - val = __vcpu_sys_reg(vcpu, HDFGWTR_EL2); + fgtreg = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2; break; case HAFGRTR_GROUP: - val = __vcpu_sys_reg(vcpu, HAFGRTR_EL2); + fgtreg = HAFGRTR_EL2; break; case HFGITR_GROUP: - val = __vcpu_sys_reg(vcpu, HFGITR_EL2); + fgtreg = HFGITR_EL2; switch (tc.fgf) { u64 tmp; @@ -2352,13 +2530,26 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) } break; - case __NR_FGT_GROUP_IDS__: + case HFGRTR2_GROUP: + fgtreg = is_read ? HFGRTR2_EL2 : HFGWTR2_EL2; + break; + + case HDFGRTR2_GROUP: + fgtreg = is_read ? HDFGRTR2_EL2 : HDFGWTR2_EL2; + break; + + case HFGITR2_GROUP: + fgtreg = HFGITR2_EL2; + break; + + default: /* Something is really wrong, bail out */ - WARN_ONCE(1, "__NR_FGT_GROUP_IDS__"); + WARN_ONCE(1, "Bad FGT group (encoding %08x, config %016llx)\n", + sysreg, tc.val); goto local; } - if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, is_read, val, tc)) + if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, fgtreg, tc)) goto inject; b = compute_trap_behaviour(vcpu, tc); @@ -2471,13 +2662,6 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) { u64 spsr, elr, esr; - /* - * Forward this trap to the virtual EL2 if the virtual - * HCR_EL2.NV bit is set and this is coming from !EL2. - */ - if (forward_hcr_traps(vcpu, HCR_NV)) - return; - spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); spsr = kvm_check_illegal_exception_return(vcpu, spsr); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index b73dc26bc44b..453266c96481 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -10,6 +10,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/ubsan.h> #include <asm/esr.h> #include <asm/exception.h> @@ -298,6 +299,81 @@ static int handle_svc(struct kvm_vcpu *vcpu) return 1; } +static int kvm_handle_gcs(struct kvm_vcpu *vcpu) +{ + /* We don't expect GCS, so treat it with contempt */ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, GCS, IMP)) + WARN_ON_ONCE(1); + + kvm_inject_undefined(vcpu); + return 1; +} + +static int handle_other(struct kvm_vcpu *vcpu) +{ + bool is_l2 = vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu); + u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2); + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 iss = ESR_ELx_ISS(esr); + struct kvm *kvm = vcpu->kvm; + bool allowed, fwd = false; + + /* + * We only trap for two reasons: + * + * - the feature is disabled, and the only outcome is to + * generate an UNDEF. + * + * - the feature is enabled, but a NV guest wants to trap the + * feature used by its L2 guest. We forward the exception in + * this case. 
+ * + * What we don't expect is to end-up here if the guest is + * expected be be able to directly use the feature, hence the + * WARN_ON below. + */ + switch (iss) { + case ESR_ELx_ISS_OTHER_ST64BV: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V); + if (is_l2) + fwd = !(hcrx & HCRX_EL2_EnASR); + break; + case ESR_ELx_ISS_OTHER_ST64BV0: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA); + if (is_l2) + fwd = !(hcrx & HCRX_EL2_EnAS0); + break; + case ESR_ELx_ISS_OTHER_LDST64B: + allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64); + if (is_l2) + fwd = !(hcrx & HCRX_EL2_EnALS); + break; + case ESR_ELx_ISS_OTHER_TSBCSYNC: + allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1); + if (is_l2) + fwd = (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC); + break; + case ESR_ELx_ISS_OTHER_PSBCSYNC: + allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5); + if (is_l2) + fwd = (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC); + break; + default: + /* Clearly, we're missing something. */ + WARN_ON_ONCE(1); + allowed = false; + } + + WARN_ON_ONCE(allowed && !fwd); + + if (allowed && fwd) + kvm_inject_nested_sync(vcpu, esr); + else + kvm_inject_undefined(vcpu); + + return 1; +} + static exit_handle_fn arm_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = kvm_handle_unknown_ec, [ESR_ELx_EC_WFx] = kvm_handle_wfx, @@ -307,6 +383,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_CP14_LS] = kvm_handle_cp14_load_store, [ESR_ELx_EC_CP10_ID] = kvm_handle_cp10_id, [ESR_ELx_EC_CP14_64] = kvm_handle_cp14_64, + [ESR_ELx_EC_OTHER] = handle_other, [ESR_ELx_EC_HVC32] = handle_hvc, [ESR_ELx_EC_SMC32] = handle_smc, [ESR_ELx_EC_HVC64] = handle_hvc, @@ -317,6 +394,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_ERET] = kvm_handle_eret, [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, + [ESR_ELx_EC_DABT_CUR] = kvm_handle_vncr_abort, [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, @@ -324,6 +402,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, [ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd, [ESR_ELx_EC_PAC] = kvm_handle_ptrauth, + [ESR_ELx_EC_GCS] = kvm_handle_gcs, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) @@ -474,6 +553,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, print_nvhe_hyp_panic("BUG", panic_addr); } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) { kvm_nvhe_report_cfi_failure(panic_addr); + } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && + ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && + esr_is_ubsan_brk(esr)) { + print_nvhe_hyp_panic(report_ubsan_failure(esr & UBSAN_BRK_MASK), + panic_addr); } else { print_nvhe_hyp_panic("panic", panic_addr); } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 96f625dc7256..bb9f2eecfb67 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -65,12 +65,56 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +#define reg_to_fgt_masks(reg) \ + ({ \ + struct fgt_masks *m; \ + switch(reg) { \ + case HFGRTR_EL2: \ + m = &hfgrtr_masks; \ + break; \ + case HFGWTR_EL2: \ + m = &hfgwtr_masks; \ + break; \ + case HFGITR_EL2: \ + m = &hfgitr_masks; \ + break; \ + case HDFGRTR_EL2: \ + m = &hdfgrtr_masks; \ + break; \ + case HDFGWTR_EL2: 
\ + m = &hdfgwtr_masks; \ + break; \ + case HAFGRTR_EL2: \ + m = &hafgrtr_masks; \ + break; \ + case HFGRTR2_EL2: \ + m = &hfgrtr2_masks; \ + break; \ + case HFGWTR2_EL2: \ + m = &hfgwtr2_masks; \ + break; \ + case HFGITR2_EL2: \ + m = &hfgitr2_masks; \ + break; \ + case HDFGRTR2_EL2: \ + m = &hdfgrtr2_masks; \ + break; \ + case HDFGWTR2_EL2: \ + m = &hdfgwtr2_masks; \ + break; \ + default: \ + BUILD_BUG_ON(1); \ + } \ + \ + m; \ + }) + #define compute_clr_set(vcpu, reg, clr, set) \ do { \ - u64 hfg; \ - hfg = __vcpu_sys_reg(vcpu, reg) & ~__ ## reg ## _RES0; \ - set |= hfg & __ ## reg ## _MASK; \ - clr |= ~hfg & __ ## reg ## _nMASK; \ + u64 hfg = __vcpu_sys_reg(vcpu, reg); \ + struct fgt_masks *m = reg_to_fgt_masks(reg); \ + set |= hfg & m->mask; \ + clr |= ~hfg & m->nmask; \ } while(0) #define reg_to_fgt_group_id(reg) \ @@ -79,7 +123,7 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) switch(reg) { \ case HFGRTR_EL2: \ case HFGWTR_EL2: \ - id = HFGxTR_GROUP; \ + id = HFGRTR_GROUP; \ break; \ case HFGITR_EL2: \ id = HFGITR_GROUP; \ @@ -91,6 +135,17 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) case HAFGRTR_EL2: \ id = HAFGRTR_GROUP; \ break; \ + case HFGRTR2_EL2: \ + case HFGWTR2_EL2: \ + id = HFGRTR2_GROUP; \ + break; \ + case HFGITR2_EL2: \ + id = HFGITR2_GROUP; \ + break; \ + case HDFGRTR2_EL2: \ + case HDFGWTR2_EL2: \ + id = HDFGRTR2_GROUP; \ + break; \ default: \ BUILD_BUG_ON(1); \ } \ @@ -101,13 +156,16 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) #define compute_undef_clr_set(vcpu, kvm, reg, clr, set) \ do { \ u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)]; \ - set |= hfg & __ ## reg ## _MASK; \ - clr |= hfg & __ ## reg ## _nMASK; \ + struct fgt_masks *m = reg_to_fgt_masks(reg); \ + set |= hfg & m->mask; \ + clr |= hfg & m->nmask; \ } while(0) #define update_fgt_traps_cs(hctxt, vcpu, kvm, reg, clr, set) \ do { \ - u64 c = 0, s = 0; \ + struct fgt_masks *m = reg_to_fgt_masks(reg); \ + u64 c = clr, s = set; \ + u64 val; \ \ ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \ @@ -115,30 +173,15 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) \ compute_undef_clr_set(vcpu, kvm, reg, c, s); \ \ - s |= set; \ - c |= clr; \ - if (c || s) { \ - u64 val = __ ## reg ## _nMASK; \ - val |= s; \ - val &= ~c; \ - write_sysreg_s(val, SYS_ ## reg); \ - } \ + val = m->nmask; \ + val |= s; \ + val &= ~c; \ + write_sysreg_s(val, SYS_ ## reg); \ } while(0) #define update_fgt_traps(hctxt, vcpu, kvm, reg) \ update_fgt_traps_cs(hctxt, vcpu, kvm, reg, 0, 0) -/* - * Validate the fine grain trap masks. - * Check that the masks do not overlap and that all bits are accounted for. 
- */ -#define CHECK_FGT_MASKS(reg) \ - do { \ - BUILD_BUG_ON((__ ## reg ## _MASK) & (__ ## reg ## _nMASK)); \ - BUILD_BUG_ON(~((__ ## reg ## _RES0) ^ (__ ## reg ## _MASK) ^ \ - (__ ## reg ## _nMASK))); \ - } while(0) - static inline bool cpu_has_amu(void) { u64 pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1); @@ -152,56 +195,60 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); struct kvm *kvm = kern_hyp_va(vcpu->kvm); - CHECK_FGT_MASKS(HFGRTR_EL2); - CHECK_FGT_MASKS(HFGWTR_EL2); - CHECK_FGT_MASKS(HFGITR_EL2); - CHECK_FGT_MASKS(HDFGRTR_EL2); - CHECK_FGT_MASKS(HDFGWTR_EL2); - CHECK_FGT_MASKS(HAFGRTR_EL2); - CHECK_FGT_MASKS(HCRX_EL2); - if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; update_fgt_traps(hctxt, vcpu, kvm, HFGRTR_EL2); update_fgt_traps_cs(hctxt, vcpu, kvm, HFGWTR_EL2, 0, cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) ? - HFGxTR_EL2_TCR_EL1_MASK : 0); + HFGWTR_EL2_TCR_EL1_MASK : 0); update_fgt_traps(hctxt, vcpu, kvm, HFGITR_EL2); update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR_EL2); update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR_EL2); if (cpu_has_amu()) update_fgt_traps(hctxt, vcpu, kvm, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + update_fgt_traps(hctxt, vcpu, kvm, HFGRTR2_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HFGWTR2_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HFGITR2_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR2_EL2); + update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR2_EL2); } -#define __deactivate_fgt(htcxt, vcpu, kvm, reg) \ +#define __deactivate_fgt(htcxt, vcpu, reg) \ do { \ - if ((vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) || \ - kvm->arch.fgu[reg_to_fgt_group_id(reg)]) \ - write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ - SYS_ ## reg); \ + write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ + SYS_ ## reg); \ } while(0) static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); - struct kvm *kvm = kern_hyp_va(vcpu->kvm); if (!cpus_have_final_cap(ARM64_HAS_FGT)) return; - __deactivate_fgt(hctxt, vcpu, kvm, HFGRTR_EL2); - if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) - write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2); - else - __deactivate_fgt(hctxt, vcpu, kvm, HFGWTR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HFGITR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HDFGRTR_EL2); - __deactivate_fgt(hctxt, vcpu, kvm, HDFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR_EL2); if (cpu_has_amu()) - __deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, HAFGRTR_EL2); + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + __deactivate_fgt(hctxt, vcpu, HFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGWTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HFGITR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGRTR2_EL2); + __deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2); } static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) @@ -260,12 +307,9 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) if (cpus_have_final_cap(ARM64_HAS_HCX)) { u64 hcrx = vcpu->arch.hcrx_el2; if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { - u64 clr = 0, set = 0; - - compute_clr_set(vcpu, HCRX_EL2, clr, set); - - hcrx |= set; - hcrx &= ~clr; + u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2); + hcrx |= val & __HCRX_EL2_MASK; 
+ hcrx &= ~(~val & __HCRX_EL2_nMASK); } ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2); @@ -300,7 +344,7 @@ static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM)) hcr |= HCR_TVM; - write_sysreg(hcr, hcr_el2); + write_sysreg_hcr(hcr); if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index ea0a704da9b8..5f9d56754e39 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -39,12 +39,12 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); -int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); -int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); -int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); -int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm); +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm); int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); @@ -67,4 +67,10 @@ static __always_inline void __load_host_stage2(void) else write_sysreg(0, vttbr_el2); } + +#ifdef CONFIG_NVHE_EL2_DEBUG +void pkvm_ownership_selftest(void *base); +#else +static inline void pkvm_ownership_selftest(void *base) { } +#endif #endif /* __KVM_NVHE_MEM_PROTECT__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 34233d586060..dee1a406b0c2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -8,23 +8,30 @@ #include <linux/types.h> /* - * Bits 0-1 are reserved to track the memory ownership state of each page: - * 00: The page is owned exclusively by the page-table owner. - * 01: The page is owned by the page-table owner, but is shared - * with another entity. - * 10: The page is shared with, but not owned by the page-table owner. - * 11: Reserved for future use (lending). + * Bits 0-1 are used to encode the memory ownership state of each page from the + * point of view of a pKVM "component" (host, hyp, guest, ... see enum + * pkvm_component_id): + * 00: The page is owned and exclusively accessible by the component; + * 01: The page is owned and accessible by the component, but is also + * accessible by another component; + * 10: The page is accessible but not owned by the component; + * The storage of this state depends on the component: either in the + * hyp_vmemmap for the host and hyp states or in PTE software bits for guests. 
*/ enum pkvm_page_state { PKVM_PAGE_OWNED = 0ULL, PKVM_PAGE_SHARED_OWNED = BIT(0), PKVM_PAGE_SHARED_BORROWED = BIT(1), - __PKVM_PAGE_RESERVED = BIT(0) | BIT(1), - /* Meta-states which aren't encoded directly in the PTE's SW bits */ - PKVM_NOPAGE = BIT(2), + /* + * 'Meta-states' are not stored directly in PTE SW bits for guest + * states, but inferred from the context (e.g. invalid PTE entries). + * For the host and hyp, meta-states are stored directly in the + * struct hyp_page. + */ + PKVM_NOPAGE = BIT(0) | BIT(1), }; -#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED) +#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1)) #define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, @@ -44,8 +51,15 @@ struct hyp_page { u16 refcount; u8 order; - /* Host (non-meta) state. Guarded by the host stage-2 lock. */ - enum pkvm_page_state host_state : 8; + /* Host state. Guarded by the host stage-2 lock. */ + unsigned __host_state : 4; + + /* + * Complement of the hyp state. Guarded by the hyp stage-1 lock. We use + * the complement so that the initial 0 in __hyp_state_comp (due to the + * entire vmemmap starting off zeroed) encodes PKVM_NOPAGE. + */ + unsigned __hyp_state_comp : 4; u32 host_share_guest_count; }; @@ -82,6 +96,26 @@ static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +static inline enum pkvm_page_state get_host_state(struct hyp_page *p) +{ + return p->__host_state; +} + +static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__host_state = state; +} + +static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p) +{ + return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK; +} + +static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state) +{ + p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK; +} + /* * Refcounting for 'struct hyp_page'. * hyp_pool::lock must be held if atomic access to the refcount is required. diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 230e4f2527de..6e83ce35c2f2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -13,9 +13,11 @@ extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; -int hyp_create_pcpu_fixmap(void); +int hyp_create_fixmap(void); void *hyp_fixmap_map(phys_addr_t phys); void hyp_fixmap_unmap(void); +void *hyp_fixblock_map(phys_addr_t phys, size_t *size); +void hyp_fixblock_unmap(void); int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index b43426a493df..a76522d63c3e 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -99,3 +99,9 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAG # causes a build failure. Remove profile optimization flags. 
KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables + +ifeq ($(CONFIG_UBSAN_KVM_EL2),y) +UBSAN_SANITIZE := y +# Always use brk and not hooks +ccflags-y += $(CFLAGS_UBSAN_TRAP) +endif diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 58f0cb2298cc..eef15b374abb 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -124,7 +124,7 @@ SYM_FUNC_START(__hyp_do_panic) /* Ensure host stage-2 is disabled */ mrs x0, hcr_el2 bic x0, x0, #HCR_VM - msr hcr_el2, x0 + msr_hcr_el2 x0 isb tlbi vmalls12e1 dsb nsh diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index f8af11189572..aada42522e7b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -100,7 +100,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) msr mair_el2, x1 ldr x1, [x0, #NVHE_INIT_HCR_EL2] - msr hcr_el2, x1 + msr_hcr_el2 x1 mov x2, #HCR_E2H and x2, x1, x2 @@ -262,7 +262,7 @@ reset: alternative_if ARM64_KVM_PROTECTED_MODE mov_q x5, HCR_HOST_NVHE_FLAGS - msr hcr_el2, x5 + msr_hcr_el2 x5 alternative_else_nop_endif /* Install stub vectors */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2c37680d954c..8e8848de4d47 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -123,10 +123,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; - hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - /* Limit guest vector length to the maximum supported by the host. */ - hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); - hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & @@ -249,7 +245,8 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, pfn, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); - DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); struct pkvm_hyp_vcpu *hyp_vcpu; int ret = -EINVAL; @@ -264,7 +261,7 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) if (ret) goto out; - ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu, prot); + ret = __pkvm_host_share_guest(pfn, gfn, nr_pages, hyp_vcpu, prot); out: cpu_reg(host_ctxt, 1) = ret; } @@ -273,6 +270,7 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; @@ -283,7 +281,7 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) if (!hyp_vm) goto out; - ret = __pkvm_host_unshare_guest(gfn, hyp_vm); + ret = __pkvm_host_unshare_guest(gfn, nr_pages, hyp_vm); put_pkvm_hyp_vm(hyp_vm); out: cpu_reg(host_ctxt, 1) = ret; @@ -312,6 +310,7 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; @@ -322,7 +321,7 @@ static void 
handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt if (!hyp_vm) goto out; - ret = __pkvm_host_wrprotect_guest(gfn, hyp_vm); + ret = __pkvm_host_wrprotect_guest(gfn, nr_pages, hyp_vm); put_pkvm_hyp_vm(hyp_vm); out: cpu_reg(host_ctxt, 1) = ret; @@ -332,7 +331,8 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); - DECLARE_REG(bool, mkold, host_ctxt, 3); + DECLARE_REG(u64, nr_pages, host_ctxt, 3); + DECLARE_REG(bool, mkold, host_ctxt, 4); struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; @@ -343,7 +343,7 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho if (!hyp_vm) goto out; - ret = __pkvm_host_test_clear_young_guest(gfn, mkold, hyp_vm); + ret = __pkvm_host_test_clear_young_guest(gfn, nr_pages, mkold, hyp_vm); put_pkvm_hyp_vm(hyp_vm); out: cpu_reg(host_ctxt, 1) = ret; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index f4562f417d3f..d724f6d69302 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -25,5 +25,7 @@ SECTIONS { BEGIN_HYP_SECTION(.data..percpu) PERCPU_INPUT(L1_CACHE_BYTES) END_HYP_SECTION + HYP_SECTION(.bss) + HYP_SECTION(.data) } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index e80f3ebd3e2a..95d7534c9679 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -60,6 +60,11 @@ static void hyp_unlock_component(void) hyp_spin_unlock(&pkvm_pgd_lock); } +#define for_each_hyp_page(__p, __st, __sz) \ + for (struct hyp_page *__p = hyp_phys_to_page(__st), \ + *__e = __p + ((__sz) >> PAGE_SHIFT); \ + __p < __e; __p++) + static void *host_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size)); @@ -161,12 +166,6 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) return 0; } -static bool guest_stage2_force_pte_cb(u64 addr, u64 end, - enum kvm_pgtable_prot prot) -{ - return true; -} - static void *guest_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); @@ -217,16 +216,42 @@ static void guest_s2_put_page(void *addr) hyp_put_page(¤t_vm->pool, addr); } +static void __apply_guest_page(void *va, size_t size, + void (*func)(void *addr, size_t size)) +{ + size += va - PTR_ALIGN_DOWN(va, PAGE_SIZE); + va = PTR_ALIGN_DOWN(va, PAGE_SIZE); + size = PAGE_ALIGN(size); + + while (size) { + size_t map_size = PAGE_SIZE; + void *map; + + if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE) + map = hyp_fixblock_map(__hyp_pa(va), &map_size); + else + map = hyp_fixmap_map(__hyp_pa(va)); + + func(map, map_size); + + if (map_size == PMD_SIZE) + hyp_fixblock_unmap(); + else + hyp_fixmap_unmap(); + + size -= map_size; + va += map_size; + } +} + static void clean_dcache_guest_page(void *va, size_t size) { - __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); - hyp_fixmap_unmap(); + __apply_guest_page(va, size, __clean_dcache_guest_page); } static void invalidate_icache_guest_page(void *va, size_t size) { - __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); - hyp_fixmap_unmap(); + __apply_guest_page(va, size, __invalidate_icache_guest_page); } int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) @@ -255,8 +280,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) }; guest_lock_component(vm); - ret = __kvm_pgtable_stage2_init(mmu->pgt, 
mmu, &vm->mm_ops, 0, - guest_stage2_force_pte_cb); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, NULL); guest_unlock_component(vm); if (ret) return ret; @@ -309,7 +333,7 @@ int __pkvm_prot_finalize(void) */ kvm_flush_dcache_to_poc(params, sizeof(*params)); - write_sysreg(params->hcr_el2, hcr_el2); + write_sysreg_hcr(params->hcr_el2); __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); /* @@ -467,7 +491,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) return -EAGAIN; if (pte) { - WARN_ON(addr_is_memory(addr) && hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE); + WARN_ON(addr_is_memory(addr) && + get_host_state(hyp_phys_to_page(addr)) != PKVM_NOPAGE); return -EPERM; } @@ -493,10 +518,8 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state) { - phys_addr_t end = addr + size; - - for (; addr < end; addr += PAGE_SIZE) - hyp_phys_to_page(addr)->host_state = state; + for_each_hyp_page(page, addr, size) + set_host_state(page, state); } int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) @@ -618,16 +641,16 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, static int __host_check_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - u64 end = addr + size; int ret; - ret = check_range_allowed_memory(addr, end); + ret = check_range_allowed_memory(addr, addr + size); if (ret) return ret; hyp_assert_lock_held(&host_mmu.lock); - for (; addr < end; addr += PAGE_SIZE) { - if (hyp_phys_to_page(addr)->host_state != state) + + for_each_hyp_page(page, addr, size) { + if (get_host_state(page) != state) return -EPERM; } @@ -637,7 +660,7 @@ static int __host_check_page_state_range(u64 addr, u64 size, static int __host_set_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - if (hyp_phys_to_page(addr)->host_state == PKVM_NOPAGE) { + if (get_host_state(hyp_phys_to_page(addr)) == PKVM_NOPAGE) { int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); if (ret) @@ -649,24 +672,20 @@ static int __host_set_page_state_range(u64 addr, u64 size, return 0; } -static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) +static void __hyp_set_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) { - if (!kvm_pte_valid(pte)) - return PKVM_NOPAGE; - - return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + for_each_hyp_page(page, phys, size) + set_hyp_state(page, state); } -static int __hyp_check_page_state_range(u64 addr, u64 size, - enum pkvm_page_state state) +static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state) { - struct check_walk_data d = { - .desired = state, - .get_page_state = hyp_get_page_state, - }; + for_each_hyp_page(page, phys, size) { + if (get_hyp_state(page) != state) + return -EPERM; + } - hyp_assert_lock_held(&pkvm_pgd_lock); - return check_page_state_range(&pkvm_pgtable, addr, size, &d); + return 0; } static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) @@ -677,10 +696,9 @@ static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); } -static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, +static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr, u64 size, enum pkvm_page_state state) { - struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); struct 
check_walk_data d = { .desired = state, .get_page_state = guest_get_page_state, @@ -693,8 +711,6 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, int __pkvm_host_share_hyp(u64 pfn) { u64 phys = hyp_pfn_to_phys(pfn); - void *virt = __hyp_va(phys); - enum kvm_pgtable_prot prot; u64 size = PAGE_SIZE; int ret; @@ -704,14 +720,11 @@ int __pkvm_host_share_hyp(u64 pfn) ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); if (ret) goto unlock; - if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { - ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); - if (ret) - goto unlock; - } + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; - prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); - WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + __hyp_set_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED)); unlock: @@ -734,7 +747,7 @@ int __pkvm_host_unshare_hyp(u64 pfn) ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); if (ret) goto unlock; - ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_SHARED_BORROWED); + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); if (ret) goto unlock; if (hyp_page_count((void *)virt)) { @@ -742,7 +755,7 @@ int __pkvm_host_unshare_hyp(u64 pfn) goto unlock; } - WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); unlock: @@ -757,7 +770,6 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) u64 phys = hyp_pfn_to_phys(pfn); u64 size = PAGE_SIZE * nr_pages; void *virt = __hyp_va(phys); - enum kvm_pgtable_prot prot; int ret; host_lock_component(); @@ -766,14 +778,12 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); if (ret) goto unlock; - if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { - ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); - if (ret) - goto unlock; - } + ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; - prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); - WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + __hyp_set_page_state_range(phys, size, PKVM_PAGE_OWNED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, PAGE_HYP)); WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP)); unlock: @@ -793,15 +803,14 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) host_lock_component(); hyp_lock_component(); - ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_OWNED); + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); if (ret) goto unlock; - if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { - ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); - if (ret) - goto unlock; - } + __hyp_set_page_state_range(phys, size, PKVM_NOPAGE); WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); @@ -816,24 +825,30 @@ int hyp_pin_shared_mem(void *from, void *to) { u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); u64 end = PAGE_ALIGN((u64)to); + u64 phys = __hyp_pa(start); u64 size = end - start; + struct hyp_page *p; int ret; host_lock_component(); hyp_lock_component(); - ret = 
__host_check_page_state_range(__hyp_pa(start), size, - PKVM_PAGE_SHARED_OWNED); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); if (ret) goto unlock; - ret = __hyp_check_page_state_range(start, size, - PKVM_PAGE_SHARED_BORROWED); + ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED); if (ret) goto unlock; - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_inc(hyp_virt_to_page(cur)); + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + hyp_page_ref_inc(p); + if (p->refcount == 1) + WARN_ON(pkvm_create_mappings_locked((void *)cur, + (void *)cur + PAGE_SIZE, + PAGE_HYP)); + } unlock: hyp_unlock_component(); @@ -846,12 +861,17 @@ void hyp_unpin_shared_mem(void *from, void *to) { u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); u64 end = PAGE_ALIGN((u64)to); + struct hyp_page *p; host_lock_component(); hyp_lock_component(); - for (cur = start; cur < end; cur += PAGE_SIZE) - hyp_page_ref_dec(hyp_virt_to_page(cur)); + for (cur = start; cur < end; cur += PAGE_SIZE) { + p = hyp_virt_to_page(cur); + if (p->refcount == 1) + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, cur, PAGE_SIZE) != PAGE_SIZE); + hyp_page_ref_dec(p); + } hyp_unlock_component(); host_unlock_component(); @@ -887,49 +907,84 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) return ret; } -int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, +static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *size) +{ + size_t block_size; + + if (nr_pages == 1) { + *size = PAGE_SIZE; + return 0; + } + + /* We solely support second to last level huge mapping */ + block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1); + + if (nr_pages != block_size >> PAGE_SHIFT) + return -EINVAL; + + if (!IS_ALIGNED(phys | ipa, block_size)) + return -EINVAL; + + *size = block_size; + return 0; +} + +int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 phys = hyp_pfn_to_phys(pfn); u64 ipa = hyp_pfn_to_phys(gfn); - struct hyp_page *page; + u64 size; int ret; if (prot & ~KVM_PGTABLE_PROT_RWX) return -EINVAL; - ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + ret = __guest_check_transition_size(phys, ipa, nr_pages, &size); + if (ret) + return ret; + + ret = check_range_allowed_memory(phys, phys + size); if (ret) return ret; host_lock_component(); guest_lock_component(vm); - ret = __guest_check_page_state_range(vcpu, ipa, PAGE_SIZE, PKVM_NOPAGE); + ret = __guest_check_page_state_range(vm, ipa, size, PKVM_NOPAGE); if (ret) goto unlock; - page = hyp_phys_to_page(phys); - switch (page->host_state) { - case PKVM_PAGE_OWNED: - WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED)); - break; - case PKVM_PAGE_SHARED_OWNED: - if (page->host_share_guest_count) - break; - /* Only host to np-guest multi-sharing is tolerated */ - WARN_ON(1); - fallthrough; - default: - ret = -EPERM; - goto unlock; + for_each_hyp_page(page, phys, size) { + switch (get_host_state(page)) { + case PKVM_PAGE_OWNED: + continue; + case PKVM_PAGE_SHARED_OWNED: + if (page->host_share_guest_count == U32_MAX) { + ret = -EBUSY; + goto unlock; + } + + /* Only host to np-guest multi-sharing is tolerated */ + if (page->host_share_guest_count) + continue; + + fallthrough; + default: + ret = -EPERM; + goto unlock; + } } - WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + for_each_hyp_page(page, phys, 
size) { + set_host_state(page, PKVM_PAGE_SHARED_OWNED); + page->host_share_guest_count++; + } + + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, size, phys, pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED), &vcpu->vcpu.arch.pkvm_memcache, 0)); - page->host_share_guest_count++; unlock: guest_unlock_component(vm); @@ -938,10 +993,9 @@ unlock: return ret; } -static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa) +static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa, u64 size) { enum pkvm_page_state state; - struct hyp_page *page; kvm_pte_t pte; u64 phys; s8 level; @@ -952,7 +1006,7 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip return ret; if (!kvm_pte_valid(pte)) return -ENOENT; - if (level != KVM_PGTABLE_LAST_LEVEL) + if (kvm_granule_size(level) != size) return -E2BIG; state = guest_get_page_state(pte, ipa); @@ -960,43 +1014,49 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip return -EPERM; phys = kvm_pte_to_phys(pte); - ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + ret = check_range_allowed_memory(phys, phys + size); if (WARN_ON(ret)) return ret; - page = hyp_phys_to_page(phys); - if (page->host_state != PKVM_PAGE_SHARED_OWNED) - return -EPERM; - if (WARN_ON(!page->host_share_guest_count)) - return -EINVAL; + for_each_hyp_page(page, phys, size) { + if (get_host_state(page) != PKVM_PAGE_SHARED_OWNED) + return -EPERM; + if (WARN_ON(!page->host_share_guest_count)) + return -EINVAL; + } *__phys = phys; return 0; } -int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm) +int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) { u64 ipa = hyp_pfn_to_phys(gfn); - struct hyp_page *page; - u64 phys; + u64 size, phys; int ret; + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + host_lock_component(); guest_lock_component(vm); - ret = __check_host_shared_guest(vm, &phys, ipa); + ret = __check_host_shared_guest(vm, &phys, ipa, size); if (ret) goto unlock; - ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE); + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, size); if (ret) goto unlock; - page = hyp_phys_to_page(phys); - page->host_share_guest_count--; - if (!page->host_share_guest_count) - WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED)); + for_each_hyp_page(page, phys, size) { + /* __check_host_shared_guest() protects against underflow */ + page->host_share_guest_count--; + if (!page->host_share_guest_count) + set_host_state(page, PKVM_PAGE_OWNED); + } unlock: guest_unlock_component(vm); @@ -1005,7 +1065,7 @@ unlock: return ret; } -static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa, u64 size) { u64 phys; int ret; @@ -1016,7 +1076,7 @@ static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) host_lock_component(); guest_lock_component(vm); - ret = __check_host_shared_guest(vm, &phys, ipa); + ret = __check_host_shared_guest(vm, &phys, ipa, size); guest_unlock_component(vm); host_unlock_component(); @@ -1036,7 +1096,7 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_ if (prot & ~KVM_PGTABLE_PROT_RWX) return -EINVAL; - assert_host_shared_guest(vm, ipa); + assert_host_shared_guest(vm, ipa, PAGE_SIZE); guest_lock_component(vm); ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); @@ -1044,33 +1104,41 @@ int 
__pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_ return ret; } -int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) +int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm) { - u64 ipa = hyp_pfn_to_phys(gfn); + u64 size, ipa = hyp_pfn_to_phys(gfn); int ret; if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; - assert_host_shared_guest(vm, ipa); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + + assert_host_shared_guest(vm, ipa, size); guest_lock_component(vm); - ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, size); guest_unlock_component(vm); return ret; } -int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm) { - u64 ipa = hyp_pfn_to_phys(gfn); + u64 size, ipa = hyp_pfn_to_phys(gfn); int ret; if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; - assert_host_shared_guest(vm, ipa); + ret = __guest_check_transition_size(0, ipa, nr_pages, &size); + if (ret) + return ret; + + assert_host_shared_guest(vm, ipa, size); guest_lock_component(vm); - ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, size, mkold); guest_unlock_component(vm); return ret; @@ -1084,10 +1152,210 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; - assert_host_shared_guest(vm, ipa); + assert_host_shared_guest(vm, ipa, PAGE_SIZE); guest_lock_component(vm); kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); guest_unlock_component(vm); return 0; } + +#ifdef CONFIG_NVHE_EL2_DEBUG +struct pkvm_expected_state { + enum pkvm_page_state host; + enum pkvm_page_state hyp; + enum pkvm_page_state guest[2]; /* [ gfn, gfn + 1 ] */ +}; + +static struct pkvm_expected_state selftest_state; +static struct hyp_page *selftest_page; + +static struct pkvm_hyp_vm selftest_vm = { + .kvm = { + .arch = { + .mmu = { + .arch = &selftest_vm.kvm.arch, + .pgt = &selftest_vm.pgt, + }, + }, + }, +}; + +static struct pkvm_hyp_vcpu selftest_vcpu = { + .vcpu = { + .arch = { + .hw_mmu = &selftest_vm.kvm.arch.mmu, + }, + .kvm = &selftest_vm.kvm, + }, +}; + +static void init_selftest_vm(void *virt) +{ + struct hyp_page *p = hyp_virt_to_page(virt); + int i; + + selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + + for (i = 0; i < pkvm_selftest_pages(); i++) { + if (p[i].refcount) + continue; + p[i].refcount = 1; + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } +} + +static u64 selftest_ipa(void) +{ + return BIT(selftest_vm.pgt.ia_bits - 1); +} + +static void assert_page_state(void) +{ + void *virt = hyp_page_to_virt(selftest_page); + u64 size = PAGE_SIZE << selftest_page->order; + struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + u64 phys = hyp_virt_to_phys(virt); + u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE }; + struct pkvm_hyp_vm *vm; + + vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + host_lock_component(); + WARN_ON(__host_check_page_state_range(phys, size, selftest_state.host)); + host_unlock_component(); + + hyp_lock_component(); + WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp)); + hyp_unlock_component(); + + guest_lock_component(&selftest_vm); + WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, 
selftest_state.guest[0])); + WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1])); + guest_unlock_component(&selftest_vm); +} + +#define assert_transition_res(res, fn, ...) \ + do { \ + WARN_ON(fn(__VA_ARGS__) != res); \ + assert_page_state(); \ + } while (0) + +void pkvm_ownership_selftest(void *base) +{ + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX; + void *virt = hyp_alloc_pages(&host_s2_pool, 0); + struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + struct pkvm_hyp_vm *vm = &selftest_vm; + u64 phys, size, pfn, gfn; + + WARN_ON(!virt); + selftest_page = hyp_virt_to_page(virt); + selftest_page->refcount = 0; + init_selftest_vm(base); + + size = PAGE_SIZE << selftest_page->order; + phys = hyp_virt_to_phys(virt); + pfn = hyp_phys_to_pfn(phys); + gfn = hyp_phys_to_pfn(selftest_ipa()); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + selftest_state.guest[0] = selftest_state.guest[1] = PKVM_NOPAGE; + assert_page_state(); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); + hyp_unpin_shared_mem(virt, virt + size); + WARN_ON(hyp_page_count(virt) != 1); + assert_transition_res(-EBUSY, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + + hyp_unpin_shared_mem(virt, virt + size); + assert_page_state(); + WARN_ON(hyp_page_count(virt)); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_hyp, pfn); + + selftest_state.host = 
PKVM_PAGE_SHARED_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.hyp = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1); + + selftest_state.host = PKVM_PAGE_SHARED_OWNED; + selftest_state.guest[0] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); + + selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED; + assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + WARN_ON(hyp_virt_to_page(virt)->host_share_guest_count != 2); + + selftest_state.guest[0] = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn, 1, vm); + + selftest_state.guest[1] = PKVM_NOPAGE; + selftest_state.host = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.hyp = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1); + + selftest_page->refcount = 1; + hyp_put_page(&host_s2_pool, virt); +} +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index f41c7440b34b..ae8391baebc3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -229,9 +229,8 @@ int hyp_map_vectors(void) return 0; } -void *hyp_fixmap_map(phys_addr_t phys) +static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys) { - struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots); kvm_pte_t pte, *ptep = slot->ptep; pte = *ptep; @@ -243,10 +242,21 @@ void *hyp_fixmap_map(phys_addr_t phys) return (void *)slot->addr; } +void *hyp_fixmap_map(phys_addr_t phys) +{ + return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys); +} + static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) { kvm_pte_t *ptep = slot->ptep; u64 addr = slot->addr; + u32 level; + + if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE) + level = KVM_PGTABLE_LAST_LEVEL; + else + level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */ WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); @@ -260,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL); + __tlbi_level(vale2is, 
__TLBI_VADDR(addr, 0), level); dsb(ish); isb(); } @@ -273,9 +283,9 @@ void hyp_fixmap_unmap(void) static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg); + struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg; - if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL) + if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level)) return -EINVAL; slot->addr = ctx->addr; @@ -296,13 +306,84 @@ static int create_fixmap_slot(u64 addr, u64 cpu) struct kvm_pgtable_walker walker = { .cb = __create_fixmap_slot_cb, .flags = KVM_PGTABLE_WALK_LEAF, - .arg = (void *)cpu, + .arg = per_cpu_ptr(&fixmap_slots, cpu), }; return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); } -int hyp_create_pcpu_fixmap(void) +#if PAGE_SHIFT < 16 +#define HAS_FIXBLOCK +static struct hyp_fixmap_slot hyp_fixblock_slot; +static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock); +#endif + +static int create_fixblock(void) +{ +#ifdef HAS_FIXBLOCK + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &hyp_fixblock_slot, + }; + unsigned long addr; + phys_addr_t phys; + int ret, i; + + /* Find a RAM phys address, PMD aligned */ + for (i = 0; i < hyp_memblock_nr; i++) { + phys = ALIGN(hyp_memory[i].base, PMD_SIZE); + if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size)) + break; + } + + if (i >= hyp_memblock_nr) + return -EINVAL; + + hyp_spin_lock(&pkvm_pgd_lock); + addr = ALIGN(__io_map_base, PMD_SIZE); + ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE); + if (ret) + goto unlock; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP); + if (ret) + goto unlock; + + ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +#else + return 0; +#endif +} + +void *hyp_fixblock_map(phys_addr_t phys, size_t *size) +{ +#ifdef HAS_FIXBLOCK + *size = PMD_SIZE; + hyp_spin_lock(&hyp_fixblock_lock); + return fixmap_map_slot(&hyp_fixblock_slot, phys); +#else + *size = PAGE_SIZE; + return hyp_fixmap_map(phys); +#endif +} + +void hyp_fixblock_unmap(void) +{ +#ifdef HAS_FIXBLOCK + fixmap_clear_slot(&hyp_fixblock_slot); + hyp_spin_unlock(&hyp_fixblock_lock); +#else + hyp_fixmap_unmap(); +#endif +} + +int hyp_create_fixmap(void) { unsigned long addr, i; int ret; @@ -322,7 +403,7 @@ int hyp_create_pcpu_fixmap(void) return ret; } - return 0; + return create_fixblock(); } int hyp_create_idmap(u32 hyp_va_bits) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 5a335a51deca..338505cb0171 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -372,6 +372,18 @@ static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); } +static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + void *sve_state; + + if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) + return; + + sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + hyp_unpin_shared_mem(sve_state, + sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); +} + static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], unsigned int nr_vcpus) { @@ -384,6 +396,7 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], continue; unpin_host_vcpu(hyp_vcpu->host_vcpu); + unpin_host_sve_state(hyp_vcpu); } } @@ -398,12 +411,40 @@ 
static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, pkvm_init_features_from_host(hyp_vm, host_kvm); } -static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) { struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + unsigned int sve_max_vl; + size_t sve_state_size; + void *sve_state; + int ret = 0; - if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + return 0; + } + + /* Limit guest vector length to the maximum supported by the host. */ + sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl); + sve_state_size = sve_state_size_from_vl(sve_max_vl); + sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state)); + + if (!sve_state || !sve_state_size) { + ret = -EINVAL; + goto err; + } + + ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size); + if (ret) + goto err; + + vcpu->arch.sve_state = sve_state; + vcpu->arch.sve_max_vl = sve_max_vl; + + return 0; +err: + clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features); + return ret; } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, @@ -432,7 +473,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, if (ret) goto done; - pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); + ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index d62bcb5634a2..a48d3f5a5afb 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -28,6 +28,7 @@ static void *vmemmap_base; static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; +static void *selftest_base; static void *ffa_proxy_pages; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static struct hyp_pool hpool; @@ -38,6 +39,11 @@ static int divide_memory_pool(void *virt, unsigned long size) hyp_early_alloc_init(virt, size); + nr_pages = pkvm_selftest_pages(); + selftest_base = hyp_early_alloc_contig(nr_pages); + if (nr_pages && !selftest_base) + return -ENOMEM; + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); vmemmap_base = hyp_early_alloc_contig(nr_pages); if (!vmemmap_base) @@ -119,6 +125,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP); + if (ret) + return ret; + ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -180,6 +190,7 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { enum pkvm_page_state state; + struct hyp_page *page; phys_addr_t phys; if (!kvm_pte_valid(ctx->old)) @@ -192,19 +203,25 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!addr_is_memory(phys)) return -EINVAL; + page = hyp_phys_to_page(phys); + /* * Adjust the host stage-2 mappings to match the ownership attributes - * configured in the hypervisor stage-1. + * configured in the hypervisor stage-1, and make sure to propagate them + * to the hyp_vmemmap state. 
*/ state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old)); switch (state) { case PKVM_PAGE_OWNED: + set_hyp_state(page, PKVM_PAGE_OWNED); return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: - hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED; + set_hyp_state(page, PKVM_PAGE_SHARED_OWNED); + set_host_state(page, PKVM_PAGE_SHARED_BORROWED); break; case PKVM_PAGE_SHARED_BORROWED: - hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED; + set_hyp_state(page, PKVM_PAGE_SHARED_BORROWED); + set_host_state(page, PKVM_PAGE_SHARED_OWNED); break; default: return -EINVAL; @@ -295,7 +312,7 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; - ret = hyp_create_pcpu_fixmap(); + ret = hyp_create_fixmap(); if (ret) goto out; @@ -304,6 +321,8 @@ void __noreturn __pkvm_init_finalise(void) goto out; pkvm_hyp_vm_table_init(vm_table_base); + + pkvm_ownership_selftest(selftest_base); out: /* * We tail-called to here from handle___pkvm_init() and will not return, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 7d2ba6ef0261..73affe1333a4 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -33,6 +33,18 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +struct fgt_masks hfgrtr_masks; +struct fgt_masks hfgwtr_masks; +struct fgt_masks hfgitr_masks; +struct fgt_masks hdfgrtr_masks; +struct fgt_masks hdfgwtr_masks; +struct fgt_masks hafgrtr_masks; +struct fgt_masks hfgrtr2_masks; +struct fgt_masks hfgwtr2_masks; +struct fgt_masks hfgitr2_masks; +struct fgt_masks hdfgrtr2_masks; +struct fgt_masks hdfgwtr2_masks; + extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); static void __activate_cptr_traps(struct kvm_vcpu *vcpu) @@ -142,7 +154,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) __deactivate_traps_common(vcpu); - write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); + write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2); __deactivate_cptr_traps(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index df5cc74a7dd0..c351b4abd5db 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -11,12 +11,6 @@ #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> - -#define KVM_PTE_TYPE BIT(1) -#define KVM_PTE_TYPE_BLOCK 0 -#define KVM_PTE_TYPE_PAGE 1 -#define KVM_PTE_TYPE_TABLE 1 - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 50aa8dbcae75..f162b0df5cae 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -446,7 +446,7 @@ u64 __vgic_v3_get_gic_config(void) if (has_vhe()) { flags = local_daif_save(); } else { - sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); + sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO); isb(); } @@ -461,7 +461,7 @@ u64 __vgic_v3_get_gic_config(void) if (has_vhe()) { local_daif_restore(flags); } else { - sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); + sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0); isb(); } @@ -1058,11 +1058,11 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, switch (sysreg) { case SYS_ICC_IGRPEN0_EL1: if (is_read && - (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + 
(__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && - (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; @@ -1079,11 +1079,11 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, case SYS_ICC_IGRPEN1_EL1: if (is_read && - (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1)) return true; if (!is_read && - (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1)) return true; fallthrough; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 731a0378ed13..c9b330dc2066 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -48,21 +48,46 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); static u64 __compute_hcr(struct kvm_vcpu *vcpu) { + u64 guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2); u64 hcr = vcpu->arch.hcr_el2; if (!vcpu_has_nv(vcpu)) return hcr; + /* + * We rely on the invariant that a vcpu entered from HYP + * context must also exit in the same context, as only an ERET + * instruction can kick us out of it, and we obviously trap + * that sucker. PSTATE.M will get fixed-up on exit. + */ if (is_hyp_ctxt(vcpu)) { + host_data_set_flag(VCPU_IN_HYP_CONTEXT); + hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB; if (!vcpu_el2_e2h_is_set(vcpu)) hcr |= HCR_NV1; write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2); + } else { + host_data_clear_flag(VCPU_IN_HYP_CONTEXT); + + if (guest_hcr & HCR_NV) { + u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id())); + + /* Inherit the low bits from the actual register */ + va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0); + write_sysreg_s(va, SYS_VNCR_EL2); + + /* Force NV2 in case the guest is forgetful... */ + guest_hcr |= HCR_NV2; + } } - return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE); + BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) && + host_data_test_flag(L1_VNCR_MAPPED)); + + return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } static void __activate_cptr_traps(struct kvm_vcpu *vcpu) @@ -184,7 +209,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) ___deactivate_traps(vcpu); - write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); if (has_cntpoff()) { struct timer_map map; @@ -459,6 +484,14 @@ static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) if (ret) return false; + /* + * If we have to check for any VNCR mapping being invalidated, + * go back to the slow path for further processing. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) && + atomic_read(&vcpu->kvm->arch.vncr_map_count)) + return false; + __kvm_skip_instr(vcpu); return true; @@ -568,9 +601,12 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) /* * If we were in HYP context on entry, adjust the PSTATE view - * so that the usual helpers work correctly. + * so that the usual helpers work correctly. This enforces our + * invariant that the guest's HYP context status is preserved + * across a run. 
*/ - if (vcpu_has_nv(vcpu) && (read_sysreg(hcr_el2) & HCR_NV)) { + if (vcpu_has_nv(vcpu) && + unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) { u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); switch (mode) { @@ -586,6 +622,10 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) *vcpu_cpsr(vcpu) |= mode; } + /* Apply extreme paranoia! */ + BUG_ON(vcpu_has_nv(vcpu) && + !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu)); + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 3d50a1bd2bdb..ec2569818629 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -63,7 +63,7 @@ static void enter_vmid_context(struct kvm_s2_mmu *mmu, __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; - write_sysreg(val, hcr_el2); + write_sysreg_hcr(val); isb(); } @@ -73,7 +73,7 @@ static void exit_vmid_context(struct tlb_inv_context *cxt) * We're done with the TLB operation, let's restore the host's * view of HCR_EL2. */ - write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); isb(); /* ... and the stage-2 MMU context that we switched away from */ diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index eeda92330ade..2942ec92c5a4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1304,6 +1304,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, if (map_size == PAGE_SIZE) return true; + /* pKVM only supports PMD_SIZE huge-mappings */ + if (is_protected_kvm_enabled() && map_size != PMD_SIZE) + return false; + size = memslot->npages * PAGE_SIZE; gpa_start = memslot->base_gfn << PAGE_SHIFT; @@ -1540,7 +1544,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * logging_active is guaranteed to never be true for VM_PFNMAP * memslots. */ - if (logging_active || is_protected_kvm_enabled()) { + if (logging_active) { force_pte = true; vma_shift = PAGE_SHIFT; } else { diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 4a3fc11f7ecf..291dbe38eb5c 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -8,6 +8,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <asm/fixmap.h> #include <asm/kvm_arm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> @@ -16,6 +17,24 @@ #include "sys_regs.h" +struct vncr_tlb { + /* The guest's VNCR_EL2 */ + u64 gva; + struct s1_walk_info wi; + struct s1_walk_result wr; + + u64 hpa; + + /* -1 when not mapped on a CPU */ + int cpu; + + /* + * true if the TLB is valid. Can only be changed with the + * mmu_lock held. + */ + bool valid; +}; + /* * Ratio of live shadow S2 MMU per vcpu. 
This is a trade-off between * memory usage and potential number of different sets of S2 PTs in @@ -28,6 +47,7 @@ void kvm_init_nested(struct kvm *kvm) { kvm->arch.nested_mmus = NULL; kvm->arch.nested_mmus_size = 0; + atomic_set(&kvm->arch.vncr_map_count, 0); } static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) @@ -55,6 +75,13 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) !cpus_have_final_cap(ARM64_HAS_HCR_NV1)) return -EINVAL; + if (!vcpu->arch.ctxt.vncr_array) + vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT | + __GFP_ZERO); + + if (!vcpu->arch.ctxt.vncr_array) + return -ENOMEM; + /* * Let's treat memory allocation failures as benign: If we fail to * allocate anything, return an error and keep the allocated array @@ -85,6 +112,9 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); + free_page((unsigned long)vcpu->arch.ctxt.vncr_array); + vcpu->arch.ctxt.vncr_array = NULL; + return ret; } @@ -405,6 +435,30 @@ static unsigned int ttl_to_size(u8 ttl) return max_size; } +static u8 pgshift_level_to_ttl(u16 shift, u8 level) +{ + u8 ttl; + + switch(shift) { + case 12: + ttl = TLBI_TTL_TG_4K; + break; + case 14: + ttl = TLBI_TTL_TG_16K; + break; + case 16: + ttl = TLBI_TTL_TG_64K; + break; + default: + BUG(); + } + + ttl <<= 2; + ttl |= level & 3; + + return ttl; +} + /* * Compute the equivalent of the TTL field by parsing the shadow PT. The * granule size is extracted from the cached VTCR_EL2.TG0 while the level is @@ -676,23 +730,36 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) { /* - * The vCPU kept its reference on the MMU after the last put, keep - * rolling with it. + * If the vCPU kept its reference on the MMU after the last put, + * keep rolling with it. 
*/ - if (vcpu->arch.hw_mmu) - return; - if (is_hyp_ctxt(vcpu)) { - vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + if (!vcpu->arch.hw_mmu) + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; } else { - write_lock(&vcpu->kvm->mmu_lock); - vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); - write_unlock(&vcpu->kvm->mmu_lock); + if (!vcpu->arch.hw_mmu) { + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); + } + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) + kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu); } } void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) { + /* Unconditionally drop the VNCR mapping if we have one */ + if (host_data_test_flag(L1_VNCR_MAPPED)) { + BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id()); + BUG_ON(is_hyp_ctxt(vcpu)); + + clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu)); + vcpu->arch.vncr_tlb->cpu = -1; + host_data_clear_flag(L1_VNCR_MAPPED); + atomic_dec(&vcpu->kvm->arch.vncr_map_count); + } + /* * Keep a reference on the associated stage-2 MMU if the vCPU is * scheduling out and not in WFI emulation, suggesting it is likely to @@ -743,6 +810,245 @@ int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) return kvm_inject_nested_sync(vcpu, esr_el2); } +static void invalidate_vncr(struct vncr_tlb *vt) +{ + vt->valid = false; + if (vt->cpu != -1) + clear_fixmap(vncr_fixmap(vt->cpu)); +} + +static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 ipa_start, ipa_end, ipa_size; + + /* + * Careful here: We end-up here from an MMU notifier, + * and this can race against a vcpu not being onlined + * yet, without the pseudo-TLB being allocated. + * + * Skip those, as they obviously don't participate in + * the invalidation at this stage. 
+ */ + if (!vt) + continue; + + if (!vt->valid) + continue; + + ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, + vt->wr.level)); + ipa_start = vt->wr.pa & (ipa_size - 1); + ipa_end = ipa_start + ipa_size; + + if (ipa_end <= start || ipa_start >= end) + continue; + + invalidate_vncr(vt); + } +} + +struct s1e2_tlbi_scope { + enum { + TLBI_ALL, + TLBI_VA, + TLBI_VAA, + TLBI_ASID, + } type; + + u16 asid; + u64 va; + u64 size; +}; + +static void invalidate_vncr_va(struct kvm *kvm, + struct s1e2_tlbi_scope *scope) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held_write(&kvm->mmu_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 va_start, va_end, va_size; + + if (!vt->valid) + continue; + + va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, + vt->wr.level)); + va_start = vt->gva & (va_size - 1); + va_end = va_start + va_size; + + switch (scope->type) { + case TLBI_ALL: + break; + + case TLBI_VA: + if (va_end <= scope->va || + va_start >= (scope->va + scope->size)) + continue; + if (vt->wr.nG && vt->wr.asid != scope->asid) + continue; + break; + + case TLBI_VAA: + if (va_end <= scope->va || + va_start >= (scope->va + scope->size)) + continue; + break; + + case TLBI_ASID: + if (!vt->wr.nG || vt->wr.asid != scope->asid) + continue; + break; + } + + invalidate_vncr(vt); + } +} + +static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val, + struct s1e2_tlbi_scope *scope) +{ + switch (inst) { + case OP_TLBI_ALLE2: + case OP_TLBI_ALLE2IS: + case OP_TLBI_ALLE2OS: + case OP_TLBI_VMALLE1: + case OP_TLBI_VMALLE1IS: + case OP_TLBI_VMALLE1OS: + case OP_TLBI_ALLE2NXS: + case OP_TLBI_ALLE2ISNXS: + case OP_TLBI_ALLE2OSNXS: + case OP_TLBI_VMALLE1NXS: + case OP_TLBI_VMALLE1ISNXS: + case OP_TLBI_VMALLE1OSNXS: + scope->type = TLBI_ALL; + break; + case OP_TLBI_VAE2: + case OP_TLBI_VAE2IS: + case OP_TLBI_VAE2OS: + case OP_TLBI_VAE1: + case OP_TLBI_VAE1IS: + case OP_TLBI_VAE1OS: + case OP_TLBI_VAE2NXS: + case OP_TLBI_VAE2ISNXS: + case OP_TLBI_VAE2OSNXS: + case OP_TLBI_VAE1NXS: + case OP_TLBI_VAE1ISNXS: + case OP_TLBI_VAE1OSNXS: + case OP_TLBI_VALE2: + case OP_TLBI_VALE2IS: + case OP_TLBI_VALE2OS: + case OP_TLBI_VALE1: + case OP_TLBI_VALE1IS: + case OP_TLBI_VALE1OS: + case OP_TLBI_VALE2NXS: + case OP_TLBI_VALE2ISNXS: + case OP_TLBI_VALE2OSNXS: + case OP_TLBI_VALE1NXS: + case OP_TLBI_VALE1ISNXS: + case OP_TLBI_VALE1OSNXS: + scope->type = TLBI_VA; + scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val)); + if (!scope->size) + scope->size = SZ_1G; + scope->va = (val << 12) & ~(scope->size - 1); + scope->asid = FIELD_GET(TLBIR_ASID_MASK, val); + break; + case OP_TLBI_ASIDE1: + case OP_TLBI_ASIDE1IS: + case OP_TLBI_ASIDE1OS: + case OP_TLBI_ASIDE1NXS: + case OP_TLBI_ASIDE1ISNXS: + case OP_TLBI_ASIDE1OSNXS: + scope->type = TLBI_ASID; + scope->asid = FIELD_GET(TLBIR_ASID_MASK, val); + break; + case OP_TLBI_VAAE1: + case OP_TLBI_VAAE1IS: + case OP_TLBI_VAAE1OS: + case OP_TLBI_VAAE1NXS: + case OP_TLBI_VAAE1ISNXS: + case OP_TLBI_VAAE1OSNXS: + case OP_TLBI_VAALE1: + case OP_TLBI_VAALE1IS: + case OP_TLBI_VAALE1OS: + case OP_TLBI_VAALE1NXS: + case OP_TLBI_VAALE1ISNXS: + case OP_TLBI_VAALE1OSNXS: + scope->type = TLBI_VAA; + scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val)); + if (!scope->size) + scope->size = SZ_1G; + scope->va = (val << 12) & ~(scope->size - 1); + break; + case OP_TLBI_RVAE2: + case OP_TLBI_RVAE2IS: + case OP_TLBI_RVAE2OS: + case OP_TLBI_RVAE1: + case OP_TLBI_RVAE1IS: + case OP_TLBI_RVAE1OS: + case 
OP_TLBI_RVAE2NXS: + case OP_TLBI_RVAE2ISNXS: + case OP_TLBI_RVAE2OSNXS: + case OP_TLBI_RVAE1NXS: + case OP_TLBI_RVAE1ISNXS: + case OP_TLBI_RVAE1OSNXS: + case OP_TLBI_RVALE2: + case OP_TLBI_RVALE2IS: + case OP_TLBI_RVALE2OS: + case OP_TLBI_RVALE1: + case OP_TLBI_RVALE1IS: + case OP_TLBI_RVALE1OS: + case OP_TLBI_RVALE2NXS: + case OP_TLBI_RVALE2ISNXS: + case OP_TLBI_RVALE2OSNXS: + case OP_TLBI_RVALE1NXS: + case OP_TLBI_RVALE1ISNXS: + case OP_TLBI_RVALE1OSNXS: + scope->type = TLBI_VA; + scope->va = decode_range_tlbi(val, &scope->size, &scope->asid); + break; + case OP_TLBI_RVAAE1: + case OP_TLBI_RVAAE1IS: + case OP_TLBI_RVAAE1OS: + case OP_TLBI_RVAAE1NXS: + case OP_TLBI_RVAAE1ISNXS: + case OP_TLBI_RVAAE1OSNXS: + case OP_TLBI_RVAALE1: + case OP_TLBI_RVAALE1IS: + case OP_TLBI_RVAALE1OS: + case OP_TLBI_RVAALE1NXS: + case OP_TLBI_RVAALE1ISNXS: + case OP_TLBI_RVAALE1OSNXS: + scope->type = TLBI_VAA; + scope->va = decode_range_tlbi(val, &scope->size, NULL); + break; + } +} + +void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val) +{ + struct s1e2_tlbi_scope scope = {}; + + compute_s1_tlbi_range(vcpu, inst, val, &scope); + + guard(write_lock)(&vcpu->kvm->mmu_lock); + invalidate_vncr_va(vcpu->kvm, &scope); +} + void kvm_nested_s2_wp(struct kvm *kvm) { int i; @@ -755,6 +1061,8 @@ void kvm_nested_s2_wp(struct kvm *kvm) if (kvm_s2_mmu_valid(mmu)) kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu)); } + + kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits)); } void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) @@ -769,6 +1077,8 @@ void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) if (kvm_s2_mmu_valid(mmu)) kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); } + + kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits)); } void kvm_nested_s2_flush(struct kvm *kvm) @@ -802,6 +1112,295 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) } /* + * Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter: + * + * - We introduce an internal representation of a vcpu-private TLB, + * representing the mapping between the guest VA contained in VNCR_EL2, + * the IPA the guest's EL2 PTs point to, and the actual PA this lives at. + * + * - On translation fault from a nested VNCR access, we create such a TLB. + * If there is no mapping to describe, the guest inherits the fault. + * Crucially, no actual mapping is done at this stage. + * + * - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above + * TLB exists, we map it in the fixmap for this CPU, and run with it. We + * have to respect the permissions dictated by the guest, but not the + * memory type (FWB is a must). + * + * - Note that we usually don't do a vcpu_load() on the back of a fault + * (unless we are preempted), so the resolution of a translation fault + * must go via a request that will map the VNCR page in the fixmap. + * vcpu_load() might as well use the same mechanism. + * + * - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was + * mapped, we unmap it. Yes it is that simple. The TLB still exists + * though, and may be reused at a later load. + * + * - On permission fault, we simply forward the fault to the guest's EL2. + * Get out of my way. + * + * - On any TLBI for the EL2&0 translation regime, we must find any TLB that + * intersects with the TLBI request, invalidate it, and unmap the page + * from the fixmap. Because we need to look at all the vcpu-private TLBs, + * this requires some wide-ranging locking to ensure that nothing races + * against it. 
This may require some refcounting to avoid the search when + * no such TLB is present. + * + * - On MMU notifiers, we must invalidate our TLB in a similar way, but + * looking at the IPA instead. The funny part is that there may not be a + * stage-2 mapping for this page if L1 hasn't accessed it using LD/ST + * instructions. + */ + +int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu) +{ + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return 0; + + vcpu->arch.vncr_tlb = kzalloc(sizeof(*vcpu->arch.vncr_tlb), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.vncr_tlb) + return -ENOMEM; + + return 0; +} + +static u64 read_vncr_el2(struct kvm_vcpu *vcpu) +{ + return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48); +} + +static int kvm_translate_vncr(struct kvm_vcpu *vcpu) +{ + bool write_fault, writable; + unsigned long mmu_seq; + struct vncr_tlb *vt; + struct page *page; + u64 va, pfn, gfn; + int ret; + + vt = vcpu->arch.vncr_tlb; + + /* + * If we're about to walk the EL2 S1 PTs, we must invalidate the + * current TLB, as it could be sampled from another vcpu doing a + * TLBI *IS. A real CPU wouldn't do that, but we only keep a single + * translation, so not much of a choice. + * + * We also prepare the next walk wilst we're at it. + */ + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + invalidate_vncr(vt); + + vt->wi = (struct s1_walk_info) { + .regime = TR_EL20, + .as_el0 = false, + .pan = false, + }; + vt->wr = (struct s1_walk_result){}; + } + + guard(srcu)(&vcpu->kvm->srcu); + + va = read_vncr_el2(vcpu); + + ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va); + if (ret) + return ret; + + write_fault = kvm_is_write_fault(vcpu); + + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + gfn = vt->wr.pa >> PAGE_SHIFT; + pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page); + if (is_error_noslot_pfn(pfn) || (write_fault && !writable)) + return -EFAULT; + + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) + return -EAGAIN; + + vt->gva = va; + vt->hpa = pfn << PAGE_SHIFT; + vt->valid = true; + vt->cpu = -1; + + kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu); + kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw); + } + + if (vt->wr.pw) + mark_page_dirty(vcpu->kvm, gfn); + + return 0; +} + +static void inject_vncr_perm(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 esr = kvm_vcpu_get_esr(vcpu); + + /* Adjust the fault level to reflect that of the guest's */ + esr &= ~ESR_ELx_FSC; + esr |= FIELD_PREP(ESR_ELx_FSC, + ESR_ELx_FSC_PERM_L(vt->wr.level)); + + kvm_inject_nested_sync(vcpu, esr); +} + +static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + + lockdep_assert_held_read(&vcpu->kvm->mmu_lock); + + if (!vt->valid) + return false; + + if (read_vncr_el2(vcpu) != vt->gva) + return false; + + if (vt->wr.nG) { + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + u64 ttbr = ((tcr & TCR_A1) ? 
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + u16 asid; + + asid = FIELD_GET(TTBR_ASID_MASK, ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + return asid != vt->wr.asid; + } + + return true; +} + +int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + u64 esr = kvm_vcpu_get_esr(vcpu); + + BUG_ON(!(esr & ESR_ELx_VNCR_SHIFT)); + + if (esr_fsc_is_permission_fault(esr)) { + inject_vncr_perm(vcpu); + } else if (esr_fsc_is_translation_fault(esr)) { + bool valid; + int ret; + + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + valid = kvm_vncr_tlb_lookup(vcpu); + + if (!valid) + ret = kvm_translate_vncr(vcpu); + else + ret = -EPERM; + + switch (ret) { + case -EAGAIN: + case -ENOMEM: + /* Let's try again... */ + break; + case -EFAULT: + case -EINVAL: + case -ENOENT: + case -EACCES: + /* + * Translation failed, inject the corresponding + * exception back to EL2. + */ + BUG_ON(!vt->wr.failed); + + esr &= ~ESR_ELx_FSC; + esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst); + + kvm_inject_nested_sync(vcpu, esr); + break; + case -EPERM: + /* Hack to deal with POE until we get kernel support */ + inject_vncr_perm(vcpu); + break; + case 0: + break; + } + } else { + WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr); + } + + return 1; +} + +static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) +{ + struct vncr_tlb *vt = vcpu->arch.vncr_tlb; + pgprot_t prot; + + guard(preempt)(); + guard(read_lock)(&vcpu->kvm->mmu_lock); + + /* + * The request to map VNCR may have raced against some other + * event, such as an interrupt, and may not be valid anymore. + */ + if (is_hyp_ctxt(vcpu)) + return; + + /* + * Check that the pseudo-TLB is valid and that VNCR_EL2 still + * contains the expected value. If it doesn't, we simply bail out + * without a mapping -- a transformed MSR/MRS will generate the + * fault and allows us to populate the pseudo-TLB. + */ + if (!vt->valid) + return; + + if (read_vncr_el2(vcpu) != vt->gva) + return; + + if (vt->wr.nG) { + u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + u64 ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + u16 asid; + + asid = FIELD_GET(TTBR_ASID_MASK, ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + if (asid != vt->wr.asid) + return; + } + + vt->cpu = smp_processor_id(); + + if (vt->wr.pw && vt->wr.pr) + prot = PAGE_KERNEL; + else if (vt->wr.pr) + prot = PAGE_KERNEL_RO; + else + prot = PAGE_NONE; + + /* + * We can't map write-only (or no permission at all) in the kernel, + * but the guest can do it if using POE, so we'll have to turn a + * translation fault into a permission fault at runtime. + * FIXME: WO doesn't work at all, need POE support in the kernel. + */ + if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) { + __set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot); + host_data_set_flag(L1_VNCR_MAPPED); + atomic_inc(&vcpu->kvm->arch.vncr_map_count); + } +} + +/* * Our emulated CPU doesn't support all the possible features. For the * sake of simplicity (and probably mental sanity), wipe out a number * of feature bits we don't intend to support for the time being. 
@@ -1018,216 +1617,49 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1); /* HCR_EL2 */ - res0 = BIT(48); - res1 = HCR_RW; - if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, TWED, IMP)) - res0 |= GENMASK(63, 59); - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, MTE2)) - res0 |= (HCR_TID5 | HCR_DCT | HCR_ATA); - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, EVT, TTLBxS)) - res0 |= (HCR_TTLBIS | HCR_TTLBOS); - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) && - !kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2)) - res0 |= HCR_ENSCXT; - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, EVT, IMP)) - res0 |= (HCR_TOCU | HCR_TICAB | HCR_TID4); - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) - res0 |= HCR_AMVOFFEN; - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1)) - res0 |= HCR_FIEN; - if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP)) - res0 |= HCR_FWB; - /* Implementation choice: NV2 is the only supported config */ - if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) - res0 |= (HCR_NV2 | HCR_NV | HCR_AT); - if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, NI)) - res0 |= HCR_NV1; - if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && - kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) - res0 |= (HCR_API | HCR_APK); - if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TME, IMP)) - res0 |= BIT(39); - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) - res0 |= (HCR_TEA | HCR_TERR); - if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) - res0 |= HCR_TLOR; - if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) - res0 |= HCR_E2H; - if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP)) - res1 |= HCR_E2H; + get_reg_fixed_bits(kvm, HCR_EL2, &res0, &res1); set_sysreg_masks(kvm, HCR_EL2, res0, res1); /* HCRX_EL2 */ - res0 = HCRX_EL2_RES0; - res1 = HCRX_EL2_RES1; - if (!kvm_has_feat(kvm, ID_AA64ISAR3_EL1, PACM, TRIVIAL_IMP)) - res0 |= HCRX_EL2_PACMEn; - if (!kvm_has_feat(kvm, ID_AA64PFR2_EL1, FPMR, IMP)) - res0 |= HCRX_EL2_EnFPM; - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) - res0 |= HCRX_EL2_GCSEn; - if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, SYSREG_128, IMP)) - res0 |= HCRX_EL2_EnIDCP128; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, DEV_ASYNC)) - res0 |= (HCRX_EL2_EnSDERR | HCRX_EL2_EnSNERR); - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, DF2, IMP)) - res0 |= HCRX_EL2_TMEA; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP)) - res0 |= HCRX_EL2_D128En; - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) - res0 |= HCRX_EL2_PTTWI; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SCTLRX, IMP)) - res0 |= HCRX_EL2_SCTLR2En; - if (!kvm_has_tcr2(kvm)) - res0 |= HCRX_EL2_TCR2En; - if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) - res0 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); - if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, CMOW, IMP)) - res0 |= HCRX_EL2_CMOW; - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, NMI, IMP)) - res0 |= (HCRX_EL2_VFNMI | HCRX_EL2_VINMI | HCRX_EL2_TALLINT); - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP) || - !(read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS)) - res0 |= HCRX_EL2_SMPME; - if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)) - res0 |= (HCRX_EL2_FGTnXS | HCRX_EL2_FnXS); - if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V)) - res0 |= HCRX_EL2_EnASR; - if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64)) - res0 |= HCRX_EL2_EnALS; - if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA)) - res0 |= HCRX_EL2_EnAS0; + get_reg_fixed_bits(kvm, HCRX_EL2, &res0, &res1); set_sysreg_masks(kvm, HCRX_EL2, res0, res1); /* 
HFG[RW]TR_EL2 */ - res0 = res1 = 0; - if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && - kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) - res0 |= (HFGxTR_EL2_APDAKey | HFGxTR_EL2_APDBKey | - HFGxTR_EL2_APGAKey | HFGxTR_EL2_APIAKey | - HFGxTR_EL2_APIBKey); - if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) - res0 |= (HFGxTR_EL2_LORC_EL1 | HFGxTR_EL2_LOREA_EL1 | - HFGxTR_EL2_LORID_EL1 | HFGxTR_EL2_LORN_EL1 | - HFGxTR_EL2_LORSA_EL1); - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) && - !kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2)) - res0 |= (HFGxTR_EL2_SCXTNUM_EL1 | HFGxTR_EL2_SCXTNUM_EL0); - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP)) - res0 |= HFGxTR_EL2_ICC_IGRPENn_EL1; - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) - res0 |= (HFGxTR_EL2_ERRIDR_EL1 | HFGxTR_EL2_ERRSELR_EL1 | - HFGxTR_EL2_ERXFR_EL1 | HFGxTR_EL2_ERXCTLR_EL1 | - HFGxTR_EL2_ERXSTATUS_EL1 | HFGxTR_EL2_ERXMISCn_EL1 | - HFGxTR_EL2_ERXPFGF_EL1 | HFGxTR_EL2_ERXPFGCTL_EL1 | - HFGxTR_EL2_ERXPFGCDN_EL1 | HFGxTR_EL2_ERXADDR_EL1); - if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA)) - res0 |= HFGxTR_EL2_nACCDATA_EL1; - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) - res0 |= (HFGxTR_EL2_nGCS_EL0 | HFGxTR_EL2_nGCS_EL1); - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) - res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) - res0 |= HFGxTR_EL2_nRCWMASK_EL1; - if (!kvm_has_s1pie(kvm)) - res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); - if (!kvm_has_s1poe(kvm)) - res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - res0 |= HFGxTR_EL2_nS2POR_EL1; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP)) - res0 |= (HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nAMAIR2_EL1); - set_sysreg_masks(kvm, HFGRTR_EL2, res0 | __HFGRTR_EL2_RES0, res1); - set_sysreg_masks(kvm, HFGWTR_EL2, res0 | __HFGWTR_EL2_RES0, res1); + get_reg_fixed_bits(kvm, HFGRTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGRTR_EL2, res0, res1); + get_reg_fixed_bits(kvm, HFGWTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGWTR_EL2, res0, res1); /* HDFG[RW]TR_EL2 */ - res0 = res1 = 0; - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) - res0 |= HDFGRTR_EL2_OSDLR_EL1; - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) - res0 |= (HDFGRTR_EL2_PMEVCNTRn_EL0 | HDFGRTR_EL2_PMEVTYPERn_EL0 | - HDFGRTR_EL2_PMCCFILTR_EL0 | HDFGRTR_EL2_PMCCNTR_EL0 | - HDFGRTR_EL2_PMCNTEN | HDFGRTR_EL2_PMINTEN | - HDFGRTR_EL2_PMOVS | HDFGRTR_EL2_PMSELR_EL0 | - HDFGRTR_EL2_PMMIR_EL1 | HDFGRTR_EL2_PMUSERENR_EL0 | - HDFGRTR_EL2_PMCEIDn_EL0); - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) - res0 |= (HDFGRTR_EL2_PMBLIMITR_EL1 | HDFGRTR_EL2_PMBPTR_EL1 | - HDFGRTR_EL2_PMBSR_EL1 | HDFGRTR_EL2_PMSCR_EL1 | - HDFGRTR_EL2_PMSEVFR_EL1 | HDFGRTR_EL2_PMSFCR_EL1 | - HDFGRTR_EL2_PMSICR_EL1 | HDFGRTR_EL2_PMSIDR_EL1 | - HDFGRTR_EL2_PMSIRR_EL1 | HDFGRTR_EL2_PMSLATFR_EL1 | - HDFGRTR_EL2_PMBIDR_EL1); - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP)) - res0 |= (HDFGRTR_EL2_TRC | HDFGRTR_EL2_TRCAUTHSTATUS | - HDFGRTR_EL2_TRCAUXCTLR | HDFGRTR_EL2_TRCCLAIM | - HDFGRTR_EL2_TRCCNTVRn | HDFGRTR_EL2_TRCID | - HDFGRTR_EL2_TRCIMSPECn | HDFGRTR_EL2_TRCOSLSR | - HDFGRTR_EL2_TRCPRGCTLR | HDFGRTR_EL2_TRCSEQSTR | - HDFGRTR_EL2_TRCSSCSRn | HDFGRTR_EL2_TRCSTATR | - HDFGRTR_EL2_TRCVICTLR); - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) - res0 |= (HDFGRTR_EL2_TRBBASER_EL1 | HDFGRTR_EL2_TRBIDR_EL1 | - 
HDFGRTR_EL2_TRBLIMITR_EL1 | HDFGRTR_EL2_TRBMAR_EL1 | - HDFGRTR_EL2_TRBPTR_EL1 | HDFGRTR_EL2_TRBSR_EL1 | - HDFGRTR_EL2_TRBTRG_EL1); - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) - res0 |= (HDFGRTR_EL2_nBRBIDR | HDFGRTR_EL2_nBRBCTL | - HDFGRTR_EL2_nBRBDATA); - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2)) - res0 |= HDFGRTR_EL2_nPMSNEVFR_EL1; - set_sysreg_masks(kvm, HDFGRTR_EL2, res0 | HDFGRTR_EL2_RES0, res1); - - /* Reuse the bits from the read-side and add the write-specific stuff */ - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) - res0 |= (HDFGWTR_EL2_PMCR_EL0 | HDFGWTR_EL2_PMSWINC_EL0); - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP)) - res0 |= HDFGWTR_EL2_TRCOSLAR; - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) - res0 |= HDFGWTR_EL2_TRFCR_EL1; - set_sysreg_masks(kvm, HFGWTR_EL2, res0 | HDFGWTR_EL2_RES0, res1); + get_reg_fixed_bits(kvm, HDFGRTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGRTR_EL2, res0, res1); + get_reg_fixed_bits(kvm, HDFGWTR_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGWTR_EL2, res0, res1); /* HFGITR_EL2 */ - res0 = HFGITR_EL2_RES0; - res1 = HFGITR_EL2_RES1; - if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, DPB, DPB2)) - res0 |= HFGITR_EL2_DCCVADP; - if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) - res0 |= (HFGITR_EL2_ATS1E1RP | HFGITR_EL2_ATS1E1WP); - if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) - res0 |= (HFGITR_EL2_TLBIRVAALE1OS | HFGITR_EL2_TLBIRVALE1OS | - HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS | - HFGITR_EL2_TLBIVAALE1OS | HFGITR_EL2_TLBIVALE1OS | - HFGITR_EL2_TLBIVAAE1OS | HFGITR_EL2_TLBIASIDE1OS | - HFGITR_EL2_TLBIVAE1OS | HFGITR_EL2_TLBIVMALLE1OS); - if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) - res0 |= (HFGITR_EL2_TLBIRVAALE1 | HFGITR_EL2_TLBIRVALE1 | - HFGITR_EL2_TLBIRVAAE1 | HFGITR_EL2_TLBIRVAE1 | - HFGITR_EL2_TLBIRVAALE1IS | HFGITR_EL2_TLBIRVALE1IS | - HFGITR_EL2_TLBIRVAAE1IS | HFGITR_EL2_TLBIRVAE1IS | - HFGITR_EL2_TLBIRVAALE1OS | HFGITR_EL2_TLBIRVALE1OS | - HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS); - if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, IMP)) - res0 |= (HFGITR_EL2_CFPRCTX | HFGITR_EL2_DVPRCTX | - HFGITR_EL2_CPPRCTX); - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) - res0 |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL); - if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP)) - res0 |= (HFGITR_EL2_nGCSPUSHM_EL1 | HFGITR_EL2_nGCSSTR_EL1 | - HFGITR_EL2_nGCSEPP); - if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX)) - res0 |= HFGITR_EL2_COSPRCTX; - if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) - res0 |= HFGITR_EL2_ATS1E1A; + get_reg_fixed_bits(kvm, HFGITR_EL2, &res0, &res1); set_sysreg_masks(kvm, HFGITR_EL2, res0, res1); /* HAFGRTR_EL2 - not a lot to see here */ - res0 = HAFGRTR_EL2_RES0; - res1 = HAFGRTR_EL2_RES1; - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) - res0 |= ~(res0 | res1); + get_reg_fixed_bits(kvm, HAFGRTR_EL2, &res0, &res1); set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); + /* HFG[RW]TR2_EL2 */ + get_reg_fixed_bits(kvm, HFGRTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGRTR2_EL2, res0, res1); + get_reg_fixed_bits(kvm, HFGWTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HFGWTR2_EL2, res0, res1); + + /* HDFG[RW]TR2_EL2 */ + get_reg_fixed_bits(kvm, HDFGRTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGRTR2_EL2, res0, res1); + get_reg_fixed_bits(kvm, HDFGWTR2_EL2, &res0, &res1); + set_sysreg_masks(kvm, HDFGWTR2_EL2, res0, res1); + + /* HFGITR2_EL2 */ + get_reg_fixed_bits(kvm, HFGITR2_EL2, &res0, &res1); + 
set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1); + /* TCR2_EL2 */ res0 = TCR2_EL2_RES0; res1 = TCR2_EL2_RES1; @@ -1318,6 +1750,9 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount; set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1); + /* VNCR_EL2 */ + set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1); + out: for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) (void)__vcpu_sys_reg(vcpu, sr); @@ -1338,6 +1773,9 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) write_unlock(&vcpu->kvm->mmu_lock); } + if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu)) + kvm_map_l1_vncr(vcpu); + /* Must be last, as may switch context! */ if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu)) kvm_inject_nested_irq(vcpu); diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 0f89157d31fd..fcd70bfe44fb 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -5,12 +5,12 @@ */ #include <linux/init.h> +#include <linux/interval_tree_generic.h> #include <linux/kmemleak.h> #include <linux/kvm_host.h> #include <asm/kvm_mmu.h> #include <linux/memblock.h> #include <linux/mutex.h> -#include <linux/sort.h> #include <asm/kvm_pkvm.h> @@ -24,23 +24,6 @@ static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); phys_addr_t hyp_mem_base; phys_addr_t hyp_mem_size; -static int cmp_hyp_memblock(const void *p1, const void *p2) -{ - const struct memblock_region *r1 = p1; - const struct memblock_region *r2 = p2; - - return r1->base < r2->base ? -1 : (r1->base > r2->base); -} - -static void __init sort_memblock_regions(void) -{ - sort(hyp_memory, - *hyp_memblock_nr_ptr, - sizeof(struct memblock_region), - cmp_hyp_memblock, - NULL); -} - static int __init register_memblock_regions(void) { struct memblock_region *reg; @@ -52,7 +35,6 @@ static int __init register_memblock_regions(void) hyp_memory[*hyp_memblock_nr_ptr] = *reg; (*hyp_memblock_nr_ptr)++; } - sort_memblock_regions(); return 0; } @@ -79,6 +61,7 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += host_s2_pgtable_pages(); hyp_mem_pages += hyp_vm_table_pages(); hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); + hyp_mem_pages += pkvm_selftest_pages(); hyp_mem_pages += hyp_ffa_proxy_pages(); /* @@ -262,6 +245,7 @@ static int __init finalize_pkvm(void) * at, which would end badly once inaccessible. */ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); + kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start); kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start); kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size); @@ -273,80 +257,68 @@ static int __init finalize_pkvm(void) } device_initcall_sync(finalize_pkvm); -static int cmp_mappings(struct rb_node *node, const struct rb_node *parent) +static u64 __pkvm_mapping_start(struct pkvm_mapping *m) { - struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node); - struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node); - - if (a->gfn < b->gfn) - return -1; - if (a->gfn > b->gfn) - return 1; - return 0; + return m->gfn * PAGE_SIZE; } -static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn) +static u64 __pkvm_mapping_end(struct pkvm_mapping *m) { - struct rb_node *node = root->rb_node, *prev = NULL; - struct pkvm_mapping *mapping; - - while (node) { - mapping = rb_entry(node, struct pkvm_mapping, node); - if (mapping->gfn == gfn) - return node; - prev = node; - node = (gfn < mapping->gfn) ? 
node->rb_left : node->rb_right; - } - - return prev; + return (m->gfn + m->nr_pages) * PAGE_SIZE - 1; } +INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last, + __pkvm_mapping_start, __pkvm_mapping_end, static, + pkvm_mapping); + /* - * __tmp is updated to rb_next(__tmp) *before* entering the body of the loop to allow freeing - * of __map inline. + * __tmp is updated to iter_first(pkvm_mappings) *before* entering the body of the loop to allow + * freeing of __map inline. */ #define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \ - for (struct rb_node *__tmp = find_first_mapping_node(&(__pgt)->pkvm_mappings, \ - ((__start) >> PAGE_SHIFT)); \ + for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings, \ + __start, __end - 1); \ __tmp && ({ \ - __map = rb_entry(__tmp, struct pkvm_mapping, node); \ - __tmp = rb_next(__tmp); \ + __map = __tmp; \ + __tmp = pkvm_mapping_iter_next(__map, __start, __end - 1); \ true; \ }); \ - ) \ - if (__map->gfn < ((__start) >> PAGE_SHIFT)) \ - continue; \ - else if (__map->gfn >= ((__end) >> PAGE_SHIFT)) \ - break; \ - else + ) int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { - pgt->pkvm_mappings = RB_ROOT; + pgt->pkvm_mappings = RB_ROOT_CACHED; pgt->mmu = mmu; return 0; } -void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; - struct rb_node *node; + int ret; if (!handle) - return; + return 0; - node = rb_first(&pgt->pkvm_mappings); - while (node) { - mapping = rb_entry(node, struct pkvm_mapping, node); - kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); - node = rb_next(node); - rb_erase(&mapping->node, &pgt->pkvm_mappings); + for_each_mapping_in_range_safe(pgt, start, end, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn, + mapping->nr_pages); + if (WARN_ON(ret)) + return ret; + pkvm_mapping_remove(mapping, &pgt->pkvm_mappings); kfree(mapping); } + + return 0; +} + +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL)); } int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, @@ -360,42 +332,46 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 pfn = phys >> PAGE_SHIFT; int ret; - if (size != PAGE_SIZE) + if (size != PAGE_SIZE && size != PMD_SIZE) return -EINVAL; lockdep_assert_held_write(&kvm->mmu_lock); - ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, prot); - if (ret) { - /* Is the gfn already mapped due to a racing vCPU? */ - if (ret == -EPERM) + + /* + * Calling stage2_map() on top of existing mappings is either happening because of a race + * with another vCPU, or because we're changing between page and block mappings. As per + * user_mem_abort(), same-size permission faults are handled in the relax_perms() path. + */ + mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1); + if (mapping) { + if (size == (mapping->nr_pages * PAGE_SIZE)) return -EAGAIN; + + /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. 
*/ + ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); + if (ret) + return ret; + mapping = NULL; } + ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot); + if (WARN_ON(ret)) + return ret; + swap(mapping, cache->mapping); mapping->gfn = gfn; mapping->pfn = pfn; - WARN_ON(rb_find_add(&mapping->node, &pgt->pkvm_mappings, cmp_mappings)); + mapping->nr_pages = size / PAGE_SIZE; + pkvm_mapping_insert(mapping, &pgt->pkvm_mappings); return ret; } int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { - struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); - pkvm_handle_t handle = kvm->arch.pkvm.handle; - struct pkvm_mapping *mapping; - int ret = 0; - - lockdep_assert_held_write(&kvm->mmu_lock); - for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { - ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); - if (WARN_ON(ret)) - break; - rb_erase(&mapping->node, &pgt->pkvm_mappings); - kfree(mapping); - } + lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock); - return ret; + return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); } int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) @@ -407,7 +383,8 @@ int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { - ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn); + ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn, + mapping->nr_pages); if (WARN_ON(ret)) break; } @@ -422,7 +399,8 @@ int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) - __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE); + __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), + PAGE_SIZE * mapping->nr_pages); return 0; } @@ -437,7 +415,7 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, - mkold); + mapping->nr_pages, mkold); return young; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index a1bc10d7116a..25c29107f13f 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -280,7 +280,7 @@ static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu) return 0; hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); - n = vcpu->kvm->arch.pmcr_n; + n = vcpu->kvm->arch.nr_pmu_counters; /* * Programming HPMN to a value greater than PMCR_EL0.N is @@ -608,14 +608,12 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - /* - * Unlike other PMU sysregs, the controls in PMCR_EL0 always apply - * to the 'guest' range of counters and never the 'hyp' range. 
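For context on the HPMN handling in this and the following hunks: with N implemented general-purpose counters and MDCR_EL2.HPMN == h, counters [0, h) form the "guest" range and [h, N) the "hyp" range. A small stand-alone sketch of that partitioning; genmask() and the example values are local stand-ins, not the kernel's GENMASK():

/* Hedged sketch of the guest/hyp counter split, not kernel code. */
#include <stdint.h>
#include <stdio.h>

static uint64_t genmask(unsigned int hi, unsigned int lo)
{
	return ((~0ULL) >> (63 - hi)) & ((~0ULL) << lo);
}

/* Bitmask of the counters reserved for the "hyp" range. */
static uint64_t hyp_counter_mask(unsigned int n, unsigned int hpmn)
{
	if (hpmn >= n)
		return 0;
	return genmask(n - 1, hpmn);
}

int main(void)
{
	/* 6 counters, HPMN = 4: counters 4 and 5 belong to the hyp range. */
	printf("%#llx\n", (unsigned long long)hyp_counter_mask(6, 4)); /* 0x30 */
	return 0;
}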
- */ unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) & - ~kvm_pmu_hyp_counter_mask(vcpu) & ~BIT(ARMV8_PMU_CYCLE_IDX); + if (!vcpu_is_el2(vcpu)) + mask &= ~kvm_pmu_hyp_counter_mask(vcpu); + for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } @@ -1027,12 +1025,30 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); } +static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr) +{ + kvm->arch.nr_pmu_counters = nr; + + /* Reset MDCR_EL2.HPMN behind the vcpus' back... */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features)) { + struct kvm_vcpu *vcpu; + unsigned long i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2); + val &= ~MDCR_EL2_HPMN; + val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters); + __vcpu_sys_reg(vcpu, MDCR_EL2) = val; + } + } +} + static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) { lockdep_assert_held(&kvm->arch.config_lock); kvm->arch.arm_pmu = arm_pmu; - kvm->arch.pmcr_n = kvm_arm_pmu_get_max_counters(kvm); + kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm)); } /** @@ -1088,6 +1104,20 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) return ret; } +static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n) +{ + struct kvm *kvm = vcpu->kvm; + + if (!kvm->arch.arm_pmu) + return -EINVAL; + + if (n > kvm_arm_pmu_get_max_counters(kvm)) + return -EINVAL; + + kvm_arm_set_nr_counters(kvm, n); + return 0; +} + int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { struct kvm *kvm = vcpu->kvm; @@ -1184,6 +1214,15 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id); } + case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: { + unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr; + unsigned int n; + + if (get_user(n, uaddr)) + return -EFAULT; + + return kvm_arm_pmu_v3_set_nr_counters(vcpu, n); + } case KVM_ARM_VCPU_PMU_V3_INIT: return kvm_arm_pmu_v3_init(vcpu); } @@ -1222,6 +1261,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) case KVM_ARM_VCPU_PMU_V3_INIT: case KVM_ARM_VCPU_PMU_V3_FILTER: case KVM_ARM_VCPU_PMU_V3_SET_PMU: + case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: if (kvm_vcpu_has_pmu(vcpu)) return 0; } @@ -1260,8 +1300,12 @@ u8 kvm_arm_pmu_get_pmuver_limit(void) u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); + u64 n = vcpu->kvm->arch.nr_pmu_counters; + + if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu)) + n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); - return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); + return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N); } void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f82fcc614e13..959532422d3a 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -158,6 +158,8 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) if (sve_state) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); kfree(sve_state); + free_page((unsigned long)vcpu->arch.ctxt.vncr_array); + kfree(vcpu->arch.vncr_tlb); kfree(vcpu->arch.ccsidr); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 5dde9285afc8..a6cf2888d150 100644 --- a/arch/arm64/kvm/sys_regs.c +++ 
b/arch/arm64/kvm/sys_regs.c @@ -785,7 +785,7 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mask = BIT(ARMV8_PMU_CYCLE_IDX); - u8 n = vcpu->kvm->arch.pmcr_n; + u8 n = vcpu->kvm->arch.nr_pmu_counters; if (n) mask |= GENMASK(n - 1, 0); @@ -1216,8 +1216,9 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, * with the existing KVM behavior. */ if (!kvm_vm_has_ran_once(kvm) && + !vcpu_has_nv(vcpu) && new_n <= kvm_arm_pmu_get_max_counters(kvm)) - kvm->arch.pmcr_n = new_n; + kvm->arch.nr_pmu_counters = new_n; mutex_unlock(&kvm->arch.config_lock); @@ -1600,13 +1601,14 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = sanitise_id_aa64pfr0_el1(vcpu, val); break; case SYS_ID_AA64PFR1_EL1: - if (!kvm_has_mte(vcpu->kvm)) + if (!kvm_has_mte(vcpu->kvm)) { val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); + } val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); @@ -1959,11 +1961,34 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, { u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; + u8 mte = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE, hw_val); + u8 user_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, user_val); + u8 hw_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, hw_val); /* See set_id_aa64pfr0_el1 for comment about MPAM */ if ((hw_val & mpam_mask) == (user_val & mpam_mask)) user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + /* + * Previously MTE_frac was hidden from guest. However, if the + * hardware supports MTE2 but not MTE_ASYM_FAULT then a value + * of 0 for this field indicates that the hardware supports + * MTE_ASYNC. Whereas, 0xf indicates MTE_ASYNC is not supported. + * + * As KVM must accept values from KVM provided by user-space, + * when ID_AA64PFR1_EL1.MTE is 2 allow user-space to set + * ID_AA64PFR1_EL1.MTE_frac to 0. However, ignore it to avoid + * incorrectly claiming hardware support for MTE_ASYNC in the + * guest. 
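A stand-alone sketch of the MTE_frac decision described in the comment above. The enum values mirror the encodings the comment relies on (MTE == 2 for MTE2, MTE_frac == 0 for ASYNC, 0xf for NI); effective_mte_frac() is a hypothetical helper, not a kernel function:

/* Hedged sketch: which MTE_frac value the guest should actually see. */
#include <stdint.h>
#include <stdio.h>

enum { MTE_MTE2 = 2 };
enum { MTE_FRAC_ASYNC = 0, MTE_FRAC_NI = 0xf };

static uint8_t effective_mte_frac(uint8_t hw_mte, uint8_t hw_frac,
				  uint8_t user_frac)
{
	/*
	 * User-space may legitimately write 0 (ASYNC) when the host reports
	 * MTE2 with MTE_frac == NI, but honouring it would advertise a
	 * feature the hardware lacks, so the host value wins in that case.
	 */
	if (hw_mte == MTE_MTE2 && hw_frac == MTE_FRAC_NI &&
	    user_frac == MTE_FRAC_ASYNC)
		return hw_frac;

	return user_frac;
}

int main(void)
{
	printf("%#x\n", effective_mte_frac(MTE_MTE2, MTE_FRAC_NI,
					   MTE_FRAC_ASYNC));   /* 0xf */
	printf("%#x\n", effective_mte_frac(MTE_MTE2, MTE_FRAC_ASYNC,
					   MTE_FRAC_ASYNC));   /* 0 */
	return 0;
}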
+ */ + + if (mte == ID_AA64PFR1_EL1_MTE_MTE2 && + hw_mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI && + user_mte_frac == ID_AA64PFR1_EL1_MTE_frac_ASYNC) { + user_val &= ~ID_AA64PFR1_EL1_MTE_frac_MASK; + user_val |= hw_val & ID_AA64PFR1_EL1_MTE_frac_MASK; + } + return set_id_reg(vcpu, rd, user_val); } @@ -2287,15 +2312,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, "trap of EL2 register redirected to EL1"); } -#define EL2_REG(name, acc, rst, v) { \ - SYS_DESC(SYS_##name), \ - .access = acc, \ - .reset = rst, \ - .reg = name, \ - .visibility = el2_visibility, \ - .val = v, \ -} - #define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ SYS_DESC(SYS_##name), \ .access = acc, \ @@ -2305,6 +2321,9 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = v, \ } +#define EL2_REG(name, acc, rst, v) \ + EL2_REG_FILTERED(name, acc, rst, v, el2_visibility) + #define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) @@ -2452,6 +2471,16 @@ static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, return __el2_visibility(vcpu, rd, sve_visibility); } +static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (el2_visibility(vcpu, rd) == 0 && + kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)) + return 0; + + return REG_HIDDEN; +} + static bool access_zcr_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -2576,16 +2605,33 @@ static bool access_mdcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 old = __vcpu_sys_reg(vcpu, MDCR_EL2); + u64 hpmn, val, old = __vcpu_sys_reg(vcpu, MDCR_EL2); - if (!access_rw(vcpu, p, r)) - return false; + if (!p->is_write) { + p->regval = old; + return true; + } + + val = p->regval; + hpmn = FIELD_GET(MDCR_EL2_HPMN, val); + + /* + * If HPMN is out of bounds, limit it to what we actually + * support. This matches the UNKNOWN definition of the field + * in that case, and keeps the emulation simple. Sort of. + */ + if (hpmn > vcpu->kvm->arch.nr_pmu_counters) { + hpmn = vcpu->kvm->arch.nr_pmu_counters; + u64_replace_bits(val, hpmn, MDCR_EL2_HPMN); + } + + __vcpu_sys_reg(vcpu, MDCR_EL2) = val; /* - * Request a reload of the PMU to enable/disable the counters affected - * by HPME. + * Request a reload of the PMU to enable/disable the counters + * affected by HPME. 
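The MDCR_EL2 write handler above clamps an out-of-range HPMN to the number of counters the VM exposes. A minimal sketch of that clamping, with HPMN_MASK and the field helpers as local stand-ins for MDCR_EL2_HPMN and u64_replace_bits():

/* Hedged sketch, not the kernel implementation. */
#include <stdint.h>
#include <stdio.h>

#define HPMN_MASK 0x1fULL	/* assumed position of MDCR_EL2.HPMN (bits 4:0) */

static uint64_t field_get(uint64_t val, uint64_t mask)
{
	return (val & mask) / (mask & -mask);
}

static uint64_t field_replace(uint64_t val, uint64_t field, uint64_t mask)
{
	return (val & ~mask) | ((field * (mask & -mask)) & mask);
}

/* Clamp a guest-written HPMN to the number of counters the VM exposes. */
static uint64_t sanitise_mdcr_write(uint64_t val, uint64_t nr_pmu_counters)
{
	uint64_t hpmn = field_get(val, HPMN_MASK);

	if (hpmn > nr_pmu_counters)
		val = field_replace(val, nr_pmu_counters, HPMN_MASK);

	return val;
}

int main(void)
{
	/* A write asking for 31 counters on a VM that only has 6. */
	printf("%#llx\n",
	       (unsigned long long)sanitise_mdcr_write(0x1f, 6)); /* 0x6 */
	return 0;
}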
*/ - if ((old ^ __vcpu_sys_reg(vcpu, MDCR_EL2)) & MDCR_EL2_HPME) + if ((old ^ val) & MDCR_EL2_HPME) kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return true; @@ -2704,6 +2750,12 @@ static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, .set_user = set_imp_id_reg, \ .reset = reset_imp_id_reg, \ .val = mask, \ + } + +static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = vcpu->kvm->arch.nr_pmu_counters; + return vcpu->kvm->arch.nr_pmu_counters; } /* @@ -3249,7 +3301,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), EL2_REG_VNCR(HCR_EL2, reset_hcr, 0), - EL2_REG(MDCR_EL2, access_mdcr, reset_val, 0), + EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0), EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), EL2_REG_VNCR(HSTR_EL2, reset_val, 0), EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0), @@ -3269,6 +3321,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { tcr2_el2_visibility), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), + EL2_REG_FILTERED(VNCR_EL2, bad_vncr_trap, reset_val, 0, + vncr_el2_visibility), { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), @@ -3552,8 +3606,7 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); - u64 base, range, tg, num, scale; - int shift; + u64 base, range; if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); @@ -3563,26 +3616,7 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * of the guest's S2 (different base granule size, for example), we * decide to ignore TTL and only use the described range. 
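The hunk above replaces the open-coded range-TLBI operand decode with decode_range_tlbi(). A stand-alone sketch of what that decode computes, using the bit positions from the removed FIELD_GET() calls; decode_range_tlbi_sketch() is illustrative, not the kernel helper:

/* Hedged sketch of the range TLBI operand decode. */
#include <stdint.h>
#include <stdio.h>

static uint64_t bits(uint64_t v, unsigned int hi, unsigned int lo)
{
	return (v >> lo) & ((1ULL << (hi - lo + 1)) - 1);
}

static uint64_t decode_range_tlbi_sketch(uint64_t regval, uint64_t *range)
{
	uint64_t tg = bits(regval, 47, 46);
	uint64_t scale = bits(regval, 45, 44);
	uint64_t num = bits(regval, 43, 39);
	uint64_t base = bits(regval, 36, 0);
	unsigned int shift;

	switch (tg) {
	case 1: shift = 12; break;	/* 4K granule */
	case 2: shift = 14; break;	/* 16K granule */
	default: shift = 16; break;	/* 64K, and IMPDEF tg == 0 */
	}

	/* Page count is (NUM + 1) << (5 * SCALE + 1), as in the removed code. */
	*range = ((num + 1) << (5 * scale + 1)) << shift;
	return base << shift;
}

int main(void)
{
	uint64_t range, base;

	/* TG=1 (4K), SCALE=0, NUM=3, BASE=0x1000 pages. */
	base = decode_range_tlbi_sketch((1ULL << 46) | (3ULL << 39) | 0x1000,
					&range);
	printf("base=%#llx range=%#llx\n",
	       (unsigned long long)base, (unsigned long long)range);
	return 0;
}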
*/ - tg = FIELD_GET(GENMASK(47, 46), p->regval); - scale = FIELD_GET(GENMASK(45, 44), p->regval); - num = FIELD_GET(GENMASK(43, 39), p->regval); - base = p->regval & GENMASK(36, 0); - - switch(tg) { - case 1: - shift = 12; - break; - case 2: - shift = 14; - break; - case 3: - default: /* IMPDEF: handle tg==0 as 64k */ - shift = 16; - break; - } - - base <<= shift; - range = __TLBI_RANGE_PAGES(num, scale) << shift; + base = decode_range_tlbi(p->regval, &range, NULL); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { @@ -3648,11 +3682,22 @@ static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu, WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding)); } +static bool handle_tlbi_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + if (!kvm_supported_tlbi_s1e2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); + + kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); + return true; +} + static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); /* * If we're here, this is because we've trapped on a EL1 TLBI @@ -3663,6 +3708,13 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * - HCR_EL2.E2H == 0 : a non-VHE guest * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode * + * Another possibility is that we are invalidating the EL2 context + * using EL1 instructions, but that we landed here because we need + * additional invalidation for structures that are not held in the + * CPU TLBs (such as the VNCR pseudo-TLB and its EL2 mapping). In + * that case, we are guaranteed that HCR_EL2.{E2H,TGE} == { 1, 1 } + * as we don't allow an NV-capable L1 in a nVHE configuration. + * * We don't expect these helpers to ever be called when running * in a vEL1 context. 
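A minimal sketch of the routing choice spelled out in the comment above: a trapped EL1 TLBI is treated as an EL2-context invalidation only when HCR_EL2.{E2H,TGE} == {1,1}; otherwise it targets the stage-2 context selected by the VMID in VTTBR_EL2. The enum and helper names here are made up for illustration:

/* Hedged sketch of the EL1-TLBI routing decision. */
#include <stdbool.h>
#include <stdio.h>

enum tlbi_target { TARGET_EL2_CONTEXT, TARGET_VMID_CONTEXT };

static enum tlbi_target route_el1_tlbi(bool e2h, bool tge)
{
	return (e2h && tge) ? TARGET_EL2_CONTEXT : TARGET_VMID_CONTEXT;
}

int main(void)
{
	printf("%d\n", route_el1_tlbi(true, true));   /* 0: EL2 context */
	printf("%d\n", route_el1_tlbi(true, false));  /* 1: VMID context */
	return 0;
}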
*/ @@ -3672,7 +3724,13 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) return undef_access(vcpu, p, r); - kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) { + kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval); + return true; + } + + kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, + get_vmid(__vcpu_sys_reg(vcpu, VTTBR_EL2)), &(union tlbi_info) { .va = { .addr = p->regval, @@ -3794,16 +3852,21 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OS, undef_access), - SYS_INSN(TLBI_VAE2OS, undef_access), + SYS_INSN(TLBI_ALLE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2OS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1OS, handle_alle1is), - SYS_INSN(TLBI_VALE2OS, undef_access), + SYS_INSN(TLBI_VALE2OS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2IS, undef_access), - SYS_INSN(TLBI_RVALE2IS, undef_access), + SYS_INSN(TLBI_RVAE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2IS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1IS, handle_alle1is), + + SYS_INSN(TLBI_VALE2IS, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is), @@ -3813,11 +3876,17 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OS, undef_access), - SYS_INSN(TLBI_RVALE2OS, undef_access), - SYS_INSN(TLBI_RVAE2, undef_access), - SYS_INSN(TLBI_RVALE2, undef_access), + SYS_INSN(TLBI_RVAE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2OS, handle_tlbi_el2), + SYS_INSN(TLBI_RVAE2, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE1, handle_alle1is), + + SYS_INSN(TLBI_VALE2, handle_tlbi_el2), + SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is), @@ -3825,19 +3894,19 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OSNXS, undef_access), - SYS_INSN(TLBI_VAE2OSNXS, undef_access), + SYS_INSN(TLBI_ALLE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2OSNXS, undef_access), + SYS_INSN(TLBI_VALE2OSNXS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2ISNXS, undef_access), - SYS_INSN(TLBI_RVALE2ISNXS, undef_access), - SYS_INSN(TLBI_ALLE2ISNXS, undef_access), - SYS_INSN(TLBI_VAE2ISNXS, undef_access), + SYS_INSN(TLBI_RVAE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2ISNXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2ISNXS, undef_access), + SYS_INSN(TLBI_VALE2ISNXS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), @@ -3847,14 +3916,14 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1NXS, 
handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OSNXS, undef_access), - SYS_INSN(TLBI_RVALE2OSNXS, undef_access), - SYS_INSN(TLBI_RVAE2NXS, undef_access), - SYS_INSN(TLBI_RVALE2NXS, undef_access), - SYS_INSN(TLBI_ALLE2NXS, undef_access), - SYS_INSN(TLBI_VAE2NXS, undef_access), + SYS_INSN(TLBI_RVAE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2OSNXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVAE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_RVALE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_ALLE2NXS, handle_tlbi_el2), + SYS_INSN(TLBI_VAE2NXS, handle_tlbi_el2), SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), - SYS_INSN(TLBI_VALE2NXS, undef_access), + SYS_INSN(TLBI_VALE2NXS, handle_tlbi_el2), SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), }; @@ -5153,65 +5222,13 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) goto out; - kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 | - HFGxTR_EL2_nMAIR2_EL1 | - HFGxTR_EL2_nS2POR_EL1 | - HFGxTR_EL2_nACCDATA_EL1 | - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK); - - if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS)) - kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1OS| - HFGITR_EL2_TLBIRVALE1OS | - HFGITR_EL2_TLBIRVAAE1OS | - HFGITR_EL2_TLBIRVAE1OS | - HFGITR_EL2_TLBIVAALE1OS | - HFGITR_EL2_TLBIVALE1OS | - HFGITR_EL2_TLBIVAAE1OS | - HFGITR_EL2_TLBIASIDE1OS | - HFGITR_EL2_TLBIVAE1OS | - HFGITR_EL2_TLBIVMALLE1OS); - - if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE)) - kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1 | - HFGITR_EL2_TLBIRVALE1 | - HFGITR_EL2_TLBIRVAAE1 | - HFGITR_EL2_TLBIRVAE1 | - HFGITR_EL2_TLBIRVAALE1IS| - HFGITR_EL2_TLBIRVALE1IS | - HFGITR_EL2_TLBIRVAAE1IS | - HFGITR_EL2_TLBIRVAE1IS | - HFGITR_EL2_TLBIRVAALE1OS| - HFGITR_EL2_TLBIRVALE1OS | - HFGITR_EL2_TLBIRVAAE1OS | - HFGITR_EL2_TLBIRVAE1OS); - - if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) - kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A; - - if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) - kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | - HFGITR_EL2_ATS1E1WP); - - if (!kvm_has_s1pie(kvm)) - kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | - HFGxTR_EL2_nPIR_EL1); - - if (!kvm_has_s1poe(kvm)) - kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | - HFGxTR_EL2_nPOR_EL0); - - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) - kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | - HAFGRTR_EL2_RES1); - - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) { - kvm->arch.fgu[HDFGRTR_GROUP] |= (HDFGRTR_EL2_nBRBDATA | - HDFGRTR_EL2_nBRBCTL | - HDFGRTR_EL2_nBRBIDR); - kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_nBRBINJ | - HFGITR_EL2_nBRBIALL); - } + compute_fgu(kvm, HFGRTR_GROUP); + compute_fgu(kvm, HFGITR_GROUP); + compute_fgu(kvm, HDFGRTR_GROUP); + compute_fgu(kvm, HAFGRTR_GROUP); + compute_fgu(kvm, HFGRTR2_GROUP); + compute_fgu(kvm, HFGITR2_GROUP); + compute_fgu(kvm, HDFGRTR2_GROUP); set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); out: @@ -5269,6 +5286,8 @@ int __init kvm_sys_reg_table_init(void) ret = populate_nv_trap_config(); + check_feature_map(); + for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++) ret = populate_sysreg_config(sys_reg_descs + i, i); diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index c18c1a95831e..9c60f6465c78 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -176,7 +176,7 @@ TRACE_EVENT(kvm_set_way_flush, ), 
TP_printk("S/W flush at 0x%016lx (cache %s)", - __entry->vcpu_pc, __entry->cache ? "on" : "off") + __entry->vcpu_pc, str_on_off(__entry->cache)) ); TRACE_EVENT(kvm_toggle_cache, @@ -196,8 +196,8 @@ TRACE_EVENT(kvm_toggle_cache, ), TP_printk("VM op at 0x%016lx (cache was %s, now %s)", - __entry->vcpu_pc, __entry->was ? "on" : "off", - __entry->now ? "on" : "off") + __entry->vcpu_pc, str_on_off(__entry->was), + str_on_off(__entry->now)) ); /* diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index afb018528bc3..f8425f381de9 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -320,3 +320,227 @@ void vgic_debug_init(struct kvm *kvm) void vgic_debug_destroy(struct kvm *kvm) { } + +/** + * struct vgic_its_iter - Iterator for traversing VGIC ITS device tables. + * @dev: Pointer to the current its_device being processed. + * @ite: Pointer to the current its_ite within the device being processed. + * + * This structure is used to maintain the current position during iteration + * over the ITS device tables. It holds pointers to both the current device + * and the current ITE within that device. + */ +struct vgic_its_iter { + struct its_device *dev; + struct its_ite *ite; +}; + +/** + * end_of_iter - Checks if the iterator has reached the end. + * @iter: The iterator to check. + * + * When the iterator completed processing the final ITE in the last device + * table, it was marked to indicate the end of iteration by setting its + * device and ITE pointers to NULL. + * This function checks whether the iterator was marked as end. + * + * Return: True if the iterator is marked as end, false otherwise. + */ +static inline bool end_of_iter(struct vgic_its_iter *iter) +{ + return !iter->dev && !iter->ite; +} + +/** + * vgic_its_iter_next - Advances the iterator to the next entry in the ITS tables. + * @its: The VGIC ITS structure. + * @iter: The iterator to advance. + * + * This function moves the iterator to the next ITE within the current device, + * or to the first ITE of the next device if the current ITE is the last in + * the device. If the current device is the last device, the iterator is set + * to indicate the end of iteration. + */ +static void vgic_its_iter_next(struct vgic_its *its, struct vgic_its_iter *iter) +{ + struct its_device *dev = iter->dev; + struct its_ite *ite = iter->ite; + + if (!ite || list_is_last(&ite->ite_list, &dev->itt_head)) { + if (list_is_last(&dev->dev_list, &its->device_list)) { + dev = NULL; + ite = NULL; + } else { + dev = list_next_entry(dev, dev_list); + ite = list_first_entry_or_null(&dev->itt_head, + struct its_ite, + ite_list); + } + } else { + ite = list_next_entry(ite, ite_list); + } + + iter->dev = dev; + iter->ite = ite; +} + +/** + * vgic_its_debug_start - Start function for the seq_file interface. + * @s: The seq_file structure. + * @pos: The starting position (offset). + * + * This function initializes the iterator to the beginning of the ITS tables + * and advances it to the specified position. It acquires the its_lock mutex + * to protect shared data. + * + * Return: An iterator pointer on success, NULL if no devices are found or + * the end of the list is reached, or ERR_PTR(-ENOMEM) on memory + * allocation failure. 
+ */ +static void *vgic_its_debug_start(struct seq_file *s, loff_t *pos) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter; + struct its_device *dev; + loff_t offset = *pos; + + mutex_lock(&its->its_lock); + + dev = list_first_entry_or_null(&its->device_list, + struct its_device, dev_list); + if (!dev) + return NULL; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return ERR_PTR(-ENOMEM); + + iter->dev = dev; + iter->ite = list_first_entry_or_null(&dev->itt_head, + struct its_ite, ite_list); + + while (!end_of_iter(iter) && offset--) + vgic_its_iter_next(its, iter); + + if (end_of_iter(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +/** + * vgic_its_debug_next - Next function for the seq_file interface. + * @s: The seq_file structure. + * @v: The current iterator. + * @pos: The current position (offset). + * + * This function advances the iterator to the next entry and increments the + * position. + * + * Return: An iterator pointer on success, or NULL if the end of the list is + * reached. + */ +static void *vgic_its_debug_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter = v; + + ++*pos; + vgic_its_iter_next(its, iter); + + if (end_of_iter(iter)) { + kfree(iter); + return NULL; + } + return iter; +} + +/** + * vgic_its_debug_stop - Stop function for the seq_file interface. + * @s: The seq_file structure. + * @v: The current iterator. + * + * This function frees the iterator and releases the its_lock mutex. + */ +static void vgic_its_debug_stop(struct seq_file *s, void *v) +{ + struct vgic_its *its = s->private; + struct vgic_its_iter *iter = v; + + if (!IS_ERR_OR_NULL(iter)) + kfree(iter); + mutex_unlock(&its->its_lock); +} + +/** + * vgic_its_debug_show - Show function for the seq_file interface. + * @s: The seq_file structure. + * @v: The current iterator. + * + * This function formats and prints the ITS table entry information to the + * seq_file output. + * + * Return: 0 on success. + */ +static int vgic_its_debug_show(struct seq_file *s, void *v) +{ + struct vgic_its_iter *iter = v; + struct its_device *dev = iter->dev; + struct its_ite *ite = iter->ite; + + if (list_is_first(&ite->ite_list, &dev->itt_head)) { + seq_printf(s, "\n"); + seq_printf(s, "Device ID: 0x%x, Event ID Range: [0 - %llu]\n", + dev->device_id, BIT_ULL(dev->num_eventid_bits) - 1); + seq_printf(s, "EVENT_ID INTID HWINTID TARGET COL_ID HW\n"); + seq_printf(s, "-----------------------------------------------\n"); + } + + if (ite && ite->irq && ite->collection) { + seq_printf(s, "%8u %8u %8u %8u %8u %2d\n", + ite->event_id, ite->irq->intid, ite->irq->hwintid, + ite->collection->target_addr, + ite->collection->collection_id, ite->irq->hw); + } + + return 0; +} + +static const struct seq_operations vgic_its_debug_sops = { + .start = vgic_its_debug_start, + .next = vgic_its_debug_next, + .stop = vgic_its_debug_stop, + .show = vgic_its_debug_show +}; + +DEFINE_SEQ_ATTRIBUTE(vgic_its_debug); + +/** + * vgic_its_debug_init - Initializes the debugfs interface for VGIC ITS. + * @dev: The KVM device structure. + * + * This function creates a debugfs file named "vgic-its-state@%its_base" + * to expose the ITS table information. + * + * Return: 0 on success. 
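A stand-alone model of the nested-list advance implemented by vgic_its_iter_next() above, using arrays instead of list_heads so it can run by itself. It assumes the first device has at least one ITE, and it simply skips devices with none; that is a simplification rather than a faithful copy of the kernel's NULL-ITE handling:

/* Hedged sketch of the (device, ITE) iteration order. */
#include <stdio.h>

struct iter {
	int dev;	/* -1 once iteration is finished */
	int ite;
};

static void iter_next(const int *ite_count, int num_devs, struct iter *it)
{
	if (it->ite + 1 < ite_count[it->dev]) {
		it->ite++;
		return;
	}
	/* Last ITE of this device: move to the next device, or finish. */
	do {
		it->dev++;
	} while (it->dev < num_devs && ite_count[it->dev] == 0);

	if (it->dev >= num_devs)
		it->dev = -1;
	it->ite = 0;
}

int main(void)
{
	const int ite_count[] = { 2, 0, 1 };
	struct iter it = { .dev = 0, .ite = 0 };

	while (it.dev != -1) {
		printf("dev %d ite %d\n", it.dev, it.ite);
		iter_next(ite_count, 3, &it);
	}
	return 0;
}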
+ */ +int vgic_its_debug_init(struct kvm_device *dev) +{ + struct vgic_its *its = dev->private; + char *name; + + name = kasprintf(GFP_KERNEL, "vgic-its-state@%llx", (u64)its->vgic_its_base); + if (!name) + return -ENOMEM; + + debugfs_create_file(name, 0444, dev->kvm->debugfs_dentry, its, &vgic_its_debug_fops); + + kfree(name); + return 0; +} + +void vgic_its_debug_destroy(struct kvm_device *dev) +{ +} diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index fb96802799c6..569f9da9049f 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -154,36 +154,6 @@ out_unlock: return irq; } -struct its_device { - struct list_head dev_list; - - /* the head for the list of ITTEs */ - struct list_head itt_head; - u32 num_eventid_bits; - gpa_t itt_addr; - u32 device_id; -}; - -#define COLLECTION_NOT_MAPPED ((u32)~0) - -struct its_collection { - struct list_head coll_list; - - u32 collection_id; - u32 target_addr; -}; - -#define its_is_collection_mapped(coll) ((coll) && \ - ((coll)->target_addr != COLLECTION_NOT_MAPPED)) - -struct its_ite { - struct list_head ite_list; - - struct vgic_irq *irq; - struct its_collection *collection; - u32 event_id; -}; - /** * struct vgic_its_abi - ITS abi ops and settings * @cte_esz: collection table entry size @@ -1938,6 +1908,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev) mutex_lock(&its->its_lock); + vgic_its_debug_destroy(kvm_dev); + vgic_its_free_device_list(kvm, its); vgic_its_free_collection_list(kvm, its); vgic_its_invalidate_cache(its); @@ -2771,7 +2743,12 @@ static int vgic_its_set_attr(struct kvm_device *dev, if (ret) return ret; - return vgic_register_its_iodev(dev->kvm, its, addr); + ret = vgic_register_its_iodev(dev->kvm, its, addr); + if (ret) + return ret; + + return vgic_its_debug_init(dev); + } case KVM_DEV_ARM_VGIC_GRP_CTRL: return vgic_its_ctrl(dev->kvm, its, attr->attr); diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index bfa5bde1f106..4f6954c30674 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -240,9 +240,6 @@ static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, goto next; } - /* It is illegal to have the EOI bit set with HW */ - lr &= ~ICH_LR_EOI; - /* Translate the virtual mapping to the real one */ lr &= ~ICH_LR_PHYS_ID_MASK; lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 0c5a63712702..4349084cb9a6 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -172,6 +172,36 @@ struct vgic_reg_attr { gpa_t addr; }; +struct its_device { + struct list_head dev_list; + + /* the head for the list of ITTEs */ + struct list_head itt_head; + u32 num_eventid_bits; + gpa_t itt_addr; + u32 device_id; +}; + +#define COLLECTION_NOT_MAPPED ((u32)~0) + +struct its_collection { + struct list_head coll_list; + + u32 collection_id; + u32 target_addr; +}; + +#define its_is_collection_mapped(coll) ((coll) && \ + ((coll)->target_addr != COLLECTION_NOT_MAPPED)) + +struct its_ite { + struct list_head ite_list; + + struct vgic_irq *irq; + struct its_collection *collection; + u32 event_id; +}; + int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, @@ -359,4 +389,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu); void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu); 
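The per-device header printed by vgic_its_debug_show() derives the event-ID range directly from num_eventid_bits. A small sketch of that formatting with made-up values; the structure below carries only the fields the header needs:

/* Hedged sketch of the debugfs per-device header. */
#include <stdint.h>
#include <stdio.h>

struct its_device_sketch {
	uint32_t device_id;
	uint32_t num_eventid_bits;
};

static void print_device_header(const struct its_device_sketch *dev)
{
	printf("\nDevice ID: 0x%x, Event ID Range: [0 - %llu]\n",
	       dev->device_id,
	       (unsigned long long)((1ULL << dev->num_eventid_bits) - 1));
	printf("EVENT_ID INTID HWINTID TARGET COL_ID HW\n");
	printf("-----------------------------------------------\n");
}

int main(void)
{
	struct its_device_sketch dev = {
		.device_id = 0x2,
		.num_eventid_bits = 16,
	};

	print_device_header(&dev);	/* Event ID Range: [0 - 65535] */
	return 0;
}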
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu); +int vgic_its_debug_init(struct kvm_device *dev); +void vgic_its_debug_destroy(struct kvm_device *dev); + #endif diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index cfe8cb8ba1cc..0c8737f4f2ce 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -129,7 +129,7 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; - ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); for (i = 0; i < ncontig; i++, ptep++) { pte_t pte = __ptep_get(ptep); @@ -159,12 +159,11 @@ static pte_t get_clear_contig(struct mm_struct *mm, pte_t pte, tmp_pte; bool present; - pte = __ptep_get_and_clear(mm, addr, ptep); + pte = __ptep_get_and_clear_anysz(mm, ptep, pgsize); present = pte_present(pte); while (--ncontig) { ptep++; - addr += pgsize; - tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + tmp_pte = __ptep_get_and_clear_anysz(mm, ptep, pgsize); if (present) { if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); @@ -183,8 +182,9 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm, { pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig); struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long end = addr + (pgsize * ncontig); - flush_tlb_range(&vma, addr, addr + (pgsize * ncontig)); + __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, true); return orig_pte; } @@ -207,9 +207,12 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - __ptep_get_and_clear(mm, addr, ptep); + __ptep_get_and_clear_anysz(mm, ptep, pgsize); - flush_tlb_range(&vma, saddr, addr); + if (mm == &init_mm) + flush_tlb_kernel_range(saddr, addr); + else + __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true); } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, @@ -218,30 +221,20 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, size_t pgsize; int i; int ncontig; - unsigned long pfn, dpfn; - pgprot_t hugeprot; ncontig = num_contig_ptes(sz, &pgsize); if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - __set_ptes(mm, addr, ptep, pte, 1); - return; - } - - if (!pte_cont(pte)) { - __set_ptes(mm, addr, ptep, pte, 1); + __set_ptes_anysz(mm, ptep, pte, 1, pgsize); return; } - pfn = pte_pfn(pte); - dpfn = pgsize >> PAGE_SHIFT; - hugeprot = pte_pgprot(pte); - - clear_flush(mm, addr, ptep, pgsize, ncontig); + /* Only need to "break" if transitioning valid -> valid. 
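A minimal sketch of the break-before-make decision used by the set_huge_pte_at() change here: the old entries only need to be cleared and TLB-invalidated when a contiguous mapping replaces one that is currently valid. SK_PTE_VALID and SK_PTE_CONT are stand-in flag bits, not the arm64 PTE encodings:

/* Hedged sketch of the valid -> valid "break" test. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SK_PTE_VALID (1ULL << 0)
#define SK_PTE_CONT  (1ULL << 1)

/* True when the old entries must be cleared + invalidated first. */
static bool needs_break_before_make(uint64_t old_pte, uint64_t new_pte)
{
	return (new_pte & SK_PTE_CONT) && (old_pte & SK_PTE_VALID);
}

int main(void)
{
	/* Valid -> valid contiguous update: break first. */
	printf("%d\n", needs_break_before_make(SK_PTE_VALID,
					       SK_PTE_VALID | SK_PTE_CONT));
	/* Invalid -> valid: no break needed. */
	printf("%d\n", needs_break_before_make(0,
					       SK_PTE_VALID | SK_PTE_CONT));
	return 0;
}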
*/ + if (pte_cont(pte) && pte_valid(__ptep_get(ptep))) + clear_flush(mm, addr, ptep, pgsize, ncontig); - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -431,23 +424,23 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - int ncontig, i; + int ncontig; size_t pgsize = 0; - unsigned long pfn = pte_pfn(pte), dpfn; struct mm_struct *mm = vma->vm_mm; - pgprot_t hugeprot; pte_t orig_pte; + VM_WARN_ON(!pte_present(pte)); + if (!pte_cont(pte)) return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - dpfn = pgsize >> PAGE_SHIFT; + ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); if (!__cont_access_flags_changed(ptep, pte, ncontig)) return 0; orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); + VM_WARN_ON(!pte_present(orig_pte)); /* Make sure we don't lose the dirty or young state */ if (pte_dirty(orig_pte)) @@ -456,38 +449,31 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, if (pte_young(orig_pte)) pte = pte_mkyoung(pte); - hugeprot = pte_pgprot(pte); - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); - + __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); return 1; } void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long pfn, dpfn; - pgprot_t hugeprot; - int ncontig, i; + int ncontig; size_t pgsize; pte_t pte; - if (!pte_cont(__ptep_get(ptep))) { + pte = __ptep_get(ptep); + VM_WARN_ON(!pte_present(pte)); + + if (!pte_cont(pte)) { __ptep_set_wrprotect(mm, addr, ptep); return; } ncontig = find_num_contig(mm, addr, ptep, &pgsize); - dpfn = pgsize >> PAGE_SHIFT; pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); pte = pte_wrprotect(pte); - hugeprot = pte_pgprot(pte); - pfn = pte_pfn(pte); - - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -497,10 +483,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(__ptep_get(ptep))) - return ptep_clear_flush(vma, addr, ptep); - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); + ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index b99bf3980fc6..0c8c35dd645e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -275,26 +275,6 @@ void __init arm64_memblock_init(void) } } - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - extern u16 memstart_offset_seed; - u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); - int parange = cpuid_feature_extract_unsigned_field( - mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - s64 range = linear_region_size - - BIT(id_aa64mmfr0_parange_to_phys_shift(parange)); - - /* - * If the size of the linear region exceeds, by a sufficient - * margin, the size of the region that the physical memory can - * span, randomize the linear region as well. 
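Related to the get_clear_contig() and huge_ptep_set_access_flags() changes above: when a contiguous run of PTEs is collapsed into a single value, the result must keep the dirty and young state if any entry had it. A stand-alone sketch of that aggregation with stand-in flag bits:

/* Hedged sketch of collapsing per-PTE dirty/young state. */
#include <stdint.h>
#include <stdio.h>

#define SK_PTE_DIRTY (1ULL << 0)
#define SK_PTE_YOUNG (1ULL << 1)

static uint64_t collapse_contig(const uint64_t *ptes, int ncontig)
{
	uint64_t pte = ptes[0];
	int i;

	/* OR in the software-visible state from every entry in the run. */
	for (i = 1; i < ncontig; i++) {
		pte |= ptes[i] & SK_PTE_DIRTY;
		pte |= ptes[i] & SK_PTE_YOUNG;
	}
	return pte;
}

int main(void)
{
	/* Only the third entry was written to and accessed. */
	const uint64_t ptes[4] = { 0, 0, SK_PTE_DIRTY | SK_PTE_YOUNG, 0 };

	printf("%#llx\n", (unsigned long long)collapse_contig(ptes, 4)); /* 0x3 */
	return 0;
}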
- */ - if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) { - range /= ARM64_MEMSTART_ALIGN; - memstart_addr -= ARM64_MEMSTART_ALIGN * - ((range * memstart_offset_seed) >> 16); - } - } - /* * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 39fd1f7ff02a..04d4a8f676db 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -96,8 +96,8 @@ static int change_memory_common(unsigned long addr, int numpages, * we are operating on does not result in such splitting. * * Let's restrict ourselves to mappings created by vmalloc (or vmap). - * Those are guaranteed to consist entirely of page mappings, and - * splitting is never needed. + * Disallow VM_ALLOW_HUGE_VMAP mappings to guarantee that only page + * mappings are updated and splitting is never needed. * * So check whether the [addr, addr + size) interval is entirely * covered by precisely one VM area that has the VM_ALLOC flag set. @@ -105,7 +105,7 @@ static int change_memory_common(unsigned long addr, int numpages, area = find_vm_area((void *)addr); if (!area || end > (unsigned long)kasan_reset_tag(area->addr) + area->size || - !(area->flags & VM_ALLOC)) + ((area->flags & (VM_ALLOC | VM_ALLOW_HUGE_VMAP)) != VM_ALLOC)) return -EINVAL; if (!numpages) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index fb30c8804f87..80d470aa469d 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -512,26 +512,11 @@ alternative_else_nop_endif ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 cbz x1, .Lskip_indirection - /* - * The PROT_* macros describing the various memory types may resolve to - * C expressions if they include the PTE_MAYBE_* macros, and so they - * can only be used from C code. The PIE_E* constants below are also - * defined in terms of those macros, but will mask out those - * PTE_MAYBE_* constants, whether they are set or not. So #define them - * as 0x0 here so we can evaluate the PIE_E* constants in asm context. 
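The change_memory_common() hunk above accepts only vmalloc areas that have VM_ALLOC set and VM_ALLOW_HUGE_VMAP clear, using a single mask-and-compare. A minimal sketch of that test; the SK_* flag values are stand-ins, not the real vmalloc flags:

/* Hedged sketch of the "page mappings only" vmalloc area test. */
#include <stdbool.h>
#include <stdio.h>

#define SK_VM_ALLOC           0x2UL
#define SK_VM_ALLOW_HUGE_VMAP 0x400UL

static bool page_mapped_vmalloc_area(unsigned long flags)
{
	/* Requires VM_ALLOC set *and* VM_ALLOW_HUGE_VMAP clear. */
	return (flags & (SK_VM_ALLOC | SK_VM_ALLOW_HUGE_VMAP)) == SK_VM_ALLOC;
}

int main(void)
{
	printf("%d\n", page_mapped_vmalloc_area(SK_VM_ALLOC));           /* 1 */
	printf("%d\n", page_mapped_vmalloc_area(SK_VM_ALLOC |
						SK_VM_ALLOW_HUGE_VMAP)); /* 0 */
	return 0;
}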
- */ - -#define PTE_MAYBE_NG 0 -#define PTE_MAYBE_SHARED 0 - - mov_q x0, PIE_E0 + mov_q x0, PIE_E0_ASM msr REG_PIRE0_EL1, x0 - mov_q x0, PIE_E1 + mov_q x0, PIE_E1_ASM msr REG_PIR_EL1, x0 -#undef PTE_MAYBE_NG -#undef PTE_MAYBE_SHARED - orr tcr2, tcr2, TCR2_EL1_PIE msr REG_TCR2_EL1, x0 diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 634d78422adb..da8b89dd2910 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2113,7 +2113,7 @@ bool bpf_jit_supports_subprog_tailcalls(void) } static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, - int args_off, int retval_off, int run_ctx_off, + int bargs_off, int retval_off, int run_ctx_off, bool save_ret) { __le32 *branch; @@ -2155,7 +2155,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, branch = ctx->image + ctx->idx; emit(A64_NOP, ctx); - emit(A64_ADD_I(1, A64_R(0), A64_SP, args_off), ctx); + emit(A64_ADD_I(1, A64_R(0), A64_SP, bargs_off), ctx); if (!p->jited) emit_addr_mov_i64(A64_R(1), (const u64)p->insnsi, ctx); @@ -2180,7 +2180,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, } static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, - int args_off, int retval_off, int run_ctx_off, + int bargs_off, int retval_off, int run_ctx_off, __le32 **branches) { int i; @@ -2190,7 +2190,7 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, */ emit(A64_STR64I(A64_ZR, A64_SP, retval_off), ctx); for (i = 0; i < tl->nr_links; i++) { - invoke_bpf_prog(ctx, tl->links[i], args_off, retval_off, + invoke_bpf_prog(ctx, tl->links[i], bargs_off, retval_off, run_ctx_off, true); /* if (*(u64 *)(sp + retval_off) != 0) * goto do_fexit; @@ -2204,23 +2204,125 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, } } -static void save_args(struct jit_ctx *ctx, int args_off, int nregs) +struct arg_aux { + /* how many args are passed through registers, the rest of the args are + * passed through stack + */ + int args_in_regs; + /* how many registers are used to pass arguments */ + int regs_for_args; + /* how much stack is used for additional args passed to bpf program + * that did not fit in original function registers + */ + int bstack_for_args; + /* home much stack is used for additional args passed to the + * original function when called from trampoline (this one needs + * arguments to be properly aligned) + */ + int ostack_for_args; +}; + +static int calc_arg_aux(const struct btf_func_model *m, + struct arg_aux *a) { - int i; + int stack_slots, nregs, slots, i; + + /* verifier ensures m->nr_args <= MAX_BPF_FUNC_ARGS */ + for (i = 0, nregs = 0; i < m->nr_args; i++) { + slots = (m->arg_size[i] + 7) / 8; + if (nregs + slots <= 8) /* passed through register ? 
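A stand-alone sketch of the register/stack split computed by calc_arg_aux() above: arguments are assigned to the eight AArch64 argument registers in 8-byte slots, and whatever does not fit is accounted to the stack. The struct-argument rejection is left out, and the arg_size[] values are made up:

/* Hedged sketch of the trampoline argument accounting. */
#include <stdio.h>

struct arg_aux_sketch {
	int args_in_regs;
	int regs_for_args;
	int stack_bytes;
};

static void calc_arg_aux_sketch(const int *arg_size, int nr_args,
				struct arg_aux_sketch *a)
{
	int i, nregs = 0, slots;

	for (i = 0; i < nr_args; i++) {
		slots = (arg_size[i] + 7) / 8;
		if (nregs + slots > 8)	/* would overflow the 8 arg registers */
			break;
		nregs += slots;
	}

	a->args_in_regs = i;
	a->regs_for_args = nregs;

	/* The remaining arguments are passed on the stack. */
	a->stack_bytes = 0;
	for (; i < nr_args; i++)
		a->stack_bytes += ((arg_size[i] + 7) / 8) * 8;
}

int main(void)
{
	/* Nine u64 arguments: eight in registers, the ninth on the stack. */
	const int arg_size[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8 };
	struct arg_aux_sketch a;

	calc_arg_aux_sketch(arg_size, 9, &a);
	printf("in_regs=%d regs=%d stack=%d\n",
	       a.args_in_regs, a.regs_for_args, a.stack_bytes);
	return 0;
}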
*/ + nregs += slots; + else + break; + } + + a->args_in_regs = i; + a->regs_for_args = nregs; + a->ostack_for_args = 0; + a->bstack_for_args = 0; + + /* the rest arguments are passed through stack */ + for (; i < m->nr_args; i++) { + /* We can not know for sure about exact alignment needs for + * struct passed on stack, so deny those + */ + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) + return -ENOTSUPP; + stack_slots = (m->arg_size[i] + 7) / 8; + a->bstack_for_args += stack_slots * 8; + a->ostack_for_args = a->ostack_for_args + stack_slots * 8; + } + + return 0; +} - for (i = 0; i < nregs; i++) { - emit(A64_STR64I(i, A64_SP, args_off), ctx); - args_off += 8; +static void clear_garbage(struct jit_ctx *ctx, int reg, int effective_bytes) +{ + if (effective_bytes) { + int garbage_bits = 64 - 8 * effective_bytes; +#ifdef CONFIG_CPU_BIG_ENDIAN + /* garbage bits are at the right end */ + emit(A64_LSR(1, reg, reg, garbage_bits), ctx); + emit(A64_LSL(1, reg, reg, garbage_bits), ctx); +#else + /* garbage bits are at the left end */ + emit(A64_LSL(1, reg, reg, garbage_bits), ctx); + emit(A64_LSR(1, reg, reg, garbage_bits), ctx); +#endif } } -static void restore_args(struct jit_ctx *ctx, int args_off, int nregs) +static void save_args(struct jit_ctx *ctx, int bargs_off, int oargs_off, + const struct btf_func_model *m, + const struct arg_aux *a, + bool for_call_origin) { int i; + int reg; + int doff; + int soff; + int slots; + u8 tmp = bpf2a64[TMP_REG_1]; + + /* store arguments to the stack for the bpf program, or restore + * arguments from stack for the original function + */ + for (reg = 0; reg < a->regs_for_args; reg++) { + emit(for_call_origin ? + A64_LDR64I(reg, A64_SP, bargs_off) : + A64_STR64I(reg, A64_SP, bargs_off), + ctx); + bargs_off += 8; + } + + soff = 32; /* on stack arguments start from FP + 32 */ + doff = (for_call_origin ? oargs_off : bargs_off); + + /* save on stack arguments */ + for (i = a->args_in_regs; i < m->nr_args; i++) { + slots = (m->arg_size[i] + 7) / 8; + /* verifier ensures arg_size <= 16, so slots equals 1 or 2 */ + while (slots-- > 0) { + emit(A64_LDR64I(tmp, A64_FP, soff), ctx); + /* if there is unused space in the last slot, clear + * the garbage contained in the space. 
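A sketch of the clear_garbage() masking above for the little-endian case: an on-stack argument smaller than a full 8-byte slot leaves stale bytes at the high end, which the pair of shifts strips off. clear_upper_garbage_le() is illustrative only:

/* Hedged sketch of clearing unused bytes in the last argument slot. */
#include <stdint.h>
#include <stdio.h>

static uint64_t clear_upper_garbage_le(uint64_t slot, int effective_bytes)
{
	int garbage_bits = 64 - 8 * effective_bytes;

	/* A full (or empty) slot needs no masking; also avoids a 64-bit shift. */
	if (!effective_bytes || garbage_bits <= 0)
		return slot;

	/* Shift the junk out through the top, then shift back down. */
	return (slot << garbage_bits) >> garbage_bits;
}

int main(void)
{
	/* A 4-byte argument whose upper half contains stale data. */
	uint64_t slot = 0xdeadbeef00c0ffeeULL;

	printf("%#llx\n",
	       (unsigned long long)clear_upper_garbage_le(slot, 4)); /* 0xc0ffee */
	return 0;
}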
+ */ + if (slots == 0 && !for_call_origin) + clear_garbage(ctx, tmp, m->arg_size[i] % 8); + emit(A64_STR64I(tmp, A64_SP, doff), ctx); + soff += 8; + doff += 8; + } + } +} - for (i = 0; i < nregs; i++) { - emit(A64_LDR64I(i, A64_SP, args_off), ctx); - args_off += 8; +static void restore_args(struct jit_ctx *ctx, int bargs_off, int nregs) +{ + int reg; + + for (reg = 0; reg < nregs; reg++) { + emit(A64_LDR64I(reg, A64_SP, bargs_off), ctx); + bargs_off += 8; } } @@ -2243,17 +2345,21 @@ static bool is_struct_ops_tramp(const struct bpf_tramp_links *fentry_links) */ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, struct bpf_tramp_links *tlinks, void *func_addr, - int nregs, u32 flags) + const struct btf_func_model *m, + const struct arg_aux *a, + u32 flags) { int i; int stack_size; int retaddr_off; int regs_off; int retval_off; - int args_off; - int nregs_off; + int bargs_off; + int nfuncargs_off; int ip_off; int run_ctx_off; + int oargs_off; + int nfuncargs; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; @@ -2262,31 +2368,38 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, bool is_struct_ops = is_struct_ops_tramp(fentry); /* trampoline stack layout: - * [ parent ip ] - * [ FP ] - * SP + retaddr_off [ self ip ] - * [ FP ] + * [ parent ip ] + * [ FP ] + * SP + retaddr_off [ self ip ] + * [ FP ] * - * [ padding ] align SP to multiples of 16 + * [ padding ] align SP to multiples of 16 * - * [ x20 ] callee saved reg x20 - * SP + regs_off [ x19 ] callee saved reg x19 + * [ x20 ] callee saved reg x20 + * SP + regs_off [ x19 ] callee saved reg x19 * - * SP + retval_off [ return value ] BPF_TRAMP_F_CALL_ORIG or - * BPF_TRAMP_F_RET_FENTRY_RET + * SP + retval_off [ return value ] BPF_TRAMP_F_CALL_ORIG or + * BPF_TRAMP_F_RET_FENTRY_RET + * [ arg reg N ] + * [ ... ] + * SP + bargs_off [ arg reg 1 ] for bpf * - * [ arg reg N ] - * [ ... ] - * SP + args_off [ arg reg 1 ] + * SP + nfuncargs_off [ arg regs count ] * - * SP + nregs_off [ arg regs count ] + * SP + ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * - * SP + ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag + * SP + run_ctx_off [ bpf_tramp_run_ctx ] * - * SP + run_ctx_off [ bpf_tramp_run_ctx ] + * [ stack arg N ] + * [ ... 
] + * SP + oargs_off [ stack arg 1 ] for original func */ stack_size = 0; + oargs_off = stack_size; + if (flags & BPF_TRAMP_F_CALL_ORIG) + stack_size += a->ostack_for_args; + run_ctx_off = stack_size; /* room for bpf_tramp_run_ctx */ stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8); @@ -2296,13 +2409,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; - nregs_off = stack_size; + nfuncargs_off = stack_size; /* room for args count */ stack_size += 8; - args_off = stack_size; + bargs_off = stack_size; /* room for args */ - stack_size += nregs * 8; + nfuncargs = a->regs_for_args + a->bstack_for_args / 8; + stack_size += 8 * nfuncargs; /* room for return value */ retval_off = stack_size; @@ -2349,11 +2463,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } /* save arg regs count*/ - emit(A64_MOVZ(1, A64_R(10), nregs, 0), ctx); - emit(A64_STR64I(A64_R(10), A64_SP, nregs_off), ctx); + emit(A64_MOVZ(1, A64_R(10), nfuncargs, 0), ctx); + emit(A64_STR64I(A64_R(10), A64_SP, nfuncargs_off), ctx); - /* save arg regs */ - save_args(ctx, args_off, nregs); + /* save args for bpf */ + save_args(ctx, bargs_off, oargs_off, m, a, false); /* save callee saved registers */ emit(A64_STR64I(A64_R(19), A64_SP, regs_off), ctx); @@ -2369,7 +2483,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } for (i = 0; i < fentry->nr_links; i++) - invoke_bpf_prog(ctx, fentry->links[i], args_off, + invoke_bpf_prog(ctx, fentry->links[i], bargs_off, retval_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET); @@ -2379,12 +2493,13 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, if (!branches) return -ENOMEM; - invoke_bpf_mod_ret(ctx, fmod_ret, args_off, retval_off, + invoke_bpf_mod_ret(ctx, fmod_ret, bargs_off, retval_off, run_ctx_off, branches); } if (flags & BPF_TRAMP_F_CALL_ORIG) { - restore_args(ctx, args_off, nregs); + /* save args for original func */ + save_args(ctx, bargs_off, oargs_off, m, a, true); /* call original func */ emit(A64_LDR64I(A64_R(10), A64_SP, retaddr_off), ctx); emit(A64_ADR(A64_LR, AARCH64_INSN_SIZE * 2), ctx); @@ -2403,7 +2518,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } for (i = 0; i < fexit->nr_links; i++) - invoke_bpf_prog(ctx, fexit->links[i], args_off, retval_off, + invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off, run_ctx_off, false); if (flags & BPF_TRAMP_F_CALL_ORIG) { @@ -2417,7 +2532,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } if (flags & BPF_TRAMP_F_RESTORE_REGS) - restore_args(ctx, args_off, nregs); + restore_args(ctx, bargs_off, a->regs_for_args); /* restore callee saved register x19 and x20 */ emit(A64_LDR64I(A64_R(19), A64_SP, regs_off), ctx); @@ -2454,21 +2569,6 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, return ctx->idx; } -static int btf_func_model_nregs(const struct btf_func_model *m) -{ - int nregs = m->nr_args; - int i; - - /* extra registers needed for struct argument */ - for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) { - /* The arg_size is at most 16 bytes, enforced by the verifier. 
*/ - if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) - nregs += (m->arg_size[i] + 7) / 8 - 1; - } - - return nregs; -} - int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { @@ -2477,14 +2577,14 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, .idx = 0, }; struct bpf_tramp_image im; - int nregs, ret; + struct arg_aux aaux; + int ret; - nregs = btf_func_model_nregs(m); - /* the first 8 registers are used for arguments */ - if (nregs > 8) - return -ENOTSUPP; + ret = calc_arg_aux(m, &aaux); + if (ret < 0) + return ret; - ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nregs, flags); + ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, m, &aaux, flags); if (ret < 0) return ret; @@ -2511,9 +2611,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { - int ret, nregs; - void *image, *tmp; u32 size = ro_image_end - ro_image; + struct arg_aux aaux; + void *image, *tmp; + int ret; /* image doesn't need to be in module memory range, so we can * use kvmalloc. @@ -2529,13 +2630,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, .write = true, }; - nregs = btf_func_model_nregs(m); - /* the first 8 registers are used for arguments */ - if (nregs > 8) - return -ENOTSUPP; jit_fill_hole(image, (unsigned int)(ro_image_end - ro_image)); - ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags); + ret = calc_arg_aux(m, &aaux); + if (ret) + goto out; + ret = prepare_trampoline(&ctx, im, tlinks, func_addr, m, &aaux, flags); if (ret > 0 && validate_code(&ctx) < 0) { ret = -EINVAL; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 772c1b008e43..10effd4cff6b 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -28,6 +28,7 @@ HAS_EPAN HAS_EVT HAS_FPMR HAS_FGT +HAS_FGT2 HAS_FPSIMD HAS_GCS HAS_GENERIC_AUTH @@ -94,6 +95,7 @@ WORKAROUND_2457168 WORKAROUND_2645198 WORKAROUND_2658417 WORKAROUND_AMPERE_AC03_CPU_38 +WORKAROUND_AMPERE_AC04_CPU_23 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE WORKAROUND_TRBE_WRITE_OUT_OF_RANGE diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index bdf044c5d11b..8a8cf6874298 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -101,6 +101,17 @@ Res0 63:32 Field 31:0 DTRTX EndSysreg +Sysreg MDSELR_EL1 2 0 0 4 2 +Res0 63:6 +Field 5:4 BANK +Res0 3:0 +EndSysreg + +Sysreg MDSTEPOP_EL1 2 0 0 5 2 +Res0 63:32 +Field 31:0 OPCODE +EndSysreg + Sysreg OSECCR_EL1 2 0 0 6 2 Res0 63:32 Field 31:0 EDECCR @@ -111,6 +122,285 @@ Res0 63:1 Field 0 OSLK EndSysreg +Sysreg SPMACCESSR_EL1 2 0 9 13 3 +UnsignedEnum 63:62 P31 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 61:60 P30 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 59:58 P29 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 57:56 P28 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 55:54 P27 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 53:52 P26 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 51:50 P25 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 49:48 P24 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 47:46 P23 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 45:44 P22 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 43:42 P21 + 0b00 TRAP_RW + 0b01 TRAP_W 
+ 0b11 NOTRAP +EndEnum +UnsignedEnum 41:40 P20 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 39:38 P19 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 37:36 P18 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 35:34 P17 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 33:32 P16 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 31:30 P15 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 29:28 P14 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 27:26 P13 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 25:24 P12 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 23:22 P11 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 21:20 P10 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 19:18 P9 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 17:16 P8 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 15:14 P7 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 13:12 P6 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 11:10 P5 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 9:8 P4 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 7:6 P3 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 5:4 P2 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 3:2 P1 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +UnsignedEnum 1:0 P0 + 0b00 TRAP_RW + 0b01 TRAP_W + 0b11 NOTRAP +EndEnum +EndSysreg + +Sysreg SPMACCESSR_EL12 2 5 9 13 3 +Mapping SPMACCESSR_EL1 +EndSysreg + +Sysreg SPMIIDR_EL1 2 0 9 13 4 +Res0 63:32 +Field 31:20 ProductID +Field 19:16 Variant +Field 15:12 Revision +Field 11:0 Implementer +EndSysreg + +Sysreg SPMDEVARCH_EL1 2 0 9 13 5 +Res0 63:32 +Field 31:21 ARCHITECT +Field 20 PRESENT +Field 19:16 REVISION +Field 15:12 ARCHVER +Field 11:0 ARCHPART +EndSysreg + +Sysreg SPMDEVAFF_EL1 2 0 9 13 6 +Res0 63:40 +Field 39:32 Aff3 +Field 31 F0V +Field 30 U +Res0 29:25 +Field 24 MT +Field 23:16 Aff2 +Field 15:8 Aff1 +Field 7:0 Aff0 +EndSysreg + +Sysreg SPMCFGR_EL1 2 0 9 13 7 +Res0 63:32 +Field 31:28 NCG +Res0 27:25 +Field 24 HDBG +Field 23 TRO +Field 22 SS +Field 21 FZO +Field 20 MSI +Field 19 RAO +Res0 18 +Field 17 NA +Field 16 EX +Field 15:14 RAZ +Field 13:8 SIZE +Field 7:0 N +EndSysreg + +Sysreg SPMINTENSET_EL1 2 0 9 14 1 +Field 63:0 P +EndSysreg + +Sysreg SPMINTENCLR_EL1 2 0 9 14 2 +Field 63:0 P +EndSysreg + +Sysreg PMCCNTSVR_EL1 2 0 14 11 7 +Field 63:0 CCNT +EndSysreg + +Sysreg PMICNTSVR_EL1 2 0 14 12 0 +Field 63:0 ICNT +EndSysreg + +Sysreg SPMCR_EL0 2 3 9 12 0 +Res0 63:12 +Field 11 TRO +Field 10 HDBG +Field 9 FZO +Field 8 NA +Res0 7:5 +Field 4 EX +Res0 3:2 +Field 1 P +Field 0 E +EndSysreg + +Sysreg SPMCNTENSET_EL0 2 3 9 12 1 +Field 63:0 P +EndSysreg + +Sysreg SPMCNTENCLR_EL0 2 3 9 12 2 +Field 63:0 P +EndSysreg + +Sysreg SPMOVSCLR_EL0 2 3 9 12 3 +Field 63:0 P +EndSysreg + +Sysreg SPMZR_EL0 2 3 9 12 4 +Field 63:0 P +EndSysreg + +Sysreg SPMSELR_EL0 2 3 9 12 5 +Res0 63:10 +Field 9:4 SYSPMUSEL +Res0 3:2 +Field 1:0 BANK +EndSysreg + +Sysreg SPMOVSSET_EL0 2 3 9 14 3 +Field 63:0 P +EndSysreg + +Sysreg SPMSCR_EL1 2 7 9 14 7 +Field 63:32 IMPDEF +Field 31 RAO +Res0 30:5 +Field 4 NAO +Res0 3:1 +Field 0 SO +EndSysreg + Sysreg ID_PFR0_EL1 3 0 0 1 0 Res0 63:32 UnsignedEnum 31:28 RAS @@ -907,6 +1197,7 @@ UnsignedEnum 31:28 RAS 0b0000 NI 0b0001 IMP 0b0010 V1P1 + 0b0011 V2 EndEnum UnsignedEnum 27:24 GIC 0b0000 
NI @@ -1466,6 +1757,7 @@ UnsignedEnum 63:60 LS64 0b0001 LS64 0b0010 LS64_V 0b0011 LS64_ACCDATA + 0b0100 LS64WB EndEnum UnsignedEnum 59:56 XS 0b0000 NI @@ -1945,12 +2237,21 @@ EndEnum EndSysreg Sysreg ID_AA64MMFR4_EL1 3 0 0 7 4 -Res0 63:40 +Res0 63:48 +UnsignedEnum 47:44 SRMASK + 0b0000 NI + 0b0001 IMP +EndEnum +Res0 43:40 UnsignedEnum 39:36 E3DSE 0b0000 NI 0b0001 IMP EndEnum -Res0 35:28 +Res0 35:32 +UnsignedEnum 31:28 RMEGDI + 0b0000 NI + 0b0001 IMP +EndEnum SignedEnum 27:24 E2H0 0b0000 IMP 0b1110 NI_NV1 @@ -1959,6 +2260,7 @@ EndEnum UnsignedEnum 23:20 NV_frac 0b0000 NV_NV2 0b0001 NV2_ONLY + 0b0010 NV2P1 EndEnum UnsignedEnum 19:16 FGWTE3 0b0000 NI @@ -1978,7 +2280,10 @@ SignedEnum 7:4 EIESB 0b0010 ToELx 0b1111 ANY EndEnum -Res0 3:0 +UnsignedEnum 3:0 PoPS + 0b0000 NI + 0b0001 IMP +EndEnum EndSysreg Sysreg SCTLR_EL1 3 0 1 0 0 @@ -2053,8 +2358,30 @@ Field 1 A Field 0 M EndSysreg +Sysreg SCTLR_EL12 3 5 1 0 0 +Mapping SCTLR_EL1 +EndSysreg + +Sysreg SCTLRALIAS_EL1 3 0 1 4 6 +Mapping SCTLR_EL1 +EndSysreg + +Sysreg ACTLR_EL1 3 0 1 0 1 +Field 63:0 IMPDEF +EndSysreg + +Sysreg ACTLR_EL12 3 5 1 0 1 +Mapping ACTLR_EL1 +EndSysreg + +Sysreg ACTLRALIAS_EL1 3 0 1 4 5 +Mapping ACTLR_EL1 +EndSysreg + Sysreg CPACR_EL1 3 0 1 0 2 -Res0 63:30 +Res0 63:32 +Field 31 TCPAC +Field 30 TAM Field 29 E0POE Field 28 TTA Res0 27:26 @@ -2066,6 +2393,323 @@ Field 17:16 ZEN Res0 15:0 EndSysreg +Sysreg CPACR_EL12 3 5 1 0 2 +Mapping CPACR_EL1 +EndSysreg + +Sysreg CPACRALIAS_EL1 3 0 1 4 4 +Mapping CPACR_EL1 +EndSysreg + +Sysreg ACTLRMASK_EL1 3 0 1 4 1 +Field 63:0 IMPDEF +EndSysreg + +Sysreg ACTLRMASK_EL12 3 5 1 4 1 +Mapping ACTLRMASK_EL1 +EndSysreg + +Sysreg CPACRMASK_EL1 3 0 1 4 2 +Res0 63:32 +Field 31 TCPAC +Field 30 TAM +Field 29 E0POE +Field 28 TTA +Res0 27:25 +Field 24 SMEN +Res0 23:21 +Field 20 FPEN +Res0 19:17 +Field 16 ZEN +Res0 15:0 +EndSysreg + +Sysreg CPACRMASK_EL12 3 5 1 4 2 +Mapping CPACRMASK_EL1 +EndSysreg + +Sysreg PFAR_EL1 3 0 6 0 5 +Field 63 NS +Field 62 NSE +Res0 61:56 +Field 55:52 PA_55_52 +Field 51:48 PA_51_48 +Field 47:0 PA +EndSysreg + +Sysreg PFAR_EL12 3 5 6 0 5 +Mapping PFAR_EL1 +EndSysreg + +Sysreg RCWSMASK_EL1 3 0 13 0 3 +Field 63:0 RCWSMASK +EndSysreg + +Sysreg SCTLR2_EL1 3 0 1 0 3 +Res0 63:13 +Field 12 CPTM0 +Field 11 CPTM +Field 10 CPTA0 +Field 9 CPTA +Field 8 EnPACM0 +Field 7 EnPACM +Field 6 EnIDCP128 +Field 5 EASE +Field 4 EnANERR +Field 3 EnADERR +Field 2 NMEA +Res0 1:0 +EndSysreg + +Sysreg SCTLR2_EL12 3 5 1 0 3 +Mapping SCTLR2_EL1 +EndSysreg + +Sysreg SCTLR2ALIAS_EL1 3 0 1 4 7 +Mapping SCTLR2_EL1 +EndSysreg + +Sysreg SCTLR2MASK_EL1 3 0 1 4 3 +Res0 63:13 +Field 12 CPTM0 +Field 11 CPTM +Field 10 CPTA0 +Field 9 CPTA +Field 8 EnPACM0 +Field 7 EnPACM +Field 6 EnIDCP128 +Field 5 EASE +Field 4 EnANERR +Field 3 EnADERR +Field 2 NMEA +Res0 1:0 +EndSysreg + +Sysreg SCTLR2MASK_EL12 3 5 1 4 3 +Mapping SCTLR2MASK_EL1 +EndSysreg + +Sysreg SCTLRMASK_EL1 3 0 1 4 0 +Field 63 TIDCP +Field 62 SPINTMASK +Field 61 NMI +Field 60 EnTP2 +Field 59 TCSO +Field 58 TCSO0 +Field 57 EPAN +Field 56 EnALS +Field 55 EnAS0 +Field 54 EnASR +Field 53 TME +Field 52 TME0 +Field 51 TMT +Field 50 TMT0 +Res0 49:47 +Field 46 TWEDEL +Field 45 TWEDEn +Field 44 DSSBS +Field 43 ATA +Field 42 ATA0 +Res0 41 +Field 40 TCF +Res0 39 +Field 38 TCF0 +Field 37 ITFSB +Field 36 BT1 +Field 35 BT0 +Field 34 EnFPM +Field 33 MSCEn +Field 32 CMOW +Field 31 EnIA +Field 30 EnIB +Field 29 LSMAOE +Field 28 nTLSMD +Field 27 EnDA +Field 26 UCI +Field 25 EE +Field 24 E0E +Field 23 SPAN +Field 22 EIS +Field 21 IESB +Field 20 TSCXT +Field 19 WXN +Field 18 nTWE 
+Res0 17 +Field 16 nTWI +Field 15 UCT +Field 14 DZE +Field 13 EnDB +Field 12 I +Field 11 EOS +Field 10 EnRCTX +Field 9 UMA +Field 8 SED +Field 7 ITD +Field 6 nAA +Field 5 CP15BEN +Field 4 SA0 +Field 3 SA +Field 2 C +Field 1 A +Field 0 M +EndSysreg + +Sysreg SCTLRMASK_EL12 3 5 1 4 0 +Mapping SCTLRMASK_EL1 +EndSysreg + +Sysreg TCR2MASK_EL1 3 0 2 7 3 +Res0 63:22 +Field 21 FNGNA1 +Field 20 FNGNA0 +Res0 19 +Field 18 FNG1 +Field 17 FNG0 +Field 16 A2 +Field 15 DisCH1 +Field 14 DisCH0 +Res0 13:12 +Field 11 HAFT +Field 10 PTTWI +Res0 9:6 +Field 5 D128 +Field 4 AIE +Field 3 POE +Field 2 E0POE +Field 1 PIE +Field 0 PnCH +EndSysreg + +Sysreg TCR2MASK_EL12 3 5 2 7 3 +Mapping TCR2MASK_EL1 +EndSysreg + +Sysreg TCRMASK_EL1 3 0 2 7 2 +Res0 63:62 +Field 61 MTX1 +Field 60 MTX0 +Field 59 DS +Field 58 TCMA1 +Field 57 TCMA0 +Field 56 E0PD1 +Field 55 E0PD0 +Field 54 NFD1 +Field 53 NFD0 +Field 52 TBID1 +Field 51 TBID0 +Field 50 HWU162 +Field 49 HWU161 +Field 48 HWU160 +Field 47 HWU159 +Field 46 HWU062 +Field 45 HWU061 +Field 44 HWU060 +Field 43 HWU059 +Field 42 HPD1 +Field 41 HPD0 +Field 40 HD +Field 39 HA +Field 38 TBI1 +Field 37 TBI0 +Field 36 AS +Res0 35:33 +Field 32 IPS +Res0 31 +Field 30 TG1 +Res0 29 +Field 28 SH1 +Res0 27 +Field 26 ORGN1 +Res0 25 +Field 24 IRGN1 +Field 23 EPD1 +Field 22 A1 +Res0 21:17 +Field 16 T1SZ +Res0 15 +Field 14 TG0 +Res0 13 +Field 12 SH0 +Res0 11 +Field 10 ORGN0 +Res0 9 +Field 8 IRGN0 +Field 7 EPD0 +Res0 6:1 +Field 0 T0SZ +EndSysreg + +Sysreg TCRMASK_EL12 3 5 2 7 2 +Mapping TCRMASK_EL1 +EndSysreg + +Sysreg ERXGSR_EL1 3 0 5 3 2 +Field 63 S63 +Field 62 S62 +Field 61 S61 +Field 60 S60 +Field 59 S59 +Field 58 S58 +Field 57 S57 +Field 56 S56 +Field 55 S55 +Field 54 S54 +Field 53 S53 +Field 52 S52 +Field 51 S51 +Field 50 S50 +Field 49 S49 +Field 48 S48 +Field 47 S47 +Field 46 S46 +Field 45 S45 +Field 44 S44 +Field 43 S43 +Field 42 S42 +Field 41 S41 +Field 40 S40 +Field 39 S39 +Field 38 S38 +Field 37 S37 +Field 36 S36 +Field 35 S35 +Field 34 S34 +Field 33 S33 +Field 32 S32 +Field 31 S31 +Field 30 S30 +Field 29 S29 +Field 28 S28 +Field 27 S27 +Field 26 S26 +Field 25 S25 +Field 24 S24 +Field 23 S23 +Field 22 S22 +Field 21 S21 +Field 20 S20 +Field 19 S19 +Field 18 S18 +Field 17 S17 +Field 16 S16 +Field 15 S15 +Field 14 S14 +Field 13 S13 +Field 12 S12 +Field 11 S11 +Field 10 S10 +Field 9 S9 +Field 8 S8 +Field 7 S7 +Field 6 S6 +Field 5 S5 +Field 4 S4 +Field 3 S3 +Field 2 S2 +Field 1 S1 +Field 0 S0 +EndSysreg + Sysreg TRFCR_EL1 3 0 1 2 1 Res0 63:7 UnsignedEnum 6:5 TS @@ -2078,6 +2722,16 @@ Field 1 ExTRE Field 0 E0TRE EndSysreg +Sysreg TRCITECR_EL1 3 0 1 2 3 +Res0 63:2 +Field 1 E1E +Field 0 E0E +EndSysreg + +Sysreg TRCITECR_EL12 3 5 1 2 3 +Mapping TRCITECR_EL1 +EndSysreg + Sysreg SMPRI_EL1 3 0 1 2 4 Res0 63:4 Field 3:0 PRIORITY @@ -2226,7 +2880,28 @@ Field 15:0 MINLAT EndSysreg Sysreg PMSIDR_EL1 3 0 9 9 7 -Res0 63:25 +Res0 63:33 +UnsignedEnum 32 SME + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 31:28 ALTCLK + 0b0000 NI + 0b0001 IMP + 0b1111 IMPDEF +EndEnum +UnsignedEnum 27 FPF + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 26 EFT + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 25 CRR + 0b0 NI + 0b1 IMP +EndEnum Field 24 PBT Field 23:20 FORMAT Enum 19:16 COUNTSIZE @@ -2244,7 +2919,10 @@ Enum 11:8 INTERVAL 0b0111 3072 0b1000 4096 EndEnum -Res0 7 +UnsignedEnum 7 FDS + 0b0 NI + 0b1 IMP +EndEnum Field 6 FnE Field 5 ERND Field 4 LDS @@ -2287,6 +2965,16 @@ Field 16 COLL Field 15:0 MSS EndSysreg +Sysreg PMSDSFR_EL1 3 0 9 10 4 +Field 63:0 S +EndSysreg + +Sysreg PMBMAR_EL1 3 0 9 10 5 +Res0 63:10 +Field 9:8 SH +Field 7:0 
Attr +EndSysreg + Sysreg PMBIDR_EL1 3 0 9 10 7 Res0 63:12 Enum 11:8 EA @@ -2300,6 +2988,21 @@ Field 4 P Field 3:0 ALIGN EndSysreg +Sysreg TRBMPAM_EL1 3 0 9 11 5 +Res0 63:27 +Field 26 EN +Field 25:24 MPAM_SP +Field 23:16 PMG +Field 15:0 PARTID +EndSysreg + +Sysreg PMSSCR_EL1 3 0 9 13 3 +Res0 63:33 +Field 32 NC +Res0 31:1 +Field 0 SS +EndSysreg + Sysreg PMUACR_EL1 3 0 9 14 4 Res0 63:33 Field 32 F0 @@ -2307,11 +3010,29 @@ Field 31 C Field 30:0 P EndSysreg +Sysreg PMECR_EL1 3 0 9 14 5 +Res0 63:5 +Field 4:3 SSE +Field 2 KPME +Field 1:0 PMEE +EndSysreg + +Sysreg PMIAR_EL1 3 0 9 14 7 +Field 63:0 ADDRESS +EndSysreg + Sysreg PMSELR_EL0 3 3 9 12 5 Res0 63:5 Field 4:0 SEL EndSysreg +Sysreg PMZR_EL0 3 3 9 13 4 +Res0 63:33 +Field 32 F0 +Field 31 C +Field 30:0 P +EndSysreg + SysregFields CONTEXTIDR_ELx Res0 63:32 Field 31:0 PROCID @@ -2450,7 +3171,110 @@ UnsignedEnum 2:0 F8S1 EndEnum EndSysreg -SysregFields HFGxTR_EL2 +Sysreg HCR_EL2 3 4 1 1 0 +Field 63:60 TWEDEL +Field 59 TWEDEn +Field 58 TID5 +Field 57 DCT +Field 56 ATA +Field 55 TTLBOS +Field 54 TTLBIS +Field 53 EnSCXT +Field 52 TOCU +Field 51 AMVOFFEN +Field 50 TICAB +Field 49 TID4 +Field 48 GPF +Field 47 FIEN +Field 46 FWB +Field 45 NV2 +Field 44 AT +Field 43 NV1 +Field 42 NV +Field 41 API +Field 40 APK +Field 39 TME +Field 38 MIOCNCE +Field 37 TEA +Field 36 TERR +Field 35 TLOR +Field 34 E2H +Field 33 ID +Field 32 CD +Field 31 RW +Field 30 TRVM +Field 29 HCD +Field 28 TDZ +Field 27 TGE +Field 26 TVM +Field 25 TTLB +Field 24 TPU +Field 23 TPCP +Field 22 TSW +Field 21 TACR +Field 20 TIDCP +Field 19 TSC +Field 18 TID3 +Field 17 TID2 +Field 16 TID1 +Field 15 TID0 +Field 14 TWE +Field 13 TWI +Field 12 DC +UnsignedEnum 11:10 BSU + 0b00 NONE + 0b01 IS + 0b10 OS + 0b11 FS +EndEnum +Field 9 FB +Field 8 VSE +Field 7 VI +Field 6 VF +Field 5 AMO +Field 4 IMO +Field 3 FMO +Field 2 PTW +Field 1 SWIO +Field 0 VM +EndSysreg + +Sysreg MDCR_EL2 3 4 1 1 1 +Res0 63:51 +Field 50 EnSTEPOP +Res0 49:44 +Field 43 EBWE +Res0 42 +Field 41:40 PMEE +Res0 39:37 +Field 36 HPMFZS +Res0 35:32 +Field 31:30 PMSSE +Field 29 HPMFZO +Field 28 MTPME +Field 27 TDCC +Field 26 HLP +Field 25:24 E2TB +Field 23 HCCD +Res0 22:20 +Field 19 TTRF +Res0 18 +Field 17 HPMD +Res0 16 +Field 15 EnSPM +Field 14 TPMS +Field 13:12 E2PB +Field 11 TDRA +Field 10 TDOSA +Field 9 TDA +Field 8 TDE +Field 7 HPME +Field 6 TPM +Field 5 TPMCR +Field 4:0 HPMN +EndSysreg + +Sysreg HFGRTR_EL2 3 4 1 1 4 Field 63 nAMAIR2_EL1 Field 62 nMAIR2_EL1 Field 61 nS2POR_EL1 @@ -2515,53 +3339,74 @@ Field 3 AMAIR_EL1 Field 2 AIDR_EL1 Field 1 AFSR1_EL1 Field 0 AFSR0_EL1 -EndSysregFields - -Sysreg MDCR_EL2 3 4 1 1 1 -Res0 63:51 -Field 50 EnSTEPOP -Res0 49:44 -Field 43 EBWE -Res0 42 -Field 41:40 PMEE -Res0 39:37 -Field 36 HPMFZS -Res0 35:32 -Field 31:30 PMSSE -Field 29 HPMFZO -Field 28 MTPME -Field 27 TDCC -Field 26 HLP -Field 25:24 E2TB -Field 23 HCCD -Res0 22:20 -Field 19 TTRF -Res0 18 -Field 17 HPMD -Res0 16 -Field 15 EnSPM -Field 14 TPMS -Field 13:12 E2PB -Field 11 TDRA -Field 10 TDOSA -Field 9 TDA -Field 8 TDE -Field 7 HPME -Field 6 TPM -Field 5 TPMCR -Field 4:0 HPMN -EndSysreg - -Sysreg HFGRTR_EL2 3 4 1 1 4 -Fields HFGxTR_EL2 EndSysreg Sysreg HFGWTR_EL2 3 4 1 1 5 -Fields HFGxTR_EL2 +Field 63 nAMAIR2_EL1 +Field 62 nMAIR2_EL1 +Field 61 nS2POR_EL1 +Field 60 nPOR_EL1 +Field 59 nPOR_EL0 +Field 58 nPIR_EL1 +Field 57 nPIRE0_EL1 +Field 56 nRCWMASK_EL1 +Field 55 nTPIDR2_EL0 +Field 54 nSMPRI_EL1 +Field 53 nGCS_EL1 +Field 52 nGCS_EL0 +Res0 51 +Field 50 nACCDATA_EL1 +Field 49 ERXADDR_EL1 +Field 48 ERXPFGCDN_EL1 +Field 47 ERXPFGCTL_EL1 +Res0 
46 +Field 45 ERXMISCn_EL1 +Field 44 ERXSTATUS_EL1 +Field 43 ERXCTLR_EL1 +Res0 42 +Field 41 ERRSELR_EL1 +Res0 40 +Field 39 ICC_IGRPENn_EL1 +Field 38 VBAR_EL1 +Field 37 TTBR1_EL1 +Field 36 TTBR0_EL1 +Field 35 TPIDR_EL0 +Field 34 TPIDRRO_EL0 +Field 33 TPIDR_EL1 +Field 32 TCR_EL1 +Field 31 SCXTNUM_EL0 +Field 30 SCXTNUM_EL1 +Field 29 SCTLR_EL1 +Res0 28 +Field 27 PAR_EL1 +Res0 26:25 +Field 24 MAIR_EL1 +Field 23 LORSA_EL1 +Field 22 LORN_EL1 +Res0 21 +Field 20 LOREA_EL1 +Field 19 LORC_EL1 +Res0 18 +Field 17 FAR_EL1 +Field 16 ESR_EL1 +Res0 15:14 +Field 13 CSSELR_EL1 +Field 12 CPACR_EL1 +Field 11 CONTEXTIDR_EL1 +Res0 10:9 +Field 8 APIBKey +Field 7 APIAKey +Field 6 APGAKey +Field 5 APDBKey +Field 4 APDAKey +Field 3 AMAIR_EL1 +Res0 2 +Field 1 AFSR1_EL1 +Field 0 AFSR0_EL1 EndSysreg Sysreg HFGITR_EL2 3 4 1 1 6 -Res0 63 +Field 63 PSBCSYNC Field 62 ATS1E1A Res0 61 Field 60 COSPRCTX @@ -2971,6 +3816,12 @@ Sysreg SMCR_EL2 3 4 1 2 6 Fields SMCR_ELx EndSysreg +Sysreg VNCR_EL2 3 4 2 2 0 +Field 63:57 RESS +Field 56:12 BADDR +Res0 11:0 +EndSysreg + Sysreg GCSCR_EL2 3 4 2 5 0 Fields GCSCR_ELx EndSysreg @@ -3244,6 +4095,60 @@ Sysreg TTBR1_EL1 3 0 2 0 1 Fields TTBRx_EL1 EndSysreg +Sysreg TCR_EL1 3 0 2 0 2 +Res0 63:62 +Field 61 MTX1 +Field 60 MTX0 +Field 59 DS +Field 58 TCMA1 +Field 57 TCMA0 +Field 56 E0PD1 +Field 55 E0PD0 +Field 54 NFD1 +Field 53 NFD0 +Field 52 TBID1 +Field 51 TBID0 +Field 50 HWU162 +Field 49 HWU161 +Field 48 HWU160 +Field 47 HWU159 +Field 46 HWU062 +Field 45 HWU061 +Field 44 HWU060 +Field 43 HWU059 +Field 42 HPD1 +Field 41 HPD0 +Field 40 HD +Field 39 HA +Field 38 TBI1 +Field 37 TBI0 +Field 36 AS +Res0 35 +Field 34:32 IPS +Field 31:30 TG1 +Field 29:28 SH1 +Field 27:26 ORGN1 +Field 25:24 IRGN1 +Field 23 EPD1 +Field 22 A1 +Field 21:16 T1SZ +Field 15:14 TG0 +Field 13:12 SH0 +Field 11:10 ORGN0 +Field 9:8 IRGN0 +Field 7 EPD0 +Res0 6 +Field 5:0 T0SZ +EndSysreg + +Sysreg TCR_EL12 3 5 2 0 2 +Mapping TCR_EL1 +EndSysreg + +Sysreg TCRALIAS_EL1 3 0 2 7 6 +Mapping TCR_EL1 +EndSysreg + Sysreg TCR2_EL1 3 0 2 0 3 Res0 63:16 Field 15 DisCH1 @@ -3264,6 +4169,10 @@ Sysreg TCR2_EL12 3 5 2 0 3 Mapping TCR2_EL1 EndSysreg +Sysreg TCR2ALIAS_EL1 3 0 2 7 7 +Mapping TCR2_EL1 +EndSysreg + Sysreg TCR2_EL2 3 4 2 0 3 Res0 63:16 Field 15 DisCH1 @@ -3525,7 +4434,12 @@ Field 31:0 TRG EndSysreg Sysreg TRBIDR_EL1 3 0 9 11 7 -Res0 63:12 +Res0 63:16 +UnsignedEnum 15:12 MPAM + 0b0000 NI + 0b0001 DEFAULT + 0b0010 IMP +EndEnum Enum 11:8 EA 0b0000 NON_DESC 0b0001 IGNORE diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index f457c2662e2f..a3c4cc46c892 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -301,7 +301,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); /* MMU handling */ void kvm_flush_tlb_all(void); void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa); -int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write); +int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write, int ecode); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h index 2c349f961bfb..f1efd7cfbc20 100644 --- a/arch/loongarch/include/asm/kvm_vcpu.h +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -37,7 +37,7 @@ #define KVM_LOONGSON_IRQ_NUM_MASK 0xffff typedef union loongarch_instruction larch_inst; -typedef int 
(*exit_handle_fn)(struct kvm_vcpu *); +typedef int (*exit_handle_fn)(struct kvm_vcpu *, int); int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst); int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst); diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index ea321403644a..fa52251b3bf1 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -341,7 +341,7 @@ static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu) * 2) Execute CACOP/IDLE instructions; * 3) Access to unimplemented CSRs/IOCSRs. */ -static int kvm_handle_gspr(struct kvm_vcpu *vcpu) +static int kvm_handle_gspr(struct kvm_vcpu *vcpu, int ecode) { int ret = RESUME_GUEST; enum emulation_result er = EMULATE_DONE; @@ -661,7 +661,7 @@ int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst) return ret; } -static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write) +static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write, int ecode) { int ret; larch_inst inst; @@ -675,7 +675,7 @@ static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write) return RESUME_GUEST; } - ret = kvm_handle_mm_fault(vcpu, badv, write); + ret = kvm_handle_mm_fault(vcpu, badv, write, ecode); if (ret) { /* Treat as MMIO */ inst.word = vcpu->arch.badi; @@ -705,14 +705,14 @@ static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write) return ret; } -static int kvm_handle_read_fault(struct kvm_vcpu *vcpu) +static int kvm_handle_read_fault(struct kvm_vcpu *vcpu, int ecode) { - return kvm_handle_rdwr_fault(vcpu, false); + return kvm_handle_rdwr_fault(vcpu, false, ecode); } -static int kvm_handle_write_fault(struct kvm_vcpu *vcpu) +static int kvm_handle_write_fault(struct kvm_vcpu *vcpu, int ecode) { - return kvm_handle_rdwr_fault(vcpu, true); + return kvm_handle_rdwr_fault(vcpu, true, ecode); } int kvm_complete_user_service(struct kvm_vcpu *vcpu, struct kvm_run *run) @@ -726,11 +726,12 @@ int kvm_complete_user_service(struct kvm_vcpu *vcpu, struct kvm_run *run) /** * kvm_handle_fpu_disabled() - Guest used fpu however it is disabled at host * @vcpu: Virtual CPU context. + * @ecode: Exception code. * * Handle when the guest attempts to use fpu which hasn't been allowed * by the root context. */ -static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu) +static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu, int ecode) { struct kvm_run *run = vcpu->run; @@ -783,11 +784,12 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu) /* * kvm_handle_lsx_disabled() - Guest used LSX while disabled in root. * @vcpu: Virtual CPU context. + * @ecode: Exception code. * * Handle when the guest attempts to use LSX when it is disabled in the root * context. */ -static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu) +static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu, int ecode) { if (kvm_own_lsx(vcpu)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); @@ -798,11 +800,12 @@ static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu) /* * kvm_handle_lasx_disabled() - Guest used LASX while disabled in root. * @vcpu: Virtual CPU context. + * @ecode: Exception code. * * Handle when the guest attempts to use LASX when it is disabled in the root * context. 
*/ -static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu) +static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu, int ecode) { if (kvm_own_lasx(vcpu)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); @@ -810,7 +813,7 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu) return RESUME_GUEST; } -static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu) +static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu, int ecode) { if (kvm_own_lbt(vcpu)) kvm_queue_exception(vcpu, EXCCODE_INE, 0); @@ -872,7 +875,7 @@ static void kvm_handle_service(struct kvm_vcpu *vcpu) kvm_write_reg(vcpu, LOONGARCH_GPR_A0, ret); } -static int kvm_handle_hypercall(struct kvm_vcpu *vcpu) +static int kvm_handle_hypercall(struct kvm_vcpu *vcpu, int ecode) { int ret; larch_inst inst; @@ -932,16 +935,14 @@ static int kvm_handle_hypercall(struct kvm_vcpu *vcpu) /* * LoongArch KVM callback handling for unimplemented guest exiting */ -static int kvm_fault_ni(struct kvm_vcpu *vcpu) +static int kvm_fault_ni(struct kvm_vcpu *vcpu, int ecode) { - unsigned int ecode, inst; - unsigned long estat, badv; + unsigned int inst; + unsigned long badv; /* Fetch the instruction */ inst = vcpu->arch.badi; badv = vcpu->arch.badv; - estat = vcpu->arch.host_estat; - ecode = (estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; kvm_err("ECode: %d PC=%#lx Inst=0x%08x BadVaddr=%#lx ESTAT=%#lx\n", ecode, vcpu->arch.pc, inst, badv, read_gcsr_estat()); kvm_arch_vcpu_dump_regs(vcpu); @@ -966,5 +967,5 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = { int kvm_handle_fault(struct kvm_vcpu *vcpu, int fault) { - return kvm_fault_tables[fault](vcpu); + return kvm_fault_tables[fault](vcpu, fault); } diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 4d203294767c..ed956c5cf2cc 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -912,7 +912,7 @@ out: return err; } -int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) +int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write, int ecode) { int ret; @@ -921,8 +921,17 @@ int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) return ret; /* Invalidate this entry in the TLB */ - vcpu->arch.flush_gpa = gpa; - kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu); + if (!cpu_has_ptw || (ecode == EXCCODE_TLBM)) { + /* + * With HW PTW, invalid TLB is not added when page fault. But + * for EXCCODE_TLBM exception, stale TLB may exist because of + * the last read access. + * + * With SW PTW, invalid TLB is added in TLB refill exception. 
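Illustrative sketch, not part of the diff: the check added to kvm_handle_mm_fault() above can be read as a standalone predicate. With a hardware page table walker no invalid TLB entry is installed on an ordinary page fault, so only a modify (EXCCODE_TLBM) fault can leave a stale entry behind; with the software walker the refill path always installs one, so every fault needs the flush. The helper name below is hypothetical; cpu_has_ptw and EXCCODE_TLBM are the symbols the patch itself uses.

static inline bool kvm_need_gpa_tlb_flush(int ecode)
{
	/* mirrors: if (!cpu_has_ptw || (ecode == EXCCODE_TLBM)) */
	return !cpu_has_ptw || ecode == EXCCODE_TLBM;
}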
+ */ + vcpu->arch.flush_gpa = gpa; + kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu); + } return 0; } diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c index 734dab657fe3..5b70dfdab368 100644 --- a/arch/m68k/coldfire/m5272.c +++ b/arch/m68k/coldfire/m5272.c @@ -119,7 +119,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = { static int __init init_BSP(void) { m5272_uarts_init(); - fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status); + fixed_phy_add(0, &nettel_fixed_phy_status); clkdev_add_table(m5272_clk_lookup, ARRAY_SIZE(m5272_clk_lookup)); return 0; } diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 6a644122d38e..d05690289e33 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -267,8 +267,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 2284c04b9b55..a1747fbe23fb 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -263,8 +263,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 7cf84fd14e78..74293551f66b 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -270,8 +270,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index ef269ea337a1..419b13ae950a 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index fe427837fe6f..4c81d756587c 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -262,8 +262,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 4a0c7716b560..daa01d7fb462 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 6d3b20e9d0b2..641ca22eb3b2 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -281,8 +281,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m 
CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index b57d9234979b..f98ffa7a1640 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -259,8 +259,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index cf0f7f10ebc6..2bfc3f4b48f9 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 2205e7ae55e5..2bd46cbcca2a 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 0e22f6acf575..dc7fc94fc669 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -256,8 +256,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index f63fd6ab3e68..b026a54867f5 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -257,8 +257,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 247be207f293..de426a474b5b 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -282,7 +282,7 @@ static int __init bcm47xx_register_bus_complete(void) bcm47xx_leds_register(); bcm47xx_workarounds(); - fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status); + fixed_phy_add(0, &bcm47xx_fixed_phy_status); return 0; } device_initcall(bcm47xx_register_bus_complete); diff --git a/arch/mips/configs/bigsur_defconfig b/arch/mips/configs/bigsur_defconfig index 8f7c36868204..97d2cd997285 100644 --- a/arch/mips/configs/bigsur_defconfig +++ b/arch/mips/configs/bigsur_defconfig @@ -81,7 +81,6 @@ CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m CONFIG_IP_VS_FTP=m -CONFIG_IP_DCCP=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 48c8feec958f..437ef6dc0b4c 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -84,7 +84,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m 
CONFIG_ATM=y diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index cbf9c35a6177..e4bcdb64df6c 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -130,7 +130,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 22fa8f19924a..31ac655b7837 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -161,6 +161,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index eab87c6beacb..e5d64c84aadf 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -291,4 +291,20 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) +static inline int pte_same(pte_t pte_a, pte_t pte_b); + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + if (!pte_same(*ptep, entry)) + set_ptes(vma->vm_mm, address, ptep, entry, 1); + /* + * update_mmu_cache will unconditionally execute, handling both + * the case that the PTE changed and the spurious fault case. + */ + return true; +} + #endif /* _ASM_NIOS2_PGTABLE_H */ diff --git a/arch/nios2/kernel/cpuinfo.c b/arch/nios2/kernel/cpuinfo.c index 7b1e8f9128e9..55882feb6249 100644 --- a/arch/nios2/kernel/cpuinfo.c +++ b/arch/nios2/kernel/cpuinfo.c @@ -46,10 +46,7 @@ void __init setup_cpuinfo(void) cpuinfo.cpu_clock_freq = fcpu(cpu, "clock-frequency"); str = of_get_property(cpu, "altr,implementation", &len); - if (str) - strscpy(cpuinfo.cpu_impl, str, sizeof(cpuinfo.cpu_impl)); - else - strcpy(cpuinfo.cpu_impl, "<unknown>"); + strscpy(cpuinfo.cpu_impl, str ?: "<unknown>"); cpuinfo.has_div = of_property_read_bool(cpu, "altr,has-div"); cpuinfo.has_mul = of_property_read_bool(cpu, "altr,has-mul"); diff --git a/arch/nios2/mm/tlb.c b/arch/nios2/mm/tlb.c index f90ac35f05f3..a9cbe20f9e79 100644 --- a/arch/nios2/mm/tlb.c +++ b/arch/nios2/mm/tlb.c @@ -144,10 +144,11 @@ static void flush_tlb_one(unsigned long addr) if (((pteaddr >> 2) & 0xfffff) != (addr >> PAGE_SHIFT)) continue; + tlbmisc = RDCTL(CTL_TLBMISC); pr_debug("Flush entry by writing way=%dl pid=%ld\n", - way, (pid_misc >> TLBMISC_PID_SHIFT)); + way, ((tlbmisc >> TLBMISC_PID_SHIFT) & TLBMISC_PID_MASK)); - tlbmisc = TLBMISC_WE | (way << TLBMISC_WAY_SHIFT); + tlbmisc = TLBMISC_WE | (way << TLBMISC_WAY_SHIFT) | (tlbmisc & TLBMISC_PID); WRCTL(CTL_TLBMISC, tlbmisc); WRCTL(CTL_PTEADDR, pteaddr_invalid(addr)); WRCTL(CTL_TLBACC, 0); @@ -237,7 +238,8 @@ void flush_tlb_pid(unsigned long mmu_pid) if (pid != mmu_pid) continue; - tlbmisc = TLBMISC_WE | (way << TLBMISC_WAY_SHIFT); + tlbmisc = TLBMISC_WE | (way << TLBMISC_WAY_SHIFT) | + (pid << TLBMISC_PID_SHIFT); WRCTL(CTL_TLBMISC, tlbmisc); WRCTL(CTL_TLBACC, 0); } @@ -272,15 +274,17 @@ void flush_tlb_all(void) /* remember pid/way until we return */ get_misc_and_pid(&org_misc, &pid_misc); - /* Start at way 0, way is auto-incremented after each TLBACC write */ - WRCTL(CTL_TLBMISC, TLBMISC_WE); - /* Map each TLB entry to physcal address 0 with no-access and a bad ptbase 
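Illustrative sketch, not part of the diff: the asm/socket.h hunks earlier in this series only add the per-architecture SO_PASSRIGHTS value; the behaviour itself is defined by the core af_unix change outside arch/. Assuming the option follows the SO_PASSCRED pattern (enabled by default, cleared to stop accepting SCM_RIGHTS file descriptors), a userspace call would look like the snippet below; the function name and the interpretation of val are assumptions, not something this diff establishes.

#include <sys/socket.h>

static int unix_sock_set_passrights(int fd, int val)
{
	/* val = 0 is assumed to refuse SCM_RIGHTS on this socket */
	return setsockopt(fd, SOL_SOCKET, SO_PASSRIGHTS, &val, sizeof(val));
}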
*/ for (line = 0; line < cpuinfo.tlb_num_lines; line++) { WRCTL(CTL_PTEADDR, pteaddr_invalid(addr)); - for (way = 0; way < cpuinfo.tlb_num_ways; way++) + for (way = 0; way < cpuinfo.tlb_num_ways; way++) { + // Code such as replace_tlb_one_pid assumes that no duplicate entries exist + // for a single address across ways, so also use way as a dummy PID + WRCTL(CTL_TLBMISC, TLBMISC_WE | (way << TLBMISC_WAY_SHIFT) | + (way << TLBMISC_PID_SHIFT)); WRCTL(CTL_TLBACC, 0); + } addr += PAGE_SIZE; } diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 96831c988606..1f2d5b7a7f5d 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -142,6 +142,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 0x4050 +#define SO_PASSRIGHTS 0x4051 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 1bc3466bc909..ae45f70b29f0 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -87,7 +87,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_IP_DCCP=m CONFIG_BT=m CONFIG_BT_RFCOMM=m CONFIG_BT_RFCOMM_TTY=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 242c1fab9d46..f96f8ed9856c 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -225,7 +225,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m CONFIG_TIPC=m CONFIG_ATM=m CONFIG_ATM_CLIP=m diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index 1f37b600ca47..3b643b9efc07 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -63,9 +63,6 @@ struct kvm_vcpu_aia { /* CPU AIA CSR context of Guest VCPU */ struct kvm_vcpu_aia_csr guest_csr; - /* CPU AIA CSR context upon Guest VCPU reset */ - struct kvm_vcpu_aia_csr guest_reset_csr; - /* Guest physical address of IMSIC for this VCPU */ gpa_t imsic_addr; diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 0e9c2fab6378..85cfebc32e4c 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -119,6 +119,9 @@ struct kvm_arch { /* AIA Guest/VM context */ struct kvm_aia aia; + + /* KVM_CAP_RISCV_MP_STATE_RESET */ + bool mp_state_reset; }; struct kvm_cpu_trap { @@ -193,6 +196,12 @@ struct kvm_vcpu_smstateen_csr { unsigned long sstateen0; }; +struct kvm_vcpu_reset_state { + spinlock_t lock; + unsigned long pc; + unsigned long a1; +}; + struct kvm_vcpu_arch { /* VCPU ran at least once */ bool ran_atleast_once; @@ -227,12 +236,8 @@ struct kvm_vcpu_arch { /* CPU Smstateen CSR context of Guest VCPU */ struct kvm_vcpu_smstateen_csr smstateen_csr; - /* CPU context upon Guest VCPU reset */ - struct kvm_cpu_context guest_reset_context; - spinlock_t reset_cntx_lock; - - /* CPU CSR context upon Guest VCPU reset */ - struct kvm_vcpu_csr guest_reset_csr; + /* CPU reset state of Guest VCPU */ + struct kvm_vcpu_reset_state reset_state; /* * VCPU interrupts diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index 4ed6203cdd30..439ab2b3534f 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -55,6 +55,9 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct 
kvm_run *run); void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, struct kvm_run *run, u32 type, u64 flags); +void kvm_riscv_vcpu_sbi_request_reset(struct kvm_vcpu *vcpu, + unsigned long pc, unsigned long a1); +void kvm_riscv_vcpu_sbi_load_reset_state(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_riscv_vcpu_set_reg_sbi_ext(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); diff --git a/arch/riscv/include/asm/kvm_vcpu_vector.h b/arch/riscv/include/asm/kvm_vcpu_vector.h index 27f5bccdd8b0..57a798a4cb0d 100644 --- a/arch/riscv/include/asm/kvm_vcpu_vector.h +++ b/arch/riscv/include/asm/kvm_vcpu_vector.h @@ -33,8 +33,7 @@ void kvm_riscv_vcpu_guest_vector_restore(struct kvm_cpu_context *cntx, unsigned long *isa); void kvm_riscv_vcpu_host_vector_save(struct kvm_cpu_context *cntx); void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cntx); -int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu, - struct kvm_cpu_context *cntx); +int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_free_vector_context(struct kvm_vcpu *vcpu); #else @@ -62,8 +61,7 @@ static inline void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cn { } -static inline int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu, - struct kvm_cpu_context *cntx) +static inline int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu) { return 0; } diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 356d5397b2a2..bdf3352acf4c 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -131,6 +131,12 @@ secondary_start_sbi: csrw CSR_IE, zero csrw CSR_IP, zero +#ifndef CONFIG_RISCV_M_MODE + /* Enable time CSR */ + li t0, 0x2 + csrw CSR_SCOUNTEREN, t0 +#endif + /* Load the global pointer */ load_global_pointer @@ -226,6 +232,10 @@ SYM_CODE_START(_start_kernel) * to hand it to us. 
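Illustrative sketch, not part of the diff: the head.S hunks in secondary_start_sbi (above) and _start_kernel (just below) load 0x2 into scounteren so the time CSR can be read from user mode on every hart that enters the kernel in S-mode. A C rendering of the same two-instruction sequence; the SCOUNTEREN_TM name is an assumption, only the 0x2 value comes from the patch.

#define SCOUNTEREN_TM	0x2UL	/* bit 1: expose the time CSR to U-mode */

static inline void enable_user_time_csr(void)
{
	/* equivalent of: li t0, 0x2; csrw CSR_SCOUNTEREN, t0 */
	csr_write(CSR_SCOUNTEREN, SCOUNTEREN_TM);
}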
*/ csrr a0, CSR_MHARTID +#else + /* Enable time CSR */ + li t0, 0x2 + csrw CSR_SCOUNTEREN, t0 #endif /* CONFIG_RISCV_M_MODE */ /* Load the global pointer */ diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 0c3cbb0915ff..704c2899197e 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -18,7 +18,7 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION config KVM - tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" + tristate "Kernel-based Virtual Machine (KVM) support" depends on RISCV_SBI && MMU select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index 39cd26af5a69..43e472ff3e1a 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -526,12 +526,10 @@ int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu) void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu) { struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; - struct kvm_vcpu_aia_csr *reset_csr = - &vcpu->arch.aia_context.guest_reset_csr; if (!kvm_riscv_aia_available()) return; - memcpy(csr, reset_csr, sizeof(*csr)); + memset(csr, 0, sizeof(*csr)); /* Proceed only if AIA was initialized successfully */ if (!kvm_riscv_aia_initialized(vcpu->kvm)) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 02635bac91f1..e0a01af426ff 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -51,12 +51,33 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) +static void kvm_riscv_vcpu_context_reset(struct kvm_vcpu *vcpu, + bool kvm_sbi_reset) { struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; - struct kvm_cpu_context *reset_cntx = &vcpu->arch.guest_reset_context; + void *vector_datap = cntx->vector.datap; + + memset(cntx, 0, sizeof(*cntx)); + memset(csr, 0, sizeof(*csr)); + memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr)); + + /* Restore datap as it's not a part of the guest context. 
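Illustrative sketch, not part of the diff: the rewritten reset path above is what a VMM reaches through the KVM_CAP_RISCV_MP_STATE_RESET capability and the KVM_MP_STATE_INIT_RECEIVED case added further down in this patch (the capability constant itself is defined in the uapi headers outside arch/). A minimal userspace sequence, assuming vm_fd and vcpu_fd are already-open KVM file descriptors:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int riscv_reset_vcpu(int vm_fd, int vcpu_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_RISCV_MP_STATE_RESET };
	struct kvm_mp_state mp = { .mp_state = KVM_MP_STATE_INIT_RECEIVED };

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))		/* opt in once per VM */
		return -1;
	return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);	/* ends up in kvm_riscv_reset_vcpu() */
}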
*/ + cntx->vector.datap = vector_datap; + + if (kvm_sbi_reset) + kvm_riscv_vcpu_sbi_load_reset_state(vcpu); + + /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */ + cntx->sstatus = SR_SPP | SR_SPIE; + + cntx->hstatus |= HSTATUS_VTW; + cntx->hstatus |= HSTATUS_SPVP; + cntx->hstatus |= HSTATUS_SPV; +} + +static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu, bool kvm_sbi_reset) +{ bool loaded; /** @@ -71,13 +92,7 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.last_exit_cpu = -1; - memcpy(csr, reset_csr, sizeof(*csr)); - - spin_lock(&vcpu->arch.reset_cntx_lock); - memcpy(cntx, reset_cntx, sizeof(*cntx)); - spin_unlock(&vcpu->arch.reset_cntx_lock); - - memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr)); + kvm_riscv_vcpu_context_reset(vcpu, kvm_sbi_reset); kvm_riscv_vcpu_fp_reset(vcpu); @@ -112,8 +127,6 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int rc; - struct kvm_cpu_context *cntx; - struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; spin_lock_init(&vcpu->arch.mp_state_lock); @@ -133,24 +146,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) /* Setup VCPU hfence queue */ spin_lock_init(&vcpu->arch.hfence_lock); - /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */ - spin_lock_init(&vcpu->arch.reset_cntx_lock); + spin_lock_init(&vcpu->arch.reset_state.lock); - spin_lock(&vcpu->arch.reset_cntx_lock); - cntx = &vcpu->arch.guest_reset_context; - cntx->sstatus = SR_SPP | SR_SPIE; - cntx->hstatus = 0; - cntx->hstatus |= HSTATUS_VTW; - cntx->hstatus |= HSTATUS_SPVP; - cntx->hstatus |= HSTATUS_SPV; - spin_unlock(&vcpu->arch.reset_cntx_lock); - - if (kvm_riscv_vcpu_alloc_vector_context(vcpu, cntx)) + if (kvm_riscv_vcpu_alloc_vector_context(vcpu)) return -ENOMEM; - /* By default, make CY, TM, and IR counters accessible in VU mode */ - reset_csr->scounteren = 0x7; - /* Setup VCPU timer */ kvm_riscv_vcpu_timer_init(vcpu); @@ -169,7 +169,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_sbi_init(vcpu); /* Reset VCPU */ - kvm_riscv_reset_vcpu(vcpu); + kvm_riscv_reset_vcpu(vcpu, false); return 0; } @@ -518,6 +518,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, case KVM_MP_STATE_STOPPED: __kvm_riscv_vcpu_power_off(vcpu); break; + case KVM_MP_STATE_INIT_RECEIVED: + if (vcpu->kvm->arch.mp_state_reset) + kvm_riscv_reset_vcpu(vcpu, false); + else + ret = -EINVAL; + break; default: ret = -EINVAL; } @@ -706,7 +712,7 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) - kvm_riscv_reset_vcpu(vcpu); + kvm_riscv_reset_vcpu(vcpu, true); if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu)) kvm_riscv_gstage_update_hgatp(vcpu); diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index d1c83a77735e..6e09b518a5d1 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -143,9 +143,9 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, struct kvm_vcpu *tmp; kvm_for_each_vcpu(i, tmp, vcpu->kvm) { - spin_lock(&vcpu->arch.mp_state_lock); + spin_lock(&tmp->arch.mp_state_lock); WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); - spin_unlock(&vcpu->arch.mp_state_lock); + spin_unlock(&tmp->arch.mp_state_lock); } kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP); @@ -156,6 +156,34 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, run->exit_reason = KVM_EXIT_SYSTEM_EVENT; } +void kvm_riscv_vcpu_sbi_request_reset(struct 
kvm_vcpu *vcpu, + unsigned long pc, unsigned long a1) +{ + spin_lock(&vcpu->arch.reset_state.lock); + vcpu->arch.reset_state.pc = pc; + vcpu->arch.reset_state.a1 = a1; + spin_unlock(&vcpu->arch.reset_state.lock); + + kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); +} + +void kvm_riscv_vcpu_sbi_load_reset_state(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; + struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; + struct kvm_vcpu_reset_state *reset_state = &vcpu->arch.reset_state; + + cntx->a0 = vcpu->vcpu_id; + + spin_lock(&vcpu->arch.reset_state.lock); + cntx->sepc = reset_state->pc; + cntx->a1 = reset_state->a1; + spin_unlock(&vcpu->arch.reset_state.lock); + + cntx->sstatus &= ~SR_SIE; + csr->vsatp = 0; +} + int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run) { struct kvm_cpu_context *cp = &vcpu->arch.guest_context; diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index 3070bb31745d..f26207f84bab 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -15,7 +15,6 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu) { - struct kvm_cpu_context *reset_cntx; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; struct kvm_vcpu *target_vcpu; unsigned long target_vcpuid = cp->a0; @@ -32,17 +31,7 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu) goto out; } - spin_lock(&target_vcpu->arch.reset_cntx_lock); - reset_cntx = &target_vcpu->arch.guest_reset_context; - /* start address */ - reset_cntx->sepc = cp->a1; - /* target vcpu id to start */ - reset_cntx->a0 = target_vcpuid; - /* private data passed from kernel */ - reset_cntx->a1 = cp->a2; - spin_unlock(&target_vcpu->arch.reset_cntx_lock); - - kvm_make_request(KVM_REQ_VCPU_RESET, target_vcpu); + kvm_riscv_vcpu_sbi_request_reset(target_vcpu, cp->a1, cp->a2); __kvm_riscv_vcpu_power_on(target_vcpu); diff --git a/arch/riscv/kvm/vcpu_sbi_system.c b/arch/riscv/kvm/vcpu_sbi_system.c index bc0ebba89003..359be90b0fc5 100644 --- a/arch/riscv/kvm/vcpu_sbi_system.c +++ b/arch/riscv/kvm/vcpu_sbi_system.c @@ -13,7 +13,6 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_vcpu_sbi_return *retdata) { struct kvm_cpu_context *cp = &vcpu->arch.guest_context; - struct kvm_cpu_context *reset_cntx; unsigned long funcid = cp->a6; unsigned long hva, i; struct kvm_vcpu *tmp; @@ -45,14 +44,7 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, } } - spin_lock(&vcpu->arch.reset_cntx_lock); - reset_cntx = &vcpu->arch.guest_reset_context; - reset_cntx->sepc = cp->a1; - reset_cntx->a0 = vcpu->vcpu_id; - reset_cntx->a1 = cp->a2; - spin_unlock(&vcpu->arch.reset_cntx_lock); - - kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); + kvm_riscv_vcpu_sbi_request_reset(vcpu, cp->a1, cp->a2); /* userspace provides the suspend implementation */ kvm_riscv_vcpu_sbi_forward(vcpu, run); diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c index d92d1348045c..a5f88cb717f3 100644 --- a/arch/riscv/kvm/vcpu_vector.c +++ b/arch/riscv/kvm/vcpu_vector.c @@ -22,6 +22,9 @@ void kvm_riscv_vcpu_vector_reset(struct kvm_vcpu *vcpu) struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; cntx->sstatus &= ~SR_VS; + + cntx->vector.vlenb = riscv_v_vsize / 32; + if (riscv_isa_extension_available(isa, v)) { cntx->sstatus |= SR_VS_INITIAL; WARN_ON(!cntx->vector.datap); @@ -70,13 +73,11 @@ void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cntx) __kvm_riscv_vector_restore(cntx); } -int 
kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu, - struct kvm_cpu_context *cntx) +int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu) { - cntx->vector.datap = kmalloc(riscv_v_vsize, GFP_KERNEL); - if (!cntx->vector.datap) + vcpu->arch.guest_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL); + if (!vcpu->arch.guest_context.vector.datap) return -ENOMEM; - cntx->vector.vlenb = riscv_v_vsize / 32; vcpu->arch.host_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL); if (!vcpu->arch.host_context.vector.datap) @@ -87,7 +88,7 @@ int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu, void kvm_riscv_vcpu_free_vector_context(struct kvm_vcpu *vcpu) { - kfree(vcpu->arch.guest_reset_context.vector.datap); + kfree(vcpu->arch.guest_context.vector.datap); kfree(vcpu->arch.host_context.vector.datap); } #endif diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 7396b8654f45..b27ec8f96697 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -209,6 +209,19 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + switch (cap->cap) { + case KVM_CAP_RISCV_MP_STATE_RESET: + if (cap->flags) + return -EINVAL; + kvm->arch.mp_state_reset = true; + return 0; + default: + return -EINVAL; + } +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -EINVAL; diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 1d1c78d4cff1..e7b032dfd17f 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -608,6 +608,21 @@ static inline u32 rv_fence(u8 pred, u8 succ) return rv_i_insn(imm11_0, 0, 0, 0, 0xf); } +static inline void emit_fence_r_rw(struct rv_jit_context *ctx) +{ + emit(rv_fence(0x2, 0x3), ctx); +} + +static inline void emit_fence_rw_w(struct rv_jit_context *ctx) +{ + emit(rv_fence(0x3, 0x1), ctx); +} + +static inline void emit_fence_rw_rw(struct rv_jit_context *ctx) +{ + emit(rv_fence(0x3, 0x3), ctx); +} + static inline u32 rv_nop(void) { return rv_i_insn(0, 0, 0, 0, 0x13); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index ca60db75199d..10e01ff06312 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -473,11 +473,212 @@ static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx) emit(hash, ctx); } -static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, - struct rv_jit_context *ctx) +static int emit_load_8(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lb(rd, off, rs), ctx); + else + emit(rv_lbu(rd, off, rs), ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lb(rd, 0, RV_REG_T1), ctx); + else + emit(rv_lbu(rd, 0, RV_REG_T1), ctx); + return ctx->ninsns - insns_start; +} + +static int emit_load_16(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lh(rd, off, rs), ctx); + else + emit(rv_lhu(rd, off, rs), ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lh(rd, 0, RV_REG_T1), ctx); + else + emit(rv_lhu(rd, 0, RV_REG_T1), 
ctx); + return ctx->ninsns - insns_start; +} + +static int emit_load_32(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lw(rd, off, rs), ctx); + else + emit(rv_lwu(rd, off, rs), ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lw(rd, 0, RV_REG_T1), ctx); + else + emit(rv_lwu(rd, 0, RV_REG_T1), ctx); + return ctx->ninsns - insns_start; +} + +static int emit_load_64(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_ld(rd, off, rs, ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + emit_ld(rd, 0, RV_REG_T1, ctx); + return ctx->ninsns - insns_start; +} + +static void emit_store_8(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + if (is_12b_int(off)) { + emit(rv_sb(rd, off, rs), ctx); + return; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + emit(rv_sb(RV_REG_T1, 0, rs), ctx); +} + +static void emit_store_16(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + if (is_12b_int(off)) { + emit(rv_sh(rd, off, rs), ctx); + return; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + emit(rv_sh(RV_REG_T1, 0, rs), ctx); +} + +static void emit_store_32(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + if (is_12b_int(off)) { + emit_sw(rd, off, rs, ctx); + return; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + emit_sw(RV_REG_T1, 0, rs, ctx); +} + +static void emit_store_64(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + if (is_12b_int(off)) { + emit_sd(rd, off, rs, ctx); + return; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + emit_sd(RV_REG_T1, 0, rs, ctx); +} + +static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, + struct rv_jit_context *ctx) +{ + u8 code = insn->code; + s32 imm = insn->imm; + s16 off = insn->off; + + switch (imm) { + /* dst_reg = load_acquire(src_reg + off16) */ + case BPF_LOAD_ACQ: + switch (BPF_SIZE(code)) { + case BPF_B: + emit_load_8(false, rd, off, rs, ctx); + break; + case BPF_H: + emit_load_16(false, rd, off, rs, ctx); + break; + case BPF_W: + emit_load_32(false, rd, off, rs, ctx); + break; + case BPF_DW: + emit_load_64(false, rd, off, rs, ctx); + break; + } + emit_fence_r_rw(ctx); + + /* If our next insn is a redundant zext, return 1 to tell + * build_body() to skip it. 
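Illustrative sketch, not part of the diff: for the BPF_DW size, emit_atomic_ld_st() above lowers a load-acquire to a plain load followed by a fence r,rw, and a store-release (the case just below) to a fence rw,w followed by a plain store. The condensed helper below is hypothetical and only reuses the emitters introduced by this patch.

static int emit_acq_rel_dw(u8 rd, u8 rs, const struct bpf_insn *insn,
			   struct rv_jit_context *ctx)
{
	switch (insn->imm) {
	case BPF_LOAD_ACQ:		/* rd = load_acquire(rs + off) */
		emit_load_64(false, rd, insn->off, rs, ctx);
		emit_fence_r_rw(ctx);	/* keep the load before later accesses */
		return 0;
	case BPF_STORE_REL:		/* store_release(rd + off, rs) */
		emit_fence_rw_w(ctx);	/* keep earlier accesses before the store */
		emit_store_64(rd, insn->off, rs, ctx);
		return 0;
	default:
		return -EINVAL;
	}
}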
+ */ + if (BPF_SIZE(code) != BPF_DW && insn_is_zext(&insn[1])) + return 1; + break; + /* store_release(dst_reg + off16, src_reg) */ + case BPF_STORE_REL: + emit_fence_rw_w(ctx); + switch (BPF_SIZE(code)) { + case BPF_B: + emit_store_8(rd, off, rs, ctx); + break; + case BPF_H: + emit_store_16(rd, off, rs, ctx); + break; + case BPF_W: + emit_store_32(rd, off, rs, ctx); + break; + case BPF_DW: + emit_store_64(rd, off, rs, ctx); + break; + } + break; + default: + pr_err_once("bpf-jit: invalid atomic load/store opcode %02x\n", imm); + return -EINVAL; + } + + return 0; +} + +static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, + struct rv_jit_context *ctx) { - u8 r0; + u8 r0, code = insn->code; + s16 off = insn->off; + s32 imm = insn->imm; int jmp_offset; + bool is64; + + if (BPF_SIZE(code) != BPF_W && BPF_SIZE(code) != BPF_DW) { + pr_err_once("bpf-jit: 1- and 2-byte RMW atomics are not supported\n"); + return -EINVAL; + } + is64 = BPF_SIZE(code) == BPF_DW; if (off) { if (is_12b_int(off)) { @@ -554,9 +755,14 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, rv_sc_w(RV_REG_T3, rs, rd, 0, 1), ctx); jmp_offset = ninsns_rvoff(-6); emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); - emit(rv_fence(0x3, 0x3), ctx); + emit_fence_rw_rw(ctx); break; + default: + pr_err_once("bpf-jit: invalid atomic RMW opcode %02x\n", imm); + return -EINVAL; } + + return 0; } #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) @@ -1650,8 +1856,8 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: { - int insn_len, insns_start; bool sign_ext; + int insn_len; sign_ext = BPF_MODE(insn->code) == BPF_MEMSX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX; @@ -1663,78 +1869,16 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, switch (BPF_SIZE(code)) { case BPF_B: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, off, rs), ctx); - else - emit(rv_lbu(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lbu(rd, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; + insn_len = emit_load_8(sign_ext, rd, off, rs, ctx); break; case BPF_H: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, off, rs), ctx); - else - emit(rv_lhu(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lhu(rd, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; + insn_len = emit_load_16(sign_ext, rd, off, rs, ctx); break; case BPF_W: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, off, rs), ctx); - else - emit(rv_lwu(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lwu(rd, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; + insn_len = emit_load_32(sign_ext, rd, off, rs, ctx); break; case BPF_DW: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_ld(rd, off, rs, 
ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - emit_ld(rd, 0, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; + insn_len = emit_load_64(sign_ext, rd, off, rs, ctx); break; } @@ -1879,49 +2023,27 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: - if (is_12b_int(off)) { - emit(rv_sb(rd, off, rs), ctx); - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sb(RV_REG_T1, 0, rs), ctx); + emit_store_8(rd, off, rs, ctx); break; case BPF_STX | BPF_MEM | BPF_H: - if (is_12b_int(off)) { - emit(rv_sh(rd, off, rs), ctx); - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sh(RV_REG_T1, 0, rs), ctx); + emit_store_16(rd, off, rs, ctx); break; case BPF_STX | BPF_MEM | BPF_W: - if (is_12b_int(off)) { - emit_sw(rd, off, rs, ctx); - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sw(RV_REG_T1, 0, rs, ctx); + emit_store_32(rd, off, rs, ctx); break; case BPF_STX | BPF_MEM | BPF_DW: - if (is_12b_int(off)) { - emit_sd(rd, off, rs, ctx); - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sd(RV_REG_T1, 0, rs, ctx); + emit_store_64(rd, off, rs, ctx); break; + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: - emit_atomic(rd, rs, off, imm, - BPF_SIZE(code) == BPF_DW, ctx); + if (bpf_atomic_is_load_store(insn)) + ret = emit_atomic_ld_st(rd, rs, insn, ctx); + else + ret = emit_atomic_rmw(rd, rs, insn, ctx); + if (ret) + return ret; break; case BPF_STX | BPF_PROBE_MEM32 | BPF_B: diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index f8cd2f70a7fb..f6ca5cfa6b2f 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -26,9 +26,8 @@ static int build_body(struct rv_jit_context *ctx, bool extra_pass, int *offset) int ret; ret = bpf_jit_emit_insn(insn, ctx, extra_pass); - /* BPF_LD | BPF_IMM | BPF_DW: skip the next instruction. 
*/ if (ret > 0) - i++; + i++; /* skip the next instruction */ if (offset) offset[i] = ctx->ninsns; if (ret < 0) diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index 192835a3e24d..c7c96282f011 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -26,8 +26,6 @@ static inline bool nospec_uses_trampoline(void) return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; } -#ifdef CONFIG_EXPOLINE_EXTERN - void __s390_indirect_jump_r1(void); void __s390_indirect_jump_r2(void); void __s390_indirect_jump_r3(void); @@ -44,8 +42,6 @@ void __s390_indirect_jump_r13(void); void __s390_indirect_jump_r14(void); void __s390_indirect_jump_r15(void); -#endif - #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 0776dfde2dba..c7f8313ba449 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -48,8 +48,6 @@ struct bpf_jit { int lit64; /* Current position in 64-bit literal pool */ int base_ip; /* Base address for literal pool */ int exit_ip; /* Address of exit */ - int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ - int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int excnt; /* Number of exception table entries */ int prologue_plt_ret; /* Return address for prologue hotpatch PLT */ @@ -127,6 +125,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) jit->seen_regs |= (1 << r1); } +static s32 off_to_pcrel(struct bpf_jit *jit, u32 off) +{ + return off - jit->prg; +} + +static s64 ptr_to_pcrel(struct bpf_jit *jit, const void *ptr) +{ + if (jit->prg_buf) + return (const u8 *)ptr - ((const u8 *)jit->prg_buf + jit->prg); + return 0; +} + #define REG_SET_SEEN(b1) \ ({ \ reg_set_seen(jit, b1); \ @@ -201,7 +211,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT4_PCREL_RIC(op, mask, target) \ ({ \ - int __rel = ((target) - jit->prg) / 2; \ + int __rel = off_to_pcrel(jit, target) / 2; \ _EMIT4((op) | (mask) << 20 | (__rel & 0xffff)); \ }) @@ -239,7 +249,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ + unsigned int rel = off_to_pcrel(jit, target) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ @@ -248,7 +258,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ + unsigned int rel = off_to_pcrel(jit, target) / 2; \ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -257,29 +267,41 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ - int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ + int rel = off_to_pcrel(jit, addrs[(i) + (off) + 1]) / 2;\ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) +static void emit6_pcrel_ril(struct bpf_jit *jit, u32 op, s64 pcrel) +{ + u32 pc32dbl = (s32)(pcrel / 2); + + _EMIT6(op | pc32dbl >> 16, pc32dbl & 0xffff); +} + +static void emit6_pcrel_rilb(struct bpf_jit *jit, u32 op, u8 b, s64 pcrel) +{ + emit6_pcrel_ril(jit, op | reg_high(b) << 16, pcrel); + 
REG_SET_SEEN(b); +} + #define EMIT6_PCREL_RILB(op, b, target) \ -({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ - _EMIT6((op) | reg_high(b) << 16 | rel >> 16, rel & 0xffff);\ - REG_SET_SEEN(b); \ -}) + emit6_pcrel_rilb(jit, op, b, off_to_pcrel(jit, target)) -#define EMIT6_PCREL_RIL(op, target) \ -({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ - _EMIT6((op) | rel >> 16, rel & 0xffff); \ -}) +#define EMIT6_PCREL_RILB_PTR(op, b, target_ptr) \ + emit6_pcrel_rilb(jit, op, b, ptr_to_pcrel(jit, target_ptr)) + +static void emit6_pcrel_rilc(struct bpf_jit *jit, u32 op, u8 mask, s64 pcrel) +{ + emit6_pcrel_ril(jit, op | mask << 20, pcrel); +} #define EMIT6_PCREL_RILC(op, mask, target) \ -({ \ - EMIT6_PCREL_RIL((op) | (mask) << 20, (target)); \ -}) + emit6_pcrel_rilc(jit, op, mask, off_to_pcrel(jit, target)) + +#define EMIT6_PCREL_RILC_PTR(op, mask, target_ptr) \ + emit6_pcrel_rilc(jit, op, mask, ptr_to_pcrel(jit, target_ptr)) #define _EMIT6_IMM(op, imm) \ ({ \ @@ -503,7 +525,7 @@ static void bpf_skip(struct bpf_jit *jit, int size) { if (size >= 6 && !is_valid_rel(size)) { /* brcl 0xf,size */ - EMIT6_PCREL_RIL(0xc0f4000000, size); + EMIT6_PCREL_RILC(0xc0040000, 0xf, size); size -= 6; } else if (size >= 4 && is_valid_rel(size)) { /* brc 0xf,size */ @@ -605,43 +627,30 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, } /* Setup stack and backchain */ if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) { - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - /* lgr %w1,%r15 (backchain) */ - EMIT4(0xb9040000, REG_W1, REG_15); + /* lgr %w1,%r15 (backchain) */ + EMIT4(0xb9040000, REG_W1, REG_15); /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */ EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED); /* aghi %r15,-STK_OFF */ EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth)); - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - /* stg %w1,152(%r15) (backchain) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, - REG_15, 152); + /* stg %w1,152(%r15) (backchain) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, + REG_15, 152); } } /* - * Emit an expoline for a jump that follows + * Jump using a register either directly or via an expoline thunk */ -static void emit_expoline(struct bpf_jit *jit) -{ - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . 
*/ - EMIT4_PCREL(0xa7f40000, 0); -} - -/* - * Emit __s390_indirect_jump_r1 thunk if necessary - */ -static void emit_r1_thunk(struct bpf_jit *jit) -{ - if (nospec_uses_trampoline()) { - jit->r1_thunk_ip = jit->prg; - emit_expoline(jit); - /* br %r1 */ - _EMIT2(0x07f1); - } -} +#define EMIT_JUMP_REG(reg) do { \ + if (nospec_uses_trampoline()) \ + /* brcl 0xf,__s390_indirect_jump_rN */ \ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0x0f, \ + __s390_indirect_jump_r ## reg); \ + else \ + /* br %rN */ \ + _EMIT2(0x07f0 | reg); \ +} while (0) /* * Call r1 either directly or via __s390_indirect_jump_r1 thunk @@ -650,7 +659,8 @@ static void call_r1(struct bpf_jit *jit) { if (nospec_uses_trampoline()) /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + __s390_indirect_jump_r1); else /* basr %r14,%r1 */ EMIT2(0x0d00, REG_14, REG_1); @@ -666,16 +676,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth, 0); - if (nospec_uses_trampoline()) { - jit->r14_thunk_ip = jit->prg; - /* Generate __s390_indirect_jump_r14 thunk */ - emit_expoline(jit); - } - /* br %r14 */ - _EMIT2(0x07fe); - - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - emit_r1_thunk(jit); + EMIT_JUMP_REG(14); jit->prg = ALIGN(jit->prg, 8); jit->prologue_plt = jit->prg; @@ -1877,7 +1878,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* aghi %r1,tail_call_start */ EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start); /* brcl 0xf,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->r1_thunk_ip); + EMIT6_PCREL_RILC_PTR(0xc0040000, 0xf, + __s390_indirect_jump_r1); } else { /* bc 0xf,tail_call_start(%r1) */ _EMIT4(0x47f01000 + jit->tail_call_start); @@ -2585,9 +2587,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, if (nr_stack_args > MAX_NR_STACK_ARGS) return -ENOTSUPP; - /* Return to %r14, since func_addr and %r0 are not available. */ - if ((!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK)) || - (flags & BPF_TRAMP_F_INDIRECT)) + /* Return to %r14 in the struct_ops case. */ + if (flags & BPF_TRAMP_F_INDIRECT) flags |= BPF_TRAMP_F_SKIP_FRAME; /* @@ -2847,17 +2848,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, 0xf000 | tjit->tccnt_off); /* aghi %r15,stack_size */ EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); - /* Emit an expoline for the following indirect jump. 
*/ - if (nospec_uses_trampoline()) - emit_expoline(jit); if (flags & BPF_TRAMP_F_SKIP_FRAME) - /* br %r14 */ - _EMIT2(0x07fe); + EMIT_JUMP_REG(14); else - /* br %r1 */ - _EMIT2(0x07f1); - - emit_r1_thunk(jit); + EMIT_JUMP_REG(1); return 0; } diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 5b464a568664..adcba7329386 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -143,6 +143,8 @@ #define SO_RCVPRIORITY 0x005b +#define SO_PASSRIGHTS 0x005c + #if !defined(__KERNEL__) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0be4937203c7..ae1654280c40 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1858,8 +1858,7 @@ endchoice config X86_SGX bool "Software Guard eXtensions (SGX)" depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC - depends on CRYPTO=y - depends on CRYPTO_SHA256=y + select CRYPTO_LIB_SHA256 select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA select XARRAY_MULTI diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index edab6d6049be..7b2833705d47 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -36,6 +36,7 @@ /* TDX Module call error codes */ #define TDCALL_RETURN_CODE(a) ((a) >> 32) #define TDCALL_INVALID_OPERAND 0xc0000100 +#define TDCALL_OPERAND_BUSY 0x80000200 #define TDREPORT_SUBTYPE_0 0 @@ -109,12 +110,13 @@ static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask) * REPORTDATA to be included into TDREPORT. * @tdreport: Address of the output buffer to store TDREPORT. * - * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module - * v1.0 specification for more information on TDG.MR.REPORT TDCALL. + * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module v1.0 + * specification for more information on TDG.MR.REPORT TDCALL. + * * It is used in the TDX guest driver module to get the TDREPORT0. * - * Return 0 on success, -EINVAL for invalid operands, or -EIO on - * other TDCALL failures. + * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation, + * or -EIO on other TDCALL failures. */ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) { @@ -128,7 +130,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) ret = __tdcall(TDG_MR_REPORT, &args); if (ret) { if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND) - return -EINVAL; + return -ENXIO; + else if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY) + return -EBUSY; return -EIO; } @@ -137,6 +141,42 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) EXPORT_SYMBOL_GPL(tdx_mcall_get_report0); /** + * tdx_mcall_extend_rtmr() - Wrapper to extend RTMR registers using + * TDG.MR.RTMR.EXTEND TDCALL. + * @index: Index of RTMR register to be extended. + * @data: Address of the input buffer with RTMR register extend data. + * + * Refer to section titled "TDG.MR.RTMR.EXTEND leaf" in the TDX Module v1.0 + * specification for more information on TDG.MR.RTMR.EXTEND TDCALL. + * + * It is used in the TDX guest driver module to allow user to extend the RTMR + * registers. + * + * Return 0 on success, -ENXIO for invalid operands, -EBUSY for busy operation, + * or -EIO on other TDCALL failures. 
+ */ +int tdx_mcall_extend_rtmr(u8 index, u8 *data) +{ + struct tdx_module_args args = { + .rcx = virt_to_phys(data), + .rdx = index, + }; + u64 ret; + + ret = __tdcall(TDG_MR_RTMR_EXTEND, &args); + if (ret) { + if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND) + return -ENXIO; + if (TDCALL_RETURN_CODE(ret) == TDCALL_OPERAND_BUSY) + return -EBUSY; + return -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(tdx_mcall_extend_rtmr); + +/** * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote * hypercall. * @buf: Address of the directly mapped shared kernel buffer which diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 823c0434bbad..79406bf07a1c 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL(vm_pre_destroy) KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) @@ -115,6 +116,7 @@ KVM_X86_OP_OPTIONAL(pi_start_assignment) KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL(protected_apic_has_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) @@ -125,7 +127,8 @@ KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) #endif KVM_X86_OP_OPTIONAL(dev_get_attr) -KVM_X86_OP_OPTIONAL(mem_enc_ioctl) +KVM_X86_OP(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9c971f846108..67b464651c8d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -609,8 +609,15 @@ struct kvm_pmu { struct kvm_pmu_ops; enum { - KVM_DEBUGREG_BP_ENABLED = 1, - KVM_DEBUGREG_WONT_EXIT = 2, + KVM_DEBUGREG_BP_ENABLED = BIT(0), + KVM_DEBUGREG_WONT_EXIT = BIT(1), + /* + * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by + * hardware on exit from or enter to guest. KVM needn't switch them. + * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM + * exit, host values need to be restored. + */ + KVM_DEBUGREG_AUTO_SWITCH = BIT(2), }; struct kvm_mtrr { @@ -1571,6 +1578,13 @@ struct kvm_arch { struct kvm_mmu_memory_cache split_desc_cache; gfn_t gfn_direct_bits; + + /* + * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A Zero + * value indicates CPU dirty logging is unsupported or disabled in + * current VM. + */ + int cpu_dirty_log_size; }; struct kvm_vm_stat { @@ -1674,6 +1688,7 @@ struct kvm_x86_ops { unsigned int vm_size; int (*vm_init)(struct kvm *kvm); void (*vm_destroy)(struct kvm *kvm); + void (*vm_pre_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ int (*vcpu_precreate)(struct kvm *kvm); @@ -1823,11 +1838,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - /* - * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero - * value indicates CPU dirty logging is unsupported or disabled. 
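
Since tdx_mcall_extend_rtmr() above maps TDCALL_OPERAND_BUSY to -EBUSY, callers are expected to retry. A hypothetical in-kernel caller sketch; the wrapper name and retry budget are illustrative, not part of this series:

/* Illustrative caller: retry the RTMR extend a few times on -EBUSY. */
static int extend_rtmr_retrying(u8 index, u8 *data)
{
	int tries = 5;
	int ret;

	do {
		ret = tdx_mcall_extend_rtmr(index, data);
	} while (ret == -EBUSY && --tries);

	return ret;
}
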
- */ - int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); const struct kvm_x86_nested_ops *nested_ops; @@ -1841,6 +1851,7 @@ struct kvm_x86_ops { void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); + bool (*protected_apic_has_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); @@ -1857,6 +1868,7 @@ struct kvm_x86_ops { int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); + int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); @@ -2333,6 +2345,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); +void kvm_user_return_msr_update_cache(unsigned int index, u64 val); static inline bool kvm_is_supported_user_return_msr(u32 msr) { @@ -2416,7 +2429,12 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ KVM_X86_QUIRK_SLOT_ZAP_ALL | \ - KVM_X86_QUIRK_STUFF_FEATURE_MSRS) + KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) + +#define KVM_X86_CONDITIONAL_QUIRKS \ + (KVM_X86_QUIRK_CD_NW_CLEARED | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index de788b400fba..bb107ebbe713 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -81,6 +81,11 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); } +static inline bool pi_test_pir(int vector, struct pi_desc *pi_desc) +{ + return test_bit(vector, (unsigned long *)pi_desc->pir); +} + /* Non-atomic helpers */ static inline void __pi_set_sn(struct pi_desc *pi_desc) { diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index a28ff6b14145..2f3820342598 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -13,6 +13,7 @@ /* TDX module Call Leaf IDs */ #define TDG_VP_VMCALL 0 #define TDG_VP_INFO 1 +#define TDG_MR_RTMR_EXTEND 2 #define TDG_VP_VEINFO_GET 3 #define TDG_MR_REPORT 4 #define TDG_MEM_PAGE_ACCEPT 6 @@ -67,11 +68,18 @@ #define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT) /* TDX hypercall Leaf IDs */ +#define TDVMCALL_GET_TD_VM_CALL_INFO 0x10000 #define TDVMCALL_MAP_GPA 0x10001 #define TDVMCALL_GET_QUOTE 0x10002 #define TDVMCALL_REPORT_FATAL_ERROR 0x10003 -#define TDVMCALL_STATUS_RETRY 1 +/* + * TDG.VP.VMCALL Status Codes (returned in R10) + */ +#define TDVMCALL_STATUS_SUCCESS 0x0000000000000000ULL +#define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL +#define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL +#define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL /* * Bitmasks of exposed registers (with VMM). 
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 4a1922ec80cf..8b19294600c4 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -5,6 +5,7 @@ #include <linux/init.h> #include <linux/bits.h> +#include <linux/mmzone.h> #include <asm/errno.h> #include <asm/ptrace.h> @@ -18,6 +19,7 @@ * TDX module. */ #define TDX_ERROR _BITUL(63) +#define TDX_NON_RECOVERABLE _BITUL(62) #define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) #define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) @@ -33,6 +35,8 @@ #ifndef __ASSEMBLER__ #include <uapi/asm/mce.h> +#include <asm/tdx_global_metadata.h> +#include <linux/pgtable.h> /* * Used by the #VE exception handler to gather the #VE exception @@ -64,6 +68,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs); int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport); +int tdx_mcall_extend_rtmr(u8 index, u8 *data); + u64 tdx_hcall_get_quote(u8 *buf, size_t size); void __init tdx_dump_attributes(u64 td_attr); @@ -119,11 +125,82 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, int tdx_cpu_enable(void); int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); +const struct tdx_sys_info *tdx_get_sysinfo(void); + +int tdx_guest_keyid_alloc(void); +u32 tdx_get_nr_guest_keyids(void); +void tdx_guest_keyid_free(unsigned int keyid); + +struct tdx_td { + /* TD root structure: */ + struct page *tdr_page; + + int tdcs_nr_pages; + /* TD control structure: */ + struct page **tdcs_pages; + + /* Size of `tdcx_pages` in struct tdx_vp */ + int tdcx_nr_pages; +}; + +struct tdx_vp { + /* TDVP root page */ + struct page *tdvpr_page; + + /* TD vCPU control structure: */ + struct page **tdcx_pages; +}; + +static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) +{ + u64 ret; + + ret = page_to_phys(page); + /* KeyID bits are just above the physical address bits: */ + ret |= (u64)hkid << boot_cpu_data.x86_phys_bits; + + return ret; +} + +static inline int pg_level_to_tdx_sept_level(enum pg_level level) +{ + WARN_ON_ONCE(level == PG_LEVEL_NONE); + return level - 1; +} + +u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args); +u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mng_key_config(struct tdx_td *td); +u64 tdh_mng_create(struct tdx_td *td, u16 hkid); +u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); +u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data); +u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mr_finalize(struct tdx_td *td); +u64 tdh_vp_flush(struct tdx_vp *vp); +u64 tdh_mng_vpflushdone(struct tdx_td *td); +u64 tdh_mng_key_freeid(struct tdx_td *td); +u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err); +u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); +u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data); +u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask); +u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); +u64 tdh_mem_track(struct 
tdx_td *tdr); +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2); +u64 tdh_phymem_cache_wb(bool resume); +u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } +static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h index 6dd3c9695f59..060a2ad744bf 100644 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h +++ b/arch/x86/include/asm/tdx_global_metadata.h @@ -17,9 +17,28 @@ struct tdx_sys_info_tdmr { u16 pamt_1g_entry_size; }; +struct tdx_sys_info_td_ctrl { + u16 tdr_base_size; + u16 tdcs_base_size; + u16 tdvps_base_size; +}; + +struct tdx_sys_info_td_conf { + u64 attributes_fixed0; + u64 attributes_fixed1; + u64 xfam_fixed0; + u64 xfam_fixed1; + u16 num_cpuid_config; + u16 max_vcpus_per_td; + u64 cpuid_config_leaves[128]; + u64 cpuid_config_values[128][2]; +}; + struct tdx_sys_info { struct tdx_sys_info_features features; struct tdx_sys_info_tdmr tdmr; + struct tdx_sys_info_td_ctrl td_ctrl; + struct tdx_sys_info_td_conf td_conf; }; #endif diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 8707361b24da..cca7d6641287 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -256,6 +256,7 @@ enum vmcs_field { TSC_MULTIPLIER_HIGH = 0x00002033, TERTIARY_VM_EXEC_CONTROL = 0x00002034, TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + SHARED_EPT_POINTER = 0x0000203C, PID_POINTER_TABLE = 0x00002042, PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, @@ -586,6 +587,7 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_PROT_READ BIT(3) #define EPT_VIOLATION_PROT_WRITE BIT(4) #define EPT_VIOLATION_PROT_EXEC BIT(5) +#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6) #define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \ EPT_VIOLATION_PROT_WRITE | \ EPT_VIOLATION_PROT_EXEC) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 460306b35a4b..225a12e0d5d6 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -441,6 +441,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) +#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -930,4 +931,74 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SNP_VM 4 #define KVM_X86_TDX_VM 5 +/* Trust Domain eXtension sub-ioctl() commands. */ +enum kvm_tdx_cmd_id { + KVM_TDX_CAPABILITIES = 0, + KVM_TDX_INIT_VM, + KVM_TDX_INIT_VCPU, + KVM_TDX_INIT_MEM_REGION, + KVM_TDX_FINALIZE_VM, + KVM_TDX_GET_CPUID, + + KVM_TDX_CMD_NR_MAX, +}; + +struct kvm_tdx_cmd { + /* enum kvm_tdx_cmd_id */ + __u32 id; + /* flags for sub-commend. If sub-command doesn't use this, set zero. */ + __u32 flags; + /* + * data for each sub-command. An immediate or a pointer to the actual + * data in process virtual address. If sub-command doesn't use it, + * set zero. + */ + __u64 data; + /* + * Auxiliary error code. 
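
The kvm_tdx_cmd structure being defined here carries every KVM_TDX_* sub-command; given the mem_enc_ioctl wiring earlier in the series, VM-scoped sub-commands would be issued through KVM_MEMORY_ENCRYPT_OP on the VM file descriptor. A hypothetical userspace sketch for KVM_TDX_CAPABILITIES, assuming headers new enough to carry these definitions (the entry count and error handling are illustrative):

#include <err.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_tdx_capabilities *tdx_get_caps(int vm_fd, unsigned int nent)
{
	struct kvm_tdx_capabilities *caps;
	struct kvm_tdx_cmd cmd;

	/* nent is a guess; the kernel rejects buffers that are too small. */
	caps = calloc(1, sizeof(*caps) + nent * sizeof(struct kvm_cpuid_entry2));
	if (!caps)
		err(1, "calloc");
	caps->cpuid.nent = nent;

	memset(&cmd, 0, sizeof(cmd));
	cmd.id = KVM_TDX_CAPABILITIES;
	cmd.data = (unsigned long)caps;

	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
		err(1, "KVM_TDX_CAPABILITIES, hw_error=0x%llx",
		    (unsigned long long)cmd.hw_error);

	return caps;
}
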
The sub-command may return TDX SEAMCALL + * status code in addition to -Exxx. + */ + __u64 hw_error; +}; + +struct kvm_tdx_capabilities { + __u64 supported_attrs; + __u64 supported_xfam; + __u64 reserved[254]; + + /* Configurable CPUID bits for userspace */ + struct kvm_cpuid2 cpuid; +}; + +struct kvm_tdx_init_vm { + __u64 attributes; + __u64 xfam; + __u64 mrconfigid[6]; /* sha384 digest */ + __u64 mrowner[6]; /* sha384 digest */ + __u64 mrownerconfig[6]; /* sha384 digest */ + + /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */ + __u64 reserved[12]; + + /* + * Call KVM_TDX_INIT_VM before vcpu creation, thus before + * KVM_SET_CPUID2. + * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the + * TDX module directly virtualizes those CPUIDs without VMM. The user + * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with + * those values. If it doesn't, KVM may have wrong idea of vCPUIDs of + * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX + * module doesn't virtualize. + */ + struct kvm_cpuid2 cpuid; +}; + +#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0) + +struct kvm_tdx_init_mem_region { + __u64 source_addr; + __u64 gpa; + __u64 nr_pages; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index a5faf6d88f1b..f0f4a4cf84a7 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -34,6 +34,7 @@ #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 #define EXIT_REASON_SIPI_SIGNAL 4 +#define EXIT_REASON_OTHER_SMI 6 #define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -92,6 +93,7 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_TDCALL 77 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -155,7 +157,8 @@ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ - { EXIT_REASON_NOTIFY, "NOTIFY" } + { EXIT_REASON_NOTIFY, "NOTIFY" }, \ + { EXIT_REASON_TDCALL, "TDCALL" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h index 4eddb4d571ef..30f39f92c98f 100644 --- a/arch/x86/kernel/cpu/sgx/driver.h +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -2,7 +2,6 @@ #ifndef __ARCH_SGX_DRIVER_H__ #define __ARCH_SGX_DRIVER_H__ -#include <crypto/hash.h> #include <linux/kref.h> #include <linux/mmu_notifier.h> #include <linux/radix-tree.h> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 776a20172867..66f1efa16fbb 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -3,6 +3,7 @@ #include <asm/mman.h> #include <asm/sgx.h> +#include <crypto/sha2.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -463,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) return ret; } -static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, - void *hash) -{ - SHASH_DESC_ON_STACK(shash, tfm); - - shash->tfm = tfm; - - return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); -} - -static int sgx_get_key_hash(const void *modulus, void *hash) -{ - struct crypto_shash *tfm; - int ret; - - tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - ret = __sgx_get_key_hash(tfm, modulus, 
hash); - - crypto_free_shash(tfm); - return ret; -} - static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { @@ -523,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, sgx_xfrm_reserved_mask) return -EINVAL; - ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); - if (ret) - return ret; + sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner); mutex_lock(&encl->lock); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 6722b2fc82cf..2de01b379aa3 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -720,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags) goto out; } + sgx_unmark_page_reclaimable(page); + /* * TBD: Add additional plumbing to enable pre-emptive * action for asynchronous poison notification. Until diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 94c0236963c6..c5c897a86418 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -352,7 +352,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) case BUG_UD1_UBSAN: if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { pr_crit("%s at %pS\n", - report_ubsan_failure(regs, ud_imm), + report_ubsan_failure(ud_imm), (void *)regs->ip); } break; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fe8ea8c097de..2eeffcec5382 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -95,6 +95,8 @@ config KVM_SW_PROTECTED_VM config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL + select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST + select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST help Provides support for KVM on processors equipped with Intel's VT extensions, a.k.a. Virtual Machine Extensions (VMX). @@ -129,6 +131,16 @@ config X86_SGX_KVM If unsure, say N. +config KVM_INTEL_TDX + bool "Intel Trust Domain Extensions (TDX) support" + default y + depends on INTEL_TDX_HOST + help + Provides support for launching Intel Trust Domain Extensions (TDX) + confidential VMs on Intel processors. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f9dddb8cb466..a5d362c7b504 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o +kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ecd85f4801cc..6569b453546b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -81,17 +81,8 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } -/* - * Magic value used by KVM when querying userspace-provided CPUID entries and - * doesn't care about the CPIUD index because the index of the function in - * question is not significant. Note, this magic value must have at least one - * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() - * to avoid false positives when processing guest CPUID input. 
- */ -#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull - -static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu, - u32 function, u64 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; @@ -108,8 +99,8 @@ static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu, */ lockdep_assert_irqs_enabled(); - for (i = 0; i < vcpu->arch.cpuid_nent; i++) { - e = &vcpu->arch.cpuid_entries[i]; + for (i = 0; i < nent; i++) { + e = &entries[i]; if (e->function != function) continue; @@ -140,26 +131,7 @@ static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu, return NULL; } - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - return cpuid_entry2_find(vcpu, function, index); -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function) -{ - return cpuid_entry2_find(vcpu, function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); - -/* - * cpuid_entry2_find() and KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used - * directly outside of kvm_find_cpuid_entry() and kvm_find_cpuid_entry_index(). - */ -#undef KVM_CPUID_INDEX_NOT_SIGNIFICANT +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { @@ -492,6 +464,20 @@ not_found: return 36; } +int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008); + if (best) + return (best->eax >> 16) & 0xff; +not_found: + return 0; +} + /* * This "raw" version returns the reserved GPA bits without any adjustments for * encryption technologies that usurp bits. The raw mask should be used if and diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d2884162a46a..d3f5ae15a7ca 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -11,10 +11,34 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, - u32 function, u32 index); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, + int nent, u32 function, u64 index); +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by kvm_find_cpuid_entry2() + * to avoid false positives when processing guest CPUID input. + * + * KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of + * kvm_find_cpuid_entry2() and kvm_find_cpuid_entry(). 
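
The point of re-plumbing the lookup as kvm_find_cpuid_entry2(entries, nent, ...) is that callers are no longer tied to vcpu->arch.cpuid_entries; any raw entry array, for example one handed in by userspace, can be searched with the same code. A minimal sketch under that assumption (the helper name is invented for illustration):

static u32 example_entry_eax(struct kvm_cpuid_entry2 *entries, int nent,
			     u32 function, u32 index)
{
	struct kvm_cpuid_entry2 *e;

	/* Search an arbitrary entry array rather than the vCPU's cached one. */
	e = kvm_find_cpuid_entry2(entries, nent, function, index);

	return e ? e->eax : 0;
}
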
+ */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull + +static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, index); +} + +static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} + int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -34,6 +58,7 @@ void __init kvm_init_xstate_sizes(void); u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); +int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 63f66c51975a..97d68d837929 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -100,6 +100,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; + if (lapic_in_kernel(v) && v->arch.apic->guest_apic_protected) + return kvm_x86_call(protected_apic_has_interrupt)(v); + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 28e3317124fd..c9de81cc27e1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1790,8 +1790,17 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT); + u32 reg; + /* + * Assume a timer IRQ was "injected" if the APIC is protected. KVM's + * copy of the vIRR is bogus, it's the responsibility of the caller to + * precisely check whether or not a timer IRQ is pending. + */ + if (apic->guest_apic_protected) + return true; + + reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; @@ -2650,6 +2659,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) kvm_recalculate_apic_map(vcpu->kvm); return 0; } +EXPORT_SYMBOL_GPL(kvm_apic_set_base); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { @@ -2958,6 +2968,9 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) if (!kvm_apic_present(vcpu)) return -1; + if (apic->guest_apic_protected) + return -1; + __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 1a8553ebdb42..e33c969439f7 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -65,6 +65,8 @@ struct kvm_lapic { bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; + /* Select registers in the vAPIC cannot be read/written. */ + bool guest_apic_protected; /* Number of bits set in ISR. */ s16 isr_count; /* The highest vector set in ISR; if -1 - invalid, must scan ISR. 
*/ diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f2b36d32ef40..b4b6860ab971 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -79,6 +79,7 @@ static inline gfn_t kvm_mmu_max_gfn(void) u8 kvm_mmu_get_max_tdp_level(void); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); @@ -234,7 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -bool kvm_mmu_may_ignore_guest_pat(void); +bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); @@ -256,6 +257,9 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); + static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8d1b632e33d2..7b3f1783ab3c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +EXPORT_SYMBOL_GPL(tdp_mmu_enabled); #endif static int max_huge_page_level __read_mostly; @@ -1456,15 +1457,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * enabled but it chooses between clearing the Dirty bit and Writeable * bit based on the context. */ - if (kvm_x86_ops.cpu_dirty_log_size) + if (kvm->arch.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { - return kvm_x86_ops.cpu_dirty_log_size; + return kvm->arch.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, @@ -4835,19 +4836,6 @@ out_unlock: } #endif -bool kvm_mmu_may_ignore_guest_pat(void) -{ - /* - * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM - * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to - * honor the memtype from the guest's PAT so that guest accesses to - * memory that is DMA'd aren't cached against the guest's wishes. As a - * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, - * KVM _always_ ignores guest PAT (when EPT is enabled). 
- */ - return shadow_memtype_mask; -} - int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { #ifdef CONFIG_X86_64 @@ -4858,8 +4846,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, - u8 *level) +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { int r; @@ -4873,6 +4860,10 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, do { if (signal_pending(current)) return -EINTR; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + cond_resched(); r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); } while (r == RET_PF_RETRY); @@ -4897,6 +4888,7 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, return -EIO; } } +EXPORT_SYMBOL_GPL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) @@ -5589,12 +5581,19 @@ void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + int maxpa; + + if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM) + maxpa = cpuid_query_maxguestphyaddr(vcpu); + else + maxpa = cpuid_maxphyaddr(vcpu); + /* tdp_root_level is architecture forced level, use it if nonzero */ if (tdp_root_level) return tdp_root_level; /* Use 5-level TDP if and only if it's useful/necessary. */ - if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + if (max_tdp_level == 5 && maxpa <= 48) return 4; return max_tdp_level; @@ -5913,6 +5912,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -7239,6 +7239,7 @@ static void kvm_mmu_zap_memslot(struct kvm *kvm, .start = slot->base_gfn, .end = slot->base_gfn + slot->npages, .may_block = true, + .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED, }; bool flush; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 75f00598289d..db8f33e4de62 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -187,7 +187,8 @@ static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mm return kvm_gfn_direct_bits(kvm); } -static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm, + struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -197,7 +198,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM). 
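
The kvm_mmu_get_tdp_level() change above can be read as: four-level EPT covers a 48-bit guest-physical address space, so a fifth level only pays off when the relevant MAXPHYADDR (the guest-visible one for TDX VMs) exceeds 48 bits. A standalone restatement of that rule, with illustrative names:

static int example_tdp_level(int max_tdp_level, int relevant_maxphyaddr)
{
	/* Five levels are only useful when 48 bits of GPA are not enough. */
	if (max_tdp_level == 5 && relevant_maxphyaddr <= 48)
		return 4;

	return max_tdp_level;
}
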
*/ - return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; + return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode; } static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 561c331fd6ec..1b17b12393a8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -172,6 +172,9 @@ static int kvm_enable_external_write_tracking(struct kvm *kvm) struct kvm_memory_slot *slot; int r = 0, i, bkt; + if (kvm->arch.vm_type == KVM_X86_TDX_VM) + return -EOPNOTSUPP; + mutex_lock(&kvm->slots_arch_lock); /* diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 0f9f47b4ab0e..cfce03d8f123 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -37,7 +37,6 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; -u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -96,8 +95,6 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); - access &= shadow_mmio_access_mask; spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; @@ -177,7 +174,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; - else if (kvm_mmu_page_ad_need_write_protect(sp)) + else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; spte |= shadow_present_mask; @@ -212,9 +209,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (shadow_memtype_mask) - spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -440,6 +435,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) +{ + kvm->arch.shadow_mmio_value = mmio_value; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); + void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ @@ -463,13 +464,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ shadow_present_mask = (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; - /* - * EPT overrides the host MTRRs, and so KVM must program the desired - * memtype directly into the SPTEs. Note, this mask is just the mask - * of all bits that factor into the memtype, the actual memtype must be - * dynamically calculated, e.g. to ensure host MMIO is mapped UC. - */ - shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -521,12 +516,6 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; - /* - * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB - * memtype in the SPTEs, i.e. 
relies on host MTRRs to provide the - * correct memtype (WB is the "weakest" memtype). - */ - shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 79cdceba9857..1e94f081bdaf 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -187,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; -extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 21a3b8166242..405874f4d088 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1630,21 +1630,21 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, } } -static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) +static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp) { /* * All TDP MMU shadow pages share the same role as their root, aside * from level, so it is valid to key off any shadow page to determine if * write protection is needed for an entire tree. */ - return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled; + return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled; } static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { - const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; rcu_read_lock(); @@ -1689,8 +1689,8 @@ void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { - const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); @@ -1911,16 +1911,13 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) +static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + struct kvm_mmu_page *root) { - struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); struct tdp_iter iter; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; - *root_level = vcpu->arch.mmu->root_role.level; - for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; @@ -1929,6 +1926,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + *root_level = vcpu->arch.mmu->root_role.level; + + return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); +} + +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) +{ + struct kvm *kvm = vcpu->kvm; + bool is_direct = kvm_is_addr_direct(kvm, gpa); + hpa_t root = is_direct ? 
vcpu->arch.mmu->root.hpa : + vcpu->arch.mmu->mirror_root_hpa; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; + int leaf; + + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); + leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); + rcu_read_unlock(); + if (leaf < 0) + return false; + + spte = sptes[leaf]; + return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); +} +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); + /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index a1cf2ac5bd78..551703fbe200 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -142,6 +142,9 @@ union kvm_smram { static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { + if (!kvm_x86_call(has_emulated_msr)(vcpu->kvm, MSR_IA32_SMBASE)) + return -ENOTTY; + kvm_make_request(KVM_REQ_SMI, vcpu); return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 67fee545d42a..ffb34dadff1c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5551,6 +5551,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h new file mode 100644 index 000000000000..8f46a06e2c44 --- /dev/null +++ b/arch/x86/kvm/vmx/common.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_X86_VMX_COMMON_H +#define __KVM_X86_VMX_COMMON_H + +#include <linux/kvm_host.h> +#include <asm/posted_intr.h> + +#include "mmu.h" + +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 bus_lock_detected : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + +struct vcpu_vt { + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + + union vmx_exit_reason exit_reason; + + unsigned long exit_qualification; + u32 exit_intr_info; + + /* + * If true, guest state has been loaded into hardware, and host state + * saved into vcpu_{vt,vmx,tdx}. If false, host state is loaded into + * hardware. + */ + bool guest_state_loaded; + bool emulation_required; + +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; +#endif + + unsigned long host_debugctlmsr; +}; + +#ifdef CONFIG_KVM_INTEL_TDX + +static __always_inline bool is_td(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_TDX_VM; +} + +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) +{ + return is_td(vcpu->kvm); +} + +#else + +static inline bool is_td(struct kvm *kvm) { return false; } +static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; } + +#endif + +static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa) +{ + /* For TDX the direct mask is the shared mask. */ + return !kvm_is_addr_direct(kvm, gpa); +} + +static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, + unsigned long exit_qualification) +{ + u64 error_code; + + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? 
PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) + ? PFERR_PRESENT_MASK : 0; + + if (error_code & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) + error_code |= PFERR_PRIVATE_ACCESS; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +} + +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) +{ +#ifdef CONFIG_SMP + if (vcpu->mode == IN_GUEST_MODE) { + /* + * The vector of the virtual has already been set in the PIR. + * Send a notification event to deliver the virtual interrupt + * unless the vCPU is the currently running vCPU, i.e. the + * event is being sent from a fastpath VM-Exit handler, in + * which case the PIR will be synced to the vIRR before + * re-entering the guest. + * + * When the target is not the running vCPU, the following + * possibilities emerge: + * + * Case 1: vCPU stays in non-root mode. Sending a notification + * event posts the interrupt to the vCPU. + * + * Case 2: vCPU exits to root mode and is still runnable. The + * PIR will be synced to the vIRR before re-entering the guest. + * Sending a notification event is ok as the host IRQ handler + * will ignore the spurious event. + * + * Case 3: vCPU exits to root mode and is blocked. vcpu_block() + * has already synced PIR to vIRR and never blocks the vCPU if + * the vIRR is not empty. Therefore, a blocked vCPU here does + * not wait for any requested interrupts in PIR, and sending a + * notification event also results in a benign, spurious event. + */ + + if (vcpu != kvm_get_running_vcpu()) + __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + return; + } +#endif + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); +} + +/* + * Post an interrupt to a vCPU's PIR and trigger the vCPU to process the + * interrupt if necessary. + */ +static inline void __vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, + struct pi_desc *pi_desc, int vector) +{ + if (pi_test_and_set_pir(vector, pi_desc)) + return; + + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(pi_desc)) + return; + + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 
+ */ + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); +} + +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu); + +#endif /* __KVM_X86_VMX_COMMON_H */ diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 43ee9ed11291..94d5d907d37b 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -3,9 +3,890 @@ #include "x86_ops.h" #include "vmx.h" +#include "mmu.h" #include "nested.h" #include "pmu.h" #include "posted_intr.h" +#include "tdx.h" +#include "tdx_arch.h" + +#ifdef CONFIG_KVM_INTEL_TDX +static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt)); +#endif + +static void vt_disable_virtualization_cpu(void) +{ + /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */ + if (enable_tdx) + tdx_disable_virtualization_cpu(); + vmx_disable_virtualization_cpu(); +} + +static __init int vt_hardware_setup(void) +{ + int ret; + + ret = vmx_hardware_setup(); + if (ret) + return ret; + + /* + * Update vt_x86_ops::vm_size here so it is ready before + * kvm_ops_update() is called in kvm_x86_vendor_init(). + * + * Note, the actual bringing up of TDX must be done after + * kvm_ops_update() because enabling TDX requires enabling + * hardware virtualization first, i.e., all online CPUs must + * be in post-VMXON state. This means the @vm_size here + * may be updated to TDX's size but TDX may fail to enable + * at later time. + * + * The VMX/VT code could update kvm_x86_ops::vm_size again + * after bringing up TDX, but this would require exporting + * either kvm_x86_ops or kvm_ops_update() from the base KVM + * module, which looks overkill. Anyway, the worst case here + * is KVM may allocate couple of more bytes than needed for + * each VM. + */ + if (enable_tdx) { + vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, + sizeof(struct kvm_tdx)); + /* + * Note, TDX may fail to initialize in a later time in + * vt_init(), in which case it is not necessary to setup + * those callbacks. But making them valid here even + * when TDX fails to init later is fine because those + * callbacks won't be called if the VM isn't TDX guest. 
+ */ + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; + vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; + } + + return 0; +} + +static int vt_vm_init(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_vm_init(kvm); + + return vmx_vm_init(kvm); +} + +static void vt_vm_pre_destroy(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_mmu_release_hkid(kvm); +} + +static void vt_vm_destroy(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_vm_destroy(kvm); + + vmx_vm_destroy(kvm); +} + +static int vt_vcpu_precreate(struct kvm *kvm) +{ + if (is_td(kvm)) + return 0; + + return vmx_vcpu_precreate(kvm); +} + +static int vt_vcpu_create(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_create(vcpu); + + return vmx_vcpu_create(vcpu); +} + +static void vt_vcpu_free(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_free(vcpu); + return; + } + + vmx_vcpu_free(vcpu); +} + +static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_reset(vcpu, init_event); + return; + } + + vmx_vcpu_reset(vcpu, init_event); +} + +static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_load(vcpu, cpu); + return; + } + + vmx_vcpu_load(vcpu, cpu); +} + +static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) +{ + /* + * Basic TDX does not support feature PML. KVM does not enable PML in + * TD's VMCS, nor does it allocate or flush PML buffer for TDX. + */ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_update_cpu_dirty_logging(vcpu); +} + +static void vt_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_prepare_switch_to_guest(vcpu); + return; + } + + vmx_prepare_switch_to_guest(vcpu); +} + +static void vt_vcpu_put(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_put(vcpu); + return; + } + + vmx_vcpu_put(vcpu); +} + +static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_pre_run(vcpu); + + return vmx_vcpu_pre_run(vcpu); +} + +static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_run(vcpu, force_immediate_exit); + + return vmx_vcpu_run(vcpu, force_immediate_exit); +} + +static int vt_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath) +{ + if (is_td_vcpu(vcpu)) + return tdx_handle_exit(vcpu, fastpath); + + return vmx_handle_exit(vcpu, fastpath); +} + +static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (unlikely(is_td_vcpu(vcpu))) + return tdx_set_msr(vcpu, msr_info); + + return vmx_set_msr(vcpu, msr_info); +} + +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. + */ +static bool vt_has_emulated_msr(struct kvm *kvm, u32 index) +{ + if (kvm && is_td(kvm)) + return tdx_has_emulated_msr(index); + + return vmx_has_emulated_msr(kvm, index); +} + +static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (unlikely(is_td_vcpu(vcpu))) + return tdx_get_msr(vcpu, msr_info); + + return vmx_get_msr(vcpu, msr_info); +} + +static void vt_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + /* + * TDX doesn't allow VMM to configure interception of MSR accesses. 
+ * TDX guest requests MSR accesses by calling TDVMCALL. The MSR + * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR + * if the userspace has set any. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_msr_filter_changed(vcpu); +} + +static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + if (is_td_vcpu(vcpu)) + return tdx_complete_emulated_msr(vcpu, err); + + return kvm_complete_insn_gp(vcpu, err); +} + +#ifdef CONFIG_KVM_SMM +static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_smi_allowed(vcpu, for_injection); +} + +static int vt_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_enter_smm(vcpu, smram); +} + +static int vt_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_leave_smm(vcpu, smram); +} + +static void vt_enable_smi_window(struct kvm_vcpu *vcpu) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return; + + /* RSM will cause a vmexit anyway. */ + vmx_enable_smi_window(vcpu); +} +#endif + +static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + /* + * For TDX, this can only be triggered for MMIO emulation. Let the + * guest retry after installing the SPTE with suppress #VE bit cleared, + * so that the guest will receive #VE when retry. The guest is expected + * to call TDG.VP.VMCALL<MMIO> to request VMM to do MMIO emulation on + * #VE. + */ + if (is_td_vcpu(vcpu)) + return X86EMUL_RETRY_INSTR; + + return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len); +} + +static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + /* + * INIT and SIPI are always blocked for TDX, i.e., INIT handling and + * the OP vcpu_deliver_sipi_vector() won't be called. + */ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_apic_init_signal_blocked(vcpu); +} + +static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + /* Only x2APIC mode is supported for TD. 
*/ + if (is_td_vcpu(vcpu)) + return; + + return vmx_set_virtual_apic_mode(vcpu); +} + +static void vt_apicv_pre_state_restore(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi = vcpu_to_pi_desc(vcpu); + + pi_clear_on(pi); + memset(pi->pir, 0, sizeof(pi->pir)); +} + +static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ + if (is_td_vcpu(vcpu)) + return; + + return vmx_hwapic_isr_update(vcpu, max_isr); +} + +static int vt_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return -1; + + return vmx_sync_pir_to_irr(vcpu); +} + +static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + if (is_td_vcpu(apic->vcpu)) { + tdx_deliver_interrupt(apic, delivery_mode, trig_mode, + vector); + return; + } + + vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector); +} + +static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_vcpu_after_set_cpuid(vcpu); +} + +static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_exception_bitmap(vcpu); +} + +static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_segment_base(vcpu, seg); +} + +static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) { + memset(var, 0, sizeof(*var)); + return; + } + + vmx_get_segment(vcpu, var, seg); +} + +static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_segment(vcpu, var, seg); +} + +static int vt_get_cpl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl(vcpu); +} + +static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl_no_cache(vcpu); +} + +static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + if (is_td_vcpu(vcpu)) { + *db = 0; + *l = 0; + return; + } + + vmx_get_cs_db_l_bits(vcpu, db, l); +} + +static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr0(vcpu, cr0); +} + +static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr0(vcpu, cr0); +} + +static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr4(vcpu, cr4); +} + +static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr4(vcpu, cr4); +} + +static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_set_efer(vcpu, efer); +} + +static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_idt(vcpu, dt); +} + +static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_idt(vcpu, dt); +} + +static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_gdt(vcpu, dt); +} + +static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_gdt(vcpu, dt); +} + +static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + 
vmx_set_dr6(vcpu, val); +} + +static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_dr7(vcpu, val); +} + +static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + /* + * MOV-DR exiting is always cleared for TD guest, even in debug mode. + * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never + * reach here for TD vcpu. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_sync_dirty_debug_regs(vcpu); +} + +static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_cache_reg(vcpu, reg); +} + +static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_rflags(vcpu); +} + +static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_rflags(vcpu, rflags); +} + +static bool vt_get_if_flag(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_if_flag(vcpu); +} + +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_all(vcpu); + return; + } + + vmx_flush_tlb_all(vcpu); +} + +static void vt_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_current(vcpu); + return; + } + + vmx_flush_tlb_current(vcpu); +} + +static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_gva(vcpu, addr); +} + +static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_guest(vcpu); +} + +static void vt_inject_nmi(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_inject_nmi(vcpu); + return; + } + + vmx_inject_nmi(vcpu); +} + +static int vt_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + /* + * The TDX module manages NMI windows and NMI reinjection, and hides NMI + * blocking, all KVM can do is throw an NMI over the wall. + */ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_nmi_allowed(vcpu, for_injection); +} + +static bool vt_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + /* + * KVM can't get NMI blocking status for TDX guest, assume NMIs are + * always unmasked. + */ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_nmi_mask(vcpu); +} + +static void vt_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_nmi_mask(vcpu, masked); +} + +static void vt_enable_nmi_window(struct kvm_vcpu *vcpu) +{ + /* Refer to the comments in tdx_inject_nmi(). */ + if (is_td_vcpu(vcpu)) + return; + + vmx_enable_nmi_window(vcpu); +} + +static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int pgd_level) +{ + if (is_td_vcpu(vcpu)) { + tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level); + return; + } + + vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level); +} + +static void vt_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_interrupt_shadow(vcpu, mask); +} + +static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_interrupt_shadow(vcpu); +} + +static void vt_patch_hypercall(struct kvm_vcpu *vcpu, + unsigned char *hypercall) +{ + /* + * Because guest memory is protected, guest can't be patched. TD kernel + * is modified to use TDG.VP.VMCALL for hypercall. 
+ */ + if (is_td_vcpu(vcpu)) + return; + + vmx_patch_hypercall(vcpu, hypercall); +} + +static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_irq(vcpu, reinjected); +} + +static void vt_inject_exception(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_exception(vcpu); +} + +static void vt_cancel_injection(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_cancel_injection(vcpu); +} + +static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (is_td_vcpu(vcpu)) + return tdx_interrupt_allowed(vcpu); + + return vmx_interrupt_allowed(vcpu, for_injection); +} + +static void vt_enable_irq_window(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_enable_irq_window(vcpu); +} + +static void vt_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) +{ + *intr_info = 0; + *error_code = 0; + + if (is_td_vcpu(vcpu)) + return; + + vmx_get_entry_info(vcpu, intr_info, error_code); +} + +static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) +{ + if (is_td_vcpu(vcpu)) { + tdx_get_exit_info(vcpu, reason, info1, info2, intr_info, + error_code); + return; + } + + vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code); +} + +static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_cr8_intercept(vcpu, tpr, irr); +} + +static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_apic_access_page_addr(vcpu); +} + +static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + KVM_BUG_ON(!kvm_vcpu_apicv_active(vcpu), vcpu->kvm); + return; + } + + vmx_refresh_apicv_exec_ctrl(vcpu); +} + +static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap); +} + +static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_tss_addr(kvm, addr); +} + +static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_identity_map_addr(kvm, ident_addr); +} + +static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + /* TDX doesn't support L2 guest at the moment. */ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_l2_tsc_offset(vcpu); +} + +static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + /* TDX doesn't support L2 guest at the moment. */ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_l2_tsc_multiplier(vcpu); +} + +static void vt_write_tsc_offset(struct kvm_vcpu *vcpu) +{ + /* In TDX, tsc offset can't be changed. */ + if (is_td_vcpu(vcpu)) + return; + + vmx_write_tsc_offset(vcpu); +} + +static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + /* In TDX, tsc multiplier can't be changed. */ + if (is_td_vcpu(vcpu)) + return; + + vmx_write_tsc_multiplier(vcpu); +} + +#ifdef CONFIG_X86_64 +static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) +{ + /* VMX-preemption timer isn't available for TDX. */ + if (is_td_vcpu(vcpu)) + return -EINVAL; + + return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired); +} + +static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + /* VMX-preemption timer can't be set. See vt_set_hv_timer(). 
*/ + if (is_td_vcpu(vcpu)) + return; + + vmx_cancel_hv_timer(vcpu); +} +#endif + +static void vt_setup_mce(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_setup_mce(vcpu); +} + +static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) +{ + if (!is_td(kvm)) + return -ENOTTY; + + return tdx_vm_ioctl(kvm, argp); +} + +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + if (!is_td_vcpu(vcpu)) + return -EINVAL; + + return tdx_vcpu_ioctl(vcpu, argp); +} + +static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + if (is_td(kvm)) + return tdx_gmem_private_max_mapping_level(kvm, pfn); + + return 0; +} #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ @@ -24,111 +905,113 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .hardware_unsetup = vmx_hardware_unsetup, .enable_virtualization_cpu = vmx_enable_virtualization_cpu, - .disable_virtualization_cpu = vmx_disable_virtualization_cpu, + .disable_virtualization_cpu = vt_disable_virtualization_cpu, .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, - .has_emulated_msr = vmx_has_emulated_msr, + .has_emulated_msr = vt_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), - .vm_init = vmx_vm_init, - .vm_destroy = vmx_vm_destroy, - .vcpu_precreate = vmx_vcpu_precreate, - .vcpu_create = vmx_vcpu_create, - .vcpu_free = vmx_vcpu_free, - .vcpu_reset = vmx_vcpu_reset, + .vm_init = vt_vm_init, + .vm_pre_destroy = vt_vm_pre_destroy, + .vm_destroy = vt_vm_destroy, + + .vcpu_precreate = vt_vcpu_precreate, + .vcpu_create = vt_vcpu_create, + .vcpu_free = vt_vcpu_free, + .vcpu_reset = vt_vcpu_reset, - .prepare_switch_to_guest = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, + .prepare_switch_to_guest = vt_prepare_switch_to_guest, + .vcpu_load = vt_vcpu_load, + .vcpu_put = vt_vcpu_put, - .update_exception_bitmap = vmx_update_exception_bitmap, + .update_exception_bitmap = vt_update_exception_bitmap, .get_feature_msr = vmx_get_feature_msr, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cpl_no_cache = vmx_get_cpl_no_cache, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .is_valid_cr0 = vmx_is_valid_cr0, - .set_cr0 = vmx_set_cr0, - .is_valid_cr4 = vmx_is_valid_cr4, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .set_dr6 = vmx_set_dr6, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - .get_if_flag = vmx_get_if_flag, - - .flush_tlb_all = vmx_flush_tlb_all, - .flush_tlb_current = vmx_flush_tlb_current, - .flush_tlb_gva = vmx_flush_tlb_gva, - .flush_tlb_guest = vmx_flush_tlb_guest, - - .vcpu_pre_run = vmx_vcpu_pre_run, - .vcpu_run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, + .get_msr = vt_get_msr, + .set_msr = vt_set_msr, + + .get_segment_base = vt_get_segment_base, + .get_segment = vt_get_segment, + .set_segment = vt_set_segment, + .get_cpl = vt_get_cpl, + .get_cpl_no_cache = vt_get_cpl_no_cache, + .get_cs_db_l_bits = vt_get_cs_db_l_bits, + .is_valid_cr0 = vt_is_valid_cr0, + .set_cr0 = vt_set_cr0, + .is_valid_cr4 = vt_is_valid_cr4, + .set_cr4 = vt_set_cr4, + .set_efer = vt_set_efer, + .get_idt = vt_get_idt, + .set_idt 
= vt_set_idt, + .get_gdt = vt_get_gdt, + .set_gdt = vt_set_gdt, + .set_dr6 = vt_set_dr6, + .set_dr7 = vt_set_dr7, + .sync_dirty_debug_regs = vt_sync_dirty_debug_regs, + .cache_reg = vt_cache_reg, + .get_rflags = vt_get_rflags, + .set_rflags = vt_set_rflags, + .get_if_flag = vt_get_if_flag, + + .flush_tlb_all = vt_flush_tlb_all, + .flush_tlb_current = vt_flush_tlb_current, + .flush_tlb_gva = vt_flush_tlb_gva, + .flush_tlb_guest = vt_flush_tlb_guest, + + .vcpu_pre_run = vt_vcpu_pre_run, + .vcpu_run = vt_vcpu_run, + .handle_exit = vt_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .inject_irq = vmx_inject_irq, - .inject_nmi = vmx_inject_nmi, - .inject_exception = vmx_inject_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = vmx_enable_nmi_window, - .enable_irq_window = vmx_enable_irq_window, - .update_cr8_intercept = vmx_update_cr8_intercept, + .set_interrupt_shadow = vt_set_interrupt_shadow, + .get_interrupt_shadow = vt_get_interrupt_shadow, + .patch_hypercall = vt_patch_hypercall, + .inject_irq = vt_inject_irq, + .inject_nmi = vt_inject_nmi, + .inject_exception = vt_inject_exception, + .cancel_injection = vt_cancel_injection, + .interrupt_allowed = vt_interrupt_allowed, + .nmi_allowed = vt_nmi_allowed, + .get_nmi_mask = vt_get_nmi_mask, + .set_nmi_mask = vt_set_nmi_mask, + .enable_nmi_window = vt_enable_nmi_window, + .enable_irq_window = vt_enable_irq_window, + .update_cr8_intercept = vt_update_cr8_intercept, .x2apic_icr_is_split = false, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_pre_state_restore = vmx_apicv_pre_state_restore, + .set_virtual_apic_mode = vt_set_virtual_apic_mode, + .set_apic_access_page_addr = vt_set_apic_access_page_addr, + .refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = vt_load_eoi_exitmap, + .apicv_pre_state_restore = vt_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, - .hwapic_isr_update = vmx_hwapic_isr_update, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_interrupt = vmx_deliver_interrupt, + .hwapic_isr_update = vt_hwapic_isr_update, + .sync_pir_to_irr = vt_sync_pir_to_irr, + .deliver_interrupt = vt_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, + .set_tss_addr = vt_set_tss_addr, + .set_identity_map_addr = vt_set_identity_map_addr, .get_mt_mask = vmx_get_mt_mask, - .get_exit_info = vmx_get_exit_info, - .get_entry_info = vmx_get_entry_info, + .get_exit_info = vt_get_exit_info, + .get_entry_info = vt_get_entry_info, - .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, + .vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .get_l2_tsc_offset = vmx_get_l2_tsc_offset, - .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, - .write_tsc_offset = vmx_write_tsc_offset, - .write_tsc_multiplier = vmx_write_tsc_multiplier, + .get_l2_tsc_offset = 
vt_get_l2_tsc_offset, + .get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier, + .write_tsc_offset = vt_write_tsc_offset, + .write_tsc_multiplier = vt_write_tsc_multiplier, - .load_mmu_pgd = vmx_load_mmu_pgd, + .load_mmu_pgd = vt_load_mmu_pgd, .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .cpu_dirty_log_size = PML_LOG_NR_ENTRIES, - .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, + .update_cpu_dirty_logging = vt_update_cpu_dirty_logging, .nested_ops = &vmx_nested_ops, @@ -136,35 +1019,95 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .pi_start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, + .set_hv_timer = vt_set_hv_timer, + .cancel_hv_timer = vt_cancel_hv_timer, #endif - .setup_mce = vmx_setup_mce, + .setup_mce = vt_setup_mce, #ifdef CONFIG_KVM_SMM - .smi_allowed = vmx_smi_allowed, - .enter_smm = vmx_enter_smm, - .leave_smm = vmx_leave_smm, - .enable_smi_window = vmx_enable_smi_window, + .smi_allowed = vt_smi_allowed, + .enter_smm = vt_enter_smm, + .leave_smm = vt_leave_smm, + .enable_smi_window = vt_enable_smi_window, #endif - .check_emulate_instruction = vmx_check_emulate_instruction, - .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .check_emulate_instruction = vt_check_emulate_instruction, + .apic_init_signal_blocked = vt_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, - .msr_filter_changed = vmx_msr_filter_changed, - .complete_emulated_msr = kvm_complete_insn_gp, + .msr_filter_changed = vt_msr_filter_changed, + .complete_emulated_msr = vt_complete_emulated_msr, .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, .get_untagged_addr = vmx_get_untagged_addr, + + .mem_enc_ioctl = vt_mem_enc_ioctl, + .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl, + + .private_max_mapping_level = vt_gmem_private_max_mapping_level }; struct kvm_x86_init_ops vt_init_ops __initdata = { - .hardware_setup = vmx_hardware_setup, + .hardware_setup = vt_hardware_setup, .handle_intel_pt_intr = NULL, .runtime_ops = &vt_x86_ops, .pmu_ops = &intel_pmu_ops, }; + +static void __exit vt_exit(void) +{ + kvm_exit(); + tdx_cleanup(); + vmx_exit(); +} +module_exit(vt_exit); + +static int __init vt_init(void) +{ + unsigned vcpu_size, vcpu_align; + int r; + + r = vmx_init(); + if (r) + return r; + + /* tdx_init() has been taken */ + r = tdx_bringup(); + if (r) + goto err_tdx_bringup; + + /* + * TDX and VMX have different vCPU structures. Calculate the + * maximum size/align so that kvm_init() can use the larger + * values to create the kmem_vcpu_cache. + */ + vcpu_size = sizeof(struct vcpu_vmx); + vcpu_align = __alignof__(struct vcpu_vmx); + if (enable_tdx) { + vcpu_size = max_t(unsigned, vcpu_size, + sizeof(struct vcpu_tdx)); + vcpu_align = max_t(unsigned, vcpu_align, + __alignof__(struct vcpu_tdx)); + kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM); + } + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! 
+ */ + r = kvm_init(vcpu_size, vcpu_align, THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + tdx_cleanup(); +err_tdx_bringup: + vmx_exit(); + return r; +} +module_init(vt_init); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d268224227f0..71701e2414a4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -276,7 +276,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, { struct vmcs_host_state *dest, *src; - if (unlikely(!vmx->guest_state_loaded)) + if (unlikely(!vmx->vt.guest_state_loaded)) return; src = &prev->host_state; @@ -426,7 +426,7 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, * tables also changed, but KVM should not treat EPT Misconfig * VM-Exits as writes. */ - WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); + WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); /* * PML Full and EPT Violation VM-Exits both use bit 12 to report @@ -4623,7 +4623,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; - if (to_vmx(vcpu)->exit_reason.enclave_mode) + if (vmx_get_exit_reason(vcpu).enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; @@ -4795,7 +4795,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) @@ -6128,7 +6128,7 @@ fail: * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason. 
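The vt_init()/vt_exit() pair added to main.c above relies on the usual goto-based unwind: every setup step that succeeded has a label that undoes it, and a failure in a later step falls through the labels in reverse order. Below is a stand-alone sketch of that shape under illustrative step names, not the actual KVM entry points.

/*
 * Minimal sketch of LIFO error unwinding in a module-init style function.
 * step_c() is made to fail so the unwind path is exercised.
 */
#include <stdio.h>

static int  step_a(void) { puts("a: init"); return 0; }
static void undo_a(void) { puts("a: undone"); }
static int  step_b(void) { puts("b: init"); return 0; }
static void undo_b(void) { puts("b: undone"); }
static int  step_c(void) { puts("c: init (pretend it fails)"); return -1; }

static int module_init_sketch(void)
{
	int r;

	r = step_a();
	if (r)
		return r;

	r = step_b();
	if (r)
		goto err_b;

	r = step_c();
	if (r)
		goto err_c;

	return 0;

err_c:
	undo_b();	/* undo in reverse order of setup */
err_b:
	undo_a();
	return r;
}

int main(void)
{
	return module_init_sketch() ? 1 : 0;
}

The same fall-through ordering is why vt_init() can jump to err_kvm_init and still end up calling vmx_exit(): tearing down TDX first and VMX last mirrors the order in which they were brought up.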
*/ - nested_vmx_vmexit(vcpu, vmx->exit_reason.full, + nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; @@ -6573,7 +6573,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsigned long exit_qual; u32 exit_intr_info; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 231a9633359c..bbf4509f32d0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -20,6 +20,7 @@ #include "lapic.h" #include "nested.h" #include "pmu.h" +#include "tdx.h" /* * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX @@ -35,6 +36,24 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) +static struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return NULL; + + return &to_vmx(vcpu)->lbr_desc; +} + +static struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return NULL; + + return &to_vmx(vcpu)->lbr_desc.records; +} + +#pragma GCC poison to_vmx + static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { struct kvm_pmc *pmc; @@ -130,6 +149,22 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } +static bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return cpuid_model_is_consistent(vcpu); +} + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return !!vcpu_to_lbr_records(vcpu)->nr; +} + static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) { struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); @@ -195,6 +230,9 @@ static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu) { struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (!lbr_desc) + return; + if (lbr_desc->event) { perf_event_release_kernel(lbr_desc->event); lbr_desc->event = NULL; @@ -236,6 +274,9 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) PERF_SAMPLE_BRANCH_USER, }; + if (WARN_ON_ONCE(!lbr_desc)) + return 0; + if (unlikely(lbr_desc->event)) { __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); return 0; @@ -467,6 +508,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) u64 perf_capabilities; u64 counter_rsvd; + if (!lbr_desc) + return; + memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); /* @@ -543,7 +587,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); perf_capabilities = vcpu_get_perf_capabilities(vcpu); - if (cpuid_model_is_consistent(vcpu) && + if (intel_pmu_lbr_is_compatible(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else @@ -571,6 +615,9 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (!lbr_desc) + return; + for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; @@ -678,6 +725,9 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if 
(WARN_ON_ONCE(!lbr_desc)) + return; + if (!lbr_desc->event) { vmx_disable_lbr_msrs_passthrough(vcpu); if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h new file mode 100644 index 000000000000..5620d0882cdc --- /dev/null +++ b/arch/x86/kvm/vmx/pmu_intel.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_PMU_INTEL_H +#define __KVM_X86_VMX_PMU_INTEL_H + +#include <linux/kvm_host.h> + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); + +struct lbr_desc { + /* Basic info about guest LBR records. */ + struct x86_pmu_lbr records; + + /* + * Emulate LBR feature via passthrough LBR registers when the + * per-vcpu guest LBR event is scheduled on the current pcpu. + * + * The records may be inaccurate if the host reclaims the LBR. + */ + struct perf_event *event; + + /* True if LBRs are marked as not intercepted in the MSR bitmap */ + bool msr_passthrough; +}; + +extern struct x86_pmu_lbr vmx_lbr_caps; + +#endif /* __KVM_X86_VMX_PMU_INTEL_H */ diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index d70e5b90087d..99d1d599ff8c 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -11,6 +11,7 @@ #include "posted_intr.h" #include "trace.h" #include "vmx.h" +#include "tdx.h" /* * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() @@ -33,9 +34,9 @@ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); #define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING -static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { - return &(to_vmx(vcpu)->pi_desc); + return &(to_vt(vcpu)->pi_desc); } static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) @@ -55,7 +56,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; @@ -102,7 +103,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) */ raw_spin_lock(spinlock); spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); - list_del(&vmx->pi_wakeup_list); + list_del(&vt->pi_wakeup_list); spin_release(&spinlock->dep_map, _RET_IP_); raw_spin_unlock(spinlock); } @@ -159,7 +160,7 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; lockdep_assert_irqs_disabled(); @@ -178,7 +179,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), PI_LOCK_SCHED_OUT); - list_add_tail(&vmx->pi_wakeup_list, + list_add_tail(&vt->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); @@ -213,7 +214,8 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) * notification vector is switched to the one that calls * back to the pi_wakeup_handler() function. 
*/ - return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); + return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) || + vmx_can_use_vtd_pi(vcpu->kvm); } void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -223,7 +225,9 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) if (!vmx_needs_pi_wakeup(vcpu)) return; - if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + if (kvm_vcpu_is_blocking(vcpu) && + ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || + (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) pi_enable_wakeup_handler(vcpu); /* @@ -243,13 +247,13 @@ void pi_wakeup_handler(void) int cpu = smp_processor_id(); struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); - struct vcpu_vmx *vmx; + struct vcpu_vt *vt; raw_spin_lock(spinlock); - list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { + list_for_each_entry(vt, wakeup_list, pi_wakeup_list) { - if (pi_test_on(&vmx->pi_desc)) - kvm_vcpu_wake_up(&vmx->vcpu); + if (pi_test_on(&vt->pi_desc)) + kvm_vcpu_wake_up(vt_to_vcpu(vt)); } raw_spin_unlock(spinlock); } diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index ad9116a99bcc..68605ca7ef68 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -5,6 +5,8 @@ #include <linux/bitmap.h> #include <asm/posted_intr.h> +struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu); + void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c new file mode 100644 index 000000000000..b952bc673271 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.c @@ -0,0 +1,3526 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cleanup.h> +#include <linux/cpu.h> +#include <asm/cpufeature.h> +#include <asm/fpu/xcr.h> +#include <linux/misc_cgroup.h> +#include <linux/mmu_context.h> +#include <asm/tdx.h> +#include "capabilities.h" +#include "mmu.h" +#include "x86_ops.h" +#include "lapic.h" +#include "tdx.h" +#include "vmx.h" +#include "mmu/spte.h" +#include "common.h" +#include "posted_intr.h" +#include "irq.h" +#include <trace/events/kvm.h> +#include "trace.h" + +#pragma GCC poison to_vmx + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define pr_tdx_error(__fn, __err) \ + pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) + +#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) 
\ + pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) + +#define pr_tdx_error_1(__fn, __err, __rcx) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) + +#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) + +#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) + +bool enable_tdx __ro_after_init; +module_param_named(tdx, enable_tdx, bool, 0444); + +#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) +#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) + +static enum cpuhp_state tdx_cpuhp_state; + +static const struct tdx_sys_info *tdx_sysinfo; + +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); +} + +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); +} + +#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) + +static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_tdx, kvm); +} + +static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_tdx, vcpu); +} + +static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) +{ + u64 val = KVM_SUPPORTED_TD_ATTRS; + + if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) + return 0; + + val &= td_conf->attributes_fixed0; + + return val; +} + +static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) +{ + u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; + + if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) + return 0; + + val &= td_conf->xfam_fixed0; + + return val; +} + +static int tdx_get_guest_phys_addr_bits(const u32 eax) +{ + return (eax & GENMASK(23, 16)) >> 16; +} + +static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) +{ + return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; +} + +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) + +static bool has_tsx(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ebx & TDX_FEATURE_TSX); +} + +static void clear_tsx(struct kvm_cpuid_entry2 *entry) +{ + entry->ebx &= ~TDX_FEATURE_TSX; +} + +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); +} + +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) +{ + entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); +} + +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) +{ + if (has_tsx(entry)) + clear_tsx(entry); + + if (has_waitpkg(entry)) + clear_waitpkg(entry); +} + +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) +{ + return has_tsx(entry) || has_waitpkg(entry); +} + +#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) + +static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + + entry->function = (u32)td_conf->cpuid_config_leaves[idx]; + entry->index = td_conf->cpuid_config_leaves[idx] >> 
32; + entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; + entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; + entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; + entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; + + if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) + entry->index = 0; + + /* + * The TDX module doesn't allow configuring the guest phys addr bits + * (EAX[23:16]). However, KVM uses it as an interface to the userspace + * to configure the GPAW. Report these bits as configurable. + */ + if (entry->function == 0x80000008) + entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); + + tdx_clear_unsupported_cpuid(entry); +} + +static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, + struct kvm_tdx_capabilities *caps) +{ + int i; + + caps->supported_attrs = tdx_get_supported_attrs(td_conf); + if (!caps->supported_attrs) + return -EIO; + + caps->supported_xfam = tdx_get_supported_xfam(td_conf); + if (!caps->supported_xfam) + return -EIO; + + caps->cpuid.nent = td_conf->num_cpuid_config; + + for (i = 0; i < td_conf->num_cpuid_config; i++) + td_init_cpuid_entry2(&caps->cpuid.entries[i], i); + + return 0; +} + +/* + * Some SEAMCALLs acquire the TDX module globally, and can fail with + * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. + */ +static DEFINE_MUTEX(tdx_lock); + +static atomic_t nr_configured_hkid; + +static bool tdx_operand_busy(u64 err) +{ + return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; +} + + +/* + * A per-CPU list of TD vCPUs associated with a given CPU. + * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU + * list. + * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of + * the old CPU during the IPI callback running on the old CPU, and then added + * to the per-CPU list of the new CPU. + * - When a TD is tearing down, all vCPUs are disassociated from their current + * running CPUs and removed from the per-CPU list during the IPI callback + * running on those CPUs. + * - When a CPU is brought down, traverse the per-CPU list to disassociate all + * associated TD vCPUs and remove them from the per-CPU list. + */ +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); + +static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu) +{ + return to_tdx(vcpu)->vp_enter_args.r10; +} + +static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu) +{ + return to_tdx(vcpu)->vp_enter_args.r11; +} + +static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu, + long val) +{ + to_tdx(vcpu)->vp_enter_args.r10 = val; +} + +static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu, + unsigned long val) +{ + to_tdx(vcpu)->vp_enter_args.r11 = val; +} + +static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) +{ + tdx_guest_keyid_free(kvm_tdx->hkid); + kvm_tdx->hkid = -1; + atomic_dec(&nr_configured_hkid); + misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + put_misc_cg(kvm_tdx->misc_cg); + kvm_tdx->misc_cg = NULL; +} + +static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) +{ + return kvm_tdx->hkid > 0; +} + +static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) +{ + lockdep_assert_irqs_disabled(); + + list_del(&to_tdx(vcpu)->cpu_list); + + /* + * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, + * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU + * to its list before it's deleted from this CPU's list. 
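The barrier comment above is the writer half of a publish/consume pairing: tdx_disassociate_vp() must make the list removal visible before it publishes vcpu->cpu = -1, and tdx_vcpu_load() later issues the matching smp_rmb() before re-adding the vCPU to the new CPU's list. The stand-alone C11 sketch below uses release/acquire fences as rough stand-ins for smp_wmb()/smp_rmb(); the variable names are illustrative only.

/*
 * Minimal sketch of the ordering contract: the reader may only trust the
 * "removed from the old list" state after it has observed cpu == -1 and
 * issued the matching read barrier.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int removed_from_old_list;   /* stands in for the list_del() */
static atomic_int cpu = 7;                 /* stands in for vcpu->cpu */

static void disassociate(void)             /* runs on the old CPU */
{
	atomic_store_explicit(&removed_from_old_list, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);        /* ~ smp_wmb() */
	atomic_store_explicit(&cpu, -1, memory_order_relaxed);
}

static bool safe_to_add_to_new_list(void)  /* runs on the new CPU */
{
	if (atomic_load_explicit(&cpu, memory_order_relaxed) != -1)
		return false;                             /* still associated */
	atomic_thread_fence(memory_order_acquire);        /* ~ smp_rmb() */
	return atomic_load_explicit(&removed_from_old_list,
				    memory_order_relaxed) == 1;
}

int main(void)
{
	disassociate();
	printf("safe to re-add: %s\n", safe_to_add_to_new_list() ? "yes" : "no");
	return 0;
}

Without the fence pairing, the new CPU could observe cpu == -1 while the old CPU's list still contains the entry, which is exactly the double-listing hazard the comment above describes.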
+ */ + smp_wmb(); + + vcpu->cpu = -1; +} + +static void tdx_clear_page(struct page *page) +{ + const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); + void *dest = page_to_virt(page); + unsigned long i; + + /* + * The page could have been poisoned. MOVDIR64B also clears + * the poison bit so the kernel can safely use the page again. + */ + for (i = 0; i < PAGE_SIZE; i += 64) + movdir64b(dest + i, zero_page); + /* + * MOVDIR64B store uses WC buffer. Prevent following memory reads + * from seeing potentially poisoned cache. + */ + __mb(); +} + +static void tdx_no_vcpus_enter_start(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + lockdep_assert_held_write(&kvm->mmu_lock); + + WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); + + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + +static void tdx_no_vcpus_enter_stop(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + lockdep_assert_held_write(&kvm->mmu_lock); + + WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); +} + +/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ +static int __tdx_reclaim_page(struct page *page) +{ + u64 err, rcx, rdx, r8; + + err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8); + + /* + * No need to check for TDX_OPERAND_BUSY; all TD pages are freed + * before the HKID is released and control pages have also been + * released at this point, so there is no possibility of contention. + */ + if (WARN_ON_ONCE(err)) { + pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); + return -EIO; + } + return 0; +} + +static int tdx_reclaim_page(struct page *page) +{ + int r; + + r = __tdx_reclaim_page(page); + if (!r) + tdx_clear_page(page); + return r; +} + + +/* + * Reclaim the TD control page(s) which are crypto-protected by TDX guest's + * private KeyID. Assume the cache associated with the TDX private KeyID has + * been flushed. + */ +static void tdx_reclaim_control_page(struct page *ctrl_page) +{ + /* + * Leak the page if the kernel failed to reclaim the page. + * The kernel cannot use it safely anymore. + */ + if (tdx_reclaim_page(ctrl_page)) + return; + + __free_page(ctrl_page); +} + +struct tdx_flush_vp_arg { + struct kvm_vcpu *vcpu; + u64 err; +}; + +static void tdx_flush_vp(void *_arg) +{ + struct tdx_flush_vp_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + u64 err; + + arg->err = 0; + lockdep_assert_irqs_disabled(); + + /* Task migration can race with CPU offlining. */ + if (unlikely(vcpu->cpu != raw_smp_processor_id())) + return; + + /* + * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The + * list tracking still needs to be updated so that it's correct if/when + * the vCPU does get initialized. + */ + if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { + /* + * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: + * TDVPR as exclusive, TDR as shared, and TDCS as shared. This + * vp flush function is called when destructing vCPU/TD or vCPU + * migration. No other thread uses TDVPR in those cases. + */ + err = tdh_vp_flush(&to_tdx(vcpu)->vp); + if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { + /* + * This function is called in IPI context. Do not use + * printk to avoid console semaphore. + * The caller prints out the error message, instead. 
+ */ + if (err) + arg->err = err; + } + } + + tdx_disassociate_vp(vcpu); +} + +static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) +{ + struct tdx_flush_vp_arg arg = { + .vcpu = vcpu, + }; + int cpu = vcpu->cpu; + + if (unlikely(cpu == -1)) + return; + + smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); + if (KVM_BUG_ON(arg.err, vcpu->kvm)) + pr_tdx_error(TDH_VP_FLUSH, arg.err); +} + +void tdx_disable_virtualization_cpu(void) +{ + int cpu = raw_smp_processor_id(); + struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); + struct tdx_flush_vp_arg arg; + struct vcpu_tdx *tdx, *tmp; + unsigned long flags; + + local_irq_save(flags); + /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ + list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { + arg.vcpu = &tdx->vcpu; + tdx_flush_vp(&arg); + } + local_irq_restore(flags); +} + +#define TDX_SEAMCALL_RETRIES 10000 + +static void smp_func_do_phymem_cache_wb(void *unused) +{ + u64 err = 0; + bool resume; + int i; + + /* + * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private + * KeyID on the package or core. The TDX module may not finish the + * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The + * kernel should retry it until it returns success w/o rescheduling. + */ + for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { + resume = !!err; + err = tdh_phymem_cache_wb(resume); + switch (err) { + case TDX_INTERRUPTED_RESUMABLE: + continue; + case TDX_NO_HKID_READY_TO_WBCACHE: + err = TDX_SUCCESS; /* Already done by other thread */ + fallthrough; + default: + goto out; + } + } + +out: + if (WARN_ON_ONCE(err)) + pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); +} + +void tdx_mmu_release_hkid(struct kvm *kvm) +{ + bool packages_allocated, targets_allocated; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + cpumask_var_t packages, targets; + struct kvm_vcpu *vcpu; + unsigned long j; + int i; + u64 err; + + if (!is_hkid_assigned(kvm_tdx)) + return; + + packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); + targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); + cpus_read_lock(); + + kvm_for_each_vcpu(j, vcpu, kvm) + tdx_flush_vp_on_cpu(vcpu); + + /* + * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock + * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. + * Multiple TDX guests can be destroyed simultaneously. Take the + * mutex to prevent it from getting error. + */ + mutex_lock(&tdx_lock); + + /* + * Releasing HKID is in vm_destroy(). + * After the above flushing vps, there should be no more vCPU + * associations, as all vCPU fds have been released at this stage. + */ + err = tdh_mng_vpflushdone(&kvm_tdx->td); + if (err == TDX_FLUSHVP_NOT_DONE) + goto out; + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); + pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", + kvm_tdx->hkid); + goto out; + } + + for_each_online_cpu(i) { + if (packages_allocated && + cpumask_test_and_set_cpu(topology_physical_package_id(i), + packages)) + continue; + if (targets_allocated) + cpumask_set_cpu(i, targets); + } + if (targets_allocated) + on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); + else + on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); + /* + * In the case of error in smp_func_do_phymem_cache_wb(), the following + * tdh_mng_key_freeid() will fail. + */ + err = tdh_mng_key_freeid(&kvm_tdx->td); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_KEY_FREEID, err); + pr_err("tdh_mng_key_freeid() failed. 
HKID %d is leaked.\n", + kvm_tdx->hkid); + } else { + tdx_hkid_free(kvm_tdx); + } + +out: + mutex_unlock(&tdx_lock); + cpus_read_unlock(); + free_cpumask_var(targets); + free_cpumask_var(packages); +} + +static void tdx_reclaim_td_control_pages(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err; + int i; + + /* + * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong + * heavily with TDX module. Give up freeing TD pages. As the function + * already warned, don't warn it again. + */ + if (is_hkid_assigned(kvm_tdx)) + return; + + if (kvm_tdx->td.tdcs_pages) { + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (!kvm_tdx->td.tdcs_pages[i]) + continue; + + tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); + } + kfree(kvm_tdx->td.tdcs_pages); + kvm_tdx->td.tdcs_pages = NULL; + } + + if (!kvm_tdx->td.tdr_page) + return; + + if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) + return; + + /* + * Use a SEAMCALL to ask the TDX module to flush the cache based on the + * KeyID. TDX module may access TDR while operating on TD (Especially + * when it is reclaiming TDCS). + */ + err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return; + } + tdx_clear_page(kvm_tdx->td.tdr_page); + + __free_page(kvm_tdx->td.tdr_page); + kvm_tdx->td.tdr_page = NULL; +} + +void tdx_vm_destroy(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + tdx_reclaim_td_control_pages(kvm); + + kvm_tdx->state = TD_STATE_UNINITIALIZED; +} + +static int tdx_do_tdh_mng_key_config(void *param) +{ + struct kvm_tdx *kvm_tdx = param; + u64 err; + + /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ + err = tdh_mng_key_config(&kvm_tdx->td); + + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { + pr_tdx_error(TDH_MNG_KEY_CONFIG, err); + return -EIO; + } + + return 0; +} + +int tdx_vm_init(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + kvm->arch.has_protected_state = true; + kvm->arch.has_private_mem = true; + kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; + + /* + * Because guest TD is protected, VMM can't parse the instruction in TD. + * Instead, guest uses MMIO hypercall. For unmodified device driver, + * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO + * instruction into MMIO hypercall. + * + * SPTE value for MMIO needs to be setup so that #VE is injected into + * TD instead of triggering EPT MISCONFIG. + * - RWX=0 so that EPT violation is triggered. + * - suppress #VE bit is cleared to inject #VE. + */ + kvm_mmu_set_mmio_spte_value(kvm, 0); + + /* + * TDX has its own limit of maximum vCPUs it can support for all + * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports + * such limit via the MAX_VCPU_PER_TD global metadata. In + * practice, it reflects the number of logical CPUs that ALL + * platforms that the TDX module supports can possibly have. + * + * Limit TDX guest's maximum vCPUs to the number of logical CPUs + * the platform has. Simply forwarding the MAX_VCPU_PER_TD to + * userspace would result in an unpredictable ABI. + */ + kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus()); + + kvm_tdx->state = TD_STATE_UNINITIALIZED; + + return 0; +} + +int tdx_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (kvm_tdx->state != TD_STATE_INITIALIZED) + return -EIO; + + /* + * TDX module mandates APICv, which requires an in-kernel local APIC. 
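Because tdx_vcpu_create() above insists on a split irqchip, a VMM is expected to enable KVM_CAP_SPLIT_IRQCHIP on the VM before creating any vCPU. The trimmed user-space sketch below shows only that ordering: error handling is omitted, the routing count of 24 is just a conventional IOAPIC pin count, and the plain KVM_CREATE_VM type is a placeholder rather than a full TDX VM bring-up.

/*
 * Minimal sketch of the ioctl ordering: enable the split irqchip on the VM
 * fd before the first KVM_CREATE_VCPU.
 */
#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0 /* placeholder VM type */);

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_SPLIT_IRQCHIP,
		.args = { 24 },		/* IOAPIC routes handled in user space */
	};
	ioctl(vm, KVM_ENABLE_CAP, &cap);	/* must precede KVM_CREATE_VCPU */

	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);

	close(vcpu);
	close(vm);
	close(kvm);
	return 0;
}

With a full in-kernel irqchip instead of the split one, the vCPU creation in the hunk above would fail with -EINVAL.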
+ * Disallow an in-kernel I/O APIC, because level-triggered interrupts + * and thus the I/O APIC as a whole can't be faithfully emulated in KVM. + */ + if (!irqchip_split(vcpu->kvm)) + return -EINVAL; + + fpstate_set_confidential(&vcpu->arch.guest_fpu); + vcpu->arch.apic->guest_apic_protected = true; + INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list); + + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; + + vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; + vcpu->arch.cr0_guest_owned_bits = -1ul; + vcpu->arch.cr4_guest_owned_bits = -1ul; + + /* KVM can't change TSC offset/multiplier as TDX module manages them. */ + vcpu->arch.guest_tsc_protected = true; + vcpu->arch.tsc_offset = kvm_tdx->tsc_offset; + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; + vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier; + vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier; + + vcpu->arch.guest_state_protected = + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG); + + if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) + vcpu->arch.xfd_no_write_intercept = true; + + tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&tdx->vt.pi_desc); + + tdx->state = VCPU_TD_STATE_UNINITIALIZED; + + return 0; +} + +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + vmx_vcpu_pi_load(vcpu, cpu); + if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) + return; + + tdx_flush_vp_on_cpu(vcpu); + + KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); + local_irq_disable(); + /* + * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure + * vcpu->cpu is read before tdx->cpu_list. + */ + smp_rmb(); + + list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); + local_irq_enable(); +} + +bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* + * KVM can't get the interrupt status of TDX guest and it assumes + * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, + * which passes the interrupt blocked flag. + */ + return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || + !to_tdx(vcpu)->vp_enter_args.r12; +} + +bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + u64 vcpu_state_details; + + if (pi_has_pending_interrupt(vcpu)) + return true; + + /* + * Only check RVI pending for HALTED case with IRQ enabled. + * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the + * interrupt was pending before TD exit, then it _must_ be blocked, + * otherwise the interrupt would have been serviced at the instruction + * boundary. + */ + if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || + to_tdx(vcpu)->vp_enter_args.r12) + return false; + + vcpu_state_details = + td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); + + return tdx_vcpu_state_details_intr_pending(vcpu_state_details); +} + +/* + * Compared to vmx_prepare_switch_to_guest(), there is not much to do + * as SEAMCALL/SEAMRET calls take care of most of save and restore. 
+ */ +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (vt->guest_state_loaded) + return; + + if (likely(is_64bit_mm(current->mm))) + vt->msr_host_kernel_gs_base = current->thread.gsbase; + else + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + + vt->host_debugctlmsr = get_debugctlmsr(); + + vt->guest_state_loaded = true; +} + +struct tdx_uret_msr { + u32 msr; + unsigned int slot; + u64 defval; +}; + +static struct tdx_uret_msr tdx_uret_msrs[] = { + {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, + {.msr = MSR_STAR,}, + {.msr = MSR_LSTAR,}, + {.msr = MSR_TSC_AUX,}, +}; + +static void tdx_user_return_msr_update_cache(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) + kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, + tdx_uret_msrs[i].defval); +} + +static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (!vt->guest_state_loaded) + return; + + ++vcpu->stat.host_state_reload; + wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); + + if (tdx->guest_entered) { + tdx_user_return_msr_update_cache(); + tdx->guest_entered = false; + } + + vt->guest_state_loaded = false; +} + +void tdx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); + tdx_prepare_switch_to_host(vcpu); +} + +void tdx_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + int i; + + /* + * It is not possible to reclaim pages while hkid is assigned. It might + * be assigned if: + * 1. the TD VM is being destroyed but freeing hkid failed, in which + * case the pages are leaked + * 2. TD VCPU creation failed and this on the error path, in which case + * there is nothing to do anyway + */ + if (is_hkid_assigned(kvm_tdx)) + return; + + if (tdx->vp.tdcx_pages) { + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + if (tdx->vp.tdcx_pages[i]) + tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]); + } + kfree(tdx->vp.tdcx_pages); + tdx->vp.tdcx_pages = NULL; + } + if (tdx->vp.tdvpr_page) { + tdx_reclaim_control_page(tdx->vp.tdvpr_page); + tdx->vp.tdvpr_page = 0; + } + + tdx->state = VCPU_TD_STATE_UNINITIALIZED; +} + +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED || + to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE)) + return -EINVAL; + + return 1; +} + +static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) +{ + switch (tdvmcall_leaf(vcpu)) { + case EXIT_REASON_CPUID: + case EXIT_REASON_HLT: + case EXIT_REASON_IO_INSTRUCTION: + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + return tdvmcall_leaf(vcpu); + case EXIT_REASON_EPT_VIOLATION: + return EXIT_REASON_EPT_MISCONFIG; + default: + break; + } + + return EXIT_REASON_TDCALL; +} + +static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u32 exit_reason; + + switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) { + case TDX_SUCCESS: + case TDX_NON_RECOVERABLE_VCPU: + case TDX_NON_RECOVERABLE_TD: + case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE: + case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE: + break; + default: + return -1u; + } + + exit_reason = tdx->vp_enter_ret; + + switch (exit_reason) { + case EXIT_REASON_TDCALL: + if (tdvmcall_exit_type(vcpu)) + return EXIT_REASON_VMCALL; + + return tdcall_to_vmx_exit_reason(vcpu); + case EXIT_REASON_EPT_MISCONFIG: + /* + * Defer 
KVM_BUG_ON() until tdx_handle_exit() because this is in
+ * non-instrumentable code with interrupts disabled.
+ */
+ return -1u;
+ default:
+ break;
+ }
+
+ return exit_reason;
+}
+
+static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ guest_state_enter_irqoff();
+
+ tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
+
+ vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
+
+ vt->exit_qualification = tdx->vp_enter_args.rcx;
+ tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
+ tdx->exit_gpa = tdx->vp_enter_args.r8;
+ vt->exit_intr_info = tdx->vp_enter_args.r9;
+
+ vmx_handle_nmi(vcpu);
+
+ guest_state_exit_irqoff();
+}
+
+static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_exit_reason(vcpu).failed_vmentry &&
+ vmx_get_exit_reason(vcpu).full != -1u;
+}
+
+static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
+
+ /*
+ * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
+ * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
+ *
+ * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
+ * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
+ * vCPUs to leave the fastpath so that interrupts can be enabled to
+ * ensure the IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED
+ * instead of EXIT_FASTPATH_REENTER_GUEST to exit the fastpath,
+ * otherwise the requester may be blocked endlessly.
+ */
+ if (unlikely(tdx_operand_busy(vp_enter_ret)))
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ return EXIT_FASTPATH_NONE;
+}
+
+#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
+ BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
+ BIT_ULL(VCPU_REGS_RAX) | \
+ BIT_ULL(VCPU_REGS_RBX) | \
+ BIT_ULL(VCPU_REGS_RCX) | \
+ BIT_ULL(VCPU_REGS_RDX) | \
+ BIT_ULL(VCPU_REGS_RBP) | \
+ BIT_ULL(VCPU_REGS_RSI) | \
+ BIT_ULL(VCPU_REGS_RDI) | \
+ BIT_ULL(VCPU_REGS_R8) | \
+ BIT_ULL(VCPU_REGS_R9) | \
+ BIT_ULL(VCPU_REGS_R10) | \
+ BIT_ULL(VCPU_REGS_R11) | \
+ BIT_ULL(VCPU_REGS_R12) | \
+ BIT_ULL(VCPU_REGS_R13) | \
+ BIT_ULL(VCPU_REGS_R14) | \
+ BIT_ULL(VCPU_REGS_R15))
+
+static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+
+ /*
+ * All TDX hosts support PKRU, but even if they didn't,
+ * vcpu->arch.host_pkru would be 0 and the wrpkru would be
+ * skipped.
+ */
+ if (vcpu->arch.host_pkru != 0)
+ wrpkru(vcpu->arch.host_pkru);
+
+ if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+
+ /*
+ * Likewise, even if a TDX host didn't support XSS, both arms of
+ * the comparison would be 0 and the wrmsrl would be skipped.
+ */
+ if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
+ wrmsrl(MSR_IA32_XSS, kvm_host.xss);
+}
+
+#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
+ DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
+ DEBUGCTLMSR_FREEZE_IN_SMM)
+
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ /*
+ * force_immediate_exit requires entering the vCPU for event injection,
+ * followed by an immediate exit. But the TDX module doesn't guarantee
+ * entry: it's already possible for KVM to _think_ it completely entered
+ * the guest without actually having done so. 
+ * Since KVM never needs to force an immediate exit for TDX, and can't + * do direct injection, just warn on force_immediate_exit. + */ + WARN_ON_ONCE(force_immediate_exit); + + /* + * Wait until retry of SEPT-zap-related SEAMCALL completes before + * allowing vCPU entry to avoid contention with tdh_vp_enter() and + * TDCALLs. + */ + if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) + return EXIT_FASTPATH_EXIT_HANDLED; + + trace_kvm_entry(vcpu, force_immediate_exit); + + if (pi_test_on(&vt->pi_desc)) { + apic->send_IPI_self(POSTED_INTR_VECTOR); + + if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) & + APIC_VECTOR_MASK, &vt->pi_desc)) + kvm_wait_lapic_expire(vcpu); + } + + tdx_vcpu_enter_exit(vcpu); + + if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED) + update_debugctlmsr(vt->host_debugctlmsr); + + tdx_load_host_xsave_state(vcpu); + tdx->guest_entered = true; + + vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; + + if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) + return EXIT_FASTPATH_NONE; + + if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) + return EXIT_FASTPATH_NONE; + + if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) + kvm_machine_check(); + + trace_kvm_exit(vcpu, KVM_ISA_VMX); + + if (unlikely(tdx_failed_vmentry(vcpu))) + return EXIT_FASTPATH_NONE; + + return tdx_exit_handlers_fastpath(vcpu); +} + +void tdx_inject_nmi(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.nmi_injections; + td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1); + /* + * From KVM's perspective, NMI injection is completed right after + * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by + * the TDX module or not. + */ + vcpu->arch.nmi_injected = false; + /* + * TDX doesn't support KVM to request NMI window exit. If there is + * still a pending vNMI, KVM is not able to inject it along with the + * one pending in TDX module in a back-to-back way. Since the previous + * vNMI is still pending in TDX module, i.e. it has not been delivered + * to TDX guest yet, it's OK to collapse the pending vNMI into the + * previous one. The guest is expected to handle all the NMI sources + * when handling the first vNMI. + */ + vcpu->arch.nmi_pending = 0; +} + +static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) +{ + u32 intr_info = vmx_get_intr_info(vcpu); + + /* + * Machine checks are handled by handle_exception_irqoff(), or by + * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on + * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). + */ + if (is_nmi(intr_info) || is_machine_check(intr_info)) + return 1; + + vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; + vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; + vcpu->run->ex.error_code = 0; + + return 0; +} + +static int complete_hypercall_exit(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); + return 1; +} + +static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) +{ + kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); + kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); + kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); + kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); + kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); + + return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); +} + +/* + * Split into chunks and check interrupt pending between chunks. This allows + * for timely injection of interrupts to prevent issues with guest lockup + * detection. 
+ */ +#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) +static void __tdx_map_gpa(struct vcpu_tdx *tdx); + +static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (vcpu->run->hypercall.ret) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; + } + + tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; + if (tdx->map_gpa_next >= tdx->map_gpa_end) + return 1; + + /* + * Stop processing the remaining part if there is a pending interrupt, + * which could be qualified to deliver. Skip checking pending RVI for + * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). + */ + if (kvm_vcpu_has_events(vcpu)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; + } + + __tdx_map_gpa(tdx); + return 0; +} + +static void __tdx_map_gpa(struct vcpu_tdx *tdx) +{ + u64 gpa = tdx->map_gpa_next; + u64 size = tdx->map_gpa_end - tdx->map_gpa_next; + + if (size > TDX_MAP_GPA_MAX_LEN) + size = TDX_MAP_GPA_MAX_LEN; + + tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; + tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + tdx->vcpu.run->hypercall.ret = 0; + tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; + tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? + KVM_MAP_GPA_RANGE_ENCRYPTED : + KVM_MAP_GPA_RANGE_DECRYPTED; + tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; + + tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; +} + +static int tdx_map_gpa(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + u64 ret; + + /* + * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires + * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE + * bit set. If not, the error code is not defined in GHCI for TDX, use + * TDVMCALL_STATUS_INVALID_OPERAND for this case. 
+ */ + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + ret = TDVMCALL_STATUS_INVALID_OPERAND; + goto error; + } + + if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || + !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || + (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != + vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { + ret = TDVMCALL_STATUS_INVALID_OPERAND; + goto error; + } + + if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { + ret = TDVMCALL_STATUS_ALIGN_ERROR; + goto error; + } + + tdx->map_gpa_end = gpa + size; + tdx->map_gpa_next = gpa; + + __tdx_map_gpa(tdx); + return 0; + +error: + tdvmcall_set_return_code(vcpu, ret); + tdx->vp_enter_args.r11 = gpa; + return 1; +} + +static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 *regs = vcpu->run->system_event.data; + u64 *module_regs = &tdx->vp_enter_args.r8; + int index = VCPU_REGS_RAX; + + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; + vcpu->run->system_event.ndata = 16; + + /* Dump 16 general-purpose registers to userspace in ascending order. */ + regs[index++] = tdx->vp_enter_ret; + regs[index++] = tdx->vp_enter_args.rcx; + regs[index++] = tdx->vp_enter_args.rdx; + regs[index++] = tdx->vp_enter_args.rbx; + regs[index++] = 0; + regs[index++] = 0; + regs[index++] = tdx->vp_enter_args.rsi; + regs[index] = tdx->vp_enter_args.rdi; + for (index = 0; index < 8; index++) + regs[VCPU_REGS_R8 + index] = module_regs[index]; + + return 0; +} + +static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 eax, ebx, ecx, edx; + struct vcpu_tdx *tdx = to_tdx(vcpu); + + /* EAX and ECX for cpuid is stored in R12 and R13. */ + eax = tdx->vp_enter_args.r12; + ecx = tdx->vp_enter_args.r13; + + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); + + tdx->vp_enter_args.r12 = eax; + tdx->vp_enter_args.r13 = ebx; + tdx->vp_enter_args.r14 = ecx; + tdx->vp_enter_args.r15 = edx; + + return 1; +} + +static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pio.count = 0; + return 1; +} + +static int tdx_complete_pio_in(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + unsigned long val = 0; + int ret; + + ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + + WARN_ON_ONCE(!ret); + + tdvmcall_set_return_val(vcpu, val); + + return 1; +} + +static int tdx_emulate_io(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + unsigned long val = 0; + unsigned int port; + u64 size, write; + int ret; + + ++vcpu->stat.io_exits; + + size = tdx->vp_enter_args.r12; + write = tdx->vp_enter_args.r13; + port = tdx->vp_enter_args.r14; + + if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + if (write) { + val = tdx->vp_enter_args.r15; + ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1); + } else { + ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1); + } + + if (!ret) + vcpu->arch.complete_userspace_io = write ? 
tdx_complete_pio_out : + tdx_complete_pio_in; + else if (!write) + tdvmcall_set_return_val(vcpu, val); + + return ret; +} + +static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu) +{ + unsigned long val = 0; + gpa_t gpa; + int size; + + gpa = vcpu->mmio_fragments[0].gpa; + size = vcpu->mmio_fragments[0].len; + + memcpy(&val, vcpu->run->mmio.data, size); + tdvmcall_set_return_val(vcpu, val); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); + return 1; +} + +static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size, + unsigned long val) +{ + if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + trace_kvm_fast_mmio(gpa); + return 0; + } + + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val); + if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val)) + return -EOPNOTSUPP; + + return 0; +} + +static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size) +{ + unsigned long val; + + if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val)) + return -EOPNOTSUPP; + + tdvmcall_set_return_val(vcpu, val); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); + return 0; +} + +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + int size, write, r; + unsigned long val; + gpa_t gpa; + + size = tdx->vp_enter_args.r12; + write = tdx->vp_enter_args.r13; + gpa = tdx->vp_enter_args.r14; + val = write ? tdx->vp_enter_args.r15 : 0; + + if (size != 1 && size != 2 && size != 4 && size != 8) + goto error; + if (write != 0 && write != 1) + goto error; + + /* + * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to + * do MMIO emulation for private GPA. + */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) || + vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) + goto error; + + gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + + if (write) + r = tdx_mmio_write(vcpu, gpa, size, val); + else + r = tdx_mmio_read(vcpu, gpa, size); + if (!r) + /* Kernel completed device emulation. */ + return 1; + + /* Request the device emulation to userspace device model. */ + vcpu->mmio_is_write = write; + if (!write) + vcpu->arch.complete_userspace_io = tdx_complete_mmio_read; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = size; + vcpu->run->mmio.is_write = write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + if (write) { + memcpy(vcpu->run->mmio.data, &val, size); + } else { + vcpu->mmio_fragments[0].gpa = gpa; + vcpu->mmio_fragments[0].len = size; + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL); + } + return 0; + +error: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; +} + +static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (tdx->vp_enter_args.r12) + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + else { + tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r13 = 0; + tdx->vp_enter_args.r14 = 0; + } + return 1; +} + +static int handle_tdvmcall(struct kvm_vcpu *vcpu) +{ + switch (tdvmcall_leaf(vcpu)) { + case TDVMCALL_MAP_GPA: + return tdx_map_gpa(vcpu); + case TDVMCALL_REPORT_FATAL_ERROR: + return tdx_report_fatal_error(vcpu); + case TDVMCALL_GET_TD_VM_CALL_INFO: + return tdx_get_td_vm_call_info(vcpu); + default: + break; + } + + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; +} + +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) +{ + u64 shared_bit = (pgd_level == 5) ? 
TDX_SHARED_BIT_PWL_5 :
+ TDX_SHARED_BIT_PWL_4;
+
+ if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
+ return;
+
+ td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
+}
+
+static void tdx_unpin(struct kvm *kvm, struct page *page)
+{
+ put_page(page);
+}
+
+static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, struct page *page)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 entry, level_state;
+ u64 err;
+
+ err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err))) {
+ tdx_unpin(kvm, page);
+ return -EBUSY;
+ }
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
+ tdx_unpin(kvm, page);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
+ * callback tdx_gmem_post_populate() then maps pages into private memory
+ * through the SEAMCALL TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
+ * private EPT structures for the page to have been built beforehand, which is
+ * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
+ * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
+ * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
+ * are no half-initialized shared EPT pages.
+ */
+static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
+ return -EINVAL;
+
+ /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
+ atomic64_inc(&kvm_tdx->nr_premapped);
+ return 0;
+}
+
+int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
+
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return -EINVAL;
+
+ /*
+ * Because guest_memfd doesn't support page migration with
+ * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
+ * migration. Until guest_memfd supports page migration, prevent it
+ * here.
+ * TODO: Once guest_memfd introduces a callback for page migration,
+ * implement it and remove the get_page()/put_page().
+ */
+ get_page(page);
+
+ /*
+ * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
+ * barrier in tdx_td_finalize().
+ */
+ smp_rmb();
+ if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
+ return tdx_mem_page_aug(kvm, gfn, level, page);
+
+ return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
+}
+
+static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, struct page *page)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return -EINVAL;
+
+ if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
+ return -EINVAL;
+
+ /*
+ * When zapping a private page, the write lock is held, so there is no
+ * race with other vCPUs' SEPT operations. Races with TDH.VP.ENTER (due
+ * to 0-step mitigation) and guest TDCALLs are still possible. 
+ */ + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + + if (unlikely(tdx_operand_busy(err))) { + /* + * The second retry is expected to succeed after kicking off all + * other vCPUs and prevent them from invoking TDH.VP.ENTER. + */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + tdx_no_vcpus_enter_stop(kvm); + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); + return -EIO; + } + + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return -EIO; + } + tdx_clear_page(page); + tdx_unpin(kvm, page); + return 0; +} + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + gpa_t gpa = gfn_to_gpa(gfn); + struct page *page = virt_to_page(private_spt); + u64 err, entry, level_state; + + err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, + &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + return -EIO; + } + + return 0; +} + +/* + * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is + * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called + * successfully. + * + * Since tdh_mem_sept_add() must have been invoked successfully before a + * non-leaf entry present in the mirrored page table, the SEPT ZAP related + * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead + * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the + * SEPT. + * + * Further check if the returned entry from SEPT walking is with RWX permissions + * to filter out anything unexpected. + * + * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from + * level_state returned from a SEAMCALL error is the same as that passed into + * the SEAMCALL. + */ +static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, + u64 entry, int level) +{ + if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) + return false; + + if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) + return false; + + if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) + return false; + + return true; +} + +static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); + u64 err, entry, level_state; + + /* For now large page isn't supported yet. 
*/
+ WARN_ON_ONCE(level != PG_LEVEL_4K);
+
+ err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
+
+ if (unlikely(tdx_operand_busy(err))) {
+ /* After no vCPUs enter, the second retry is expected to succeed */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
+ if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
+ !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
+ atomic64_dec(&kvm_tdx->nr_premapped);
+ tdx_unpin(kvm, page);
+ return 0;
+ }
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
+ return -EIO;
+ }
+ return 1;
+}
+
+/*
+ * Ensure shared and private EPTs are flushed on all vCPUs.
+ * tdh_mem_track() is the only caller that increases the TD epoch. An increase
+ * in the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
+ * running in guest mode with the value "N - 1".
+ *
+ * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
+ * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
+ * is increased to "N + 1".
+ *
+ * Kicking off all vCPUs after that further ensures that no vCPUs can run in
+ * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
+ * (e.g. to increase the TD epoch to "N + 2").
+ *
+ * The TDX module will flush the EPT on the next TD enter and make vCPUs run in
+ * guest mode with TD epoch value "N + 1".
+ *
+ * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
+ * waiting for the empty IPI handler ack_kick().
+ *
+ * No action is required for the vCPUs being kicked off, since the kick is
+ * guaranteed to occur after the TD epoch increment and before the next
+ * tdh_mem_track().
+ */
+static void tdx_track(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err;
+
+ /* If the TD isn't finalized, no vCPU has run yet. */
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ err = tdh_mem_track(&kvm_tdx->td);
+ if (unlikely(tdx_operand_busy(err))) {
+ /* After no vCPUs enter, the second retry is expected to succeed */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_track(&kvm_tdx->td);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
+
+ if (KVM_BUG_ON(err, kvm))
+ pr_tdx_error(TDH_MEM_TRACK, err);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ /*
+ * free_external_spt() is only called after the hkid is freed, when the
+ * TD is being torn down.
+ * KVM doesn't (yet) zap page table pages in the mirror page table while
+ * the TD is active, though guest pages mapped in the mirror page table
+ * could be zapped while the TD is active, e.g. for shared <-> private
+ * conversion and slot move/deletion.
+ */
+ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
+ return -EINVAL;
+
+ /*
+ * The HKID assigned to this TD was already freed and the cache was
+ * already flushed. We don't have to flush again.
+ */
+ return tdx_reclaim_page(virt_to_page(private_spt));
+}
+
+int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+ int ret;
+
+ /*
+ * HKID is released after all private pages have been removed, and set
+ * before any might be populated. 
Warn if zapping is attempted when + * there can't be anything populated in the private EPT. + */ + if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) + return -EINVAL; + + ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); + if (ret <= 0) + return ret; + + /* + * TDX requires TLB tracking before dropping private page. Do + * it here, although it is also done later. + */ + tdx_track(kvm); + + return tdx_sept_drop_private_spte(kvm, gfn, level, page); +} + +void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + struct vcpu_tdx *tdx = to_tdx(vcpu); + + /* TDX supports only posted interrupt. No lapic emulation. */ + __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); +} + +static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) +{ + u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; + u64 eq = vmx_get_exit_qual(vcpu); + + if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) + return false; + + return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); +} + +static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual; + gpa_t gpa = to_tdx(vcpu)->exit_gpa; + bool local_retry = false; + int ret; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + if (tdx_is_sept_violation_unexpected_pending(vcpu)) { + pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", + gpa, vcpu->vcpu_id); + kvm_vm_dead(vcpu->kvm); + return -EIO; + } + /* + * Always treat SEPT violations as write faults. Ignore the + * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. + * TD private pages are always RWX in the SEPT tables, + * i.e. they're always mapped writable. Just as importantly, + * treating SEPT violations as write faults is necessary to + * avoid COW allocations, which will cause TDAUGPAGE failures + * due to aliasing a single HPA to multiple GPAs. + */ + exit_qual = EPT_VIOLATION_ACC_WRITE; + + /* Only private GPA triggers zero-step mitigation */ + local_retry = true; + } else { + exit_qual = vmx_get_exit_qual(vcpu); + /* + * EPT violation due to instruction fetch should never be + * triggered from shared memory in TDX guest. If such EPT + * violation occurs, treat it as broken hardware. + */ + if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) + return -EIO; + } + + trace_kvm_page_fault(vcpu, gpa, exit_qual); + + /* + * To minimize TDH.VP.ENTER invocations, retry locally for private GPA + * mapping in TDX. + * + * KVM may return RET_PF_RETRY for private GPA due to + * - contentions when atomically updating SPTEs of the mirror page table + * - in-progress GFN invalidation or memslot removal. + * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, + * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) + * or certain TDCALLs. + * + * If TDH.VP.ENTER is invoked more times than the threshold set by the + * TDX module before KVM resolves the private GPA mapping, the TDX + * module will activate zero-step mitigation during TDH.VP.ENTER. This + * process acquires an SEPT tree lock in the TDX module, leading to + * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD + * operations on other vCPUs. + * + * Breaking out of local retries for kvm_vcpu_has_events() is for + * interrupt injection. 
kvm_vcpu_has_events() should not see pending + * events for TDX. Since KVM can't determine if IRQs (or NMIs) are + * blocked by TDs, false positives are inevitable i.e., KVM may re-enter + * the guest even if the IRQ/NMI can't be delivered. + * + * Note: even without breaking out of local retries, zero-step + * mitigation may still occur due to + * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, + * - a single RIP causing EPT violations for more GFNs than the + * threshold count. + * This is safe, as triggering zero-step mitigation only introduces + * contentions to page installation SEAMCALLs on other vCPUs, which will + * handle retries locally in their EPT violation handlers. + */ + while (1) { + ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); + + if (ret != RET_PF_RETRY || !local_retry) + break; + + if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) + break; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { + ret = -EIO; + break; + } + + cond_resched(); + } + return ret; +} + +int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) + tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); + + return 1; +} + + +int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 vp_enter_ret = tdx->vp_enter_ret; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); + + if (fastpath != EXIT_FASTPATH_NONE) + return 1; + + if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { + KVM_BUG_ON(1, vcpu->kvm); + return -EIO; + } + + /* + * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and + * TDX_SEAMCALL_VMFAILINVALID. + */ + if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { + KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); + goto unhandled_exit; + } + + if (unlikely(tdx_failed_vmentry(vcpu))) { + /* + * If the guest state is protected, that means off-TD debug is + * not enabled, TDX_NON_RECOVERABLE must be set. 
+ */ + WARN_ON_ONCE(vcpu->arch.guest_state_protected && + !(vp_enter_ret & TDX_NON_RECOVERABLE)); + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; + return 0; + } + + if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && + exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { + kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); + goto unhandled_exit; + } + + WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && + (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); + + switch (exit_reason.basic) { + case EXIT_REASON_TRIPLE_FAULT: + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + return 0; + case EXIT_REASON_EXCEPTION_NMI: + return tdx_handle_exception_nmi(vcpu); + case EXIT_REASON_EXTERNAL_INTERRUPT: + ++vcpu->stat.irq_exits; + return 1; + case EXIT_REASON_CPUID: + return tdx_emulate_cpuid(vcpu); + case EXIT_REASON_HLT: + return kvm_emulate_halt_noskip(vcpu); + case EXIT_REASON_TDCALL: + return handle_tdvmcall(vcpu); + case EXIT_REASON_VMCALL: + return tdx_emulate_vmcall(vcpu); + case EXIT_REASON_IO_INSTRUCTION: + return tdx_emulate_io(vcpu); + case EXIT_REASON_MSR_READ: + kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); + return kvm_emulate_rdmsr(vcpu); + case EXIT_REASON_MSR_WRITE: + kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); + kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); + kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); + return kvm_emulate_wrmsr(vcpu); + case EXIT_REASON_EPT_MISCONFIG: + return tdx_emulate_mmio(vcpu); + case EXIT_REASON_EPT_VIOLATION: + return tdx_handle_ept_violation(vcpu); + case EXIT_REASON_OTHER_SMI: + /* + * Unlike VMX, SMI in SEAM non-root mode (i.e. when + * TD guest vCPU is running) will cause VM exit to TDX module, + * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered + * and handled by kernel handler right away. + * + * The Other SMI exit can also be caused by the SEAM non-root + * machine check delivered via Machine Check System Management + * Interrupt (MSMI), but it has already been handled by the + * kernel machine check handler, i.e., the memory page has been + * marked as poisoned and it won't be freed to the free list + * when the TDX guest is terminated (the TDX module marks the + * guest as dead and prevent it from further running when + * machine check happens in SEAM non-root). + * + * - A MSMI will not reach here, it's handled as non_recoverable + * case above. + * - If it's not an MSMI, no need to do anything here. 
+ */ + return 1; + default: + break; + } + +unhandled_exit: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vp_enter_ret; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + return 0; +} + +void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + *reason = tdx->vt.exit_reason.full; + if (*reason != -1u) { + *info1 = vmx_get_exit_qual(vcpu); + *info2 = tdx->ext_exit_qualification; + *intr_info = vmx_get_intr_info(vcpu); + } else { + *info1 = 0; + *info2 = 0; + *intr_info = 0; + } + + *error_code = 0; +} + +bool tdx_has_emulated_msr(u32 index) +{ + switch (index) { + case MSR_IA32_UCODE_REV: + case MSR_IA32_ARCH_CAPABILITIES: + case MSR_IA32_POWER_CTL: + case MSR_IA32_CR_PAT: + case MSR_MTRRcap: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_TSC_DEADLINE: + case MSR_IA32_MISC_ENABLE: + case MSR_PLATFORM_INFO: + case MSR_MISC_FEATURES_ENABLES: + case MSR_IA32_APICBASE: + case MSR_EFER: + case MSR_IA32_FEAT_CTL: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_EXT_CTL: + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ + case MSR_KVM_POLL_CONTROL: + return true; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: + /* + * x2APIC registers that are virtualized by the CPU can't be + * emulated, KVM doesn't have access to the virtual APIC page. + */ + switch (index) { + case X2APIC_MSR(APIC_TASKPRI): + case X2APIC_MSR(APIC_PROCPRI): + case X2APIC_MSR(APIC_EOI): + case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): + case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): + case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): + return false; + default: + return true; + } + default: + return false; + } +} + +static bool tdx_is_read_only_msr(u32 index) +{ + return index == MSR_IA32_APICBASE || index == MSR_EFER || + index == MSR_IA32_FEAT_CTL; +} + +int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + switch (msr->index) { + case MSR_IA32_FEAT_CTL: + /* + * MCE and MCA are advertised via cpuid. Guest kernel could + * check if LMCE is enabled or not. 
+ */ + msr->data = FEAT_CTL_LOCKED; + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + msr->data |= FEAT_CTL_LMCE_ENABLED; + return 0; + case MSR_IA32_MCG_EXT_CTL: + if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) + return 1; + msr->data = vcpu->arch.mcg_ext_ctl; + return 0; + default: + if (!tdx_has_emulated_msr(msr->index)) + return 1; + + return kvm_get_msr_common(vcpu, msr); + } +} + +int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + switch (msr->index) { + case MSR_IA32_MCG_EXT_CTL: + if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || + (msr->data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = msr->data; + return 0; + default: + if (tdx_is_read_only_msr(msr->index)) + return 1; + + if (!tdx_has_emulated_msr(msr->index)) + return 1; + + return kvm_set_msr_common(vcpu, msr); + } +} + +static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + struct kvm_tdx_capabilities __user *user_caps; + struct kvm_tdx_capabilities *caps = NULL; + int ret = 0; + + /* flags is reserved for future use */ + if (cmd->flags) + return -EINVAL; + + caps = kmalloc(sizeof(*caps) + + sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, + GFP_KERNEL); + if (!caps) + return -ENOMEM; + + user_caps = u64_to_user_ptr(cmd->data); + if (copy_from_user(caps, user_caps, sizeof(*caps))) { + ret = -EFAULT; + goto out; + } + + if (caps->cpuid.nent < td_conf->num_cpuid_config) { + ret = -E2BIG; + goto out; + } + + ret = init_kvm_tdx_caps(td_conf, caps); + if (ret) + goto out; + + if (copy_to_user(user_caps, caps, sizeof(*caps))) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, + caps->cpuid.nent * + sizeof(caps->cpuid.entries[0]))) + ret = -EFAULT; + +out: + /* kfree() accepts NULL. */ + kfree(caps); + return ret; +} + +/* + * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is + * similar to TDX's GPAW. Use this field as the interface for userspace to + * configure the GPAW and EPT level for TDs. + * + * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level + * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always + * supported. Value 52 is only supported when the platform supports 5 level + * EPT. + */ +static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct kvm_cpuid_entry2 *entry; + int guest_pa; + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0); + if (!entry) + return -EINVAL; + + guest_pa = tdx_get_guest_phys_addr_bits(entry->eax); + + if (guest_pa != 48 && guest_pa != 52) + return -EINVAL; + + if (guest_pa == 52 && !cpu_has_vmx_ept_5levels()) + return -EINVAL; + + td_params->eptp_controls = VMX_EPTP_MT_WB; + if (guest_pa == 52) { + td_params->eptp_controls |= VMX_EPTP_PWL_5; + td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW; + } else { + td_params->eptp_controls |= VMX_EPTP_PWL_4; + } + + return 0; +} + +static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + const struct kvm_cpuid_entry2 *entry; + struct tdx_cpuid_value *value; + int i, copy_cnt = 0; + + /* + * td_params.cpuid_values: The number and the order of cpuid_value must + * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} + * It's assumed that td_params was zeroed. 
+ */ + for (i = 0; i < td_conf->num_cpuid_config; i++) { + struct kvm_cpuid_entry2 tmp; + + td_init_cpuid_entry2(&tmp, i); + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, + tmp.function, tmp.index); + if (!entry) + continue; + + if (tdx_unsupported_cpuid(entry)) + return -EINVAL; + + copy_cnt++; + + value = &td_params->cpuid_values[i]; + value->eax = entry->eax; + value->ebx = entry->ebx; + value->ecx = entry->ecx; + value->edx = entry->edx; + + /* + * TDX module does not accept nonzero bits 16..23 for the + * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls(). + */ + if (tmp.function == 0x80000008) + value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0); + } + + /* + * Rely on the TDX module to reject invalid configuration, but it can't + * check of leafs that don't have a proper slot in td_params->cpuid_values + * to stick then. So fail if there were entries that didn't get copied to + * td_params. + */ + if (copy_cnt != cpuid->nent) + return -EINVAL; + + return 0; +} + +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, + struct kvm_tdx_init_vm *init_vm) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + struct kvm_cpuid2 *cpuid = &init_vm->cpuid; + int ret; + + if (kvm->created_vcpus) + return -EBUSY; + + if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) + return -EINVAL; + + if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) + return -EINVAL; + + td_params->max_vcpus = kvm->max_vcpus; + td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; + td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; + + td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; + td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); + + ret = setup_tdparams_eptp_controls(cpuid, td_params); + if (ret) + return ret; + + ret = setup_tdparams_cpuids(cpuid, td_params); + if (ret) + return ret; + +#define MEMCPY_SAME_SIZE(dst, src) \ + do { \ + BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ + memcpy((dst), (src), sizeof(dst)); \ + } while (0) + + MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); + MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); + MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); + + return 0; +} + +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, + u64 *seamcall_err) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + cpumask_var_t packages; + struct page **tdcs_pages = NULL; + struct page *tdr_page; + int ret, i; + u64 err, rcx; + + *seamcall_err = 0; + ret = tdx_guest_keyid_alloc(); + if (ret < 0) + return ret; + kvm_tdx->hkid = ret; + kvm_tdx->misc_cg = get_current_misc_cg(); + ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + if (ret) + goto free_hkid; + + ret = -ENOMEM; + + atomic_inc(&nr_configured_hkid); + + tdr_page = alloc_page(GFP_KERNEL); + if (!tdr_page) + goto free_hkid; + + kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; + /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. 
*/ + kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; + tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), + GFP_KERNEL | __GFP_ZERO); + if (!tdcs_pages) + goto free_tdr; + + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + tdcs_pages[i] = alloc_page(GFP_KERNEL); + if (!tdcs_pages[i]) + goto free_tdcs; + } + + if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) + goto free_tdcs; + + cpus_read_lock(); + + /* + * Need at least one CPU of the package to be online in order to + * program all packages for host key id. Check it. + */ + for_each_present_cpu(i) + cpumask_set_cpu(topology_physical_package_id(i), packages); + for_each_online_cpu(i) + cpumask_clear_cpu(topology_physical_package_id(i), packages); + if (!cpumask_empty(packages)) { + ret = -EIO; + /* + * Because it's hard for human operator to figure out the + * reason, warn it. + */ +#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n" + pr_warn_ratelimited(MSG_ALLPKG); + goto free_packages; + } + + /* + * TDH.MNG.CREATE tries to grab the global TDX module and fails + * with TDX_OPERAND_BUSY when it fails to grab. Take the global + * lock to prevent it from failure. + */ + mutex_lock(&tdx_lock); + kvm_tdx->td.tdr_page = tdr_page; + err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); + mutex_unlock(&tdx_lock); + + if (err == TDX_RND_NO_ENTROPY) { + ret = -EAGAIN; + goto free_packages; + } + + if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_CREATE, err); + ret = -EIO; + goto free_packages; + } + + for_each_online_cpu(i) { + int pkg = topology_physical_package_id(i); + + if (cpumask_test_and_set_cpu(pkg, packages)) + continue; + + /* + * Program the memory controller in the package with an + * encryption key associated to a TDX private host key id + * assigned to this TDR. Concurrent operations on same memory + * controller results in TDX_OPERAND_BUSY. No locking needed + * beyond the cpus_read_lock() above as it serializes against + * hotplug and the first online CPU of the package is always + * used. We never have two CPUs in the same socket trying to + * program the key. + */ + ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, + kvm_tdx, true); + if (ret) + break; + } + cpus_read_unlock(); + free_cpumask_var(packages); + if (ret) { + i = 0; + goto teardown; + } + + kvm_tdx->td.tdcs_pages = tdcs_pages; + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); + if (err == TDX_RND_NO_ENTROPY) { + /* Here it's hard to allow userspace to retry. */ + ret = -EAGAIN; + goto teardown; + } + if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_ADDCX, err); + ret = -EIO; + goto teardown; + } + } + + err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); + if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { + /* + * Because a user gives operands, don't warn. + * Return a hint to the user because it's sometimes hard for the + * user to figure out which operand is invalid. SEAMCALL status + * code includes which operand caused invalid operand error. + */ + *seamcall_err = err; + ret = -EINVAL; + goto teardown; + } else if (WARN_ON_ONCE(err)) { + pr_tdx_error_1(TDH_MNG_INIT, err, rcx); + ret = -EIO; + goto teardown; + } + + return 0; + + /* + * The sequence for freeing resources from a partially initialized TD + * varies based on where in the initialization flow failure occurred. + * Simply use the full teardown and destroy, which naturally play nice + * with partial initialization. 
+ */ +teardown: + /* Only free pages not yet added, so start at 'i' */ + for (; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (tdcs_pages[i]) { + __free_page(tdcs_pages[i]); + tdcs_pages[i] = NULL; + } + } + if (!kvm_tdx->td.tdcs_pages) + kfree(tdcs_pages); + + tdx_mmu_release_hkid(kvm); + tdx_reclaim_td_control_pages(kvm); + + return ret; + +free_packages: + cpus_read_unlock(); + free_cpumask_var(packages); + +free_tdcs: + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (tdcs_pages[i]) + __free_page(tdcs_pages[i]); + } + kfree(tdcs_pages); + kvm_tdx->td.tdcs_pages = NULL; + +free_tdr: + if (tdr_page) + __free_page(tdr_page); + kvm_tdx->td.tdr_page = 0; + +free_hkid: + tdx_hkid_free(kvm_tdx); + + return ret; +} + +static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id, + u64 *data) +{ + u64 err; + + err = tdh_mng_rd(&tdx->td, field_id, data); + + return err; +} + +#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7) +#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7) + +static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, + bool sub_leaf_set, int *entry_index, + struct kvm_cpuid_entry2 *out) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES; + u64 ebx_eax, edx_ecx; + u64 err = 0; + + if (sub_leaf > 0b1111111) + return -EINVAL; + + if (*entry_index >= KVM_MAX_CPUID_ENTRIES) + return -EINVAL; + + if (leaf & TDX_MD_UNREADABLE_LEAF_MASK || + sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK) + return -EINVAL; + + /* + * bit 23:17, REVSERVED: reserved, must be 0; + * bit 16, LEAF_31: leaf number bit 31; + * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are + * implicitly 0; + * bit 8, SUBLEAF_NA: sub-leaf not applicable flag; + * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1, + * the SUBLEAF_6_0 is all-1. + * sub-leaf bits 31:7 are implicitly 0; + * bit 0, ELEMENT_I: Element index within field; + */ + field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16; + field_id |= (leaf & 0x7f) << 9; + if (sub_leaf_set) + field_id |= (sub_leaf & 0x7f) << 1; + else + field_id |= 0x1fe; + + err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax); + if (err) //TODO check for specific errors + goto err_out; + + out->eax = (u32) ebx_eax; + out->ebx = (u32) (ebx_eax >> 32); + + field_id++; + err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx); + /* + * It's weird that reading edx_ecx fails while reading ebx_eax + * succeeded. + */ + if (WARN_ON_ONCE(err)) + goto err_out; + + out->ecx = (u32) edx_ecx; + out->edx = (u32) (edx_ecx >> 32); + + out->function = leaf; + out->index = sub_leaf; + out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0; + + /* + * Work around missing support on old TDX modules, fetch + * guest maxpa from gfn_direct_bits. 
+ */ + if (leaf == 0x80000008) { + gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + unsigned int g_maxpa = __ffs(gpa_bits) + 1; + + out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa); + } + + (*entry_index)++; + + return 0; + +err_out: + out->eax = 0; + out->ebx = 0; + out->ecx = 0; + out->edx = 0; + + return -EIO; +} + +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_vm *init_vm; + struct td_params *td_params = NULL; + int ret; + + BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); + BUILD_BUG_ON(sizeof(struct td_params) != 1024); + + if (kvm_tdx->state != TD_STATE_UNINITIALIZED) + return -EINVAL; + + if (cmd->flags) + return -EINVAL; + + init_vm = kmalloc(sizeof(*init_vm) + + sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, + GFP_KERNEL); + if (!init_vm) + return -ENOMEM; + + if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { + ret = -EFAULT; + goto out; + } + + if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { + ret = -E2BIG; + goto out; + } + + if (copy_from_user(init_vm->cpuid.entries, + u64_to_user_ptr(cmd->data) + sizeof(*init_vm), + flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { + ret = -EFAULT; + goto out; + } + + if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { + ret = -EINVAL; + goto out; + } + + if (init_vm->cpuid.padding) { + ret = -EINVAL; + goto out; + } + + td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL); + if (!td_params) { + ret = -ENOMEM; + goto out; + } + + ret = setup_tdparams(kvm, td_params, init_vm); + if (ret) + goto out; + + ret = __tdx_td_init(kvm, td_params, &cmd->hw_error); + if (ret) + goto out; + + kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); + kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER); + kvm_tdx->attributes = td_params->attributes; + kvm_tdx->xfam = td_params->xfam; + + if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; + else + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; + + kvm_tdx->state = TD_STATE_INITIALIZED; +out: + /* kfree() accepts NULL. */ + kfree(init_vm); + kfree(td_params); + + return ret; +} + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + /* + * flush_tlb_current() is invoked when the first time for the vcpu to + * run or when root of shared EPT is invalidated. + * KVM only needs to flush shared EPT because the TDX module handles TLB + * invalidation for private EPT in tdh_vp_enter(); + * + * A single context invalidation for shared EPT can be performed here. + * However, this single context invalidation requires the private EPTP + * rather than the shared EPTP to flush shared EPT, as shared EPT uses + * private EPTP as its ASID for TLB invalidation. + * + * To avoid reading back private EPTP, perform a global invalidation for + * shared EPT instead to keep this function simple. + */ + ept_sync_global(); +} + +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * TDX has called tdx_track() in tdx_sept_remove_private_spte() to + * ensure that private EPT will be flushed on the next TD enter. No need + * to call tdx_track() here again even when this callback is a result of + * zapping private EPT. 
+ * + * Due to the lack of the context to determine which EPT has been + * affected by zapping, invoke invept() directly here for both shared + * EPT and private EPT for simplicity, though it's not necessary for + * private EPT. + */ + ept_sync_global(); +} + +static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + guard(mutex)(&kvm->slots_lock); + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + /* + * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue + * TDH.MEM.PAGE.ADD(). + */ + if (atomic64_read(&kvm_tdx->nr_premapped)) + return -EINVAL; + + cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); + if (tdx_operand_busy(cmd->hw_error)) + return -EBUSY; + if (KVM_BUG_ON(cmd->hw_error, kvm)) { + pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); + return -EIO; + } + + kvm_tdx->state = TD_STATE_RUNNABLE; + /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ + smp_wmb(); + kvm->arch.pre_fault_allowed = true; + return 0; +} + +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) +{ + struct kvm_tdx_cmd tdx_cmd; + int r; + + if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) + return -EFAULT; + + /* + * Userspace should never set hw_error. It is used to fill + * hardware-defined error by the kernel. + */ + if (tdx_cmd.hw_error) + return -EINVAL; + + mutex_lock(&kvm->lock); + + switch (tdx_cmd.id) { + case KVM_TDX_CAPABILITIES: + r = tdx_get_capabilities(&tdx_cmd); + break; + case KVM_TDX_INIT_VM: + r = tdx_td_init(kvm, &tdx_cmd); + break; + case KVM_TDX_FINALIZE_VM: + r = tdx_td_finalize(kvm, &tdx_cmd); + break; + default: + r = -EINVAL; + goto out; + } + + if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) + r = -EFAULT; + +out: + mutex_unlock(&kvm->lock); + return r; +} + +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct page *page; + int ret, i; + u64 err; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + tdx->vp.tdvpr_page = page; + + tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), + GFP_KERNEL); + if (!tdx->vp.tdcx_pages) { + ret = -ENOMEM; + goto free_tdvpr; + } + + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto free_tdcx; + } + tdx->vp.tdcx_pages[i] = page; + } + + err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); + if (KVM_BUG_ON(err, vcpu->kvm)) { + ret = -EIO; + pr_tdx_error(TDH_VP_CREATE, err); + goto free_tdcx; + } + + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_ADDCX, err); + /* + * Pages already added are reclaimed by the vcpu_free + * method, but the rest are freed here. 
+ */ + for (; i < kvm_tdx->td.tdcx_nr_pages; i++) { + __free_page(tdx->vp.tdcx_pages[i]); + tdx->vp.tdcx_pages[i] = NULL; + } + return -EIO; + } + } + + err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_INIT, err); + return -EIO; + } + + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + return 0; + +free_tdcx: + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + if (tdx->vp.tdcx_pages[i]) + __free_page(tdx->vp.tdcx_pages[i]); + tdx->vp.tdcx_pages[i] = NULL; + } + kfree(tdx->vp.tdcx_pages); + tdx->vp.tdcx_pages = NULL; + +free_tdvpr: + if (tdx->vp.tdvpr_page) + __free_page(tdx->vp.tdvpr_page); + tdx->vp.tdvpr_page = 0; + + return ret; +} + +/* Sometimes reads multiple subleafs. Return how many entries were written. */ +static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index, + struct kvm_cpuid_entry2 *output_e) +{ + int sub_leaf = 0; + int ret; + + /* First try without a subleaf */ + ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e); + + /* On success, or for an invalid leaf, just give up */ + if (ret != -EIO) + return ret; + + /* + * If the try without a subleaf failed, try reading subleafs until + * failure. The TDX module only supports 6 bits of subleaf index. + */ + while (1) { + /* Keep reading subleafs until there is a failure. */ + if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e)) + return !sub_leaf; + + sub_leaf++; + output_e++; + } + + return 0; +} + +static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + struct kvm_cpuid2 __user *output, *td_cpuid; + int r = 0, i = 0, leaf; + u32 level; + + output = u64_to_user_ptr(cmd->data); + td_cpuid = kzalloc(sizeof(*td_cpuid) + + sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES, + GFP_KERNEL); + if (!td_cpuid) + return -ENOMEM; + + if (copy_from_user(td_cpuid, output, sizeof(*output))) { + r = -EFAULT; + goto out; + } + + /* Read max CPUID for normal range */ + if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) { + r = -EIO; + goto out; + } + level = td_cpuid->entries[0].eax; + + for (leaf = 1; leaf <= level; leaf++) + tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); + + /* Read max CPUID for extended range */ + if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) { + r = -EIO; + goto out; + } + level = td_cpuid->entries[i - 1].eax; + + for (leaf = 0x80000001; leaf <= level; leaf++) + tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); + + if (td_cpuid->nent < i) + r = -E2BIG; + td_cpuid->nent = i; + + if (copy_to_user(output, td_cpuid, sizeof(*output))) { + r = -EFAULT; + goto out; + } + + if (r == -E2BIG) + goto out; + + if (copy_to_user(output->entries, td_cpuid->entries, + td_cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + r = -EFAULT; + +out: + kfree(td_cpuid); + + return r; +} + +static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + u64 apic_base; + struct vcpu_tdx *tdx = to_tdx(vcpu); + int ret; + + if (cmd->flags) + return -EINVAL; + + if (tdx->state != VCPU_TD_STATE_UNINITIALIZED) + return -EINVAL; + + /* + * TDX requires X2APIC; userspace is responsible for configuring guest + * CPUID accordingly. + */ + apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | + (kvm_vcpu_is_reset_bsp(vcpu) ?
MSR_IA32_APICBASE_BSP : 0); + if (kvm_apic_set_base(vcpu, apic_base, true)) + return -EINVAL; + + ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); + if (ret) + return ret; + + td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); + td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); + td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); + + tdx->state = VCPU_TD_STATE_INITIALIZED; + + return 0; +} + +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + /* + * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all + * INIT events. + * + * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as + * userspace needs to define the vCPU model before KVM can initialize + * vCPU state, e.g. to enable x2APIC. + */ + WARN_ON_ONCE(init_event); +} + +struct tdx_gmem_post_populate_arg { + struct kvm_vcpu *vcpu; + __u32 flags; +}; + +static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *_arg) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct tdx_gmem_post_populate_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + gpa_t gpa = gfn_to_gpa(gfn); + u8 level = PG_LEVEL_4K; + struct page *src_page; + int ret, i; + u64 err, entry, level_state; + + /* + * Get the source page if it has been faulted in. Return failure if the + * source page has been swapped out or unmapped in primary memory. + */ + ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); + if (ret < 0) + return ret; + if (ret != 1) + return -ENOMEM; + + ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); + if (ret < 0) + goto out; + + /* + * The private mem cannot be zapped after kvm_tdp_map_page() + * because all paths are covered by slots_lock and the + * filemap invalidate lock. Check that they are indeed enough. + */ + if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { + scoped_guard(read_lock, &kvm->mmu_lock) { + if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { + ret = -EIO; + goto out; + } + } + } + + ret = 0; + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), + src_page, &entry, &level_state); + if (err) { + ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; + goto out; + } + + if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) + atomic64_dec(&kvm_tdx->nr_premapped); + + if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { + err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, + &level_state); + if (err) { + ret = -EIO; + break; + } + } + } + +out: + put_page(src_page); + return ret; +} + +static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct kvm *kvm = vcpu->kvm; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_mem_region region; + struct tdx_gmem_post_populate_arg arg; + long gmem_ret; + int ret; + + if (tdx->state != VCPU_TD_STATE_INITIALIZED) + return -EINVAL; + + guard(mutex)(&kvm->slots_lock); + + /* Once TD is finalized, the initial guest memory is fixed. 
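+ * Reject KVM_TDX_INIT_MEM_REGION once the TD has reached TD_STATE_RUNNABLE.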
*/ + if (kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) + return -EINVAL; + + if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region))) + return -EFAULT; + + if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) || + !region.nr_pages || + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || + !vt_is_tdx_private_gpa(kvm, region.gpa) || + !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) + return -EINVAL; + + kvm_mmu_reload(vcpu); + ret = 0; + while (region.nr_pages) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + arg = (struct tdx_gmem_post_populate_arg) { + .vcpu = vcpu, + .flags = cmd->flags, + }; + gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), + u64_to_user_ptr(region.source_addr), + 1, tdx_gmem_post_populate, &arg); + if (gmem_ret < 0) { + ret = gmem_ret; + break; + } + + if (gmem_ret != 1) { + ret = -EIO; + break; + } + + region.source_addr += PAGE_SIZE; + region.gpa += PAGE_SIZE; + region.nr_pages--; + + cond_resched(); + } + + if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region))) + ret = -EFAULT; + return ret; +} + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm_tdx_cmd cmd; + int ret; + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + if (copy_from_user(&cmd, argp, sizeof(cmd))) + return -EFAULT; + + if (cmd.hw_error) + return -EINVAL; + + switch (cmd.id) { + case KVM_TDX_INIT_VCPU: + ret = tdx_vcpu_init(vcpu, &cmd); + break; + case KVM_TDX_INIT_MEM_REGION: + ret = tdx_vcpu_init_mem_region(vcpu, &cmd); + break; + case KVM_TDX_GET_CPUID: + ret = tdx_vcpu_get_cpuid(vcpu, &cmd); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return PG_LEVEL_4K; +} + +static int tdx_online_cpu(unsigned int cpu) +{ + unsigned long flags; + int r; + + /* Sanity check CPU is already in post-VMXON */ + WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); + + local_irq_save(flags); + r = tdx_cpu_enable(); + local_irq_restore(flags); + + return r; +} + +static int tdx_offline_cpu(unsigned int cpu) +{ + int i; + + /* No TD is running. Allow any cpu to be offline. */ + if (!atomic_read(&nr_configured_hkid)) + return 0; + + /* + * Reclaiming a TDX HKID (i.e. when deleting a guest TD) requires + * calling TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory + * controllers via PCONFIG. If there are active TDX HKIDs, refuse to + * offline the last online cpu of a package. + */ + for_each_online_cpu(i) { + /* + * Another online cpu exists on the same package, so the + * offline is allowed. + */ + if (i != cpu && topology_physical_package_id(i) == + topology_physical_package_id(cpu)) + return 0; + } + + /* + * This is the last online cpu of this package. Don't offline it. + * + * Warn, because it is hard for a human operator to understand + * why the offline fails. + */ +#define MSG_ALLPKG_ONLINE \ + "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" + pr_warn_ratelimited(MSG_ALLPKG_ONLINE); + return -EBUSY; +} + +static void __do_tdx_cleanup(void) +{ + /* + * Once the TDX module is initialized, it cannot be disabled and + * re-initialized again without a runtime update (which isn't + * supported by the kernel). Only the cpuhp state needs to be removed here.
+ * The TDX host core code tracks TDX status and can handle + * 'multiple enabling' scenario. + */ + WARN_ON_ONCE(!tdx_cpuhp_state); + cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); + tdx_cpuhp_state = 0; +} + +static void __tdx_cleanup(void) +{ + cpus_read_lock(); + __do_tdx_cleanup(); + cpus_read_unlock(); +} + +static int __init __do_tdx_bringup(void) +{ + int r; + + /* + * TDX-specific cpuhp callback to call tdx_cpu_enable() on all + * online CPUs before calling tdx_enable(), and on any new + * going-online CPU to make sure it is ready for TDX guest. + */ + r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "kvm/cpu/tdx:online", + tdx_online_cpu, tdx_offline_cpu); + if (r < 0) + return r; + + tdx_cpuhp_state = r; + + r = tdx_enable(); + if (r) + __do_tdx_cleanup(); + + return r; +} + +static int __init __tdx_bringup(void) +{ + const struct tdx_sys_info_td_conf *td_conf; + int r, i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { + /* + * Check if MSRs (tdx_uret_msrs) can be saved/restored + * before returning to user space. + * + * this_cpu_ptr(user_return_msrs)->registered isn't checked + * because the registration is done at vcpu runtime by + * tdx_user_return_msr_update_cache(). + */ + tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); + if (tdx_uret_msrs[i].slot == -1) { + /* If any MSR isn't supported, it is a KVM bug */ + pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", + tdx_uret_msrs[i].msr); + return -EIO; + } + } + + /* + * Enabling TDX requires enabling hardware virtualization first, + * as making SEAMCALLs requires CPU being in post-VMXON state. + */ + r = kvm_enable_virtualization(); + if (r) + return r; + + cpus_read_lock(); + r = __do_tdx_bringup(); + cpus_read_unlock(); + + if (r) + goto tdx_bringup_err; + + /* Get TDX global information for later use */ + tdx_sysinfo = tdx_get_sysinfo(); + if (WARN_ON_ONCE(!tdx_sysinfo)) { + r = -EINVAL; + goto get_sysinfo_err; + } + + /* Check TDX module and KVM capabilities */ + if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || + !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) + goto get_sysinfo_err; + + if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) + goto get_sysinfo_err; + + /* + * TDX has its own limit of maximum vCPUs it can support for all + * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to + * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU + * extension on per-VM basis. + * + * TDX module reports such limit via the MAX_VCPU_PER_TD global + * metadata. Different modules may report different values. + * Some old module may also not support this metadata (in which + * case this limit is U16_MAX). + * + * In practice, the reported value reflects the maximum logical + * CPUs that ALL the platforms that the module supports can + * possibly have. + * + * Simply forwarding the MAX_VCPU_PER_TD to userspace could + * result in an unpredictable ABI. KVM instead always advertise + * the number of logical CPUs the platform has as the maximum + * vCPUs for TDX guests. + * + * Make sure MAX_VCPU_PER_TD reported by TDX module is not + * smaller than the number of logical CPUs, otherwise KVM will + * report an unsupported value to userspace. + * + * Note, a platform with TDX enabled in the BIOS cannot support + * physical CPU hotplug, and TDX requires the BIOS has marked + * all logical CPUs in MADT table as enabled. Just use + * num_present_cpus() for the number of logical CPUs. 
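+ * For example, on a platform with 128 logical CPUs present (an illustrative count), KVM advertises at most 128 vCPUs per TDX guest even if MAX_VCPU_PER_TD is larger.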
+ */ + td_conf = &tdx_sysinfo->td_conf; + if (td_conf->max_vcpus_per_td < num_present_cpus()) { + pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", + td_conf->max_vcpus_per_td, num_present_cpus()); + r = -EINVAL; + goto get_sysinfo_err; + } + + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { + r = -EINVAL; + goto get_sysinfo_err; + } + + /* + * Leave hardware virtualization enabled after TDX is enabled + * successfully. TDX CPU hotplug depends on this. + */ + return 0; + +get_sysinfo_err: + __tdx_cleanup(); +tdx_bringup_err: + kvm_disable_virtualization(); + return r; +} + +void tdx_cleanup(void) +{ + if (enable_tdx) { + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); + __tdx_cleanup(); + kvm_disable_virtualization(); + } +} + +int __init tdx_bringup(void) +{ + int r, i; + + /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); + + if (!enable_tdx) + return 0; + + if (!enable_ept) { + pr_err("EPT is required for TDX\n"); + goto success_disable_tdx; + } + + if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { + pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); + goto success_disable_tdx; + } + + if (!enable_apicv) { + pr_err("APICv is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + pr_err("tdx: OSXSAVE is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { + pr_err("tdx: MOVDIR64B is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { + pr_err("Self-snoop is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("tdx: no TDX private KeyIDs available\n"); + goto success_disable_tdx; + } + + if (!enable_virt_at_load) { + pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); + goto success_disable_tdx; + } + + /* + * Ideally KVM should probe whether TDX module has been loaded + * first and then try to bring it up. But TDX needs to use SEAMCALL + * to probe whether the module is loaded (there is no CPUID or MSR + * for that), and making SEAMCALL requires enabling virtualization + * first, just like the rest steps of bringing up TDX module. + * + * So, for simplicity do everything in __tdx_bringup(); the first + * SEAMCALL will return -ENODEV when the module is not loaded. The + * only complication is having to make sure that initialization + * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other + * cases. + */ + r = __tdx_bringup(); + if (r) { + /* + * Disable TDX only but don't fail to load module if + * the TDX module could not be loaded. No need to print + * message saying "module is not loaded" because it was + * printed when the first SEAMCALL failed. 
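+ * In that case __tdx_bringup() has returned -ENODEV, which is handled just below.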
+ */ + if (r == -ENODEV) + goto success_disable_tdx; + + enable_tdx = 0; + } + + return r; + +success_disable_tdx: + enable_tdx = 0; + return 0; +} diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h new file mode 100644 index 000000000000..51f98443e8a2 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_TDX_H +#define __KVM_X86_VMX_TDX_H + +#include "tdx_arch.h" +#include "tdx_errno.h" + +#ifdef CONFIG_KVM_INTEL_TDX +#include "common.h" + +int tdx_bringup(void); +void tdx_cleanup(void); + +extern bool enable_tdx; + +/* TDX module hardware states. These follow the TDX module OP_STATEs. */ +enum kvm_tdx_state { + TD_STATE_UNINITIALIZED = 0, + TD_STATE_INITIALIZED, + TD_STATE_RUNNABLE, +}; + +struct kvm_tdx { + struct kvm kvm; + + struct misc_cg *misc_cg; + int hkid; + enum kvm_tdx_state state; + + u64 attributes; + u64 xfam; + + u64 tsc_offset; + u64 tsc_multiplier; + + struct tdx_td td; + + /* For KVM_TDX_INIT_MEM_REGION. */ + atomic64_t nr_premapped; + + /* + * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do + * not contend with tdh_vp_enter() and TDCALLs. + * Set/unset is protected with kvm->mmu_lock. + */ + bool wait_for_sept_zap; +}; + +/* TDX module vCPU states */ +enum vcpu_tdx_state { + VCPU_TD_STATE_UNINITIALIZED = 0, + VCPU_TD_STATE_INITIALIZED, +}; + +struct vcpu_tdx { + struct kvm_vcpu vcpu; + struct vcpu_vt vt; + u64 ext_exit_qualification; + gpa_t exit_gpa; + struct tdx_module_args vp_enter_args; + + struct tdx_vp vp; + + struct list_head cpu_list; + + u64 vp_enter_ret; + + enum vcpu_tdx_state state; + bool guest_entered; + + u64 map_gpa_next; + u64 map_gpa_end; +}; + +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err); +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err); + +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field) +{ + u64 err, data; + + err = tdh_mng_rd(&kvm_tdx->td, TDCS_EXEC(field), &data); + if (unlikely(err)) { + pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err); + return 0; + } + return data; +} + +static __always_inline void tdvps_vmcs_check(u32 field, u8 bits) +{ +#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL +#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL +#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL +#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK) + + /* TDX is 64bit only. HIGH field isn't supported. */ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && + VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH, + "Read/Write to TD VMCS *_HIGH fields not supported"); + + BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64); + +#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13) +#define VMCS_ENC_WIDTH_16BIT (0UL << 13) +#define VMCS_ENC_WIDTH_64BIT (1UL << 13) +#define VMCS_ENC_WIDTH_32BIT (2UL << 13) +#define VMCS_ENC_WIDTH_NATURAL (3UL << 13) +#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK) + + /* TDX is 64bit only. i.e. natural width = 64bit. 
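+ * Natural-width fields must therefore be accessed through the 64-bit accessors, which the build-time checks below enforce.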
*/ + BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) && + (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT || + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL), + "Invalid TD VMCS access for 64-bit field"); + BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT, + "Invalid TD VMCS access for 32-bit field"); + BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT, + "Invalid TD VMCS access for 16-bit field"); +} + +static __always_inline void tdvps_management_check(u64 field, u8 bits) {} +static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {} + +#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \ +static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \ + u32 field) \ +{ \ + u64 err, data; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \ + if (unlikely(err)) { \ + tdh_vp_rd_failed(tdx, #uclass, field, err); \ + return 0; \ + } \ + return (u##bits)data; \ +} \ +static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \ + u32 field, u##bits val) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \ + GENMASK_ULL(bits - 1, 0)); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \ +} \ +static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \ +} \ +static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\ +} + + +bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu); +int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err); + +TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs); + +TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management); +TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch); + +#else +static inline int tdx_bringup(void) { return 0; } +static inline void tdx_cleanup(void) {} + +#define enable_tdx 0 + +struct kvm_tdx { + struct kvm kvm; +}; + +struct vcpu_tdx { + struct kvm_vcpu vcpu; +}; + +static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; } +static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; } + +#endif + +#endif diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h new file mode 100644 index 000000000000..a30e880849e3 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx_arch.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* architectural constants/data definitions for TDX SEAMCALLs */ + +#ifndef __KVM_X86_TDX_ARCH_H +#define __KVM_X86_TDX_ARCH_H + +#include <linux/types.h> + +/* TDX control structure (TDR/TDCS/TDVPS) field access codes */ +#define TDX_NON_ARCH BIT_ULL(63) +#define TDX_CLASS_SHIFT 56 +#define TDX_FIELD_MASK GENMASK_ULL(31, 0) + +#define __BUILD_TDX_FIELD(non_arch, class, field) \ + (((non_arch) ? 
TDX_NON_ARCH : 0) | \ + ((u64)(class) << TDX_CLASS_SHIFT) | \ + ((u64)(field) & TDX_FIELD_MASK)) + +#define BUILD_TDX_FIELD(class, field) \ + __BUILD_TDX_FIELD(false, (class), (field)) + +#define BUILD_TDX_FIELD_NON_ARCH(class, field) \ + __BUILD_TDX_FIELD(true, (class), (field)) + + +/* Class code for TD */ +#define TD_CLASS_EXECUTION_CONTROLS 17ULL + +/* Class code for TDVPS */ +#define TDVPS_CLASS_VMCS 0ULL +#define TDVPS_CLASS_GUEST_GPR 16ULL +#define TDVPS_CLASS_OTHER_GUEST 17ULL +#define TDVPS_CLASS_MANAGEMENT 32ULL + +enum tdx_tdcs_execution_control { + TD_TDCS_EXEC_TSC_OFFSET = 10, + TD_TDCS_EXEC_TSC_MULTIPLIER = 11, +}; + +enum tdx_vcpu_guest_other_state { + TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100, +}; + +#define TDX_VCPU_STATE_DETAILS_INTR_PENDING BIT_ULL(0) + +static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details) +{ + return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING); +} + +/* @field is any of enum tdx_tdcs_execution_control */ +#define TDCS_EXEC(field) BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field)) + +/* @field is the VMCS field encoding */ +#define TDVPS_VMCS(field) BUILD_TDX_FIELD(TDVPS_CLASS_VMCS, (field)) + +/* @field is any of enum tdx_guest_other_state */ +#define TDVPS_STATE(field) BUILD_TDX_FIELD(TDVPS_CLASS_OTHER_GUEST, (field)) +#define TDVPS_STATE_NON_ARCH(field) BUILD_TDX_FIELD_NON_ARCH(TDVPS_CLASS_OTHER_GUEST, (field)) + +/* Management class fields */ +enum tdx_vcpu_guest_management { + TD_VCPU_PEND_NMI = 11, +}; + +/* @field is any of enum tdx_vcpu_guest_management */ +#define TDVPS_MANAGEMENT(field) BUILD_TDX_FIELD(TDVPS_CLASS_MANAGEMENT, (field)) + +#define TDX_EXTENDMR_CHUNKSIZE 256 + +struct tdx_cpuid_value { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +} __packed; + +#define TDX_TD_ATTR_DEBUG BIT_ULL(0) +#define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28) +#define TDX_TD_ATTR_PKS BIT_ULL(30) +#define TDX_TD_ATTR_KL BIT_ULL(31) +#define TDX_TD_ATTR_PERFMON BIT_ULL(63) + +#define TDX_EXT_EXIT_QUAL_TYPE_MASK GENMASK(3, 0) +#define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION 6 +/* + * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B. + */ +struct td_params { + u64 attributes; + u64 xfam; + u16 max_vcpus; + u8 reserved0[6]; + + u64 eptp_controls; + u64 config_flags; + u16 tsc_frequency; + u8 reserved1[38]; + + u64 mrconfigid[6]; + u64 mrowner[6]; + u64 mrownerconfig[6]; + u64 reserved2[4]; + + union { + DECLARE_FLEX_ARRAY(struct tdx_cpuid_value, cpuid_values); + u8 reserved3[768]; + }; +} __packed __aligned(1024); + +/* + * Guest uses MAX_PA for GPAW when set. + * 0: GPA.SHARED bit is GPA[47] + * 1: GPA.SHARED bit is GPA[51] + */ +#define TDX_CONFIG_FLAGS_MAX_GPAW BIT_ULL(0) + +/* + * TDH.VP.ENTER, TDG.VP.VMCALL preserves RBP + * 0: RBP can be used for TDG.VP.VMCALL input. RBP is clobbered. + * 1: RBP can't be used for TDG.VP.VMCALL input. RBP is preserved. + */ +#define TDX_CONFIG_FLAGS_NO_RBP_MOD BIT_ULL(2) + + +/* + * TDX requires the frequency to be defined in units of 25MHz, which is the + * frequency of the core crystal clock on TDX-capable platforms, i.e. the TDX + * module can only program frequencies that are multiples of 25MHz. The + * frequency must be between 100mhz and 10ghz (inclusive). 
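+ * For example, a 2.5 GHz TSC (2500000 kHz) is encoded as TDX_TSC_KHZ_TO_25MHZ(2500000) = 100 units of 25 MHz.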
+ */ +#define TDX_TSC_KHZ_TO_25MHZ(tsc_in_khz) ((tsc_in_khz) / (25 * 1000)) +#define TDX_TSC_25MHZ_TO_KHZ(tsc_in_25mhz) ((tsc_in_25mhz) * (25 * 1000)) +#define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) +#define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) + +/* Additional Secure EPT entry information */ +#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0) +#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8) +#define TDX_SEPT_STATE_SHIFT 8 + +enum tdx_sept_entry_state { + TDX_SEPT_FREE = 0, + TDX_SEPT_BLOCKED = 1, + TDX_SEPT_PENDING = 2, + TDX_SEPT_PENDING_BLOCKED = 3, + TDX_SEPT_PRESENT = 4, +}; + +static inline u8 tdx_get_sept_level(u64 sept_entry_info) +{ + return sept_entry_info & TDX_SEPT_LEVEL_MASK; +} + +static inline u8 tdx_get_sept_state(u64 sept_entry_info) +{ + return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT; +} + +#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20) + +/* + * TD scope metadata field ID. + */ +#define TD_MD_FIELD_ID_CPUID_VALUES 0x9410000300000000ULL + +#endif /* __KVM_X86_TDX_ARCH_H */ diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h new file mode 100644 index 000000000000..6ff4672c4181 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx_errno.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* architectural status code for SEAMCALL */ + +#ifndef __KVM_X86_TDX_ERRNO_H +#define __KVM_X86_TDX_ERRNO_H + +#define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL + +/* + * TDX SEAMCALL Status Codes (returned in RAX) + */ +#define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL +#define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL +#define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL +#define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL +#define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL +#define TDX_OPERAND_INVALID 0xC000010000000000ULL +#define TDX_OPERAND_BUSY 0x8000020000000000ULL +#define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL +#define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL +#define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL +#define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL +#define TDX_KEY_STATE_INCORRECT 0xC000081100000000ULL +#define TDX_KEY_CONFIGURED 0x0000081500000000ULL +#define TDX_NO_HKID_READY_TO_WBCACHE 0x0000082100000000ULL +#define TDX_FLUSHVP_NOT_DONE 0x8000082400000000ULL +#define TDX_EPT_WALK_FAILED 0xC0000B0000000000ULL +#define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL +#define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL + +/* + * TDX module operand ID, appears in 31:0 part of error code as + * detail information + */ +#define TDX_OPERAND_ID_RCX 0x01 +#define TDX_OPERAND_ID_TDR 0x80 +#define TDX_OPERAND_ID_SEPT 0x92 +#define TDX_OPERAND_ID_TD_EPOCH 0xa9 + +#endif /* __KVM_X86_TDX_ERRNO_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 157c23db22be..b12414108cbf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -54,6 +54,7 @@ #include <trace/events/ipi.h> #include "capabilities.h" +#include "common.h" #include "cpuid.h" #include "hyperv.h" #include "kvm_onhyperv.h" @@ -1283,6 +1284,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); @@ -1311,7 +1313,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if 
(vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (vmx->guest_state_loaded) + if (vt->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1332,12 +1334,12 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - vmx->msr_host_kernel_gs_base = current->thread.gsbase; + vt->msr_host_kernel_gs_base = current->thread.gsbase; } else { savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -1349,14 +1351,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - vmx->guest_state_loaded = true; + vt->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; - if (!vmx->guest_state_loaded) + if (!vmx->vt.guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1384,10 +1386,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 - wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); - vmx->guest_state_loaded = false; + vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } @@ -1395,7 +1397,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); - if (vmx->guest_state_loaded) + if (vmx->vt.guest_state_loaded) rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; @@ -1404,7 +1406,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); - if (vmx->guest_state_loaded) + if (vmx->vt.guest_state_loaded) wrmsrq(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; @@ -1581,7 +1583,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } bool vmx_get_if_flag(struct kvm_vcpu *vcpu) @@ -1701,7 +1703,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). 
*/ - if (to_vmx(vcpu)->exit_reason.enclave_mode) { + if (vmx_get_exit_reason(vcpu).enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; } @@ -1716,7 +1718,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { - union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); unsigned long rip, orig_rip; u32 instr_len; @@ -1863,7 +1865,7 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu) return; } - WARN_ON_ONCE(vmx->emulation_required); + WARN_ON_ONCE(vmx->vt.emulation_required); if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, @@ -3406,7 +3408,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_ept_level(void) @@ -3669,7 +3671,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); } void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -4197,50 +4199,6 @@ void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - int pi_vec) -{ -#ifdef CONFIG_SMP - if (vcpu->mode == IN_GUEST_MODE) { - /* - * The vector of the virtual has already been set in the PIR. - * Send a notification event to deliver the virtual interrupt - * unless the vCPU is the currently running vCPU, i.e. the - * event is being sent from a fastpath VM-Exit handler, in - * which case the PIR will be synced to the vIRR before - * re-entering the guest. - * - * When the target is not the running vCPU, the following - * possibilities emerge: - * - * Case 1: vCPU stays in non-root mode. Sending a notification - * event posts the interrupt to the vCPU. - * - * Case 2: vCPU exits to root mode and is still runnable. The - * PIR will be synced to the vIRR before re-entering the guest. - * Sending a notification event is ok as the host IRQ handler - * will ignore the spurious event. - * - * Case 3: vCPU exits to root mode and is blocked. vcpu_block() - * has already synced PIR to vIRR and never blocks the vCPU if - * the vIRR is not empty. Therefore, a blocked vCPU here does - * not wait for any requested interrupts in PIR, and sending a - * notification event also results in a benign, spurious event. - */ - - if (vcpu != kvm_get_running_vcpu()) - __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return; - } -#endif - /* - * The vCPU isn't in the guest; wake the vCPU in case it is blocking, - * otherwise do nothing as KVM will grab the highest priority pending - * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). 
- */ - kvm_vcpu_wake_up(vcpu); -} - static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { @@ -4289,7 +4247,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); @@ -4300,20 +4258,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!vcpu->arch.apic->apicv_active) return -1; - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return 0; - - /* If a previous notification has sent the IPI, nothing to do. */ - if (pi_test_and_set_on(&vmx->pi_desc)) - return 0; - - /* - * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() - * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is - * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a - * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. - */ - kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); + __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); return 0; } @@ -4780,7 +4725,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); } if (vmx_can_use_ipiv(&vmx->vcpu)) { @@ -4893,8 +4838,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR * or POSTED_INTR_WAKEUP_VECTOR. */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - __pi_set_sn(&vmx->pi_desc); + vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&vmx->vt.pi_desc); } void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -5811,11 +5756,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); gpa_t gpa; - u64 error_code; - - exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5831,23 +5773,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(vcpu, gpa, exit_qualification); - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) - ? PFERR_PRESENT_MASK : 0; - - if (error_code & EPT_VIOLATION_GVA_IS_VALID) - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. 
We have to emulate the instruction here, because @@ -5859,7 +5784,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); + return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) @@ -5904,7 +5829,7 @@ static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->emulation_required) + if (!vmx->vt.emulation_required) return false; /* @@ -5936,7 +5861,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; - while (vmx->emulation_required && count-- != 0) { + while (vmx->vt.emulation_required && count-- != 0) { if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -6131,7 +6056,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit(). */ - to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + to_vt(vcpu)->exit_reason.bus_lock_detected = true; return 1; } @@ -6229,9 +6154,9 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); - *reason = vmx->exit_reason.full; + *reason = vmx->vt.exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); - if (!(vmx->exit_reason.failed_vmentry)) { + if (!(vmx->vt.exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) @@ -6527,7 +6452,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; u16 exit_handler_index; @@ -6583,7 +6508,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * the least awful solution for the userspace case without * risking false positives. */ - if (vmx->emulation_required) { + if (vmx->vt.emulation_required) { nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); return 1; } @@ -6593,7 +6518,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } /* If guest state is invalid, start emulating. L2 is handled above. */ - if (vmx->emulation_required) + if (vmx->vt.emulation_required) return handle_invalid_guest_state(vcpu); if (exit_reason.failed_vmentry) { @@ -6693,7 +6618,7 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * Exit to user space when bus lock detected to inform that there is * a bus lock in guest. 
*/ - if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (vmx_get_exit_reason(vcpu).bus_lock_detected) { if (ret > 0) vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; @@ -6972,22 +6897,22 @@ static void vmx_set_rvi(int vector) int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; - if (pi_test_on(&vmx->pi_desc)) { - pi_clear_on(&vmx->pi_desc); + if (pi_test_on(&vt->pi_desc)) { + pi_clear_on(&vt->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); got_posted_interrupt = - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); got_posted_interrupt = false; @@ -7027,14 +6952,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - pi_clear_on(&vmx->pi_desc); - memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); -} - void vmx_do_interrupt_irqoff(unsigned long entry); void vmx_do_nmi_irqoff(void); @@ -7091,14 +7008,12 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->emulation_required) + if (to_vt(vcpu)->emulation_required) return; - if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); - else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } @@ -7333,10 +7248,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, * the fastpath even, all other exits must use the slow path. 
*/ if (is_guest_mode(vcpu) && - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) return EXIT_FASTPATH_NONE; - switch (to_vmx(vcpu)->exit_reason.basic) { + switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: @@ -7348,6 +7263,20 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, } } +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) +{ + if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI || + !is_nmi(vmx_get_intr_info(vcpu))) + return; + + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); +} + static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, unsigned int flags) { @@ -7387,23 +7316,15 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + vmx->vt.exit_reason.full = 0xdead; goto out; } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); - if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && - is_nmi(vmx_get_intr_info(vcpu))) { - kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - if (cpu_feature_enabled(X86_FEATURE_FRED)) - fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); - else - vmx_do_nmi_irqoff(); - kvm_after_interrupt(vcpu); - } + vmx_handle_nmi(vcpu); out: guest_state_exit_irqoff(); @@ -7424,15 +7345,15 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * start emulation until we arrive back to a valid state. Synthesize a * consistency check VM-Exit due to invalid guest state and bail. */ - if (unlikely(vmx->emulation_required)) { + if (unlikely(vmx->vt.emulation_required)) { vmx->fail = 0; - vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; - vmx->exit_reason.failed_vmentry = 1; + vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->vt.exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); - vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); - vmx->exit_intr_info = 0; + vmx->vt.exit_intr_info = 0; return EXIT_FASTPATH_NONE; } @@ -7535,7 +7456,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * checking. 
*/ if (vmx->nested.nested_run_pending && - !vmx->exit_reason.failed_vmentry) + !vmx_get_exit_reason(vcpu).failed_vmentry) ++vcpu->stat.nested_run; vmx->nested.nested_run_pending = 0; @@ -7544,12 +7465,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) + if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); trace_kvm_exit(vcpu, KVM_ISA_VMX); - if (unlikely(vmx->exit_reason.failed_vmentry)) + if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; @@ -7581,7 +7502,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - INIT_LIST_HEAD(&vmx->pi_wakeup_list); + INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); err = -ENOMEM; @@ -7679,7 +7600,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], - __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); return 0; @@ -7724,9 +7645,23 @@ int vmx_vm_init(struct kvm *kvm) break; } } + + if (enable_pml) + kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0; } +static inline bool vmx_ignore_guest_pat(struct kvm *kvm) +{ + /* + * Non-coherent DMA devices need the guest to flush CPU properly. + * In that case it is not possible to map all guest RAM as WB, so + * always trust guest PAT. + */ + return !kvm_arch_has_noncoherent_dma(kvm) && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); +} + u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { /* @@ -7736,13 +7671,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - /* - * Force WB and ignore guest PAT if the VM does NOT have a non-coherent - * device attached. Letting the guest control memory types on Intel - * CPUs may result in unexpected behavior, and so KVM's ABI is to trust - * the guest to behave only as a last resort. - */ - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + /* Force WB if ignoring guest PAT */ + if (vmx_ignore_guest_pat(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); @@ -8604,6 +8534,8 @@ __init int vmx_hardware_setup(void) if (enable_ept) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); + else + vt_x86_ops.get_mt_mask = NULL; /* * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID @@ -8621,9 +8553,6 @@ __init int vmx_hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) - vt_x86_ops.cpu_dirty_log_size = 0; - if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; @@ -8681,6 +8610,27 @@ __init int vmx_hardware_setup(void) kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + /* + * On Intel CPUs that lack self-snoop feature, letting the guest control + * memory types may result in unexpected behavior. So always ignore guest + * PAT on those CPUs and map VM as writeback, not allowing userspace to + * disable the quirk. + * + * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is + * supported, UC is slow enough to cause issues with some older guests (e.g. 
+ * an old version of bochs driver uses ioremap() instead of ioremap_wc() to + * map the video RAM, causing wayland desktop to fail to get started + * correctly). To avoid breaking those older guests that rely on KVM to force + * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the + * safer (for performance) default behavior. + * + * On top of this, non-coherent DMA devices need the guest to flush CPU + * caches properly. This also requires honoring guest PAT, and is forced + * independent of the quirk in vmx_ignore_guest_pat(). + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return r; } @@ -8694,23 +8644,16 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void __vmx_exit(void) +void vmx_exit(void) { allow_smaller_maxphyaddr = false; vmx_cleanup_l1d_flush(); -} -static void __exit vmx_exit(void) -{ - kvm_exit(); - __vmx_exit(); kvm_x86_vendor_exit(); - } -module_exit(vmx_exit); -static int __init vmx_init(void) +int __init vmx_init(void) { int r, cpu; @@ -8754,21 +8697,9 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; - /* - * Common KVM initialization _must_ come last, after this, /dev/kvm is - * exposed to userspace! - */ - r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), - THIS_MODULE); - if (r) - goto err_kvm_init; - return 0; -err_kvm_init: - __vmx_exit(); err_l1d_flush: kvm_x86_vendor_exit(); return r; } -module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 951e44dc9d0e..6d1e40ecc024 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -11,11 +11,13 @@ #include "capabilities.h" #include "../kvm_cache_regs.h" +#include "pmu_intel.h" #include "vmcs.h" #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" #include "../mmu.h" +#include "common.h" #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) @@ -67,47 +69,6 @@ struct pt_desc { struct pt_ctx guest; }; -union vmx_exit_reason { - struct { - u32 basic : 16; - u32 reserved16 : 1; - u32 reserved17 : 1; - u32 reserved18 : 1; - u32 reserved19 : 1; - u32 reserved20 : 1; - u32 reserved21 : 1; - u32 reserved22 : 1; - u32 reserved23 : 1; - u32 reserved24 : 1; - u32 reserved25 : 1; - u32 bus_lock_detected : 1; - u32 enclave_mode : 1; - u32 smi_pending_mtf : 1; - u32 smi_from_vmx_root : 1; - u32 reserved30 : 1; - u32 failed_vmentry : 1; - }; - u32 full; -}; - -struct lbr_desc { - /* Basic info about guest LBR records. */ - struct x86_pmu_lbr records; - - /* - * Emulate LBR feature via passthrough LBR registers when the - * per-vcpu guest LBR event is scheduled on the current pcpu. - * - * The records may be inaccurate if the host reclaims the LBR. - */ - struct perf_event *event; - - /* True if LBRs are marked as not intercepted in the MSR bitmap */ - bool msr_passthrough; -}; - -extern struct x86_pmu_lbr vmx_lbr_caps; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -248,20 +209,10 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; + struct vcpu_vt vt; u8 fail; u8 x2apic_msr_bitmap_mode; - /* - * If true, host state has been stored in vmx->loaded_vmcs for - * the CPU registers that only need to be switched when transitioning - * to/from the kernel, and the registers have been loaded with guest - * values. 
If false, host state is loaded in the CPU registers - * and vmx->loaded_vmcs->host_state is invalid. - */ - bool guest_state_loaded; - - unsigned long exit_qualification; - u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -274,7 +225,6 @@ struct vcpu_vmx { struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif @@ -313,15 +263,6 @@ struct vcpu_vmx { } seg[8]; } segment_cache; int vpid; - bool emulation_required; - - union vmx_exit_reason exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Used if this vCPU is waiting for PI notification wakeup. */ - struct list_head pi_wakeup_list; /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; @@ -376,6 +317,43 @@ struct kvm_vmx { u64 *pid_table; }; +static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu) +{ + return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt); +} + +static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt) +{ + return &(container_of(vt, struct vcpu_vmx, vt)->vcpu); +} + +static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu) +{ + return to_vt(vcpu)->exit_reason; +} + +static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + return vt->exit_qualification; +} + +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + return vt->exit_intr_info; +} + void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy); int allocate_vpid(void); @@ -662,45 +640,10 @@ static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) -{ - return &to_vmx(vcpu)->lbr_desc; -} - -static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) -{ - return &vcpu_to_lbr_desc(vcpu)->records; -} - -static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) -{ - return !!vcpu_to_lbr_records(vcpu)->nr; -} - void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); -static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) - vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - return vmx->exit_qualification; -} - -static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - return vmx->exit_intr_info; -} - struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -758,4 +701,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 
vmx->segment_cache.bitmask = 0; } +int vmx_init(void); +void vmx_exit(void); + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 430773a5ef8e..6bf8be570b2e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -46,7 +46,6 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu); void vmx_migrate_timers(struct kvm_vcpu *vcpu); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu); void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu); void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, @@ -121,4 +120,114 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu); #endif void vmx_setup_mce(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_INTEL_TDX +void tdx_disable_virtualization_cpu(void); +int tdx_vm_init(struct kvm *kvm); +void tdx_mmu_release_hkid(struct kvm *kvm); +void tdx_vm_destroy(struct kvm *kvm); +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); + +int tdx_vcpu_create(struct kvm_vcpu *vcpu); +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); +void tdx_vcpu_free(struct kvm_vcpu *vcpu); +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu); +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void tdx_vcpu_put(struct kvm_vcpu *vcpu); +bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu); +int tdx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath); + +void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); +void tdx_inject_nmi(struct kvm_vcpu *vcpu); +void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); +bool tdx_has_emulated_msr(u32 index); +int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); +int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +#else +static inline void tdx_disable_virtualization_cpu(void) {} +static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; } +static inline void tdx_mmu_release_hkid(struct kvm *kvm) {} +static inline void tdx_vm_destroy(struct kvm *kvm) {} +static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; } + +static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } +static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {} +static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} +static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {} +static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { 
return -EOPNOTSUPP; } +static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + return EXIT_FASTPATH_NONE; +} +static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {} +static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {} +static inline bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) { return false; } +static inline int tdx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath) { return 0; } + +static inline void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) {} +static inline void tdx_inject_nmi(struct kvm_vcpu *vcpu) {} +static inline void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, + u64 *info2, u32 *intr_info, u32 *error_code) {} +static inline bool tdx_has_emulated_msr(u32 index) { return false; } +static inline int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; } +static inline int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; } + +static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } + +static inline int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + void *private_spt) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + void *private_spt) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + kvm_pfn_t pfn) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + kvm_pfn_t pfn) +{ + return -EOPNOTSUPP; +} + +static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {} +static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {} +static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {} +static inline int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return 0; } +#endif + #endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5bdb5b854924..570e7f8cbf64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,7 +90,6 @@ #include "trace.h" #define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 /* * Note, kvm_caps fields should *never* have default values, all fields must be @@ -636,6 +635,15 @@ static void kvm_user_return_msr_cpu_online(void) } } +static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) +{ + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; + } +} + int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -649,15 +657,20 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) return 1; msrs->values[slot].curr = value; - if (!msrs->registered) { - msrs->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&msrs->urn); - msrs->registered = true; - } + kvm_user_return_register_notifier(msrs); return 0; } EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); +void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) +{ + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + + msrs->values[slot].curr = value; + kvm_user_return_register_notifier(msrs); +} +EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); + static void 
drop_user_return_notifiers(void) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -4739,6 +4752,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; + if (kvm) + r = kvm->max_vcpus; break; case KVM_CAP_MAX_VCPU_ID: r = KVM_MAX_VCPU_IDS; @@ -4794,7 +4809,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; case KVM_CAP_DISABLE_QUIRKS2: - r = KVM_X86_VALID_QUIRKS; + r = kvm_caps.supported_quirks; break; case KVM_CAP_X86_NOTIFY_VMEXIT: r = kvm_caps.has_notify_vmexit; @@ -5117,6 +5132,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + kvm_x86_call(sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); @@ -5127,6 +5145,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, { int r; + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + r = kvm_apic_set_state(vcpu, s); if (r) return r; @@ -6304,6 +6325,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_DEVICE_ATTR: r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); break; + case KVM_MEMORY_ENCRYPT_OP: + r = -ENOTTY; + if (!kvm_x86_ops.vcpu_mem_enc_ioctl) + goto out; + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp); + break; default: r = -EINVAL; } @@ -6491,7 +6518,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; kvm_for_each_vcpu(i, vcpu, kvm) @@ -6521,11 +6548,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, switch (cap->cap) { case KVM_CAP_DISABLE_QUIRKS2: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) + if (cap->args[0] & ~kvm_caps.supported_quirks) break; fallthrough; case KVM_CAP_DISABLE_QUIRKS: - kvm->arch.disabled_quirks = cap->args[0]; + kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { @@ -7300,10 +7327,6 @@ set_pit2_out: goto out; } case KVM_MEMORY_ENCRYPT_OP: { - r = -ENOTTY; - if (!kvm_x86_ops.mem_enc_ioctl) - goto out; - r = kvm_x86_call(mem_enc_ioctl)(kvm, argp); break; } @@ -9771,6 +9794,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; + kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; rdmsrq_safe(MSR_EFER, &kvm_host.efer); @@ -9815,6 +9840,10 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); + /* KVM always ignores guest PAT for shadow paging. 
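
/*
 * Illustrative userspace sketch (not part of the diff above): with this
 * change, KVM_CHECK_EXTENSION(KVM_CAP_DISABLE_QUIRKS2) reports
 * kvm_caps.supported_quirks, so KVM_X86_QUIRK_IGNORE_GUEST_PAT is only
 * advertised on hosts that can honor guest PAT (self-snoop CPUs with TDP).
 * Assumes uapi headers that already define the new quirk bit.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        int vm = kvm < 0 ? -1 : ioctl(kvm, KVM_CREATE_VM, 0);
        struct kvm_enable_cap cap = { .cap = KVM_CAP_DISABLE_QUIRKS2 };
        int supported;

        if (vm < 0)
                return 1;

        supported = ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_DISABLE_QUIRKS2);
        printf("supported quirk mask: %#x\n", supported);

        /* Opt out of "ignore guest PAT" only where this kernel offers it. */
        if (supported & KVM_X86_QUIRK_IGNORE_GUEST_PAT) {
                cap.args[0] = KVM_X86_QUIRK_IGNORE_GUEST_PAT;
                if (ioctl(vm, KVM_ENABLE_CAP, &cap))
                        perror("KVM_ENABLE_CAP");
        }
        return 0;
}
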
*/ + if (!tdp_enabled) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; @@ -10023,13 +10052,16 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, - unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - int op_64_bit, int cpl, +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int (*complete_hypercall)(struct kvm_vcpu *)) { unsigned long ret; + unsigned long nr = kvm_rax_read(vcpu); + unsigned long a0 = kvm_rbx_read(vcpu); + unsigned long a1 = kvm_rcx_read(vcpu); + unsigned long a2 = kvm_rdx_read(vcpu); + unsigned long a3 = kvm_rsi_read(vcpu); + int op_64_bit = is_64_bit_hypercall(vcpu); ++vcpu->stat.hypercalls; @@ -10132,9 +10164,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); - return __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi, - is_64_bit_hypercall(vcpu), - kvm_x86_call(get_cpl)(vcpu), + return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu), complete_hypercall_exit); } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); @@ -10978,7 +11008,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_fpu.xfd_err) wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); - if (unlikely(vcpu->arch.switch_db_regs)) { + if (unlikely(vcpu->arch.switch_db_regs && + !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); @@ -11030,6 +11061,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH); kvm_x86_call(sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); @@ -11134,7 +11166,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } -static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) return true; @@ -11143,9 +11175,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_apic_init_sipi_allowed(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (kvm_is_exception_pending(vcpu)) return true; @@ -11183,10 +11212,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } +EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); } /* Called within kvm->srcu read side. 
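
/*
 * Sketch of the intended caller shape after the ____kvm_emulate_hypercall()
 * refactor: the helper now reads nr/a0..a3 and the 64-bit mode flag from the
 * vCPU's register cache itself, so a vendor path only supplies the CPL and a
 * completion callback.  The function name below is hypothetical, e.g. a
 * TDX-style handler that hard-codes CPL 0 for calls coming from the guest
 * kernel.
 */
static int complete_hypercall_exit(struct kvm_vcpu *vcpu);      /* assumed helper */

static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)            /* hypothetical */
{
        return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
}
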
*/ @@ -11320,7 +11351,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - if (kvm_vcpu_has_events(vcpu)) + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) state = KVM_MP_STATE_RUNNABLE; kvm_set_mp_state(vcpu, state); return 1; @@ -12694,6 +12725,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { @@ -12723,6 +12755,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Decided by the vendor code for other VM types. */ kvm->arch.pre_fault_allowed = type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; + kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks; ret = kvm_page_track_init(kvm); if (ret) @@ -12876,6 +12909,7 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) kvm_free_pit(kvm); kvm_mmu_pre_destroy_vm(kvm); + static_call_cond(kvm_x86_vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -13073,7 +13107,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { int nr_slots; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); @@ -13145,7 +13179,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (READ_ONCE(eager_page_split)) kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); - if (kvm_x86_ops.cpu_dirty_log_size) { + if (kvm->arch.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); } else { @@ -13534,8 +13568,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first * (or last) non-coherent device is (un)registered to so that new SPTEs * with the correct "ignore guest PAT" setting are created. + * + * If KVM always honors guest PAT, however, there is nothing to do. */ - if (kvm_mmu_may_ignore_guest_pat()) + if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT)) kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } @@ -14012,6 +14048,7 @@ EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9dc32a409076..88a9475899c8 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -10,6 +10,8 @@ #include "kvm_emulate.h" #include "cpuid.h" +#define KVM_MAX_MCE_BANKS 32 + struct kvm_caps { /* control of guest tsc rate supported? 
*/ bool has_tsc_control; @@ -32,6 +34,9 @@ struct kvm_caps { u64 supported_xcr0; u64 supported_xss; u64 supported_perf_cap; + + u64 supported_quirks; + u64 inapplicable_quirks; }; struct kvm_host_values { @@ -629,25 +634,17 @@ static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr) return kvm->arch.hypercall_exit_enabled & BIT(hc_nr); } -int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, - unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - int op_64_bit, int cpl, +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int (*complete_hypercall)(struct kvm_vcpu *)); -#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, complete_hypercall) \ -({ \ - int __ret; \ - \ - __ret = ____kvm_emulate_hypercall(_vcpu, \ - kvm_##nr##_read(_vcpu), kvm_##a0##_read(_vcpu), \ - kvm_##a1##_read(_vcpu), kvm_##a2##_read(_vcpu), \ - kvm_##a3##_read(_vcpu), op_64_bit, cpl, \ - complete_hypercall); \ - \ - if (__ret > 0) \ - __ret = complete_hypercall(_vcpu); \ - __ret; \ +#define __kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall) \ +({ \ + int __ret; \ + __ret = ____kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall); \ + \ + if (__ret > 0) \ + __ret = complete_hypercall(_vcpu); \ + __ret; \ }) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/virt/vmx/tdx/seamcall.S b/arch/x86/virt/vmx/tdx/seamcall.S index 5b1f2286aea9..6854c52c374b 100644 --- a/arch/x86/virt/vmx/tdx/seamcall.S +++ b/arch/x86/virt/vmx/tdx/seamcall.S @@ -41,6 +41,9 @@ SYM_FUNC_START(__seamcall_ret) TDX_MODULE_CALL host=1 ret=1 SYM_FUNC_END(__seamcall_ret) +/* KVM requires non-instrumentable __seamcall_saved_ret() for TDH.VP.ENTER */ +.section .noinstr.text, "ax" + /* * __seamcall_saved_ret() - Host-side interface functions to SEAM software * (the P-SEAMLDR or the TDX module), with saving output registers to the diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 7fdb37387886..2457d13c3f9e 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -5,6 +5,7 @@ * Intel Trusted Domain Extensions (TDX) support */ +#include "asm/page_types.h" #define pr_fmt(fmt) "virt/tdx: " fmt #include <linux/types.h> @@ -27,6 +28,7 @@ #include <linux/log2.h> #include <linux/acpi.h> #include <linux/suspend.h> +#include <linux/idr.h> #include <asm/page.h> #include <asm/special_insns.h> #include <asm/msr-index.h> @@ -42,6 +44,8 @@ static u32 tdx_global_keyid __ro_after_init; static u32 tdx_guest_keyid_start __ro_after_init; static u32 tdx_nr_guest_keyids __ro_after_init; +static DEFINE_IDA(tdx_guest_keyid_pool); + static DEFINE_PER_CPU(bool, tdx_lp_initialized); static struct tdmr_info_list tdx_tdmr_list; @@ -52,6 +56,8 @@ static DEFINE_MUTEX(tdx_module_lock); /* All TDX-usable memory regions. Protected by mem_hotplug_lock. 
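
/*
 * Standalone illustration (not kernel code) of how the two new kvm_caps
 * fields compose with the existing ioctls; the quirk bit values are made up.
 * kvm_arch_init_vm() starts each VM with the conditional quirks disabled,
 * and KVM_CAP_DISABLE_QUIRKS(2) can only OR in bits the host supports.
 */
#include <stdio.h>
#include <stdint.h>

#define QUIRK_ALWAYS      (1u << 0)     /* placeholder for an unconditional quirk */
#define QUIRK_CONDITIONAL (1u << 1)     /* placeholder, e.g. IGNORE_GUEST_PAT */

int main(void)
{
        uint64_t supported = QUIRK_ALWAYS | QUIRK_CONDITIONAL;  /* kvm_caps.supported_quirks */
        uint64_t inapplicable = QUIRK_CONDITIONAL;              /* kvm_caps.inapplicable_quirks */

        /* kvm_arch_init_vm(): conditional quirks default to "disabled". */
        uint64_t disabled = inapplicable & supported;

        /* KVM_CAP_DISABLE_QUIRKS: userspace requests are masked and OR-ed in. */
        uint64_t user_request = QUIRK_ALWAYS;
        disabled |= user_request & supported;

        printf("effective disabled_quirks: %#llx\n", (unsigned long long)disabled);
        return 0;
}
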
*/ static LIST_HEAD(tdx_memlist); +static struct tdx_sys_info tdx_sysinfo; + typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) @@ -1060,15 +1066,14 @@ static int init_tdmrs(struct tdmr_info_list *tdmr_list) static int init_tdx_module(void) { - struct tdx_sys_info sysinfo; int ret; - ret = get_tdx_sys_info(&sysinfo); + ret = get_tdx_sys_info(&tdx_sysinfo); if (ret) return ret; /* Check whether the kernel can support this module */ - ret = check_features(&sysinfo); + ret = check_features(&tdx_sysinfo); if (ret) return ret; @@ -1089,12 +1094,12 @@ static int init_tdx_module(void) goto out_put_tdxmem; /* Allocate enough space for constructing TDMRs */ - ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr); + ret = alloc_tdmr_list(&tdx_tdmr_list, &tdx_sysinfo.tdmr); if (ret) goto err_free_tdxmem; /* Cover all TDX-usable memory regions in TDMRs */ - ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr); + ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdx_sysinfo.tdmr); if (ret) goto err_free_tdmrs; @@ -1456,3 +1461,411 @@ void __init tdx_init(void) check_tdx_erratum(); } + +const struct tdx_sys_info *tdx_get_sysinfo(void) +{ + const struct tdx_sys_info *p = NULL; + + /* Make sure all fields in @tdx_sysinfo have been populated */ + mutex_lock(&tdx_module_lock); + if (tdx_module_status == TDX_MODULE_INITIALIZED) + p = (const struct tdx_sys_info *)&tdx_sysinfo; + mutex_unlock(&tdx_module_lock); + + return p; +} +EXPORT_SYMBOL_GPL(tdx_get_sysinfo); + +u32 tdx_get_nr_guest_keyids(void) +{ + return tdx_nr_guest_keyids; +} +EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids); + +int tdx_guest_keyid_alloc(void) +{ + return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, + tdx_guest_keyid_start + tdx_nr_guest_keyids - 1, + GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc); + +void tdx_guest_keyid_free(unsigned int keyid) +{ + ida_free(&tdx_guest_keyid_pool, keyid); +} +EXPORT_SYMBOL_GPL(tdx_guest_keyid_free); + +static inline u64 tdx_tdr_pa(struct tdx_td *td) +{ + return page_to_phys(td->tdr_page); +} + +static inline u64 tdx_tdvpr_pa(struct tdx_vp *td) +{ + return page_to_phys(td->tdvpr_page); +} + +/* + * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether + * a CLFLUSH of pages is required before handing them to the TDX module. + * Be conservative and make the code simpler by doing the CLFLUSH + * unconditionally. 
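
/*
 * Sketch of the expected consumer of the new KeyID allocator (the real user
 * is KVM's TDX code; the structure and function below are hypothetical).
 * ida_alloc_range() treats both bounds as inclusive, which is why the pool
 * spans tdx_guest_keyid_start .. tdx_guest_keyid_start + tdx_nr_guest_keyids - 1.
 */
struct kvm_tdx_sketch {                 /* hypothetical per-VM state */
        u16 hkid;
};

static int tdx_assign_guest_keyid(struct kvm_tdx_sketch *kvm_tdx)
{
        int keyid = tdx_guest_keyid_alloc();

        if (keyid < 0)
                return keyid;   /* -ENOSPC once every guest KeyID is in use */

        kvm_tdx->hkid = keyid;
        return 0;
}
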
+ */ +static void tdx_clflush_page(struct page *page) +{ + clflush_cache_range(page_to_virt(page), PAGE_SIZE); +} + +noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) +{ + args->rcx = tdx_tdvpr_pa(td); + + return __seamcall_saved_ret(TDH_VP_ENTER, args); +} +EXPORT_SYMBOL_GPL(tdh_vp_enter); + +u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(tdcs_page), + .rdx = tdx_tdr_pa(td), + }; + + tdx_clflush_page(tdcs_page); + return seamcall(TDH_MNG_ADDCX, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_addcx); + +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + .r9 = page_to_phys(source), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_add); + +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_sept_add); + +u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(tdcx_page), + .rdx = tdx_tdvpr_pa(vp), + }; + + tdx_clflush_page(tdcx_page); + return seamcall(TDH_VP_ADDCX, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_addcx); + +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_aug); + +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_range_block); + +u64 tdh_mng_key_config(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MNG_KEY_CONFIG, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_key_config); + +u64 tdh_mng_create(struct tdx_td *td, u16 hkid) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + .rdx = hkid, + }; + + tdx_clflush_page(td->tdr_page); + return seamcall(TDH_MNG_CREATE, &args); +} +EXPORT_SYMBOL_GPL(tdh_mng_create); + +u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) +{ + struct tdx_module_args args = { + .rcx = tdx_tdvpr_pa(vp), + .rdx = tdx_tdr_pa(td), + }; + + tdx_clflush_page(vp->tdvpr_page); + return seamcall(TDH_VP_CREATE, &args); +} +EXPORT_SYMBOL_GPL(tdh_vp_create); + +u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + .rdx = field, + }; + u64 ret; + + ret = seamcall_ret(TDH_MNG_RD, &args); + + /* R8: Content of the field, or 0 in case of error. 
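
/*
 * Hedged example of how these wrappers are meant to be consumed: the return
 * value is the raw SEAMCALL completion status (zero means success), and the
 * extended error operands carried back in RCX/RDX are only meaningful when
 * the call fails.  The caller below is illustrative, not from this series.
 */
static int tdx_link_sept_page(struct tdx_td *td, u64 gpa, int level,
                              struct page *page)
{
        u64 ext_err1, ext_err2;
        u64 err;

        err = tdh_mem_sept_add(td, gpa, level, page, &ext_err1, &ext_err2);
        if (err) {
                pr_err("TDH.MEM.SEPT.ADD failed: %#llx (%#llx, %#llx)\n",
                       err, ext_err1, ext_err2);
                return -EIO;
        }

        return 0;
}
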
 */
+	*data = args.r8;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mng_rd);
+
+u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
+{
+	struct tdx_module_args args = {
+		.rcx = gpa,
+		.rdx = tdx_tdr_pa(td),
+	};
+	u64 ret;
+
+	ret = seamcall_ret(TDH_MR_EXTEND, &args);
+
+	*ext_err1 = args.rcx;
+	*ext_err2 = args.rdx;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mr_extend);
+
+u64 tdh_mr_finalize(struct tdx_td *td)
+{
+	struct tdx_module_args args = {
+		.rcx = tdx_tdr_pa(td),
+	};
+
+	return seamcall(TDH_MR_FINALIZE, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mr_finalize);
+
+u64 tdh_vp_flush(struct tdx_vp *vp)
+{
+	struct tdx_module_args args = {
+		.rcx = tdx_tdvpr_pa(vp),
+	};
+
+	return seamcall(TDH_VP_FLUSH, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_flush);
+
+u64 tdh_mng_vpflushdone(struct tdx_td *td)
+{
+	struct tdx_module_args args = {
+		.rcx = tdx_tdr_pa(td),
+	};
+
+	return seamcall(TDH_MNG_VPFLUSHDONE, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
+
+u64 tdh_mng_key_freeid(struct tdx_td *td)
+{
+	struct tdx_module_args args = {
+		.rcx = tdx_tdr_pa(td),
+	};
+
+	return seamcall(TDH_MNG_KEY_FREEID, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
+
+u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
+{
+	struct tdx_module_args args = {
+		.rcx = tdx_tdr_pa(td),
+		.rdx = td_params,
+	};
+	u64 ret;
+
+	ret = seamcall_ret(TDH_MNG_INIT, &args);
+
+	*extended_err = args.rcx;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_mng_init);
+
+u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
+{
+	struct tdx_module_args args = {
+		.rcx = tdx_tdvpr_pa(vp),
+		.rdx = field,
+	};
+	u64 ret;
+
+	ret = seamcall_ret(TDH_VP_RD, &args);
+
+	/* R8: Content of the field, or 0 in case of error. */
+	*data = args.r8;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tdh_vp_rd);
+
+u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
+{
+	struct tdx_module_args args = {
+		.rcx = tdx_tdvpr_pa(vp),
+		.rdx = field,
+		.r8 = data,
+		.r9 = mask,
+	};
+
+	return seamcall(TDH_VP_WR, &args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_wr);
+
+u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
+{
+	struct tdx_module_args args = {
+		.rcx = tdx_tdvpr_pa(vp),
+		.rdx = initial_rcx,
+		.r8 = x2apicid,
+	};
+
+	/* apicid requires version == 1. */
+	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
+}
+EXPORT_SYMBOL_GPL(tdh_vp_init);
+
+/*
+ * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX-defined
+ * formats. So despite the names, they must be interpreted specially as described
+ * by the spec. Return them only for error reporting purposes.
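
/*
 * Sketch of the TDH.VP.WR write-mask convention used above: R8 carries the
 * new data and R9 the mask of bits to modify (per the TDX module ABI), so a
 * single-bit update of a TD-scope field can be expressed as below.  The
 * helper name is illustrative only.
 */
static u64 tdx_td_vmcs_setbit(struct tdx_vp *vp, u64 field, u64 bit)
{
        /* data == mask == the bit: leave every other bit untouched. */
        return tdh_vp_wr(vp, field, bit, bit);
}
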
+ */ +u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size) +{ + struct tdx_module_args args = { + .rcx = page_to_phys(page), + }; + u64 ret; + + ret = seamcall_ret(TDH_PHYMEM_PAGE_RECLAIM, &args); + + *tdx_pt = args.rcx; + *tdx_owner = args.rdx; + *tdx_size = args.r8; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); + +u64 tdh_mem_track(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MEM_TRACK, &args); +} +EXPORT_SYMBOL_GPL(tdh_mem_track); + +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_remove); + +u64 tdh_phymem_cache_wb(bool resume) +{ + struct tdx_module_args args = { + .rcx = resume ? 1 : 0, + }; + + return seamcall(TDH_PHYMEM_CACHE_WB, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb); + +u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) +{ + struct tdx_module_args args = {}; + + args.rcx = mk_keyed_paddr(tdx_global_keyid, td->tdr_page); + + return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); + +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) +{ + struct tdx_module_args args = {}; + + args.rcx = mk_keyed_paddr(hkid, page); + + return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 4e3d533cdd61..82bb82be8567 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -3,7 +3,6 @@ #define _X86_VIRT_TDX_H #include <linux/bits.h> -#include "tdx_global_metadata.h" /* * This file contains both macros and data structures defined by the TDX @@ -15,13 +14,46 @@ /* * TDX module SEAMCALL leaf functions */ -#define TDH_PHYMEM_PAGE_RDMD 24 -#define TDH_SYS_KEY_CONFIG 31 -#define TDH_SYS_INIT 33 -#define TDH_SYS_RD 34 -#define TDH_SYS_LP_INIT 35 -#define TDH_SYS_TDMR_INIT 36 -#define TDH_SYS_CONFIG 45 +#define TDH_VP_ENTER 0 +#define TDH_MNG_ADDCX 1 +#define TDH_MEM_PAGE_ADD 2 +#define TDH_MEM_SEPT_ADD 3 +#define TDH_VP_ADDCX 4 +#define TDH_MEM_PAGE_AUG 6 +#define TDH_MEM_RANGE_BLOCK 7 +#define TDH_MNG_KEY_CONFIG 8 +#define TDH_MNG_CREATE 9 +#define TDH_MNG_RD 11 +#define TDH_MR_EXTEND 16 +#define TDH_MR_FINALIZE 17 +#define TDH_VP_FLUSH 18 +#define TDH_MNG_VPFLUSHDONE 19 +#define TDH_VP_CREATE 10 +#define TDH_MNG_KEY_FREEID 20 +#define TDH_MNG_INIT 21 +#define TDH_VP_INIT 22 +#define TDH_PHYMEM_PAGE_RDMD 24 +#define TDH_VP_RD 26 +#define TDH_PHYMEM_PAGE_RECLAIM 28 +#define TDH_MEM_PAGE_REMOVE 29 +#define TDH_SYS_KEY_CONFIG 31 +#define TDH_SYS_INIT 33 +#define TDH_SYS_RD 34 +#define TDH_SYS_LP_INIT 35 +#define TDH_SYS_TDMR_INIT 36 +#define TDH_MEM_TRACK 38 +#define TDH_PHYMEM_CACHE_WB 40 +#define TDH_PHYMEM_PAGE_WBINVD 41 +#define TDH_VP_WR 43 +#define TDH_SYS_CONFIG 45 + +/* + * SEAMCALL leaf: + * + * Bit 15:0 Leaf number + * Bit 23:16 Version number + */ +#define TDX_VERSION_SHIFT 16 /* TDX page types */ #define PT_NDA 0x0 diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c index 8027a24d1c6e..13ad2663488b 100644 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c +++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c @@ -37,12 +37,62 @@ static int 
get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) return ret; } +static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl) +{ + int ret = 0; + u64 val; + + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000000, &val))) + sysinfo_td_ctrl->tdr_base_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000100, &val))) + sysinfo_td_ctrl->tdcs_base_size = val; + if (!ret && !(ret = read_sys_metadata_field(0x9800000100000200, &val))) + sysinfo_td_ctrl->tdvps_base_size = val; + + return ret; +} + +static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf) +{ + int ret = 0; + u64 val; + int i, j; + + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000000, &val))) + sysinfo_td_conf->attributes_fixed0 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000001, &val))) + sysinfo_td_conf->attributes_fixed1 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000002, &val))) + sysinfo_td_conf->xfam_fixed0 = val; + if (!ret && !(ret = read_sys_metadata_field(0x1900000300000003, &val))) + sysinfo_td_conf->xfam_fixed1 = val; + if (!ret && !(ret = read_sys_metadata_field(0x9900000100000004, &val))) + sysinfo_td_conf->num_cpuid_config = val; + if (!ret && !(ret = read_sys_metadata_field(0x9900000100000008, &val))) + sysinfo_td_conf->max_vcpus_per_td = val; + if (sysinfo_td_conf->num_cpuid_config > ARRAY_SIZE(sysinfo_td_conf->cpuid_config_leaves)) + return -EINVAL; + for (i = 0; i < sysinfo_td_conf->num_cpuid_config; i++) + if (!ret && !(ret = read_sys_metadata_field(0x9900000300000400 + i, &val))) + sysinfo_td_conf->cpuid_config_leaves[i] = val; + if (sysinfo_td_conf->num_cpuid_config > ARRAY_SIZE(sysinfo_td_conf->cpuid_config_values)) + return -EINVAL; + for (i = 0; i < sysinfo_td_conf->num_cpuid_config; i++) + for (j = 0; j < 2; j++) + if (!ret && !(ret = read_sys_metadata_field(0x9900000300000500 + i * 2 + j, &val))) + sysinfo_td_conf->cpuid_config_values[i][j] = val; + + return ret; +} + static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) { int ret = 0; ret = ret ?: get_tdx_sys_info_features(&sysinfo->features); ret = ret ?: get_tdx_sys_info_tdmr(&sysinfo->tdmr); + ret = ret ?: get_tdx_sys_info_td_ctrl(&sysinfo->td_ctrl); + ret = ret ?: get_tdx_sys_info_td_conf(&sysinfo->td_conf); return ret; } |
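
/*
 * Sketch of a consumer of the aggregated metadata (the function below is
 * hypothetical): once the module is initialized, tdx_get_sysinfo() exposes
 * the TD_CONF fields read above, e.g. the per-TD vCPU limit.
 */
static int tdx_check_max_vcpus(unsigned int nr_vcpus)
{
        const struct tdx_sys_info *sysinfo = tdx_get_sysinfo();

        if (!sysinfo)
                return -EOPNOTSUPP;     /* TDX module not initialized */

        if (nr_vcpus > sysinfo->td_conf.max_vcpus_per_td)
                return -EINVAL;

        return 0;
}
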