diff options
Diffstat (limited to 'arch/x86')
207 files changed, 4418 insertions, 3203 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a825bf031f49..53bab123a8ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,6 +27,7 @@ config X86_64 # Options that are inherently 64-bit kernel only: select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA @@ -125,8 +126,8 @@ config X86 select ARCH_WANTS_NO_INSTR select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE - select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT @@ -162,6 +163,7 @@ config X86 select GUP_GET_PXX_LOW_HIGH if X86_PAE select HARDIRQS_SW_RESEND select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 + select HAS_IOPORT select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB @@ -283,7 +285,6 @@ config X86 select RTC_LIB select RTC_MC146818_LIB select SPARSE_IRQ - select SRCU select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT @@ -434,7 +435,7 @@ config SMP Y to "Enhanced Real Time Clock Support", below. The "Advanced Power Management" code will be disabled if you say Y here. - See also <file:Documentation/x86/i386/IO-APIC.rst>, + See also <file:Documentation/arch/x86/i386/IO-APIC.rst>, <file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at <http://www.tldp.org/docs.html#howto>. @@ -1324,7 +1325,7 @@ config MICROCODE the Linux kernel. The preferred method to load microcode from a detached initrd is described - in Documentation/x86/microcode.rst. For that you need to enable + in Documentation/arch/x86/microcode.rst. For that you need to enable CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the initrd for microcode blobs. @@ -1510,7 +1511,7 @@ config X86_5LEVEL A kernel with the option enabled can be booted on machines that support 4- or 5-level paging. - See Documentation/x86/x86_64/5level-paging.rst for more + See Documentation/arch/x86/x86_64/5level-paging.rst for more information. Say N if unsure. @@ -1774,7 +1775,7 @@ config MTRR You can safely say Y even if your machine doesn't have MTRRs, you'll just add about 9 KB to your kernel. - See <file:Documentation/x86/mtrr.rst> for more information. + See <file:Documentation/arch/x86/mtrr.rst> for more information. config MTRR_SANITIZER def_bool y @@ -1938,7 +1939,6 @@ config X86_SGX depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC depends on CRYPTO=y depends on CRYPTO_SHA256=y - select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA select XARRAY_MULTI @@ -2290,6 +2290,17 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING If unsure, leave at the default value. +config ADDRESS_MASKING + bool "Linear Address Masking support" + depends on X86_64 + help + Linear Address Masking (LAM) modifies the checking that is applied + to 64-bit linear addresses, allowing software to use of the + untranslated address bits for metadata. + + The capability can be used for efficient address sanitizers (ASAN) + implementation and for optimizations in JITs. + config HOTPLUG_CPU def_bool y depends on SMP @@ -2551,7 +2562,7 @@ config PAGE_TABLE_ISOLATION ensuring that the majority of kernel addresses are not mapped into userspace. - See Documentation/x86/pti.rst for more details. + See Documentation/arch/x86/pti.rst for more details. config RETPOLINE bool "Avoid speculative indirect branches in kernel" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bdfe08f1a930..c5d614d28a75 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -97,7 +97,7 @@ config IOMMU_DEBUG code. When you use it make sure you have a big enough IOMMU/AGP aperture. Most of the options enabled by this can be set more finegrained using the iommu= command line - options. See Documentation/x86/x86_64/boot-options.rst for more + options. See Documentation/arch/x86/x86_64/boot-options.rst for more details. config IOMMU_LEAK diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index b70559b821df..2106a2bd152b 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -3,9 +3,14 @@ core-y += arch/x86/crypto/ # # Disable SSE and other FP/SIMD instructions to match normal x86 +# This is required to work around issues in older LLVM versions, but breaks +# GCC versions < 11. See: +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652 # +ifeq ($(CONFIG_CC_IS_CLANG),y) KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 +endif ifeq ($(CONFIG_X86_32),y) START := 0x8048000 diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 321a5011042d..bcc956c17872 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -8,14 +8,6 @@ * Copyright (C) 2016 Kees Cook */ -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. - */ -#define __pa(x) ((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long)(x))) - /* No PAGE_TABLE_ISOLATION support needed either: */ #undef CONFIG_PAGE_TABLE_ISOLATION diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 20118fb7c53b..2f155a0e3041 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -19,6 +19,15 @@ /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 +/* + * Boot stub deals with identity mappings, physical and virtual addresses are + * the same, so override these defines. + * + * <asm/page.h> will not define them if they are already defined. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index d63ad8f99f83..014b89c89088 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -104,9 +104,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, } #undef __init -#undef __pa #define __init -#define __pa(x) ((unsigned long)(x)) #define __BOOT_COMPRESSED diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c index 918a7606f53c..2d81d3cc72a1 100644 --- a/arch/x86/boot/compressed/tdx.c +++ b/arch/x86/boot/compressed/tdx.c @@ -26,7 +26,7 @@ static inline unsigned int tdx_io_in(int size, u16 port) .r14 = port, }; - if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + if (__tdx_hypercall_ret(&args)) return UINT_MAX; return args.r11; @@ -43,7 +43,7 @@ static inline void tdx_io_out(int size, u16 port, u32 value) .r15 = value, }; - __tdx_hypercall(&args, 0); + __tdx_hypercall(&args); } static inline u8 tdx_inb(u16 port) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 9338c68e7413..b04ca8e2b213 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -321,7 +321,7 @@ start_sys_seg: .word SYSSEG # obsolete and meaningless, but just type_of_loader: .byte 0 # 0 means ancient bootloader, newer # bootloaders know to change this. - # See Documentation/x86/boot.rst for + # See Documentation/arch/x86/boot.rst for # assigned ids # flags, unused bits must be zero (RFU) bit within loadflags diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 49b44f881484..73f83233d25d 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -13,7 +13,7 @@ #include <asm/coco.h> #include <asm/processor.h> -static enum cc_vendor vendor __ro_after_init; +enum cc_vendor cc_vendor __ro_after_init; static u64 cc_mask __ro_after_init; static bool intel_cc_platform_has(enum cc_attr attr) @@ -30,6 +30,22 @@ static bool intel_cc_platform_has(enum cc_attr attr) } /* + * Handle the SEV-SNP vTOM case where sme_me_mask is zero, and + * the other levels of SME/SEV functionality, including C-bit + * based SEV-SNP, are not enabled. + */ +static __maybe_unused bool amd_cc_platform_vtom(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_GUEST_MEM_ENCRYPT: + case CC_ATTR_MEM_ENCRYPT: + return true; + default: + return false; + } +} + +/* * SME and SEV are very similar but they are not the same, so there are * times that the kernel will need to distinguish between SME and SEV. The * cc_platform_has() function is used for this. When a distinction isn't @@ -41,9 +57,14 @@ static bool intel_cc_platform_has(enum cc_attr attr) * up under SME the trampoline area cannot be encrypted, whereas under SEV * the trampoline area must be encrypted. */ + static bool amd_cc_platform_has(enum cc_attr attr) { #ifdef CONFIG_AMD_MEM_ENCRYPT + + if (sev_status & MSR_AMD64_SNP_VTOM) + return amd_cc_platform_vtom(attr); + switch (attr) { case CC_ATTR_MEM_ENCRYPT: return sme_me_mask; @@ -76,20 +97,13 @@ static bool amd_cc_platform_has(enum cc_attr attr) #endif } -static bool hyperv_cc_platform_has(enum cc_attr attr) -{ - return attr == CC_ATTR_GUEST_MEM_ENCRYPT; -} - bool cc_platform_has(enum cc_attr attr) { - switch (vendor) { + switch (cc_vendor) { case CC_VENDOR_AMD: return amd_cc_platform_has(attr); case CC_VENDOR_INTEL: return intel_cc_platform_has(attr); - case CC_VENDOR_HYPERV: - return hyperv_cc_platform_has(attr); default: return false; } @@ -103,11 +117,14 @@ u64 cc_mkenc(u64 val) * encryption status of the page. * * - for AMD, bit *set* means the page is encrypted - * - for Intel *clear* means encrypted. + * - for AMD with vTOM and for Intel, *clear* means encrypted */ - switch (vendor) { + switch (cc_vendor) { case CC_VENDOR_AMD: - return val | cc_mask; + if (sev_status & MSR_AMD64_SNP_VTOM) + return val & ~cc_mask; + else + return val | cc_mask; case CC_VENDOR_INTEL: return val & ~cc_mask; default: @@ -118,9 +135,12 @@ u64 cc_mkenc(u64 val) u64 cc_mkdec(u64 val) { /* See comment in cc_mkenc() */ - switch (vendor) { + switch (cc_vendor) { case CC_VENDOR_AMD: - return val & ~cc_mask; + if (sev_status & MSR_AMD64_SNP_VTOM) + return val | cc_mask; + else + return val & ~cc_mask; case CC_VENDOR_INTEL: return val | cc_mask; default: @@ -129,11 +149,6 @@ u64 cc_mkdec(u64 val) } EXPORT_SYMBOL_GPL(cc_mkdec); -__init void cc_set_vendor(enum cc_vendor v) -{ - vendor = v; -} - __init void cc_set_mask(u64 mask) { cc_mask = mask; diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index 6a255e6809bc..b193c0a1d8db 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -85,12 +85,12 @@ SYM_FUNC_START(__tdx_module_call) SYM_FUNC_END(__tdx_module_call) /* - * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf - * of TDCALL instruction + * TDX_HYPERCALL - Make hypercalls to a TDX VMM using TDVMCALL leaf of TDCALL + * instruction * * Transforms values in function call argument struct tdx_hypercall_args @args * into the TDCALL register ABI. After TDCALL operation, VMM output is saved - * back in @args. + * back in @args, if \ret is 1. * *------------------------------------------------------------------------- * TD VMCALL ABI: @@ -105,26 +105,18 @@ SYM_FUNC_END(__tdx_module_call) * specification. Non zero value indicates vendor * specific ABI. * R11 - VMCALL sub function number - * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments. + * RBX, RDX, RDI, RSI - Used to pass VMCALL sub function specific arguments. * R8-R9, R12-R15 - Same as above. * * Output Registers: * * RAX - TDCALL instruction status (Not related to hypercall * output). - * R10 - Hypercall output error code. - * R11-R15 - Hypercall sub function specific output values. + * RBX, RDX, RDI, RSI - Hypercall sub function specific output values. + * R8-R15 - Same as above. * - *------------------------------------------------------------------------- - * - * __tdx_hypercall() function ABI: - * - * @args (RDI) - struct tdx_hypercall_args for input and output - * @flags (RSI) - TDX_HCALL_* flags - * - * On successful completion, return the hypercall error code. */ -SYM_FUNC_START(__tdx_hypercall) +.macro TDX_HYPERCALL ret:req FRAME_BEGIN /* Save callee-saved GPRs as mandated by the x86_64 ABI */ @@ -134,9 +126,8 @@ SYM_FUNC_START(__tdx_hypercall) push %r12 push %rbx - /* Free RDI and RSI to be used as TDVMCALL arguments */ + /* Free RDI to be used as TDVMCALL arguments */ movq %rdi, %rax - push %rsi /* Copy hypercall registers from arg struct: */ movq TDX_HYPERCALL_r8(%rax), %r8 @@ -171,14 +162,11 @@ SYM_FUNC_START(__tdx_hypercall) * and are handled by callers. */ testq %rax, %rax - jne .Lpanic + jne .Lpanic\@ pop %rax - /* Copy hypercall result registers to arg struct if needed */ - testq $TDX_HCALL_HAS_OUTPUT, (%rsp) - jz .Lout - + .if \ret movq %r8, TDX_HYPERCALL_r8(%rax) movq %r9, TDX_HYPERCALL_r9(%rax) movq %r10, TDX_HYPERCALL_r10(%rax) @@ -191,7 +179,8 @@ SYM_FUNC_START(__tdx_hypercall) movq %rsi, TDX_HYPERCALL_rsi(%rax) movq %rbx, TDX_HYPERCALL_rbx(%rax) movq %rdx, TDX_HYPERCALL_rdx(%rax) -.Lout: + .endif + /* TDVMCALL leaf return code is in R10 */ movq %r10, %rax @@ -208,9 +197,6 @@ SYM_FUNC_START(__tdx_hypercall) xor %rdi, %rdi xor %rdx, %rdx - /* Remove TDX_HCALL_* flags from the stack */ - pop %rsi - /* Restore callee-saved GPRs as mandated by the x86_64 ABI */ pop %rbx pop %r12 @@ -221,9 +207,33 @@ SYM_FUNC_START(__tdx_hypercall) FRAME_END RET -.Lpanic: +.Lpanic\@: call __tdx_hypercall_failed /* __tdx_hypercall_failed never returns */ REACHABLE - jmp .Lpanic + jmp .Lpanic\@ +.endm + +/* + * + * __tdx_hypercall() function ABI: + * + * @args (RDI) - struct tdx_hypercall_args for input + * + * On successful completion, return the hypercall error code. + */ +SYM_FUNC_START(__tdx_hypercall) + TDX_HYPERCALL ret=0 SYM_FUNC_END(__tdx_hypercall) + +/* + * + * __tdx_hypercall_ret() function ABI: + * + * @args (RDI) - struct tdx_hypercall_args for input and output + * + * On successful completion, return the hypercall error code. + */ +SYM_FUNC_START(__tdx_hypercall_ret) + TDX_HYPERCALL ret=1 +SYM_FUNC_END(__tdx_hypercall_ret) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 055300e08fb3..e146b599260f 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -66,7 +66,7 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) .r15 = r15, }; - return __tdx_hypercall(&args, 0); + return __tdx_hypercall(&args); } /* Called from __tdx_hypercall() for unrecoverable failure */ @@ -99,7 +99,7 @@ long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, .r14 = p4, }; - return __tdx_hypercall(&args, 0); + return __tdx_hypercall(&args); } EXPORT_SYMBOL_GPL(tdx_kvm_hypercall); #endif @@ -179,7 +179,7 @@ static void __noreturn tdx_panic(const char *msg) * happens to return. */ while (1) - __tdx_hypercall(&args, 0); + __tdx_hypercall(&args); } static void tdx_parse_tdinfo(u64 *cc_mask) @@ -289,7 +289,7 @@ static u64 __cpuidle __halt(const bool irq_disabled) * can keep the vCPU in virtual HLT, even if an IRQ is * pending, without hanging/breaking the guest. */ - return __tdx_hypercall(&args, 0); + return __tdx_hypercall(&args); } static int handle_halt(struct ve_info *ve) @@ -326,7 +326,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve) * can be found in TDX Guest-Host-Communication Interface * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>". */ - if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + if (__tdx_hypercall_ret(&args)) return -EIO; regs->ax = lower_32_bits(args.r11); @@ -348,7 +348,7 @@ static int write_msr(struct pt_regs *regs, struct ve_info *ve) * can be found in TDX Guest-Host-Communication Interface * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>". */ - if (__tdx_hypercall(&args, 0)) + if (__tdx_hypercall(&args)) return -EIO; return ve_instr_len(ve); @@ -380,7 +380,7 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve) * ABI can be found in TDX Guest-Host-Communication Interface * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>". */ - if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + if (__tdx_hypercall_ret(&args)) return -EIO; /* @@ -407,7 +407,7 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val) .r15 = *val, }; - if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + if (__tdx_hypercall_ret(&args)) return false; *val = args.r11; return true; @@ -541,7 +541,7 @@ static bool handle_in(struct pt_regs *regs, int size, int port) * in TDX Guest-Host-Communication Interface (GHCI) section titled * "TDG.VP.VMCALL<Instruction.IO>". */ - success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT); + success = !__tdx_hypercall_ret(&args); /* Update part of the register affected by the emulated instruction */ regs->ax &= ~mask; diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index cdf3215ec272..ad7f4c891625 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -201,8 +201,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) movdqa KEY, STATE4 /* load the constants: */ - movdqa .Laegis128_const_0, STATE2 - movdqa .Laegis128_const_1, STATE1 + movdqa .Laegis128_const_0(%rip), STATE2 + movdqa .Laegis128_const_1(%rip), STATE1 pxor STATE2, STATE3 pxor STATE1, STATE4 @@ -682,7 +682,7 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) punpcklbw T0, T0 punpcklbw T0, T0 punpcklbw T0, T0 - movdqa .Laegis128_counter, T1 + movdqa .Laegis128_counter(%rip), T1 pcmpgtb T1, T0 pand T0, MSG diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 837c1e0aa021..3ac7487ecad2 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -288,53 +288,53 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff # Encrypt/Decrypt first few blocks and $(3<<4), %r12 - jz _initial_num_blocks_is_0_\@ + jz .L_initial_num_blocks_is_0_\@ cmp $(2<<4), %r12 - jb _initial_num_blocks_is_1_\@ - je _initial_num_blocks_is_2_\@ -_initial_num_blocks_is_3_\@: + jb .L_initial_num_blocks_is_1_\@ + je .L_initial_num_blocks_is_2_\@ +.L_initial_num_blocks_is_3_\@: INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation sub $48, %r13 - jmp _initial_blocks_\@ -_initial_num_blocks_is_2_\@: + jmp .L_initial_blocks_\@ +.L_initial_num_blocks_is_2_\@: INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation sub $32, %r13 - jmp _initial_blocks_\@ -_initial_num_blocks_is_1_\@: + jmp .L_initial_blocks_\@ +.L_initial_num_blocks_is_1_\@: INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation sub $16, %r13 - jmp _initial_blocks_\@ -_initial_num_blocks_is_0_\@: + jmp .L_initial_blocks_\@ +.L_initial_num_blocks_is_0_\@: INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation -_initial_blocks_\@: +.L_initial_blocks_\@: # Main loop - Encrypt/Decrypt remaining blocks test %r13, %r13 - je _zero_cipher_left_\@ + je .L_zero_cipher_left_\@ sub $64, %r13 - je _four_cipher_left_\@ -_crypt_by_4_\@: + je .L_four_cipher_left_\@ +.L_crypt_by_4_\@: GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ %xmm7, %xmm8, enc add $64, %r11 sub $64, %r13 - jne _crypt_by_4_\@ -_four_cipher_left_\@: + jne .L_crypt_by_4_\@ +.L_four_cipher_left_\@: GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -_zero_cipher_left_\@: +.L_zero_cipher_left_\@: movdqu %xmm8, AadHash(%arg2) movdqu %xmm0, CurCount(%arg2) mov %arg5, %r13 and $15, %r13 # %r13 = arg5 (mod 16) - je _multiple_of_16_bytes_\@ + je .L_multiple_of_16_bytes_\@ mov %r13, PBlockLen(%arg2) @@ -348,14 +348,14 @@ _zero_cipher_left_\@: movdqu %xmm0, PBlockEncKey(%arg2) cmp $16, %arg5 - jge _large_enough_update_\@ + jge .L_large_enough_update_\@ lea (%arg4,%r11,1), %r10 mov %r13, %r12 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - jmp _data_read_\@ + jmp .L_data_read_\@ -_large_enough_update_\@: +.L_large_enough_update_\@: sub $16, %r11 add %r13, %r11 @@ -374,7 +374,7 @@ _large_enough_update_\@: # shift right 16-r13 bytes pshufb %xmm2, %xmm1 -_data_read_\@: +.L_data_read_\@: lea ALL_F+16(%rip), %r12 sub %r13, %r12 @@ -409,19 +409,19 @@ _data_read_\@: # Output %r13 bytes movq %xmm0, %rax cmp $8, %r13 - jle _less_than_8_bytes_left_\@ + jle .L_less_than_8_bytes_left_\@ mov %rax, (%arg3 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 movq %xmm0, %rax sub $8, %r13 -_less_than_8_bytes_left_\@: +.L_less_than_8_bytes_left_\@: mov %al, (%arg3, %r11, 1) add $1, %r11 shr $8, %rax sub $1, %r13 - jne _less_than_8_bytes_left_\@ -_multiple_of_16_bytes_\@: + jne .L_less_than_8_bytes_left_\@ +.L_multiple_of_16_bytes_\@: .endm # GCM_COMPLETE Finishes update of tag of last partial block @@ -434,11 +434,11 @@ _multiple_of_16_bytes_\@: mov PBlockLen(%arg2), %r12 test %r12, %r12 - je _partial_done\@ + je .L_partial_done\@ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 -_partial_done\@: +.L_partial_done\@: mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits movd %r12d, %xmm15 # len(A) in %xmm15 @@ -457,44 +457,44 @@ _partial_done\@: movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) pxor %xmm8, %xmm0 -_return_T_\@: +.L_return_T_\@: mov \AUTHTAG, %r10 # %r10 = authTag mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len cmp $16, %r11 - je _T_16_\@ + je .L_T_16_\@ cmp $8, %r11 - jl _T_4_\@ -_T_8_\@: + jl .L_T_4_\@ +.L_T_8_\@: movq %xmm0, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 psrldq $8, %xmm0 test %r11, %r11 - je _return_T_done_\@ -_T_4_\@: + je .L_return_T_done_\@ +.L_T_4_\@: movd %xmm0, %eax mov %eax, (%r10) add $4, %r10 sub $4, %r11 psrldq $4, %xmm0 test %r11, %r11 - je _return_T_done_\@ -_T_123_\@: + je .L_return_T_done_\@ +.L_T_123_\@: movd %xmm0, %eax cmp $2, %r11 - jl _T_1_\@ + jl .L_T_1_\@ mov %ax, (%r10) cmp $2, %r11 - je _return_T_done_\@ + je .L_return_T_done_\@ add $2, %r10 sar $16, %eax -_T_1_\@: +.L_T_1_\@: mov %al, (%r10) - jmp _return_T_done_\@ -_T_16_\@: + jmp .L_return_T_done_\@ +.L_T_16_\@: movdqu %xmm0, (%r10) -_return_T_done_\@: +.L_return_T_done_\@: .endm #ifdef __x86_64__ @@ -563,30 +563,30 @@ _return_T_done_\@: # Clobbers %rax, DLEN and XMM1 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst cmp $8, \DLEN - jl _read_lt8_\@ + jl .L_read_lt8_\@ mov (\DPTR), %rax movq %rax, \XMMDst sub $8, \DLEN - jz _done_read_partial_block_\@ + jz .L_done_read_partial_block_\@ xor %eax, %eax -_read_next_byte_\@: +.L_read_next_byte_\@: shl $8, %rax mov 7(\DPTR, \DLEN, 1), %al dec \DLEN - jnz _read_next_byte_\@ + jnz .L_read_next_byte_\@ movq %rax, \XMM1 pslldq $8, \XMM1 por \XMM1, \XMMDst - jmp _done_read_partial_block_\@ -_read_lt8_\@: + jmp .L_done_read_partial_block_\@ +.L_read_lt8_\@: xor %eax, %eax -_read_next_byte_lt8_\@: +.L_read_next_byte_lt8_\@: shl $8, %rax mov -1(\DPTR, \DLEN, 1), %al dec \DLEN - jnz _read_next_byte_lt8_\@ + jnz .L_read_next_byte_lt8_\@ movq %rax, \XMMDst -_done_read_partial_block_\@: +.L_done_read_partial_block_\@: .endm # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. @@ -600,8 +600,8 @@ _done_read_partial_block_\@: pxor \TMP6, \TMP6 cmp $16, %r11 - jl _get_AAD_rest\@ -_get_AAD_blocks\@: + jl .L_get_AAD_rest\@ +.L_get_AAD_blocks\@: movdqu (%r10), \TMP7 pshufb %xmm14, \TMP7 # byte-reflect the AAD data pxor \TMP7, \TMP6 @@ -609,14 +609,14 @@ _get_AAD_blocks\@: add $16, %r10 sub $16, %r11 cmp $16, %r11 - jge _get_AAD_blocks\@ + jge .L_get_AAD_blocks\@ movdqu \TMP6, \TMP7 /* read the last <16B of AAD */ -_get_AAD_rest\@: +.L_get_AAD_rest\@: test %r11, %r11 - je _get_AAD_done\@ + je .L_get_AAD_done\@ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 pshufb %xmm14, \TMP7 # byte-reflect the AAD data @@ -624,7 +624,7 @@ _get_AAD_rest\@: GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 movdqu \TMP7, \TMP6 -_get_AAD_done\@: +.L_get_AAD_done\@: movdqu \TMP6, AadHash(%arg2) .endm @@ -637,21 +637,21 @@ _get_AAD_done\@: AAD_HASH operation mov PBlockLen(%arg2), %r13 test %r13, %r13 - je _partial_block_done_\@ # Leave Macro if no partial blocks + je .L_partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN - jl _fewer_than_16_bytes_\@ + jl .L_fewer_than_16_bytes_\@ movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp _data_read_\@ + jmp .L_data_read_\@ -_fewer_than_16_bytes_\@: +.L_fewer_than_16_bytes_\@: lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 mov \PLAIN_CYPH_LEN, %r12 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 mov PBlockLen(%arg2), %r13 -_data_read_\@: # Finished reading in data +.L_data_read_\@: # Finished reading in data movdqu PBlockEncKey(%arg2), %xmm9 movdqu HashKey(%arg2), %xmm13 @@ -674,9 +674,9 @@ _data_read_\@: # Finished reading in data sub $16, %r10 # Determine if if partial block is not being filled and # shift mask accordingly - jge _no_extra_mask_1_\@ + jge .L_no_extra_mask_1_\@ sub %r10, %r12 -_no_extra_mask_1_\@: +.L_no_extra_mask_1_\@: movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out bottom r13 bytes of xmm9 @@ -689,17 +689,17 @@ _no_extra_mask_1_\@: pxor %xmm3, \AAD_HASH test %r10, %r10 - jl _partial_incomplete_1_\@ + jl .L_partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 xor %eax, %eax mov %rax, PBlockLen(%arg2) - jmp _dec_done_\@ -_partial_incomplete_1_\@: + jmp .L_dec_done_\@ +.L_partial_incomplete_1_\@: add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -_dec_done_\@: +.L_dec_done_\@: movdqu \AAD_HASH, AadHash(%arg2) .else pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) @@ -710,9 +710,9 @@ _dec_done_\@: sub $16, %r10 # Determine if if partial block is not being filled and # shift mask accordingly - jge _no_extra_mask_2_\@ + jge .L_no_extra_mask_2_\@ sub %r10, %r12 -_no_extra_mask_2_\@: +.L_no_extra_mask_2_\@: movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out bottom r13 bytes of xmm9 @@ -724,17 +724,17 @@ _no_extra_mask_2_\@: pxor %xmm9, \AAD_HASH test %r10, %r10 - jl _partial_incomplete_2_\@ + jl .L_partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 xor %eax, %eax mov %rax, PBlockLen(%arg2) - jmp _encode_done_\@ -_partial_incomplete_2_\@: + jmp .L_encode_done_\@ +.L_partial_incomplete_2_\@: add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -_encode_done_\@: +.L_encode_done_\@: movdqu \AAD_HASH, AadHash(%arg2) movdqa SHUF_MASK(%rip), %xmm10 @@ -744,32 +744,32 @@ _encode_done_\@: .endif # output encrypted Bytes test %r10, %r10 - jl _partial_fill_\@ + jl .L_partial_fill_\@ mov %r13, %r12 mov $16, %r13 # Set r13 to be the number of bytes to write out sub %r12, %r13 - jmp _count_set_\@ -_partial_fill_\@: + jmp .L_count_set_\@ +.L_partial_fill_\@: mov \PLAIN_CYPH_LEN, %r13 -_count_set_\@: +.L_count_set_\@: movdqa %xmm9, %xmm0 movq %xmm0, %rax cmp $8, %r13 - jle _less_than_8_bytes_left_\@ + jle .L_less_than_8_bytes_left_\@ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $8, \DATA_OFFSET psrldq $8, %xmm0 movq %xmm0, %rax sub $8, %r13 -_less_than_8_bytes_left_\@: +.L_less_than_8_bytes_left_\@: movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $1, \DATA_OFFSET shr $8, %rax sub $1, %r13 - jne _less_than_8_bytes_left_\@ -_partial_block_done_\@: + jne .L_less_than_8_bytes_left_\@ +.L_partial_block_done_\@: .endm # PARTIAL_BLOCK /* @@ -813,14 +813,14 @@ _partial_block_done_\@: shr $2,%eax # 128->4, 192->6, 256->8 add $5,%eax # 128->9, 192->11, 256->13 -aes_loop_initial_\@: +.Laes_loop_initial_\@: MOVADQ (%r10),\TMP1 .irpc index, \i_seq aesenc \TMP1, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_initial_\@ + jnz .Laes_loop_initial_\@ MOVADQ (%r10), \TMP1 .irpc index, \i_seq @@ -861,7 +861,7 @@ aes_loop_initial_\@: GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 .endif cmp $64, %r13 - jl _initial_blocks_done\@ + jl .L_initial_blocks_done\@ # no need for precomputed values /* * @@ -908,18 +908,18 @@ aes_loop_initial_\@: mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_pre_done\@ + jz .Laes_loop_pre_done\@ -aes_loop_pre_\@: +.Laes_loop_pre_\@: MOVADQ (%r10),\TMP2 .irpc index, 1234 aesenc \TMP2, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_pre_\@ + jnz .Laes_loop_pre_\@ -aes_loop_pre_done\@: +.Laes_loop_pre_done\@: MOVADQ (%r10), \TMP2 aesenclast \TMP2, \XMM1 aesenclast \TMP2, \XMM2 @@ -963,7 +963,7 @@ aes_loop_pre_done\@: pshufb %xmm14, \XMM3 # perform a 16 byte swap pshufb %xmm14, \XMM4 # perform a 16 byte swap -_initial_blocks_done\@: +.L_initial_blocks_done\@: .endm @@ -1095,18 +1095,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_par_enc_done\@ + jz .Laes_loop_par_enc_done\@ -aes_loop_par_enc\@: +.Laes_loop_par_enc\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_par_enc\@ + jnz .Laes_loop_par_enc\@ -aes_loop_par_enc_done\@: +.Laes_loop_par_enc_done\@: MOVADQ (%r10), \TMP3 aesenclast \TMP3, \XMM1 # Round 10 aesenclast \TMP3, \XMM2 @@ -1303,18 +1303,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 sub $4,%eax # 128->0, 192->2, 256->4 - jz aes_loop_par_dec_done\@ + jz .Laes_loop_par_dec_done\@ -aes_loop_par_dec\@: +.Laes_loop_par_dec\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax - jnz aes_loop_par_dec\@ + jnz .Laes_loop_par_dec\@ -aes_loop_par_dec_done\@: +.Laes_loop_par_dec_done\@: MOVADQ (%r10), \TMP3 aesenclast \TMP3, \XMM1 # last round aesenclast \TMP3, \XMM2 @@ -2717,7 +2717,7 @@ SYM_FUNC_END(aesni_cts_cbc_dec) * BSWAP_MASK == endian swapping mask */ SYM_FUNC_START_LOCAL(_aesni_inc_init) - movaps .Lbswap_mask, BSWAP_MASK + movaps .Lbswap_mask(%rip), BSWAP_MASK movaps IV, CTR pshufb BSWAP_MASK, CTR mov $1, TCTR_LOW diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 0852ab573fd3..46cddd78857b 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -154,30 +154,6 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 -.section .rodata -.align 16 -.type aad_shift_arr, @object -.size aad_shift_arr, 272 -aad_shift_arr: - .octa 0xffffffffffffffffffffffffffffffff - .octa 0xffffffffffffffffffffffffffffff0C - .octa 0xffffffffffffffffffffffffffff0D0C - .octa 0xffffffffffffffffffffffffff0E0D0C - .octa 0xffffffffffffffffffffffff0F0E0D0C - .octa 0xffffffffffffffffffffff0C0B0A0908 - .octa 0xffffffffffffffffffff0D0C0B0A0908 - .octa 0xffffffffffffffffff0E0D0C0B0A0908 - .octa 0xffffffffffffffff0F0E0D0C0B0A0908 - .octa 0xffffffffffffff0C0B0A090807060504 - .octa 0xffffffffffff0D0C0B0A090807060504 - .octa 0xffffffffff0E0D0C0B0A090807060504 - .octa 0xffffffff0F0E0D0C0B0A090807060504 - .octa 0xffffff0C0B0A09080706050403020100 - .octa 0xffff0D0C0B0A09080706050403020100 - .octa 0xff0E0D0C0B0A09080706050403020100 - .octa 0x0F0E0D0C0B0A09080706050403020100 - - .text @@ -302,68 +278,68 @@ VARIABLE_OFFSET = 16*8 mov %r13, %r12 shr $4, %r12 and $7, %r12 - jz _initial_num_blocks_is_0\@ + jz .L_initial_num_blocks_is_0\@ cmp $7, %r12 - je _initial_num_blocks_is_7\@ + je .L_initial_num_blocks_is_7\@ cmp $6, %r12 - je _initial_num_blocks_is_6\@ + je .L_initial_num_blocks_is_6\@ cmp $5, %r12 - je _initial_num_blocks_is_5\@ + je .L_initial_num_blocks_is_5\@ cmp $4, %r12 - je _initial_num_blocks_is_4\@ + je .L_initial_num_blocks_is_4\@ cmp $3, %r12 - je _initial_num_blocks_is_3\@ + je .L_initial_num_blocks_is_3\@ cmp $2, %r12 - je _initial_num_blocks_is_2\@ + je .L_initial_num_blocks_is_2\@ - jmp _initial_num_blocks_is_1\@ + jmp .L_initial_num_blocks_is_1\@ -_initial_num_blocks_is_7\@: +.L_initial_num_blocks_is_7\@: \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*7, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_6\@: +.L_initial_num_blocks_is_6\@: \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*6, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_5\@: +.L_initial_num_blocks_is_5\@: \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*5, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_4\@: +.L_initial_num_blocks_is_4\@: \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*4, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_3\@: +.L_initial_num_blocks_is_3\@: \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*3, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_2\@: +.L_initial_num_blocks_is_2\@: \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*2, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_1\@: +.L_initial_num_blocks_is_1\@: \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*1, %r13 - jmp _initial_blocks_encrypted\@ + jmp .L_initial_blocks_encrypted\@ -_initial_num_blocks_is_0\@: +.L_initial_num_blocks_is_0\@: \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC -_initial_blocks_encrypted\@: +.L_initial_blocks_encrypted\@: test %r13, %r13 - je _zero_cipher_left\@ + je .L_zero_cipher_left\@ sub $128, %r13 - je _eight_cipher_left\@ + je .L_eight_cipher_left\@ @@ -373,9 +349,9 @@ _initial_blocks_encrypted\@: vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 -_encrypt_by_8_new\@: +.L_encrypt_by_8_new\@: cmp $(255-8), %r15d - jg _encrypt_by_8\@ + jg .L_encrypt_by_8\@ @@ -383,30 +359,30 @@ _encrypt_by_8_new\@: \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC add $128, %r11 sub $128, %r13 - jne _encrypt_by_8_new\@ + jne .L_encrypt_by_8_new\@ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp _eight_cipher_left\@ + jmp .L_eight_cipher_left\@ -_encrypt_by_8\@: +.L_encrypt_by_8\@: vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 add $8, %r15b \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 add $128, %r11 sub $128, %r13 - jne _encrypt_by_8_new\@ + jne .L_encrypt_by_8_new\@ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 -_eight_cipher_left\@: +.L_eight_cipher_left\@: \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 -_zero_cipher_left\@: +.L_zero_cipher_left\@: vmovdqu %xmm14, AadHash(arg2) vmovdqu %xmm9, CurCount(arg2) @@ -414,7 +390,7 @@ _zero_cipher_left\@: mov arg5, %r13 and $15, %r13 # r13 = (arg5 mod 16) - je _multiple_of_16_bytes\@ + je .L_multiple_of_16_bytes\@ # handle the last <16 Byte block separately @@ -428,7 +404,7 @@ _zero_cipher_left\@: vmovdqu %xmm9, PBlockEncKey(arg2) cmp $16, arg5 - jge _large_enough_update\@ + jge .L_large_enough_update\@ lea (arg4,%r11,1), %r10 mov %r13, %r12 @@ -440,9 +416,9 @@ _zero_cipher_left\@: # able to shift 16-r13 bytes (r13 is the # number of bytes in plaintext mod 16) - jmp _final_ghash_mul\@ + jmp .L_final_ghash_mul\@ -_large_enough_update\@: +.L_large_enough_update\@: sub $16, %r11 add %r13, %r11 @@ -461,7 +437,7 @@ _large_enough_update\@: # shift right 16-r13 bytes vpshufb %xmm2, %xmm1, %xmm1 -_final_ghash_mul\@: +.L_final_ghash_mul\@: .if \ENC_DEC == DEC vmovdqa %xmm1, %xmm2 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) @@ -490,7 +466,7 @@ _final_ghash_mul\@: # output r13 Bytes vmovq %xmm9, %rax cmp $8, %r13 - jle _less_than_8_bytes_left\@ + jle .L_less_than_8_bytes_left\@ mov %rax, (arg3 , %r11) add $8, %r11 @@ -498,15 +474,15 @@ _final_ghash_mul\@: vmovq %xmm9, %rax sub $8, %r13 -_less_than_8_bytes_left\@: +.L_less_than_8_bytes_left\@: movb %al, (arg3 , %r11) add $1, %r11 shr $8, %rax sub $1, %r13 - jne _less_than_8_bytes_left\@ + jne .L_less_than_8_bytes_left\@ ############################# -_multiple_of_16_bytes\@: +.L_multiple_of_16_bytes\@: .endm @@ -519,12 +495,12 @@ _multiple_of_16_bytes\@: mov PBlockLen(arg2), %r12 test %r12, %r12 - je _partial_done\@ + je .L_partial_done\@ #GHASH computation for the last <16 Byte block \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 -_partial_done\@: +.L_partial_done\@: mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 @@ -547,49 +523,49 @@ _partial_done\@: -_return_T\@: +.L_return_T\@: mov \AUTH_TAG, %r10 # r10 = authTag mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len cmp $16, %r11 - je _T_16\@ + je .L_T_16\@ cmp $8, %r11 - jl _T_4\@ + jl .L_T_4\@ -_T_8\@: +.L_T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 test %r11, %r11 - je _return_T_done\@ -_T_4\@: + je .L_return_T_done\@ +.L_T_4\@: vmovd %xmm9, %eax mov %eax, (%r10) add $4, %r10 sub $4, %r11 vpsrldq $4, %xmm9, %xmm9 test %r11, %r11 - je _return_T_done\@ -_T_123\@: + je .L_return_T_done\@ +.L_T_123\@: vmovd %xmm9, %eax cmp $2, %r11 - jl _T_1\@ + jl .L_T_1\@ mov %ax, (%r10) cmp $2, %r11 - je _return_T_done\@ + je .L_return_T_done\@ add $2, %r10 sar $16, %eax -_T_1\@: +.L_T_1\@: mov %al, (%r10) - jmp _return_T_done\@ + jmp .L_return_T_done\@ -_T_16\@: +.L_T_16\@: vmovdqu %xmm9, (%r10) -_return_T_done\@: +.L_return_T_done\@: .endm .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 @@ -603,8 +579,8 @@ _return_T_done\@: vpxor \T8, \T8, \T8 vpxor \T7, \T7, \T7 cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: + jl .L_get_AAD_rest8\@ +.L_get_AAD_blocks\@: vmovdqu (%r10), \T7 vpshufb SHUF_MASK(%rip), \T7, \T7 vpxor \T7, \T8, \T8 @@ -613,29 +589,29 @@ _get_AAD_blocks\@: sub $16, %r12 sub $16, %r11 cmp $16, %r11 - jge _get_AAD_blocks\@ + jge .L_get_AAD_blocks\@ vmovdqu \T8, \T7 test %r11, %r11 - je _get_AAD_done\@ + je .L_get_AAD_done\@ vpxor \T7, \T7, \T7 /* read the last <16B of AAD. since we have at least 4B of data right after the AAD (the ICV, and maybe some CT), we can read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: +.L_get_AAD_rest8\@: cmp $4, %r11 - jle _get_AAD_rest4\@ + jle .L_get_AAD_rest4\@ movq (%r10), \T1 add $8, %r10 sub $8, %r11 vpslldq $8, \T1, \T1 vpsrldq $8, \T7, \T7 vpxor \T1, \T7, \T7 - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: + jmp .L_get_AAD_rest8\@ +.L_get_AAD_rest4\@: test %r11, %r11 - jle _get_AAD_rest0\@ + jle .L_get_AAD_rest0\@ mov (%r10), %eax movq %rax, \T1 add $4, %r10 @@ -643,20 +619,22 @@ _get_AAD_rest4\@: vpslldq $12, \T1, \T1 vpsrldq $4, \T7, \T7 vpxor \T1, \T7, \T7 -_get_AAD_rest0\@: +.L_get_AAD_rest0\@: /* finalize: shift out the extra bytes we read, and align left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - vmovdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, \T7, \T7 -_get_AAD_rest_final\@: + vpshufb and a pair of shuffle masks */ + leaq ALL_F(%rip), %r11 + subq %r12, %r11 + vmovdqu 16(%r11), \T1 + andq $~3, %r11 + vpshufb (%r11), \T7, \T7 + vpand \T1, \T7, \T7 +.L_get_AAD_rest_final\@: vpshufb SHUF_MASK(%rip), \T7, \T7 vpxor \T8, \T7, \T7 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 -_get_AAD_done\@: +.L_get_AAD_done\@: vmovdqu \T7, AadHash(arg2) .endm @@ -707,28 +685,28 @@ _get_AAD_done\@: vpxor \XMMDst, \XMMDst, \XMMDst cmp $8, \DLEN - jl _read_lt8_\@ + jl .L_read_lt8_\@ mov (\DPTR), %rax vpinsrq $0, %rax, \XMMDst, \XMMDst sub $8, \DLEN - jz _done_read_partial_block_\@ + jz .L_done_read_partial_block_\@ xor %eax, %eax -_read_next_byte_\@: +.L_read_next_byte_\@: shl $8, %rax mov 7(\DPTR, \DLEN, 1), %al dec \DLEN - jnz _read_next_byte_\@ + jnz .L_read_next_byte_\@ vpinsrq $1, %rax, \XMMDst, \XMMDst - jmp _done_read_partial_block_\@ -_read_lt8_\@: + jmp .L_done_read_partial_block_\@ +.L_read_lt8_\@: xor %eax, %eax -_read_next_byte_lt8_\@: +.L_read_next_byte_lt8_\@: shl $8, %rax mov -1(\DPTR, \DLEN, 1), %al dec \DLEN - jnz _read_next_byte_lt8_\@ + jnz .L_read_next_byte_lt8_\@ vpinsrq $0, %rax, \XMMDst, \XMMDst -_done_read_partial_block_\@: +.L_done_read_partial_block_\@: .endm # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks @@ -740,21 +718,21 @@ _done_read_partial_block_\@: AAD_HASH ENC_DEC mov PBlockLen(arg2), %r13 test %r13, %r13 - je _partial_block_done_\@ # Leave Macro if no partial blocks + je .L_partial_block_done_\@ # Leave Macro if no partial blocks # Read in input data without over reading cmp $16, \PLAIN_CYPH_LEN - jl _fewer_than_16_bytes_\@ + jl .L_fewer_than_16_bytes_\@ vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp _data_read_\@ + jmp .L_data_read_\@ -_fewer_than_16_bytes_\@: +.L_fewer_than_16_bytes_\@: lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 mov \PLAIN_CYPH_LEN, %r12 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 mov PBlockLen(arg2), %r13 -_data_read_\@: # Finished reading in data +.L_data_read_\@: # Finished reading in data vmovdqu PBlockEncKey(arg2), %xmm9 vmovdqu HashKey(arg2), %xmm13 @@ -777,9 +755,9 @@ _data_read_\@: # Finished reading in data sub $16, %r10 # Determine if if partial block is not being filled and # shift mask accordingly - jge _no_extra_mask_1_\@ + jge .L_no_extra_mask_1_\@ sub %r10, %r12 -_no_extra_mask_1_\@: +.L_no_extra_mask_1_\@: vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out bottom r13 bytes of xmm9 @@ -792,17 +770,17 @@ _no_extra_mask_1_\@: vpxor %xmm3, \AAD_HASH, \AAD_HASH test %r10, %r10 - jl _partial_incomplete_1_\@ + jl .L_partial_incomplete_1_\@ # GHASH computation for the last <16 Byte block \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 xor %eax,%eax mov %rax, PBlockLen(arg2) - jmp _dec_done_\@ -_partial_incomplete_1_\@: + jmp .L_dec_done_\@ +.L_partial_incomplete_1_\@: add \PLAIN_CYPH_LEN, PBlockLen(arg2) -_dec_done_\@: +.L_dec_done_\@: vmovdqu \AAD_HASH, AadHash(arg2) .else vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) @@ -813,9 +791,9 @@ _dec_done_\@: sub $16, %r10 # Determine if if partial block is not being filled and # shift mask accordingly - jge _no_extra_mask_2_\@ + jge .L_no_extra_mask_2_\@ sub %r10, %r12 -_no_extra_mask_2_\@: +.L_no_extra_mask_2_\@: vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out bottom r13 bytes of xmm9 @@ -827,17 +805,17 @@ _no_extra_mask_2_\@: vpxor %xmm9, \AAD_HASH, \AAD_HASH test %r10, %r10 - jl _partial_incomplete_2_\@ + jl .L_partial_incomplete_2_\@ # GHASH computation for the last <16 Byte block \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 xor %eax,%eax mov %rax, PBlockLen(arg2) - jmp _encode_done_\@ -_partial_incomplete_2_\@: + jmp .L_encode_done_\@ +.L_partial_incomplete_2_\@: add \PLAIN_CYPH_LEN, PBlockLen(arg2) -_encode_done_\@: +.L_encode_done_\@: vmovdqu \AAD_HASH, AadHash(arg2) vmovdqa SHUF_MASK(%rip), %xmm10 @@ -847,32 +825,32 @@ _encode_done_\@: .endif # output encrypted Bytes test %r10, %r10 - jl _partial_fill_\@ + jl .L_partial_fill_\@ mov %r13, %r12 mov $16, %r13 # Set r13 to be the number of bytes to write out sub %r12, %r13 - jmp _count_set_\@ -_partial_fill_\@: + jmp .L_count_set_\@ +.L_partial_fill_\@: mov \PLAIN_CYPH_LEN, %r13 -_count_set_\@: +.L_count_set_\@: vmovdqa %xmm9, %xmm0 vmovq %xmm0, %rax cmp $8, %r13 - jle _less_than_8_bytes_left_\@ + jle .L_less_than_8_bytes_left_\@ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $8, \DATA_OFFSET psrldq $8, %xmm0 vmovq %xmm0, %rax sub $8, %r13 -_less_than_8_bytes_left_\@: +.L_less_than_8_bytes_left_\@: movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $1, \DATA_OFFSET shr $8, %rax sub $1, %r13 - jne _less_than_8_bytes_left_\@ -_partial_block_done_\@: + jne .L_less_than_8_bytes_left_\@ +.L_partial_block_done_\@: .endm # PARTIAL_BLOCK ############################################################################### @@ -1073,7 +1051,7 @@ _partial_block_done_\@: vmovdqa \XMM8, \T3 cmp $128, %r13 - jl _initial_blocks_done\@ # no need for precomputed constants + jl .L_initial_blocks_done\@ # no need for precomputed constants ############################################################################### # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i @@ -1215,7 +1193,7 @@ _partial_block_done_\@: ############################################################################### -_initial_blocks_done\@: +.L_initial_blocks_done\@: .endm @@ -2023,7 +2001,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) vmovdqa \XMM8, \T3 cmp $128, %r13 - jl _initial_blocks_done\@ # no need for precomputed constants + jl .L_initial_blocks_done\@ # no need for precomputed constants ############################################################################### # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i @@ -2167,7 +2145,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) ############################################################################### -_initial_blocks_done\@: +.L_initial_blocks_done\@: .endm diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index 9243f6289d34..7c1abc513f34 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -80,7 +80,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -132,7 +132,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -300,11 +300,11 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vmovdqa .Ltf_s2_bitmatrix, t0; \ - vmovdqa .Ltf_inv_bitmatrix, t1; \ - vmovdqa .Ltf_id_bitmatrix, t2; \ - vmovdqa .Ltf_aff_bitmatrix, t3; \ - vmovdqa .Ltf_x2_bitmatrix, t4; \ + vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \ + vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \ + vmovdqa .Ltf_id_bitmatrix(%rip), t2; \ + vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \ + vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ @@ -324,13 +324,13 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vmovdqa .Linv_shift_row, t0; \ - vmovdqa .Lshift_row, t1; \ - vbroadcastss .L0f0f0f0f, t6; \ - vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \ - vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \ - vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \ - vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \ + vmovdqa .Linv_shift_row(%rip), t0; \ + vmovdqa .Lshift_row(%rip), t1; \ + vbroadcastss .L0f0f0f0f(%rip), t6; \ + vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \ + vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \ + vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \ + vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \ \ vaesenclast t7, x0, x0; \ vaesenclast t7, x4, x4; \ diff --git a/arch/x86/crypto/aria-aesni-avx2-asm_64.S b/arch/x86/crypto/aria-aesni-avx2-asm_64.S index 82a14b4ad920..c60fa2980630 100644 --- a/arch/x86/crypto/aria-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx2-asm_64.S @@ -96,7 +96,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b, a0; \ + vbroadcasti128 .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -148,7 +148,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b, a0; \ + vbroadcasti128 .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -307,11 +307,11 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vpbroadcastq .Ltf_s2_bitmatrix, t0; \ - vpbroadcastq .Ltf_inv_bitmatrix, t1; \ - vpbroadcastq .Ltf_id_bitmatrix, t2; \ - vpbroadcastq .Ltf_aff_bitmatrix, t3; \ - vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \ + vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \ + vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \ + vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \ + vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ @@ -332,12 +332,12 @@ t4, t5, t6, t7) \ vpxor t7, t7, t7; \ vpxor t6, t6, t6; \ - vbroadcasti128 .Linv_shift_row, t0; \ - vbroadcasti128 .Lshift_row, t1; \ - vbroadcasti128 .Ltf_lo__inv_aff__and__s2, t2; \ - vbroadcasti128 .Ltf_hi__inv_aff__and__s2, t3; \ - vbroadcasti128 .Ltf_lo__x2__and__fwd_aff, t4; \ - vbroadcasti128 .Ltf_hi__x2__and__fwd_aff, t5; \ + vbroadcasti128 .Linv_shift_row(%rip), t0; \ + vbroadcasti128 .Lshift_row(%rip), t1; \ + vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \ + vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \ + vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \ + vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \ \ vextracti128 $1, x0, t6##_x; \ vaesenclast t7##_x, x0##_x, x0##_x; \ @@ -369,7 +369,7 @@ vaesdeclast t7##_x, t6##_x, t6##_x; \ vinserti128 $1, t6##_x, x6, x6; \ \ - vpbroadcastd .L0f0f0f0f, t6; \ + vpbroadcastd .L0f0f0f0f(%rip), t6; \ \ /* AES inverse shift rows */ \ vpshufb t0, x0, x0; \ diff --git a/arch/x86/crypto/aria-gfni-avx512-asm_64.S b/arch/x86/crypto/aria-gfni-avx512-asm_64.S index 3193f0701450..860887e5d02e 100644 --- a/arch/x86/crypto/aria-gfni-avx512-asm_64.S +++ b/arch/x86/crypto/aria-gfni-avx512-asm_64.S @@ -80,7 +80,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti64x2 .Lshufb_16x16b, a0; \ + vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \ vmovdqu64 st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -132,7 +132,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti64x2 .Lshufb_16x16b, a0; \ + vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \ vmovdqu64 st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -308,11 +308,11 @@ x4, x5, x6, x7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vpbroadcastq .Ltf_s2_bitmatrix, t0; \ - vpbroadcastq .Ltf_inv_bitmatrix, t1; \ - vpbroadcastq .Ltf_id_bitmatrix, t2; \ - vpbroadcastq .Ltf_aff_bitmatrix, t3; \ - vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \ + vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \ + vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \ + vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \ + vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ @@ -332,11 +332,11 @@ y4, y5, y6, y7, \ t0, t1, t2, t3, \ t4, t5, t6, t7) \ - vpbroadcastq .Ltf_s2_bitmatrix, t0; \ - vpbroadcastq .Ltf_inv_bitmatrix, t1; \ - vpbroadcastq .Ltf_id_bitmatrix, t2; \ - vpbroadcastq .Ltf_aff_bitmatrix, t3; \ - vpbroadcastq .Ltf_x2_bitmatrix, t4; \ + vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \ + vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \ + vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \ + vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \ + vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index aaba21230528..0313f9673f56 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -8,7 +8,6 @@ #include <linux/types.h> #include <linux/jump_label.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/sizes.h> #include <asm/cpufeature.h> @@ -72,6 +71,4 @@ static int __init blake2s_mod_init(void) return 0; } -module_init(blake2s_mod_init); - -MODULE_LICENSE("GPL v2"); +subsys_initcall(blake2s_mod_init); diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 4a30618281ec..646477a13e11 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -52,10 +52,10 @@ /* \ * S-function with AES subbytes \ */ \ - vmovdqa .Linv_shift_row, t4; \ - vbroadcastss .L0f0f0f0f, t7; \ - vmovdqa .Lpre_tf_lo_s1, t0; \ - vmovdqa .Lpre_tf_hi_s1, t1; \ + vmovdqa .Linv_shift_row(%rip), t4; \ + vbroadcastss .L0f0f0f0f(%rip), t7; \ + vmovdqa .Lpre_tf_lo_s1(%rip), t0; \ + vmovdqa .Lpre_tf_hi_s1(%rip), t1; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ @@ -68,8 +68,8 @@ vpshufb t4, x6, x6; \ \ /* prefilter sboxes 1, 2 and 3 */ \ - vmovdqa .Lpre_tf_lo_s4, t2; \ - vmovdqa .Lpre_tf_hi_s4, t3; \ + vmovdqa .Lpre_tf_lo_s4(%rip), t2; \ + vmovdqa .Lpre_tf_hi_s4(%rip), t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x1, t0, t1, t7, t6); \ @@ -83,8 +83,8 @@ filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES subbytes + AES shift rows */ \ - vmovdqa .Lpost_tf_lo_s1, t0; \ - vmovdqa .Lpost_tf_hi_s1, t1; \ + vmovdqa .Lpost_tf_lo_s1(%rip), t0; \ + vmovdqa .Lpost_tf_hi_s1(%rip), t1; \ vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x1, x1; \ @@ -95,16 +95,16 @@ vaesenclast t4, x6, x6; \ \ /* postfilter sboxes 1 and 4 */ \ - vmovdqa .Lpost_tf_lo_s3, t2; \ - vmovdqa .Lpost_tf_hi_s3, t3; \ + vmovdqa .Lpost_tf_lo_s3(%rip), t2; \ + vmovdqa .Lpost_tf_hi_s3(%rip), t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vmovdqa .Lpost_tf_lo_s2, t4; \ - vmovdqa .Lpost_tf_hi_s2, t5; \ + vmovdqa .Lpost_tf_lo_s2(%rip), t4; \ + vmovdqa .Lpost_tf_hi_s2(%rip), t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ @@ -443,7 +443,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -482,7 +482,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vmovq key, x0; \ - vpshufb .Lpack_bswap, x0, x0; \ + vpshufb .Lpack_bswap(%rip), x0, x0; \ \ vpxor 0 * 16(rio), x0, y7; \ vpxor 1 * 16(rio), x0, y6; \ @@ -533,7 +533,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu x0, stack_tmp0; \ \ vmovq key, x0; \ - vpshufb .Lpack_bswap, x0, x0; \ + vpshufb .Lpack_bswap(%rip), x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index deaf62aa73a6..a0eb94e53b1b 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -64,12 +64,12 @@ /* \ * S-function with AES subbytes \ */ \ - vbroadcasti128 .Linv_shift_row, t4; \ - vpbroadcastd .L0f0f0f0f, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1, t5; \ - vbroadcasti128 .Lpre_tf_hi_s1, t6; \ - vbroadcasti128 .Lpre_tf_lo_s4, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4, t3; \ + vbroadcasti128 .Linv_shift_row(%rip), t4; \ + vpbroadcastd .L0f0f0f0f(%rip), t7; \ + vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \ + vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \ + vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \ + vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ @@ -115,8 +115,8 @@ vinserti128 $1, t2##_x, x6, x6; \ vextracti128 $1, x1, t3##_x; \ vextracti128 $1, x4, t2##_x; \ - vbroadcasti128 .Lpost_tf_lo_s1, t0; \ - vbroadcasti128 .Lpost_tf_hi_s1, t1; \ + vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \ + vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \ vaesenclast t4##_x, x2##_x, x2##_x; \ vaesenclast t4##_x, t6##_x, t6##_x; \ vinserti128 $1, t6##_x, x2, x2; \ @@ -131,16 +131,16 @@ vinserti128 $1, t2##_x, x4, x4; \ \ /* postfilter sboxes 1 and 4 */ \ - vbroadcasti128 .Lpost_tf_lo_s3, t2; \ - vbroadcasti128 .Lpost_tf_hi_s3, t3; \ + vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \ + vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vbroadcasti128 .Lpost_tf_lo_s2, t4; \ - vbroadcasti128 .Lpost_tf_hi_s2, t5; \ + vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \ + vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ @@ -475,7 +475,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b, a0; \ + vbroadcasti128 .Lshufb_16x16b(%rip), a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -514,7 +514,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap, x0, x0; \ + vpshufb .Lpack_bswap(%rip), x0, x0; \ \ vpxor 0 * 32(rio), x0, y7; \ vpxor 1 * 32(rio), x0, y6; \ @@ -565,7 +565,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu x0, stack_tmp0; \ \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap, x0, x0; \ + vpshufb .Lpack_bswap(%rip), x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 347c059f5940..816b6bb8bded 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -77,11 +77,13 @@ #define RXORbl %r9b #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \ + leaq T0(%rip), tmp1; \ movzbl ab ## bl, tmp2 ## d; \ + xorq (tmp1, tmp2, 8), dst; \ + leaq T1(%rip), tmp2; \ movzbl ab ## bh, tmp1 ## d; \ rorq $16, ab; \ - xorq T0(, tmp2, 8), dst; \ - xorq T1(, tmp1, 8), dst; + xorq (tmp2, tmp1, 8), dst; /********************************************************************** 1-way camellia diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index 0326a01503c3..b4e460a87f18 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -84,15 +84,19 @@ #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \ movzbl src ## bh, RID1d; \ + leaq s1(%rip), RID2; \ + movl (RID2,RID1,4), dst ## d; \ movzbl src ## bl, RID2d; \ + leaq s2(%rip), RID1; \ + op1 (RID1,RID2,4), dst ## d; \ shrq $16, src; \ - movl s1(, RID1, 4), dst ## d; \ - op1 s2(, RID2, 4), dst ## d; \ movzbl src ## bh, RID1d; \ + leaq s3(%rip), RID2; \ + op2 (RID2,RID1,4), dst ## d; \ movzbl src ## bl, RID2d; \ interleave_op(il_reg); \ - op2 s3(, RID1, 4), dst ## d; \ - op3 s4(, RID2, 4), dst ## d; + leaq s4(%rip), RID1; \ + op3 (RID1,RID2,4), dst ## d; #define dummy(d) /* do nothing */ @@ -151,15 +155,15 @@ subround(l ## 3, r ## 3, l ## 4, r ## 4, f); #define enc_preload_rkr() \ - vbroadcastss .L16_mask, RKR; \ + vbroadcastss .L16_mask(%rip), RKR; \ /* add 16-bit rotation to key rotations (mod 32) */ \ vpxor kr(CTX), RKR, RKR; #define dec_preload_rkr() \ - vbroadcastss .L16_mask, RKR; \ + vbroadcastss .L16_mask(%rip), RKR; \ /* add 16-bit rotation to key rotations (mod 32) */ \ vpxor kr(CTX), RKR, RKR; \ - vpshufb .Lbswap128_mask, RKR, RKR; + vpshufb .Lbswap128_mask(%rip), RKR, RKR; #define transpose_2x4(x0, x1, t0, t1) \ vpunpckldq x1, x0, t0; \ @@ -235,9 +239,9 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; enc_preload_rkr(); inpack_blocks(RL1, RR1, RTMP, RX, RKM); @@ -271,7 +275,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) popq %rbx; popq %r15; - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; outunpack_blocks(RR1, RL1, RTMP, RX, RKM); outunpack_blocks(RR2, RL2, RTMP, RX, RKM); @@ -308,9 +312,9 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; dec_preload_rkr(); inpack_blocks(RL1, RR1, RTMP, RX, RKM); @@ -341,7 +345,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16) round(RL, RR, 1, 2); round(RR, RL, 0, 1); - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; popq %rbx; popq %r15; @@ -504,8 +508,8 @@ SYM_FUNC_START(cast5_ctr_16way) vpcmpeqd RKR, RKR, RKR; vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ - vmovdqa .Lbswap_iv_mask, R1ST; - vmovdqa .Lbswap128_mask, RKM; + vmovdqa .Lbswap_iv_mask(%rip), R1ST; + vmovdqa .Lbswap128_mask(%rip), RKM; /* load IV and byteswap */ vmovq (%rcx), RX; diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 82b716fd5dba..9e86d460b409 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -84,15 +84,19 @@ #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \ movzbl src ## bh, RID1d; \ + leaq s1(%rip), RID2; \ + movl (RID2,RID1,4), dst ## d; \ movzbl src ## bl, RID2d; \ + leaq s2(%rip), RID1; \ + op1 (RID1,RID2,4), dst ## d; \ shrq $16, src; \ - movl s1(, RID1, 4), dst ## d; \ - op1 s2(, RID2, 4), dst ## d; \ movzbl src ## bh, RID1d; \ + leaq s3(%rip), RID2; \ + op2 (RID2,RID1,4), dst ## d; \ movzbl src ## bl, RID2d; \ interleave_op(il_reg); \ - op2 s3(, RID1, 4), dst ## d; \ - op3 s4(, RID2, 4), dst ## d; + leaq s4(%rip), RID1; \ + op3 (RID1,RID2,4), dst ## d; #define dummy(d) /* do nothing */ @@ -175,10 +179,10 @@ qop(RD, RC, 1); #define shuffle(mask) \ - vpshufb mask, RKR, RKR; + vpshufb mask(%rip), RKR, RKR; #define preload_rkr(n, do_mask, mask) \ - vbroadcastss .L16_mask, RKR; \ + vbroadcastss .L16_mask(%rip), RKR; \ /* add 16-bit rotation to key rotations (mod 32) */ \ vpxor (kr+n*16)(CTX), RKR, RKR; \ do_mask(mask); @@ -258,9 +262,9 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -284,7 +288,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8) popq %rbx; popq %r15; - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -306,9 +310,9 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8) movq %rdi, CTX; - vmovdqa .Lbswap_mask, RKM; - vmovd .Lfirst_mask, R1ST; - vmovd .L32_mask, R32; + vmovdqa .Lbswap_mask(%rip), RKM; + vmovd .Lfirst_mask(%rip), R1ST; + vmovd .L32_mask(%rip), R32; inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); @@ -332,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8) popq %rbx; popq %r15; - vmovdqa .Lbswap_mask, RKM; + vmovdqa .Lbswap_mask(%rip), RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index ca53e96996ac..5d31137e2c7d 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -90,7 +90,7 @@ SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligne sub $0x40, LEN add $0x40, BUF cmp $0x40, LEN - jb less_64 + jb .Lless_64 #ifdef __x86_64__ movdqa .Lconstant_R2R1(%rip), CONSTANT @@ -98,7 +98,7 @@ SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligne movdqa .Lconstant_R2R1, CONSTANT #endif -loop_64:/* 64 bytes Full cache line folding */ +.Lloop_64:/* 64 bytes Full cache line folding */ prefetchnta 0x40(BUF) movdqa %xmm1, %xmm5 movdqa %xmm2, %xmm6 @@ -139,8 +139,8 @@ loop_64:/* 64 bytes Full cache line folding */ sub $0x40, LEN add $0x40, BUF cmp $0x40, LEN - jge loop_64 -less_64:/* Folding cache line into 128bit */ + jge .Lloop_64 +.Lless_64:/* Folding cache line into 128bit */ #ifdef __x86_64__ movdqa .Lconstant_R4R3(%rip), CONSTANT #else @@ -167,8 +167,8 @@ less_64:/* Folding cache line into 128bit */ pxor %xmm4, %xmm1 cmp $0x10, LEN - jb fold_64 -loop_16:/* Folding rest buffer into 128bit */ + jb .Lfold_64 +.Lloop_16:/* Folding rest buffer into 128bit */ movdqa %xmm1, %xmm5 pclmulqdq $0x00, CONSTANT, %xmm1 pclmulqdq $0x11, CONSTANT, %xmm5 @@ -177,9 +177,9 @@ loop_16:/* Folding rest buffer into 128bit */ sub $0x10, LEN add $0x10, BUF cmp $0x10, LEN - jge loop_16 + jge .Lloop_16 -fold_64: +.Lfold_64: /* perform the last 64 bit fold, also adds 32 zeroes * to the input stream */ pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index ec35915f0901..81ce0f4db555 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -49,15 +49,15 @@ ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction .macro LABEL prefix n -\prefix\n\(): +.L\prefix\n\(): .endm .macro JMPTBL_ENTRY i -.quad crc_\i +.quad .Lcrc_\i .endm .macro JNC_LESS_THAN j - jnc less_than_\j + jnc .Lless_than_\j .endm # Define threshold where buffers are considered "small" and routed to more @@ -108,30 +108,30 @@ SYM_FUNC_START(crc_pcl) neg %bufp and $7, %bufp # calculate the unalignment amount of # the address - je proc_block # Skip if aligned + je .Lproc_block # Skip if aligned ## If len is less than 8 and we're unaligned, we need to jump ## to special code to avoid reading beyond the end of the buffer cmp $8, len - jae do_align + jae .Ldo_align # less_than_8 expects length in upper 3 bits of len_dw # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] shl $32-3+1, len_dw - jmp less_than_8_post_shl1 + jmp .Lless_than_8_post_shl1 -do_align: +.Ldo_align: #### Calculate CRC of unaligned bytes of the buffer (if any) movq (bufptmp), tmp # load a quadward from the buffer add %bufp, bufptmp # align buffer pointer for quadword # processing sub %bufp, len # update buffer length -align_loop: +.Lalign_loop: crc32b %bl, crc_init_dw # compute crc32 of 1-byte shr $8, tmp # get next byte dec %bufp - jne align_loop + jne .Lalign_loop -proc_block: +.Lproc_block: ################################################################ ## 2) PROCESS BLOCKS: @@ -141,11 +141,11 @@ proc_block: movq len, tmp # save num bytes in tmp cmpq $128*24, len - jae full_block + jae .Lfull_block -continue_block: +.Lcontinue_block: cmpq $SMALL_SIZE, len - jb small + jb .Lsmall ## len < 128*24 movq $2731, %rax # 2731 = ceil(2^16 / 24) @@ -168,13 +168,14 @@ continue_block: xor crc2, crc2 ## branch into array - mov jump_table(,%rax,8), %bufp + leaq jump_table(%rip), %bufp + mov (%bufp,%rax,8), %bufp JMP_NOSPEC bufp ################################################################ ## 2a) PROCESS FULL BLOCKS: ################################################################ -full_block: +.Lfull_block: movl $128,%eax lea 128*8*2(block_0), block_1 lea 128*8*3(block_0), block_2 @@ -189,7 +190,6 @@ full_block: ## 3) CRC Array: ################################################################ -crc_array: i=128 .rept 128-1 .altmacro @@ -242,28 +242,28 @@ LABEL crc_ 0 ENDBR mov tmp, len cmp $128*24, tmp - jae full_block + jae .Lfull_block cmp $24, tmp - jae continue_block + jae .Lcontinue_block -less_than_24: +.Lless_than_24: shl $32-4, len_dw # less_than_16 expects length # in upper 4 bits of len_dw - jnc less_than_16 + jnc .Lless_than_16 crc32q (bufptmp), crc_init crc32q 8(bufptmp), crc_init - jz do_return + jz .Ldo_return add $16, bufptmp # len is less than 8 if we got here # less_than_8 expects length in upper 3 bits of len_dw # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] shl $2, len_dw - jmp less_than_8_post_shl1 + jmp .Lless_than_8_post_shl1 ####################################################################### ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) ####################################################################### -small: +.Lsmall: shl $32-8, len_dw # Prepare len_dw for less_than_256 j=256 .rept 5 # j = {256, 128, 64, 32, 16} @@ -279,32 +279,32 @@ LABEL less_than_ %j # less_than_j: Length should be in crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data i=i+8 .endr - jz do_return # Return if remaining length is zero + jz .Ldo_return # Return if remaining length is zero add $j, bufptmp # Advance buf .endr -less_than_8: # Length should be stored in +.Lless_than_8: # Length should be stored in # upper 3 bits of len_dw shl $1, len_dw -less_than_8_post_shl1: - jnc less_than_4 +.Lless_than_8_post_shl1: + jnc .Lless_than_4 crc32l (bufptmp), crc_init_dw # CRC of 4 bytes - jz do_return # return if remaining data is zero + jz .Ldo_return # return if remaining data is zero add $4, bufptmp -less_than_4: # Length should be stored in +.Lless_than_4: # Length should be stored in # upper 2 bits of len_dw shl $1, len_dw - jnc less_than_2 + jnc .Lless_than_2 crc32w (bufptmp), crc_init_dw # CRC of 2 bytes - jz do_return # return if remaining data is zero + jz .Ldo_return # return if remaining data is zero add $2, bufptmp -less_than_2: # Length should be stored in the MSB +.Lless_than_2: # Length should be stored in the MSB # of len_dw shl $1, len_dw - jnc less_than_1 + jnc .Lless_than_1 crc32b (bufptmp), crc_init_dw # CRC of 1 byte -less_than_1: # Length should be zero -do_return: +.Lless_than_1: # Length should be zero +.Ldo_return: movq crc_init, %rax popq %rsi popq %rdi diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index f4c760f4cade..cf21b998e77c 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -129,21 +129,29 @@ movzbl RW0bl, RT2d; \ movzbl RW0bh, RT3d; \ shrq $16, RW0; \ - movq s8(, RT0, 8), RT0; \ - xorq s6(, RT1, 8), to; \ + leaq s8(%rip), RW1; \ + movq (RW1, RT0, 8), RT0; \ + leaq s6(%rip), RW1; \ + xorq (RW1, RT1, 8), to; \ movzbl RW0bl, RL1d; \ movzbl RW0bh, RT1d; \ shrl $16, RW0d; \ - xorq s4(, RT2, 8), RT0; \ - xorq s2(, RT3, 8), to; \ + leaq s4(%rip), RW1; \ + xorq (RW1, RT2, 8), RT0; \ + leaq s2(%rip), RW1; \ + xorq (RW1, RT3, 8), to; \ movzbl RW0bl, RT2d; \ movzbl RW0bh, RT3d; \ - xorq s7(, RL1, 8), RT0; \ - xorq s5(, RT1, 8), to; \ - xorq s3(, RT2, 8), RT0; \ + leaq s7(%rip), RW1; \ + xorq (RW1, RL1, 8), RT0; \ + leaq s5(%rip), RW1; \ + xorq (RW1, RT1, 8), to; \ + leaq s3(%rip), RW1; \ + xorq (RW1, RT2, 8), RT0; \ load_next_key(n, RW0); \ xorq RT0, to; \ - xorq s1(, RT3, 8), to; \ + leaq s1(%rip), RW1; \ + xorq (RW1, RT3, 8), to; \ #define load_next_key(n, RWx) \ movq (((n) + 1) * 8)(CTX), RWx; @@ -355,65 +363,89 @@ SYM_FUNC_END(des3_ede_x86_64_crypt_blk) movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrq $16, RW0; \ - xorq s8(, RT3, 8), to##0; \ - xorq s6(, RT1, 8), to##0; \ + leaq s8(%rip), RT2; \ + xorq (RT2, RT3, 8), to##0; \ + leaq s6(%rip), RT2; \ + xorq (RT2, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrq $16, RW0; \ - xorq s4(, RT3, 8), to##0; \ - xorq s2(, RT1, 8), to##0; \ + leaq s4(%rip), RT2; \ + xorq (RT2, RT3, 8), to##0; \ + leaq s2(%rip), RT2; \ + xorq (RT2, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ shrl $16, RW0d; \ - xorq s7(, RT3, 8), to##0; \ - xorq s5(, RT1, 8), to##0; \ + leaq s7(%rip), RT2; \ + xorq (RT2, RT3, 8), to##0; \ + leaq s5(%rip), RT2; \ + xorq (RT2, RT1, 8), to##0; \ movzbl RW0bl, RT3d; \ movzbl RW0bh, RT1d; \ load_next_key(n, RW0); \ - xorq s3(, RT3, 8), to##0; \ - xorq s1(, RT1, 8), to##0; \ + leaq s3(%rip), RT2; \ + xorq (RT2, RT3, 8), to##0; \ + leaq s1(%rip), RT2; \ + xorq (RT2, RT1, 8), to##0; \ xorq from##1, RW1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrq $16, RW1; \ - xorq s8(, RT3, 8), to##1; \ - xorq s6(, RT1, 8), to##1; \ + leaq s8(%rip), RT2; \ + xorq (RT2, RT3, 8), to##1; \ + leaq s6(%rip), RT2; \ + xorq (RT2, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrq $16, RW1; \ - xorq s4(, RT3, 8), to##1; \ - xorq s2(, RT1, 8), to##1; \ + leaq s4(%rip), RT2; \ + xorq (RT2, RT3, 8), to##1; \ + leaq s2(%rip), RT2; \ + xorq (RT2, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ shrl $16, RW1d; \ - xorq s7(, RT3, 8), to##1; \ - xorq s5(, RT1, 8), to##1; \ + leaq s7(%rip), RT2; \ + xorq (RT2, RT3, 8), to##1; \ + leaq s5(%rip), RT2; \ + xorq (RT2, RT1, 8), to##1; \ movzbl RW1bl, RT3d; \ movzbl RW1bh, RT1d; \ do_movq(RW0, RW1); \ - xorq s3(, RT3, 8), to##1; \ - xorq s1(, RT1, 8), to##1; \ + leaq s3(%rip), RT2; \ + xorq (RT2, RT3, 8), to##1; \ + leaq s1(%rip), RT2; \ + xorq (RT2, RT1, 8), to##1; \ xorq from##2, RW2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrq $16, RW2; \ - xorq s8(, RT3, 8), to##2; \ - xorq s6(, RT1, 8), to##2; \ + leaq s8(%rip), RT2; \ + xorq (RT2, RT3, 8), to##2; \ + leaq s6(%rip), RT2; \ + xorq (RT2, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrq $16, RW2; \ - xorq s4(, RT3, 8), to##2; \ - xorq s2(, RT1, 8), to##2; \ + leaq s4(%rip), RT2; \ + xorq (RT2, RT3, 8), to##2; \ + leaq s2(%rip), RT2; \ + xorq (RT2, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ shrl $16, RW2d; \ - xorq s7(, RT3, 8), to##2; \ - xorq s5(, RT1, 8), to##2; \ + leaq s7(%rip), RT2; \ + xorq (RT2, RT3, 8), to##2; \ + leaq s5(%rip), RT2; \ + xorq (RT2, RT1, 8), to##2; \ movzbl RW2bl, RT3d; \ movzbl RW2bh, RT1d; \ do_movq(RW0, RW2); \ - xorq s3(, RT3, 8), to##2; \ - xorq s1(, RT1, 8), to##2; + leaq s3(%rip), RT2; \ + xorq (RT2, RT3, 8), to##2; \ + leaq s1(%rip), RT2; \ + xorq (RT2, RT1, 8), to##2; #define __movq(src, dst) \ movq src, dst; diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 257ed9446f3e..99cb983ded9e 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -93,7 +93,7 @@ SYM_FUNC_START(clmul_ghash_mul) FRAME_BEGIN movups (%rdi), DATA movups (%rsi), SHASH - movaps .Lbswap_mask, BSWAP + movaps .Lbswap_mask(%rip), BSWAP pshufb BSWAP, DATA call __clmul_gf128mul_ble pshufb BSWAP, DATA @@ -110,7 +110,7 @@ SYM_FUNC_START(clmul_ghash_update) FRAME_BEGIN cmp $16, %rdx jb .Lupdate_just_ret # check length - movaps .Lbswap_mask, BSWAP + movaps .Lbswap_mask(%rip), BSWAP movups (%rdi), DATA movups (%rcx), SHASH pshufb BSWAP, DATA diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index a96b2fd26dab..4b49bdc95265 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -485,18 +485,18 @@ xchg WK_BUF, PRECALC_BUF .align 32 -_loop: +.L_loop: /* * code loops through more than one block * we use K_BASE value as a signal of a last block, * it is set below by: cmovae BUFFER_PTR, K_BASE */ test BLOCKS_CTR, BLOCKS_CTR - jnz _begin + jnz .L_begin .align 32 - jmp _end + jmp .L_end .align 32 -_begin: +.L_begin: /* * Do first block @@ -508,9 +508,6 @@ _begin: .set j, j+2 .endr - jmp _loop0 -_loop0: - /* * rounds: * 10,12,14,16,18 @@ -545,7 +542,7 @@ _loop0: UPDATE_HASH 16(HASH_PTR), E test BLOCKS_CTR, BLOCKS_CTR - jz _loop + jz .L_loop mov TB, B @@ -562,8 +559,6 @@ _loop0: .set j, j+2 .endr - jmp _loop1 -_loop1: /* * rounds * 20+80,22+80,24+80,26+80,28+80 @@ -574,9 +569,6 @@ _loop1: .set j, j+2 .endr - jmp _loop2 -_loop2: - /* * rounds * 40+80,42+80,44+80,46+80,48+80 @@ -592,9 +584,6 @@ _loop2: /* Move to the next block only if needed*/ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 - jmp _loop3 -_loop3: - /* * rounds * 60+80,62+80,64+80,66+80,68+80 @@ -623,10 +612,10 @@ _loop3: xchg WK_BUF, PRECALC_BUF - jmp _loop + jmp .L_loop .align 32 - _end: +.L_end: .endm /* diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 5555b5d5215a..53de72bdd851 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -360,7 +360,7 @@ SYM_TYPED_FUNC_START(sha256_transform_avx) and $~15, %rsp # align stack pointer shl $6, NUM_BLKS # convert to bytes - jz done_hash + jz .Ldone_hash add INP, NUM_BLKS # pointer to end of data mov NUM_BLKS, _INP_END(%rsp) @@ -377,7 +377,7 @@ SYM_TYPED_FUNC_START(sha256_transform_avx) vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK vmovdqa _SHUF_00BA(%rip), SHUF_00BA vmovdqa _SHUF_DC00(%rip), SHUF_DC00 -loop0: +.Lloop0: lea K256(%rip), TBL ## byte swap first 16 dwords @@ -391,7 +391,7 @@ loop0: ## schedule 48 input dwords, by doing 3 rounds of 16 each mov $3, SRND .align 16 -loop1: +.Lloop1: vpaddd (TBL), X0, XFER vmovdqa XFER, _XFER(%rsp) FOUR_ROUNDS_AND_SCHED @@ -410,10 +410,10 @@ loop1: FOUR_ROUNDS_AND_SCHED sub $1, SRND - jne loop1 + jne .Lloop1 mov $2, SRND -loop2: +.Lloop2: vpaddd (TBL), X0, XFER vmovdqa XFER, _XFER(%rsp) DO_ROUND 0 @@ -433,7 +433,7 @@ loop2: vmovdqa X3, X1 sub $1, SRND - jne loop2 + jne .Lloop2 addm (4*0)(CTX),a addm (4*1)(CTX),b @@ -447,9 +447,9 @@ loop2: mov _INP(%rsp), INP add $64, INP cmp _INP_END(%rsp), INP - jne loop0 + jne .Lloop0 -done_hash: +.Ldone_hash: mov %rbp, %rsp popq %rbp diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 3eada9416852..9918212faf91 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -538,12 +538,12 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) and $-32, %rsp # align rsp to 32 byte boundary shl $6, NUM_BLKS # convert to bytes - jz done_hash + jz .Ldone_hash lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block mov NUM_BLKS, _INP_END(%rsp) cmp NUM_BLKS, INP - je only_one_block + je .Lonly_one_block ## load initial digest mov (CTX), a @@ -561,7 +561,7 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) mov CTX, _CTX(%rsp) -loop0: +.Lloop0: ## Load first 16 dwords from two blocks VMOVDQ 0*32(INP),XTMP0 VMOVDQ 1*32(INP),XTMP1 @@ -580,7 +580,7 @@ loop0: vperm2i128 $0x20, XTMP3, XTMP1, X2 vperm2i128 $0x31, XTMP3, XTMP1, X3 -last_block_enter: +.Llast_block_enter: add $64, INP mov INP, _INP(%rsp) @@ -588,34 +588,40 @@ last_block_enter: xor SRND, SRND .align 16 -loop1: - vpaddd K256+0*32(SRND), X0, XFER +.Lloop1: + leaq K256+0*32(%rip), INP ## reuse INP as scratch reg + vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 0*32 - vpaddd K256+1*32(SRND), X0, XFER + leaq K256+1*32(%rip), INP + vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 1*32 - vpaddd K256+2*32(SRND), X0, XFER + leaq K256+2*32(%rip), INP + vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 2*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 2*32 - vpaddd K256+3*32(SRND), X0, XFER + leaq K256+3*32(%rip), INP + vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 3*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 3*32 add $4*32, SRND cmp $3*4*32, SRND - jb loop1 + jb .Lloop1 -loop2: +.Lloop2: ## Do last 16 rounds with no scheduling - vpaddd K256+0*32(SRND), X0, XFER + leaq K256+0*32(%rip), INP + vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) DO_4ROUNDS _XFER + 0*32 - vpaddd K256+1*32(SRND), X1, XFER + leaq K256+1*32(%rip), INP + vpaddd (INP, SRND), X1, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) DO_4ROUNDS _XFER + 1*32 add $2*32, SRND @@ -624,7 +630,7 @@ loop2: vmovdqa X3, X1 cmp $4*4*32, SRND - jb loop2 + jb .Lloop2 mov _CTX(%rsp), CTX mov _INP(%rsp), INP @@ -639,17 +645,17 @@ loop2: addm (4*7)(CTX),h cmp _INP_END(%rsp), INP - ja done_hash + ja .Ldone_hash #### Do second block using previously scheduled results xor SRND, SRND .align 16 -loop3: +.Lloop3: DO_4ROUNDS _XFER + 0*32 + 16 DO_4ROUNDS _XFER + 1*32 + 16 add $2*32, SRND cmp $4*4*32, SRND - jb loop3 + jb .Lloop3 mov _CTX(%rsp), CTX mov _INP(%rsp), INP @@ -665,10 +671,10 @@ loop3: addm (4*7)(CTX),h cmp _INP_END(%rsp), INP - jb loop0 - ja done_hash + jb .Lloop0 + ja .Ldone_hash -do_last_block: +.Ldo_last_block: VMOVDQ 0*16(INP),XWORD0 VMOVDQ 1*16(INP),XWORD1 VMOVDQ 2*16(INP),XWORD2 @@ -679,9 +685,9 @@ do_last_block: vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 - jmp last_block_enter + jmp .Llast_block_enter -only_one_block: +.Lonly_one_block: ## load initial digest mov (4*0)(CTX),a @@ -698,9 +704,9 @@ only_one_block: vmovdqa _SHUF_DC00(%rip), SHUF_DC00 mov CTX, _CTX(%rsp) - jmp do_last_block + jmp .Ldo_last_block -done_hash: +.Ldone_hash: mov %rbp, %rsp pop %rbp diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 959288eecc68..93264ee44543 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -369,7 +369,7 @@ SYM_TYPED_FUNC_START(sha256_transform_ssse3) and $~15, %rsp shl $6, NUM_BLKS # convert to bytes - jz done_hash + jz .Ldone_hash add INP, NUM_BLKS mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data @@ -387,7 +387,7 @@ SYM_TYPED_FUNC_START(sha256_transform_ssse3) movdqa _SHUF_00BA(%rip), SHUF_00BA movdqa _SHUF_DC00(%rip), SHUF_DC00 -loop0: +.Lloop0: lea K256(%rip), TBL ## byte swap first 16 dwords @@ -401,7 +401,7 @@ loop0: ## schedule 48 input dwords, by doing 3 rounds of 16 each mov $3, SRND .align 16 -loop1: +.Lloop1: movdqa (TBL), XFER paddd X0, XFER movdqa XFER, _XFER(%rsp) @@ -424,10 +424,10 @@ loop1: FOUR_ROUNDS_AND_SCHED sub $1, SRND - jne loop1 + jne .Lloop1 mov $2, SRND -loop2: +.Lloop2: paddd (TBL), X0 movdqa X0, _XFER(%rsp) DO_ROUND 0 @@ -446,7 +446,7 @@ loop2: movdqa X3, X1 sub $1, SRND - jne loop2 + jne .Lloop2 addm (4*0)(CTX),a addm (4*1)(CTX),b @@ -460,9 +460,9 @@ loop2: mov _INP(%rsp), INP add $64, INP cmp _INP_END(%rsp), INP - jne loop0 + jne .Lloop0 -done_hash: +.Ldone_hash: mov %rbp, %rsp popq %rbp diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index b0984f19fdb4..d902b8ea0721 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -276,7 +276,7 @@ frame_size = frame_WK + WK_SIZE ######################################################################## SYM_TYPED_FUNC_START(sha512_transform_avx) test msglen, msglen - je nowork + je .Lnowork # Save GPRs push %rbx @@ -291,7 +291,7 @@ SYM_TYPED_FUNC_START(sha512_transform_avx) sub $frame_size, %rsp and $~(0x20 - 1), %rsp -updateblock: +.Lupdateblock: # Load state variables mov DIGEST(0), a_64 @@ -348,7 +348,7 @@ updateblock: # Advance to next message block add $16*8, msg dec msglen - jnz updateblock + jnz .Lupdateblock # Restore Stack Pointer mov %rbp, %rsp @@ -361,7 +361,7 @@ updateblock: pop %r12 pop %rbx -nowork: +.Lnowork: RET SYM_FUNC_END(sha512_transform_avx) diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index b1ca99055ef9..f08496cd6870 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -581,7 +581,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx) and $~(0x20 - 1), %rsp shl $7, NUM_BLKS # convert to bytes - jz done_hash + jz .Ldone_hash add INP, NUM_BLKS # pointer to end of data mov NUM_BLKS, frame_INPEND(%rsp) @@ -600,7 +600,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx) vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK -loop0: +.Lloop0: lea K512(%rip), TBL ## byte swap first 16 dwords @@ -615,7 +615,7 @@ loop0: movq $4, frame_SRND(%rsp) .align 16 -loop1: +.Lloop1: vpaddq (TBL), Y_0, XFER vmovdqa XFER, frame_XFER(%rsp) FOUR_ROUNDS_AND_SCHED @@ -634,10 +634,10 @@ loop1: FOUR_ROUNDS_AND_SCHED subq $1, frame_SRND(%rsp) - jne loop1 + jne .Lloop1 movq $2, frame_SRND(%rsp) -loop2: +.Lloop2: vpaddq (TBL), Y_0, XFER vmovdqa XFER, frame_XFER(%rsp) DO_4ROUNDS @@ -650,7 +650,7 @@ loop2: vmovdqa Y_3, Y_1 subq $1, frame_SRND(%rsp) - jne loop2 + jne .Lloop2 mov frame_CTX(%rsp), CTX2 addm 8*0(CTX2), a @@ -665,9 +665,9 @@ loop2: mov frame_INP(%rsp), INP add $128, INP cmp frame_INPEND(%rsp), INP - jne loop0 + jne .Lloop0 -done_hash: +.Ldone_hash: # Restore Stack Pointer mov %rbp, %rsp diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index c06afb5270e5..65be30156816 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -278,7 +278,7 @@ frame_size = frame_WK + WK_SIZE SYM_TYPED_FUNC_START(sha512_transform_ssse3) test msglen, msglen - je nowork + je .Lnowork # Save GPRs push %rbx @@ -293,7 +293,7 @@ SYM_TYPED_FUNC_START(sha512_transform_ssse3) sub $frame_size, %rsp and $~(0x20 - 1), %rsp -updateblock: +.Lupdateblock: # Load state variables mov DIGEST(0), a_64 @@ -350,7 +350,7 @@ updateblock: # Advance to next message block add $16*8, msg dec msglen - jnz updateblock + jnz .Lupdateblock # Restore Stack Pointer mov %rbp, %rsp @@ -363,7 +363,7 @@ updateblock: pop %r12 pop %rbx -nowork: +.Lnowork: RET SYM_FUNC_END(sha512_transform_ssse3) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index eccc3431e515..f31e286c2977 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -8,7 +8,7 @@ * * entry.S contains the system-call and fault low-level handling routines. * - * Some of this is documented in Documentation/x86/entry_64.rst + * Some of this is documented in Documentation/arch/x86/entry_64.rst * * A note on terminology: * - iret frame: Architecture defined interrupt frame from SS to RIP @@ -205,7 +205,7 @@ syscall_return_via_sysret: */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -286,7 +286,7 @@ SYM_FUNC_END(__switch_to_asm) .pushsection .text, "ax" __FUNC_ALIGN SYM_CODE_START_NOALIGN(ret_from_fork) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // copy_thread CALL_DEPTH_ACCOUNT movq %rax, %rdi @@ -303,7 +303,7 @@ SYM_CODE_START_NOALIGN(ret_from_fork) 1: /* kernel thread */ - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK movq %r12, %rdi CALL_NOSPEC rbx /* @@ -388,9 +388,9 @@ SYM_CODE_START(\asmsym) .if \vector == X86_TRAP_BP /* #BP advances %rip to the next instruction */ - UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0 + UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0 .else - UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 .endif ENDBR @@ -461,7 +461,7 @@ SYM_CODE_END(\asmsym) */ .macro idtentry_mce_db vector asmsym cfunc SYM_CODE_START(\asmsym) - UNWIND_HINT_IRET_REGS + UNWIND_HINT_IRET_ENTRY ENDBR ASM_CLAC cld @@ -518,7 +518,7 @@ SYM_CODE_END(\asmsym) */ .macro idtentry_vc vector asmsym cfunc SYM_CODE_START(\asmsym) - UNWIND_HINT_IRET_REGS + UNWIND_HINT_IRET_ENTRY ENDBR ASM_CLAC cld @@ -582,7 +582,7 @@ SYM_CODE_END(\asmsym) */ .macro idtentry_df vector asmsym cfunc SYM_CODE_START(\asmsym) - UNWIND_HINT_IRET_REGS offset=8 + UNWIND_HINT_IRET_ENTRY offset=8 ENDBR ASM_CLAC cld @@ -643,7 +643,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ @@ -869,7 +869,7 @@ SYM_CODE_END(exc_xen_hypervisor_callback) */ __FUNC_ALIGN SYM_CODE_START_NOALIGN(xen_failsafe_callback) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ENDBR movl %ds, %ecx cmpw %cx, 0x10(%rsp) @@ -1027,7 +1027,7 @@ SYM_CODE_START_LOCAL(paranoid_exit) * * NB to anyone to try to optimize this code: this code does * not execute at all for exceptions from user mode. Those - * exceptions go through error_exit instead. + * exceptions go through error_return instead. */ RESTORE_CR3 scratch_reg=%rax save_reg=%r14 @@ -1107,7 +1107,7 @@ SYM_CODE_START(error_entry) FENCE_SWAPGS_KERNEL_ENTRY CALL_DEPTH_ACCOUNT leaq 8(%rsp), %rax /* return pt_regs pointer */ - ANNOTATE_UNRET_END + VALIDATE_UNRET_END RET .Lbstep_iret: @@ -1153,7 +1153,7 @@ SYM_CODE_END(error_return) * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ SYM_CODE_START(asm_exc_nmi) - UNWIND_HINT_IRET_REGS + UNWIND_HINT_IRET_ENTRY ENDBR /* @@ -1520,7 +1520,7 @@ SYM_CODE_END(asm_exc_nmi) * MSRs to fully disable 32-bit SYSCALL. */ SYM_CODE_START(ignore_sysret) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax sysretl diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 1506a22a4fb6..6a1821bd7d5e 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,10 +3,7 @@ # Building vDSO images for x86. # -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE| -ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 3b300a773c7e..f3b3cacbcbb0 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -70,18 +70,9 @@ static struct ctl_table abi_table2[] = { {} }; -static struct ctl_table abi_root_table2[] = { - { - .procname = "abi", - .mode = 0555, - .child = abi_table2 - }, - {} -}; - static __init int ia32_binfmt_init(void) { - register_sysctl_table(abi_root_table2); + register_sysctl("abi", abi_table2); return 0; } __initcall(ia32_binfmt_init); diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index d234ca797e4a..e0ca8120aea8 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -317,7 +317,7 @@ static struct vm_area_struct gate_vma __ro_after_init = { struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT - if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL)) + if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)) return NULL; #endif if (vsyscall_mode == NONE) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d096b04bf80e..9d248703cbdd 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1703,10 +1703,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) { - data.br_stack = &cpuc->lbr_stack; - data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; - } + if (has_branch_stack(event)) + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a3fb996a86a1..070cc4ef2672 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5470,6 +5470,15 @@ pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) } static umode_t +mem_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + if (attr == &event_attr_mem_ld_aux.attr.attr) + return x86_pmu.flags & PMU_FL_MEM_LOADS_AUX ? attr->mode : 0; + + return pebs_is_visible(kobj, attr, i); +} + +static umode_t lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) { return x86_pmu.lbr_nr ? attr->mode : 0; @@ -5496,7 +5505,7 @@ static struct attribute_group group_events_td = { static struct attribute_group group_events_mem = { .name = "events", - .is_visible = pebs_is_visible, + .is_visible = mem_is_visible, }; static struct attribute_group group_events_tsx = { @@ -6486,6 +6495,10 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_EMERALDRAPIDS_X: + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + fallthrough; + case INTEL_FAM6_GRANITERAPIDS_X: + case INTEL_FAM6_GRANITERAPIDS_D: pmem = true; x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -6502,7 +6515,6 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_INSTR_LATENCY; - x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = spr_get_event_constraints; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 551741e79e03..835862c548cc 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -678,6 +678,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates), X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates), + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &icx_cstates), + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &icx_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a2e566e53076..df88576d6b2a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1229,12 +1229,14 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct perf_event *event, bool add) { struct pmu *pmu = event->pmu; + /* * Make sure we get updated with the first PEBS * event. It will trigger also during removal, but * that does not hurt: */ - bool update = cpuc->n_pebs == 1; + if (cpuc->n_pebs == 1) + cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW; if (needed_cb != pebs_needs_sched_cb(cpuc)) { if (!needed_cb) @@ -1242,7 +1244,7 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, else perf_sched_cb_dec(pmu); - update = true; + cpuc->pebs_data_cfg |= PEBS_UPDATE_DS_SW; } /* @@ -1252,24 +1254,13 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, if (x86_pmu.intel_cap.pebs_baseline && add) { u64 pebs_data_cfg; - /* Clear pebs_data_cfg and pebs_record_size for first PEBS. */ - if (cpuc->n_pebs == 1) { - cpuc->pebs_data_cfg = 0; - cpuc->pebs_record_size = sizeof(struct pebs_basic); - } - pebs_data_cfg = pebs_update_adaptive_cfg(event); - - /* Update pebs_record_size if new event requires more data. */ - if (pebs_data_cfg & ~cpuc->pebs_data_cfg) { - cpuc->pebs_data_cfg |= pebs_data_cfg; - adaptive_pebs_record_size_update(); - update = true; - } + /* + * Be sure to update the thresholds when we change the record. + */ + if (pebs_data_cfg & ~cpuc->pebs_data_cfg) + cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW; } - - if (update) - pebs_update_threshold(cpuc); } void intel_pmu_pebs_add(struct perf_event *event) @@ -1326,9 +1317,17 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) wrmsrl(base + idx, value); } +static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc) +{ + if (cpuc->n_pebs == cpuc->n_large_pebs && + cpuc->n_pebs != cpuc->n_pebs_via_pt) + intel_pmu_drain_pebs_buffer(); +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 pebs_data_cfg = cpuc->pebs_data_cfg & ~PEBS_UPDATE_DS_SW; struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; unsigned int idx = hwc->idx; @@ -1344,11 +1343,22 @@ void intel_pmu_pebs_enable(struct perf_event *event) if (x86_pmu.intel_cap.pebs_baseline) { hwc->config |= ICL_EVENTSEL_ADAPTIVE; - if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) { - wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg); - cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg; + if (pebs_data_cfg != cpuc->active_pebs_data_cfg) { + /* + * drain_pebs() assumes uniform record size; + * hence we need to drain when changing said + * size. + */ + intel_pmu_drain_large_pebs(cpuc); + adaptive_pebs_record_size_update(); + wrmsrl(MSR_PEBS_DATA_CFG, pebs_data_cfg); + cpuc->active_pebs_data_cfg = pebs_data_cfg; } } + if (cpuc->pebs_data_cfg & PEBS_UPDATE_DS_SW) { + cpuc->pebs_data_cfg = pebs_data_cfg; + pebs_update_threshold(cpuc); + } if (idx >= INTEL_PMC_IDX_FIXED) { if (x86_pmu.intel_cap.pebs_format < 5) @@ -1391,9 +1401,7 @@ void intel_pmu_pebs_disable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - if (cpuc->n_pebs == cpuc->n_large_pebs && - cpuc->n_pebs != cpuc->n_pebs_via_pt) - intel_pmu_drain_pebs_buffer(); + intel_pmu_drain_large_pebs(cpuc); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 7d1199554fe3..fa9b209a11fa 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6068,6 +6068,17 @@ static struct intel_uncore_ops spr_uncore_mmio_ops = { .read_counter = uncore_mmio_read_counter, }; +static struct uncore_event_desc spr_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x01,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x05,umask=0xcf"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x05,umask=0xf0"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), + { /* end: all zeroes */ }, +}; + static struct intel_uncore_type spr_uncore_imc = { SPR_UNCORE_COMMON_FORMAT(), .name = "imc", @@ -6075,6 +6086,7 @@ static struct intel_uncore_type spr_uncore_imc = { .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, .ops = &spr_uncore_mmio_ops, + .event_descs = spr_uncore_imc_events, }; static void spr_uncore_pci_enable_event(struct intel_uncore_box *box, diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index c65d8906cbcf..0feaaa571303 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -70,6 +70,8 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_EMERALDRAPIDS_X: + case INTEL_FAM6_GRANITERAPIDS_X: + case INTEL_FAM6_GRANITERAPIDS_D: case INTEL_FAM6_ATOM_SILVERMONT: case INTEL_FAM6_ATOM_SILVERMONT_D: diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 5d2de10809ae..3a1548054b48 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o +obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o ifdef CONFIG_X86_64 obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index fb8b2c088681..1fbda2f94184 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -96,6 +96,11 @@ static void hv_apic_eoi_write(u32 reg, u32 val) wrmsr(HV_X64_MSR_EOI, val, 0); } +static bool cpu_is_self(int cpu) +{ + return cpu == smp_processor_id(); +} + /* * IPI implementation on Hyper-V. */ @@ -128,10 +133,9 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, */ if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - if (exclude_self) - nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask); - else - nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); + + nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask, + exclude_self ? cpu_is_self : NULL); /* * 'nr_bank <= 0' means some CPUs in cpumask can't be diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 41ef036ebb7b..a5f9474f08e1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -29,7 +29,6 @@ #include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> #include <linux/highmem.h> -#include <linux/swiotlb.h> int hyperv_init_cpuhp; u64 hv_current_partition_id = ~0ull; @@ -64,7 +63,10 @@ static int hyperv_init_ghcb(void) * memory boundary and map it here. */ rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); - ghcb_va = memremap(ghcb_gpa, HV_HYP_PAGE_SIZE, MEMREMAP_WB); + + /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary; + ghcb_va = (void *)ioremap_cache(ghcb_gpa, HV_HYP_PAGE_SIZE); if (!ghcb_va) return -ENOMEM; @@ -218,7 +220,7 @@ static int hv_cpu_die(unsigned int cpu) if (hv_ghcb_pg) { ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg); if (*ghcb_va) - memunmap(*ghcb_va); + iounmap(*ghcb_va); *ghcb_va = NULL; } @@ -504,16 +506,6 @@ void __init hyperv_init(void) /* Query the VMs extended capability once, so that it can be cached. */ hv_query_ext_cap(0); -#ifdef CONFIG_SWIOTLB - /* - * Swiotlb bounce buffer needs to be mapped in extra address - * space. Map function doesn't work in the early place and so - * call swiotlb_update_mem_attributes() here. - */ - if (hv_is_isolation_supported()) - swiotlb_update_mem_attributes(); -#endif - return; clean_guest_os_id: diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c new file mode 100644 index 000000000000..1ba5d3b99b16 --- /dev/null +++ b/arch/x86/hyperv/hv_vtl.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Author: + * Saurabh Sengar <ssengar@microsoft.com> + */ + +#include <asm/apic.h> +#include <asm/boot.h> +#include <asm/desc.h> +#include <asm/i8259.h> +#include <asm/mshyperv.h> +#include <asm/realmode.h> + +extern struct boot_params boot_params; +static struct real_mode_header hv_vtl_real_mode_header; + +void __init hv_vtl_init_platform(void) +{ + pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); + + x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.timers.timer_init = x86_init_noop; + + x86_platform.get_wallclock = get_rtc_noop; + x86_platform.set_wallclock = set_rtc_noop; + x86_platform.get_nmi_reason = hv_get_nmi_reason; + + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.reserve_bios_regions = 0; + x86_platform.legacy.devices.pnpbios = 0; +} + +static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc) +{ + return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) | + (desc->base1 << 16) | desc->base0; +} + +static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc) +{ + return ((u32)desc->limit1 << 16) | (u32)desc->limit0; +} + +typedef void (*secondary_startup_64_fn)(void*, void*); +static void hv_vtl_ap_entry(void) +{ + ((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params); +} + +static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored) +{ + u64 status; + int ret = 0; + struct hv_enable_vp_vtl *input; + unsigned long irq_flags; + + struct desc_ptr gdt_ptr; + struct desc_ptr idt_ptr; + + struct ldttss_desc *tss; + struct ldttss_desc *ldt; + struct desc_struct *gdt; + + u64 rsp = current->thread.sp; + u64 rip = (u64)&hv_vtl_ap_entry; + + native_store_gdt(&gdt_ptr); + store_idt(&idt_ptr); + + gdt = (struct desc_struct *)((void *)(gdt_ptr.address)); + tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS); + ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT); + + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->partition_id = HV_PARTITION_ID_SELF; + input->vp_index = target_vp_index; + input->target_vtl.target_vtl = HV_VTL_MGMT; + + /* + * The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit + * mode transition sequence after waking up an AP with SIPI whose + * vector points to the 16-bit AP startup trampoline code. Here in + * VTL2, we can't perform that sequence as the AP has to start in + * the 64-bit mode. + * + * To make this happen, we tell the hypervisor to load a valid 64-bit + * context (most of which is just magic numbers from the CPU manual) + * so that AP jumps right to the 64-bit entry of the kernel, and the + * control registers are loaded with values that let the AP fetch the + * code and data and carry on with work it gets assigned. + */ + + input->vp_context.rip = rip; + input->vp_context.rsp = rsp; + input->vp_context.rflags = 0x0000000000000002; + input->vp_context.efer = __rdmsr(MSR_EFER); + input->vp_context.cr0 = native_read_cr0(); + input->vp_context.cr3 = __native_read_cr3(); + input->vp_context.cr4 = native_read_cr4(); + input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT); + input->vp_context.idtr.limit = idt_ptr.size; + input->vp_context.idtr.base = idt_ptr.address; + input->vp_context.gdtr.limit = gdt_ptr.size; + input->vp_context.gdtr.base = gdt_ptr.address; + + /* Non-system desc (64bit), long, code, present */ + input->vp_context.cs.selector = __KERNEL_CS; + input->vp_context.cs.base = 0; + input->vp_context.cs.limit = 0xffffffff; + input->vp_context.cs.attributes = 0xa09b; + /* Non-system desc (64bit), data, present, granularity, default */ + input->vp_context.ss.selector = __KERNEL_DS; + input->vp_context.ss.base = 0; + input->vp_context.ss.limit = 0xffffffff; + input->vp_context.ss.attributes = 0xc093; + + /* System desc (128bit), present, LDT */ + input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8; + input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt); + input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt); + input->vp_context.ldtr.attributes = 0x82; + + /* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */ + input->vp_context.tr.selector = GDT_ENTRY_TSS * 8; + input->vp_context.tr.base = hv_vtl_system_desc_base(tss); + input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss); + input->vp_context.tr.attributes = 0x8b; + + status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL); + + if (!hv_result_success(status) && + hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) { + pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx\n]", + target_vp_index, status); + ret = -EINVAL; + goto free_lock; + } + + status = hv_do_hypercall(HVCALL_START_VP, input, NULL); + + if (!hv_result_success(status)) { + pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n", + target_vp_index, status); + ret = -EINVAL; + } + +free_lock: + local_irq_restore(irq_flags); + + return ret; +} + +static int hv_vtl_apicid_to_vp_id(u32 apic_id) +{ + u64 control; + u64 status; + unsigned long irq_flags; + struct hv_get_vp_from_apic_id_in *input; + u32 *output, ret; + + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = HV_PARTITION_ID_SELF; + input->apic_ids[0] = apic_id; + + output = (u32 *)input; + + control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID; + status = hv_do_hypercall(control, input, output); + ret = output[0]; + + local_irq_restore(irq_flags); + + if (!hv_result_success(status)) { + pr_err("failed to get vp id from apic id %d, status %#llx\n", + apic_id, status); + return -EINVAL; + } + + return ret; +} + +static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip) +{ + int vp_id; + + pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid); + vp_id = hv_vtl_apicid_to_vp_id(apicid); + + if (vp_id < 0) { + pr_err("Couldn't find CPU with APIC ID %d\n", apicid); + return -EINVAL; + } + if (vp_id > ms_hyperv.max_vp_index) { + pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid); + return -EINVAL; + } + + return hv_vtl_bringup_vcpu(vp_id, start_eip); +} + +static int __init hv_vtl_early_init(void) +{ + /* + * `boot_cpu_has` returns the runtime feature support, + * and here is the earliest it can be used. + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + panic("XSAVE has to be disabled as it is not supported by this module.\n" + "Please add 'noxsave' to the kernel command line.\n"); + + real_mode_header = &hv_vtl_real_mode_header; + apic->wakeup_secondary_cpu_64 = hv_vtl_wakeup_secondary_cpu; + + return 0; +} +early_initcall(hv_vtl_early_init); diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 1dbcbd9da74d..cc92388b7a99 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -13,6 +13,8 @@ #include <asm/svm.h> #include <asm/sev.h> #include <asm/io.h> +#include <asm/coco.h> +#include <asm/mem_encrypt.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> @@ -127,7 +129,7 @@ static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code, return ES_OK; } -void hv_ghcb_terminate(unsigned int set, unsigned int reason) +void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; @@ -233,41 +235,6 @@ void hv_ghcb_msr_read(u64 msr, u64 *value) local_irq_restore(flags); } EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); -#endif - -enum hv_isolation_type hv_get_isolation_type(void) -{ - if (!(ms_hyperv.priv_high & HV_ISOLATION)) - return HV_ISOLATION_TYPE_NONE; - return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); -} -EXPORT_SYMBOL_GPL(hv_get_isolation_type); - -/* - * hv_is_isolation_supported - Check system runs in the Hyper-V - * isolation VM. - */ -bool hv_is_isolation_supported(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) - return false; - - if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) - return false; - - return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; -} - -DEFINE_STATIC_KEY_FALSE(isolation_type_snp); - -/* - * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based - * isolation VM. - */ -bool hv_isolation_type_snp(void) -{ - return static_branch_unlikely(&isolation_type_snp); -} /* * hv_mark_gpa_visibility - Set pages visible to host via hvcall. @@ -320,27 +287,25 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], } /* - * hv_set_mem_host_visibility - Set specified memory visible to host. + * hv_vtom_set_host_visibility - Set specified memory visible to host. * * In Isolation VM, all guest memory is encrypted from host and guest * needs to set memory visible to host via hvcall before sharing memory * with host. This function works as wrap of hv_mark_gpa_visibility() * with memory base and size. */ -int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visible) +static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc) { - enum hv_mem_host_visibility visibility = visible ? - VMBUS_PAGE_VISIBLE_READ_WRITE : VMBUS_PAGE_NOT_VISIBLE; + enum hv_mem_host_visibility visibility = enc ? + VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE; u64 *pfn_array; int ret = 0; + bool result = true; int i, pfn; - if (!hv_is_isolation_supported() || !hv_hypercall_pg) - return 0; - pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!pfn_array) - return -ENOMEM; + return false; for (i = 0, pfn = 0; i < pagecount; i++) { pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE); @@ -349,41 +314,98 @@ int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visibl if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) { ret = hv_mark_gpa_visibility(pfn, pfn_array, visibility); - if (ret) + if (ret) { + result = false; goto err_free_pfn_array; + } pfn = 0; } } err_free_pfn_array: kfree(pfn_array); - return ret; + return result; } -/* - * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM. - */ -void *hv_map_memory(void *addr, unsigned long size) +static bool hv_vtom_tlb_flush_required(bool private) { - unsigned long *pfns = kcalloc(size / PAGE_SIZE, - sizeof(unsigned long), GFP_KERNEL); - void *vaddr; - int i; + return true; +} + +static bool hv_vtom_cache_flush_required(void) +{ + return false; +} - if (!pfns) - return NULL; +static bool hv_is_private_mmio(u64 addr) +{ + /* + * Hyper-V always provides a single IO-APIC in a guest VM. + * When a paravisor is used, it is emulated by the paravisor + * in the guest context and must be mapped private. + */ + if (addr >= HV_IOAPIC_BASE_ADDRESS && + addr < (HV_IOAPIC_BASE_ADDRESS + PAGE_SIZE)) + return true; + + /* Same with a vTPM */ + if (addr >= VTPM_BASE_ADDRESS && + addr < (VTPM_BASE_ADDRESS + PAGE_SIZE)) + return true; + + return false; +} + +void __init hv_vtom_init(void) +{ + /* + * By design, a VM using vTOM doesn't see the SEV setting, + * so SEV initialization is bypassed and sev_status isn't set. + * Set it here to indicate a vTOM VM. + */ + sev_status = MSR_AMD64_SNP_VTOM; + cc_set_vendor(CC_VENDOR_AMD); + cc_set_mask(ms_hyperv.shared_gpa_boundary); + physical_mask &= ms_hyperv.shared_gpa_boundary - 1; + + x86_platform.hyper.is_private_mmio = hv_is_private_mmio; + x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; + x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; +} + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +enum hv_isolation_type hv_get_isolation_type(void) +{ + if (!(ms_hyperv.priv_high & HV_ISOLATION)) + return HV_ISOLATION_TYPE_NONE; + return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); +} +EXPORT_SYMBOL_GPL(hv_get_isolation_type); - for (i = 0; i < size / PAGE_SIZE; i++) - pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) + - (ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT); +/* + * hv_is_isolation_supported - Check system runs in the Hyper-V + * isolation VM. + */ +bool hv_is_isolation_supported(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return false; - vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO); - kfree(pfns); + if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) + return false; - return vaddr; + return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; } -void hv_unmap_memory(void *addr) +DEFINE_STATIC_KEY_FALSE(isolation_type_snp); + +/* + * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based + * isolation VM. + */ +bool hv_isolation_type_snp(void) { - vunmap(addr); + return static_branch_unlikely(&isolation_type_snp); } diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 0ad2378fe6ad..8460bd35e10c 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -52,6 +52,11 @@ static inline int fill_gva_list(u64 gva_list[], int offset, return gva_n - offset; } +static bool cpu_is_lazy(int cpu) +{ + return per_cpu(cpu_tlbstate_shared.is_lazy, cpu); +} + static void hyperv_flush_tlb_multi(const struct cpumask *cpus, const struct flush_tlb_info *info) { @@ -60,6 +65,7 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, struct hv_tlb_flush *flush; u64 status; unsigned long flags; + bool do_lazy = !info->freed_tables; trace_hyperv_mmu_flush_tlb_multi(cpus, info); @@ -112,6 +118,8 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, goto do_ex_hypercall; for_each_cpu(cpu, cpus) { + if (do_lazy && cpu_is_lazy(cpu)) + continue; vcpu = hv_cpu_number_to_vp_number(cpu); if (vcpu == VP_INVAL) { local_irq_restore(flags); @@ -198,7 +206,8 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, flush->hv_vp_set.valid_bank_mask = 0; flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus); + nr_bank = cpumask_to_vpset_skip(&flush->hv_vp_set, cpus, + info->freed_tables ? NULL : cpu_is_lazy); if (nr_bank < 0) return HV_STATUS_INVALID_PARAMETER; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e2975a32d443..d7da28fada87 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -8,7 +8,7 @@ #define ALT_FLAGS_SHIFT 16 -#define ALT_FLAG_NOT BIT(0) +#define ALT_FLAG_NOT (1 << 0) #define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 53e9b0620d96..d90ae472fb76 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -38,7 +38,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear * this field. The purpose of this field is to guarantee * compliance with the x86 boot spec located in - * Documentation/x86/boot.rst . That spec says that the + * Documentation/arch/x86/boot.rst . That spec says that the * *whole* structure should be cleared, after which only the * portion defined by struct setup_header (boot_params->hdr) * should be copied in. diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 94fbe6ae7431..540573f515b7 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -221,9 +221,15 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) +#define __try_cmpxchg_local(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), "") + #define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) +#define arch_try_cmpxchg_local(ptr, pold, new) \ + __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr))) + /* * xadd() adds "inc" to "*ptr" and atomically returns the previous * value of "*ptr". diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index 3d98c3a60d34..eb08796002f3 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -7,17 +7,33 @@ enum cc_vendor { CC_VENDOR_NONE, CC_VENDOR_AMD, - CC_VENDOR_HYPERV, CC_VENDOR_INTEL, }; -void cc_set_vendor(enum cc_vendor v); -void cc_set_mask(u64 mask); - #ifdef CONFIG_ARCH_HAS_CC_PLATFORM +extern enum cc_vendor cc_vendor; + +static inline enum cc_vendor cc_get_vendor(void) +{ + return cc_vendor; +} + +static inline void cc_set_vendor(enum cc_vendor vendor) +{ + cc_vendor = vendor; +} + +void cc_set_mask(u64 mask); u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); #else +static inline enum cc_vendor cc_get_vendor(void) +{ + return CC_VENDOR_NONE; +} + +static inline void cc_set_vendor(enum cc_vendor vendor) { } + static inline u64 cc_mkenc(u64 val) { return val; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73c9672c123b..cb8ca46213be 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -226,10 +226,9 @@ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ @@ -321,6 +320,7 @@ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ +#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ @@ -337,6 +337,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */ #define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ #define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ @@ -369,6 +370,7 @@ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ #define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ +#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dfa4fb76f4b..fafe9be7a6f4 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -75,6 +75,12 @@ # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) #endif +#ifdef CONFIG_ADDRESS_MASKING +# define DISABLE_LAM 0 +#else +# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -115,7 +121,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING) -#define DISABLED_MASK12 0 +#define DISABLED_MASK12 (DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index ab4c960146e3..23873da8fb77 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -2,21 +2,16 @@ #ifndef _ASM_X86_FB_H #define _ASM_X86_FB_H -#include <linux/fb.h> -#include <linux/fs.h> -#include <asm/page.h> +struct fb_info; +struct file; +struct vm_area_struct; -static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, - unsigned long off) -{ - unsigned long prot; +void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off); +#define fb_pgprotect fb_pgprotect - prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK; - if (boot_cpu_data.x86 > 3) - pgprot_val(vma->vm_page_prot) = - prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); -} +int fb_is_primary_device(struct fb_info *info); +#define fb_is_primary_device fb_is_primary_device -extern int fb_is_primary_device(struct fb_info *info); +#include <asm-generic/fb.h> #endif /* _ASM_X86_FB_H */ diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 0b73a809e9e1..cea95dcd27c2 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -122,6 +122,9 @@ /* Recommend using enlightened VMCS */ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) +/* Use hypercalls for MMIO config space access */ +#define HV_X64_USE_MMIO_HYPERCALLS BIT(21) + /* * CPU management features identification. * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits. @@ -713,6 +716,81 @@ union hv_msi_entry { } __packed; }; +struct hv_x64_segment_register { + u64 base; + u32 limit; + u16 selector; + union { + struct { + u16 segment_type : 4; + u16 non_system_segment : 1; + u16 descriptor_privilege_level : 2; + u16 present : 1; + u16 reserved : 4; + u16 available : 1; + u16 _long : 1; + u16 _default : 1; + u16 granularity : 1; + } __packed; + u16 attributes; + }; +} __packed; + +struct hv_x64_table_register { + u16 pad[3]; + u16 limit; + u64 base; +} __packed; + +struct hv_init_vp_context { + u64 rip; + u64 rsp; + u64 rflags; + + struct hv_x64_segment_register cs; + struct hv_x64_segment_register ds; + struct hv_x64_segment_register es; + struct hv_x64_segment_register fs; + struct hv_x64_segment_register gs; + struct hv_x64_segment_register ss; + struct hv_x64_segment_register tr; + struct hv_x64_segment_register ldtr; + + struct hv_x64_table_register idtr; + struct hv_x64_table_register gdtr; + + u64 efer; + u64 cr0; + u64 cr3; + u64 cr4; + u64 msr_cr_pat; +} __packed; + +union hv_input_vtl { + u8 as_uint8; + struct { + u8 target_vtl: 4; + u8 use_target_vtl: 1; + u8 reserved_z: 3; + }; +} __packed; + +struct hv_enable_vp_vtl { + u64 partition_id; + u32 vp_index; + union hv_input_vtl target_vtl; + u8 mbz0; + u16 mbz1; + struct hv_init_vp_context vp_context; +} __packed; + +struct hv_get_vp_from_apic_id_in { + u64 partition_id; + union hv_input_vtl target_vtl; + u8 res[7]; + u32 apic_ids[]; +} __packed; + #include <asm-generic/hyperv-tlfs.h> #endif diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index cbaf174d8efd..b3af2d45bbbb 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -125,6 +125,8 @@ #define INTEL_FAM6_LUNARLAKE_M 0xBD +#define INTEL_FAM6_ARROWLAKE 0xC6 + /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index c201083b34f6..a3abdcd89a32 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -20,25 +20,4 @@ extern void intel_mid_pwr_power_off(void); extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev); -#ifdef CONFIG_X86_INTEL_MID - -extern void intel_scu_devices_create(void); -extern void intel_scu_devices_destroy(void); - -#else /* !CONFIG_X86_INTEL_MID */ - -static inline void intel_scu_devices_create(void) { } -static inline void intel_scu_devices_destroy(void) { } - -#endif /* !CONFIG_X86_INTEL_MID */ - -/* Bus Select SoC Fuse value */ -#define BSEL_SOC_FUSE_MASK 0x7 -/* FSB 133MHz */ -#define BSEL_SOC_FUSE_001 0x1 -/* FSB 100MHz */ -#define BSEL_SOC_FUSE_101 0x5 -/* FSB 83MHz */ -#define BSEL_SOC_FUSE_111 0x7 - #endif /* _ASM_X86_INTEL_MID_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a3760ca796aa..5b77bbc28f96 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -200,9 +200,6 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add -void *arch_kexec_kernel_image_load(struct kimage *image); -#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load - int arch_kimage_file_post_load_cleanup(struct kimage *image); #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 8dc345cc6318..13bc212cd4bc 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -54,8 +54,8 @@ KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) -KVM_X86_OP_OPTIONAL(tlb_remote_flush) -KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range) +KVM_X86_OP_OPTIONAL(flush_remote_tlbs) +KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range) KVM_X86_OP(flush_tlb_gva) KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) @@ -68,6 +68,8 @@ KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) KVM_X86_OP(inject_irq) KVM_X86_OP(inject_nmi) +KVM_X86_OP_OPTIONAL_RET0(is_vnmi_pending) +KVM_X86_OP_OPTIONAL_RET0(set_vnmi_pending) KVM_X86_OP(inject_exception) KVM_X86_OP(cancel_injection) KVM_X86_OP(interrupt_allowed) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 808c292ad3f4..fb9d1f2d6136 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -420,6 +420,10 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1) + #define KVM_HAVE_MMU_RWLOCK struct kvm_mmu_page; @@ -439,9 +443,8 @@ struct kvm_mmu { gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); - int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); + int (*sync_spte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, int i); struct kvm_mmu_root_info root; union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; @@ -479,11 +482,6 @@ struct kvm_mmu { u64 pdptrs[4]; /* pae */ }; -struct kvm_tlb_range { - u64 start_gfn; - u64 pages; -}; - enum pmc_type { KVM_PMC_GP = 0, KVM_PMC_FIXED, @@ -515,6 +513,7 @@ struct kvm_pmc { #define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) #define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { + u8 version; unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; unsigned available_event_types; @@ -527,7 +526,6 @@ struct kvm_pmu { u64 global_ovf_ctrl_mask; u64 reserved_bits; u64 raw_event_mask; - u8 version; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; @@ -876,7 +874,8 @@ struct kvm_vcpu_arch { u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ - unsigned nmi_pending; /* NMI queued after currently running handler */ + /* Number of NMIs pending injection, not including hardware vNMIs. */ + unsigned int nmi_pending; bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ u8 handling_intr_from_guest; @@ -947,23 +946,6 @@ struct kvm_vcpu_arch { u64 msr_kvm_poll_control; - /* - * Indicates the guest is trying to write a gfn that contains one or - * more of the PTEs used to translate the write itself, i.e. the access - * is changing its own translation in the guest page tables. KVM exits - * to userspace if emulation of the faulting instruction fails and this - * flag is set, as KVM cannot make forward progress. - * - * If emulation fails for a write to guest page tables, KVM unprotects - * (zaps) the shadow page for the target gfn and resumes the guest to - * retry the non-emulatable instruction (on hardware). Unprotecting the - * gfn doesn't allow forward progress for a self-changing access because - * doing so also zaps the translation for the gfn, i.e. retrying the - * instruction will hit a !PRESENT fault, which results in a new shadow - * page and sends KVM back to square one. - */ - bool write_fault_to_shadow_pgtable; - /* set at EPT violation at this point */ unsigned long exit_qualification; @@ -1602,9 +1584,9 @@ struct kvm_x86_ops { void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); - int (*tlb_remote_flush)(struct kvm *kvm); - int (*tlb_remote_flush_with_range)(struct kvm *kvm, - struct kvm_tlb_range *range); + int (*flush_remote_tlbs)(struct kvm *kvm); + int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, + gfn_t nr_pages); /* * Flush any TLB entries associated with the given GVA. @@ -1638,6 +1620,13 @@ struct kvm_x86_ops { int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); + /* Whether or not a virtual NMI is pending in hardware. */ + bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); + /* + * Attempt to pend a virtual NMI in harware. Returns %true on success + * to allow using static_call_ret0 as the fallback. + */ + bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); @@ -1808,8 +1797,8 @@ void kvm_arch_free_vm(struct kvm *kvm); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) { - if (kvm_x86_ops.tlb_remote_flush && - !static_call(kvm_x86_tlb_remote_flush)(kvm)) + if (kvm_x86_ops.flush_remote_tlbs && + !static_call(kvm_x86_flush_remote_tlbs)(kvm)) return 0; else return -ENOTSUPP; @@ -1907,6 +1896,25 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility * state and inject single-step #DBs after skipping * an instruction (after completing userspace I/O). + * + * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that + * is attempting to write a gfn that contains one or + * more of the PTEs used to translate the write itself, + * and the owning page table is being shadowed by KVM. + * If emulation of the faulting instruction fails and + * this flag is set, KVM will exit to userspace instead + * of retrying emulation as KVM cannot make forward + * progress. + * + * If emulation fails for a write to guest page tables, + * KVM unprotects (zaps) the shadow page for the target + * gfn and resumes the guest to retry the non-emulatable + * instruction (on hardware). Unprotecting the gfn + * doesn't allow forward progress for a self-changing + * access because doing so also zaps the translation for + * the gfn, i.e. retrying the instruction will hit a + * !PRESENT fault, which results in a new shadow page + * and sends KVM back to square one. */ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) @@ -1916,6 +1924,7 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); #define EMULTYPE_VMWARE_GP (1 << 5) #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) +#define EMULTYPE_WRITE_PF_TO_SP (1 << 8) int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, @@ -1994,14 +2003,11 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } -#define KVM_MMU_ROOT_CURRENT BIT(0) -#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) -#define KVM_MMU_ROOTS_ALL (~0UL) - int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); +int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); @@ -2041,8 +2047,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t gva, hpa_t root_hpa); +void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, unsigned long roots); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); @@ -2204,4 +2210,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) +/* + * KVM previously used a u32 field in kvm_run to indicate the hypercall was + * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the + * remaining 31 lower bits must be 0 to preserve ABI. + */ +#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index dd9b8118f784..0953aa32a324 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -99,7 +99,7 @@ /* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ #define SYM_TYPED_FUNC_START(name) \ - SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ + SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ ENDBR /* SYM_FUNC_START -- use for global functions */ diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 349a47acaa4a..56d4ef604b91 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -120,8 +120,17 @@ static inline long local_sub_return(long i, local_t *l) #define local_inc_return(l) (local_add_return(1, l)) #define local_dec_return(l) (local_sub_return(1, l)) -#define local_cmpxchg(l, o, n) \ - (cmpxchg_local(&((l)->a.counter), (o), (n))) +static inline long local_cmpxchg(local_t *l, long old, long new) +{ + return cmpxchg_local(&l->a.counter, old, new); +} + +static inline bool local_try_cmpxchg(local_t *l, long *old, long new) +{ + typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old; + return try_cmpxchg_local(&l->a.counter, __old, new); +} + /* Always has a lock prefix */ #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 72ca90552b6a..b7126701574c 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -56,6 +56,7 @@ void __init sev_es_init_vc_handling(void); #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL +#define sev_status 0ULL static inline void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) { } diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5d7494631ea9..0da5c227f490 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -9,9 +9,13 @@ #include <linux/bits.h> /* Uprobes on this MM assume 32-bit code */ -#define MM_CONTEXT_UPROBE_IA32 BIT(0) +#define MM_CONTEXT_UPROBE_IA32 0 /* vsyscall page is accessible on this MM */ -#define MM_CONTEXT_HAS_VSYSCALL BIT(1) +#define MM_CONTEXT_HAS_VSYSCALL 1 +/* Do not allow changing LAM mode */ +#define MM_CONTEXT_LOCK_LAM 2 +/* Allow LAM and SVA coexisting */ +#define MM_CONTEXT_FORCE_TAGGED_SVA 3 /* * x86 has arch-specific MMU state beyond what lives in mm_struct. @@ -39,7 +43,15 @@ typedef struct { #endif #ifdef CONFIG_X86_64 - unsigned short flags; + unsigned long flags; +#endif + +#ifdef CONFIG_ADDRESS_MASKING + /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ + unsigned long lam_cr3_mask; + + /* Significant bits of the virtual address. Excludes tag bits. */ + u64 untag_mask; #endif struct mutex lock; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index e01aa74a6de7..1d29dc791f5a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -16,13 +16,6 @@ extern atomic64_t last_mm_ctx_id; -#ifndef CONFIG_PARAVIRT_XXL -static inline void paravirt_activate_mm(struct mm_struct *prev, - struct mm_struct *next) -{ -} -#endif /* !CONFIG_PARAVIRT_XXL */ - #ifdef CONFIG_PERF_EVENTS DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key); DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); @@ -92,6 +85,51 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) } #endif +#ifdef CONFIG_ADDRESS_MASKING +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) +{ + return mm->context.lam_cr3_mask; +} + +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) +{ + mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask; + mm->context.untag_mask = oldmm->context.untag_mask; +} + +#define mm_untag_mask mm_untag_mask +static inline unsigned long mm_untag_mask(struct mm_struct *mm) +{ + return mm->context.untag_mask; +} + +static inline void mm_reset_untag_mask(struct mm_struct *mm) +{ + mm->context.untag_mask = -1UL; +} + +#define arch_pgtable_dma_compat arch_pgtable_dma_compat +static inline bool arch_pgtable_dma_compat(struct mm_struct *mm) +{ + return !mm_lam_cr3_mask(mm) || + test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags); +} +#else + +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) +{ + return 0; +} + +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) +{ +} + +static inline void mm_reset_untag_mask(struct mm_struct *mm) +{ +} +#endif + #define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); @@ -116,6 +154,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif + mm_reset_untag_mask(mm); init_new_context_ldt(mm); return 0; } @@ -135,7 +174,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, #define activate_mm(prev, next) \ do { \ - paravirt_activate_mm((prev), (next)); \ + paravirt_enter_mmap(next); \ switch_mm((prev), (next), NULL); \ } while (0); @@ -168,7 +207,8 @@ static inline void arch_dup_pkeys(struct mm_struct *oldmm, static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); - paravirt_arch_dup_mmap(oldmm, mm); + paravirt_enter_mmap(mm); + dup_lam(oldmm, mm); return ldt_dup_context(oldmm, mm); } @@ -182,7 +222,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm) static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || - !(mm->context.flags & MM_CONTEXT_UPROBE_IA32); + !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags); } #else static inline bool is_64bit_mm(struct mm_struct *mm) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 4c4c0ec3b62e..49bb4f2bd300 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -11,6 +11,18 @@ #include <asm/paravirt.h> #include <asm/mshyperv.h> +/* + * Hyper-V always provides a single IO-APIC at this MMIO address. + * Ideally, the value should be looked up in ACPI tables, but it + * is needed for mapping the IO-APIC early in boot on Confidential + * VMs, before ACPI functions can be used. + */ +#define HV_IOAPIC_BASE_ADDRESS 0xfec00000 + +#define HV_VTL_NORMAL 0x0 +#define HV_VTL_SECURE 0x1 +#define HV_VTL_MGMT 0x2 + union hv_ghcb; DECLARE_STATIC_KEY_FALSE(isolation_type_snp); @@ -21,6 +33,11 @@ typedef int (*hyperv_fill_flush_list_func)( void hyperv_vector_handler(struct pt_regs *regs); +static inline unsigned char hv_get_nmi_reason(void) +{ + return 0; +} + #if IS_ENABLED(CONFIG_HYPERV) extern int hyperv_init_cpuhp; @@ -206,18 +223,19 @@ struct irq_domain *hv_create_pci_msi_domain(void); int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, struct hv_interrupt_entry *entry); int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); -int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible); #ifdef CONFIG_AMD_MEM_ENCRYPT void hv_ghcb_msr_write(u64 msr, u64 value); void hv_ghcb_msr_read(u64 msr, u64 *value); bool hv_ghcb_negotiate_protocol(void); -void hv_ghcb_terminate(unsigned int set, unsigned int reason); +void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); +void hv_vtom_init(void); #else static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} +static inline void hv_vtom_init(void) {} #endif extern bool hv_isolation_type_snp(void); @@ -259,14 +277,15 @@ static inline void hv_set_register(unsigned int reg, u64 value) { } static inline u64 hv_get_register(unsigned int reg) { return 0; } static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { } static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; } -static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages, - bool visible) -{ - return -1; -} #endif /* CONFIG_HYPERV */ +#ifdef CONFIG_HYPERV_VTL_MODE +void __init hv_vtl_init_platform(void); +#else +static inline void __init hv_vtl_init_platform(void) {} +#endif + #include <asm-generic/mshyperv.h> #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ad35355ee43e..3aedae61af4f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -206,6 +206,8 @@ /* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */ #define MSR_INTEGRITY_CAPS 0x000002d9 +#define MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT 2 +#define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT) #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 3ef70e54a858..edb2b0cb8efe 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -194,9 +194,9 @@ * builds. */ .macro ANNOTATE_RETPOLINE_SAFE - .Lannotate_\@: +.Lhere_\@: .pushsection .discard.retpoline_safe - _ASM_PTR .Lannotate_\@ + .long .Lhere_\@ - . .popsection .endm @@ -210,8 +210,8 @@ * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should * eventually turn into it's own annotation. */ -.macro ANNOTATE_UNRET_END -#ifdef CONFIG_DEBUG_ENTRY +.macro VALIDATE_UNRET_END +#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY) ANNOTATE_RETPOLINE_SAFE nop #endif @@ -286,7 +286,7 @@ .macro UNTRAIN_RET #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ defined(CONFIG_CALL_DEPTH_TRACKING) - ANNOTATE_UNRET_END + VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ @@ -297,7 +297,7 @@ .macro UNTRAIN_RET_FROM_CALL #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ defined(CONFIG_CALL_DEPTH_TRACKING) - ANNOTATE_UNRET_END + VALIDATE_UNRET_END ALTERNATIVE_3 "", \ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ @@ -318,7 +318,7 @@ #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ - _ASM_PTR " 999b\n\t" \ + ".long 999b - .\n\t" \ ".popsection\n\t" typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 1343a62106de..46d7e06763c9 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -39,6 +39,12 @@ #define ORC_REG_SP_INDIRECT 9 #define ORC_REG_MAX 15 +#define ORC_TYPE_UNDEFINED 0 +#define ORC_TYPE_END_OF_STACK 1 +#define ORC_TYPE_CALL 2 +#define ORC_TYPE_REGS 3 +#define ORC_TYPE_REGS_PARTIAL 4 + #ifndef __ASSEMBLY__ #include <asm/byteorder.h> @@ -56,16 +62,14 @@ struct orc_entry { #if defined(__LITTLE_ENDIAN_BITFIELD) unsigned sp_reg:4; unsigned bp_reg:4; - unsigned type:2; + unsigned type:3; unsigned signal:1; - unsigned end:1; #elif defined(__BIG_ENDIAN_BITFIELD) unsigned bp_reg:4; unsigned sp_reg:4; unsigned unused:4; - unsigned end:1; unsigned signal:1; - unsigned type:2; + unsigned type:3; #endif } __packed; diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index e9e2c3ba5923..06ef25411d62 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -49,7 +49,7 @@ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) -/* See Documentation/x86/x86_64/mm.rst for a description of the memory map. */ +/* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 52 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cf40e813b3d7..b49778664d2b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -334,16 +334,9 @@ static inline void tss_update_io_bitmap(void) } #endif -static inline void paravirt_activate_mm(struct mm_struct *prev, - struct mm_struct *next) +static inline void paravirt_enter_mmap(struct mm_struct *next) { - PVOP_VCALL2(mmu.activate_mm, prev, next); -} - -static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ - PVOP_VCALL2(mmu.dup_mmap, oldmm, mm); + PVOP_VCALL1(mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) @@ -789,8 +782,7 @@ extern void default_banner(void); #ifndef __ASSEMBLY__ #ifndef CONFIG_PARAVIRT_XXL -static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8c1da419260f..4acbcddddc29 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -164,11 +164,8 @@ struct pv_mmu_ops { unsigned long (*read_cr3)(void); void (*write_cr3)(unsigned long); - /* Hooks for intercepting the creation/use of an mm_struct. */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); + /* Hook for intercepting the creation/use of an mm_struct. */ + void (*enter_mmap)(struct mm_struct *mm); /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); @@ -562,8 +559,14 @@ void paravirt_flush_lazy_mmu(void); void _paravirt_nop(void); void paravirt_BUG(void); -u64 _paravirt_ident_64(u64); unsigned long paravirt_ret0(void); +#ifdef CONFIG_PARAVIRT_XXL +u64 _paravirt_ident_64(u64); +unsigned long pv_native_save_fl(void); +void pv_native_irq_disable(void); +void pv_native_irq_enable(void); +unsigned long pv_native_read_cr2(void); +#endif #define paravirt_nop ((void *)_paravirt_nop) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8fc15ed5e60b..abf09882f58b 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -121,6 +121,9 @@ #define PEBS_DATACFG_LBRS BIT_ULL(3) #define PEBS_DATACFG_LBR_SHIFT 24 +/* Steal the highest bit of pebs_data_cfg for SW usage */ +#define PEBS_UPDATE_DS_SW BIT_ULL(63) + /* * Intel "Architectural Performance Monitoring" CPUID * detection/enumeration details: diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7425f32e5293..15ae4d6ba476 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1097,7 +1097,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); } -#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) +#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 38bf837e3554..38b54b992f32 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -104,7 +104,7 @@ extern unsigned int ptrs_per_p4d; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* - * See Documentation/x86/x86_64/mm.rst for a description of the memory map. + * See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. * * Be very careful vs. KASLR when changing anything here. The KASLR address * range must not overlap with anything except the KASAN shadow area, which diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index a7f3d9100adb..d8cccadc83a6 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -28,6 +28,8 @@ * On systems with SME, one bit (in a variable position!) is stolen to indicate * that the top-level paging structure is encrypted. * + * On systemms with LAM, bits 61 and 62 are used to indicate LAM mode. + * * All of the remaining bits indicate the physical address of the top-level * paging structure. * diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 8d73004e4cac..a1e4fa58b357 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -647,7 +647,11 @@ static inline void spin_lock_prefetch(const void *x) #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else -#define INIT_THREAD { } +extern unsigned long __end_init_task[]; + +#define INIT_THREAD { \ + .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \ +} extern unsigned long KSTK_ESP(struct task_struct *task); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index a336feef0af1..f6a1737c77be 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -59,7 +59,6 @@ extern struct real_mode_header *real_mode_header; extern unsigned char real_mode_blob_end[]; extern unsigned long initial_code; -extern unsigned long initial_gs; extern unsigned long initial_stack; #ifdef CONFIG_AMD_MEM_ENCRYPT extern unsigned long initial_vc_handler; diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index bc5b4d788c08..9177b4354c3f 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -28,7 +28,6 @@ void __noreturn machine_real_restart(unsigned int type); void cpu_emergency_disable_virtualization(void); typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); -void nmi_panic_self_stop(struct pt_regs *regs); void nmi_shootdown_cpus(nmi_shootdown_cb callback); void run_crash_ipi_callback(struct pt_regs *regs); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f37cbff7354c..f3495623ac99 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -125,11 +125,11 @@ void clear_bss(void); #ifdef __i386__ -asmlinkage void __init i386_start_kernel(void); +asmlinkage void __init __noreturn i386_start_kernel(void); #else -asmlinkage void __init x86_64_start_kernel(char *real_mode); -asmlinkage void __init x86_64_start_reservations(char *real_mode_data); +asmlinkage void __init __noreturn x86_64_start_kernel(char *real_mode); +asmlinkage void __init __noreturn x86_64_start_reservations(char *real_mode_data); #endif /* __i386__ */ #endif /* _SETUP */ diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index b63be696b776..0759af9b1acf 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -128,10 +128,6 @@ struct snp_psc_desc { struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; } __packed; -/* Guest message request error codes */ -#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32) -#define SNP_GUEST_REQ_ERR_BUSY BIT_ULL(33) - #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ebc271bb6d8e..13dc2a9d23c1 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -9,6 +9,8 @@ #define __ASM_ENCRYPTED_STATE_H #include <linux/types.h> +#include <linux/sev-guest.h> + #include <asm/insn.h> #include <asm/sev-common.h> #include <asm/bootparam.h> @@ -185,6 +187,9 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } + +struct snp_guest_request_ioctl; + void setup_ghcb(void); void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages); @@ -196,7 +201,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void __init __noreturn snp_abort(void); -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -216,8 +221,7 @@ static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npag static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } -static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, - unsigned long *fw_err) +static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) { return -ENOTTY; } diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 4a03993e0785..2631e01f6e0f 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -7,8 +7,6 @@ #define TDX_HYPERCALL_STANDARD 0 -#define TDX_HCALL_HAS_OUTPUT BIT(0) - #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " @@ -36,7 +34,8 @@ struct tdx_hypercall_args { }; /* Used to request services from the VMM */ -u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); +u64 __tdx_hypercall(struct tdx_hypercall_args *args); +u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); /* Called from __tdx_hypercall() for unrecoverable failure */ void __tdx_hypercall_failed(void); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index b4dbb20dab1a..4e91054c84be 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -93,12 +93,13 @@ static inline void __cpu_die(unsigned int cpu) smp_ops.cpu_die(cpu); } -static inline void play_dead(void) +static inline void __noreturn play_dead(void) { smp_ops.play_dead(); + BUG(); } -static inline void smp_send_reschedule(int cpu) +static inline void arch_smp_send_reschedule(int cpu) { smp_ops.smp_send_reschedule(cpu); } @@ -124,7 +125,7 @@ int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); int common_cpu_die(unsigned int cpu); void native_cpu_die(unsigned int cpu); -void hlt_play_dead(void); +void __noreturn hlt_play_dead(void); void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); @@ -199,5 +200,8 @@ extern void nmi_selftest(void); #define nmi_selftest() do { } while (0) #endif -#endif /* __ASSEMBLY__ */ +extern unsigned int smpboot_control; + +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index c1e14cee0722..857d364b9888 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -15,24 +15,18 @@ #endif #define __HAVE_ARCH_MEMCPY 1 -#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) -#undef memcpy -#define memcpy __msan_memcpy -#else extern void *memcpy(void *to, const void *from, size_t len); -#endif extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET -#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) -extern void *__msan_memset(void *s, int c, size_t n); -#undef memset -#define memset __msan_memset -#else void *memset(void *s, int c, size_t n); -#endif void *__memset(void *s, int c, size_t n); +/* + * KMSAN needs to instrument as much code as possible. Use C versions of + * memsetXX() from lib/string.c under KMSAN. + */ +#if !defined(CONFIG_KMSAN) #define __HAVE_ARCH_MEMSET16 static inline void *memset16(uint16_t *s, uint16_t v, size_t n) { @@ -68,15 +62,10 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) : "memory"); return s; } +#endif #define __HAVE_ARCH_MEMMOVE -#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) -#undef memmove -void *__msan_memmove(void *dest, const void *src, size_t len); -#define memmove __msan_memmove -#else void *memmove(void *dest, const void *src, size_t count); -#endif void *__memmove(void *dest, const void *src, size_t count); int memcmp(const void *cs, const void *ct, size_t count); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 770dcf75eaa9..e7c7379d6ac7 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -183,6 +183,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_GIF_SHIFT 9 #define V_GIF_MASK (1 << V_GIF_SHIFT) +#define V_NMI_PENDING_SHIFT 11 +#define V_NMI_PENDING_MASK (1 << V_NMI_PENDING_SHIFT) + +#define V_NMI_BLOCKING_SHIFT 12 +#define V_NMI_BLOCKING_MASK (1 << V_NMI_BLOCKING_SHIFT) + #define V_INTR_PRIO_SHIFT 16 #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) @@ -197,6 +203,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_GIF_ENABLE_SHIFT 25 #define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) +#define V_NMI_ENABLE_SHIFT 26 +#define V_NMI_ENABLE_MASK (1 << V_NMI_ENABLE_SHIFT) + #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) @@ -278,7 +287,6 @@ static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL struct vmcb_seg { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cda3118f3b27..75bfaa421030 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_TLBFLUSH_H #define _ASM_X86_TLBFLUSH_H -#include <linux/mm.h> +#include <linux/mm_types.h> #include <linux/sched.h> #include <asm/processor.h> @@ -12,6 +12,7 @@ #include <asm/invpcid.h> #include <asm/pti.h> #include <asm/processor-flags.h> +#include <asm/pgtable.h> void __flush_tlb_all(void); @@ -53,6 +54,15 @@ static inline void cr4_clear_bits(unsigned long mask) local_irq_restore(flags); } +#ifdef CONFIG_ADDRESS_MASKING +DECLARE_PER_CPU(u64, tlbstate_untag_mask); + +static inline u64 current_untag_mask(void) +{ + return this_cpu_read(tlbstate_untag_mask); +} +#endif + #ifndef MODULE /* * 6 because 6 should be plenty and struct tlb_state will fit in two cache @@ -101,6 +111,16 @@ struct tlb_state { */ bool invalidate_other; +#ifdef CONFIG_ADDRESS_MASKING + /* + * Active LAM mode. + * + * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM + * disabled. + */ + u8 lam; +#endif + /* * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate * the corresponding user PCID needs a flush next time we @@ -357,6 +377,32 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) } #define huge_pmd_needs_flush huge_pmd_needs_flush +#ifdef CONFIG_ADDRESS_MASKING +static inline u64 tlbstate_lam_cr3_mask(void) +{ + u64 lam = this_cpu_read(cpu_tlbstate.lam); + + return lam << X86_CR3_LAM_U57_BIT; +} + +static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +{ + this_cpu_write(cpu_tlbstate.lam, + mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT); + this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask); +} + +#else + +static inline u64 tlbstate_lam_cr3_mask(void) +{ + return 0; +} + +static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +{ +} +#endif #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1cc756eafa44..8bae40a66282 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -7,43 +7,21 @@ #include <linux/compiler.h> #include <linux/instrumented.h> #include <linux/kasan-checks.h> +#include <linux/mm_types.h> #include <linux/string.h> +#include <linux/mmap_lock.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> +#include <asm/tlbflush.h> -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline bool pagefault_disabled(void); -# define WARN_ON_IN_IRQ() \ - WARN_ON_ONCE(!in_task() && !pagefault_disabled()) +#ifdef CONFIG_X86_32 +# include <asm/uaccess_32.h> #else -# define WARN_ON_IN_IRQ() +# include <asm/uaccess_64.h> #endif -/** - * access_ok - Checks if a user space pointer is valid - * @addr: User space pointer to start of block to check - * @size: Size of block to check - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Checks if a pointer to a block of memory in user space is valid. - * - * Note that, depending on architecture, this function probably just - * checks that the pointer is in the user space range - after calling - * this function, memory access functions may still return -EFAULT. - * - * Return: true (nonzero) if the memory block may be valid, false (zero) - * if it is definitely invalid. - */ -#define access_ok(addr, size) \ -({ \ - WARN_ON_IN_IRQ(); \ - likely(__access_ok(addr, size)); \ -}) - #include <asm-generic/access_ok.h> extern int __get_user_1(void); @@ -532,14 +510,6 @@ extern struct movsl_mask { #define ARCH_HAS_NOCACHE_UACCESS 1 -#ifdef CONFIG_X86_32 -unsigned long __must_check clear_user(void __user *mem, unsigned long len); -unsigned long __must_check __clear_user(void __user *mem, unsigned long len); -# include <asm/uaccess_32.h> -#else -# include <asm/uaccess_64.h> -#endif - /* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big fat warning: you have to not only do the access_ok() diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 388a40660c7b..40379a1adbb8 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -33,4 +33,7 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, return __copy_from_user_ll_nocache_nozero(to, from, n); } +unsigned long __must_check clear_user(void __user *mem, unsigned long len); +unsigned long __must_check __clear_user(void __user *mem, unsigned long len); + #endif /* _ASM_X86_UACCESS_32_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index d13d71af5cf6..81b826d3b753 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -12,38 +12,113 @@ #include <asm/cpufeatures.h> #include <asm/page.h> +#ifdef CONFIG_ADDRESS_MASKING +/* + * Mask out tag bits from the address. + */ +static inline unsigned long __untagged_addr(unsigned long addr) +{ + /* + * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation + * in alternative instructions. The relocation gets wrong when gets + * copied to the target place. + */ + asm (ALTERNATIVE("", + "and %%gs:tlbstate_untag_mask, %[addr]\n\t", X86_FEATURE_LAM) + : [addr] "+r" (addr) : "m" (tlbstate_untag_mask)); + + return addr; +} + +#define untagged_addr(addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr(__addr); \ +}) + +static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, + unsigned long addr) +{ + mmap_assert_locked(mm); + return addr & (mm)->context.untag_mask; +} + +#define untagged_addr_remote(mm, addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \ +}) + +#endif + +/* + * The virtual address space space is logically divided into a kernel + * half and a user half. When cast to a signed type, user pointers + * are positive and kernel pointers are negative. + */ +#define valid_user_address(x) ((long)(x) >= 0) + +/* + * User pointers can have tag bits on x86-64. This scheme tolerates + * arbitrary values in those bits rather then masking them off. + * + * Enforce two rules: + * 1. 'ptr' must be in the user half of the address space + * 2. 'ptr+size' must not overflow into kernel addresses + * + * Note that addresses around the sign change are not valid addresses, + * and will GP-fault even with LAM enabled if the sign bit is set (see + * "CR3.LAM_SUP" that can narrow the canonicality check if we ever + * enable it, but not remove it entirely). + * + * So the "overflow into kernel addresses" does not imply some sudden + * exact boundary at the sign bit, and we can allow a lot of slop on the + * size check. + * + * In fact, we could probably remove the size check entirely, since + * any kernel accesses will be in increasing address order starting + * at 'ptr', and even if the end might be in kernel space, we'll + * hit the GP faults for non-canonical accesses before we ever get + * there. + * + * That's a separate optimization, for now just handle the small + * constant case. + */ +static inline bool __access_ok(const void __user *ptr, unsigned long size) +{ + if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) { + return valid_user_address(ptr); + } else { + unsigned long sum = size + (unsigned long)ptr; + return valid_user_address(sum) && sum >= (unsigned long)ptr; + } +} +#define __access_ok __access_ok + /* * Copy To/From Userspace */ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_unrolled(void *to, const void *from, unsigned len); +rep_movs_alternative(void *to, const void *from, unsigned len); static __always_inline __must_check unsigned long -copy_user_generic(void *to, const void *from, unsigned len) +copy_user_generic(void *to, const void *from, unsigned long len) { - unsigned ret; - + stac(); /* - * If CPU has ERMS feature, use copy_user_enhanced_fast_string. - * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. - * Otherwise, use copy_user_generic_unrolled. + * If CPU has FSRM feature, use 'rep movs'. + * Otherwise, use rep_movs_alternative. */ - alternative_call_2(copy_user_generic_unrolled, - copy_user_generic_string, - X86_FEATURE_REP_GOOD, - copy_user_enhanced_fast_string, - X86_FEATURE_ERMS, - ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), - "=d" (len)), - "1" (to), "2" (from), "3" (len) - : "memory", "rcx", "r8", "r9", "r10", "r11"); - return ret; + asm volatile( + "1:\n\t" + ALTERNATIVE("rep movsb", + "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM)) + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT + : : "memory", "rax", "r8", "r9", "r10", "r11"); + clac(); + return len; } static __always_inline __must_check unsigned long @@ -58,19 +133,19 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -extern long __copy_user_nocache(void *dst, const void __user *src, - unsigned size, int zerorest); - +extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size); extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); -extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, - size_t len); static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { + long ret; kasan_check_write(dst, size); - return __copy_user_nocache(dst, src, size, 0); + stac(); + ret = __copy_user_nocache(dst, src, size); + clac(); + return ret; } static inline int @@ -85,11 +160,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) */ __must_check unsigned long -clear_user_original(void __user *addr, unsigned long len); -__must_check unsigned long -clear_user_rep_good(void __user *addr, unsigned long len); -__must_check unsigned long -clear_user_erms(void __user *addr, unsigned long len); +rep_stos_alternative(void __user *addr, unsigned long len); static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) { @@ -102,16 +173,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr */ asm volatile( "1:\n\t" - ALTERNATIVE_3("rep stosb", - "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM), - "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS), - "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD)) + ALTERNATIVE("rep stosb", + "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS)) "2:\n" _ASM_EXTABLE_UA(1b, 2b) : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT - : "a" (0) - /* rep_good clobbers %rdx */ - : "rdx"); + : "a" (0)); clac(); @@ -120,7 +187,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr static __always_inline unsigned long clear_user(void __user *to, unsigned long n) { - if (access_ok(to, n)) + if (__access_ok(to, n)) return __clear_user(to, n); return n; } diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index e7c71750b309..01cb9692b160 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -7,12 +7,17 @@ #ifdef __ASSEMBLY__ -.macro UNWIND_HINT_EMPTY - UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1 +.macro UNWIND_HINT_END_OF_STACK + UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK +.endm + +.macro UNWIND_HINT_UNDEFINED + UNWIND_HINT type=UNWIND_HINT_TYPE_UNDEFINED .endm .macro UNWIND_HINT_ENTRY - UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1 + VALIDATE_UNRET_BEGIN + UNWIND_HINT_END_OF_STACK .endm .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1 @@ -52,6 +57,11 @@ UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal .endm +.macro UNWIND_HINT_IRET_ENTRY base=%rsp offset=0 signal=1 + VALIDATE_UNRET_BEGIN + UNWIND_HINT_IRET_REGS base=\base offset=\offset signal=\signal +.endm + .macro UNWIND_HINT_FUNC UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC .endm @@ -67,7 +77,7 @@ #else #define UNWIND_HINT_FUNC \ - UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0, 0) + UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0) #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c1c8c581759d..88085f369ff6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -259,11 +259,15 @@ struct x86_legacy_features { * VMMCALL under SEV-ES. Needs to return 'false' * if the checks fail. Called from the #VC * exception handler. + * @is_private_mmio: For CoCo VMs, must map MMIO address as private. + * Used when device is emulated by a paravisor + * layer in the VM context. */ struct x86_hyper_runtime { void (*pin_vcpu)(int cpu); void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs); bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs); + bool (*is_private_mmio)(u64 addr); }; /** @@ -326,5 +330,7 @@ extern void x86_init_uint_noop(unsigned int unused); extern bool bool_x86_init_noop(void); extern void x86_op_int_noop(int cpu); extern bool x86_pnpbios_disabled(void); +extern int set_rtc_noop(const struct timespec64 *now); +extern void get_rtc_noop(struct timespec64 *now); #endif diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7f467fe05d42..1a6a1f987949 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -559,4 +559,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ +/* x86-specific KVM_EXIT_HYPERCALL flags. */ +#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 500b96e71f18..e8d7ebbca1a4 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -16,8 +16,16 @@ #define ARCH_GET_XCOMP_GUEST_PERM 0x1024 #define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 +#define ARCH_XCOMP_TILECFG 17 +#define ARCH_XCOMP_TILEDATA 18 + #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_GET_UNTAG_MASK 0x4001 +#define ARCH_ENABLE_TAGGED_ADDR 0x4002 +#define ARCH_GET_MAX_TAG_BITS 0x4003 +#define ARCH_FORCE_TAGGED_SVA 0x4004 + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index c47cc7f2feeb..d898432947ff 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -82,6 +82,10 @@ #define X86_CR3_PCID_BITS 12 #define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) +#define X86_CR3_LAM_U57_BIT 61 /* Activate LAM for userspace, 62:57 bits masked */ +#define X86_CR3_LAM_U57 _BITULL(X86_CR3_LAM_U57_BIT) +#define X86_CR3_LAM_U48_BIT 62 /* Activate LAM for userspace, 62:48 bits masked */ +#define X86_CR3_LAM_U48 _BITULL(X86_CR3_LAM_U48_BIT) #define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ #define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) @@ -132,6 +136,8 @@ #define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT) #define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */ #define X86_CR4_CET _BITUL(X86_CR4_CET_BIT) +#define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */ +#define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) /* * x86-64 Task Priority Register, CR8 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1c38174b5f01..21b542a6866c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -146,7 +146,11 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) pr_debug("Local APIC address 0x%08x\n", madt->address); } - if (madt->header.revision >= 5) + + /* ACPI 6.3 and newer support the online capable bit. */ + if (acpi_gbl_FADT.header.revision > 6 || + (acpi_gbl_FADT.header.revision == 6 && + acpi_gbl_FADT.minor_revision >= 3)) acpi_support_online_capable = true; default_acpi_madt_oem_check(madt->header.oem_id, @@ -193,7 +197,8 @@ static bool __init acpi_is_processor_usable(u32 lapic_flags) if (lapic_flags & ACPI_MADT_ENABLED) return true; - if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) + if (!acpi_support_online_capable || + (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) return true; return false; @@ -1853,13 +1858,18 @@ early_param("acpi_sci", setup_acpi_sci); int __acpi_acquire_global_lock(unsigned int *lock) { - unsigned int old, new; + unsigned int old, new, val; old = READ_ONCE(*lock); do { - new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); + val = (old >> 1) & 0x1; + new = (old & ~0x3) + 2 + val; } while (!try_cmpxchg(lock, &old, new)); - return ((new & 0x3) < 3) ? -1 : 0; + + if (val) + return 0; + + return -1; } int __acpi_release_global_lock(unsigned int *lock) diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3b7f4cdbf2e0..1328c221af30 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -111,13 +111,26 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP - initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); - early_gdt_descr.address = - (unsigned long)get_cpu_gdt_rw(smp_processor_id()); - initial_gs = per_cpu_offset(smp_processor_id()); + /* + * As each CPU starts up, it will find its own stack pointer + * from its current_task->thread.sp. Typically that will be + * the idle thread for a newly-started AP, or even the boot + * CPU which will find it set to &init_task in the static + * per-cpu data. + * + * Make the resuming CPU use the temporary stack at startup + * by setting current->thread.sp to point to that. The true + * %rsp will be restored with the rest of the CPU context, + * by do_suspend_lowlevel(). And unwinders don't care about + * the abuse of ->thread.sp because it's a dead variable + * while the thread is running on the CPU anyway; the true + * value is in the actual %rsp register. + */ + current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack); + smpboot_control = smp_processor_id(); #endif initial_code = (unsigned long)wakeup_long64; - saved_magic = 0x123456789abcdef0L; + saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ /* diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4266b64631a4..7e331e8f3692 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -36,6 +36,7 @@ #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e #define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 +#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -79,6 +80,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, {} }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 20d9a604da7c..770557110051 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -422,10 +422,9 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new) if (vector && !eilvt_entry_is_changeable(vector, new)) /* may not change if vectors are different */ return rsvd; - rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); - } while (rsvd != new); + } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new)); - rsvd &= ~APIC_EILVT_MASKED; + rsvd = new & ~APIC_EILVT_MASKED; if (rsvd && rsvd != vector) pr_info("LVT offset %d assigned for vector 0x%02x\n", offset, rsvd); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1f83b052bb74..4241dc243aa8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -66,6 +66,7 @@ #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/pgtable.h> +#include <asm/x86_init.h> #define for_each_ioapic(idx) \ for ((idx) = 0; (idx) < nr_ioapics; (idx)++) @@ -2477,17 +2478,21 @@ static int io_apic_get_redir_entries(int ioapic) unsigned int arch_dynirq_lower_bound(unsigned int from) { + unsigned int ret; + /* * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use * gsi_top if ioapic_dynirq_base hasn't been initialized yet. */ - if (!ioapic_initialized) - return gsi_top; + ret = ioapic_dynirq_base ? : gsi_top; + /* - * For DT enabled machines ioapic_dynirq_base is irrelevant and not - * updated. So simply return @from if ioapic_dynirq_base == 0. + * For DT enabled machines ioapic_dynirq_base is irrelevant and + * always 0. gsi_top can be 0 if there is no IO/APIC registered. + * 0 is an invalid interrupt number for dynamic allocations. Return + * @from instead. */ - return ioapic_dynirq_base ? : from; + return ret ? : from; } #ifdef CONFIG_X86_32 @@ -2680,10 +2685,15 @@ static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys) pgprot_t flags = FIXMAP_PAGE_NOCACHE; /* - * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot + * Ensure fixmaps for IO-APIC MMIO respect memory encryption pgprot * bits, just like normal ioremap(): */ - flags = pgprot_decrypted(flags); + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { + if (x86_platform.hyper.is_private_mmio(phys)) + flags = pgprot_encrypted(flags); + else + flags = pgprot_decrypted(flags); + } __set_fixmap(idx, phys, flags); } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e696e22d0531..b2b2b7f3e03f 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -9,11 +9,7 @@ #include "local.h" -struct cluster_mask { - unsigned int clusterid; - int node; - struct cpumask mask; -}; +#define apic_cluster(apicid) ((apicid) >> 4) /* * __x2apic_send_IPI_mask() possibly needs to read @@ -23,8 +19,7 @@ struct cluster_mask { static u32 *x86_cpu_to_logical_apicid __read_mostly; static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); -static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks); -static struct cluster_mask *cluster_hotplug_mask; +static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { @@ -60,10 +55,10 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) /* Collapse cpus in a cluster so a single IPI per cluster is sent */ for_each_cpu(cpu, tmpmsk) { - struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); + struct cpumask *cmsk = per_cpu(cluster_masks, cpu); dest = 0; - for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) + for_each_cpu_and(clustercpu, tmpmsk, cmsk) dest |= x86_cpu_to_logical_apicid[clustercpu]; if (!dest) @@ -71,7 +66,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ - cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); + cpumask_andnot(tmpmsk, tmpmsk, cmsk); } local_irq_restore(flags); @@ -105,55 +100,98 @@ static u32 x2apic_calc_apicid(unsigned int cpu) static void init_x2apic_ldr(void) { - struct cluster_mask *cmsk = this_cpu_read(cluster_masks); - u32 cluster, apicid = apic_read(APIC_LDR); - unsigned int cpu; + struct cpumask *cmsk = this_cpu_read(cluster_masks); - x86_cpu_to_logical_apicid[smp_processor_id()] = apicid; + BUG_ON(!cmsk); - if (cmsk) - goto update; - - cluster = apicid >> 16; - for_each_online_cpu(cpu) { - cmsk = per_cpu(cluster_masks, cpu); - /* Matching cluster found. Link and update it. */ - if (cmsk && cmsk->clusterid == cluster) - goto update; + cpumask_set_cpu(smp_processor_id(), cmsk); +} + +/* + * As an optimisation during boot, set the cluster_mask for all present + * CPUs at once, to prevent each of them having to iterate over the others + * to find the existing cluster_mask. + */ +static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster) +{ + int cpu_i; + + for_each_present_cpu(cpu_i) { + struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i); + u32 apicid = apic->cpu_present_to_apicid(cpu_i); + + if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster) + continue; + + if (WARN_ON_ONCE(*cpu_cmsk == cmsk)) + continue; + + BUG_ON(*cpu_cmsk); + *cpu_cmsk = cmsk; } - cmsk = cluster_hotplug_mask; - cmsk->clusterid = cluster; - cluster_hotplug_mask = NULL; -update: - this_cpu_write(cluster_masks, cmsk); - cpumask_set_cpu(smp_processor_id(), &cmsk->mask); } -static int alloc_clustermask(unsigned int cpu, int node) +static int alloc_clustermask(unsigned int cpu, u32 cluster, int node) { + struct cpumask *cmsk = NULL; + unsigned int cpu_i; + + /* + * At boot time, the CPU present mask is stable. The cluster mask is + * allocated for the first CPU in the cluster and propagated to all + * present siblings in the cluster. If the cluster mask is already set + * on entry to this function for a given CPU, there is nothing to do. + */ if (per_cpu(cluster_masks, cpu)) return 0; + + if (system_state < SYSTEM_RUNNING) + goto alloc; + /* - * If a hotplug spare mask exists, check whether it's on the right - * node. If not, free it and allocate a new one. + * On post boot hotplug for a CPU which was not present at boot time, + * iterate over all possible CPUs (even those which are not present + * any more) to find any existing cluster mask. */ - if (cluster_hotplug_mask) { - if (cluster_hotplug_mask->node == node) - return 0; - kfree(cluster_hotplug_mask); + for_each_possible_cpu(cpu_i) { + u32 apicid = apic->cpu_present_to_apicid(cpu_i); + + if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) { + cmsk = per_cpu(cluster_masks, cpu_i); + /* + * If the cluster is already initialized, just store + * the mask and return. There's no need to propagate. + */ + if (cmsk) { + per_cpu(cluster_masks, cpu) = cmsk; + return 0; + } + } } - - cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask), - GFP_KERNEL, node); - if (!cluster_hotplug_mask) + /* + * No CPU in the cluster has ever been initialized, so fall through to + * the boot time code which will also populate the cluster mask for any + * other CPU in the cluster which is (now) present. + */ +alloc: + cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node); + if (!cmsk) return -ENOMEM; - cluster_hotplug_mask->node = node; + per_cpu(cluster_masks, cpu) = cmsk; + prefill_clustermask(cmsk, cpu, cluster); + return 0; } static int x2apic_prepare_cpu(unsigned int cpu) { - if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0) + u32 phys_apicid = apic->cpu_present_to_apicid(cpu); + u32 cluster = apic_cluster(phys_apicid); + u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf)); + + x86_cpu_to_logical_apicid[cpu] = logical_apicid; + + if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0) return -ENOMEM; if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) return -ENOMEM; @@ -162,10 +200,10 @@ static int x2apic_prepare_cpu(unsigned int cpu) static int x2apic_dead_cpu(unsigned int dead_cpu) { - struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); + struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu); if (cmsk) - cpumask_clear_cpu(dead_cpu, &cmsk->mask); + cpumask_clear_cpu(dead_cpu, cmsk); free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 283dcd2f62c8..dc3576303f1a 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -115,6 +115,7 @@ static void __used common(void) OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); + OFFSET(X86_current_task, pcpu_hot, current_task); #ifdef CONFIG_CALL_DEPTH_TRACKING OFFSET(X86_call_depth, pcpu_hot, call_depth); #endif diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index ffea98f9064b..22ab13966427 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -330,8 +330,8 @@ void noinline callthunks_patch_module_calls(struct callthunk_sites *cs, struct module *mod) { struct core_text ct = { - .base = (unsigned long)mod->core_layout.base, - .end = (unsigned long)mod->core_layout.base + mod->core_layout.size, + .base = (unsigned long)mod->mem[MOD_TEXT].base, + .end = (unsigned long)mod->mem[MOD_TEXT].base + mod->mem[MOD_TEXT].size, .name = mod->name, }; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 95cdd08c4cbb..571abf808ea3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + /* AMD FSRM also implies FSRS */ + if (cpu_has(c, X86_FEATURE_FSRM)) + set_cpu_cap(c, X86_FEATURE_FSRS); + /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); @@ -1005,6 +1009,17 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); + + /* + * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up + * using the trampoline code and as part of it, MSR_EFER gets prepared there in + * order to be replicated onto them. Regardless, set it here again, if not set, + * to protect against any future refactoring/code reorganization which might + * miss setting this important bit. + */ + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + cpu_has(c, X86_FEATURE_AUTOIBRS)) + WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index f9d060e71c3e..182af64387d0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -784,8 +784,7 @@ static int __init nospectre_v1_cmdline(char *str) } early_param("nospectre_v1", nospectre_v1_cmdline); -static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = - SPECTRE_V2_NONE; +enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt @@ -1133,13 +1132,6 @@ spectre_v2_parse_user_cmdline(void) return SPECTRE_V2_USER_CMD_AUTO; } -static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) -{ - return mode == SPECTRE_V2_EIBRS || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_EIBRS_LFENCE; -} - static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8cd4126d8253..80710a68ef7d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -121,6 +121,7 @@ static const struct x86_cpu_id ppin_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 57a5349e6954..f97b0fe13da8 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -83,4 +83,12 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); +extern enum spectre_v2_mitigation spectre_v2_enabled; + +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; +} #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 291d4167fab8..1c4639588ff9 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1177,7 +1177,7 @@ static const struct { static struct ratelimit_state bld_ratelimit; static unsigned int sysctl_sld_mitigate = 1; -static DEFINE_SEMAPHORE(buslock_sem); +static DEFINE_SEMAPHORE(buslock_sem, 1); #ifdef CONFIG_PROC_SYSCTL static struct ctl_table sld_sysctls[] = { @@ -1451,31 +1451,13 @@ void handle_bus_lock(struct pt_regs *regs) } /* - * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should - * only be trusted if it is confirmed that a CPU model implements a - * specific feature at a particular bit position. - * - * The possible driver data field values: - * - * - 0: CPU models that are known to have the per-core split-lock detection - * feature even though they do not enumerate IA32_CORE_CAPABILITIES. - * - * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use - * bit 5 to enumerate the per-core split-lock detection feature. + * CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. */ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), {} }; @@ -1487,24 +1469,27 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; + /* Check for CPUs that have support but do not enumerate it: */ m = x86_match_cpu(split_lock_cpu_ids); - if (!m) - return; + if (m) + goto supported; - switch (m->driver_data) { - case 0: - break; - case 1: - if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) - return; - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)) - return; - break; - default: + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) return; - } + /* + * Not all bits in MSR_IA32_CORE_CAPS are architectural, but + * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set + * it have split lock detection. + */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + goto supported; + + /* CPU is not in the model list and does not have the MSR bit: */ + return; + +supported: cpu_model_supports_sld = true; __split_lock_setup(); } diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 23c5072fbbb7..0b971f974096 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -235,10 +235,10 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); * A list of the banks enabled on each logical CPU. Controls which respective * descriptors to initialize later in mce_threshold_create_device(). */ -static DEFINE_PER_CPU(unsigned int, bank_map); +static DEFINE_PER_CPU(u64, bank_map); /* Map of banks that have more than MCA_MISC0 available. */ -static DEFINE_PER_CPU(u32, smca_misc_banks_map); +static DEFINE_PER_CPU(u64, smca_misc_banks_map); static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -267,7 +267,7 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) return; if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT(bank); + per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); } @@ -530,7 +530,7 @@ static u32 smca_get_block_address(unsigned int bank, unsigned int block, if (!block) return MSR_AMD64_SMCA_MCx_MISC(bank); - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank))) + if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) return 0; return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); @@ -574,7 +574,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int new; if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); + per_cpu(bank_map, cpu) |= BIT_ULL(bank); memset(&b, 0, sizeof(b)); b.cpu = cpu; @@ -878,7 +878,7 @@ static void amd_threshold_interrupt(void) return; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) + if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; first_block = bp[bank]->blocks; @@ -1029,7 +1029,7 @@ static const struct sysfs_ops threshold_ops = { static void threshold_block_release(struct kobject *kobj); -static struct kobj_type threshold_ktype = { +static const struct kobj_type threshold_ktype = { .sysfs_ops = &threshold_ops, .default_groups = default_groups, .release = threshold_block_release, @@ -1356,7 +1356,7 @@ int mce_threshold_create_device(unsigned int cpu) return -ENOMEM; for (bank = 0; bank < numbanks; ++bank) { - if (!(this_cpu_read(bank_map) & (1 << bank))) + if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; err = threshold_create_bank(bp, cpu, bank); if (err) { diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 91a415553c27..d2412ce2d312 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -244,11 +244,11 @@ noinstr void pentium_machine_check(struct pt_regs *regs); noinstr void winchip_machine_check(struct pt_regs *regs); static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } #else -static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void enable_p5_mce(void) {} -static inline void pentium_machine_check(struct pt_regs *regs) {} -static inline void winchip_machine_check(struct pt_regs *regs) {} +static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static __always_inline void enable_p5_mce(void) {} +static __always_inline void pentium_machine_check(struct pt_regs *regs) {} +static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrl(u32 msr); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9eb457b10341..f5fdeb1e3606 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -61,7 +61,7 @@ static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; /* * Microcode patch container file is prepended to the initrd in cpio - * format. See Documentation/x86/microcode.rst + * format. See Documentation/arch/x86/microcode.rst */ static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7a329e561354..3afcf3de0dd4 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -632,6 +632,7 @@ static const struct attribute_group cpu_root_microcode_group = { static int __init microcode_init(void) { + struct device *dev_root; struct cpuinfo_x86 *c = &boot_cpu_data; int error; @@ -652,10 +653,14 @@ static int __init microcode_init(void) if (IS_ERR(microcode_pdev)) return PTR_ERR(microcode_pdev); - error = sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpu_root_microcode_group); - if (error) { - pr_err("Error creating microcode group!\n"); - goto out_pdev; + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + error = sysfs_create_group(&dev_root->kobj, &cpu_root_microcode_group); + put_device(dev_root); + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_pdev; + } } /* Do per-CPU setup */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index f36dc2f796c5..c7969e806c64 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,7 +18,6 @@ #include <linux/kexec.h> #include <linux/i8253.h> #include <linux/random.h> -#include <linux/swiotlb.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> @@ -33,7 +32,6 @@ #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> #include <asm/numa.h> -#include <asm/coco.h> /* Is Linux running as the root partition? */ bool hv_root_partition; @@ -250,11 +248,6 @@ static uint32_t __init ms_hyperv_platform(void) return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; } -static unsigned char hv_get_nmi_reason(void) -{ - return 0; -} - #ifdef CONFIG_X86_LOCAL_APIC /* * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes @@ -358,12 +351,16 @@ static void __init ms_hyperv_init_platform(void) * To mirror what Windows does we should extract CPU management * features and use the ReservedIdentityBit to detect if Linux is the * root partition. But that requires negotiating CPU management - * interface (a process to be finalized). + * interface (a process to be finalized). For now, use the privilege + * flag as the indicator for running as root. * - * For now, use the privilege flag as the indicator for running as - * root. + * Hyper-V should never specify running as root and as a Confidential + * VM. But to protect against a compromised/malicious Hyper-V trying + * to exploit root behavior to expose Confidential VM memory, ignore + * the root partition setting if also a Confidential VM. */ - if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) { + if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && + !(ms_hyperv.priv_high & HV_ISOLATION)) { hv_root_partition = true; pr_info("Hyper-V: running as root partition\n"); } @@ -397,23 +394,16 @@ static void __init ms_hyperv_init_platform(void) if (ms_hyperv.priv_high & HV_ISOLATION) { ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); - ms_hyperv.shared_gpa_boundary = - BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); + + if (ms_hyperv.shared_gpa_boundary_active) + ms_hyperv.shared_gpa_boundary = + BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); - if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) static_branch_enable(&isolation_type_snp); -#ifdef CONFIG_SWIOTLB - swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary; -#endif - } - /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { - if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE) - cc_set_vendor(CC_VENDOR_HYPERV); - } } if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { @@ -482,6 +472,9 @@ static void __init ms_hyperv_init_platform(void) i8253_clear_counter_on_shutdown = false; #if IS_ENABLED(CONFIG_HYPERV) + if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || + (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)) + hv_vtom_init(); /* * Setup the hook to get control post apic initialization. */ @@ -521,6 +514,7 @@ static void __init ms_hyperv_init_platform(void) /* Register Hyper-V specific clocksource */ hv_init_clocksource(); + hv_vtl_init_platform(); #endif /* * TSC should be marked as unstable only after Hyper-V diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7fe51488e136..ded1fc7cb7cb 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -76,7 +76,7 @@ unsigned int resctrl_rmid_realloc_limit; #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) /* - * The correction factor table is documented in Documentation/x86/resctrl.rst. + * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied * by the correction factor. * @@ -383,41 +383,36 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } +static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, + enum resctrl_event_id evtid) +{ + switch (evtid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &d->mbm_total[rmid]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &d->mbm_local[rmid]; + default: + return NULL; + } +} + static int __mon_event_count(u32 rmid, struct rmid_read *rr) { struct mbm_state *m; u64 tval = 0; - if (rr->first) + if (rr->first) { resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); + m = get_mbm_state(rr->d, rmid, rr->evtid); + if (m) + memset(m, 0, sizeof(struct mbm_state)); + return 0; + } rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); if (rr->err) return rr->err; - switch (rr->evtid) { - case QOS_L3_OCCUP_EVENT_ID: - rr->val += tval; - return 0; - case QOS_L3_MBM_TOTAL_EVENT_ID: - m = &rr->d->mbm_total[rmid]; - break; - case QOS_L3_MBM_LOCAL_EVENT_ID: - m = &rr->d->mbm_local[rmid]; - break; - default: - /* - * Code would never reach here because an invalid - * event id would fail in resctrl_arch_rmid_read(). - */ - return -EINVAL; - } - - if (rr->first) { - memset(m, 0, sizeof(struct mbm_state)); - return 0; - } - rr->val += tval; return 0; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 524f8ff3e69c..458cb7419502 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1580,7 +1580,7 @@ int rdt_pseudo_lock_init(void) pseudo_lock_major = ret; - pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock"); + pseudo_lock_class = class_create("pseudo_lock"); if (IS_ERR(pseudo_lock_class)) { ret = PTR_ERR(pseudo_lock_class); unregister_chrdev(pseudo_lock_major, "pseudo_lock"); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index e5a37b6e9aa5..166692f2d501 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -892,20 +892,19 @@ static struct miscdevice sgx_dev_provision = { int sgx_set_attribute(unsigned long *allowed_attributes, unsigned int attribute_fd) { - struct file *file; + struct fd f = fdget(attribute_fd); - file = fget(attribute_fd); - if (!file) + if (!f.file) return -EINVAL; - if (file->f_op != &sgx_provision_fops) { - fput(file); + if (f.file->f_op != &sgx_provision_fops) { + fdput(f); return -EINVAL; } *allowed_attributes |= SGX_ATTR_PROVISIONKEY; - fput(file); + fdput(f); return 0; } EXPORT_SYMBOL_GPL(sgx_set_attribute); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 0f2020653fba..d2dad21259a8 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -15,7 +15,7 @@ #define EREMOVE_ERROR_MESSAGE \ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ - "Refer to Documentation/x86/sgx.rst for more information." + "Refer to Documentation/arch/x86/sgx.rst for more information." #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index ec8064c0ae03..2293efd6ffa6 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -232,7 +232,11 @@ static int __init umwait_init(void) * Add umwait control interface. Ignore failure, so at least the * default values are set up in case the machine manages to boot. */ - dev = cpu_subsys.dev_root; - return sysfs_create_group(&dev->kobj, &umwait_attr_group); + dev = bus_get_dev_root(&cpu_subsys); + if (dev) { + ret = sysfs_create_group(&dev->kobj, &umwait_attr_group); + put_device(dev); + } + return ret; } device_initcall(umwait_init); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 621ba9c0f17a..bdc0d5539b57 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -154,7 +154,7 @@ static int __init cpuid_init(void) CPUID_MAJOR); return -EBUSY; } - cpuid_class = class_create(THIS_MODULE, "cpuid"); + cpuid_class = class_create("cpuid"); if (IS_ERR(cpuid_class)) { err = PTR_ERR(cpuid_class); goto out_chrdev; diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index a0ed0e4a2c0c..0d9a14528176 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -163,6 +163,11 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) jmp .Lftrace_ret SYM_CODE_END(ftrace_regs_caller) +SYM_FUNC_START(ftrace_stub_direct_tramp) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(ftrace_stub_direct_tramp) + #ifdef CONFIG_FUNCTION_GRAPH_TRACER SYM_CODE_START(ftrace_graph_caller) pushl %eax diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index fb4f1e01b64a..b8c720b5dab2 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -309,6 +309,10 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) +SYM_FUNC_START(ftrace_stub_direct_tramp) + CALL_DEPTH_ACCOUNT + RET +SYM_FUNC_END(ftrace_stub_direct_tramp) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -342,7 +346,7 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__) #ifdef CONFIG_FUNCTION_GRAPH_TRACER SYM_CODE_START(return_to_handler) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR subq $16, %rsp diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index ec6fefbfd3c0..10c27b4261eb 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void) x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; } -asmlinkage __visible void __init i386_start_kernel(void) +asmlinkage __visible void __init __noreturn i386_start_kernel(void) { /* Make sure IDT is set up before any exception happens */ idt_setup_early_handler(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 387e4b12e823..49f7629b17f7 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -471,7 +471,7 @@ static void __init copy_bootdata(char *real_mode_data) sme_unmap_bootdata(real_mode_data); } -asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data) { /* * Build-time sanity checks on the kernel image and module @@ -537,7 +537,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) x86_64_start_reservations(real_mode_data); } -void __init x86_64_start_reservations(char *real_mode_data) +void __init __noreturn x86_64_start_reservations(char *real_mode_data) { /* version is always not zero if it is copied */ if (!boot_params.hdr.version) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 222efd4a09bc..a5df3e994f04 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -42,7 +42,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) __HEAD .code64 SYM_CODE_START_NOALIGN(startup_64) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table @@ -61,23 +61,15 @@ SYM_CODE_START_NOALIGN(startup_64) * tables and then reload them. */ - /* Set up the stack for verify_cpu(), similar to initial_stack below */ - leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp + /* Set up the stack for verify_cpu() */ + leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp leaq _text(%rip), %rdi - /* - * initial_gs points to initial fixed_percpu_data struct with storage for - * the stack protector canary. Global pointer fixups are needed at this - * stage, so apply them as is done in fixup_pointer(), and initialize %gs - * such that the canary can be accessed at %gs:40 for subsequent C calls. - */ + /* Setup GSBASE to allow stack canary access for C code */ movl $MSR_GS_BASE, %ecx - movq initial_gs(%rip), %rax - movq $_text, %rdx - subq %rdx, %rax - addq %rdi, %rax - movq %rax, %rdx + leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx + movl %edx, %eax shrq $32, %rdx wrmsr @@ -105,7 +97,7 @@ SYM_CODE_START_NOALIGN(startup_64) lretq .Lon_kernel_cs: - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK /* Sanitize CPU configuration */ call verify_cpu @@ -127,7 +119,7 @@ SYM_CODE_START_NOALIGN(startup_64) SYM_CODE_END(startup_64) SYM_CODE_START(secondary_startup_64) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, @@ -156,7 +148,7 @@ SYM_CODE_START(secondary_startup_64) * verify_cpu() above to make sure NX is enabled. */ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR /* @@ -238,16 +230,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) ANNOTATE_RETPOLINE_SAFE jmp *%rax 1: - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // above +#ifdef CONFIG_SMP + movl smpboot_control(%rip), %ecx + + /* Get the per cpu offset for the given CPU# which is in ECX */ + movq __per_cpu_offset(,%rcx,8), %rdx +#else + xorl %edx, %edx /* zero-extended to clear all of RDX */ +#endif /* CONFIG_SMP */ + + /* + * Setup a boot time stack - Any secondary CPU will have lost its stack + * by now because the cr3-switch above unmaps the real-mode stack. + * + * RDX contains the per-cpu offset + */ + movq pcpu_hot + X86_current_task(%rdx), %rax + movq TASK_threadsp(%rax), %rsp + /* * We must switch to a new descriptor in kernel space for the GDT * because soon the kernel won't have access anymore to the userspace * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt early_gdt_descr(%rip) + subq $16, %rsp + movw $(GDT_SIZE-1), (%rsp) + leaq gdt_page(%rdx), %rax + movq %rax, 2(%rsp) + lgdt (%rsp) + addq $16, %rsp /* set up data segments */ xorl %eax,%eax @@ -271,16 +286,13 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movl initial_gs(%rip),%eax - movl initial_gs+4(%rip),%edx +#ifndef CONFIG_SMP + leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx +#endif + movl %edx, %eax + shrq $32, %rdx wrmsr - /* - * Setup a boot time stack - Any secondary CPU will have lost its stack - * by now because the cr3-switch above unmaps the real-mode stack - */ - movq initial_stack(%rip), %rsp - /* Setup and Load IDT */ pushq %rsi call early_setup_idt @@ -371,8 +383,12 @@ SYM_CODE_END(secondary_startup_64) */ SYM_CODE_START(start_cpu0) ANNOTATE_NOENDBR - UNWIND_HINT_EMPTY - movq initial_stack(%rip), %rsp + UNWIND_HINT_END_OF_STACK + + /* Find the idle task stack */ + movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx + movq TASK_threadsp(%rcx), %rsp + jmp .Ljump_to_C_code SYM_CODE_END(start_cpu0) #endif @@ -390,8 +406,6 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb) UNWIND_HINT_IRET_REGS offset=8 ENDBR - ANNOTATE_UNRET_END - /* Build pt_regs */ PUSH_AND_CLEAR_REGS @@ -416,16 +430,9 @@ SYM_CODE_END(vc_boot_ghcb) __REFDATA .balign 8 SYM_DATA(initial_code, .quad x86_64_start_kernel) -SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) #ifdef CONFIG_AMD_MEM_ENCRYPT SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) #endif - -/* - * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder - * reliably detect the end of the stack. - */ -SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE) __FINITDATA __INIT @@ -451,7 +458,6 @@ SYM_CODE_END(early_idt_handler_array) SYM_CODE_START_LOCAL(early_idt_handler_common) UNWIND_HINT_IRET_REGS offset=16 - ANNOTATE_UNRET_END /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -501,8 +507,6 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) UNWIND_HINT_IRET_REGS offset=8 ENDBR - ANNOTATE_UNRET_END - /* Build pt_regs */ PUSH_AND_CLEAR_REGS @@ -657,8 +661,7 @@ SYM_DATA_END(level1_fixmap_pgt) .data .align 16 -SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1) -SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page)) +SYM_DATA(smpboot_control, .long 0) .align 16 /* This must match the first entry in level2_kernel_pgt */ diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 9ff480e94511..670eb08b972a 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -77,15 +77,6 @@ static struct ctl_table itmt_kern_table[] = { {} }; -static struct ctl_table itmt_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = itmt_kern_table, - }, - {} -}; - static struct ctl_table_header *itmt_sysctl_header; /** @@ -114,7 +105,7 @@ int sched_set_itmt_support(void) return 0; } - itmt_sysctl_header = register_sysctl_table(itmt_root_table); + itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table); if (!itmt_sysctl_header) { mutex_unlock(&itmt_update_mutex); return -ENOMEM; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 6b58610a1552..a61c12c01270 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -476,7 +476,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, efi_map_offset = params_cmdline_sz; efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16); - /* Copy setup header onto bootparams. Documentation/x86/boot.rst */ + /* Copy setup header onto bootparams. Documentation/arch/x86/boot.rst */ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; /* Is there a limit on setup header size? */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 0611fd83858e..1a3e2c05a8a5 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -374,17 +374,6 @@ void machine_kexec(struct kimage *image) /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE -void *arch_kexec_kernel_image_load(struct kimage *image) -{ - if (!image->fops || !image->fops->load) - return ERR_PTR(-ENOEXEC); - - return image->fops->load(image, image->kernel_buf, - image->kernel_buf_len, image->initrd_buf, - image->initrd_buf_len, image->cmdline_buf, - image->cmdline_buf_len); -} - /* * Apply purgatory relocations. * diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 84ad0e61ba6e..b05f62ee2344 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -362,8 +362,8 @@ int module_finalize(const Elf_Ehdr *hdr, } if (locks) { void *lseg = (void *)locks->sh_addr; - void *text = me->core_layout.base; - void *text_end = text + me->core_layout.text_size; + void *text = me->mem[MOD_TEXT].base; + void *text_end = text + me->mem[MOD_TEXT].size; alternatives_smp_module_add(me, me->name, lseg, lseg + locks->sh_size, text, text_end); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 708751311786..7bb17d37db01 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -263,7 +263,7 @@ static int __init msr_init(void) pr_err("unable to get major %d for msr\n", MSR_MAJOR); return -EBUSY; } - msr_class = class_create(THIS_MODULE, "msr"); + msr_class = class_create("msr"); if (IS_ERR(msr_class)) { err = PTR_ERR(msr_class); goto out_chrdev; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 42e182868873..ac10b46c5832 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -64,11 +64,11 @@ static unsigned paravirt_patch_call(void *insn_buff, const void *target, } #ifdef CONFIG_PARAVIRT_XXL -/* identity function, which can be inlined */ -u64 notrace _paravirt_ident_64(u64 x) -{ - return x; -} +DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text); +DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text); +DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -197,11 +197,6 @@ void paravirt_end_context_switch(struct task_struct *next) arch_enter_lazy_mmu_mode(); } -static noinstr unsigned long pv_native_read_cr2(void) -{ - return native_read_cr2(); -} - static noinstr void pv_native_write_cr2(unsigned long val) { native_write_cr2(val); @@ -222,16 +217,6 @@ noinstr void pv_native_wbinvd(void) native_wbinvd(); } -static noinstr void pv_native_irq_enable(void) -{ - native_irq_enable(); -} - -static noinstr void pv_native_irq_disable(void) -{ - native_irq_disable(); -} - static noinstr void pv_native_safe_halt(void) { native_safe_halt(); @@ -298,7 +283,7 @@ struct paravirt_patch_template pv_ops = { .cpu.end_context_switch = paravirt_nop, /* Irq ops. */ - .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), + .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = pv_native_safe_halt, @@ -363,8 +348,7 @@ struct paravirt_patch_template pv_ops = { .mmu.make_pte = PTE_IDENT, .mmu.make_pgd = PTE_IDENT, - .mmu.dup_mmap = paravirt_nop, - .mmu.activate_mm = paravirt_nop, + .mmu.enter_mmap = paravirt_nop, .mmu.lazy_mode = { .enter = paravirt_nop, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 30bbe4abb5d6..de6be0a3965e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -124,7 +124,7 @@ void __init pci_iommu_alloc(void) } /* - * See <Documentation/x86/x86_64/boot-options.rst> for the iommu kernel + * See <Documentation/arch/x86/x86_64/boot-options.rst> for the iommu kernel * parameter documentation. */ static __init int iommu_setup(char *p) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b650cde3f64d..dac41a0072ea 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -5,6 +5,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> @@ -48,6 +49,7 @@ #include <asm/frame.h> #include <asm/unwind.h> #include <asm/tdx.h> +#include <asm/mmu_context.h> #include "process.h" @@ -162,6 +164,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + + if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) + set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags); #else p->thread.sp0 = (unsigned long) (childregs + 1); savesegment(gs, p->thread.gs); @@ -368,6 +373,8 @@ void arch_setup_new_exec(void) task_clear_spec_ssb_noexec(current); speculation_ctrl_update(read_thread_flags()); } + + mm_reset_untag_mask(current->mm); } #ifdef CONFIG_X86_IOPL_IOPERM @@ -715,7 +722,7 @@ static bool x86_idle_set(void) } #ifndef CONFIG_SMP -static inline void play_dead(void) +static inline void __noreturn play_dead(void) { BUG(); } @@ -727,7 +734,7 @@ void arch_cpu_idle_enter(void) local_touch_nmi(); } -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { play_dead(); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bb65a68b4b49..3d181c16a2f6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -39,6 +39,7 @@ #include <linux/io.h> #include <linux/ftrace.h> #include <linux/syscalls.h> +#include <linux/iommu.h> #include <asm/processor.h> #include <asm/pkru.h> @@ -671,7 +672,7 @@ void set_personality_64bit(void) task_pt_regs(current)->orig_ax = __NR_execve; current_thread_info()->status &= ~TS_COMPAT; if (current->mm) - current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL; + __set_bit(MM_CONTEXT_HAS_VSYSCALL, ¤t->mm->context.flags); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, @@ -708,7 +709,7 @@ static void __set_personality_ia32(void) * uprobes applied to this MM need to know this and * cannot use user_64bit_mode() at that time. */ - current->mm->context.flags = MM_CONTEXT_UPROBE_IA32; + __set_bit(MM_CONTEXT_UPROBE_IA32, ¤t->mm->context.flags); } current->personality |= force_personality32; @@ -743,6 +744,52 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) } #endif +#ifdef CONFIG_ADDRESS_MASKING + +#define LAM_U57_BITS 6 + +static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) +{ + if (!cpu_feature_enabled(X86_FEATURE_LAM)) + return -ENODEV; + + /* PTRACE_ARCH_PRCTL */ + if (current->mm != mm) + return -EINVAL; + + if (mm_valid_pasid(mm) && + !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags)) + return -EINVAL; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { + mmap_write_unlock(mm); + return -EBUSY; + } + + if (!nr_bits) { + mmap_write_unlock(mm); + return -EINVAL; + } else if (nr_bits <= LAM_U57_BITS) { + mm->context.lam_cr3_mask = X86_CR3_LAM_U57; + mm->context.untag_mask = ~GENMASK(62, 57); + } else { + mmap_write_unlock(mm); + return -EINVAL; + } + + write_cr3(__read_cr3() | mm->context.lam_cr3_mask); + set_tlbstate_lam_mode(mm); + set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); + + mmap_write_unlock(mm); + + return 0; +} +#endif + long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; @@ -830,7 +877,23 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) case ARCH_MAP_VDSO_64: return prctl_map_vdso(&vdso_image_64, arg2); #endif - +#ifdef CONFIG_ADDRESS_MASKING + case ARCH_GET_UNTAG_MASK: + return put_user(task->mm->context.untag_mask, + (unsigned long __user *)arg2); + case ARCH_ENABLE_TAGGED_ADDR: + return prctl_enable_tagged_addr(task->mm, arg2); + case ARCH_FORCE_TAGGED_SVA: + if (current != task) + return -EINVAL; + set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags); + return 0; + case ARCH_GET_MAX_TAG_BITS: + if (!cpu_feature_enabled(X86_FEATURE_LAM)) + return put_user(0, (unsigned long __user *)arg2); + else + return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); +#endif default: ret = -EINVAL; break; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d03c551defcc..3adbe97015c1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -920,7 +920,7 @@ void run_crash_ipi_callback(struct pt_regs *regs) } /* Override the weak function in kernel/panic.c */ -void nmi_panic_self_stop(struct pt_regs *regs) +void __noreturn nmi_panic_self_stop(struct pt_regs *regs) { while (1) { /* If no CPU is preparing crash dump, we simply loop here. */ diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4a73351f87f8..56cab1bb25f5 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -43,7 +43,7 @@ .code64 SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR /* * %rdi indirection_page @@ -113,7 +113,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK /* set return address to 0 if not preserving context */ pushq $0 /* store the start address on the stack */ @@ -231,7 +231,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // RET target, above movq RSP(%r8), %rsp movq CR4(%r8), %rax @@ -256,8 +256,8 @@ SYM_CODE_END(virtual_mapped) /* Do the copies */ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) - UNWIND_HINT_EMPTY - movq %rdi, %rcx /* Put the page_list in %rcx */ + UNWIND_HINT_END_OF_STACK + movq %rdi, %rcx /* Put the page_list in %rcx */ xorl %edi, %edi xorl %esi, %esi jmp 1f diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 3f664ab277c4..b031244d6d2d 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -22,6 +22,8 @@ #include <linux/efi.h> #include <linux/platform_device.h> #include <linux/io.h> +#include <linux/psp-sev.h> +#include <uapi/linux/sev-guest.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> @@ -2175,7 +2177,7 @@ static int __init init_sev_config(char *str) } __setup("sev=", init_sev_config); -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) { struct ghcb_state state; struct es_em_ctxt ctxt; @@ -2183,8 +2185,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned struct ghcb *ghcb; int ret; - if (!fw_err) - return -EINVAL; + rio->exitinfo2 = SEV_RET_NO_FW_CALL; /* * __sev_get_ghcb() needs to run with IRQs disabled because it is using @@ -2209,16 +2210,16 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned if (ret) goto e_put; - *fw_err = ghcb->save.sw_exit_info_2; - switch (*fw_err) { + rio->exitinfo2 = ghcb->save.sw_exit_info_2; + switch (rio->exitinfo2) { case 0: break; - case SNP_GUEST_REQ_ERR_BUSY: + case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY): ret = -EAGAIN; break; - case SNP_GUEST_REQ_INVALID_LEN: + case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN): /* Number of expected pages are returned in RBX */ if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { input->data_npages = ghcb_get_rbx(ghcb); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9013bb28255a..352f0ce1ece4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -121,17 +121,20 @@ int arch_update_cpu_topology(void) return retval; } + +static unsigned int smpboot_warm_reset_vector_count; + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0xa, 0xf); + if (!smpboot_warm_reset_vector_count++) { + CMOS_WRITE(0xa, 0xf); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; + } spin_unlock_irqrestore(&rtc_lock, flags); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = - start_eip >> 4; - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = - start_eip & 0xf; } static inline void smpboot_restore_warm_reset_vector(void) @@ -143,10 +146,12 @@ static inline void smpboot_restore_warm_reset_vector(void) * to default values. */ spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0, 0xf); + if (!--smpboot_warm_reset_vector_count) { + CMOS_WRITE(0, 0xf); + *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; + } spin_unlock_irqrestore(&rtc_lock, flags); - *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } /* @@ -1059,8 +1064,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); -#else - initial_gs = per_cpu_offset(cpu); #endif return 0; } @@ -1086,9 +1089,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, start_ip = real_mode_header->trampoline_start64; #endif idle->thread.sp = (unsigned long)task_pt_regs(idle); - early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; - initial_stack = idle->thread.sp; + + if (IS_ENABLED(CONFIG_X86_32)) { + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); + initial_stack = idle->thread.sp; + } else { + smpboot_control = cpu; + } /* Enable the espfix hack for this CPU */ init_espfix_ap(cpu); @@ -1816,7 +1824,7 @@ static inline void mwait_play_dead(void) } } -void hlt_play_dead(void) +void __noreturn hlt_play_dead(void) { if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d317dc3d06a3..58b1f208eff5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -40,7 +40,7 @@ #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> -#include <linux/ioasid.h> +#include <linux/iommu.h> #include <asm/stacktrace.h> #include <asm/processor.h> @@ -671,15 +671,15 @@ static bool try_fixup_enqcmd_gp(void) if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; - pasid = current->mm->pasid; - /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. */ - if (!pasid_valid(pasid)) + if (!mm_valid_pasid(current->mm)) return false; + pasid = current->mm->pasid; + /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 37307b40f8da..3ac50b7298d1 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -133,7 +133,7 @@ static struct orc_entry null_orc_entry = { .sp_offset = sizeof(long), .sp_reg = ORC_REG_SP, .bp_reg = ORC_REG_UNDEFINED, - .type = UNWIND_HINT_TYPE_CALL + .type = ORC_TYPE_CALL }; #ifdef CONFIG_CALL_THUNKS @@ -153,12 +153,11 @@ static struct orc_entry *orc_callthunk_find(unsigned long ip) /* Fake frame pointer entry -- used as a fallback for generated code */ static struct orc_entry orc_fp_entry = { - .type = UNWIND_HINT_TYPE_CALL, + .type = ORC_TYPE_CALL, .sp_reg = ORC_REG_BP, .sp_offset = 16, .bp_reg = ORC_REG_PREV_SP, .bp_offset = -16, - .end = 0, }; static struct orc_entry *orc_find(unsigned long ip) @@ -250,13 +249,13 @@ static int orc_sort_cmp(const void *_a, const void *_b) return -1; /* - * The "weak" section terminator entries need to always be on the left + * The "weak" section terminator entries need to always be first * to ensure the lookup code skips them in favor of real entries. * These terminator entries exist to handle any gaps created by * whitelisted .o files which didn't get objtool generation. */ orc_a = cur_orc_table + (a - cur_orc_ip_table); - return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1; + return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1; } void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, @@ -474,14 +473,12 @@ bool unwind_next_frame(struct unwind_state *state) */ orc = &orc_fp_entry; state->error = true; - } - - /* End-of-stack check for kernel threads: */ - if (orc->sp_reg == ORC_REG_UNDEFINED) { - if (!orc->end) + } else { + if (orc->type == ORC_TYPE_UNDEFINED) goto err; - goto the_end; + if (orc->type == ORC_TYPE_END_OF_STACK) + goto the_end; } state->signal = orc->signal; @@ -554,7 +551,7 @@ bool unwind_next_frame(struct unwind_state *state) /* Find IP, SP and possibly regs: */ switch (orc->type) { - case UNWIND_HINT_TYPE_CALL: + case ORC_TYPE_CALL: ip_p = sp - sizeof(long); if (!deref_stack_reg(state, ip_p, &state->ip)) @@ -567,7 +564,7 @@ bool unwind_next_frame(struct unwind_state *state) state->prev_regs = NULL; break; - case UNWIND_HINT_TYPE_REGS: + case ORC_TYPE_REGS: if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn_current("can't access registers at %pB\n", (void *)orig_ip); @@ -590,13 +587,13 @@ bool unwind_next_frame(struct unwind_state *state) state->full_regs = true; break; - case UNWIND_HINT_TYPE_REGS_PARTIAL: + case ORC_TYPE_REGS_PARTIAL: if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn_current("can't access iret registers at %pB\n", (void *)orig_ip); goto err; } - /* See UNWIND_HINT_TYPE_REGS case comment. */ + /* See ORC_TYPE_REGS case comment. */ state->ip = unwind_recover_rethook(state, state->ip, (unsigned long *)(state->sp - sizeof(long))); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ef80d361b463..d82f4fa2f1bf 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -33,8 +33,8 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } -static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } -static __init void get_rtc_noop(struct timespec64 *now) { } +int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +void get_rtc_noop(struct timespec64 *now) { } static __initconst const struct of_device_id of_cmos_match[] = { { .compatible = "motorola,mc146818" }, @@ -134,6 +134,7 @@ static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } +static bool is_private_mmio_noop(u64 addr) {return false; } struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu_early, @@ -149,6 +150,7 @@ struct x86_platform_ops x86_platform __ro_after_init = { .realmode_reserve = reserve_real_mode, .realmode_init = init_real_mode, .hyper.pin_vcpu = x86_op_int_noop, + .hyper.is_private_mmio = is_private_mmio_noop, .guest = { .enc_status_change_prepare = enc_status_change_prepare_noop, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8e578311ca9d..89ca7f4c1464 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -46,7 +46,6 @@ config KVM select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO - select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 599aebec2d52..123bf8b97a4b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -60,12 +60,6 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } -/* - * This one is tied to SSB in the user API, and not - * visible in /proc/cpuinfo. - */ -#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ - #define F feature_bit /* Scattered Flag - For features that are scattered by cpufeatures.h. */ @@ -266,7 +260,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) cpuid_entry_change(best, X86_FEATURE_OSXSAVE, - kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); + kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)); cpuid_entry_change(best, X86_FEATURE_APIC, vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); @@ -275,7 +269,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e best = cpuid_entry2_find(entries, nent, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, - kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)); best = cpuid_entry2_find(entries, nent, 0xD, 0); if (best) @@ -420,7 +414,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ - if (vcpu->arch.last_vmentry_cpu != -1) { + if (kvm_vcpu_has_run(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) return r; @@ -653,7 +647,7 @@ void kvm_set_cpu_caps(void) F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ @@ -715,7 +709,7 @@ void kvm_set_cpu_caps(void) F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | - __feature_bit(KVM_X86_FEATURE_AMD_PSFD) + F(AMD_PSFD) ); /* @@ -1002,7 +996,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = 0; break; case 0xd: { - u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xcr0 = kvm_get_filtered_xcr0(); u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a20bec931764..936a397a08cd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1640,6 +1640,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; break; case VCPU_SREG_CS: + /* + * KVM uses "none" when loading CS as part of emulating Real + * Mode exceptions and IRET (handled above). In all other + * cases, loading CS without a control transfer is a KVM bug. + */ + if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE)) + goto exception; + if (!(seg_desc.type & 8)) goto exception; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 042dee556125..995eb5054360 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -368,9 +368,39 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG - && ioapic->irr & (1 << index)) - ioapic_service(ioapic, index, false); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && + ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) { + /* + * Pending status in irr may be outdated: the IRQ line may have + * already been deasserted by a device while the IRQ was masked. + * This occurs, for instance, if the interrupt is handled in a + * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this + * case the guest acknowledges the interrupt to the device in + * its threaded irq handler, i.e. after the EOI but before + * unmasking, so at the time of unmasking the IRQ line is + * already down but our pending irr bit is still set. In such + * cases, injecting this pending interrupt to the guest is + * buggy: the guest will receive an extra unwanted interrupt. + * + * So we need to check here if the IRQ is actually still pending. + * As we are generally not able to probe the IRQ line status + * directly, we do it through irqfd resampler. Namely, we clear + * the pending status and notify the resampler that this interrupt + * is done, without actually injecting it into the guest. If the + * IRQ line is actually already deasserted, we are done. If it is + * still asserted, a new interrupt will be shortly triggered + * through irqfd and injected into the guest. + * + * If, however, it's not possible to resample (no irqfd resampler + * registered for this irq), then unconditionally inject this + * pending interrupt into the guest, so the guest will not miss + * an interrupt, although may get an extra unwanted interrupt. + */ + if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index)) + ioapic->irr &= ~(1 << index); + else + ioapic_service(ioapic, index, false); + } if (e->fields.delivery_mode == APIC_DM_FIXED) { struct kvm_lapic_irq irq; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 4c91f626c058..75eae9c4998a 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -4,7 +4,7 @@ #include <linux/kvm_host.h> -#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP) #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) @@ -157,6 +157,14 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) return vcpu->arch.cr0 & mask; } +static __always_inline bool kvm_is_cr0_bit_set(struct kvm_vcpu *vcpu, + unsigned long cr0_bit) +{ + BUILD_BUG_ON(!is_power_of_2(cr0_bit)); + + return !!kvm_read_cr0_bits(vcpu, cr0_bit); +} + static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) { return kvm_read_cr0_bits(vcpu, ~0UL); @@ -171,6 +179,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) return vcpu->arch.cr4 & mask; } +static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu, + unsigned long cr4_bit) +{ + BUILD_BUG_ON(!is_power_of_2(cr4_bit)); + + return !!kvm_read_cr4_bits(vcpu, cr4_bit); +} + static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index 482d6639ef88..ded0bd688c65 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -10,17 +10,22 @@ #include "hyperv.h" #include "kvm_onhyperv.h" +struct kvm_hv_tlb_range { + u64 start_gfn; + u64 pages; +}; + static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, void *data) { - struct kvm_tlb_range *range = data; + struct kvm_hv_tlb_range *range = data; return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, range->pages); } static inline int hv_remote_flush_root_tdp(hpa_t root_tdp, - struct kvm_tlb_range *range) + struct kvm_hv_tlb_range *range) { if (range) return hyperv_flush_guest_mapping_range(root_tdp, @@ -29,8 +34,8 @@ static inline int hv_remote_flush_root_tdp(hpa_t root_tdp, return hyperv_flush_guest_mapping(root_tdp); } -int hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_tlb_range *range) +static int __hv_flush_remote_tlbs_range(struct kvm *kvm, + struct kvm_hv_tlb_range *range) { struct kvm_arch *kvm_arch = &kvm->arch; struct kvm_vcpu *vcpu; @@ -86,19 +91,29 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, spin_unlock(&kvm_arch->hv_root_tdp_lock); return ret; } -EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range); -int hv_remote_flush_tlb(struct kvm *kvm) +int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages) +{ + struct kvm_hv_tlb_range range = { + .start_gfn = start_gfn, + .pages = nr_pages, + }; + + return __hv_flush_remote_tlbs_range(kvm, &range); +} +EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range); + +int hv_flush_remote_tlbs(struct kvm *kvm) { - return hv_remote_flush_tlb_with_range(kvm, NULL); + return __hv_flush_remote_tlbs_range(kvm, NULL); } -EXPORT_SYMBOL_GPL(hv_remote_flush_tlb); +EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { struct kvm_arch *kvm_arch = &vcpu->kvm->arch; - if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) { spin_lock(&kvm_arch->hv_root_tdp_lock); vcpu->arch.hv_root_tdp = root_tdp; if (root_tdp != kvm_arch->hv_root_tdp) diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index 287e98ef9df3..f9ca3e7432b2 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -7,11 +7,15 @@ #define __ARCH_X86_KVM_KVM_ONHYPERV_H__ #if IS_ENABLED(CONFIG_HYPERV) -int hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_tlb_range *range); -int hv_remote_flush_tlb(struct kvm *kvm); +int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); +int hv_flush_remote_tlbs(struct kvm *kvm); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); #else /* !CONFIG_HYPERV */ +static inline int hv_flush_remote_tlbs(struct kvm *kvm) +{ + return -EOPNOTSUPP; +} + static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) { } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 168c46fd8dd1..92d5a1924fc1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -113,6 +113,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); +void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); @@ -132,7 +134,7 @@ static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) { BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); - return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) + return kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE) ? cr3 & X86_CR3_PCID_MASK : 0; } @@ -153,6 +155,24 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) vcpu->arch.mmu->root_role.level); } +static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) +{ + /* + * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e. + * @mmu's snapshot of CR0.WP and thus all related paging metadata may + * be stale. Refresh CR0.WP and the metadata on-demand when checking + * for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing + * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does + * need to refresh nested_mmu, a.k.a. the walker used to translate L2 + * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP. + */ + if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu) + return; + + __kvm_mmu_refresh_passthrough_bits(vcpu, mmu); +} + /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE @@ -184,8 +204,12 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; - bool fault = (mmu->permissions[index] >> pte_access) & 1; u32 errcode = PFERR_PRESENT_MASK; + bool fault; + + kvm_mmu_refresh_passthrough_bits(vcpu, mmu); + + fault = (mmu->permissions[index] >> pte_access) & 1; WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); if (unlikely(mmu->pkru_mask)) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8ebe542c565..c8961f45e3b1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -125,17 +125,31 @@ module_param(dbg, bool, 0644); #define PTE_LIST_EXT 14 /* - * Slight optimization of cacheline layout, by putting `more' and `spte_count' - * at the start; then accessing it will only use one single cacheline for - * either full (entries==PTE_LIST_EXT) case or entries<=6. + * struct pte_list_desc is the core data structure used to implement a custom + * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a + * given GFN when used in the context of rmaps. Using a custom list allows KVM + * to optimize for the common case where many GFNs will have at most a handful + * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small + * memory footprint, which in turn improves runtime performance by exploiting + * cache locality. + * + * A list is comprised of one or more pte_list_desc objects (descriptors). + * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor + * is full and a new SPTEs needs to be added, a new descriptor is allocated and + * becomes the head of the list. This means that by definitions, all tail + * descriptors are full. + * + * Note, the meta data fields are deliberately placed at the start of the + * structure to optimize the cacheline layout; accessing the descriptor will + * touch only a single cacheline so long as @spte_count<=6 (or if only the + * descriptors metadata is accessed). */ struct pte_list_desc { struct pte_list_desc *more; - /* - * Stores number of entries stored in the pte_list_desc. No need to be - * u64 but just for easier alignment. When PTE_LIST_EXT, means full. - */ - u64 spte_count; + /* The number of PTEs stored in _this_ descriptor. */ + u32 spte_count; + /* The number of PTEs stored in all tails of this descriptor. */ + u32 tail_count; u64 *sptes[PTE_LIST_EXT]; }; @@ -242,32 +256,35 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) return regs; } -static inline bool kvm_available_flush_tlb_with_range(void) +static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.tlb_remote_flush_with_range; + return kvm_read_cr3(vcpu); } -static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, - struct kvm_tlb_range *range) +static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) { - int ret = -ENOTSUPP; - - if (range && kvm_x86_ops.tlb_remote_flush_with_range) - ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range); + if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) + return kvm_read_cr3(vcpu); - if (ret) - kvm_flush_remote_tlbs(kvm); + return mmu->get_guest_pgd(vcpu); } -void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, - u64 start_gfn, u64 pages) +static inline bool kvm_available_flush_remote_tlbs_range(void) { - struct kvm_tlb_range range; + return kvm_x86_ops.flush_remote_tlbs_range; +} - range.start_gfn = start_gfn; - range.pages = pages; +void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, + gfn_t nr_pages) +{ + int ret = -EOPNOTSUPP; - kvm_flush_remote_tlbs_with_range(kvm, &range); + if (kvm_x86_ops.flush_remote_tlbs_range) + ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn, + nr_pages); + if (ret) + kvm_flush_remote_tlbs(kvm); } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); @@ -888,9 +905,9 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) untrack_possible_nx_huge_page(kvm, sp); } -static struct kvm_memory_slot * -gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) +static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, + gfn_t gfn, + bool no_dirty_log) { struct kvm_memory_slot *slot; @@ -929,53 +946,69 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; + desc->tail_count = 0; rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->spte_count == PTE_LIST_EXT) { - count += PTE_LIST_EXT; - if (!desc->more) { - desc->more = kvm_mmu_memory_cache_alloc(cache); - desc = desc->more; - desc->spte_count = 0; - break; - } - desc = desc->more; + count = desc->tail_count + desc->spte_count; + + /* + * If the previous head is full, allocate a new head descriptor + * as tail descriptors are always kept full. + */ + if (desc->spte_count == PTE_LIST_EXT) { + desc = kvm_mmu_memory_cache_alloc(cache); + desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc->spte_count = 0; + desc->tail_count = count; + rmap_head->val = (unsigned long)desc | 1; } - count += desc->spte_count; desc->sptes[desc->spte_count++] = spte; } return count; } -static void -pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, - struct pte_list_desc *desc, int i, - struct pte_list_desc *prev_desc) +static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, + struct pte_list_desc *desc, int i) { - int j = desc->spte_count - 1; + struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + int j = head_desc->spte_count - 1; + + /* + * The head descriptor should never be empty. A new head is added only + * when adding an entry and the previous head is full, and heads are + * removed (this flow) when they become empty. + */ + BUG_ON(j < 0); - desc->sptes[i] = desc->sptes[j]; - desc->sptes[j] = NULL; - desc->spte_count--; - if (desc->spte_count) + /* + * Replace the to-be-freed SPTE with the last valid entry from the head + * descriptor to ensure that tail descriptors are full at all times. + * Note, this also means that tail_count is stable for each descriptor. + */ + desc->sptes[i] = head_desc->sptes[j]; + head_desc->sptes[j] = NULL; + head_desc->spte_count--; + if (head_desc->spte_count) return; - if (!prev_desc && !desc->more) + + /* + * The head descriptor is empty. If there are no tail descriptors, + * nullify the rmap head to mark the list as emtpy, else point the rmap + * head at the next descriptor, i.e. the new head. + */ + if (!head_desc->more) rmap_head->val = 0; else - if (prev_desc) - prev_desc->more = desc->more; - else - rmap_head->val = (unsigned long)desc->more | 1; - mmu_free_pte_list_desc(desc); + rmap_head->val = (unsigned long)head_desc->more | 1; + mmu_free_pte_list_desc(head_desc); } static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - struct pte_list_desc *prev_desc; int i; if (!rmap_head->val) { @@ -991,16 +1024,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } else { rmap_printk("%p many->many\n", spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - prev_desc = NULL; while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(rmap_head, - desc, i, prev_desc); + pte_list_desc_remove_entry(rmap_head, desc, i); return; } } - prev_desc = desc; desc = desc->more; } pr_err("%s: %p many->many\n", __func__, spte); @@ -1047,7 +1077,6 @@ out: unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - unsigned int count = 0; if (!rmap_head->val) return 0; @@ -1055,13 +1084,7 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) return 1; desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - - while (desc) { - count += desc->spte_count; - desc = desc->more; - } - - return count; + return desc->tail_count + desc->spte_count; } static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, @@ -1073,14 +1096,6 @@ static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } -static bool rmap_can_add(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_memory_cache *mc; - - mc = &vcpu->arch.mmu_pte_list_desc_cache; - return kvm_mmu_memory_cache_nr_free_objects(mc); -} - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; @@ -1479,7 +1494,7 @@ restart: } } - if (need_flush && kvm_available_flush_tlb_with_range()) { + if (need_flush && kvm_available_flush_remote_tlbs_range()) { kvm_flush_remote_tlbs_gfn(kvm, gfn, level); return false; } @@ -1504,8 +1519,8 @@ struct slot_rmap_walk_iterator { struct kvm_rmap_head *end_rmap; }; -static void -rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) +static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, + int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; @@ -1513,10 +1528,10 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } -static void -slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - const struct kvm_memory_slot *slot, int start_level, - int end_level, gfn_t start_gfn, gfn_t end_gfn) +static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, + const struct kvm_memory_slot *slot, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; iterator->start_level = start_level; @@ -1789,12 +1804,6 @@ static void mark_unsync(u64 *spte) kvm_mmu_mark_parents_unsync(sp); } -static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - return -1; -} - #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -1914,10 +1923,79 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp) &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else +static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; + + /* + * Ignore various flags when verifying that it's safe to sync a shadow + * page using the current MMU context. + * + * - level: not part of the overall MMU role and will never match as the MMU's + * level tracks the root level + * - access: updated based on the new guest PTE + * - quadrant: not part of the overall MMU role (similar to level) + */ + const union kvm_mmu_page_role sync_role_ign = { + .level = 0xf, + .access = 0x7, + .quadrant = 0x3, + .passthrough = 0x1, + }; + + /* + * Direct pages can never be unsync, and KVM should never attempt to + * sync a shadow page for a different MMU context, e.g. if the role + * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the + * reserved bits checks will be wrong, etc... + */ + if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte || + (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) + return false; + + return true; +} + +static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) +{ + if (!sp->spt[i]) + return 0; + + return vcpu->arch.mmu->sync_spte(vcpu, sp, i); +} + +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int flush = 0; + int i; + + if (!kvm_sync_page_check(vcpu, sp)) + return -1; + + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { + int ret = kvm_sync_spte(vcpu, sp, i); + + if (ret < -1) + return -1; + flush |= ret; + } + + /* + * Note, any flush is purely for KVM's correctness, e.g. when dropping + * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier + * unmap or dirty logging event doesn't fail to flush. The guest is + * responsible for flushing the TLB to ensure any changes in protection + * bits are recognized, i.e. until the guest flushes or page faults on + * a relevant address, KVM is architecturally allowed to let vCPUs use + * cached translations with the old protection bits. + */ + return flush; +} + static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - int ret = vcpu->arch.mmu->sync_page(vcpu, sp); + int ret = __kvm_sync_page(vcpu, sp); if (ret < 0) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); @@ -3304,9 +3382,9 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) * Returns true if the SPTE was fixed successfully. Otherwise, * someone else modified the SPTE from its original value. */ -static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - u64 *sptep, u64 old_spte, u64 new_spte) +static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + u64 *sptep, u64 old_spte, u64 new_spte) { /* * Theoretically we could also set dirty bit (and flush TLB) here in @@ -3513,6 +3591,8 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, LIST_HEAD(invalid_list); bool free_active_root; + WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL); + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. */ @@ -3731,7 +3811,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) int quadrant, i, r; hpa_t root; - root_pgd = mmu->get_guest_pgd(vcpu); + root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) @@ -4181,7 +4261,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, arch.token = alloc_apf_token(vcpu); arch.gfn = gfn; arch.direct_map = vcpu->arch.mmu->root_role.direct; - arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); + arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); @@ -4200,10 +4280,10 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) return; if (!vcpu->arch.mmu->root_role.direct && - work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) + work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); } static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4469,8 +4549,7 @@ static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; - context->sync_page = nonpaging_sync_page; - context->invlpg = NULL; + context->sync_spte = NULL; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, @@ -4604,11 +4683,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); -static unsigned long get_cr3(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr3(vcpu); -} - static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { @@ -4638,10 +4712,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, #include "paging_tmpl.h" #undef PTTYPE -static void -__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, int level, bool nx, bool gbpages, - bool pse, bool amd) +static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, + u64 pa_bits_rsvd, int level, bool nx, + bool gbpages, bool pse, bool amd) { u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; @@ -4754,9 +4827,9 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, guest_cpuid_is_amd_or_hygon(vcpu)); } -static void -__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, bool execonly, int huge_page_level) +static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, + u64 pa_bits_rsvd, bool execonly, + int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 large_1g_rsvd = 0, large_2m_rsvd = 0; @@ -4856,8 +4929,7 @@ static inline bool boot_cpu_is_amd(void) * the direct page table on host, use as much mmu features as * possible, however, kvm currently does not do execution-protection. */ -static void -reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) +static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; @@ -5060,20 +5132,18 @@ static void paging64_init_context(struct kvm_mmu *context) { context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; - context->sync_page = paging64_sync_page; - context->invlpg = paging64_invlpg; + context->sync_spte = paging64_sync_spte; } static void paging32_init_context(struct kvm_mmu *context) { context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; - context->sync_page = paging32_sync_page; - context->invlpg = paging32_invlpg; + context->sync_spte = paging32_sync_spte; } -static union kvm_cpu_role -kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) +static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, + const struct kvm_mmu_role_regs *regs) { union kvm_cpu_role role = {0}; @@ -5112,6 +5182,21 @@ kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) return role; } +void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) +{ + const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP); + + BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP); + BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS)); + + if (is_cr0_wp(mmu) == cr0_wp) + return; + + mmu->cpu_role.base.cr0_wp = cr0_wp; + reset_guest_paging_metadata(vcpu, mmu); +} + static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { /* tdp_root_level is architecture forced level, use it if nonzero */ @@ -5157,9 +5242,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; context->page_fault = kvm_tdp_page_fault; - context->sync_page = nonpaging_sync_page; - context->invlpg = NULL; - context->get_guest_pgd = get_cr3; + context->sync_spte = NULL; + context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; @@ -5289,8 +5373,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; - context->sync_page = ept_sync_page; - context->invlpg = ept_invlpg; + context->sync_spte = ept_sync_spte; update_permission_bitmask(context, true); context->pkru_mask = 0; @@ -5309,7 +5392,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu, kvm_init_shadow_mmu(vcpu, cpu_role); - context->get_guest_pgd = get_cr3; + context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } @@ -5323,7 +5406,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, return; g_context->cpu_role.as_u64 = new_mode.as_u64; - g_context->get_guest_pgd = get_cr3; + g_context->get_guest_pgd = get_guest_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; @@ -5331,7 +5414,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, * L2 page tables are never shadowed, so there is no need to sync * SPTEs. */ - g_context->invlpg = NULL; + g_context->sync_spte = NULL; /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using @@ -5393,7 +5476,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * Changing guest CPUID after KVM_RUN is forbidden, see the comment in * kvm_arch_vcpu_ioctl(). */ - KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); + KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -5664,7 +5747,8 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), false); + lower_32_bits(error_code), false, + &emulation_type); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } @@ -5706,48 +5790,77 @@ emulate: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); -void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t gva, hpa_t root_hpa) +static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, hpa_t root_hpa) +{ + struct kvm_shadow_walk_iterator iterator; + + vcpu_clear_mmio_info(vcpu, addr); + + if (!VALID_PAGE(root_hpa)) + return; + + write_lock(&vcpu->kvm->mmu_lock); + for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) { + struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep); + + if (sp->unsync) { + int ret = kvm_sync_spte(vcpu, sp, iterator.index); + + if (ret < 0) + mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL); + if (ret) + kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep); + } + + if (!sp->unsync_children) + break; + } + write_unlock(&vcpu->kvm->mmu_lock); +} + +void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, unsigned long roots) { int i; + WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL); + /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ - if (is_noncanonical_address(gva, vcpu)) + if (is_noncanonical_address(addr, vcpu)) return; - static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); + static_call(kvm_x86_flush_tlb_gva)(vcpu, addr); } - if (!mmu->invlpg) + if (!mmu->sync_spte) return; - if (root_hpa == INVALID_PAGE) { - mmu->invlpg(vcpu, gva, mmu->root.hpa); + if (roots & KVM_MMU_ROOT_CURRENT) + __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa); - /* - * INVLPG is required to invalidate any global mappings for the VA, - * irrespective of PCID. Since it would take us roughly similar amount - * of work to determine whether any of the prev_root mappings of the VA - * is marked global, or to just sync it blindly, so we might as well - * just always sync it. - * - * Mappings not reachable via the current cr3 or the prev_roots will be - * synced when switching to that cr3, so nothing needs to be done here - * for them. - */ - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (VALID_PAGE(mmu->prev_roots[i].hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); - } else { - mmu->invlpg(vcpu, gva, root_hpa); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (roots & KVM_MMU_ROOT_PREVIOUS(i)) + __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); } } +EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Blindly sync all roots as it would take + * roughly the same amount of work/time to determine whether any of the + * previous roots have a global mapping. + * + * Mappings not reachable via the current or previous cached roots will + * be synced when switching to that new cr3, so nothing needs to be + * done here for them. + */ + kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5756,27 +5869,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; - bool tlb_flush = false; + unsigned long roots = 0; uint i; - if (pcid == kvm_get_active_pcid(vcpu)) { - if (mmu->invlpg) - mmu->invlpg(vcpu, gva, mmu->root.hpa); - tlb_flush = true; - } + if (pcid == kvm_get_active_pcid(vcpu)) + roots |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { - if (mmu->invlpg) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); - tlb_flush = true; - } + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) + roots |= KVM_MMU_ROOT_PREVIOUS(i); } - if (tlb_flush) - static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); - + if (roots) + kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots); ++vcpu->stat.invlpg; /* @@ -5813,29 +5919,30 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, +typedef bool (*slot_rmaps_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot); -/* The caller should hold mmu-lock before calling this function. */ -static __always_inline bool -slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, - bool flush) +static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, + bool flush_on_yield, bool flush) { struct slot_rmap_walk_iterator iterator; - for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + lockdep_assert_held_write(&kvm->mmu_lock); + + for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) - flush |= fn(kvm, iterator.rmap, memslot); + flush |= fn(kvm, iterator.rmap, slot); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && flush_on_yield) { - kvm_flush_remote_tlbs_with_address(kvm, - start_gfn, - iterator.gfn - start_gfn + 1); + kvm_flush_remote_tlbs_range(kvm, start_gfn, + iterator.gfn - start_gfn + 1); flush = false; } cond_resched_rwlock_write(&kvm->mmu_lock); @@ -5845,23 +5952,23 @@ slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, return flush; } -static __always_inline bool -slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - bool flush_on_yield) +static __always_inline bool walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + bool flush_on_yield) { - return slot_handle_level_range(kvm, memslot, fn, start_level, - end_level, memslot->base_gfn, - memslot->base_gfn + memslot->npages - 1, - flush_on_yield, false); + return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, + slot->base_gfn, slot->base_gfn + slot->npages - 1, + flush_on_yield, false); } -static __always_inline bool -slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, bool flush_on_yield) +static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + bool flush_on_yield) { - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, - PG_LEVEL_4K, flush_on_yield); + return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); } static void free_mmu_pages(struct kvm_mmu *mmu) @@ -6156,9 +6263,9 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap, - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, true, flush); } } @@ -6190,8 +6297,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); + kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); kvm_mmu_invalidate_end(kvm, 0, -1ul); @@ -6211,8 +6317,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, false); + walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); write_unlock(&kvm->mmu_lock); } @@ -6447,10 +6553,9 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, * all the way to the target level. There's no need to split pages * already at the target level. */ - for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) { - slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages, - level, level, start, end - 1, true, false); - } + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) + __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, + level, level, start, end - 1, true, false); } /* Must be called with the mmu_lock held in write-mode. */ @@ -6529,7 +6634,7 @@ restart: PG_LEVEL_NUM)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); - if (kvm_available_flush_tlb_with_range()) + if (kvm_available_flush_remote_tlbs_range()) kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; @@ -6548,8 +6653,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap * pages that are already mapped at the maximum hugepage level. */ - if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte, - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) + if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); } @@ -6580,8 +6685,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, * is observed by any other operation on the same memslot. */ lockdep_assert_held(&kvm->slots_lock); - kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, - memslot->npages); + kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages); } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, @@ -6593,7 +6697,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ - slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); + walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } @@ -6663,8 +6767,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) } } -static unsigned long -mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long mmu_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc->nr_to_scan; @@ -6722,8 +6826,8 @@ unlock: return freed; } -static unsigned long -mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long mmu_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) { return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index cc58631e2336..d39af5639ce9 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -170,14 +170,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); -void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, - u64 start_gfn, u64 pages); +void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, + gfn_t nr_pages); /* Flush the given page (huge or not) of guest memory. */ static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) { - kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level), - KVM_PAGES_PER_HPAGE(level)); + kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level), + KVM_PAGES_PER_HPAGE(level)); } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); @@ -240,6 +240,13 @@ struct kvm_page_fault { kvm_pfn_t pfn; hva_t hva; bool map_writable; + + /* + * Indicates the guest is trying to write a gfn that contains one or + * more of the PTEs used to translate the write itself, i.e. the access + * is changing its own translation in the guest page tables. + */ + bool write_fault_to_shadow_pgtable; }; int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); @@ -273,7 +280,7 @@ enum { }; static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch) + u32 err, bool prefetch, int *emulation_type) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -312,6 +319,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, else r = vcpu->arch.mmu->page_fault(vcpu, &fault); + if (fault.write_fault_to_shadow_pgtable && emulation_type) + *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + /* * Similar to above, prefetch faults aren't truly spurious, and the * async #PF path doesn't do emulation. Do count faults that are fixed diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 57f0b75c80f9..0662e0278e70 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -324,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: walker->level = mmu->cpu_role.base.level; - pte = mmu->get_guest_pgd(vcpu); + pte = kvm_mmu_get_guest_pgd(vcpu, mmu); have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 @@ -519,7 +519,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, pt_element_t gpte, bool no_dirty_log) + u64 *spte, pt_element_t gpte) { struct kvm_memory_slot *slot; unsigned pte_access; @@ -535,8 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, - no_dirty_log && (pte_access & ACC_WRITE_MASK)); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK); if (!slot) return false; @@ -605,7 +604,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (is_shadow_present_pte(*spte)) continue; - if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i])) break; } } @@ -685,8 +684,17 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); + + if (fault->write && table_gfn == fault->gfn) + fault->write_fault_to_shadow_pgtable = true; } + /* + * Adjust the hugepage size _after_ resolving indirect shadow pages. + * KVM doesn't support mapping hugepages into the guest for gfns that + * are being shadowed by KVM, i.e. allocating a new shadow page may + * affect the allowed hugepage size. + */ kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); @@ -731,46 +739,6 @@ out_gpte_changed: return RET_PF_RETRY; } - /* - * To see whether the mapped gfn can write its page table in the current - * mapping. - * - * It is the helper function of FNAME(page_fault). When guest uses large page - * size to map the writable gfn which is used as current page table, we should - * force kvm to use small page size to map it because new shadow page will be - * created when kvm establishes shadow page table that stop kvm using large - * page size. Do it early can avoid unnecessary #PF and emulation. - * - * @write_fault_to_shadow_pgtable will return true if the fault gfn is - * currently used as its page table. - * - * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok - * since the PDPT is always shadowed, that means, we can not use large page - * size to map the gfn which is used as PDPT. - */ -static bool -FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, bool user_fault, - bool *write_fault_to_shadow_pgtable) -{ - int level; - gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); - bool self_changed = false; - - if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_cr0_wp(vcpu->arch.mmu) && !user_fault))) - return false; - - for (level = walker->level; level <= walker->max_level; level++) { - gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; - - self_changed |= !(gfn & mask); - *write_fault_to_shadow_pgtable |= !gfn; - } - - return self_changed; -} - /* * Page fault handler. There are several causes for a page fault: * - there is no shadow pte for the guest pte @@ -789,7 +757,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - bool is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); WARN_ON_ONCE(fault->is_tdp); @@ -814,6 +781,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault } fault->gfn = walker.gfn; + fault->max_level = walker.level; fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); if (page_fault_handle_page_track(vcpu, fault)) { @@ -825,16 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - vcpu->arch.write_fault_to_shadow_pgtable = false; - - is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); - - if (is_self_change_mapping) - fault->max_level = PG_LEVEL_4K; - else - fault->max_level = walker.level; - r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -887,64 +845,6 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) -{ - struct kvm_shadow_walk_iterator iterator; - struct kvm_mmu_page *sp; - u64 old_spte; - int level; - u64 *sptep; - - vcpu_clear_mmio_info(vcpu, gva); - - /* - * No need to check return value here, rmap_can_add() can - * help us to skip pte prefetch later. - */ - mmu_topup_memory_caches(vcpu, true); - - if (!VALID_PAGE(root_hpa)) { - WARN_ON(1); - return; - } - - write_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { - level = iterator.level; - sptep = iterator.sptep; - - sp = sptep_to_sp(sptep); - old_spte = *sptep; - if (is_last_spte(old_spte, level)) { - pt_element_t gpte; - gpa_t pte_gpa; - - if (!sp->unsync) - break; - - pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += spte_index(sptep) * sizeof(pt_element_t); - - mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); - if (is_shadow_present_pte(old_spte)) - kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); - - if (!rmap_can_add(vcpu)) - break; - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - break; - - FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); - } - - if (!sp->unsync_children) - break; - } - write_unlock(&vcpu->kvm->mmu_lock); -} - /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t addr, u64 access, @@ -977,114 +877,75 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * can't change unless all sptes pointing to it are nuked first. * * Returns - * < 0: the sp should be zapped - * 0: the sp is synced and no tlb flushing is required - * > 0: the sp is synced and tlb flushing is required + * < 0: failed to sync spte + * 0: the spte is synced and no tlb flushing is required + * > 0: the spte is synced and tlb flushing is required */ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; - int i; bool host_writable; gpa_t first_pte_gpa; - bool flush = false; - - /* - * Ignore various flags when verifying that it's safe to sync a shadow - * page using the current MMU context. - * - * - level: not part of the overall MMU role and will never match as the MMU's - * level tracks the root level - * - access: updated based on the new guest PTE - * - quadrant: not part of the overall MMU role (similar to level) - */ - const union kvm_mmu_page_role sync_role_ign = { - .level = 0xf, - .access = 0x7, - .quadrant = 0x3, - .passthrough = 0x1, - }; + u64 *sptep, spte; + struct kvm_memory_slot *slot; + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn; - /* - * Direct pages can never be unsync, and KVM should never attempt to - * sync a shadow page for a different MMU context, e.g. if the role - * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the - * reserved bits checks will be wrong, etc... - */ - if (WARN_ON_ONCE(sp->role.direct || - (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) - return -1; + if (WARN_ON_ONCE(!sp->spt[i])) + return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); + pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { - u64 *sptep, spte; - struct kvm_memory_slot *slot; - unsigned pte_access; - pt_element_t gpte; - gpa_t pte_gpa; - gfn_t gfn; - - if (!sp->spt[i]) - continue; - - pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - return -1; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - flush = true; - continue; - } - - gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) - continue; + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -1; - /* - * Drop the SPTE if the new protections would result in a RWX=0 - * SPTE or if the gfn is changing. The RWX=0 case only affects - * EPT with execute-only support, i.e. EPT without an effective - * "present" bit, as all other paging modes will create a - * read-only SPTE if pte_access is zero. - */ - if ((!pte_access && !shadow_present_mask) || - gfn != kvm_mmu_page_get_gfn(sp, i)) { - drop_spte(vcpu->kvm, &sp->spt[i]); - flush = true; - continue; - } + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) + return 1; - /* Update the shadowed access bits in case they changed. */ - kvm_mmu_page_set_access(sp, i, pte_access); + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access; + pte_access &= FNAME(gpte_access)(gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - sptep = &sp->spt[i]; - spte = *sptep; - host_writable = spte & shadow_host_writable_mask; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - make_spte(vcpu, sp, slot, pte_access, gfn, - spte_to_pfn(spte), spte, true, false, - host_writable, &spte); + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) + return 0; - flush |= mmu_spte_update(sptep, spte); + /* + * Drop the SPTE if the new protections would result in a RWX=0 + * SPTE or if the gfn is changing. The RWX=0 case only affects + * EPT with execute-only support, i.e. EPT without an effective + * "present" bit, as all other paging modes will create a + * read-only SPTE if pte_access is zero. + */ + if ((!pte_access && !shadow_present_mask) || + gfn != kvm_mmu_page_get_gfn(sp, i)) { + drop_spte(vcpu->kvm, &sp->spt[i]); + return 1; } - /* - * Note, any flush is purely for KVM's correctness, e.g. when dropping - * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier - * unmap or dirty logging event doesn't fail to flush. The guest is - * responsible for flushing the TLB to ensure any changes in protection - * bits are recognized, i.e. until the guest flushes or page faults on - * a relevant address, KVM is architecturally allowed to let vCPUs use - * cached translations with the old protection bits. + * Do nothing if the permissions are unchanged. The existing SPTE is + * still, and prefetch_invalid_gpte() has verified that the A/D bits + * are set in the "new" gPTE, i.e. there is no danger of missing an A/D + * update due to A/D bits being set in the SPTE but not the gPTE. */ - return flush; + if (kvm_mmu_page_get_access(sp, i) == pte_access) + return 0; + + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); + + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, false, + host_writable, &spte); + + return mmu_spte_update(sptep, spte); } #undef pt_element_t diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c15bfca3ed15..cf2c6426a6fc 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -164,7 +164,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, /* * For simplicity, enforce the NX huge page mitigation even if not * strictly necessary. KVM could ignore the mitigation if paging is - * disabled in the guest, as the guest doesn't have an page tables to + * disabled in the guest, as the guest doesn't have any page tables to * abuse. But to safely ignore the mitigation, KVM would have to * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG * is toggled on, and that's a net negative for performance when TDP is diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index f0af385c56e0..fae559559a80 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -29,29 +29,49 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) WRITE_ONCE(*rcu_dereference(sptep), new_spte); } +/* + * SPTEs must be modified atomically if they are shadow-present, leaf + * SPTEs, and have volatile bits, i.e. has bits that can be set outside + * of mmu_lock. The Writable bit can be set by KVM's fast page fault + * handler, and Accessed and Dirty bits can be set by the CPU. + * + * Note, non-leaf SPTEs do have Accessed bits and those bits are + * technically volatile, but KVM doesn't consume the Accessed bit of + * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This + * logic needs to be reassessed if KVM were to use non-leaf Accessed + * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + */ +static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level) +{ + return is_shadow_present_pte(old_spte) && + is_last_spte(old_spte, level) && + spte_has_volatile_bits(old_spte); +} + static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level) { - /* - * Atomically write the SPTE if it is a shadow-present, leaf SPTE with - * volatile bits, i.e. has bits that can be set outside of mmu_lock. - * The Writable bit can be set by KVM's fast page fault handler, and - * Accessed and Dirty bits can be set by the CPU. - * - * Note, non-leaf SPTEs do have Accessed bits and those bits are - * technically volatile, but KVM doesn't consume the Accessed bit of - * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This - * logic needs to be reassessed if KVM were to use non-leaf Accessed - * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. - */ - if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && - spte_has_volatile_bits(old_spte)) + if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); __kvm_tdp_mmu_write_spte(sptep, new_spte); return old_spte; } +static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, + u64 mask, int level) +{ + atomic64_t *sptep_atomic; + + if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { + sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + return (u64)atomic64_fetch_and(~mask, sptep_atomic); + } + + __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); + return old_spte; +} + /* * A TDP iterator performs a pre-order walk over a TDP paging structure. */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7c25dbf32ecc..08340219c35a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -40,7 +40,17 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - /* Also waits for any queued work items. */ + /* + * Invalidate all roots, which besides the obvious, schedules all roots + * for zapping and thus puts the TDP MMU's reference to each root, i.e. + * ultimately frees all roots. + */ + kvm_tdp_mmu_invalidate_all_roots(kvm); + + /* + * Destroying a workqueue also first flushes the workqueue, i.e. no + * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). + */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); @@ -116,16 +126,6 @@ static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); } -static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page) -{ - union kvm_mmu_page_role role = page->role; - role.invalid = true; - - /* No need to use cmpxchg, only the invalid bit can change. */ - role.word = xchg(&page->role.word, role.word); - return role.invalid; -} - void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { @@ -134,45 +134,12 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; - WARN_ON(!is_tdp_mmu_page(root)); - /* - * The root now has refcount=0. It is valid, but readers already - * cannot acquire a reference to it because kvm_tdp_mmu_get_root() - * rejects it. This remains true for the rest of the execution - * of this function, because readers visit valid roots only - * (except for tdp_mmu_zap_root_work(), which however - * does not acquire any reference itself). - * - * Even though there are flows that need to visit all roots for - * correctness, they all take mmu_lock for write, so they cannot yet - * run concurrently. The same is true after kvm_tdp_root_mark_invalid, - * since the root still has refcount=0. - * - * However, tdp_mmu_zap_root can yield, and writers do not expect to - * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()). - * So the root temporarily gets an extra reference, going to refcount=1 - * while staying invalid. Readers still cannot acquire any reference; - * but writers are now allowed to run if tdp_mmu_zap_root yields and - * they might take an extra reference if they themselves yield. - * Therefore, when the reference is given back by the worker, - * there is no guarantee that the refcount is still 1. If not, whoever - * puts the last reference will free the page, but they will not have to - * zap the root because a root cannot go from invalid to valid. + * The TDP MMU itself holds a reference to each root until the root is + * explicitly invalidated, i.e. the final reference should be never be + * put for a valid root. */ - if (!kvm_tdp_root_mark_invalid(root)) { - refcount_set(&root->tdp_mmu_root_count, 1); - - /* - * Zapping the root in a worker is not just "nice to have"; - * it is required because kvm_tdp_mmu_invalidate_all_roots() - * skips already-invalid roots. If kvm_tdp_mmu_put_root() did - * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast() - * might return with some roots not zapped yet. - */ - tdp_mmu_schedule_zap_root(kvm, root); - return; - } + KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm); spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_del_rcu(&root->link); @@ -320,7 +287,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) root = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_sp(root, NULL, 0, role); - refcount_set(&root->tdp_mmu_root_count, 1); + /* + * TDP MMU roots are kept until they are explicitly invalidated, either + * by a memslot update or by the destruction of the VM. Initialize the + * refcount to two; one reference for the vCPU, and one reference for + * the TDP MMU itself, which is held until the root is invalidated and + * is ultimately put by tdp_mmu_zap_root_work(). + */ + refcount_set(&root->tdp_mmu_root_count, 2); spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); @@ -334,35 +308,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) -{ - if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) - return; - - if (is_accessed_spte(old_spte) && - (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || - spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); -} - -static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) -{ - bool pfn_changed; - struct kvm_memory_slot *slot; - - if (level > PG_LEVEL_4K) - return; - - pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - - if ((!is_writable_pte(old_spte) || pfn_changed) && - is_writable_pte(new_spte)) { - slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); - mark_page_dirty_in_slot(kvm, slot, gfn); - } -} - static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); @@ -505,7 +450,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) } /** - * __handle_changed_spte - handle bookkeeping associated with an SPTE change + * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance * @as_id: the address space of the paging structure the SPTE was a part of * @gfn: the base GFN that was mapped by the SPTE @@ -516,12 +461,13 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * the MMU lock and the operation must synchronize with other * threads that might be modifying SPTEs. * - * Handle bookkeeping that might result from the modification of a SPTE. - * This function must be called for all TDP SPTE modifications. + * Handle bookkeeping that might result from the modification of a SPTE. Note, + * dirty logging updates are handled in common code, not here (see make_spte() + * and fast_pf_fix_direct_spte()). */ -static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level, + bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); @@ -605,17 +551,10 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_present && !was_leaf && (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); -} -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) -{ - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, - shared); - handle_changed_spte_acc_track(old_spte, new_spte, level); - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + if (was_leaf && is_accessed_spte(old_spte) && + (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } /* @@ -658,9 +597,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; - __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, true); - handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return 0; } @@ -696,7 +634,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, /* - * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance * @as_id: Address space ID, i.e. regular vs. SMM * @sptep: Pointer to the SPTE @@ -704,23 +642,12 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * @new_spte: The new value that will be set for the SPTE * @gfn: The base GFN that was (or will be) mapped by the SPTE * @level: The level _containing_ the SPTE (its parent PT's level) - * @record_acc_track: Notify the MM subsystem of changes to the accessed state - * of the page. Should be set unless handling an MMU - * notifier for access tracking. Leaving record_acc_track - * unset in that case prevents page accesses from being - * double counted. - * @record_dirty_log: Record the page as dirty in the dirty bitmap if - * appropriate for the change being made. Should be set - * unless performing certain dirty logging operations. - * Leaving record_dirty_log unset in that case prevents page - * writes from being double counted. * * Returns the old SPTE value, which _may_ be different than @old_spte if the * SPTE had voldatile bits. */ -static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level, - bool record_acc_track, bool record_dirty_log) +static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level) { lockdep_assert_held_write(&kvm->mmu_lock); @@ -735,46 +662,17 @@ static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); - - if (record_acc_track) - handle_changed_spte_acc_track(old_spte, new_spte, level); - if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); return old_spte; } -static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte, bool record_acc_track, - bool record_dirty_log) +static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) { WARN_ON_ONCE(iter->yielded); - - iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, - iter->old_spte, new_spte, - iter->gfn, iter->level, - record_acc_track, record_dirty_log); -} - -static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, true, true); -} - -static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, false, true); -} - -static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, true, false); + iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, + iter->old_spte, new_spte, + iter->gfn, iter->level); } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ @@ -866,7 +764,7 @@ retry: continue; if (!shared) - tdp_mmu_set_spte(kvm, &iter, 0); + tdp_mmu_iter_set_spte(kvm, &iter, 0); else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) goto retry; } @@ -923,8 +821,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) return false; - __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, - sp->gfn, sp->role.level + 1, true, true); + tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, + sp->gfn, sp->role.level + 1); return true; } @@ -958,7 +856,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); + tdp_mmu_iter_set_spte(kvm, &iter, 0); flush = true; } @@ -1022,32 +920,49 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) /* * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that * is about to be zapped, e.g. in response to a memslots update. The actual - * zapping is performed asynchronously, so a reference is taken on all roots. - * Using a separate workqueue makes it easy to ensure that the destruction is - * performed before the "fast zap" completes, without keeping a separate list - * of invalidated roots; the list is effectively the list of work items in - * the workqueue. - * - * Get a reference even if the root is already invalid, the asynchronous worker - * assumes it was gifted a reference to the root it processes. Because mmu_lock - * is held for write, it should be impossible to observe a root with zero refcount, - * i.e. the list of roots cannot be stale. + * zapping is performed asynchronously. Using a separate workqueue makes it + * easy to ensure that the destruction is performed before the "fast zap" + * completes, without keeping a separate list of invalidated roots; the list is + * effectively the list of work items in the workqueue. * - * This has essentially the same effect for the TDP MMU - * as updating mmu_valid_gen does for the shadow MMU. + * Note, the asynchronous worker is gifted the TDP MMU's reference. + * See kvm_tdp_mmu_get_vcpu_root_hpa(). */ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) { struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); - list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { - if (!root->role.invalid && - !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { + /* + * mmu_lock must be held for write to ensure that a root doesn't become + * invalid while there are active readers (invalidating a root while + * there are active readers may or may not be problematic in practice, + * but it's uncharted territory and not supported). + * + * Waive the assertion if there are no users of @kvm, i.e. the VM is + * being destroyed after all references have been put, or if no vCPUs + * have been created (which means there are no roots), i.e. the VM is + * being destroyed in an error path of KVM_CREATE_VM. + */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + refcount_read(&kvm->users_count) && kvm->created_vcpus) + lockdep_assert_held_write(&kvm->mmu_lock); + + /* + * As above, mmu_lock isn't held when destroying the VM! There can't + * be other references to @kvm, i.e. nothing else can invalidate roots + * or be consuming roots, but walking the list of roots does need to be + * guarded against roots being deleted by the asynchronous zap worker. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { + if (!root->role.invalid) { root->role.invalid = true; tdp_mmu_schedule_zap_root(kvm, root); } } + + rcu_read_unlock(); } /* @@ -1128,7 +1043,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, if (ret) return ret; } else { - tdp_mmu_set_spte(kvm, iter, spte); + tdp_mmu_iter_set_spte(kvm, iter, spte); } tdp_account_mmu_page(kvm, sp); @@ -1262,33 +1177,42 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. + * + * No need to mark the corresponding PFN as accessed as this call is coming + * from the clear_young() or clear_flush_young() notifier, which uses the + * return value to determine if the page has been accessed. */ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, struct kvm_gfn_range *range) { - u64 new_spte = 0; + u64 new_spte; /* If we have a non-accessed entry we don't need to change the pte. */ if (!is_accessed_spte(iter->old_spte)) return false; - new_spte = iter->old_spte; - - if (spte_ad_enabled(new_spte)) { - new_spte &= ~shadow_accessed_mask; + if (spte_ad_enabled(iter->old_spte)) { + iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, + iter->old_spte, + shadow_accessed_mask, + iter->level); + new_spte = iter->old_spte & ~shadow_accessed_mask; } else { /* * Capture the dirty status of the page, so that it doesn't get * lost when the SPTE is marked for access tracking. */ - if (is_writable_pte(new_spte)) - kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + if (is_writable_pte(iter->old_spte)) + kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); - new_spte = mark_spte_for_access_track(new_spte); + new_spte = mark_spte_for_access_track(iter->old_spte); + iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, + iter->old_spte, new_spte, + iter->level); } - tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); - + trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, + iter->old_spte, new_spte); return true; } @@ -1324,15 +1248,15 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, * Note, when changing a read-only SPTE, it's not strictly necessary to * zero the SPTE before setting the new PFN, but doing so preserves the * invariant that the PFN of a present * leaf SPTE can never change. - * See __handle_changed_spte(). + * See handle_changed_spte(). */ - tdp_mmu_set_spte(kvm, iter, 0); + tdp_mmu_iter_set_spte(kvm, iter, 0); if (!pte_write(range->pte)) { new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, pte_pfn(range->pte)); - tdp_mmu_set_spte(kvm, iter, new_spte); + tdp_mmu_iter_set_spte(kvm, iter, new_spte); } return true; @@ -1349,7 +1273,7 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) /* * No need to handle the remote TLB flush under RCU protection, the * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a - * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). + * shadow page. See the WARN on pfn_changed in handle_changed_spte(). */ return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); } @@ -1607,8 +1531,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { + u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; struct tdp_iter iter; - u64 new_spte; bool spte_set = false; rcu_read_lock(); @@ -1621,19 +1545,13 @@ retry: if (!is_shadow_present_pte(iter.old_spte)) continue; - if (spte_ad_need_write_protect(iter.old_spte)) { - if (is_writable_pte(iter.old_spte)) - new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - else - continue; - } else { - if (iter.old_spte & shadow_dirty_mask) - new_spte = iter.old_spte & ~shadow_dirty_mask; - else - continue; - } + MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); - if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) + if (!(iter.old_spte & dbit)) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; spte_set = true; @@ -1675,8 +1593,9 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { + u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; - u64 new_spte; rcu_read_lock(); @@ -1685,25 +1604,26 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, if (!mask) break; + MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); + if (iter.level > PG_LEVEL_4K || !(mask & (1UL << (iter.gfn - gfn)))) continue; mask &= ~(1UL << (iter.gfn - gfn)); - if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { - if (is_writable_pte(iter.old_spte)) - new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - else - continue; - } else { - if (iter.old_spte & shadow_dirty_mask) - new_spte = iter.old_spte & ~shadow_dirty_mask; - else - continue; - } + if (!(iter.old_spte & dbit)) + continue; + + iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep, + iter.old_spte, dbit, + iter.level); - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, + iter.old_spte, + iter.old_spte & ~dbit); + kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); } rcu_read_unlock(); @@ -1821,7 +1741,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, if (new_spte == iter.old_spte) break; - tdp_mmu_set_spte(kvm, &iter, new_spte); + tdp_mmu_iter_set_spte(kvm, &iter, new_spte); spte_set = true; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 612e6c70ce2e..1690d41c1830 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -93,7 +93,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) { return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); } @@ -400,6 +400,12 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } +static bool pmc_event_is_allowed(struct kvm_pmc *pmc) +{ + return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && + check_pmu_event_filter(pmc); +} + static void reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -409,10 +415,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) pmc_pause_counter(pmc); - if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) - goto reprogram_complete; - - if (!check_pmu_event_filter(pmc)) + if (!pmc_event_is_allowed(pmc)) goto reprogram_complete; if (pmc->counter < pmc->prev_counter) @@ -540,9 +543,9 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (!pmc) return 1; - if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) && + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) && (static_call(kvm_x86_get_cpl)(vcpu) != 0) && - (kvm_read_cr0(vcpu) & X86_CR0_PE)) + kvm_is_cr0_bit_set(vcpu, X86_CR0_PE)) return 1; *data = pmc_read_counter(pmc) & mask; @@ -589,6 +592,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + return; + + bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); static_call(kvm_x86_pmu_refresh)(vcpu); } @@ -646,7 +653,7 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { pmc->prev_counter = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, @@ -684,7 +691,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); - if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) + if (!pmc || !pmc_event_is_allowed(pmc)) continue; /* Ignore checks for edge detect, pin control, invert and CMASK bits */ diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index be62c16f2265..5c7bbf03b599 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -195,7 +195,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) KVM_PMC_MAX_FIXED); } -static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc) +static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 05d38944a6c0..96936ddf1b3c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -139,13 +139,18 @@ void recalc_intercepts(struct vcpu_svm *svm) if (g->int_ctl & V_INTR_MASKING_MASK) { /* - * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8 - * does not affect any interrupt we may want to inject; - * therefore, writes to CR8 are irrelevant to L0, as are - * interrupt window vmexits. + * If L2 is active and V_INTR_MASKING is enabled in vmcb12, + * disable intercept of CR8 writes as L2's CR8 does not affect + * any interrupt KVM may want to inject. + * + * Similarly, disable intercept of virtual interrupts (used to + * detect interrupt windows) if the saved RFLAGS.IF is '0', as + * the effective RFLAGS.IF for L1 interrupts will never be set + * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs). */ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); - vmcb_clr_intercept(c, INTERCEPT_VINTR); + if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)) + vmcb_clr_intercept(c, INTERCEPT_VINTR); } /* @@ -276,6 +281,11 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl))) return false; + if (CC((control->int_ctl & V_NMI_ENABLE_MASK) && + !vmcb12_is_intercept(control, INTERCEPT_NMI))) { + return false; + } + return true; } @@ -416,22 +426,24 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) /* Only a few fields of int_ctl are written by the processor. */ mask = V_IRQ_MASK | V_TPR_MASK; - if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && - svm_is_intercept(svm, INTERCEPT_VINTR)) { - /* - * In order to request an interrupt window, L0 is usurping - * svm->vmcb->control.int_ctl and possibly setting V_IRQ - * even if it was clear in L1's VMCB. Restoring it would be - * wrong. However, in this case V_IRQ will remain true until - * interrupt_window_interception calls svm_clear_vintr and - * restores int_ctl. We can just leave it aside. - */ + /* + * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting + * virtual interrupts in order to request an interrupt window, as KVM + * has usurped vmcb02's int_ctl. If an interrupt window opens before + * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl. + * If no window opens, V_IRQ will be correctly preserved in vmcb12's + * int_ctl (because it was never recognized while L2 was running). + */ + if (svm_is_intercept(svm, INTERCEPT_VINTR) && + !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts)) mask &= ~V_IRQ_MASK; - } if (nested_vgif_enabled(svm)) mask |= V_GIF_MASK; + if (nested_vnmi_enabled(svm)) + mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK; + svm->nested.ctl.int_ctl &= ~mask; svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; } @@ -651,6 +663,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, else int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); + if (vnmi) { + if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) { + svm->vcpu.arch.nmi_pending++; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } + if (nested_vnmi_enabled(svm)) + int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK | + V_NMI_ENABLE_MASK | + V_NMI_BLOCKING_MASK); + } + /* Copied from vmcb01. msrpm_base can be overwritten later. */ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; @@ -1021,6 +1044,28 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm_switch_vmcb(svm, &svm->vmcb01); + /* + * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01: + * + * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't + * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related + * flags) to detect interrupt windows for L1 IRQs (even if L1 uses + * virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that + * KVM re-requests an interrupt window if necessary, which implicitly + * copies this bits from vmcb02 to vmcb01. + * + * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR + * is stored in vmcb02, but its value doesn't need to be copied from/to + * vmcb01 because it is copied from/to the virtual APIC's TPR register + * on each VM entry/exit. + * + * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's + * V_GIF. However, GIF is architecturally clear on each VM exit, thus + * there is no need to copy V_GIF from vmcb02 to vmcb01. + */ + if (!nested_exit_on_intr(svm)) + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); @@ -1029,6 +1074,20 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm_update_lbrv(vcpu); } + if (vnmi) { + if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK) + vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK; + else + vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK; + + if (vcpu->arch.nmi_pending) { + vcpu->arch.nmi_pending--; + vmcb01->control.int_ctl |= V_NMI_PENDING_MASK; + } else { + vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK; + } + } + /* * On vmexit the GIF is set to false and * no event can be injected in L1. diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index cc77a0681800..5fa939e411d8 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -161,7 +161,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } return 0; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c25aeb550cd9..69ae5e1b3120 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -12,6 +12,7 @@ #include <linux/kvm_host.h> #include <linux/kernel.h> #include <linux/highmem.h> +#include <linux/psp.h> #include <linux/psp-sev.h> #include <linux/pagemap.h> #include <linux/swap.h> @@ -1767,18 +1768,20 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; - struct file *source_kvm_file; + struct fd f = fdget(source_fd); struct kvm *source_kvm; bool charged = false; int ret; - source_kvm_file = fget(source_fd); - if (!file_is_kvm(source_kvm_file)) { + if (!f.file) + return -EBADF; + + if (!file_is_kvm(f.file)) { ret = -EBADF; goto out_fput; } - source_kvm = source_kvm_file->private_data; + source_kvm = f.file->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) goto out_fput; @@ -1828,8 +1831,7 @@ out_dst_cgroup: out_unlock: sev_unlock_two_vms(kvm, source_kvm); out_fput: - if (source_kvm_file) - fput(source_kvm_file); + fdput(f); return ret; } @@ -2046,18 +2048,20 @@ failed: int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct file *source_kvm_file; + struct fd f = fdget(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret; - source_kvm_file = fget(source_fd); - if (!file_is_kvm(source_kvm_file)) { + if (!f.file) + return -EBADF; + + if (!file_is_kvm(f.file)) { ret = -EBADF; goto e_source_fput; } - source_kvm = source_kvm_file->private_data; + source_kvm = f.file->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) goto e_source_fput; @@ -2103,8 +2107,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) e_unlock: sev_unlock_two_vms(kvm, source_kvm); e_source_fput: - if (source_kvm_file) - fput(source_kvm_file); + fdput(f); return ret; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 252e7f37e4e2..ca32389f3c36 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -27,6 +27,7 @@ #include <linux/swap.h> #include <linux/rwsem.h> #include <linux/cc_platform.h> +#include <linux/smp.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -41,6 +42,9 @@ #include <asm/fpu/api.h> #include <asm/virtext.h> + +#include <trace/events/ipi.h> + #include "trace.h" #include "svm.h" @@ -95,6 +99,7 @@ static const struct svm_direct_access_msrs { #endif { .index = MSR_IA32_SPEC_CTRL, .always = false }, { .index = MSR_IA32_PRED_CMD, .always = false }, + { .index = MSR_IA32_FLUSH_CMD, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, @@ -230,6 +235,8 @@ module_param(dump_invalid_vmcb, bool, 0644); bool intercept_smi = true; module_param(intercept_smi, bool, 0444); +bool vnmi = true; +module_param(vnmi, bool, 0444); static bool svm_gp_erratum_intercept = true; @@ -1311,6 +1318,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm, vmcb); + if (vnmi) + svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK; + if (vgif) { svm_clr_intercept(svm, INTERCEPT_STGI); svm_clr_intercept(svm, INTERCEPT_CLGI); @@ -1584,6 +1594,16 @@ static void svm_set_vintr(struct vcpu_svm *svm) svm_set_intercept(svm, INTERCEPT_VINTR); /* + * Recalculating intercepts may have cleared the VINTR intercept. If + * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF + * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN. + * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as + * interrupts will never be unblocked while L2 is running. + */ + if (!svm_is_intercept(svm, INTERCEPT_VINTR)) + return; + + /* * This is just a dummy VINTR to actually cause a vmexit to happen. * Actual injection of virtual interrupts happens through EVENTINJ. */ @@ -2480,16 +2500,29 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) has_error_code, error_code); } +static void svm_clr_iret_intercept(struct vcpu_svm *svm) +{ + if (!sev_es_guest(svm->vcpu.kvm)) + svm_clr_intercept(svm, INTERCEPT_IRET); +} + +static void svm_set_iret_intercept(struct vcpu_svm *svm) +{ + if (!sev_es_guest(svm->vcpu.kvm)) + svm_set_intercept(svm, INTERCEPT_IRET); +} + static int iret_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); ++vcpu->stat.nmi_window_exits; svm->awaiting_iret_completion = true; - if (!sev_es_guest(vcpu->kvm)) { - svm_clr_intercept(svm, INTERCEPT_IRET); + + svm_clr_iret_intercept(svm); + if (!sev_es_guest(vcpu->kvm)) svm->nmi_iret_rip = kvm_rip_read(vcpu); - } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -2872,7 +2905,7 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); - int r; + int ret = 0; u32 ecx = msr->index; u64 data = msr->data; @@ -2942,21 +2975,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) */ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; - case MSR_IA32_PRED_CMD: - if (!msr->host_initiated && - !guest_has_pred_cmd_msr(vcpu)) - return 1; - - if (data & ~PRED_CMD_IBPB) - return 1; - if (!boot_cpu_has(X86_FEATURE_IBPB)) - return 1; - if (!data) - break; - - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); - break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) @@ -3009,10 +3027,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * guest via direct_access_msrs, and switch it via user return. */ preempt_disable(); - r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); + ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); preempt_enable(); - if (r) - return 1; + if (ret) + break; svm->tsc_aux = data; break; @@ -3070,7 +3088,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) default: return kvm_set_msr_common(vcpu, msr); } - return 0; + return ret; } static int msr_interception(struct kvm_vcpu *vcpu) @@ -3481,9 +3499,41 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) return; svm->nmi_masked = true; - if (!sev_es_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_IRET); + svm_set_iret_intercept(svm); + ++vcpu->stat.nmi_injections; +} + +static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!is_vnmi_enabled(svm)) + return false; + + return !!(svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK); +} + +static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!is_vnmi_enabled(svm)) + return false; + + if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK) + return false; + + svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK; + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); + + /* + * Because the pending NMI is serviced by hardware, KVM can't know when + * the NMI is "injected", but for all intents and purposes, passing the + * NMI off to hardware counts as injection. + */ ++vcpu->stat.nmi_injections; + + return true; } static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) @@ -3581,6 +3631,35 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) svm_set_intercept(svm, INTERCEPT_CR8_WRITE); } +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_vnmi_enabled(svm)) + return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK; + else + return svm->nmi_masked; +} + +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (is_vnmi_enabled(svm)) { + if (masked) + svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK; + else + svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK; + + } else { + svm->nmi_masked = masked; + if (masked) + svm_set_iret_intercept(svm); + else + svm_clr_iret_intercept(svm); + } +} + bool svm_nmi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3592,8 +3671,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return false; - return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - svm->nmi_masked; + if (svm_get_nmi_mask(vcpu)) + return true; + + return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK; } static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) @@ -3611,26 +3692,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return 1; } -static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) -{ - return to_svm(vcpu)->nmi_masked; -} - -static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (masked) { - svm->nmi_masked = true; - if (!sev_es_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_IRET); - } else { - svm->nmi_masked = false; - if (!sev_es_guest(vcpu->kvm)) - svm_clr_intercept(svm, INTERCEPT_IRET); - } -} - bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3711,7 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->nmi_masked && !svm->awaiting_iret_completion) + /* + * KVM should never request an NMI window when vNMI is enabled, as KVM + * allows at most one to-be-injected NMI and one pending NMI, i.e. if + * two NMIs arrive simultaneously, KVM will inject one and set + * V_NMI_PENDING for the other. WARN, but continue with the standard + * single-step approach to try and salvage the pending NMI. + */ + WARN_ON_ONCE(is_vnmi_enabled(svm)); + + if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { @@ -3729,7 +3799,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) +static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3753,6 +3823,37 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) svm->current_vmcb->asid_generation--; } +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + hpa_t root_tdp = vcpu->arch.mmu->root.hpa; + + /* + * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly + * flush the NPT mappings via hypercall as flushing the ASID only + * affects virtual to physical mappings, it does not invalidate guest + * physical to host physical mappings. + */ + if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) + hyperv_flush_guest_mapping(root_tdp); + + svm_flush_tlb_asid(vcpu); +} + +static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB + * flushes should be routed to hv_flush_remote_tlbs() without requesting + * a "regular" remote flush. Reaching this point means either there's + * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of + * which might be fatal to the guest. Yell, but try to recover. + */ + if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) + hv_flush_remote_tlbs(vcpu->kvm); + + svm_flush_tlb_asid(vcpu); +} + static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4107,7 +4208,7 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: return false; case MSR_IA32_SMBASE: if (!IS_ENABLED(CONFIG_KVM_SMM)) @@ -4149,8 +4250,18 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); + svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI); + svm_recalc_instruction_intercepts(vcpu, svm); + if (boot_cpu_has(X86_FEATURE_IBPB)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, + !!guest_has_pred_cmd_msr(vcpu)); + + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, + !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { best = kvm_find_cpuid_entry(vcpu, 0x8000001F); @@ -4528,7 +4639,6 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { bool smep, smap, is_user; - unsigned long cr4; u64 error_code; /* Emulation is always possible when KVM has access to all guest state. */ @@ -4620,9 +4730,8 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) goto resume_guest; - cr4 = kvm_read_cr4(vcpu); - smep = cr4 & X86_CR4_SMEP; - smap = cr4 & X86_CR4_SMAP; + smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP); + smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP); is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n"); @@ -4745,10 +4854,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, - .flush_tlb_all = svm_flush_tlb_current, + .flush_tlb_all = svm_flush_tlb_all, .flush_tlb_current = svm_flush_tlb_current, .flush_tlb_gva = svm_flush_tlb_gva, - .flush_tlb_guest = svm_flush_tlb_current, + .flush_tlb_guest = svm_flush_tlb_asid, .vcpu_pre_run = svm_vcpu_pre_run, .vcpu_run = svm_vcpu_run, @@ -4760,6 +4869,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .patch_hypercall = svm_patch_hypercall, .inject_irq = svm_inject_irq, .inject_nmi = svm_inject_nmi, + .is_vnmi_pending = svm_is_vnmi_pending, + .set_vnmi_pending = svm_set_vnmi_pending, .inject_exception = svm_inject_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, @@ -4902,6 +5013,9 @@ static __init void svm_set_cpu_caps(void) if (vgif) kvm_cpu_cap_set(X86_FEATURE_VGIF); + if (vnmi) + kvm_cpu_cap_set(X86_FEATURE_VNMI); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } @@ -5053,6 +5167,16 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI); + if (vnmi) + pr_info("Virtual NMI enabled\n"); + + if (!vnmi) { + svm_x86_ops.is_vnmi_pending = NULL; + svm_x86_ops.set_vnmi_pending = NULL; + } + + if (lbrv) { if (!boot_cpu_has(X86_FEATURE_LBRV)) lbrv = false; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 839809972da1..f44751dd8d5d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -36,6 +36,7 @@ extern bool npt_enabled; extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; +extern bool vnmi; /* * Clean bits in VMCB. @@ -265,6 +266,7 @@ struct vcpu_svm { bool pause_filter_enabled : 1; bool pause_threshold_enabled : 1; bool vgif_enabled : 1; + bool vnmi_enabled : 1; u32 ldr_reg; u32 dfr_reg; @@ -539,6 +541,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) +{ + return svm->vnmi_enabled && + (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); +} + static inline bool is_x2apic_msrpm_offset(u32 offset) { /* 4 msrs per u8, and 4 u8 in u32 */ @@ -548,6 +556,27 @@ static inline bool is_x2apic_msrpm_offset(u32 offset) (msr < (APIC_BASE_MSR + 0x100)); } +static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm) +{ + if (!vnmi) + return NULL; + + if (is_guest_mode(&svm->vcpu)) + return NULL; + else + return svm->vmcb01.ptr; +} + +static inline bool is_vnmi_enabled(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = get_vnmi_vmcb_l1(svm); + + if (vmcb) + return !!(vmcb->control.int_ctl & V_NMI_ENABLE_MASK); + else + return false; +} + /* svm.c */ #define MSR_INVALID 0xffffffffU diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index cff838f15db5..f85bc617ffe4 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -6,6 +6,8 @@ #ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__ #define __ARCH_X86_KVM_SVM_ONHYPERV_H__ +#include <asm/mshyperv.h> + #if IS_ENABLED(CONFIG_HYPERV) #include "kvm_onhyperv.h" @@ -15,6 +17,14 @@ static struct kvm_x86_ops svm_x86_ops; int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu); +static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) +{ + struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments; + + return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB && + !!hve->hv_enlightenments_control.enlightened_npt_tlb; +} + static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; @@ -35,9 +45,8 @@ static inline __init void svm_hv_hardware_setup(void) if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); - svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; - svm_x86_ops.tlb_remote_flush_with_range = - hv_remote_flush_tlb_with_range; + svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { @@ -80,6 +89,11 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu) } #else +static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu) +{ + return false; +} + static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { } diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 22daca752797..79450e1ed7cf 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -13,7 +13,110 @@ #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK -DEFINE_STATIC_KEY_FALSE(enable_evmcs); +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ @@ -506,6 +609,8 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) } #if IS_ENABLED(CONFIG_HYPERV) +DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); + /* * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption * is: in case a feature has corresponding fields in eVMCS described and it was diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 78d17667e7ec..9623fe1651c4 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -16,117 +16,10 @@ struct vmcs_config; -DECLARE_STATIC_KEY_FALSE(enable_evmcs); - #define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) #define KVM_EVMCS_VERSION 1 -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_SUPPORTED_PINCTRL \ - (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - PIN_BASED_EXT_INTR_MASK | \ - PIN_BASED_NMI_EXITING | \ - PIN_BASED_VIRTUAL_NMIS) - -#define EVMCS1_SUPPORTED_EXEC_CTRL \ - (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ - CPU_BASED_HLT_EXITING | \ - CPU_BASED_CR3_LOAD_EXITING | \ - CPU_BASED_CR3_STORE_EXITING | \ - CPU_BASED_UNCOND_IO_EXITING | \ - CPU_BASED_MOV_DR_EXITING | \ - CPU_BASED_USE_TSC_OFFSETTING | \ - CPU_BASED_MWAIT_EXITING | \ - CPU_BASED_MONITOR_EXITING | \ - CPU_BASED_INVLPG_EXITING | \ - CPU_BASED_RDPMC_EXITING | \ - CPU_BASED_INTR_WINDOW_EXITING | \ - CPU_BASED_CR8_LOAD_EXITING | \ - CPU_BASED_CR8_STORE_EXITING | \ - CPU_BASED_RDTSC_EXITING | \ - CPU_BASED_TPR_SHADOW | \ - CPU_BASED_USE_IO_BITMAPS | \ - CPU_BASED_MONITOR_TRAP_FLAG | \ - CPU_BASED_USE_MSR_BITMAPS | \ - CPU_BASED_NMI_WINDOW_EXITING | \ - CPU_BASED_PAUSE_EXITING | \ - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) - -#define EVMCS1_SUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ - SECONDARY_EXEC_WBINVD_EXITING | \ - SECONDARY_EXEC_ENABLE_VPID | \ - SECONDARY_EXEC_ENABLE_EPT | \ - SECONDARY_EXEC_UNRESTRICTED_GUEST | \ - SECONDARY_EXEC_DESC | \ - SECONDARY_EXEC_ENABLE_RDTSCP | \ - SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_XSAVES | \ - SECONDARY_EXEC_RDSEED_EXITING | \ - SECONDARY_EXEC_RDRAND_EXITING | \ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ - SECONDARY_EXEC_PT_USE_GPA | \ - SECONDARY_EXEC_PT_CONCEAL_VMX | \ - SECONDARY_EXEC_BUS_LOCK_DETECTION | \ - SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) - -#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) - -#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_EXIT_SAVE_DEBUG_CONTROLS | \ - VM_EXIT_ACK_INTR_ON_EXIT | \ - VM_EXIT_HOST_ADDR_SPACE_SIZE | \ - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_EXIT_SAVE_IA32_PAT | \ - VM_EXIT_LOAD_IA32_PAT | \ - VM_EXIT_SAVE_IA32_EFER | \ - VM_EXIT_LOAD_IA32_EFER | \ - VM_EXIT_CLEAR_BNDCFGS | \ - VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ - VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_IA32E_MODE | \ - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_ENTRY_LOAD_IA32_PAT | \ - VM_ENTRY_LOAD_IA32_EFER | \ - VM_ENTRY_LOAD_BNDCFGS | \ - VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) - -#define EVMCS1_SUPPORTED_VMFUNC (0) - struct evmcs_field { u16 offset; u16 clean_field; @@ -174,6 +67,13 @@ static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs, #if IS_ENABLED(CONFIG_HYPERV) +DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs); + +static __always_inline bool kvm_is_using_evmcs(void) +{ + return static_branch_unlikely(&__kvm_is_using_evmcs); +} + static __always_inline int get_evmcs_offset(unsigned long field, u16 *clean_field) { @@ -263,6 +163,7 @@ static inline void evmcs_load(u64 phys_addr) void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ +static __always_inline bool kvm_is_using_evmcs(void) { return false; } static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static __always_inline void evmcs_write32(unsigned long field, u32 value) {} static __always_inline void evmcs_write16(unsigned long field, u16 value) {} diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1bc2b80273c9..e35cf0bd0df9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -358,6 +358,7 @@ static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, gpa_t addr) { + unsigned long roots = 0; uint i; struct kvm_mmu_root_info *cached_root; @@ -368,8 +369,10 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, eptp)) - vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + roots |= KVM_MMU_ROOT_PREVIOUS(i); } + if (roots) + kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); } static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, @@ -654,6 +657,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_FLUSH_CMD, MSR_TYPE_W); + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); vmx->nested.force_msr_bitmap_recalc = false; @@ -3868,7 +3874,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) exit_qual = 0; } - if (ex->has_error_code) { + /* + * Unlike AMD's Paged Real Mode, which reports an error code on #PF + * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the + * "has error code" flags on VM-Exit if the CPU is in Real Mode. + */ + if (ex->has_error_code && is_protmode(vcpu)) { /* * Intel CPUs do not generate error codes with bits 31:16 set, * and more importantly VMX disallows setting bits 31:16 in the @@ -4478,7 +4489,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * CR0_GUEST_HOST_MASK is already set in the original vmcs01 * (KVM doesn't change it); */ - vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs12->host_cr0); /* Same as above - no reason to call set_cr4_guest_host_mask(). */ @@ -4629,7 +4640,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) */ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); - vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); @@ -5151,7 +5162,7 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) * does force CR0.PE=1, but only to also force VM86 in order to emulate * Real Mode, and so there's no need to check CR0.PE manually. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6750,36 +6761,9 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void) return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; } -/* - * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be - * returned for the various VMX controls MSRs when nested VMX is enabled. - * The same values should also be used to verify that vmcs12 control fields are - * valid during nested entry from L1 to L2. - * Each of these control msrs has a low and high 32-bit half: A low bit is on - * if the corresponding bit in the (32-bit) control field *must* be on, and a - * bit in the high half is on if the corresponding bit in the control field - * may be on. See also vmx_control_verify(). - */ -void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) +static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) { - struct nested_vmx_msrs *msrs = &vmcs_conf->nested; - - /* - * Note that as a general rule, the high half of the MSRs (bits in - * the control fields which may be 1) should be initialized by the - * intersection of the underlying hardware's MSR (i.e., features which - * can be supported) and the list of features we want to expose - - * because they are known to be properly supported in our code. - * Also, usually, the low half of the MSRs (bits which must be 1) can - * be set to 0, meaning that L1 may turn off any of these bits. The - * reason is that if one of these bits is necessary, it will appear - * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control - * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_l1_wants_exit() will not pass related exits to L1. - * These rules have exceptions below. - */ - - /* pin-based controls */ msrs->pinbased_ctls_low = PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6792,8 +6776,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; +} - /* exit controls */ +static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6812,8 +6799,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; +} - /* entry controls */ +static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6829,8 +6819,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; +} - /* cpu-based controls */ +static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6862,12 +6855,12 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) /* We support free control of CR3 access interception. */ msrs->procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); +} - /* - * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by - * vmx_vcpu_after_set_cpuid. - */ +static void nested_vmx_setup_secondary_ctls(u32 ept_caps, + struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; @@ -6945,8 +6938,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) if (enable_sgx) msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; +} - /* miscellaneous data */ +static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | @@ -6954,7 +6950,10 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; msrs->misc_high = 0; +} +static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) +{ /* * This MSR reports some information about VMX support. We * should return information about the VMX we emulate for the @@ -6969,7 +6968,10 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; +} +static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) +{ /* * These MSRs specify bits which the guest must keep fixed on * while L1 is in VMXON mode (in L1's root mode, or running an L2). @@ -6986,6 +6988,51 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; +} + +/* + * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be + * returned for the various VMX controls MSRs when nested VMX is enabled. + * The same values should also be used to verify that vmcs12 control fields are + * valid during nested entry from L1 to L2. + * Each of these control msrs has a low and high 32-bit half: A low bit is on + * if the corresponding bit in the (32-bit) control field *must* be on, and a + * bit in the high half is on if the corresponding bit in the control field + * may be on. See also vmx_control_verify(). + */ +void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) +{ + struct nested_vmx_msrs *msrs = &vmcs_conf->nested; + + /* + * Note that as a general rule, the high half of the MSRs (bits in + * the control fields which may be 1) should be initialized by the + * intersection of the underlying hardware's MSR (i.e., features which + * can be supported) and the list of features we want to expose - + * because they are known to be properly supported in our code. + * Also, usually, the low half of the MSRs (bits which must be 1) can + * be set to 0, meaning that L1 may turn off any of these bits. The + * reason is that if one of these bits is necessary, it will appear + * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control + * fields of vmcs01 and vmcs02, will turn these bits off - and + * nested_vmx_l1_wants_exit() will not pass related exits to L1. + * These rules have exceptions below. + */ + nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); + + nested_vmx_setup_exit_ctls(vmcs_conf, msrs); + + nested_vmx_setup_entry_ctls(vmcs_conf, msrs); + + nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); + + nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); + + nested_vmx_setup_misc_data(vmcs_conf, msrs); + + nested_vmx_setup_basic(msrs); + + nested_vmx_setup_cr_fixed(msrs); msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e8a3be0b9df9..741efe2c497b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -57,7 +57,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - kvm_pmu_request_counter_reprogam(pmc); + kvm_pmu_request_counter_reprogram(pmc); } } @@ -76,13 +76,13 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) { int bit; - struct kvm_pmc *pmc; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { - pmc = intel_pmc_idx_to_pmc(pmu, bit); - if (pmc) - kvm_pmu_request_counter_reprogam(pmc); - } + if (!diff) + return; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + set_bit(bit, pmu->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); } static bool intel_hw_event_available(struct kvm_pmc *pmc) @@ -351,45 +351,47 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: msr_info->data = pmu->fixed_ctr_ctrl; - return 0; + break; case MSR_CORE_PERF_GLOBAL_STATUS: msr_info->data = pmu->global_status; - return 0; + break; case MSR_CORE_PERF_GLOBAL_CTRL: msr_info->data = pmu->global_ctrl; - return 0; + break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; - return 0; + break; case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; - return 0; + break; case MSR_IA32_DS_AREA: msr_info->data = pmu->ds_area; - return 0; + break; case MSR_PEBS_DATA_CFG: msr_info->data = pmu->pebs_data_cfg; - return 0; + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_GP]; - return 0; + break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_FIXED]; - return 0; + break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { msr_info->data = pmc->eventsel; - return 0; - } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) - return 0; + break; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) { + break; + } + return 1; } - return 1; + return 0; } static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -402,44 +404,43 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & pmu->fixed_ctr_ctrl_mask)) { + if (data & pmu->fixed_ctr_ctrl_mask) + return 1; + + if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); - return 0; - } break; case MSR_CORE_PERF_GLOBAL_STATUS: - if (msr_info->host_initiated) { - pmu->global_status = data; - return 0; - } - break; /* RO MSR */ + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + + pmu->global_status = data; + break; case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (kvm_valid_perf_global_ctrl(pmu, data)) { + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { diff = pmu->global_ctrl ^ data; pmu->global_ctrl = data; reprogram_counters(pmu, diff); - return 0; } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & pmu->global_ovf_ctrl_mask)) { - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - return 0; - } + if (data & pmu->global_ovf_ctrl_mask) + return 1; + + if (!msr_info->host_initiated) + pmu->global_status &= ~data; break; case MSR_IA32_PEBS_ENABLE: - if (pmu->pebs_enable == data) - return 0; - if (!(data & pmu->pebs_enable_mask)) { + if (data & pmu->pebs_enable_mask) + return 1; + + if (pmu->pebs_enable != data) { diff = pmu->pebs_enable ^ data; pmu->pebs_enable = data; reprogram_counters(pmu, diff); - return 0; } break; case MSR_IA32_DS_AREA: @@ -447,15 +448,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (is_noncanonical_address(data, vcpu)) return 1; + pmu->ds_area = data; - return 0; + break; case MSR_PEBS_DATA_CFG: - if (pmu->pebs_data_cfg == data) - return 0; - if (!(data & pmu->pebs_data_cfg_mask)) { - pmu->pebs_data_cfg = data; - return 0; - } + if (data & pmu->pebs_data_cfg_mask) + return 1; + + pmu->pebs_data_cfg = data; break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || @@ -463,33 +463,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((msr & MSR_PMC_FULL_WIDTH_BIT) && (data & ~pmu->counter_bitmask[KVM_PMC_GP])) return 1; + if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); pmc_update_sample_period(pmc); - return 0; + break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); pmc_update_sample_period(pmc); - return 0; + break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; reserved_bits = pmu->reserved_bits; if ((pmc->idx == 2) && (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; - if (!(data & reserved_bits)) { + if (data & reserved_bits) + return 1; + + if (data != pmc->eventsel) { pmc->eventsel = data; - kvm_pmu_request_counter_reprogam(pmc); - return 0; + kvm_pmu_request_counter_reprogram(pmc); } - } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) - return 0; + break; + } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) { + break; + } + /* Not a known PMU MSR. */ + return 1; } - return 1; + return 0; } static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) @@ -531,6 +536,16 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; + memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); + + /* + * Setting passthrough of LBR MSRs is done only in the VM-Entry loop, + * and PMU refresh is disallowed after the vCPU has run, i.e. this code + * should never be reached while KVM is passing through MSRs. + */ + if (KVM_BUG_ON(lbr_desc->msr_passthrough, vcpu->kvm)) + return; + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry || !vcpu->kvm->arch.enable_pmu) return; diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index aa53c98034bf..0574030b071f 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -29,14 +29,14 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ *gva = offset; - if (!is_long_mode(vcpu)) { + if (!is_64_bit_mode(vcpu)) { vmx_get_segment(vcpu, &s, VCPU_SREG_DS); *gva += s.base; } if (!IS_ALIGNED(*gva, alignment)) { fault = true; - } else if (likely(is_long_mode(vcpu))) { + } else if (likely(is_64_bit_mode(vcpu))) { fault = is_noncanonical_address(*gva, vcpu); } else { *gva &= 0xffffffff; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d2d6e1b6c788..44fb619803b8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -164,6 +164,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_IA32_SPEC_CTRL, MSR_IA32_PRED_CMD, + MSR_IA32_FLUSH_CMD, MSR_IA32_TSC, #ifdef CONFIG_X86_64 MSR_FS_BASE, @@ -579,7 +580,7 @@ static __init void hv_init_evmcs(void) if (enlightened_vmcs) { pr_info("Using Hyper-V Enlightened VMCS\n"); - static_branch_enable(&enable_evmcs); + static_branch_enable(&__kvm_is_using_evmcs); } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) @@ -595,7 +596,7 @@ static void hv_reset_evmcs(void) { struct hv_vp_assist_page *vp_ap; - if (!static_branch_unlikely(&enable_evmcs)) + if (!kvm_is_using_evmcs()) return; /* @@ -1945,7 +1946,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, static int vmx_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); @@ -2030,7 +2031,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested_vmx_allowed(vcpu)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, @@ -2285,33 +2286,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; goto find_uret_msr; - case MSR_IA32_PRED_CMD: - if (!msr_info->host_initiated && - !guest_has_pred_cmd_msr(vcpu)) - return 1; - - if (data & ~PRED_CMD_IBPB) - return 1; - if (!boot_cpu_has(X86_FEATURE_IBPB)) - return 1; - if (!data) - break; - - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - - /* - * For non-nested: - * When it's written (to non-zero) for the first time, pass - * it through. - * - * For nested: - * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_prepare_msr_bitmap. We should not touch the - * vmcs02.msr_bitmap here since it gets completely overwritten - * in the merging. - */ - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); - break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) return 1; @@ -2366,7 +2340,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->msr_ia32_sgxlepubkeyhash [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ if (!nested_vmx_allowed(vcpu)) @@ -2816,8 +2790,7 @@ static int vmx_hardware_enable(void) * This can happen if we hot-added a CPU but failed to allocate * VP assist page for it. */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) + if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) return -EFAULT; intel_pt_handle_vmx(1); @@ -2869,7 +2842,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) memset(vmcs, 0, vmcs_config.size); /* KVM supports Enlightened VMCS v1 only */ - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else vmcs->hdr.revision_id = vmcs_config.revision_id; @@ -2964,7 +2937,7 @@ static __init int alloc_kvm_area(void) * still be marked with revision_id reported by * physical CPU. */ - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = vmcs_config.revision_id; per_cpu(vmxarea, cpu) = vmcs; @@ -3931,7 +3904,7 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed. */ - if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) { + if (kvm_is_using_evmcs()) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; if (evmcs->hv_enlightenments_control.msr_bitmap) @@ -4773,7 +4746,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) /* 22.2.1, 20.8.1 */ vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); - vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); set_cr4_guest_host_mask(vmx); @@ -5163,7 +5136,7 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) return true; - return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && + return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } @@ -5500,7 +5473,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; case 3: /* lmsw */ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); kvm_lmsw(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -6957,7 +6930,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) * real mode. */ return enable_unrestricted_guest || emulate_invalid_guest_state; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: case MSR_AMD64_TSC_RATIO: @@ -7310,7 +7283,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ - if (static_branch_unlikely(&enable_evmcs)) { + if (kvm_is_using_evmcs()) { current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; @@ -7440,7 +7413,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) * feature only for vmcs01, KVM currently isn't equipped to realize any * performance benefits from enabling it for vmcs02. */ - if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && + if (kvm_is_using_evmcs() && (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; @@ -7558,7 +7531,7 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; - if (kvm_read_cr0(vcpu) & X86_CR0_CD) { + if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) { if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else @@ -7744,6 +7717,13 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + if (boot_cpu_has(X86_FEATURE_IBPB)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, + !guest_has_pred_cmd_msr(vcpu)); + + if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, + !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); set_cr4_guest_host_mask(vmx); @@ -7776,9 +7756,11 @@ static u64 vmx_get_perf_capabilities(void) if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - x86_perf_get_lbr(&lbr); - if (lbr.nr) - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { + x86_perf_get_lbr(&lbr); + if (lbr.nr) + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; + } if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; @@ -7918,6 +7900,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ break; + case x86_intercept_pause: + /* + * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides + * with vanilla NOPs in the emulator. Apply the interception + * check only to actual PAUSE instructions. Don't check + * PAUSE-loop-exiting, software can't expect a given PAUSE to + * exit, i.e. KVM is within its rights to allow L2 to execute + * the PAUSE. + */ + if ((info->rep_prefix != REPE_PREFIX) || + !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + return X86EMUL_CONTINUE; + + break; + /* TODO: check more intercepts... */ default: break; @@ -8415,9 +8412,8 @@ static __init int hardware_setup(void) #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { - vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; - vmx_x86_ops.tlb_remote_flush_with_range = - hv_remote_flush_tlb_with_range; + vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; } #endif diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2acdc54bc34b..9e66531861cf 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -369,7 +369,7 @@ struct vcpu_vmx { struct lbr_desc lbr_desc; /* Save desired MSR intercept (read: pass-through) state */ -#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15 +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16 struct { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -640,6 +640,24 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) (1 << VCPU_EXREG_EXIT_INFO_1) | \ (1 << VCPU_EXREG_EXIT_INFO_2)) +static inline unsigned long vmx_l1_guest_owned_cr0_bits(void) +{ + unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS; + + /* + * CR0.WP needs to be intercepted when KVM is shadowing legacy paging + * in order to construct shadow PTEs with the correct protections. + * Note! CR0.WP technically can be passed through to the guest if + * paging is disabled, but checking CR0.PG would generate a cyclical + * dependency of sorts due to forcing the caller to ensure CR0 holds + * the correct value prior to determining which CR0 bits can be owned + * by L1. Keep it simple and limit the optimization to EPT. + */ + if (!enable_ept) + bits &= ~X86_CR0_WP; + return bits; +} + static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index db95bde52998..ce47dc265f89 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -147,7 +147,7 @@ do_exception: static __always_inline u16 vmcs_read16(unsigned long field) { vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_read16(field); return __vmcs_readl(field); } @@ -155,7 +155,7 @@ static __always_inline u16 vmcs_read16(unsigned long field) static __always_inline u32 vmcs_read32(unsigned long field) { vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_read32(field); return __vmcs_readl(field); } @@ -163,7 +163,7 @@ static __always_inline u32 vmcs_read32(unsigned long field) static __always_inline u64 vmcs_read64(unsigned long field) { vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_read64(field); #ifdef CONFIG_X86_64 return __vmcs_readl(field); @@ -175,7 +175,7 @@ static __always_inline u64 vmcs_read64(unsigned long field) static __always_inline unsigned long vmcs_readl(unsigned long field) { vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_read64(field); return __vmcs_readl(field); } @@ -222,7 +222,7 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val static __always_inline void vmcs_write16(unsigned long field, u16 value) { vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write16(field, value); __vmcs_writel(field, value); @@ -231,7 +231,7 @@ static __always_inline void vmcs_write16(unsigned long field, u16 value) static __always_inline void vmcs_write32(unsigned long field, u32 value) { vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write32(field, value); __vmcs_writel(field, value); @@ -240,7 +240,7 @@ static __always_inline void vmcs_write32(unsigned long field, u32 value) static __always_inline void vmcs_write64(unsigned long field, u64 value) { vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write64(field, value); __vmcs_writel(field, value); @@ -252,7 +252,7 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value) static __always_inline void vmcs_writel(unsigned long field, unsigned long value) { vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write64(field, value); __vmcs_writel(field, value); @@ -262,7 +262,7 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, "vmcs_clear_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write32(field, evmcs_read32(field) & ~mask); __vmcs_writel(field, __vmcs_readl(field) & ~mask); @@ -272,7 +272,7 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, "vmcs_set_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_write32(field, evmcs_read32(field) | mask); __vmcs_writel(field, __vmcs_readl(field) | mask); @@ -289,7 +289,7 @@ static inline void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - if (static_branch_unlikely(&enable_evmcs)) + if (kvm_is_using_evmcs()) return evmcs_load(phys_addr); vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7713420abab0..ceb7c5e9cf9e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -60,7 +60,9 @@ #include <linux/mem_encrypt.h> #include <linux/entry-kvm.h> #include <linux/suspend.h> +#include <linux/smp.h> +#include <trace/events/ipi.h> #include <trace/events/kvm.h> #include <asm/debugreg.h> @@ -194,7 +196,7 @@ bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); /* Enable/disable SMT_RSB bug mitigation */ -bool __read_mostly mitigate_smt_rsb; +static bool __read_mostly mitigate_smt_rsb; module_param(mitigate_smt_rsb, bool, 0444); /* @@ -802,8 +804,8 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, */ if ((fault->error_code & PFERR_PRESENT_MASK) && !(fault->error_code & PFERR_RSVD_MASK)) - kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, - fault_mmu->root.hpa); + kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address, + KVM_MMU_ROOT_CURRENT); fault_mmu->inject_page_fault(vcpu, fault); } @@ -841,7 +843,7 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { - if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE)) return true; kvm_queue_exception(vcpu, UD_VECTOR); @@ -906,6 +908,24 @@ EXPORT_SYMBOL_GPL(load_pdptrs); void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { + /* + * CR0.WP is incorporated into the MMU role, but only for non-nested, + * indirect shadow MMUs. If paging is disabled, no updates are needed + * as there are no permission bits to emulate. If TDP is enabled, the + * MMU's metadata needs to be updated, e.g. so that emulating guest + * translations does the right thing, but there's no need to unload the + * root as CR0.WP doesn't affect SPTEs. + */ + if ((cr0 ^ old_cr0) == X86_CR0_WP) { + if (!(cr0 & X86_CR0_PG)) + return; + + if (tdp_enabled) { + kvm_init_mmu(vcpu); + return; + } + } + if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); @@ -965,7 +985,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; if (!(cr0 & X86_CR0_PG) && - (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))) + (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; static_call(kvm_x86_set_cr0)(vcpu, cr0); @@ -987,7 +1007,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return; - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); @@ -1001,7 +1021,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (static_cpu_has(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || - kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) write_pkru(vcpu->arch.pkru); #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } @@ -1015,14 +1035,14 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (static_cpu_has(X86_FEATURE_PKU) && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || - kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) { + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); @@ -1178,9 +1198,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) - return 1; - /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) return 1; @@ -1227,7 +1244,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB * with PCIDE=0. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) return; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) @@ -1242,9 +1259,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) bool skip_tlb_flush = false; unsigned long pcid = 0; #ifdef CONFIG_X86_64 - bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - - if (pcid_enabled) { + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) { skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; cr3 &= ~X86_CR3_PCID_NOFLUSH; pcid = cr3 & X86_CR3_PCID_MASK; @@ -1543,39 +1558,41 @@ static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* - * List of msr numbers which are used to expose MSR-based features that - * can be used by a hypervisor to validate requested CPU features. + * List of MSRs that control the existence of MSR-based features, i.e. MSRs + * that are effectively CPUID leafs. VMX MSRs are also included in the set of + * feature MSRs, but are handled separately to allow expedited lookups. */ -static const u32 msr_based_features_all[] = { - MSR_IA32_VMX_BASIC, - MSR_IA32_VMX_TRUE_PINBASED_CTLS, - MSR_IA32_VMX_PINBASED_CTLS, - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, - MSR_IA32_VMX_PROCBASED_CTLS, - MSR_IA32_VMX_TRUE_EXIT_CTLS, - MSR_IA32_VMX_EXIT_CTLS, - MSR_IA32_VMX_TRUE_ENTRY_CTLS, - MSR_IA32_VMX_ENTRY_CTLS, - MSR_IA32_VMX_MISC, - MSR_IA32_VMX_CR0_FIXED0, - MSR_IA32_VMX_CR0_FIXED1, - MSR_IA32_VMX_CR4_FIXED0, - MSR_IA32_VMX_CR4_FIXED1, - MSR_IA32_VMX_VMCS_ENUM, - MSR_IA32_VMX_PROCBASED_CTLS2, - MSR_IA32_VMX_EPT_VPID_CAP, - MSR_IA32_VMX_VMFUNC, - +static const u32 msr_based_features_all_except_vmx[] = { MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, }; -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; static unsigned int num_msr_based_features; /* + * All feature MSRs except uCode revID, which tracks the currently loaded uCode + * patch, are immutable once the vCPU model is defined. + */ +static bool kvm_is_immutable_feature_msr(u32 msr) +{ + int i; + + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) + return true; + + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { + if (msr == msr_based_features_all_except_vmx[i]) + return msr != MSR_IA32_UCODE_REV; + } + + return false; +} + +/* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM * does not yet virtualize. These include: * 10 - MISC_PACKAGE_CTRLS @@ -2192,6 +2209,22 @@ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { + u64 val; + + /* + * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does + * not support modifying the guest vCPU model on the fly, e.g. changing + * the nVMX capabilities while L2 is running is nonsensical. Ignore + * writes of the same value, e.g. to allow userspace to blindly stuff + * all MSRs when emulating RESET. + */ + if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) { + if (do_get_msr(vcpu, index, &val) || *data != val) + return -EINVAL; + + return 0; + } + return kvm_set_msr_ignored_check(vcpu, index, *data, true); } @@ -3614,9 +3647,40 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~kvm_caps.supported_perf_cap) return 1; + /* + * Note, this is not just a performance optimization! KVM + * disallows changing feature MSRs after the vCPU has run; PMU + * refresh will bug the VM if called after the vCPU has run. + */ + if (vcpu->arch.perf_capabilities == data) + break; + vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); - return 0; + break; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu)) + return 1; + + if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB)) + return 1; + if (!data) + break; + + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + break; + case MSR_IA32_FLUSH_CMD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) + return 1; + + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) + return 1; + if (!data) + break; + + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + break; case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -4432,6 +4496,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4531,9 +4596,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 0; break; case KVM_CAP_XSAVE2: { - u64 guest_perm = xstate_get_guest_group_perm(); - - r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); + r = xstate_required_size(kvm_get_filtered_xcr0(), false); if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; @@ -5033,7 +5096,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { + !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } @@ -5125,7 +5188,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending != 0; + events->nmi.pending = kvm_get_nr_pending_nmis(vcpu); events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); /* events->sipi_vector is never valid when reporting to user space */ @@ -5212,8 +5275,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; - if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) - vcpu->arch.nmi_pending = events->nmi.pending; + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) { + vcpu->arch.nmi_pending = 0; + atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending); + kvm_make_request(KVM_REQ_NMI, vcpu); + } static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && @@ -6021,11 +6087,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) -{ - return kvm->arch.n_max_mmu_pages; -} - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { struct kvm_pic *pic = kvm->arch.vpic; @@ -6672,8 +6733,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; @@ -6711,9 +6771,6 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); @@ -7018,6 +7075,18 @@ out: return r; } +static void kvm_probe_feature_msr(u32 msr_index) +{ + struct kvm_msr_entry msr = { + .index = msr_index, + }; + + if (kvm_get_msr_feature(&msr)) + return; + + msr_based_features[num_msr_based_features++] = msr_index; +} + static void kvm_probe_msr_to_save(u32 msr_index) { u32 dummy[2]; @@ -7093,7 +7162,7 @@ static void kvm_probe_msr_to_save(u32 msr_index) msrs_to_save[num_msrs_to_save++] = msr_index; } -static void kvm_init_msr_list(void) +static void kvm_init_msr_lists(void) { unsigned i; @@ -7119,15 +7188,11 @@ static void kvm_init_msr_list(void) emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { - struct kvm_msr_entry msr; + for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++) + kvm_probe_feature_msr(i); - msr.index = msr_based_features_all[i]; - if (kvm_get_msr_feature(&msr)) - continue; - - msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; - } + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) + kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]); } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -8463,7 +8528,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - bool write_fault_to_shadow_pgtable, int emulation_type) { gpa_t gpa = cr2_or_gpa; @@ -8534,7 +8598,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * be fixed by unprotecting shadow page and it should * be reported to userspace. */ - return !write_fault_to_shadow_pgtable; + return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, @@ -8782,20 +8846,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt; if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len))) return 1; vcpu->arch.l1tf_flush_l1d = true; - /* - * Clear write_fault_to_shadow_pgtable here to ensure it is - * never reused. - */ - write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; - vcpu->arch.write_fault_to_shadow_pgtable = false; - if (!(emulation_type & EMULTYPE_NO_DECODE)) { kvm_clear_exception_queue(vcpu); @@ -8816,7 +8872,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return 1; } if (reexecute_instruction(vcpu, cr2_or_gpa, - write_fault_to_spt, emulation_type)) return 1; @@ -8895,14 +8950,15 @@ restart: return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, - emulation_type)) + if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) return 1; return handle_emulation_failure(vcpu, emulation_type); } if (ctxt->have_exception) { + WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write); + vcpu->mmio_needed = false; r = 1; inject_emulated_exception(vcpu); } else if (vcpu->arch.pio.count) { @@ -9472,7 +9528,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_caps.max_guest_tsc_khz = max; } kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; - kvm_init_msr_list(); + kvm_init_msr_lists(); return 0; out_unwind_ops: @@ -9803,7 +9859,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = attrs; - vcpu->run->hypercall.longmode = op_64_bit; + vcpu->run->hypercall.flags = 0; + if (op_64_bit) + vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; + + WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); vcpu->arch.complete_userspace_io = complete_hypercall_exit; return 0; } @@ -9906,13 +9966,20 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + /* + * Suppress the error code if the vCPU is in Real Mode, as Real Mode + * exceptions don't report error codes. The presence of an error code + * is carried with the exception and only stripped when the exception + * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do + * report an error code despite the CPU being in Real Mode. + */ + vcpu->arch.exception.has_error_code &= is_protmode(vcpu); + trace_kvm_inj_exception(vcpu->arch.exception.vector, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.injected); - if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) - vcpu->arch.exception.error_code = false; static_call(kvm_x86_inject_exception)(vcpu); } @@ -10158,19 +10225,46 @@ out: static void process_nmi(struct kvm_vcpu *vcpu) { - unsigned limit = 2; + unsigned int limit; /* - * x86 is limited to one NMI running, and one NMI pending after it. - * If an NMI is already in progress, limit further NMIs to just one. - * Otherwise, allow two (and we'll inject the first one immediately). + * x86 is limited to one NMI pending, but because KVM can't react to + * incoming NMIs as quickly as bare metal, e.g. if the vCPU is + * scheduled out, KVM needs to play nice with two queued NMIs showing + * up at the same time. To handle this scenario, allow two NMIs to be + * (temporarily) pending so long as NMIs are not blocked and KVM is not + * waiting for a previous NMI injection to complete (which effectively + * blocks NMIs). KVM will immediately inject one of the two NMIs, and + * will request an NMI window to handle the second NMI. */ if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) limit = 1; + else + limit = 2; + + /* + * Adjust the limit to account for pending virtual NMIs, which aren't + * tracked in vcpu->arch.nmi_pending. + */ + if (static_call(kvm_x86_is_vnmi_pending)(vcpu)) + limit--; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); - kvm_make_request(KVM_REQ_EVENT, vcpu); + + if (vcpu->arch.nmi_pending && + (static_call(kvm_x86_set_vnmi_pending)(vcpu))) + vcpu->arch.nmi_pending--; + + if (vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + +/* Return total number of NMIs pending injection to the VM */ +int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.nmi_pending + + static_call(kvm_x86_is_vnmi_pending)(vcpu); } void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, @@ -13256,7 +13350,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE); switch (type) { case INVPCID_TYPE_INDIV_ADDR: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a8167b47b8c8..c544602d07a3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -3,6 +3,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include <asm/fpu/xstate.h> #include <asm/mce.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" @@ -40,6 +41,14 @@ void kvm_spurious_fault(void); failed; \ }) +/* + * The first...last VMX feature MSRs that are emulated by KVM. This may or may + * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an + * associated feature that KVM supports for nested virtualization. + */ +#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC +#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC + #define KVM_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 #define KVM_DEFAULT_PLE_WINDOW_GROW 2 @@ -83,6 +92,11 @@ static inline unsigned int __shrink_ple_window(unsigned int val, void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); +static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.last_vmentry_cpu != -1; +} + static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu) { return vcpu->arch.exception.pending || @@ -123,15 +137,15 @@ static inline bool kvm_exception_is_soft(unsigned int nr) static inline bool is_protmode(struct kvm_vcpu *vcpu) { - return kvm_read_cr0_bits(vcpu, X86_CR0_PE); + return kvm_is_cr0_bit_set(vcpu, X86_CR0_PE); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) +static inline bool is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - return vcpu->arch.efer & EFER_LMA; + return !!(vcpu->arch.efer & EFER_LMA); #else - return 0; + return false; #endif } @@ -171,19 +185,19 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; } -static inline int is_pae(struct kvm_vcpu *vcpu) +static inline bool is_pae(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); + return kvm_is_cr4_bit_set(vcpu, X86_CR4_PAE); } -static inline int is_pse(struct kvm_vcpu *vcpu) +static inline bool is_pse(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); + return kvm_is_cr4_bit_set(vcpu, X86_CR4_PSE); } -static inline int is_paging(struct kvm_vcpu *vcpu) +static inline bool is_paging(struct kvm_vcpu *vcpu) { - return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); + return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG)); } static inline bool is_pae_paging(struct kvm_vcpu *vcpu) @@ -193,7 +207,7 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu) static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { - return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; + return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48; } static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) @@ -315,6 +329,34 @@ extern struct kvm_caps kvm_caps; extern bool enable_pmu; +/* + * Get a filtered version of KVM's supported XCR0 that strips out dynamic + * features for which the current process doesn't (yet) have permission to use. + * This is intended to be used only when enumerating support to userspace, + * e.g. in KVM_GET_SUPPORTED_CPUID and KVM_CAP_XSAVE2, it does NOT need to be + * used to check/restrict guest behavior as KVM rejects KVM_SET_CPUID{2} if + * userspace attempts to enable unpermitted features. + */ +static inline u64 kvm_get_filtered_xcr0(void) +{ + u64 permitted_xcr0 = kvm_caps.supported_xcr0; + + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); + + if (permitted_xcr0 & XFEATURE_MASK_USER_DYNAMIC) { + permitted_xcr0 &= xstate_get_guest_group_perm(); + + /* + * Treat XTILE_CFG as unsupported if the current process isn't + * allowed to use XTILE_DATA, as attempting to set XTILE_CFG in + * XCR0 without setting XTILE_DATA is architecturally illegal. + */ + if (!(permitted_xcr0 & XFEATURE_MASK_XTILE_DATA)) + permitted_xcr0 &= ~XFEATURE_MASK_XTILE_CFG; + } + return permitted_xcr0; +} + static inline bool kvm_mpx_supported(void) { return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4f1a40a86534..01932af64193 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y) endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o + lib-y += copy_user_64.o copy_user_uncached_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ecbfb4dd3b01..f74a3e704a1c 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms) * Input: * rdi destination * rcx count + * rax is zero * * Output: * rcx: uncleared bytes or 0 if successful. */ -SYM_FUNC_START(clear_user_original) - /* - * Copy only the lower 32 bits of size as that is enough to handle the rest bytes, - * i.e., no need for a 'q' suffix and thus a REX prefix. - */ - mov %ecx,%eax - shr $3,%rcx - jz .Lrest_bytes +SYM_FUNC_START(rep_stos_alternative) + cmpq $64,%rcx + jae .Lunrolled - # do the qwords first - .p2align 4 -.Lqwords: - movq $0,(%rdi) - lea 8(%rdi),%rdi - dec %rcx - jnz .Lqwords + cmp $8,%ecx + jae .Lword -.Lrest_bytes: - and $7, %eax - jz .Lexit + testl %ecx,%ecx + je .Lexit - # now do the rest bytes -.Lbytes: - movb $0,(%rdi) +.Lclear_user_tail: +0: movb %al,(%rdi) inc %rdi - dec %eax - jnz .Lbytes - + dec %rcx + jnz .Lclear_user_tail .Lexit: - /* - * %rax still needs to be cleared in the exception case because this function is called - * from inline asm and the compiler expects %rax to be zero when exiting the inline asm, - * in case it might reuse it somewhere. - */ - xor %eax,%eax - RET - -.Lqwords_exception: - # convert remaining qwords back into bytes to return to caller - shl $3, %rcx - and $7, %eax - add %rax,%rcx - jmp .Lexit - -.Lbytes_exception: - mov %eax,%ecx - jmp .Lexit - - _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception) - _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception) -SYM_FUNC_END(clear_user_original) -EXPORT_SYMBOL(clear_user_original) - -/* - * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is - * present. - * Input: - * rdi destination - * rcx count - * - * Output: - * rcx: uncleared bytes or 0 if successful. - */ -SYM_FUNC_START(clear_user_rep_good) - # call the original thing for less than a cacheline - cmp $64, %rcx - jb clear_user_original - -.Lprep: - # copy lower 32-bits for rest bytes - mov %ecx, %edx - shr $3, %rcx - jz .Lrep_good_rest_bytes - -.Lrep_good_qwords: - rep stosq - -.Lrep_good_rest_bytes: - and $7, %edx - jz .Lrep_good_exit - -.Lrep_good_bytes: - mov %edx, %ecx - rep stosb - -.Lrep_good_exit: - # see .Lexit comment above - xor %eax, %eax RET -.Lrep_good_qwords_exception: - # convert remaining qwords back into bytes to return to caller - shl $3, %rcx - and $7, %edx - add %rdx, %rcx - jmp .Lrep_good_exit + _ASM_EXTABLE_UA( 0b, .Lexit) - _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception) - _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit) -SYM_FUNC_END(clear_user_rep_good) -EXPORT_SYMBOL(clear_user_rep_good) +.Lword: +1: movq %rax,(%rdi) + addq $8,%rdi + sub $8,%ecx + je .Lexit + cmp $8,%ecx + jae .Lword + jmp .Lclear_user_tail -/* - * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present. - * Input: - * rdi destination - * rcx count - * - * Output: - * rcx: uncleared bytes or 0 if successful. - * - */ -SYM_FUNC_START(clear_user_erms) - # call the original thing for less than a cacheline - cmp $64, %rcx - jb clear_user_original - -.Lerms_bytes: - rep stosb - -.Lerms_exit: - xorl %eax,%eax + .p2align 4 +.Lunrolled: +10: movq %rax,(%rdi) +11: movq %rax,8(%rdi) +12: movq %rax,16(%rdi) +13: movq %rax,24(%rdi) +14: movq %rax,32(%rdi) +15: movq %rax,40(%rdi) +16: movq %rax,48(%rdi) +17: movq %rax,56(%rdi) + addq $64,%rdi + subq $64,%rcx + cmpq $64,%rcx + jae .Lunrolled + cmpl $8,%ecx + jae .Lword + testl %ecx,%ecx + jne .Lclear_user_tail RET - _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit) -SYM_FUNC_END(clear_user_erms) -EXPORT_SYMBOL(clear_user_erms) + /* + * If we take an exception on any of the + * word stores, we know that %rcx isn't zero, + * so we can just go to the tail clearing to + * get the exact count. + * + * The unrolled case might end up clearing + * some bytes twice. Don't care. + * + * We could use the value in %rdi to avoid + * a second fault on the exact count case, + * but do we really care? No. + * + * Finally, we could try to align %rdi at the + * top of the unrolling. But unaligned stores + * just aren't that common or expensive. + */ + _ASM_EXTABLE_UA( 1b, .Lclear_user_tail) + _ASM_EXTABLE_UA(10b, .Lclear_user_tail) + _ASM_EXTABLE_UA(11b, .Lclear_user_tail) + _ASM_EXTABLE_UA(12b, .Lclear_user_tail) + _ASM_EXTABLE_UA(13b, .Lclear_user_tail) + _ASM_EXTABLE_UA(14b, .Lclear_user_tail) + _ASM_EXTABLE_UA(15b, .Lclear_user_tail) + _ASM_EXTABLE_UA(16b, .Lclear_user_tail) + _ASM_EXTABLE_UA(17b, .Lclear_user_tail) +SYM_FUNC_END(rep_stos_alternative) +EXPORT_SYMBOL(rep_stos_alternative) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 9dec1b38a98f..4fc5c2de2de4 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,404 +7,108 @@ */ #include <linux/linkage.h> -#include <asm/current.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/cpufeatures.h> -#include <asm/alternative.h> #include <asm/asm.h> -#include <asm/smap.h> #include <asm/export.h> -#include <asm/trapnr.h> - -.macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - - _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align) - _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align) -.endm /* - * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro - * code for rep movsq - * - * Input: - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_FUNC_START(copy_user_generic_unrolled) - ASM_STAC - cmpl $8,%edx - jb .Lcopy_user_short_string_bytes - ALIGN_DESTINATION - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz copy_user_short_string -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movq %r8,(%rdi) -6: movq %r9,1*8(%rdi) -7: movq %r10,2*8(%rdi) -8: movq %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movq %r8,4*8(%rdi) -14: movq %r9,5*8(%rdi) -15: movq %r10,6*8(%rdi) -16: movq %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz 1b - jmp copy_user_short_string - -30: shll $6,%ecx - addl %ecx,%edx - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 30b) - _ASM_EXTABLE_CPY(2b, 30b) - _ASM_EXTABLE_CPY(3b, 30b) - _ASM_EXTABLE_CPY(4b, 30b) - _ASM_EXTABLE_CPY(5b, 30b) - _ASM_EXTABLE_CPY(6b, 30b) - _ASM_EXTABLE_CPY(7b, 30b) - _ASM_EXTABLE_CPY(8b, 30b) - _ASM_EXTABLE_CPY(9b, 30b) - _ASM_EXTABLE_CPY(10b, 30b) - _ASM_EXTABLE_CPY(11b, 30b) - _ASM_EXTABLE_CPY(12b, 30b) - _ASM_EXTABLE_CPY(13b, 30b) - _ASM_EXTABLE_CPY(14b, 30b) - _ASM_EXTABLE_CPY(15b, 30b) - _ASM_EXTABLE_CPY(16b, 30b) -SYM_FUNC_END(copy_user_generic_unrolled) -EXPORT_SYMBOL(copy_user_generic_unrolled) - -/* Some CPUs run faster using the string copy instructions. - * This is also a lot simpler. Use them when possible. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. + * rep_movs_alternative - memory copy with exception handling. + * This version is for CPUs that don't have FSRM (Fast Short Rep Movs) * * Input: * rdi destination * rsi source - * rdx count + * rcx count * * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_FUNC_START(copy_user_generic_string) - ASM_STAC - cmpl $8,%edx - jb 2f /* less than 8 bytes, go to byte copy loop */ - ALIGN_DESTINATION - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx -1: rep movsq -2: movl %edx,%ecx -3: rep movsb - xorl %eax,%eax - ASM_CLAC - RET - -11: leal (%rdx,%rcx,8),%ecx -12: movl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 11b) - _ASM_EXTABLE_CPY(3b, 12b) -SYM_FUNC_END(copy_user_generic_string) -EXPORT_SYMBOL(copy_user_generic_string) - -/* - * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. - * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. - * - * Input: - * rdi destination - * rsi source - * rdx count + * rcx uncopied bytes or 0 if successful. * - * Output: - * eax uncopied bytes or 0 if successful. + * NOTE! The calling convention is very intentionally the same as + * for 'rep movs', so that we can rewrite the function call with + * just a plain 'rep movs' on machines that have FSRM. But to make + * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely. */ -SYM_FUNC_START(copy_user_enhanced_fast_string) - ASM_STAC - /* CPUs without FSRM should avoid rep movsb for short copies */ - ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM - movl %edx,%ecx -1: rep movsb - xorl %eax,%eax - ASM_CLAC +SYM_FUNC_START(rep_movs_alternative) + cmpq $64,%rcx + jae .Lunrolled + + cmp $8,%ecx + jae .Lword + + testl %ecx,%ecx + je .Lexit + +.Lcopy_user_tail: +0: movb (%rsi),%al +1: movb %al,(%rdi) + inc %rdi + inc %rsi + dec %rcx + jne .Lcopy_user_tail +.Lexit: RET -12: movl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 12b) -SYM_FUNC_END(copy_user_enhanced_fast_string) -EXPORT_SYMBOL(copy_user_enhanced_fast_string) - -/* - * Try to copy last bytes and clear the rest if needed. - * Since protection fault in copy_from/to_user is not a normal situation, - * it is not necessary to optimize tail handling. - * Don't try to copy the tail if machine check happened - * - * Input: - * eax trap number written by ex_handler_copy() - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) - cmp $X86_TRAP_MC,%eax - je 3f - - movl %edx,%ecx -1: rep movsb -2: mov %ecx,%eax - ASM_CLAC + _ASM_EXTABLE_UA( 0b, .Lexit) + _ASM_EXTABLE_UA( 1b, .Lexit) + + .p2align 4 +.Lword: +2: movq (%rsi),%rax +3: movq %rax,(%rdi) + addq $8,%rsi + addq $8,%rdi + sub $8,%ecx + je .Lexit + cmp $8,%ecx + jae .Lword + jmp .Lcopy_user_tail + + _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail) + _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) + + .p2align 4 +.Lunrolled: +10: movq (%rsi),%r8 +11: movq 8(%rsi),%r9 +12: movq 16(%rsi),%r10 +13: movq 24(%rsi),%r11 +14: movq %r8,(%rdi) +15: movq %r9,8(%rdi) +16: movq %r10,16(%rdi) +17: movq %r11,24(%rdi) +20: movq 32(%rsi),%r8 +21: movq 40(%rsi),%r9 +22: movq 48(%rsi),%r10 +23: movq 56(%rsi),%r11 +24: movq %r8,32(%rdi) +25: movq %r9,40(%rdi) +26: movq %r10,48(%rdi) +27: movq %r11,56(%rdi) + addq $64,%rsi + addq $64,%rdi + subq $64,%rcx + cmpq $64,%rcx + jae .Lunrolled + cmpl $8,%ecx + jae .Lword + testl %ecx,%ecx + jne .Lcopy_user_tail RET -3: - movl %edx,%eax - ASM_CLAC - RET - - _ASM_EXTABLE_CPY(1b, 2b) - -.Lcopy_user_handle_align: - addl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - -SYM_CODE_END(.Lcopy_user_handle_tail) - -/* - * Finish memcpy of less than 64 bytes. #AC should already be set. - * - * Input: - * rdi destination - * rsi source - * rdx count (< 64) - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_CODE_START_LOCAL(copy_user_short_string) - movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz .Lcopy_user_short_string_bytes -18: movq (%rsi),%r8 -19: movq %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz 18b -.Lcopy_user_short_string_bytes: - andl %edx,%edx - jz 23f - movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 21b -23: xor %eax,%eax - ASM_CLAC - RET - -40: leal (%rdx,%rcx,8),%edx - jmp 60f -50: movl %ecx,%edx /* ecx is zerorest also */ -60: jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(18b, 40b) - _ASM_EXTABLE_CPY(19b, 40b) - _ASM_EXTABLE_CPY(21b, 50b) - _ASM_EXTABLE_CPY(22b, 50b) -SYM_CODE_END(copy_user_short_string) - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination out of cache for more performance. - * - * Note: Cached memory copy is used when destination or size is not - * naturally aligned. That is: - * - Require 8-byte alignment when size is 8 bytes or larger. - * - Require 4-byte alignment when size is 4 bytes. - */ -SYM_FUNC_START(__copy_user_nocache) - ASM_STAC - - /* If size is less than 8 bytes, go to 4-byte copy */ - cmpl $8,%edx - jb .L_4b_nocache_copy_entry - - /* If destination is not 8-byte aligned, "cache" copy to align it */ - ALIGN_DESTINATION - - /* Set 4x8-byte copy count and remainder */ - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz .L_8b_nocache_copy_entry /* jump if count is 0 */ - - /* Perform 4x8-byte nocache loop-copy */ -.L_4x8b_nocache_copy_loop: -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz .L_4x8b_nocache_copy_loop - - /* Set 8-byte copy count and remainder */ -.L_8b_nocache_copy_entry: - movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz .L_4b_nocache_copy_entry /* jump if count is 0 */ - - /* Perform 8-byte nocache loop-copy */ -.L_8b_nocache_copy_loop: -20: movq (%rsi),%r8 -21: movnti %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz .L_8b_nocache_copy_loop - - /* If no byte left, we're done */ -.L_4b_nocache_copy_entry: - andl %edx,%edx - jz .L_finish_copy - - /* If destination is not 4-byte aligned, go to byte copy: */ - movl %edi,%ecx - andl $3,%ecx - jnz .L_1b_cache_copy_entry - - /* Set 4-byte copy count (1 or 0) and remainder */ - movl %edx,%ecx - andl $3,%edx - shrl $2,%ecx - jz .L_1b_cache_copy_entry /* jump if count is 0 */ - - /* Perform 4-byte nocache copy: */ -30: movl (%rsi),%r8d -31: movnti %r8d,(%rdi) - leaq 4(%rsi),%rsi - leaq 4(%rdi),%rdi - - /* If no bytes left, we're done: */ - andl %edx,%edx - jz .L_finish_copy - - /* Perform byte "cache" loop-copy for the remainder */ -.L_1b_cache_copy_entry: - movl %edx,%ecx -.L_1b_cache_copy_loop: -40: movb (%rsi),%al -41: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_1b_cache_copy_loop - - /* Finished copying; fence the prior stores */ -.L_finish_copy: - xorl %eax,%eax - ASM_CLAC - sfence - RET - -.L_fixup_4x8b_copy: - shll $6,%ecx - addl %ecx,%edx - jmp .L_fixup_handle_tail -.L_fixup_8b_copy: - lea (%rdx,%rcx,8),%rdx - jmp .L_fixup_handle_tail -.L_fixup_4b_copy: - lea (%rdx,%rcx,4),%rdx - jmp .L_fixup_handle_tail -.L_fixup_1b_copy: - movl %ecx,%edx -.L_fixup_handle_tail: - sfence - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy) - _ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy) - _ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy) - _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy) - _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy) - _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) -SYM_FUNC_END(__copy_user_nocache) -EXPORT_SYMBOL(__copy_user_nocache) + _ASM_EXTABLE_UA(10b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(11b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(12b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(13b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(14b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(15b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(16b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(17b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(20b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(21b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(22b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(23b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(24b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(25b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(26b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(27b, .Lcopy_user_tail) +SYM_FUNC_END(rep_movs_alternative) +EXPORT_SYMBOL(rep_movs_alternative) diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S new file mode 100644 index 000000000000..5c5f38d32672 --- /dev/null +++ b/arch/x86/lib/copy_user_uncached_64.S @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org> + */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/export.h> + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * + * This copies from user space into kernel space, but the kernel + * space accesses can take a machine check exception, so they too + * need exception handling. + * + * Note: only 32-bit and 64-bit stores have non-temporal versions, + * and we only use aligned versions. Any unaligned parts at the + * start or end of the copy will be done using normal cached stores. + * + * Input: + * rdi destination + * rsi source + * edx count + * + * Output: + * rax uncopied bytes or 0 if successful. + */ +SYM_FUNC_START(__copy_user_nocache) + /* If destination is not 7-byte aligned, we'll have to align it */ + testb $7,%dil + jne .Lalign + +.Lis_aligned: + cmp $64,%edx + jb .Lquadwords + + .p2align 4,0x90 +.Lunrolled: +10: movq (%rsi),%r8 +11: movq 8(%rsi),%r9 +12: movq 16(%rsi),%r10 +13: movq 24(%rsi),%r11 +20: movnti %r8,(%rdi) +21: movnti %r9,8(%rdi) +22: movnti %r10,16(%rdi) +23: movnti %r11,24(%rdi) +30: movq 32(%rsi),%r8 +31: movq 40(%rsi),%r9 +32: movq 48(%rsi),%r10 +33: movq 56(%rsi),%r11 +40: movnti %r8,32(%rdi) +41: movnti %r9,40(%rdi) +42: movnti %r10,48(%rdi) +43: movnti %r11,56(%rdi) + + addq $64,%rsi + addq $64,%rdi + sub $64,%edx + cmp $64,%edx + jae .Lunrolled + +/* + * First set of user mode loads have been done + * without any stores, so if they fail, we can + * just try the non-unrolled loop. + */ +_ASM_EXTABLE_UA(10b, .Lquadwords) +_ASM_EXTABLE_UA(11b, .Lquadwords) +_ASM_EXTABLE_UA(12b, .Lquadwords) +_ASM_EXTABLE_UA(13b, .Lquadwords) + +/* + * The second set of user mode loads have been + * done with 32 bytes stored to the destination, + * so we need to take that into account before + * falling back to the unrolled loop. + */ +_ASM_EXTABLE_UA(30b, .Lfixup32) +_ASM_EXTABLE_UA(31b, .Lfixup32) +_ASM_EXTABLE_UA(32b, .Lfixup32) +_ASM_EXTABLE_UA(33b, .Lfixup32) + +/* + * An exception on a write means that we're + * done, but we need to update the count + * depending on where in the unrolled loop + * we were. + */ +_ASM_EXTABLE_UA(20b, .Ldone0) +_ASM_EXTABLE_UA(21b, .Ldone8) +_ASM_EXTABLE_UA(22b, .Ldone16) +_ASM_EXTABLE_UA(23b, .Ldone24) +_ASM_EXTABLE_UA(40b, .Ldone32) +_ASM_EXTABLE_UA(41b, .Ldone40) +_ASM_EXTABLE_UA(42b, .Ldone48) +_ASM_EXTABLE_UA(43b, .Ldone56) + +.Lquadwords: + cmp $8,%edx + jb .Llong +50: movq (%rsi),%rax +51: movnti %rax,(%rdi) + addq $8,%rsi + addq $8,%rdi + sub $8,%edx + jmp .Lquadwords + +/* + * If we fail on the last full quadword, we will + * not try to do any byte-wise cached accesses. + * We will try to do one more 4-byte uncached + * one, though. + */ +_ASM_EXTABLE_UA(50b, .Llast4) +_ASM_EXTABLE_UA(51b, .Ldone0) + +.Llong: + test $4,%dl + je .Lword +60: movl (%rsi),%eax +61: movnti %eax,(%rdi) + addq $4,%rsi + addq $4,%rdi + sub $4,%edx +.Lword: + sfence + test $2,%dl + je .Lbyte +70: movw (%rsi),%ax +71: movw %ax,(%rdi) + addq $2,%rsi + addq $2,%rdi + sub $2,%edx +.Lbyte: + test $1,%dl + je .Ldone +80: movb (%rsi),%al +81: movb %al,(%rdi) + dec %edx +.Ldone: + mov %edx,%eax + RET + +/* + * If we fail on the last four bytes, we won't + * bother with any fixups. It's dead, Jim. Note + * that there's no need for 'sfence' for any + * of this, since the exception will have been + * serializing. + */ +_ASM_EXTABLE_UA(60b, .Ldone) +_ASM_EXTABLE_UA(61b, .Ldone) +_ASM_EXTABLE_UA(70b, .Ldone) +_ASM_EXTABLE_UA(71b, .Ldone) +_ASM_EXTABLE_UA(80b, .Ldone) +_ASM_EXTABLE_UA(81b, .Ldone) + +/* + * This is the "head needs aliging" case when + * the destination isn't 8-byte aligned. The + * 4-byte case can be done uncached, but any + * smaller alignment is done with regular stores. + */ +.Lalign: + test $1,%dil + je .Lalign_word + test %edx,%edx + je .Ldone +90: movb (%rsi),%al +91: movb %al,(%rdi) + inc %rsi + inc %rdi + dec %edx +.Lalign_word: + test $2,%dil + je .Lalign_long + cmp $2,%edx + jb .Lbyte +92: movw (%rsi),%ax +93: movw %ax,(%rdi) + addq $2,%rsi + addq $2,%rdi + sub $2,%edx +.Lalign_long: + test $4,%dil + je .Lis_aligned + cmp $4,%edx + jb .Lword +94: movl (%rsi),%eax +95: movnti %eax,(%rdi) + addq $4,%rsi + addq $4,%rdi + sub $4,%edx + jmp .Lis_aligned + +/* + * If we fail on the initial alignment accesses, + * we're all done. Again, no point in trying to + * do byte-by-byte probing if the 4-byte load + * fails - we're not doing any uncached accesses + * any more. + */ +_ASM_EXTABLE_UA(90b, .Ldone) +_ASM_EXTABLE_UA(91b, .Ldone) +_ASM_EXTABLE_UA(92b, .Ldone) +_ASM_EXTABLE_UA(93b, .Ldone) +_ASM_EXTABLE_UA(94b, .Ldone) +_ASM_EXTABLE_UA(95b, .Ldone) + +/* + * Exception table fixups for faults in the middle + */ +.Ldone56: sub $8,%edx +.Ldone48: sub $8,%edx +.Ldone40: sub $8,%edx +.Ldone32: sub $8,%edx +.Ldone24: sub $8,%edx +.Ldone16: sub $8,%edx +.Ldone8: sub $8,%edx +.Ldone0: + mov %edx,%eax + RET + +.Lfixup32: + addq $32,%rsi + addq $32,%rdi + sub $32,%edx + jmp .Lquadwords + +.Llast4: +52: movl (%rsi),%eax +53: movnti %eax,(%rdi) + sfence + sub $4,%edx + mov %edx,%eax + RET +_ASM_EXTABLE_UA(52b, .Ldone0) +_ASM_EXTABLE_UA(53b, .Ldone0) + +SYM_FUNC_END(__copy_user_nocache) +EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index b70d98d79a9d..b64a2bd1a1ef 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -37,22 +37,22 @@ #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC -#ifdef CONFIG_X86_5LEVEL -#define LOAD_TASK_SIZE_MINUS_N(n) \ - ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \ - __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57 -#else -#define LOAD_TASK_SIZE_MINUS_N(n) \ - mov $(TASK_SIZE_MAX - (n)),%_ASM_DX -#endif +.macro check_range size:req +.if IS_ENABLED(CONFIG_X86_64) + mov %rax, %rdx + sar $63, %rdx + or %rdx, %rax +.else + cmp $TASK_SIZE_MAX-\size+1, %eax + jae .Lbad_get_user + sbb %edx, %edx /* array_index_mask_nospec() */ + and %edx, %eax +.endif +.endm .text SYM_FUNC_START(__get_user_1) - LOAD_TASK_SIZE_MINUS_N(0) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX + check_range size=1 ASM_STAC 1: movzbl (%_ASM_AX),%edx xor %eax,%eax @@ -62,11 +62,7 @@ SYM_FUNC_END(__get_user_1) EXPORT_SYMBOL(__get_user_1) SYM_FUNC_START(__get_user_2) - LOAD_TASK_SIZE_MINUS_N(1) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX + check_range size=2 ASM_STAC 2: movzwl (%_ASM_AX),%edx xor %eax,%eax @@ -76,11 +72,7 @@ SYM_FUNC_END(__get_user_2) EXPORT_SYMBOL(__get_user_2) SYM_FUNC_START(__get_user_4) - LOAD_TASK_SIZE_MINUS_N(3) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX + check_range size=4 ASM_STAC 3: movl (%_ASM_AX),%edx xor %eax,%eax @@ -90,30 +82,17 @@ SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) -#ifdef CONFIG_X86_64 - LOAD_TASK_SIZE_MINUS_N(7) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX + check_range size=8 ASM_STAC +#ifdef CONFIG_X86_64 4: movq (%_ASM_AX),%rdx - xor %eax,%eax - ASM_CLAC - RET #else - LOAD_TASK_SIZE_MINUS_N(7) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user_8 - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX - ASM_STAC 4: movl (%_ASM_AX),%edx 5: movl 4(%_ASM_AX),%ecx +#endif xor %eax,%eax ASM_CLAC RET -#endif SYM_FUNC_END(__get_user_8) EXPORT_SYMBOL(__get_user_8) @@ -166,7 +145,7 @@ EXPORT_SYMBOL(__get_user_nocheck_8) SYM_CODE_START_LOCAL(.Lbad_get_user_clac) ASM_CLAC -bad_get_user: +.Lbad_get_user: xor %edx,%edx mov $(-EFAULT),%_ASM_AX RET @@ -184,23 +163,23 @@ SYM_CODE_END(.Lbad_get_user_8_clac) #endif /* get_user */ - _ASM_EXTABLE_UA(1b, .Lbad_get_user_clac) - _ASM_EXTABLE_UA(2b, .Lbad_get_user_clac) - _ASM_EXTABLE_UA(3b, .Lbad_get_user_clac) + _ASM_EXTABLE(1b, .Lbad_get_user_clac) + _ASM_EXTABLE(2b, .Lbad_get_user_clac) + _ASM_EXTABLE(3b, .Lbad_get_user_clac) #ifdef CONFIG_X86_64 - _ASM_EXTABLE_UA(4b, .Lbad_get_user_clac) + _ASM_EXTABLE(4b, .Lbad_get_user_clac) #else - _ASM_EXTABLE_UA(4b, .Lbad_get_user_8_clac) - _ASM_EXTABLE_UA(5b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(4b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(5b, .Lbad_get_user_8_clac) #endif /* __get_user */ - _ASM_EXTABLE_UA(6b, .Lbad_get_user_clac) - _ASM_EXTABLE_UA(7b, .Lbad_get_user_clac) - _ASM_EXTABLE_UA(8b, .Lbad_get_user_clac) + _ASM_EXTABLE(6b, .Lbad_get_user_clac) + _ASM_EXTABLE(7b, .Lbad_get_user_clac) + _ASM_EXTABLE(8b, .Lbad_get_user_clac) #ifdef CONFIG_X86_64 - _ASM_EXTABLE_UA(9b, .Lbad_get_user_clac) + _ASM_EXTABLE(9b, .Lbad_get_user_clac) #else - _ASM_EXTABLE_UA(9b, .Lbad_get_user_8_clac) - _ASM_EXTABLE_UA(10b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(9b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(10b, .Lbad_get_user_8_clac) #endif diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index a64017602010..8f95fb267caa 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -11,13 +11,6 @@ .section .noinstr.text, "ax" /* - * We build a jump to memcpy_orig by default which gets NOPped out on - * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which - * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs - * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. - */ - -/* * memcpy - Copy a memory block. * * Input: @@ -27,17 +20,21 @@ * * Output: * rax original destination + * + * The FSRM alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep movsb' itself is small enough to replace the call, but the + * two register moves blow up the code. And one of them is "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. */ SYM_TYPED_FUNC_START(__memcpy) - ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memcpy_erms", X86_FEATURE_ERMS + ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM movq %rdi, %rax movq %rdx, %rcx - shrq $3, %rcx - andl $7, %edx - rep movsq - movl %edx, %ecx rep movsb RET SYM_FUNC_END(__memcpy) @@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy) SYM_FUNC_ALIAS(memcpy, __memcpy) EXPORT_SYMBOL(memcpy) -/* - * memcpy_erms() - enhanced fast string memcpy. This is faster and - * simpler than memcpy. Use memcpy_erms when possible. - */ -SYM_FUNC_START_LOCAL(memcpy_erms) - movq %rdi, %rax - movq %rdx, %rcx - rep movsb - RET -SYM_FUNC_END(memcpy_erms) - SYM_FUNC_START_LOCAL(memcpy_orig) movq %rdi, %rax diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6143b1a6fa2c..7c59a704c458 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -18,27 +18,22 @@ * rdx count (bytes) * * rax original destination + * + * The FSRS alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep stosb' itself is small enough to replace the call, but all + * the register moves blow up the code. And two of them are "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. */ SYM_FUNC_START(__memset) - /* - * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended - * to use it when possible. If not available, use fast string instructions. - * - * Otherwise, use original memset function. - */ - ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memset_erms", X86_FEATURE_ERMS + ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS movq %rdi,%r9 + movb %sil,%al movq %rdx,%rcx - andl $7,%edx - shrq $3,%rcx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - imulq %rsi,%rax - rep stosq - movl %edx,%ecx rep stosb movq %r9,%rax RET @@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset) SYM_FUNC_ALIAS(memset, __memset) EXPORT_SYMBOL(memset) -/* - * ISO C memset - set a memory block to a byte value. This function uses - * enhanced rep stosb to override the fast string function. - * The code is simpler and shorter than the fast string function as well. - * - * rdi destination - * rsi value (char) - * rdx count (bytes) - * - * rax original destination - */ -SYM_FUNC_START_LOCAL(memset_erms) - movq %rdi,%r9 - movb %sil,%al - movq %rdx,%rcx - rep stosb - movq %r9,%rax - RET -SYM_FUNC_END(memset_erms) - SYM_FUNC_START_LOCAL(memset_orig) movq %rdi,%r10 diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 32125224fcca..3062d09a776d 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -33,20 +33,20 @@ * as they get called from within inline assembly. */ -#ifdef CONFIG_X86_5LEVEL -#define LOAD_TASK_SIZE_MINUS_N(n) \ - ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \ - __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57 -#else -#define LOAD_TASK_SIZE_MINUS_N(n) \ - mov $(TASK_SIZE_MAX - (n)),%_ASM_BX -#endif +.macro check_range size:req +.if IS_ENABLED(CONFIG_X86_64) + mov %rcx, %rbx + sar $63, %rbx + or %rbx, %rcx +.else + cmp $TASK_SIZE_MAX-\size+1, %ecx + jae .Lbad_put_user +.endif +.endm .text SYM_FUNC_START(__put_user_1) - LOAD_TASK_SIZE_MINUS_N(0) - cmp %_ASM_BX,%_ASM_CX - jae .Lbad_put_user + check_range size=1 ASM_STAC 1: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -66,9 +66,7 @@ SYM_FUNC_END(__put_user_nocheck_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) - LOAD_TASK_SIZE_MINUS_N(1) - cmp %_ASM_BX,%_ASM_CX - jae .Lbad_put_user + check_range size=2 ASM_STAC 3: movw %ax,(%_ASM_CX) xor %ecx,%ecx @@ -88,9 +86,7 @@ SYM_FUNC_END(__put_user_nocheck_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) - LOAD_TASK_SIZE_MINUS_N(3) - cmp %_ASM_BX,%_ASM_CX - jae .Lbad_put_user + check_range size=4 ASM_STAC 5: movl %eax,(%_ASM_CX) xor %ecx,%ecx @@ -110,9 +106,7 @@ SYM_FUNC_END(__put_user_nocheck_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) - LOAD_TASK_SIZE_MINUS_N(7) - cmp %_ASM_BX,%_ASM_CX - jae .Lbad_put_user + check_range size=8 ASM_STAC 7: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 @@ -144,15 +138,15 @@ SYM_CODE_START_LOCAL(.Lbad_put_user_clac) RET SYM_CODE_END(.Lbad_put_user_clac) - _ASM_EXTABLE_UA(1b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(6b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(7b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(9b, .Lbad_put_user_clac) + _ASM_EXTABLE(1b, .Lbad_put_user_clac) + _ASM_EXTABLE(2b, .Lbad_put_user_clac) + _ASM_EXTABLE(3b, .Lbad_put_user_clac) + _ASM_EXTABLE(4b, .Lbad_put_user_clac) + _ASM_EXTABLE(5b, .Lbad_put_user_clac) + _ASM_EXTABLE(6b, .Lbad_put_user_clac) + _ASM_EXTABLE(7b, .Lbad_put_user_clac) + _ASM_EXTABLE(9b, .Lbad_put_user_clac) #ifdef CONFIG_X86_32 - _ASM_EXTABLE_UA(8b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(10b, .Lbad_put_user_clac) + _ASM_EXTABLE(8b, .Lbad_put_user_clac) + _ASM_EXTABLE(10b, .Lbad_put_user_clac) #endif diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 5f61c65322be..b3b1e376dce8 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -33,7 +33,7 @@ .align RETPOLINE_THUNK_SIZE SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR ALTERNATIVE_2 __stringify(RETPOLINE \reg), \ @@ -75,7 +75,7 @@ SYM_CODE_END(__x86_indirect_thunk_array) .align RETPOLINE_THUNK_SIZE SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT @@ -103,7 +103,7 @@ SYM_CODE_END(__x86_indirect_call_thunk_array) .align RETPOLINE_THUNK_SIZE SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR POLINE \reg ANNOTATE_UNRET_SAFE @@ -144,8 +144,8 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) */ .align 64 .skip 63, 0xcc -SYM_FUNC_START_NOALIGN(zen_untrain_ret); - +SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + ANNOTATE_NOENDBR /* * As executed from zen_untrain_ret, this is: * diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 6c1f8ac5e721..003d90138e20 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) { unsigned long flushed, dest = (unsigned long) dst; - long rc = __copy_user_nocache(dst, src, size, 0); + long rc; + + stac(); + rc = __copy_user_nocache(dst, src, size); + clac(); /* * __copy_user_nocache() uses non-temporal stores for the bulk @@ -136,13 +140,4 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size) } } EXPORT_SYMBOL_GPL(__memcpy_flushcache); - -void memcpy_page_flushcache(char *to, struct page *page, size_t offset, - size_t len) -{ - char *from = kmap_atomic(page); - - memcpy_flushcache(to, from + offset, len); - kunmap_atomic(from); -} #endif diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 092ea436c7e6..b43301cb2a80 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -71,6 +71,5 @@ static void __exit pt_dump_debug_exit(void) module_init(pt_dump_debug_init); module_exit(pt_dump_debug_exit); -MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 60814e110a54..271dcb2deabc 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -130,10 +130,36 @@ static bool ex_handler_fprestore(const struct exception_table_entry *fixup, return true; } +/* + * On x86-64, we end up being imprecise with 'access_ok()', and allow + * non-canonical user addresses to make the range comparisons simpler, + * and to not have to worry about LAM being enabled. + * + * In fact, we allow up to one page of "slop" at the sign boundary, + * which means that we can do access_ok() by just checking the sign + * of the pointer for the common case of having a small access size. + */ +static bool gp_fault_address_ok(unsigned long fault_address) +{ +#ifdef CONFIG_X86_64 + /* Is it in the "user space" part of the non-canonical space? */ + if (valid_user_address(fault_address)) + return true; + + /* .. or just above it? */ + fault_address -= PAGE_SIZE; + if (valid_user_address(fault_address)) + return true; +#endif + return false; +} + static bool ex_handler_uaccess(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) + struct pt_regs *regs, int trapnr, + unsigned long fault_address) { - WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); + WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address), + "General protection fault in user access. Non-canonical address?"); return ex_handler_default(fixup, regs); } @@ -189,10 +215,12 @@ static bool ex_handler_imm_reg(const struct exception_table_entry *fixup, } static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, int reg, int imm) + struct pt_regs *regs, int trapnr, + unsigned long fault_address, + int reg, int imm) { regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg); - return ex_handler_uaccess(fixup, regs, trapnr); + return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } int ex_get_fixup_type(unsigned long ip) @@ -238,7 +266,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, case EX_TYPE_FAULT_MCE_SAFE: return ex_handler_fault(e, regs, trapnr); case EX_TYPE_UACCESS: - return ex_handler_uaccess(e, regs, trapnr); + return ex_handler_uaccess(e, regs, trapnr, fault_addr); case EX_TYPE_COPY: return ex_handler_copy(e, regs, trapnr); case EX_TYPE_CLEAR_FS: @@ -269,7 +297,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, case EX_TYPE_FAULT_SGX: return ex_handler_sgx(e, regs, trapnr); case EX_TYPE_UCOPY_LEN: - return ex_handler_ucopy_len(e, regs, trapnr, reg, imm); + return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a498ae1fbe66..e4399983c50c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -19,6 +19,7 @@ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> +#include <linux/mm.h> /* find_and_lock_vma() */ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ @@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif +#ifdef CONFIG_PER_VMA_LOCK + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + + if (unlikely(access_error(error_code, vma))) { + vma_end_read(vma); + goto lock_mmap; + } + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + vma_end_read(vma); + + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); + return; + } +lock_mmap: +#endif /* CONFIG_PER_VMA_LOCK */ + /* * Kernel-mode access to the user address space should only occur * on well-defined single instructions listed in the exception @@ -1433,6 +1466,9 @@ good_area: } mmap_read_unlock(mm); +#ifdef CONFIG_PER_VMA_LOCK +done: +#endif if (likely(!(fault & VM_FAULT_ERROR))) return; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cb258f58fdc8..3cdac0f0055d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -806,7 +806,7 @@ void __init poking_init(void) BUG_ON(!poking_mm); /* Xen PV guests need the PGD to be pinned. */ - paravirt_arch_dup_mmap(NULL, poking_mm); + paravirt_enter_mmap(poking_mm); /* * Randomize the poking address, but make sure that the following page @@ -1048,6 +1048,11 @@ __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; +#ifdef CONFIG_ADDRESS_MASKING +DEFINE_PER_CPU(u64, tlbstate_untag_mask); +EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask); +#endif + void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { /* entry 0 MUST be WB (hardwired to speed up translations) */ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6453fbaedb08..aa7d279321ea 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -116,6 +116,11 @@ static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *des if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; + if (x86_platform.hyper.is_private_mmio(addr)) { + desc->flags |= IORES_MAP_ENCRYPTED; + return; + } + if (!IS_ENABLED(CONFIG_EFI)) return; diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 9c4d8dbcb129..e0b51c09109f 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -513,10 +513,14 @@ void __init mem_encrypt_free_decrypted_mem(void) npages = (vaddr_end - vaddr) >> PAGE_SHIFT; /* - * The unused memory range was mapped decrypted, change the encryption - * attribute from decrypted to encrypted before freeing it. + * If the unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. Base the + * re-encryption on the same condition used for the decryption in + * sme_postprocess_startup(). Higher level abstractions, such as + * CC_ATTR_MEM_ENCRYPT, aren't necessarily equivalent in a Hyper-V VM + * using vTOM, where sme_me_mask is always zero. */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { + if (sme_me_mask) { r = set_memory_encrypted(vaddr, npages); if (r) { pr_warn("failed to free unused decrypted pages\n"); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 46a00aa858b6..de10800cd4dd 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1073,11 +1073,15 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, } /* - * untrack_pfn_moved is called, while mremapping a pfnmap for a new region, - * with the old vma after its pfnmap page table has been removed. The new - * vma has a new pfnmap to the same pfn & cache type with VM_PAT set. + * untrack_pfn_clear is called if the following situation fits: + * + * 1) while mremapping a pfnmap for a new region, with the old vma after + * its pfnmap page table has been removed. The new vma has a new pfnmap + * to the same pfn & cache type with VM_PAT set. + * 2) while duplicating vm area, the new vma fails to copy the pgtable from + * old vma. */ -void untrack_pfn_moved(struct vm_area_struct *vma) +void untrack_pfn_clear(struct vm_area_struct *vma) { vm_flags_clear(vma, VM_PAT); } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 356758b7d4b4..7159cf787613 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -234,7 +234,7 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end) * take full advantage of the the limited (s32) immediate addressing range (2G) * of x86_64. * - * See Documentation/x86/x86_64/mm.rst for more detail. + * See Documentation/arch/x86/x86_64/mm.rst for more detail. */ static inline unsigned long highmap_start_pfn(void) @@ -2175,9 +2175,6 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { - if (hv_is_isolation_supported()) - return hv_set_mem_host_visibility(addr, numpages, !enc); - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return __set_memory_enc_pgtable(addr, numpages, enc); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 92d73ccede70..267acf27480a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -154,26 +154,30 @@ static inline u16 user_pcid(u16 asid) return ret; } -static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam) { + unsigned long cr3 = __sme_pa(pgd) | lam; + if (static_cpu_has(X86_FEATURE_PCID)) { - return __sme_pa(pgd) | kern_pcid(asid); + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + cr3 |= kern_pcid(asid); } else { VM_WARN_ON_ONCE(asid != 0); - return __sme_pa(pgd); } + + return cr3; } -static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid, + unsigned long lam) { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); /* * Use boot_cpu_has() instead of this_cpu_has() as this function * might be called during early boot. This should work even after * boot because all CPU's the have same capabilities: */ VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); - return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; + return build_cr3(pgd, asid, lam) | CR3_NOFLUSH; } /* @@ -274,15 +278,16 @@ static inline void invalidate_user_asid(u16 asid) (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); } -static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, + bool need_flush) { unsigned long new_mm_cr3; if (need_flush) { invalidate_user_asid(new_asid); - new_mm_cr3 = build_cr3(pgdir, new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid, lam); } else { - new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam); } /* @@ -491,6 +496,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; @@ -520,7 +526,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, + tlbstate_lam_cr3_mask()))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -552,10 +559,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * instruction. */ if (real_prev == next) { + /* Not actually switching mm's */ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); /* + * If this races with another thread that enables lam, 'new_lam' + * might not match tlbstate_lam_cr3_mask(). + */ + + /* * Even in lazy TLB mode, the CPU should stay set in the * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. @@ -622,15 +635,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, barrier(); } + set_tlbstate_lam_mode(next); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); + load_new_mm_cr3(next->pgd, new_asid, new_lam, true); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); + load_new_mm_cr3(next->pgd, new_asid, new_lam, false); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -691,6 +705,10 @@ void initialize_tlbstate_and_flush(void) /* Assert that CR3 already references the right mm. */ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + /* LAM expected to be disabled */ + WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); + WARN_ON(mm_lam_cr3_mask(mm)); + /* * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization * doesn't work like other CR4 bits because it can only be set from @@ -699,8 +717,8 @@ void initialize_tlbstate_and_flush(void) WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && !(cr4_read_shadow() & X86_CR4_PCIDE)); - /* Force ASID 0 and force a TLB flush. */ - write_cr3(build_cr3(mm->pgd, 0)); + /* Disable LAM, force ASID 0 and force a TLB flush. */ + write_cr3(build_cr3(mm->pgd, 0, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); @@ -708,6 +726,7 @@ void initialize_tlbstate_and_flush(void) this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + set_tlbstate_lam_mode(mm); for (i = 1; i < TLB_NR_DYN_ASIDS; i++) this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); @@ -925,7 +944,7 @@ void flush_tlb_multi(const struct cpumask *cpumask, } /* - * See Documentation/x86/tlb.rst for details. We choose 33 + * See Documentation/arch/x86/tlb.rst for details. We choose 33 * because it is large enough to cover the vast majority (at * least 95%) of allocations, and is small enough that we are * confident it will not cause too much overhead. Each single @@ -1071,8 +1090,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) */ unsigned long __get_current_cr3_fast(void) { - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, - this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + unsigned long cr3 = + build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid), + tlbstate_lam_cr3_mask()); /* For now, be very restrictive about when this can be called. */ VM_WARN_ON(in_nmi() || preemptible()); diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 615a76d70019..e3ec02e6ac9f 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -7,6 +7,7 @@ #include <linux/dmi.h> #include <linux/pci.h> #include <linux/vgaarb.h> +#include <asm/amd_nb.h> #include <asm/hpet.h> #include <asm/pci_x86.h> @@ -824,3 +825,82 @@ static void rs690_fix_64bit_dma(struct pci_dev *pdev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma); #endif + +#ifdef CONFIG_AMD_NB + +#define AMD_15B8_RCC_DEV2_EPF0_STRAP2 0x10136008 +#define AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK 0x00000080L + +static void quirk_clear_strap_no_soft_reset_dev2_f0(struct pci_dev *dev) +{ + u32 data; + + if (!amd_smn_read(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, &data)) { + data &= ~AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK; + if (amd_smn_write(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, data)) + pci_err(dev, "Failed to write data 0x%x\n", data); + } else { + pci_err(dev, "Failed to read data\n"); + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b8, quirk_clear_strap_no_soft_reset_dev2_f0); +#endif + +/* + * When returning from D3cold to D0, firmware on some Google Coral and Reef + * family Chromebooks with Intel Apollo Lake SoC clobbers the headers of + * both the L1 PM Substates capability and the previous capability for the + * "Celeron N3350/Pentium N4200/Atom E3900 Series PCI Express Port B #1". + * + * Save those values at enumeration-time and restore them at resume. + */ + +static u16 prev_cap, l1ss_cap; +static u32 prev_header, l1ss_header; + +static void chromeos_save_apl_pci_l1ss_capability(struct pci_dev *dev) +{ + int pos = PCI_CFG_SPACE_SIZE, prev = 0; + u32 header, pheader = 0; + + while (pos) { + pci_read_config_dword(dev, pos, &header); + if (PCI_EXT_CAP_ID(header) == PCI_EXT_CAP_ID_L1SS) { + prev_cap = prev; + prev_header = pheader; + l1ss_cap = pos; + l1ss_header = header; + return; + } + + prev = pos; + pheader = header; + pos = PCI_EXT_CAP_NEXT(header); + } +} + +static void chromeos_fixup_apl_pci_l1ss_capability(struct pci_dev *dev) +{ + u32 header; + + if (!prev_cap || !prev_header || !l1ss_cap || !l1ss_header) + return; + + /* Fixup the header of L1SS Capability if missing */ + pci_read_config_dword(dev, l1ss_cap, &header); + if (header != l1ss_header) { + pci_write_config_dword(dev, l1ss_cap, l1ss_header); + pci_info(dev, "restore L1SS Capability header (was %#010x now %#010x)\n", + header, l1ss_header); + } + + /* Fixup the link to L1SS Capability if missing */ + pci_read_config_dword(dev, prev_cap, &header); + if (header != prev_header) { + pci_write_config_dword(dev, prev_cap, prev_header); + pci_info(dev, "restore previous Capability header (was %#010x now %#010x)\n", + header, prev_header); + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_save_apl_pci_l1ss_capability); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_fixup_apl_pci_l1ss_capability); diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index ed0442e35434..00a92cb2c814 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -86,7 +86,7 @@ static void __init init_pvh_bootparams(bool xen_guest) } /* - * See Documentation/x86/boot.rst. + * See Documentation/arch/x86/boot.rst. * * Version 2.12 supports Xen entry point but we will use default x86/PC * environment (i.e. hardware_subarch 0). diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 7fe564eaf228..c4365a05ab83 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -50,7 +50,7 @@ #define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8) SYM_CODE_START_LOCAL(pvh_start_xen) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK cld lgdt (_pa(gdt)) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 236447ee9beb..7a4d5e911415 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -288,7 +288,7 @@ EXPORT_SYMBOL(restore_processor_state); #endif #if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU) -static void resume_play_dead(void) +static void __noreturn resume_play_dead(void) { play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 17f09dc26381..82fec66d46d2 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -69,8 +69,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_string.o += $(PURGATORY_CFLAGS) -AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2 +asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x)) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 2925074b9a58..d30949e25ebd 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -406,7 +406,7 @@ static void read_ehdr(FILE *fp) if (ehdr.e_version != EV_CURRENT) die("Unknown ELF version\n"); if (ehdr.e_ehsize != sizeof(Elf_Ehdr)) - die("Bad Elf header size\n"); + die("Bad ELF header size\n"); if (ehdr.e_phentsize != sizeof(Elf_Phdr)) die("Bad program header entry\n"); if (ehdr.e_shentsize != sizeof(Elf_Shdr)) diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index 4c6c2be0c899..38fa894b65d0 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -89,19 +89,19 @@ static inline void remap_stack_and_trap(void) "addl %4,%%ebx ; movl %%eax, (%%ebx) ;" "int $3" : : - "g" (~(UM_KERN_PAGE_SIZE - 1)), + "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)), "g" (STUB_MMAP_NR), "g" (UML_STUB_FIELD_FD), "g" (UML_STUB_FIELD_OFFSET), "g" (UML_STUB_FIELD_CHILD_ERR), - "c" (UM_KERN_PAGE_SIZE), + "c" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE), "d" (PROT_READ | PROT_WRITE), "S" (MAP_FIXED | MAP_SHARED) : "memory"); } -static __always_inline void *get_stub_page(void) +static __always_inline void *get_stub_data(void) { unsigned long ret; @@ -109,7 +109,7 @@ static __always_inline void *get_stub_page(void) "movl %%esp,%0 ;" "andl %1,%0" : "=a" (ret) - : "g" (~(UM_KERN_PAGE_SIZE - 1))); + : "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1))); return (void *)ret; } diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 92ea1670cf1c..2de1c8f88173 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -98,18 +98,18 @@ static inline void remap_stack_and_trap(void) "int3" : : "g" (STUB_MMAP_NR), - "g" (~(UM_KERN_PAGE_SIZE - 1)), + "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)), "g" (MAP_FIXED | MAP_SHARED), "g" (UML_STUB_FIELD_FD), "g" (UML_STUB_FIELD_OFFSET), "g" (UML_STUB_FIELD_CHILD_ERR), - "S" (UM_KERN_PAGE_SIZE), + "S" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE), "d" (PROT_READ | PROT_WRITE) : __syscall_clobber, "r10", "r8", "r9"); } -static __always_inline void *get_stub_page(void) +static __always_inline void *get_stub_data(void) { unsigned long ret; @@ -117,7 +117,7 @@ static __always_inline void *get_stub_page(void) "movq %%rsp,%0 ;" "andq %1,%0" : "=a" (ret) - : "g" (~(UM_KERN_PAGE_SIZE - 1))); + : "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1))); return (void *)ret; } diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c index f7eefba034f9..040668b989b5 100644 --- a/arch/x86/um/stub_segv.c +++ b/arch/x86/um/stub_segv.c @@ -11,7 +11,7 @@ void __attribute__ ((__section__ (".__syscall_stub"))) stub_segv_handler(int sig, siginfo_t *info, void *p) { - struct faultinfo *f = get_stub_page(); + struct faultinfo *f = get_stub_data(); ucontext_t *uc = p; GET_FAULTINFO_FROM_MC(*f, &uc->uc_mcontext); diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index 9fd24846d094..f41a17ebac48 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -6,35 +6,37 @@ * for more details. * */ + #include <linux/fb.h> -#include <linux/pci.h> #include <linux/module.h> +#include <linux/pci.h> #include <linux/vgaarb.h> +void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off) +{ + unsigned long prot; + + prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK; + if (boot_cpu_data.x86 > 3) + pgprot_val(vma->vm_page_prot) = + prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); +} +EXPORT_SYMBOL(fb_pgprotect); + int fb_is_primary_device(struct fb_info *info) { struct device *device = info->device; - struct pci_dev *default_device = vga_default_device(); struct pci_dev *pci_dev; - struct resource *res; if (!device || !dev_is_pci(device)) return 0; pci_dev = to_pci_dev(device); - if (default_device) { - if (pci_dev == default_device) - return 1; - return 0; - } - - res = pci_dev->resource + PCI_ROM_RESOURCE; - - if (res->flags & IORESOURCE_ROM_SHADOW) + if (pci_dev == vga_default_device()) return 1; - return 0; } EXPORT_SYMBOL(fb_is_primary_device); + MODULE_LICENSE("GPL"); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index ee29fb558f2e..b3b8d289b9ab 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -885,14 +885,7 @@ void xen_mm_unpin_all(void) spin_unlock(&pgd_lock); } -static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) -{ - spin_lock(&next->page_table_lock); - xen_pgd_pin(next); - spin_unlock(&next->page_table_lock); -} - -static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static void xen_enter_mmap(struct mm_struct *mm) { spin_lock(&mm->page_table_lock); xen_pgd_pin(mm); @@ -2153,8 +2146,7 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), #endif - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, + .enter_mmap = xen_enter_mmap, .exit_mmap = xen_exit_mmap, .lazy_mode = { diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 4a184f6e4e4d..08f1ceb9eb81 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -165,7 +165,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback SYM_CODE_START(xen_early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ENDBR pop %rcx pop %r11 @@ -193,7 +193,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 * rsp->rax } */ SYM_CODE_START(xen_iret) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR pushq $0 jmp hypercall_iret diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index e36ea4268bd2..643d02900fbb 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -45,11 +45,11 @@ SYM_CODE_END(hypercall_page) #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR cld - mov initial_stack(%rip), %rsp + leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp /* Set up %gs. * @@ -71,7 +71,7 @@ SYM_CODE_END(startup_xen) #ifdef CONFIG_XEN_PV_SMP .pushsection .text SYM_CODE_START(asm_cpu_bringup_and_idle) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ENDBR call cpu_bringup_and_idle |