diff options
Diffstat (limited to 'arch/x86')
33 files changed, 977 insertions, 126 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5873c9e39919..e21cca404943 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2711,6 +2711,18 @@ config MITIGATION_SSB of speculative execution in a similar way to the Meltdown and Spectre security vulnerabilities. +config MITIGATION_ITS + bool "Enable Indirect Target Selection mitigation" + depends on CPU_SUP_INTEL && X86_64 + depends on MITIGATION_RETPOLINE && MITIGATION_RETHUNK + select EXECMEM + default y + help + Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in + BPU on some Intel CPUs that may allow Spectre V2 style attacks. If + disabled, mitigation cannot be enabled via cmdline. + See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst> + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f40bdf97d390..ed04a968cc7d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1525,7 +1525,9 @@ SYM_CODE_END(rewind_stack_and_make_dead) * ORC to unwind properly. * * The alignment is for performance and not for safety, and may be safely - * refactored in the future if needed. + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. */ SYM_FUNC_START(clear_bhb_loop) ANNOTATE_NOENDBR @@ -1536,10 +1538,22 @@ SYM_FUNC_START(clear_bhb_loop) call 1f jmp 5f .align 64, 0xcc + /* + * Shift instructions so that the RET is in the upper half of the + * cacheline and don't take the slowpath to its_return_thunk. + */ + .skip 32 - (.Lret1 - 1f), 0xcc ANNOTATE_INTRA_FUNCTION_CALL 1: call 2f - RET +.Lret1: RET .align 64, 0xcc + /* + * As above shift instructions for RET at .Lret2 as well. + * + * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc 2: movl $5, %eax 3: jmp 4f nop @@ -1547,7 +1561,7 @@ SYM_FUNC_START(clear_bhb_loop) jnz 3b sub $1, %ecx jnz 1b - RET +.Lret2: RET 5: lfence pop %rbp RET diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4a37a8bd87fd..f2294784babc 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -6,6 +6,7 @@ #include <linux/stringify.h> #include <linux/objtool.h> #include <asm/asm.h> +#include <asm/bug.h> #define ALT_FLAGS_SHIFT 16 @@ -124,6 +125,37 @@ static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, } #endif +#ifdef CONFIG_MITIGATION_ITS +extern void its_init_mod(struct module *mod); +extern void its_fini_mod(struct module *mod); +extern void its_free_mod(struct module *mod); +extern u8 *its_static_thunk(int reg); +#else /* CONFIG_MITIGATION_ITS */ +static inline void its_init_mod(struct module *mod) { } +static inline void its_fini_mod(struct module *mod) { } +static inline void its_free_mod(struct module *mod) { } +static inline u8 *its_static_thunk(int reg) +{ + WARN_ONCE(1, "ITS not compiled in"); + + return NULL; +} +#endif + +#if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL) +extern bool cpu_wants_rethunk(void); +extern bool cpu_wants_rethunk_at(void *addr); +#else +static __always_inline bool cpu_wants_rethunk(void) +{ + return false; +} +static __always_inline bool cpu_wants_rethunk_at(void *addr) +{ + return false; +} +#endif + #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6c2c152d8a67..39e61212ac9a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -481,6 +481,7 @@ #define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ #define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32 + 7) /* Workload Classification */ #define X86_FEATURE_PREFER_YMM (21*32 + 8) /* Avoid ZMM registers due to downclocking */ +#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32 + 9) /* Use thunk for indirect branches in lower half of cacheline */ /* * BUG word(s) @@ -533,4 +534,6 @@ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ #define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ #define X86_BUG_SPECTRE_V2_USER X86_BUG(1*32 + 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ +#define X86_BUG_ITS X86_BUG(1*32 + 6) /* "its" CPU is affected by Indirect Target Selection */ +#define X86_BUG_ITS_NATIVE_ONLY X86_BUG(1*32 + 7) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 695e569159c1..be7cddc414e4 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -17,10 +17,12 @@ struct ucode_cpu_info { void load_ucode_bsp(void); void load_ucode_ap(void); void microcode_bsp_resume(void); +bool __init microcode_loader_disabled(void); #else static inline void load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void microcode_bsp_resume(void) { } +static inline bool __init microcode_loader_disabled(void) { return false; } #endif extern unsigned long initrd_start_early; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e6134ef2263d..e7d2f460fcc6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -211,6 +211,14 @@ * VERW clears CPU Register * File. */ +#define ARCH_CAP_ITS_NO BIT_ULL(62) /* + * Not susceptible to + * Indirect Target Selection. + * This bit is not set by + * HW, but is synthesized by + * VMMs for guests to know + * their affected status. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 5c43f145454d..7d04ade33541 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -336,10 +336,14 @@ #else /* __ASSEMBLER__ */ +#define ITS_THUNK_SIZE 64 + typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; +typedef u8 its_thunk_t[ITS_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; +extern its_thunk_t __x86_indirect_its_thunk_array[]; #ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); @@ -363,6 +367,12 @@ static inline void srso_return_thunk(void) {} static inline void srso_alias_return_thunk(void) {} #endif +#ifdef CONFIG_MITIGATION_ITS +extern void its_return_thunk(void); +#else +static inline void its_return_thunk(void) {} +#endif + extern void retbleed_return_thunk(void); extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bf82c6f7d690..48fd04e90114 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -18,6 +18,7 @@ #include <linux/mmu_context.h> #include <linux/bsearch.h> #include <linux/sync_core.h> +#include <linux/execmem.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -31,6 +32,8 @@ #include <asm/paravirt.h> #include <asm/asm-prototypes.h> #include <asm/cfi.h> +#include <asm/ibt.h> +#include <asm/set_memory.h> int __read_mostly alternatives_patched; @@ -124,6 +127,165 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = #endif }; +#ifdef CONFIG_FINEIBT +static bool cfi_paranoid __ro_after_init; +#endif + +#ifdef CONFIG_MITIGATION_ITS + +static struct module *its_mod; +static void *its_page; +static unsigned int its_offset; + +/* Initialize a thunk with the "jmp *reg; int3" instructions. */ +static void *its_init_thunk(void *thunk, int reg) +{ + u8 *bytes = thunk; + int offset = 0; + int i = 0; + +#ifdef CONFIG_FINEIBT + if (cfi_paranoid) { + /* + * When ITS uses indirect branch thunk the fineibt_paranoid + * caller sequence doesn't fit in the caller site. So put the + * remaining part of the sequence (<ea> + JNE) into the ITS + * thunk. + */ + bytes[i++] = 0xea; /* invalid instruction */ + bytes[i++] = 0x75; /* JNE */ + bytes[i++] = 0xfd; + + offset = 1; + } +#endif + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + bytes[i++] = 0xff; + bytes[i++] = 0xe0 + reg; /* jmp *reg */ + bytes[i++] = 0xcc; + + return thunk + offset; +} + +void its_init_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + mutex_lock(&text_mutex); + its_mod = mod; + its_page = NULL; +} + +void its_fini_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + WARN_ON_ONCE(its_mod != mod); + + its_mod = NULL; + its_page = NULL; + mutex_unlock(&text_mutex); + + for (int i = 0; i < mod->its_num_pages; i++) { + void *page = mod->its_page_array[i]; + execmem_restore_rox(page, PAGE_SIZE); + } +} + +void its_free_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + for (int i = 0; i < mod->its_num_pages; i++) { + void *page = mod->its_page_array[i]; + execmem_free(page); + } + kfree(mod->its_page_array); +} + +static void *its_alloc(void) +{ + void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + + if (!page) + return NULL; + + if (its_mod) { + void *tmp = krealloc(its_mod->its_page_array, + (its_mod->its_num_pages+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + its_mod->its_page_array = tmp; + its_mod->its_page_array[its_mod->its_num_pages++] = page; + + execmem_make_temp_rw(page, PAGE_SIZE); + } + + return no_free_ptr(page); +} + +static void *its_allocate_thunk(int reg) +{ + int size = 3 + (reg / 8); + void *thunk; + +#ifdef CONFIG_FINEIBT + /* + * The ITS thunk contains an indirect jump and an int3 instruction so + * its size is 3 or 4 bytes depending on the register used. If CFI + * paranoid is used then 3 extra bytes are added in the ITS thunk to + * complete the fineibt_paranoid caller sequence. + */ + if (cfi_paranoid) + size += 3; +#endif + + if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { + its_page = its_alloc(); + if (!its_page) { + pr_err("ITS page allocation failed\n"); + return NULL; + } + memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE); + its_offset = 32; + } + + /* + * If the indirect branch instruction will be in the lower half + * of a cacheline, then update the offset to reach the upper half. + */ + if ((its_offset + size - 1) % 64 < 32) + its_offset = ((its_offset - 1) | 0x3F) + 33; + + thunk = its_page + its_offset; + its_offset += size; + + return its_init_thunk(thunk, reg); +} + +u8 *its_static_thunk(int reg) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + +#ifdef CONFIG_FINEIBT + /* Paranoid thunk starts 2 bytes before */ + if (cfi_paranoid) + return thunk - 2; +#endif + return thunk; +} + +#endif + /* * Nomenclature for variable names to simplify and clarify this code and ease * any potential staring at it: @@ -581,7 +743,8 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } -static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, + void *call_dest, void *jmp_dest) { u8 op = insn->opcode.bytes[0]; int i = 0; @@ -602,7 +765,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 switch (op) { case CALL_INSN_OPCODE: __text_gen_insn(bytes+i, op, addr+i, - __x86_indirect_call_thunk_array[reg], + call_dest, CALL_INSN_SIZE); i += CALL_INSN_SIZE; break; @@ -610,7 +773,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 case JMP32_INSN_OPCODE: clang_jcc: __text_gen_insn(bytes+i, op, addr+i, - __x86_indirect_jump_thunk_array[reg], + jmp_dest, JMP32_INSN_SIZE); i += JMP32_INSN_SIZE; break; @@ -625,6 +788,48 @@ clang_jcc: return i; } +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + return __emit_trampoline(addr, insn, bytes, + __x86_indirect_call_thunk_array[reg], + __x86_indirect_jump_thunk_array[reg]); +} + +#ifdef CONFIG_MITIGATION_ITS +static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + u8 *tmp = its_allocate_thunk(reg); + + if (tmp) + thunk = tmp; + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + +/* Check if an indirect branch is at ITS-unsafe address */ +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return false; + + /* Indirect branch opcode is 2 or 3 bytes depending on reg */ + addr += 1 + reg / 8; + + /* Lower-half of the cacheline? */ + return !(addr & 0x20); +} +#else /* CONFIG_MITIGATION_ITS */ + +#ifdef CONFIG_FINEIBT +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + return false; +} +#endif + +#endif /* CONFIG_MITIGATION_ITS */ + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -699,6 +904,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) bytes[i++] = 0xe8; /* LFENCE */ } +#ifdef CONFIG_MITIGATION_ITS + /* + * Check if the address of last byte of emitted-indirect is in + * lower-half of the cacheline. Such branches need ITS mitigation. + */ + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) + return emit_its_trampoline(addr, insn, reg, bytes); +#endif + ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; @@ -732,6 +946,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) int len, ret; u8 bytes[16]; u8 op1, op2; + u8 *dest; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) @@ -748,6 +963,12 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: + /* Check for cfi_paranoid + ITS */ + dest = addr + insn.length + insn.immediate.value; + if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) { + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + } break; case 0x0f: /* escape */ @@ -775,6 +996,21 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) #ifdef CONFIG_MITIGATION_RETHUNK +bool cpu_wants_rethunk(void) +{ + return cpu_feature_enabled(X86_FEATURE_RETHUNK); +} + +bool cpu_wants_rethunk_at(void *addr) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return false; + if (x86_return_thunk != its_return_thunk) + return true; + + return !((unsigned long)addr & 0x20); +} + /* * Rewrite the compiler generated return thunk tail-calls. * @@ -791,7 +1027,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) int i = 0; /* Patch the custom return thunks... */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk_at(addr)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { @@ -808,7 +1044,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk()) static_call_force_reinit(); for (s = start; s < end; s++) { @@ -1022,8 +1258,6 @@ int cfi_get_func_arity(void *func) static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; -static bool cfi_paranoid __ro_after_init = false; - /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. @@ -1436,6 +1670,19 @@ static int cfi_rand_callers(s32 *start, s32 *end) return 0; } +static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; + +#ifdef CONFIG_MITIGATION_ITS + u8 *tmp = its_allocate_thunk(reg); + if (tmp) + thunk = tmp; +#endif + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; @@ -1477,9 +1724,14 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); memcpy(bytes + fineibt_caller_hash, &hash, 4); - ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); - if (WARN_ON_ONCE(ret != 3)) - continue; + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { + emit_paranoid_trampoline(addr + fineibt_caller_size, + &insn, 11, bytes + fineibt_caller_size); + } else { + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + } text_poke_early(addr, bytes, fineibt_paranoid_size); } @@ -1706,29 +1958,66 @@ Efault: return false; } +static bool is_paranoid_thunk(unsigned long addr) +{ + u32 thunk; + + __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); + return (thunk & 0x00FFFFFF) == 0xfd75ea; + +Efault: + return false; +} + /* * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] - * sequence. + * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS + * thunk. */ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_paranoid_ud; - u32 hash; - if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) + if (!cfi_paranoid) return false; - __get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault); - *target = regs->r11 + fineibt_preamble_size; - *type = regs->r10; + if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. + */ + return true; + } /* - * Since the trapping instruction is the exact, but LOCK prefixed, - * Jcc.d8 that got us here, the normal fixup will work. + * The cfi_paranoid + ITS thunk combination results in: + * + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 + * + * Where the paranoid_thunk looks like: + * + * 1d: <ea> (bad) + * __x86_indirect_paranoid_thunk_r11: + * 1e: 75 fd jne 1d + * __x86_indirect_its_thunk_r11: + * 20: 41 ff eb jmp *%r11 + * 23: cc int3 + * */ - return true; + if (is_paranoid_thunk(regs->ip)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + regs->ip = *target; + return true; + } -Efault: return false; } @@ -2031,6 +2320,8 @@ static noinline void __init alt_reloc_selftest(void) void __init alternative_instructions(void) { + u64 ibt; + int3_selftest(); /* @@ -2057,6 +2348,9 @@ void __init alternative_instructions(void) */ paravirt_set_cap(); + /* Keep CET-IBT disabled until caller/callee are patched */ + ibt = ibt_save(/*disable*/ true); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); @@ -2080,6 +2374,8 @@ void __init alternative_instructions(void) */ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + ibt_restore(ibt); + #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 362602b705cc..8596ce85026c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -49,6 +49,7 @@ static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); static void __init gds_select_mitigation(void); +static void __init its_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -66,6 +67,14 @@ static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; +static void __init set_return_thunk(void *thunk) +{ + if (x86_return_thunk != __x86_return_thunk) + pr_warn("x86/bugs: return thunk changed\n"); + + x86_return_thunk = thunk; +} + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { @@ -178,6 +187,7 @@ void __init cpu_select_mitigations(void) */ srso_select_mitigation(); gds_select_mitigation(); + its_select_mitigation(); } /* @@ -1118,7 +1128,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - x86_return_thunk = retbleed_return_thunk; + set_return_thunk(retbleed_return_thunk); if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1153,7 +1163,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_return_thunk = call_depth_return_thunk; + set_return_thunk(call_depth_return_thunk); break; default: @@ -1188,6 +1198,145 @@ do_cmd_auto: } #undef pr_fmt +#define pr_fmt(fmt) "ITS: " fmt + +enum its_mitigation_cmd { + ITS_CMD_OFF, + ITS_CMD_ON, + ITS_CMD_VMEXIT, + ITS_CMD_RSB_STUFF, +}; + +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_VMEXIT_ONLY, + ITS_MITIGATION_ALIGNED_THUNKS, + ITS_MITIGATION_RETPOLINE_STUFF, +}; + +static const char * const its_strings[] = { + [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected", + [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", + [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", +}; + +static enum its_mitigation its_mitigation __ro_after_init = ITS_MITIGATION_ALIGNED_THUNKS; + +static enum its_mitigation_cmd its_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_CMD_ON : ITS_CMD_OFF; + +static int __init its_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) { + pr_err("Mitigation disabled at compile time, ignoring option (%s)", str); + return 0; + } + + if (!strcmp(str, "off")) { + its_cmd = ITS_CMD_OFF; + } else if (!strcmp(str, "on")) { + its_cmd = ITS_CMD_ON; + } else if (!strcmp(str, "force")) { + its_cmd = ITS_CMD_ON; + setup_force_cpu_bug(X86_BUG_ITS); + } else if (!strcmp(str, "vmexit")) { + its_cmd = ITS_CMD_VMEXIT; + } else if (!strcmp(str, "stuff")) { + its_cmd = ITS_CMD_RSB_STUFF; + } else { + pr_err("Ignoring unknown indirect_target_selection option (%s).", str); + } + + return 0; +} +early_param("indirect_target_selection", its_parse_cmdline); + +static void __init its_select_mitigation(void) +{ + enum its_mitigation_cmd cmd = its_cmd; + + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + /* Retpoline+CDT mitigates ITS, bail out */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + boot_cpu_has(X86_FEATURE_CALL_DEPTH)) { + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + goto out; + } + + /* Exit early to avoid irrelevant warnings */ + if (cmd == ITS_CMD_OFF) { + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (spectre_v2_enabled == SPECTRE_V2_NONE) { + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || + !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) { + pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { + pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + + if (cmd == ITS_CMD_RSB_STUFF && + (!boot_cpu_has(X86_FEATURE_RETPOLINE) || !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))) { + pr_err("RSB stuff mitigation not supported, using default\n"); + cmd = ITS_CMD_ON; + } + + switch (cmd) { + case ITS_CMD_OFF: + its_mitigation = ITS_MITIGATION_OFF; + break; + case ITS_CMD_VMEXIT: + if (boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) { + its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; + goto out; + } + fallthrough; + case ITS_CMD_ON: + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); + break; + case ITS_CMD_RSB_STUFF: + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + set_return_thunk(call_depth_return_thunk); + if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) { + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + pr_info("Retbleed mitigation updated to stuffing\n"); + } + break; + } +out: + pr_info("%s\n", its_strings[its_mitigation]); +} + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = @@ -1697,11 +1846,11 @@ static void __init bhi_select_mitigation(void) return; } - /* Mitigate in hardware if supported */ - if (spec_ctrl_bhi_dis()) + if (!IS_ENABLED(CONFIG_X86_64)) return; - if (!IS_ENABLED(CONFIG_X86_64)) + /* Mitigate in hardware if supported */ + if (spec_ctrl_bhi_dis()) return; if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) { @@ -2607,10 +2756,10 @@ static void __init srso_select_mitigation(void) if (boot_cpu_data.x86 == 0x19) { setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; + set_return_thunk(srso_alias_return_thunk); } else { setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; + set_return_thunk(srso_return_thunk); } if (has_microcode) srso_mitigation = SRSO_MITIGATION_SAFE_RET; @@ -2800,6 +2949,11 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t its_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2982,6 +3136,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_ITS: + return its_show_state(buf); + default: break; } @@ -3061,6 +3218,11 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } + +ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITS); +} #endif void __warn_thunk(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 12126adbc3a9..0ff057ff11ce 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1227,6 +1227,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define GDS BIT(6) /* CPU is affected by Register File Data Sampling */ #define RFDS BIT(7) +/* CPU is affected by Indirect Target Selection */ +#define ITS BIT(8) +/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */ +#define ITS_NATIVE_ONLY BIT(9) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), @@ -1238,22 +1242,25 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), @@ -1318,6 +1325,32 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) return cpu_matches(cpu_vuln_blacklist, RFDS); } +static bool __init vulnerable_to_its(u64 x86_arch_cap_msr) +{ + /* The "immunity" bit trumps everything else: */ + if (x86_arch_cap_msr & ARCH_CAP_ITS_NO) + return false; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + /* None of the affected CPUs have BHI_CTRL */ + if (boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + /* + * If a VMM did not expose ITS_NO, assume that a guest could + * be running on a vulnerable hardware or may migrate to such + * hardware. + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + if (cpu_matches(cpu_vuln_blacklist, ITS)) + return true; + + return false; +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); @@ -1439,9 +1472,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); - /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && - !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + /* + * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with + * BHI_NO still need to use the BHI mitigation to prevent Intra-mode + * attacks. When virtualized, eIBRS could be hidden, assume vulnerable. + */ + if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); @@ -1449,6 +1485,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (vulnerable_to_its(x86_arch_cap_msr)) { + setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY)) + setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY); + } + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 4a10d35e70aa..96cb992d50ef 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -1098,15 +1098,17 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz static int __init save_microcode_in_initrd(void) { - unsigned int cpuid_1_eax = native_cpuid_eax(1); struct cpuinfo_x86 *c = &boot_cpu_data; struct cont_desc desc = { 0 }; + unsigned int cpuid_1_eax; enum ucode_state ret; struct cpio_data cp; - if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) return 0; + cpuid_1_eax = native_cpuid_eax(1); + if (!find_blobs_in_containers(&cp)) return -EINVAL; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b3658d11e7b6..079f046ee26d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -41,8 +41,8 @@ #include "internal.h" -static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr = true; +static struct microcode_ops *microcode_ops; +static bool dis_ucode_ldr = false; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); module_param(force_minrev, bool, S_IRUSR | S_IWUSR); @@ -84,6 +84,9 @@ static bool amd_check_current_patch_level(void) u32 lvl, dummy, i; u32 *levels; + if (x86_cpuid_vendor() != X86_VENDOR_AMD) + return false; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); levels = final_levels; @@ -95,27 +98,29 @@ static bool amd_check_current_patch_level(void) return false; } -static bool __init check_loader_disabled_bsp(void) +bool __init microcode_loader_disabled(void) { - static const char *__dis_opt_str = "dis_ucode_ldr"; - const char *cmdline = boot_command_line; - const char *option = __dis_opt_str; + if (dis_ucode_ldr) + return true; /* - * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not - * completely accurate as xen pv guests don't see that CPUID bit set but - * that's good enough as they don't land on the BSP path anyway. + * Disable when: + * + * 1) The CPU does not support CPUID. + * + * 2) Bit 31 in CPUID[1]:ECX is clear + * The bit is reserved for hypervisor use. This is still not + * completely accurate as XEN PV guests don't see that CPUID bit + * set, but that's good enough as they don't land on the BSP + * path anyway. + * + * 3) Certain AMD patch levels are not allowed to be + * overwritten. */ - if (native_cpuid_ecx(1) & BIT(31)) - return true; - - if (x86_cpuid_vendor() == X86_VENDOR_AMD) { - if (amd_check_current_patch_level()) - return true; - } - - if (cmdline_find_option_bool(cmdline, option) <= 0) - dis_ucode_ldr = false; + if (!have_cpuid_p() || + native_cpuid_ecx(1) & BIT(31) || + amd_check_current_patch_level()) + dis_ucode_ldr = true; return dis_ucode_ldr; } @@ -125,7 +130,10 @@ void __init load_ucode_bsp(void) unsigned int cpuid_1_eax; bool intel = true; - if (!have_cpuid_p()) + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; + + if (microcode_loader_disabled()) return; cpuid_1_eax = native_cpuid_eax(1); @@ -146,9 +154,6 @@ void __init load_ucode_bsp(void) return; } - if (check_loader_disabled_bsp()) - return; - if (intel) load_ucode_intel_bsp(&early_data); else @@ -159,6 +164,11 @@ void load_ucode_ap(void) { unsigned int cpuid_1_eax; + /* + * Can't use microcode_loader_disabled() here - .init section + * hell. It doesn't have to either - the BSP variant must've + * parsed cmdline already anyway. + */ if (dis_ucode_ldr) return; @@ -810,7 +820,7 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (dis_ucode_ldr) + if (microcode_loader_disabled()) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 819199bc0119..2a397da43923 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -389,7 +389,7 @@ static int __init save_builtin_microcode(void) if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED) return 0; - if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; uci.mc = get_microcode_blob(&uci, true); diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 5df621752fef..50a9702ae4e2 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -94,7 +94,6 @@ static inline unsigned int x86_cpuid_family(void) return x86_family(eax); } -extern bool dis_ucode_ldr; extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cace6e8d7cc7..5eb1514af559 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -354,7 +354,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail; ip = trampoline + size; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(ip)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index de001b2146ab..375f2d7f1762 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void) *ptr = (unsigned long)ptep + PAGE_OFFSET; #ifdef CONFIG_MICROCODE_INITRD32 - /* Running on a hypervisor? */ - if (native_cpuid_ecx(1) & BIT(31)) - return; - params = (struct boot_params *)__pa_nodebug(&boot_params); if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image) return; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index a7998f351701..ff07558b7ebc 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -266,6 +266,8 @@ int module_finalize(const Elf_Ehdr *hdr, ibt_endbr = s; } + its_init_mod(me); + if (retpolines || cfi) { void *rseg = NULL, *cseg = NULL; unsigned int rsize = 0, csize = 0; @@ -286,6 +288,9 @@ int module_finalize(const Elf_Ehdr *hdr, void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); } + + its_fini_mod(me); + if (returns) { void *rseg = (void *)returns->sh_addr; apply_returns(rseg, rseg + returns->sh_size); @@ -326,4 +331,5 @@ int module_finalize(const Elf_Ehdr *hdr, void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); + its_free_mod(mod); } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index a59c72e77645..c3d7ff44b29a 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -81,7 +81,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, break; case RET: - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(insn)) code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; @@ -90,7 +90,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, case JCC: if (!func) { func = __static_call_return; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk()) func = x86_return_thunk; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ccdc45e5b759..cda5f8362e9d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -466,10 +466,18 @@ SECTIONS } /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause + * this. Let's assume that nobody will be running a COMPILE_TEST kernel and + * let's assert that fuller build coverage is more valuable than being able to + * run a COMPILE_TEST kernel. + */ +#ifndef CONFIG_COMPILE_TEST +/* + * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility: */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +#endif /* needed for Clang - see arch/x86/entry/entry.S */ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); @@ -497,6 +505,16 @@ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); "SRSO function pair won't alias"); #endif +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline"); +. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart"); +. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array"); +#endif + +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline"); +#endif + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 050a0e229a4d..f2b36d32ef40 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); + /* * Checking root.hpa is sufficient even when KVM has mirror root. * We can have either: diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 63bb77ee1bb1..8d1b632e33d2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } +EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_memory_slot *slot = range->slot; + int level; + /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM @@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; + if (WARN_ON_ONCE(range->end <= range->start)) + return false; + + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); + + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); + } + /* Unmap the old attribute page. */ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) range->attr_filter = KVM_FILTER_SHARED; @@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, return kvm_unmap_gfn_range(kvm, range); } -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; -} - -static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; -} -static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; -} static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 699e551ec93b..9864c057187d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } +EXPORT_SYMBOL_GPL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0bc708ee2788..a7a7dc507336 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3173,9 +3173,14 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -3184,18 +3189,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -3266,11 +3277,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d5d0c5c3300b..a89c271a1951 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); - - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); - return 0; } @@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. + */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -2231,6 +2287,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -5036,6 +5096,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -5061,6 +5123,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d4490eaed55d..f16b068c4228 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -335,6 +335,8 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; + bool bp_spec_reduce_set; + struct vmcb *save_area; unsigned long save_area_pa; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index df5b99ea1f18..be7bb6d20129 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1584,7 +1584,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1618,6 +1618,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* @@ -4597,7 +4599,7 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } -static inline u32 kvm_sync_valid_fields(struct kvm *kvm) +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) { return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; } @@ -11493,7 +11495,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; - u32 sync_valid_fields; + u64 sync_valid_fields; int r; r = kvm_mmu_post_init_vm(vcpu->kvm); diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index a26c43abd47d..39374949daa2 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -367,6 +367,54 @@ SYM_FUNC_END(call_depth_return_thunk) #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. + */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 29, 0xcc + +#define GEN(reg) ITS_THUNK reg +#include <asm/GEN-for-each-reg.h> +#undef GEN + + .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) +SYM_CODE_END(__x86_indirect_its_thunk_array) + +.align 64, 0xcc +.skip 32, 0xcc +SYM_CODE_START(its_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(its_return_thunk) +EXPORT_SYMBOL(its_return_thunk) + +#endif /* CONFIG_MITIGATION_ITS */ + /* * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad662cc4605c..7fa3965dcef1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -30,6 +30,7 @@ #include <linux/initrd.h> #include <linux/cpumask.h> #include <linux/gfp.h> +#include <linux/execmem.h> #include <asm/asm.h> #include <asm/bios_ebda.h> @@ -755,6 +756,8 @@ void mark_rodata_ro(void) pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); + execmem_cache_make_ro(); + kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c4f6f591f2b..949a447f75ec 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,6 +34,7 @@ #include <linux/gfp.h> #include <linux/kcore.h> #include <linux/bootmem_info.h> +#include <linux/execmem.h> #include <asm/processor.h> #include <asm/bios_ebda.h> @@ -1391,6 +1392,8 @@ void mark_rodata_ro(void) (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + execmem_cache_make_ro(); + kernel_set_to_readonly = 1; /* diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index eb83348f9305..b6d6750e4bd1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -899,8 +899,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* - * Let nmi_uaccess_okay() and finish_asid_transition() - * know that CR3 is changing. + * Indicate that CR3 is about to change. nmi_uaccess_okay() + * and others are sensitive to the window where mm_cpumask(), + * CR3 and cpu_tlbstate.loaded_mm are not all in sync. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); @@ -1204,8 +1205,16 @@ done: static bool should_flush_tlb(int cpu, void *data) { + struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); struct flush_tlb_info *info = data; + /* + * Order the 'loaded_mm' and 'is_lazy' against their + * write ordering in switch_mm_irqs_off(). Ensure + * 'is_lazy' is at least as new as 'loaded_mm'. + */ + smp_rmb(); + /* Lazy TLB will get flushed at the next context switch. */ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) return false; @@ -1214,8 +1223,15 @@ static bool should_flush_tlb(int cpu, void *data) if (!info->mm) return true; + /* + * While switching, the remote CPU could have state from + * either the prev or next mm. Assume the worst and flush. + */ + if (loaded_mm == LOADED_MM_SWITCHING) + return true; + /* The target mm is loaded, and the CPU is not lazy. */ - if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + if (loaded_mm == info->mm) return true; /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9e5fe2ba858f..ea4dd5b393aa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -41,6 +41,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) +#define EMIT5(b1, b2, b3, b4, b5) \ + do { EMIT1(b1); EMIT4(b2, b3, b4, b5); } while (0) #define EMIT1_off32(b1, off) \ do { EMIT1(b1); EMIT(off, 4); } while (0) @@ -661,7 +663,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { + OPTIMIZER_HIDE_VAR(reg); + emit_jump(&prog, its_static_thunk(reg), ip); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { @@ -683,7 +688,7 @@ static void emit_return(u8 **pprog, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk()) { emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ @@ -1502,6 +1507,48 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr) #define PRIV_STACK_GUARD_SZ 8 #define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL +static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, + struct bpf_prog *bpf_prog) +{ + u8 *prog = *pprog; + u8 *func; + + if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) { + /* The clearing sequence clobbers eax and ecx. */ + EMIT1(0x50); /* push rax */ + EMIT1(0x51); /* push rcx */ + ip += 2; + + func = (u8 *)clear_bhb_loop; + ip += x86_call_depth_emit_accounting(&prog, func, ip); + + if (emit_call(&prog, func, ip)) + return -EINVAL; + EMIT1(0x59); /* pop rcx */ + EMIT1(0x58); /* pop rax */ + } + /* Insert IBHF instruction */ + if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) && + cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) || + cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) { + /* + * Add an Indirect Branch History Fence (IBHF). IBHF acts as a + * fence preventing branch history from before the fence from + * affecting indirect branches after the fence. This is + * specifically used in cBPF jitted code to prevent Intra-mode + * BHI attacks. The IBHF instruction is designed to be a NOP on + * hardware that doesn't need or support it. The REP and REX.W + * prefixes are required by the microcode, and they also ensure + * that the NOP is unlikely to be used in existing code. + * + * IBHF is not a valid instruction in 32-bit mode. + */ + EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */ + } + *pprog = prog; + return 0; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { @@ -2544,6 +2591,13 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; + if (bpf_prog_was_classic(bpf_prog) && + !capable(CAP_SYS_ADMIN)) { + u8 *ip = image + addrs[i - 1]; + + if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) + return -EINVAL; + } if (bpf_prog->aux->exception_boundary) { pop_callee_regs(&prog, all_callee_regs_used); pop_r12(&prog); diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index ab5c8e47049c..9193a7790a71 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -31,8 +31,8 @@ struct faultinfo { #define ___backtrack_faulted(_faulted) \ asm volatile ( \ - "mov $0, %0\n" \ "movl $__get_kernel_nofault_faulted_%=,%1\n" \ + "mov $0, %0\n" \ "jmp _end_%=\n" \ "__get_kernel_nofault_faulted_%=:\n" \ "mov $1, %0;" \ diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index 26fb4835d3e9..61e4ca1e0ab5 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -31,8 +31,8 @@ struct faultinfo { #define ___backtrack_faulted(_faulted) \ asm volatile ( \ - "mov $0, %0\n" \ "movq $__get_kernel_nofault_faulted_%=,%1\n" \ + "mov $0, %0\n" \ "jmp _end_%=\n" \ "__get_kernel_nofault_faulted_%=:\n" \ "mov $1, %0;" \ |