From 8b37357a78d7fa13d88ea822b35b40137da1c85e Mon Sep 17 00:00:00 2001 From: Petr Vaněk Date: Mon, 7 Apr 2025 15:24:27 +0200 Subject: x86/acpi: Don't limit CPUs to 1 for Xen PV guests due to disabled ACPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Xen disables ACPI for PV guests in DomU, which causes acpi_mps_check() to return 1 when CONFIG_X86_MPPARSE is not set. As a result, the local APIC is disabled and the guest is later limited to a single vCPU, despite being configured with more. This regression was introduced in version 6.9 in commit 7c0edad3643f ("x86/cpu/topology: Rework possible CPU management"), which added an early check that limits CPUs to 1 if apic_is_disabled. Update the acpi_mps_check() logic to return 0 early when running as a Xen PV guest in DomU, preventing APIC from being disabled in this specific case and restoring correct multi-vCPU behaviour. Fixes: 7c0edad3643f ("x86/cpu/topology: Rework possible CPU management") Signed-off-by: Petr Vaněk Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250407132445.6732-2-arkamar@atlas.cz --- arch/x86/kernel/acpi/boot.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index dae6a73be40e..9fa321a95eb3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -23,6 +23,8 @@ #include #include +#include + #include #include #include @@ -1729,6 +1731,15 @@ int __init acpi_mps_check(void) { #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) /* mptable code is not built-in*/ + + /* + * Xen disables ACPI in PV DomU guests but it still emulates APIC and + * supports SMP. Returning early here ensures that APIC is not disabled + * unnecessarily and the guest is not limited to a single vCPU. + */ + if (xen_pv_domain() && !xen_initial_domain()) + return 0; + if (acpi_disabled || acpi_noirq) { pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); return 1; -- cgit From f2f29da9f0d4367f6ff35e0d9d021257bb53e273 Mon Sep 17 00:00:00 2001 From: Myrrh Periwinkle Date: Sun, 6 Apr 2025 11:45:22 +0700 Subject: x86/e820: Fix handling of subpage regions when calculating nosave ranges in e820__register_nosave_regions() While debugging kexec/hibernation hangs and crashes, it turned out that the current implementation of e820__register_nosave_regions() suffers from multiple serious issues: - The end of last region is tracked by PFN, causing it to find holes that aren't there if two consecutive subpage regions are present - The nosave PFN ranges derived from holes are rounded out (instead of rounded in) which makes it inconsistent with how explicitly reserved regions are handled Fix this by: - Treating reserved regions as if they were holes, to ensure consistent handling (rounding out nosave PFN ranges is more correct as the kernel does not use partial pages) - Tracking the end of the last RAM region by address instead of pages to detect holes more precisely These bugs appear to have been introduced about ~18 years ago with the very first version of e820_mark_nosave_regions(), and its flawed assumptions were carried forward uninterrupted through various waves of rewrites and renames. [ mingo: Added Git archeology details, for kicks and giggles. ] Fixes: e8eff5ac294e ("[PATCH] Make swsusp avoid memory holes and reserved memory regions on x86_64") Reported-by: Roberto Ricci Tested-by: Roberto Ricci Signed-off-by: Myrrh Periwinkle Signed-off-by: Ingo Molnar Cc: Rafael J. Wysocki Cc: Ard Biesheuvel Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: David Woodhouse Cc: Len Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250406-fix-e820-nosave-v3-1-f3787bc1ee1d@qtmlabs.xyz Closes: https://lore.kernel.org/all/Z4WFjBVHpndct7br@desktop0a/ --- arch/x86/kernel/e820.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 57120f0749cc..9d8dd8deb2a7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -753,22 +753,21 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) void __init e820__register_nosave_regions(unsigned long limit_pfn) { int i; - unsigned long pfn = 0; + u64 last_addr = 0; for (i = 0; i < e820_table->nr_entries; i++) { struct e820_entry *entry = &e820_table->entries[i]; - if (pfn < PFN_UP(entry->addr)) - register_nosave_region(pfn, PFN_UP(entry->addr)); - - pfn = PFN_DOWN(entry->addr + entry->size); - if (entry->type != E820_TYPE_RAM) - register_nosave_region(PFN_UP(entry->addr), pfn); + continue; - if (pfn >= limit_pfn) - break; + if (last_addr < entry->addr) + register_nosave_region(PFN_DOWN(last_addr), PFN_UP(entry->addr)); + + last_addr = entry->addr + entry->size; } + + register_nosave_region(PFN_DOWN(last_addr), limit_pfn); } #ifdef CONFIG_ACPI -- cgit From 45c2e30bbd64d75559b99f3a5c455129a7a34b06 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 7 Apr 2025 13:46:37 +0100 Subject: x86/resctrl: Fix rdtgroup_mkdir()'s unlocked use of kernfs_node::name Since 741c10b096bc ("kernfs: Use RCU to access kernfs_node::name.") a helper rdt_kn_name() that checks that rdtgroup_mutex is held has been used for all accesses to the kernfs node name. rdtgroup_mkdir() uses the name to determine if a valid monitor group is being created by checking the parent name is "mon_groups". This is done without holding rdtgroup_mutex, and now triggers the following warning: | WARNING: suspicious RCU usage | 6.15.0-rc1 #4465 Tainted: G E | ----------------------------- | arch/x86/kernel/cpu/resctrl/internal.h:408 suspicious rcu_dereference_check() usage! [...] | Call Trace: | | dump_stack_lvl | lockdep_rcu_suspicious.cold | is_mon_groups | rdtgroup_mkdir | kernfs_iop_mkdir | vfs_mkdir | do_mkdirat | __x64_sys_mkdir | do_syscall_64 | entry_SYSCALL_64_after_hwframe Creating a control or monitor group calls mkdir_rdt_prepare(), which uses rdtgroup_kn_lock_live() to take the rdtgroup_mutex. To avoid taking and dropping the lock, move the check for the monitor group name and position into mkdir_rdt_prepare() so that it occurs under rdtgroup_mutex. Hoist is_mon_groups() earlier in the file. [ bp: Massage. ] Fixes: 741c10b096bc ("kernfs: Use RCU to access kernfs_node::name.") Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Acked-by: Ingo Molnar Link: https://lore.kernel.org/r/20250407124637.2433230-1-james.morse@arm.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 48 +++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 93ec829015f1..cc4a54145c83 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3553,6 +3553,22 @@ static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) free_rmid(rgrp->closid, rgrp->mon.rmid); } +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(rdt_kn_name(kn), "mon_groups") && + strcmp(name, "mon_groups")); +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3568,6 +3584,15 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_unlock; } + /* + * Check that the parent directory for a monitor group is a "mon_groups" + * directory. + */ + if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { + ret = -EPERM; + goto out_unlock; + } + if (rtype == RDTMON_GROUP && (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { @@ -3751,22 +3776,6 @@ out_unlock: return ret; } -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. - */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(rdt_kn_name(kn), "mon_groups") && - strcmp(name, "mon_groups")); -} - static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { @@ -3782,11 +3791,8 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - /* - * If RDT monitoring is supported and the parent directory is a valid - * "mon_groups" directory, add a monitoring subdirectory. - */ - if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) + /* Else, attempt to add a monitoring subdirectory. */ + if (resctrl_arch_mon_capable()) return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; -- cgit From 996457176bb7c64b3d30996592c754205ec4d3ea Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Apr 2025 20:22:14 +0300 Subject: x86/early_printk: Use 'mmio32' for consistency, fix comments First of all, using 'mmio' prevents proper implementation of 8-bit accessors. Second, it's simply inconsistent with uart8250 set of options. Rename it to 'mmio32'. While at it, remove rather misleading comment in the documentation. From now on mmio32 is self-explanatory and pciserial supports not only 32-bit MMIO accessors. Also, while at it, fix the comment for the "pciserial" case. The comment seems to be a copy'n'paste error when mentioning "serial" instead of "pciserial" (with double quotes). Fix this. With that, move it upper, so we don't calculate 'buf' twice. Fixes: 3181424aeac2 ("x86/early_printk: Add support for MMIO-based UARTs") Signed-off-by: Andy Shevchenko Signed-off-by: Ingo Molnar Reviewed-by: Denis Mukhin Link: https://lore.kernel.org/r/20250407172214.792745-1-andriy.shevchenko@linux.intel.com --- Documentation/admin-guide/kernel-parameters.txt | 5 +---- arch/x86/kernel/early_printk.c | 10 +++++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 76e538c77e31..d9fd26b95b34 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1407,18 +1407,15 @@ earlyprintk=serial[,0x...[,baudrate]] earlyprintk=ttySn[,baudrate] earlyprintk=dbgp[debugController#] + earlyprintk=mmio32,membase[,{nocfg|baudrate}] earlyprintk=pciserial[,force],bus:device.function[,{nocfg|baudrate}] earlyprintk=xdbc[xhciController#] earlyprintk=bios - earlyprintk=mmio,membase[,{nocfg|baudrate}] earlyprintk is useful when the kernel crashes before the normal console is initialized. It is not enabled by default because it has some cosmetic problems. - Only 32-bit memory addresses are supported for "mmio" - and "pciserial" devices. - Use "nocfg" to skip UART configuration, assume BIOS/firmware has configured UART correctly. diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 611f27e3890c..3aad78bfcb26 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -389,10 +389,10 @@ static int __init setup_early_printk(char *buf) keep = (strstr(buf, "keep") != NULL); while (*buf != '\0') { - if (!strncmp(buf, "mmio", 4)) { - early_mmio_serial_init(buf + 4); + if (!strncmp(buf, "mmio32", 6)) { + buf += 6; + early_mmio_serial_init(buf); early_console_register(&early_serial_console, keep); - buf += 4; } if (!strncmp(buf, "serial", 6)) { buf += 6; @@ -407,9 +407,9 @@ static int __init setup_early_printk(char *buf) } #ifdef CONFIG_PCI if (!strncmp(buf, "pciserial", 9)) { - early_pci_serial_init(buf + 9); + buf += 9; /* Keep from match the above "pciserial" */ + early_pci_serial_init(buf); early_console_register(&early_serial_console, keep); - buf += 9; /* Keep from match the above "serial" */ } #endif if (!strncmp(buf, "vga", 3) && -- cgit From 13235d6d50bba99931c4392c0f813cfae0de3eac Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:30 -0700 Subject: x86/bugs: Rename entry_ibpb() to write_ibpb() There's nothing entry-specific about entry_ibpb(). In preparation for calling it from elsewhere, rename it to write_ibpb(). Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/1e54ace131e79b760de3fe828264e26d0896e3ac.1744148254.git.jpoimboe@kernel.org --- arch/x86/entry/entry.S | 7 ++++--- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/bugs.c | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index d3caa31240ed..cabe65ac8379 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -17,7 +17,8 @@ .pushsection .noinstr.text, "ax" -SYM_FUNC_START(entry_ibpb) +/* Clobbers AX, CX, DX */ +SYM_FUNC_START(write_ibpb) ANNOTATE_NOENDBR movl $MSR_IA32_PRED_CMD, %ecx movl $PRED_CMD_IBPB, %eax @@ -27,9 +28,9 @@ SYM_FUNC_START(entry_ibpb) /* Make sure IBPB clears return stack preductions too. */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET -SYM_FUNC_END(entry_ibpb) +SYM_FUNC_END(write_ibpb) /* For KVM */ -EXPORT_SYMBOL_GPL(entry_ibpb); +EXPORT_SYMBOL_GPL(write_ibpb); .popsection diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 8a5cc8e70439..591d1dbca60a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -269,7 +269,7 @@ * typically has NO_MELTDOWN). * * While retbleed_untrain_ret() doesn't clobber anything but requires stack, - * entry_ibpb() will clobber AX, CX, DX. + * write_ibpb() will clobber AX, CX, DX. * * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. @@ -279,7 +279,7 @@ VALIDATE_UNRET_END CALL_UNTRAIN_RET ALTERNATIVE_2 "", \ - "call entry_ibpb", \ibpb_feature, \ + "call write_ibpb", \ibpb_feature, \ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm @@ -368,7 +368,7 @@ extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); -extern void entry_ibpb(void); +extern void write_ibpb(void); #ifdef CONFIG_X86_64 extern void clear_bhb_loop(void); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4386aa6c69e1..608bbe6cf730 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1142,7 +1142,7 @@ do_cmd_auto: setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -2676,7 +2676,7 @@ static void __init srso_select_mitigation(void) setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -2701,7 +2701,7 @@ ibpb_on_vmexit: srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ -- cgit From fc9fd3f98423367c79e0bd85a9515df26dc1b3cc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:31 -0700 Subject: x86/bugs: Use SBPB in write_ibpb() if applicable write_ibpb() does IBPB, which (among other things) flushes branch type predictions on AMD. If the CPU has SRSO_NO, or if the SRSO mitigation has been disabled, branch type flushing isn't needed, in which case the lighter-weight SBPB can be used. The 'x86_pred_cmd' variable already keeps track of whether IBPB or SBPB should be used. Use that instead of hardcoding IBPB. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/17c5dcd14b29199b75199d67ff7758de9d9a4928.1744148254.git.jpoimboe@kernel.org --- arch/x86/entry/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index cabe65ac8379..175958b02f2b 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -21,7 +21,7 @@ SYM_FUNC_START(write_ibpb) ANNOTATE_NOENDBR movl $MSR_IA32_PRED_CMD, %ecx - movl $PRED_CMD_IBPB, %eax + movl _ASM_RIP(x86_pred_cmd), %eax xorl %edx, %edx wrmsr -- cgit From b1b19cfcf4656c75088dc06b7499f493e0dec3e5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:32 -0700 Subject: x86/bugs: Fix RSB clearing in indirect_branch_prediction_barrier() IBPB is expected to clear the RSB. However, if X86_BUG_IBPB_NO_RET is set, that doesn't happen. Make indirect_branch_prediction_barrier() take that into account by calling write_ibpb() which clears RSB on X86_BUG_IBPB_NO_RET: /* Make sure IBPB clears return stack preductions too. */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET Note that, as of the previous patch, write_ibpb() also reads 'x86_pred_cmd' in order to use SBPB when applicable: movl _ASM_RIP(x86_pred_cmd), %eax Therefore that existing behavior in indirect_branch_prediction_barrier() is not lost. Fixes: 50e4b3b94090 ("x86/entry: Have entry_ibpb() invalidate return predictions") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/bba68888c511743d4cd65564d1fc41438907523f.1744148254.git.jpoimboe@kernel.org --- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/bugs.c | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 591d1dbca60a..5c43f145454d 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -514,11 +514,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } -extern u64 x86_pred_cmd; - static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_IBPB); + asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB) + : ASM_CALL_CONSTRAINT + :: "rax", "rcx", "rdx", "memory"); } /* The Intel SPEC CTRL MSR base value cache */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 608bbe6cf730..99265098d045 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -59,7 +59,6 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; -EXPORT_SYMBOL_GPL(x86_pred_cmd); static u64 __ro_after_init x86_arch_cap_msr; -- cgit From 18bae0dfec15b24ec14ca17dc18603372f5f254f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:33 -0700 Subject: x86/bugs: Don't fill RSB on VMEXIT with eIBRS+retpoline eIBRS protects against guest->host RSB underflow/poisoning attacks. Adding retpoline to the mix doesn't change that. Retpoline has a balanced CALL/RET anyway. So the current full RSB filling on VMEXIT with eIBRS+retpoline is overkill. Disable it or do the VMEXIT_LITE mitigation if needed. Suggested-by: Pawan Gupta Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Pawan Gupta Reviewed-by: Amit Shah Reviewed-by: Nikolay Borisov Cc: Paolo Bonzini Cc: Vitaly Kuznetsov Cc: Sean Christopherson Cc: David Woodhouse Link: https://lore.kernel.org/r/84a1226e5c9e2698eae1b5ade861f1b8bf3677dc.1744148254.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 99265098d045..a10b37bb747e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1617,20 +1617,20 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ case SPECTRE_V2_NONE: return; - case SPECTRE_V2_EIBRS_LFENCE: case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS_RETPOLINE: if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); } return; - case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); return; } -- cgit From 27ce8299bc1ec6df8306073785ff82b30b3cc5ee Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:34 -0700 Subject: x86/bugs: Don't fill RSB on context switch with eIBRS User->user Spectre v2 attacks (including RSB) across context switches are already mitigated by IBPB in cond_mitigation(), if enabled globally or if either the prev or the next task has opted in to protection. RSB filling without IBPB serves no purpose for protecting user space, as indirect branches are still vulnerable. User->kernel RSB attacks are mitigated by eIBRS. In which case the RSB filling on context switch isn't needed, so remove it. Suggested-by: Pawan Gupta Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Pawan Gupta Reviewed-by: Amit Shah Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/98cdefe42180358efebf78e3b80752850c7a3e1b.1744148254.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 24 ++++++++++++------------ arch/x86/mm/tlb.c | 6 +++--- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a10b37bb747e..e2a672f925e3 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1591,7 +1591,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) rrsba_disabled = true; } -static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* * Similar to context switches, there are two types of RSB attacks @@ -1615,7 +1615,7 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ */ switch (mode) { case SPECTRE_V2_NONE: - return; + break; case SPECTRE_V2_EIBRS: case SPECTRE_V2_EIBRS_LFENCE: @@ -1624,18 +1624,21 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); } - return; + break; case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: - pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); - return; - } + break; - pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); - dump_stack(); + default: + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n"); + dump_stack(); + break; + } } /* @@ -1867,10 +1870,7 @@ static void __init spectre_v2_select_mitigation(void) * * FIXME: Is this pointless for retbleed-affected AMD? */ - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - - spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + spectre_v2_select_rsb_mitigation(mode); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e459d97ef397..eb83348f9305 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -667,9 +667,9 @@ static void cond_mitigation(struct task_struct *next) prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); /* - * Avoid user/user BTB poisoning by flushing the branch predictor - * when switching between processes. This stops one process from - * doing Spectre-v2 attacks on another. + * Avoid user->user BTB/RSB poisoning by flushing them when switching + * between processes. This stops one process from doing Spectre-v2 + * attacks on another. * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the -- cgit From 83f6665a49c3d44ad0c08f837d352dd290f5d10b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:35 -0700 Subject: x86/bugs: Add RSB mitigation document Create a document to summarize hard-earned knowledge about RSB-related mitigations, with references, and replace the overly verbose yet incomplete comments with a reference to the document. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/ab73f4659ba697a974759f07befd41ae605e33dd.1744148254.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/index.rst | 1 + Documentation/admin-guide/hw-vuln/rsb.rst | 268 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/bugs.c | 64 ++----- 3 files changed, 282 insertions(+), 51 deletions(-) create mode 100644 Documentation/admin-guide/hw-vuln/rsb.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index ff0b440ef2dc..451874b8135d 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -22,3 +22,4 @@ are configurable at compile, boot or run time. srso gather_data_sampling reg-file-data-sampling + rsb diff --git a/Documentation/admin-guide/hw-vuln/rsb.rst b/Documentation/admin-guide/hw-vuln/rsb.rst new file mode 100644 index 000000000000..21dbf9cf25f8 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/rsb.rst @@ -0,0 +1,268 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +RSB-related mitigations +======================= + +.. warning:: + Please keep this document up-to-date, otherwise you will be + volunteered to update it and convert it to a very long comment in + bugs.c! + +Since 2018 there have been many Spectre CVEs related to the Return Stack +Buffer (RSB) (sometimes referred to as the Return Address Stack (RAS) or +Return Address Predictor (RAP) on AMD). + +Information about these CVEs and how to mitigate them is scattered +amongst a myriad of microarchitecture-specific documents. + +This document attempts to consolidate all the relevant information in +once place and clarify the reasoning behind the current RSB-related +mitigations. It's meant to be as concise as possible, focused only on +the current kernel mitigations: what are the RSB-related attack vectors +and how are they currently being mitigated? + +It's *not* meant to describe how the RSB mechanism operates or how the +exploits work. More details about those can be found in the references +below. + +Rather, this is basically a glorified comment, but too long to actually +be one. So when the next CVE comes along, a kernel developer can +quickly refer to this as a refresher to see what we're actually doing +and why. + +At a high level, there are two classes of RSB attacks: RSB poisoning +(Intel and AMD) and RSB underflow (Intel only). They must each be +considered individually for each attack vector (and microarchitecture +where applicable). + +---- + +RSB poisoning (Intel and AMD) +============================= + +SpectreRSB +~~~~~~~~~~ + +RSB poisoning is a technique used by SpectreRSB [#spectre-rsb]_ where +an attacker poisons an RSB entry to cause a victim's return instruction +to speculate to an attacker-controlled address. This can happen when +there are unbalanced CALLs/RETs after a context switch or VMEXIT. + +* All attack vectors can potentially be mitigated by flushing out any + poisoned RSB entries using an RSB filling sequence + [#intel-rsb-filling]_ [#amd-rsb-filling]_ when transitioning between + untrusted and trusted domains. But this has a performance impact and + should be avoided whenever possible. + + .. DANGER:: + **FIXME**: Currently we're flushing 32 entries. However, some CPU + models have more than 32 entries. The loop count needs to be + increased for those. More detailed information is needed about RSB + sizes. + +* On context switch, the user->user mitigation requires ensuring the + RSB gets filled or cleared whenever IBPB gets written [#cond-ibpb]_ + during a context switch: + + * AMD: + On Zen 4+, IBPB (or SBPB [#amd-sbpb]_ if used) clears the RSB. + This is indicated by IBPB_RET in CPUID [#amd-ibpb-rsb]_. + + On Zen < 4, the RSB filling sequence [#amd-rsb-filling]_ must be + always be done in addition to IBPB [#amd-ibpb-no-rsb]_. This is + indicated by X86_BUG_IBPB_NO_RET. + + * Intel: + IBPB always clears the RSB: + + "Software that executed before the IBPB command cannot control + the predicted targets of indirect branches executed after the + command on the same logical processor. The term indirect branch + in this context includes near return instructions, so these + predicted targets may come from the RSB." [#intel-ibpb-rsb]_ + +* On context switch, user->kernel attacks are prevented by SMEP. User + space can only insert user space addresses into the RSB. Even + non-canonical addresses can't be inserted due to the page gap at the + end of the user canonical address space reserved by TASK_SIZE_MAX. + A SMEP #PF at instruction fetch prevents the kernel from speculatively + executing user space. + + * AMD: + "Finally, branches that are predicted as 'ret' instructions get + their predicted targets from the Return Address Predictor (RAP). + AMD recommends software use a RAP stuffing sequence (mitigation + V2-3 in [2]) and/or Supervisor Mode Execution Protection (SMEP) + to ensure that the addresses in the RAP are safe for + speculation. Collectively, we refer to these mitigations as "RAP + Protection"." [#amd-smep-rsb]_ + + * Intel: + "On processors with enhanced IBRS, an RSB overwrite sequence may + not suffice to prevent the predicted target of a near return + from using an RSB entry created in a less privileged predictor + mode. Software can prevent this by enabling SMEP (for + transitions from user mode to supervisor mode) and by having + IA32_SPEC_CTRL.IBRS set during VM exits." [#intel-smep-rsb]_ + +* On VMEXIT, guest->host attacks are mitigated by eIBRS (and PBRSB + mitigation if needed): + + * AMD: + "When Automatic IBRS is enabled, the internal return address + stack used for return address predictions is cleared on VMEXIT." + [#amd-eibrs-vmexit]_ + + * Intel: + "On processors with enhanced IBRS, an RSB overwrite sequence may + not suffice to prevent the predicted target of a near return + from using an RSB entry created in a less privileged predictor + mode. Software can prevent this by enabling SMEP (for + transitions from user mode to supervisor mode) and by having + IA32_SPEC_CTRL.IBRS set during VM exits. Processors with + enhanced IBRS still support the usage model where IBRS is set + only in the OS/VMM for OSes that enable SMEP. To do this, such + processors will ensure that guest behavior cannot control the + RSB after a VM exit once IBRS is set, even if IBRS was not set + at the time of the VM exit." [#intel-eibrs-vmexit]_ + + Note that some Intel CPUs are susceptible to Post-barrier Return + Stack Buffer Predictions (PBRSB) [#intel-pbrsb]_, where the last + CALL from the guest can be used to predict the first unbalanced RET. + In this case the PBRSB mitigation is needed in addition to eIBRS. + +AMD RETBleed / SRSO / Branch Type Confusion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On AMD, poisoned RSB entries can also be created by the AMD RETBleed +variant [#retbleed-paper]_ [#amd-btc]_ or by Speculative Return Stack +Overflow [#amd-srso]_ (Inception [#inception-paper]_). The kernel +protects itself by replacing every RET in the kernel with a branch to a +single safe RET. + +---- + +RSB underflow (Intel only) +========================== + +RSB Alternate (RSBA) ("Intel Retbleed") +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some Intel Skylake-generation CPUs are susceptible to the Intel variant +of RETBleed [#retbleed-paper]_ (Return Stack Buffer Underflow +[#intel-rsbu]_). If a RET is executed when the RSB buffer is empty due +to mismatched CALLs/RETs or returning from a deep call stack, the branch +predictor can fall back to using the Branch Target Buffer (BTB). If a +user forces a BTB collision then the RET can speculatively branch to a +user-controlled address. + +* Note that RSB filling doesn't fully mitigate this issue. If there + are enough unbalanced RETs, the RSB may still underflow and fall back + to using a poisoned BTB entry. + +* On context switch, user->user underflow attacks are mitigated by the + conditional IBPB [#cond-ibpb]_ on context switch which effectively + clears the BTB: + + * "The indirect branch predictor barrier (IBPB) is an indirect branch + control mechanism that establishes a barrier, preventing software + that executed before the barrier from controlling the predicted + targets of indirect branches executed after the barrier on the same + logical processor." [#intel-ibpb-btb]_ + +* On context switch and VMEXIT, user->kernel and guest->host RSB + underflows are mitigated by IBRS or eIBRS: + + * "Enabling IBRS (including enhanced IBRS) will mitigate the "RSBU" + attack demonstrated by the researchers. As previously documented, + Intel recommends the use of enhanced IBRS, where supported. This + includes any processor that enumerates RRSBA but not RRSBA_DIS_S." + [#intel-rsbu]_ + + However, note that eIBRS and IBRS do not mitigate intra-mode attacks. + Like RRSBA below, this is mitigated by clearing the BHB on kernel + entry. + + As an alternative to classic IBRS, call depth tracking (combined with + retpolines) can be used to track kernel returns and fill the RSB when + it gets close to being empty. + +Restricted RSB Alternate (RRSBA) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some newer Intel CPUs have Restricted RSB Alternate (RRSBA) behavior, +which, similar to RSBA described above, also falls back to using the BTB +on RSB underflow. The only difference is that the predicted targets are +restricted to the current domain when eIBRS is enabled: + +* "Restricted RSB Alternate (RRSBA) behavior allows alternate branch + predictors to be used by near RET instructions when the RSB is + empty. When eIBRS is enabled, the predicted targets of these + alternate predictors are restricted to those belonging to the + indirect branch predictor entries of the current prediction domain. + [#intel-eibrs-rrsba]_ + +When a CPU with RRSBA is vulnerable to Branch History Injection +[#bhi-paper]_ [#intel-bhi]_, an RSB underflow could be used for an +intra-mode BTI attack. This is mitigated by clearing the BHB on +kernel entry. + +However if the kernel uses retpolines instead of eIBRS, it needs to +disable RRSBA: + +* "Where software is using retpoline as a mitigation for BHI or + intra-mode BTI, and the processor both enumerates RRSBA and + enumerates RRSBA_DIS controls, it should disable this behavior." + [#intel-retpoline-rrsba]_ + +---- + +References +========== + +.. [#spectre-rsb] `Spectre Returns! Speculation Attacks using the Return Stack Buffer `_ + +.. [#intel-rsb-filling] "Empty RSB Mitigation on Skylake-generation" in `Retpoline: A Branch Target Injection Mitigation `_ + +.. [#amd-rsb-filling] "Mitigation V2-3" in `Software Techniques for Managing Speculation `_ + +.. [#cond-ibpb] Whether IBPB is written depends on whether the prev and/or next task is protected from Spectre attacks. It typically requires opting in per task or system-wide. For more details see the documentation for the ``spectre_v2_user`` cmdline option in Documentation/admin-guide/kernel-parameters.txt. + +.. [#amd-sbpb] IBPB without flushing of branch type predictions. Only exists for AMD. + +.. [#amd-ibpb-rsb] "Function 8000_0008h -- Processor Capacity Parameters and Extended Feature Identification" in `AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions `_. SBPB behaves the same way according to `this email `_. + +.. [#amd-ibpb-no-rsb] `Spectre Attacks: Exploiting Speculative Execution `_ + +.. [#intel-ibpb-rsb] "Introduction" in `Post-barrier Return Stack Buffer Predictions / CVE-2022-26373 / INTEL-SA-00706 `_ + +.. [#amd-smep-rsb] "Existing Mitigations" in `Technical Guidance for Mitigating Branch Type Confusion `_ + +.. [#intel-smep-rsb] "Enhanced IBRS" in `Indirect Branch Restricted Speculation `_ + +.. [#amd-eibrs-vmexit] "Extended Feature Enable Register (EFER)" in `AMD64 Architecture Programmer's Manual Volume 2: System Programming `_ + +.. [#intel-eibrs-vmexit] "Enhanced IBRS" in `Indirect Branch Restricted Speculation `_ + +.. [#intel-pbrsb] `Post-barrier Return Stack Buffer Predictions / CVE-2022-26373 / INTEL-SA-00706 `_ + +.. [#retbleed-paper] `RETBleed: Arbitrary Speculative Code Execution with Return Instruction `_ + +.. [#amd-btc] `Technical Guidance for Mitigating Branch Type Confusion `_ + +.. [#amd-srso] `Technical Update Regarding Speculative Return Stack Overflow `_ + +.. [#inception-paper] `Inception: Exposing New Attack Surfaces with Training in Transient Execution `_ + +.. [#intel-rsbu] `Return Stack Buffer Underflow / Return Stack Buffer Underflow / CVE-2022-29901, CVE-2022-28693 / INTEL-SA-00702 `_ + +.. [#intel-ibpb-btb] `Indirect Branch Predictor Barrier' `_ + +.. [#intel-eibrs-rrsba] "Guidance for RSBU" in `Return Stack Buffer Underflow / Return Stack Buffer Underflow / CVE-2022-29901, CVE-2022-28693 / INTEL-SA-00702 `_ + +.. [#bhi-paper] `Branch History Injection: On the Effectiveness of Hardware Mitigations Against Cross-Privilege Spectre-v2 Attacks `_ + +.. [#intel-bhi] `Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 `_ + +.. [#intel-retpoline-rrsba] "Retpoline" in `Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 `_ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e2a672f925e3..362602b705cc 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1594,25 +1594,25 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: + * WARNING! There are many subtleties to consider when changing *any* + * code related to RSB-related mitigations. Before doing so, carefully + * read the following document, and update if necessary: * - * 1) RSB underflow + * Documentation/admin-guide/hw-vuln/rsb.rst * - * 2) Poisoned RSB entry + * In an overly simplified nutshell: * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. + * - User->user RSB attacks are conditionally mitigated during + * context switches by cond_mitigation -> write_ibpb(). * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. + * - User->kernel and guest->host attacks are mitigated by eIBRS or + * RSB filling. * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. + * Though, depending on config, note that other alternative + * mitigations may end up getting used instead, e.g., IBPB on + * entry/vmexit, call depth tracking, or return thunks. */ + switch (mode) { case SPECTRE_V2_NONE: break; @@ -1832,44 +1832,6 @@ static void __init spectre_v2_select_mitigation(void) spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); - /* - * If Spectre v2 protection has been enabled, fill the RSB during a - * context switch. In general there are two types of RSB attacks - * across context switches, for which the CALLs/RETs may be unbalanced. - * - * 1) RSB underflow - * - * Some Intel parts have "bottomless RSB". When the RSB is empty, - * speculated return targets may come from the branch predictor, - * which could have a user-poisoned BTB or BHB entry. - * - * AMD has it even worse: *all* returns are speculated from the BTB, - * regardless of the state of the RSB. - * - * When IBRS or eIBRS is enabled, the "user -> kernel" attack - * scenario is mitigated by the IBRS branch prediction isolation - * properties, so the RSB buffer filling wouldn't be necessary to - * protect against this type of attack. - * - * The "user -> user" attack scenario is mitigated by RSB filling. - * - * 2) Poisoned RSB entry - * - * If the 'next' in-kernel return stack is shorter than 'prev', - * 'next' could be tricked into speculating with a user-poisoned RSB - * entry. - * - * The "user -> kernel" attack scenario is mitigated by SMEP and - * eIBRS. - * - * The "user -> user" scenario, also known as SpectreBHB, requires - * RSB clearing. - * - * So to mitigate all cases, unconditionally fill RSB on context - * switches. - * - * FIXME: Is this pointless for retbleed-affected AMD? - */ spectre_v2_select_rsb_mitigation(mode); /* -- cgit From af76f7d57ee9a3be7b3840595ce3e2bdedd594a7 Mon Sep 17 00:00:00 2001 From: "Naveen N Rao (AMD)" Date: Wed, 9 Apr 2025 13:13:41 +0200 Subject: Documentation/x86: Update the naming of CPU features for /proc/cpuinfo Commit: 78ce84b9e0a5 ("x86/cpufeatures: Flip the /proc/cpuinfo appearance logic") changed how CPU feature names should be specified. Update document to reflect the same. Signed-off-by: Naveen N Rao (AMD) Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250409111341.GDZ_ZWZS4LckBcirLE@fat_crate.local --- Documentation/arch/x86/cpuinfo.rst | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/Documentation/arch/x86/cpuinfo.rst b/Documentation/arch/x86/cpuinfo.rst index 6ef426a52cdc..7114f34ba3e6 100644 --- a/Documentation/arch/x86/cpuinfo.rst +++ b/Documentation/arch/x86/cpuinfo.rst @@ -130,14 +130,18 @@ x86_cap/bug_flags[] arrays in kernel/cpu/capflags.c. The names in the resulting x86_cap/bug_flags[] are used to populate /proc/cpuinfo. The naming of flags in the x86_cap/bug_flags[] are as follows: -a: The name of the flag is from the string in X86_FEATURE_ by default. ----------------------------------------------------------------------------- -By default, the flag in /proc/cpuinfo is extracted from the respective -X86_FEATURE_ in cpufeatures.h. For example, the flag "avx2" is from -X86_FEATURE_AVX2. - -b: The naming can be overridden. --------------------------------- +a: Flags do not appear by default in /proc/cpuinfo +-------------------------------------------------- + +Feature flags are omitted by default from /proc/cpuinfo as it does not make +sense for the feature to be exposed to userspace in most cases. For example, +X86_FEATURE_ALWAYS is defined in cpufeatures.h but that flag is an internal +kernel feature used in the alternative runtime patching functionality. So the +flag does not appear in /proc/cpuinfo. + +b: Specify a flag name if absolutely needed +------------------------------------------- + If the comment on the line for the #define X86_FEATURE_* starts with a double-quote character (""), the string inside the double-quote characters will be the name of the flags. For example, the flag "sse4_1" comes from @@ -148,14 +152,6 @@ needed. For instance, /proc/cpuinfo is a userspace interface and must remain constant. If, for some reason, the naming of X86_FEATURE_ changes, one shall override the new naming with the name already used in /proc/cpuinfo. -c: The naming override can be "", which means it will not appear in /proc/cpuinfo. ----------------------------------------------------------------------------------- -The feature shall be omitted from /proc/cpuinfo if it does not make sense for -the feature to be exposed to userspace. For example, X86_FEATURE_ALWAYS is -defined in cpufeatures.h but that flag is an internal kernel feature used -in the alternative runtime patching functionality. So, its name is overridden -with "". Its flag will not appear in /proc/cpuinfo. - Flags are missing when one or more of these happen ================================================== -- cgit From 254a6d14c9c952e8eae0fafd4fed3778721b948e Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 9 Apr 2025 13:14:35 +0200 Subject: Documentation/x86: Zap the subsection letters The subsections already have numbering - no need for the letters too. Zap the latter. Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250409111435.GEZ_ZWmz3_lkP8S9Lb@fat_crate.local --- Documentation/arch/x86/cpuinfo.rst | 51 ++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/Documentation/arch/x86/cpuinfo.rst b/Documentation/arch/x86/cpuinfo.rst index 7114f34ba3e6..f80e2a558d2a 100644 --- a/Documentation/arch/x86/cpuinfo.rst +++ b/Documentation/arch/x86/cpuinfo.rst @@ -79,8 +79,9 @@ feature flags. How are feature flags created? ============================== -a: Feature flags can be derived from the contents of CPUID leaves. ------------------------------------------------------------------- +Feature flags can be derived from the contents of CPUID leaves +-------------------------------------------------------------- + These feature definitions are organized mirroring the layout of CPUID leaves and grouped in words with offsets as mapped in enum cpuid_leafs in cpufeatures.h (see arch/x86/include/asm/cpufeatures.h for details). @@ -89,8 +90,9 @@ cpufeatures.h, and if it is detected at run time, the flags will be displayed accordingly in /proc/cpuinfo. For example, the flag "avx2" comes from X86_FEATURE_AVX2 in cpufeatures.h. -b: Flags can be from scattered CPUID-based features. ----------------------------------------------------- +Flags can be from scattered CPUID-based features +------------------------------------------------ + Hardware features enumerated in sparsely populated CPUID leaves get software-defined values. Still, CPUID needs to be queried to determine if a given feature is present. This is done in init_scattered_cpuid_features(). @@ -104,8 +106,9 @@ has only one feature and would waste 31 bits of space in the x86_capability[] array. Since there is a struct cpuinfo_x86 for each possible CPU, the wasted memory is not trivial. -c: Flags can be created synthetically under certain conditions for hardware features. -------------------------------------------------------------------------------------- +Flags can be created synthetically under certain conditions for hardware features +--------------------------------------------------------------------------------- + Examples of conditions include whether certain features are present in MSR_IA32_CORE_CAPS or specific CPU models are identified. If the needed conditions are met, the features are enabled by the set_cpu_cap or @@ -114,8 +117,8 @@ the feature X86_FEATURE_SPLIT_LOCK_DETECT will be enabled and "split_lock_detect" will be displayed. The flag "ring3mwait" will be displayed only when running on INTEL_XEON_PHI_[KNL|KNM] processors. -d: Flags can represent purely software features. ------------------------------------------------- +Flags can represent purely software features +-------------------------------------------- These flags do not represent hardware features. Instead, they represent a software feature implemented in the kernel. For example, Kernel Page Table Isolation is purely software feature and its feature flag X86_FEATURE_PTI is @@ -130,8 +133,8 @@ x86_cap/bug_flags[] arrays in kernel/cpu/capflags.c. The names in the resulting x86_cap/bug_flags[] are used to populate /proc/cpuinfo. The naming of flags in the x86_cap/bug_flags[] are as follows: -a: Flags do not appear by default in /proc/cpuinfo --------------------------------------------------- +Flags do not appear by default in /proc/cpuinfo +----------------------------------------------- Feature flags are omitted by default from /proc/cpuinfo as it does not make sense for the feature to be exposed to userspace in most cases. For example, @@ -139,8 +142,8 @@ X86_FEATURE_ALWAYS is defined in cpufeatures.h but that flag is an internal kernel feature used in the alternative runtime patching functionality. So the flag does not appear in /proc/cpuinfo. -b: Specify a flag name if absolutely needed -------------------------------------------- +Specify a flag name if absolutely needed +---------------------------------------- If the comment on the line for the #define X86_FEATURE_* starts with a double-quote character (""), the string inside the double-quote characters @@ -155,25 +158,28 @@ shall override the new naming with the name already used in /proc/cpuinfo. Flags are missing when one or more of these happen ================================================== -a: The hardware does not enumerate support for it. --------------------------------------------------- +The hardware does not enumerate support for it +---------------------------------------------- + For example, when a new kernel is running on old hardware or the feature is not enabled by boot firmware. Even if the hardware is new, there might be a problem enabling the feature at run time, the flag will not be displayed. -b: The kernel does not know about the flag. -------------------------------------------- +The kernel does not know about the flag +--------------------------------------- + For example, when an old kernel is running on new hardware. -c: The kernel disabled support for it at compile-time. ------------------------------------------------------- +The kernel disabled support for it at compile-time +-------------------------------------------------- + For example, if 5-level-paging is not enabled when building (i.e., CONFIG_X86_5LEVEL is not selected) the flag "la57" will not show up [#f1]_. Even though the feature will still be detected via CPUID, the kernel disables it by clearing via setup_clear_cpu_cap(X86_FEATURE_LA57). -d: The feature is disabled at boot-time. ----------------------------------------- +The feature is disabled at boot-time +------------------------------------ A feature can be disabled either using a command-line parameter or because it failed to be enabled. The command-line parameter clearcpuid= can be used to disable features using the feature number as defined in @@ -186,8 +192,9 @@ disable specific features. The list of parameters includes, but is not limited to, nofsgsbase, nosgx, noxsave, etc. 5-level paging can also be disabled using "no5lvl". -e: The feature was known to be non-functional. ----------------------------------------------- +The feature was known to be non-functional +------------------------------------------ + The feature was known to be non-functional because a dependency was missing at runtime. For example, AVX flags will not show up if XSAVE feature is disabled since they depend on XSAVE feature. Another example would be broken -- cgit From f0df00ebc57f803603f2a2e0df197e51f06fbe90 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 9 Apr 2025 06:58:37 -0700 Subject: x86/cpu: Avoid running off the end of an AMD erratum table The NULL array terminator at the end of erratum_1386_microcode was removed during the switch from x86_cpu_desc to x86_cpu_id. This causes readers to run off the end of the array. Replace the NULL. Fixes: f3f325152673 ("x86/cpu: Move AMD erratum 1386 table over to 'x86_cpu_id'") Reported-by: Jiri Slaby Signed-off-by: Dave Hansen --- arch/x86/kernel/cpu/amd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 79569f72b8ee..a839ff506f45 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -805,6 +805,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static const struct x86_cpu_id erratum_1386_microcode[] = { X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e), X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052), + {} }; static void fix_erratum_1386(struct cpuinfo_x86 *c) -- cgit From 1fac13956e9877483ece9d090a62239cdfe9deb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Apr 2025 21:16:01 +0200 Subject: x86/ibt: Fix hibernate Todd reported, and Len confirmed, that commit 582077c94052 ("x86/cfi: Clean up linkage") broke S4 hiberate on a fair number of machines. Turns out these machines trip #CP when trying to restore the image. As it happens, the commit in question removes two ENDBR instructions in the hibernate code, and clearly got it wrong. Notably restore_image() does an indirect jump to relocated_restore_code(), which is a relocated copy of core_restore_code(). In turn, core_restore_code(), will at the end do an indirect jump to restore_jump_address (r8), which is pointing at a relocated restore_registers(). So both sites do indeed need to be ENDBR. Fixes: 582077c94052 ("x86/cfi: Clean up linkage") Reported-by: Todd Brandt Signed-off-by: Peter Zijlstra (Intel) Tested-by: Todd Brandt Tested-by: Len Brown Link: https://bugzilla.kernel.org/show_bug.cgi?id=219998 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219998 --- arch/x86/power/hibernate_asm_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 8c534c36adfa..66f066b8feda 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -26,7 +26,7 @@ /* code below belongs to the image kernel */ .align PAGE_SIZE SYM_FUNC_START(restore_registers) - ANNOTATE_NOENDBR + ENDBR /* go back to the original page tables */ movq %r9, %cr3 @@ -120,7 +120,7 @@ SYM_FUNC_END(restore_image) /* code below has been relocated to a safe page */ SYM_FUNC_START(core_restore_code) - ANNOTATE_NOENDBR + ENDBR /* switch to temporary page tables */ movq %rax, %cr3 /* flush TLB */ -- cgit