summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c11
-rw-r--r--arch/x86/kernel/acpi/cppc.c14
-rw-r--r--arch/x86/kernel/acpi/cstate.c21
-rw-r--r--arch/x86/kernel/acpi/madt_playdead.S1
-rw-r--r--arch/x86/kernel/acpi/madt_wakeup.c73
-rw-r--r--arch/x86/kernel/acpi/sleep.c1
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S1
-rw-r--r--arch/x86/kernel/alternative.c1398
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/amd_nb.c19
-rw-r--r--arch/x86/kernel/amd_node.c151
-rw-r--r--arch/x86/kernel/aperture_64.c2
-rw-r--r--arch/x86/kernel/apic/Makefile3
-rw-r--r--arch/x86/kernel/apic/apic.c24
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c7
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c105
-rw-r--r--arch/x86/kernel/apic/io_apic.c4
-rw-r--r--arch/x86/kernel/apic/ipi.c33
-rw-r--r--arch/x86/kernel/apic/local.h13
-rw-r--r--arch/x86/kernel/apic/probe_32.c29
-rw-r--r--arch/x86/kernel/apic/vector.c235
-rw-r--r--arch/x86/kernel/asm-offsets.c13
-rw-r--r--arch/x86/kernel/asm-offsets_32.c9
-rw-r--r--arch/x86/kernel/asm-offsets_64.c6
-rw-r--r--arch/x86/kernel/bootflag.c29
-rw-r--r--arch/x86/kernel/callthunks.c22
-rw-r--r--arch/x86/kernel/cet.c3
-rw-r--r--arch/x86/kernel/cfi.c26
-rw-r--r--arch/x86/kernel/cpu/Makefile5
-rw-r--r--arch/x86/kernel/cpu/amd.c142
-rw-r--r--arch/x86/kernel/cpu/amd_cache_disable.c301
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c31
-rw-r--r--arch/x86/kernel/cpu/bugs.c1597
-rw-r--r--arch/x86/kernel/cpu/bus_lock.c54
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c1078
-rw-r--r--arch/x86/kernel/cpu/common.c468
-rw-r--r--arch/x86/kernel/cpu/cpu.h17
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c40
-rw-r--r--arch/x86/kernel/cpu/cpuid_0x2_table.c128
-rw-r--r--arch/x86/kernel/cpu/cyrix.c4
-rw-r--r--arch/x86/kernel/cpu/debugfs.c4
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c5
-rw-r--r--arch/x86/kernel/cpu/hygon.c23
-rw-r--r--arch/x86/kernel/cpu/intel.c347
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c12
-rw-r--r--arch/x86/kernel/cpu/match.c30
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c22
-rw-r--r--arch/x86/kernel/cpu/mce/core.c116
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c36
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c32
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h2
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c11
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c287
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_shas.c444
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c61
-rw-r--r--arch/x86/kernel/cpu/microcode/intel-ucode-defs.h150
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c53
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c19
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c6
-rw-r--r--arch/x86/kernel/cpu/proc.c7
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile7
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c219
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c610
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h509
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c878
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c1150
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h (renamed from arch/x86/kernel/cpu/resctrl/trace.h)26
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c4038
-rw-r--r--arch/x86/kernel/cpu/scattered.c3
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c10
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c37
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c5
-rw-r--r--arch/x86/kernel/cpu/topology.c3
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c7
-rw-r--r--arch/x86/kernel/cpu/tsx.c21
-rw-r--r--arch/x86/kernel/cpu/umwait.c6
-rw-r--r--arch/x86/kernel/cpu/vmware.c4
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c1
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/devicetree.c3
-rw-r--r--arch/x86/kernel/dumpstack.c16
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c2
-rw-r--r--arch/x86/kernel/e820.c138
-rw-r--r--arch/x86/kernel/early_printk.c107
-rw-r--r--arch/x86/kernel/fpu/context.h4
-rw-r--r--arch/x86/kernel/fpu/core.c137
-rw-r--r--arch/x86/kernel/fpu/init.c21
-rw-r--r--arch/x86/kernel/fpu/internal.h2
-rw-r--r--arch/x86/kernel/fpu/regset.c22
-rw-r--r--arch/x86/kernel/fpu/signal.c38
-rw-r--r--arch/x86/kernel/fpu/xstate.c205
-rw-r--r--arch/x86/kernel/fpu/xstate.h62
-rw-r--r--arch/x86/kernel/fred.c21
-rw-r--r--arch/x86/kernel/ftrace.c50
-rw-r--r--arch/x86/kernel/ftrace_64.S5
-rw-r--r--arch/x86/kernel/head32.c4
-rw-r--r--arch/x86/kernel/head64.c286
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/head_64.S34
-rw-r--r--arch/x86/kernel/hpet.c5
-rw-r--r--arch/x86/kernel/i8253.c3
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/ioport.c2
-rw-r--r--arch/x86/kernel/irq.c7
-rw-r--r--arch/x86/kernel/irq_32.c51
-rw-r--r--arch/x86/kernel/irq_64.c8
-rw-r--r--arch/x86/kernel/irqflags.S1
-rw-r--r--arch/x86/kernel/jailhouse.c2
-rw-r--r--arch/x86/kernel/jump_label.c6
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c15
-rw-r--r--arch/x86/kernel/kprobes/opt.c6
-rw-r--r--arch/x86/kernel/kvm.c34
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c4
-rw-r--r--arch/x86/kernel/machine_kexec_64.c48
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c8
-rw-r--r--arch/x86/kernel/module.c89
-rw-r--r--arch/x86/kernel/nmi.c129
-rw-r--r--arch/x86/kernel/nmi_selftest.c52
-rw-r--r--arch/x86/kernel/paravirt.c61
-rw-r--r--arch/x86/kernel/process.c53
-rw-r--r--arch/x86/kernel/process_32.c18
-rw-r--r--arch/x86/kernel/process_64.c59
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c12
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c2
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S6
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S273
-rw-r--r--arch/x86/kernel/setup.c224
-rw-r--r--arch/x86/kernel/setup_percpu.c15
-rw-r--r--arch/x86/kernel/shstk.c18
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/signal_32.c62
-rw-r--r--arch/x86/kernel/smpboot.c144
-rw-r--r--arch/x86/kernel/static_call.c8
-rw-r--r--arch/x86/kernel/tboot.c3
-rw-r--r--arch/x86/kernel/trace_clock.c2
-rw-r--r--arch/x86/kernel/tracepoint.c21
-rw-r--r--arch/x86/kernel/traps.c169
-rw-r--r--arch/x86/kernel/tsc.c9
-rw-r--r--arch/x86/kernel/tsc_msr.c2
-rw-r--r--arch/x86/kernel/tsc_sync.c15
-rw-r--r--arch/x86/kernel/unwind_orc.c4
-rw-r--r--arch/x86/kernel/uprobes.c19
-rw-r--r--arch/x86/kernel/verify_cpu.S4
-rw-r--r--arch/x86/kernel/vmlinux.lds.S67
152 files changed, 6573 insertions, 11689 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b43eb7e384eb..99a783fd4691 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,6 +44,8 @@ KCOV_INSTRUMENT_unwind_orc.o := n
KCOV_INSTRUMENT_unwind_frame.o := n
KCOV_INSTRUMENT_unwind_guess.o := n
+CFLAGS_head32.o := -fno-stack-protector
+CFLAGS_head64.o := -fno-stack-protector
CFLAGS_irq.o := -I $(src)/../include/asm/trace
obj-y += head_$(BITS).o
@@ -139,7 +141,6 @@ obj-$(CONFIG_OF) += devicetree.o
obj-$(CONFIG_UPROBES) += uprobes.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
-obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
obj-$(CONFIG_X86_UMIP) += umip.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dae6a73be40e..9fa321a95eb3 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -23,6 +23,8 @@
#include <linux/serial_core.h>
#include <linux/pgtable.h>
+#include <xen/xen.h>
+
#include <asm/e820/api.h>
#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
@@ -1729,6 +1731,15 @@ int __init acpi_mps_check(void)
{
#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
/* mptable code is not built-in*/
+
+ /*
+ * Xen disables ACPI in PV DomU guests but it still emulates APIC and
+ * supports SMP. Returning early here ensures that APIC is not disabled
+ * unnecessarily and the guest is not limited to a single vCPU.
+ */
+ if (xen_pv_domain() && !xen_initial_domain())
+ return 0;
+
if (acpi_disabled || acpi_noirq) {
pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n");
return 1;
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index d745dd586303..7047124490f6 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -4,6 +4,8 @@
* Copyright (c) 2016, Intel Corporation.
*/
+#include <linux/bitfield.h>
+
#include <acpi/cppc_acpi.h>
#include <asm/msr.h>
#include <asm/processor.h>
@@ -47,7 +49,7 @@ int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
{
int err;
- err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
+ err = rdmsrq_safe_on_cpu(cpunum, reg->address, val);
if (!err) {
u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
reg->bit_offset);
@@ -63,7 +65,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
u64 rd_val;
int err;
- err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val);
+ err = rdmsrq_safe_on_cpu(cpunum, reg->address, &rd_val);
if (!err) {
u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
reg->bit_offset);
@@ -72,7 +74,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
val &= mask;
rd_val &= ~mask;
rd_val |= val;
- err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val);
+ err = wrmsrq_safe_on_cpu(cpunum, reg->address, rd_val);
}
return err;
}
@@ -145,11 +147,11 @@ int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
int ret;
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ ret = rdmsrq_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
if (ret)
goto out;
- val = AMD_CPPC_HIGHEST_PERF(val);
+ val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val);
} else {
ret = cppc_get_highest_perf(cpu, &val);
if (ret)
@@ -270,7 +272,7 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
}
/* detect if running on heterogeneous design */
- if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) {
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) {
switch (core_type) {
case TOPO_CPU_TYPE_UNKNOWN:
pr_warn("Undefined core type found for cpu %d\n", cpu);
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 5854f0b8f0f1..8698d66563ed 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,9 +13,11 @@
#include <linux/sched.h>
#include <acpi/processor.h>
-#include <asm/cpuid.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpuid/api.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>
+#include <asm/smp.h>
/*
* Initialize bm_flags based on the CPU cache properties
@@ -47,12 +49,11 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
/*
* On all recent Intel platforms, ARB_DISABLE is a nop.
* So, set bm_control to zero to indicate that ARB_DISABLE
- * is not required while entering C3 type state on
- * P4, Core and beyond CPUs
+ * is not required while entering C3 type state.
*/
if (c->x86_vendor == X86_VENDOR_INTEL &&
- (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
- flags->bm_control = 0;
+ (c->x86 > 15 || (c->x86_vfm >= INTEL_CORE2_MEROM && c->x86_vfm <= INTEL_FAM6_LAST)))
+ flags->bm_control = 0;
if (c->x86_vendor == X86_VENDOR_CENTAUR) {
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f &&
@@ -205,6 +206,16 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
+void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cstate_entry *percpu_entry;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ mwait_play_dead(percpu_entry->states[cx->index].eax);
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead);
+
void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
index 4e498d28cdc8..aefb9cb583ad 100644
--- a/arch/x86/kernel/acpi/madt_playdead.S
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -14,6 +14,7 @@
* rsi: PGD of the identity mapping
*/
SYM_FUNC_START(asm_acpi_mp_play_dead)
+ ANNOTATE_NOENDBR
/* Turn off global entries. Following CR3 write will flush them. */
movq %cr4, %rdx
andq $~(X86_CR4_PGE), %rdx
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index d5ef6215583b..f36f28405dcc 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -70,58 +70,6 @@ static void __init free_pgt_page(void *pgt, void *dummy)
return memblock_free(pgt, PAGE_SIZE);
}
-/*
- * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
- * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
- * to the identity mapping and the function has be present at the same spot in
- * the virtual address space before and after switching page tables.
- */
-static int __init init_transition_pgtable(pgd_t *pgd)
-{
- pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
- unsigned long vaddr, paddr;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- vaddr = (unsigned long)asm_acpi_mp_play_dead;
- pgd += pgd_index(vaddr);
- if (!pgd_present(*pgd)) {
- p4d = (p4d_t *)alloc_pgt_page(NULL);
- if (!p4d)
- return -ENOMEM;
- set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
- }
- p4d = p4d_offset(pgd, vaddr);
- if (!p4d_present(*p4d)) {
- pud = (pud_t *)alloc_pgt_page(NULL);
- if (!pud)
- return -ENOMEM;
- set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
- }
- pud = pud_offset(p4d, vaddr);
- if (!pud_present(*pud)) {
- pmd = (pmd_t *)alloc_pgt_page(NULL);
- if (!pmd)
- return -ENOMEM;
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- }
- pmd = pmd_offset(pud, vaddr);
- if (!pmd_present(*pmd)) {
- pte = (pte_t *)alloc_pgt_page(NULL);
- if (!pte)
- return -ENOMEM;
- set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
- }
- pte = pte_offset_kernel(pmd, vaddr);
-
- paddr = __pa(vaddr);
- set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
-
- return 0;
-}
-
static int __init acpi_mp_setup_reset(u64 reset_vector)
{
struct x86_mapping_info info = {
@@ -130,6 +78,7 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
.page_flag = __PAGE_KERNEL_LARGE_EXEC,
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
+ unsigned long mstart, mend;
pgd_t *pgd;
pgd = alloc_pgt_page(NULL);
@@ -137,8 +86,6 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
return -ENOMEM;
for (int i = 0; i < nr_pfn_mapped; i++) {
- unsigned long mstart, mend;
-
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
@@ -147,14 +94,24 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
}
}
- if (kernel_ident_mapping_init(&info, pgd,
- PAGE_ALIGN_DOWN(reset_vector),
- PAGE_ALIGN(reset_vector + 1))) {
+ mstart = PAGE_ALIGN_DOWN(reset_vector);
+ mend = mstart + PAGE_SIZE;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
- if (init_transition_pgtable(pgd)) {
+ /*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping
+ * at the same place as in the kernel page tables.
+ * asm_acpi_mp_play_dead() switches to the identity mapping and the
+ * function must be present at the same spot in the virtual address space
+ * before and after switching page tables.
+ */
+ info.offset = __START_KERNEL_map - phys_base;
+ mstart = PAGE_ALIGN_DOWN(__pa(asm_acpi_mp_play_dead));
+ mend = mstart + PAGE_SIZE;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 6dfecb27b846..91fa262f0e30 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -16,6 +16,7 @@
#include <asm/cacheflush.h>
#include <asm/realmode.h>
#include <asm/hypervisor.h>
+#include <asm/msr.h>
#include <asm/smp.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index b200a193beeb..04f561f75e99 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -17,6 +17,7 @@
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
SYM_FUNC_START(wakeup_long64)
+ ANNOTATE_NOENDBR
movq saved_magic(%rip), %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c71b575bf229..ecfe7b497cad 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,36 +1,17 @@
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt
-#include <linux/module.h>
-#include <linux/sched.h>
+#include <linux/mmu_context.h>
#include <linux/perf_event.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/stringify.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
-#include <linux/stop_machine.h>
-#include <linux/slab.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-#include <linux/mmu_context.h>
-#include <linux/bsearch.h>
-#include <linux/sync_core.h>
+#include <linux/execmem.h>
+
#include <asm/text-patching.h>
-#include <asm/alternative.h>
-#include <asm/sections.h>
-#include <asm/mce.h>
-#include <asm/nmi.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
#include <asm/insn.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/paravirt.h>
-#include <asm/asm-prototypes.h>
-#include <asm/cfi.h>
+#include <asm/ibt.h>
+#include <asm/set_memory.h>
+#include <asm/nmi.h>
int __read_mostly alternatives_patched;
@@ -124,6 +105,171 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
#endif
};
+#ifdef CONFIG_FINEIBT
+static bool cfi_paranoid __ro_after_init;
+#endif
+
+#ifdef CONFIG_MITIGATION_ITS
+
+#ifdef CONFIG_MODULES
+static struct module *its_mod;
+#endif
+static void *its_page;
+static unsigned int its_offset;
+
+/* Initialize a thunk with the "jmp *reg; int3" instructions. */
+static void *its_init_thunk(void *thunk, int reg)
+{
+ u8 *bytes = thunk;
+ int offset = 0;
+ int i = 0;
+
+#ifdef CONFIG_FINEIBT
+ if (cfi_paranoid) {
+ /*
+ * When ITS uses indirect branch thunk the fineibt_paranoid
+ * caller sequence doesn't fit in the caller site. So put the
+ * remaining part of the sequence (<ea> + JNE) into the ITS
+ * thunk.
+ */
+ bytes[i++] = 0xea; /* invalid instruction */
+ bytes[i++] = 0x75; /* JNE */
+ bytes[i++] = 0xfd;
+
+ offset = 1;
+ }
+#endif
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+ bytes[i++] = 0xff;
+ bytes[i++] = 0xe0 + reg; /* jmp *reg */
+ bytes[i++] = 0xcc;
+
+ return thunk + offset;
+}
+
+#ifdef CONFIG_MODULES
+void its_init_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ mutex_lock(&text_mutex);
+ its_mod = mod;
+ its_page = NULL;
+}
+
+void its_fini_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ WARN_ON_ONCE(its_mod != mod);
+
+ its_mod = NULL;
+ its_page = NULL;
+ mutex_unlock(&text_mutex);
+
+ for (int i = 0; i < mod->its_num_pages; i++) {
+ void *page = mod->its_page_array[i];
+ execmem_restore_rox(page, PAGE_SIZE);
+ }
+}
+
+void its_free_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ for (int i = 0; i < mod->its_num_pages; i++) {
+ void *page = mod->its_page_array[i];
+ execmem_free(page);
+ }
+ kfree(mod->its_page_array);
+}
+#endif /* CONFIG_MODULES */
+
+static void *its_alloc(void)
+{
+ void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+
+ if (!page)
+ return NULL;
+
+#ifdef CONFIG_MODULES
+ if (its_mod) {
+ void *tmp = krealloc(its_mod->its_page_array,
+ (its_mod->its_num_pages+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ its_mod->its_page_array = tmp;
+ its_mod->its_page_array[its_mod->its_num_pages++] = page;
+
+ execmem_make_temp_rw(page, PAGE_SIZE);
+ }
+#endif /* CONFIG_MODULES */
+
+ return no_free_ptr(page);
+}
+
+static void *its_allocate_thunk(int reg)
+{
+ int size = 3 + (reg / 8);
+ void *thunk;
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * The ITS thunk contains an indirect jump and an int3 instruction so
+ * its size is 3 or 4 bytes depending on the register used. If CFI
+ * paranoid is used then 3 extra bytes are added in the ITS thunk to
+ * complete the fineibt_paranoid caller sequence.
+ */
+ if (cfi_paranoid)
+ size += 3;
+#endif
+
+ if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
+ its_page = its_alloc();
+ if (!its_page) {
+ pr_err("ITS page allocation failed\n");
+ return NULL;
+ }
+ memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
+ its_offset = 32;
+ }
+
+ /*
+ * If the indirect branch instruction will be in the lower half
+ * of a cacheline, then update the offset to reach the upper half.
+ */
+ if ((its_offset + size - 1) % 64 < 32)
+ its_offset = ((its_offset - 1) | 0x3F) + 33;
+
+ thunk = its_page + its_offset;
+ its_offset += size;
+
+ return its_init_thunk(thunk, reg);
+}
+
+u8 *its_static_thunk(int reg)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+
+#ifdef CONFIG_FINEIBT
+ /* Paranoid thunk starts 2 bytes before */
+ if (cfi_paranoid)
+ return thunk - 2;
+#endif
+ return thunk;
+}
+
+#endif
+
/*
* Nomenclature for variable names to simplify and clarify this code and ease
* any potential staring at it:
@@ -171,13 +317,6 @@ static void add_nop(u8 *buf, unsigned int len)
*buf = INT3_INSN_OPCODE;
}
-extern s32 __retpoline_sites[], __retpoline_sites_end[];
-extern s32 __return_sites[], __return_sites_end[];
-extern s32 __cfi_sites[], __cfi_sites_end[];
-extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
-extern s32 __smp_locks[], __smp_locks_end[];
-void text_poke_early(void *addr, const void *opcode, size_t len);
-
/*
* Matches NOP and NOPL, not any of the other possible NOPs.
*/
@@ -369,7 +508,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen,
}
}
-void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
__apply_relocation(buf, instr, instrlen, repl, repl_len);
optimize_nops(instr, buf, instrlen);
@@ -392,10 +531,8 @@ EXPORT_SYMBOL(BUG_func);
* Rewrite the "call BUG_func" replacement to point to the target of the
* indirect pv_ops call "call *disp(%ip)".
*/
-static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a,
- struct module *mod)
+static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
- u8 *wr_instr = module_writable_address(mod, instr);
void *target, *bug = &BUG_func;
s32 disp;
@@ -405,14 +542,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a,
}
if (a->instrlen != 6 ||
- wr_instr[0] != CALL_RIP_REL_OPCODE ||
- wr_instr[1] != CALL_RIP_REL_MODRM) {
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
BUG();
}
/* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
- disp = *(s32 *)(wr_instr + 2);
+ disp = *(s32 *)(instr + 2);
#ifdef CONFIG_X86_64
/* ff 15 00 00 00 00 call *0x0(%rip) */
/* target address is stored at "next instruction + disp". */
@@ -450,8 +587,7 @@ static inline u8 * instr_va(struct alt_instr *i)
* to refetch changed I$ lines.
*/
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
- struct alt_instr *end,
- struct module *mod)
+ struct alt_instr *end)
{
u8 insn_buff[MAX_PATCH_LEN];
u8 *instr, *replacement;
@@ -460,7 +596,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
DPRINTK(ALT, "alt table %px, -> %px", start, end);
/*
- * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
+ * KASAN_SHADOW_START is defined using
* cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
* During the process, KASAN becomes confused seeing partial LA57
* conversion and triggers a false-positive out-of-bound report.
@@ -480,7 +616,6 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
- u8 *wr_instr, *wr_replacement;
/*
* In case of nested ALTERNATIVE()s the outer alternative might
@@ -494,11 +629,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
}
instr = instr_va(a);
- wr_instr = module_writable_address(mod, instr);
-
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- wr_replacement = module_writable_address(mod, replacement);
-
BUG_ON(a->instrlen > sizeof(insn_buff));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
@@ -509,9 +640,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- memcpy(insn_buff, wr_instr, a->instrlen);
+ memcpy(insn_buff, instr, a->instrlen);
optimize_nops(instr, insn_buff, a->instrlen);
- text_poke_early(wr_instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
}
@@ -521,12 +652,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
instr, instr, a->instrlen,
replacement, a->replacementlen, a->flags);
- memcpy(insn_buff, wr_replacement, a->replacementlen);
+ memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
if (a->flags & ALT_FLAG_DIRECT_CALL) {
- insn_buff_sz = alt_replace_call(instr, insn_buff, a,
- mod);
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
if (insn_buff_sz < 0)
continue;
}
@@ -534,13 +664,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
- apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
+ text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
- DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
- text_poke_early(wr_instr, insn_buff, insn_buff_sz);
+ text_poke_early(instr, insn_buff, insn_buff_sz);
}
kasan_enable_current();
@@ -590,7 +720,8 @@ static int emit_indirect(int op, int reg, u8 *bytes)
return i;
}
-static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
+ void *call_dest, void *jmp_dest)
{
u8 op = insn->opcode.bytes[0];
int i = 0;
@@ -611,7 +742,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8
switch (op) {
case CALL_INSN_OPCODE:
__text_gen_insn(bytes+i, op, addr+i,
- __x86_indirect_call_thunk_array[reg],
+ call_dest,
CALL_INSN_SIZE);
i += CALL_INSN_SIZE;
break;
@@ -619,7 +750,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8
case JMP32_INSN_OPCODE:
clang_jcc:
__text_gen_insn(bytes+i, op, addr+i,
- __x86_indirect_jump_thunk_array[reg],
+ jmp_dest,
JMP32_INSN_SIZE);
i += JMP32_INSN_SIZE;
break;
@@ -634,6 +765,48 @@ clang_jcc:
return i;
}
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ return __emit_trampoline(addr, insn, bytes,
+ __x86_indirect_call_thunk_array[reg],
+ __x86_indirect_jump_thunk_array[reg]);
+}
+
+#ifdef CONFIG_MITIGATION_ITS
+static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+ u8 *tmp = its_allocate_thunk(reg);
+
+ if (tmp)
+ thunk = tmp;
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+/* Check if an indirect branch is at ITS-unsafe address */
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return false;
+
+ /* Indirect branch opcode is 2 or 3 bytes depending on reg */
+ addr += 1 + reg / 8;
+
+ /* Lower-half of the cacheline? */
+ return !(addr & 0x20);
+}
+#else /* CONFIG_MITIGATION_ITS */
+
+#ifdef CONFIG_FINEIBT
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ return false;
+}
+#endif
+
+#endif /* CONFIG_MITIGATION_ITS */
+
/*
* Rewrite the compiler generated retpoline thunk calls.
*
@@ -708,6 +881,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
bytes[i++] = 0xe8; /* LFENCE */
}
+#ifdef CONFIG_MITIGATION_ITS
+ /*
+ * Check if the address of last byte of emitted-indirect is in
+ * lower-half of the cacheline. Such branches need ITS mitigation.
+ */
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
+ return emit_its_trampoline(addr, insn, reg, bytes);
+#endif
+
ret = emit_indirect(op, reg, bytes + i);
if (ret < 0)
return ret;
@@ -731,20 +913,19 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
/*
* Generated by 'objtool --retpoline'.
*/
-void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
- struct module *mod)
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
struct insn insn;
int len, ret;
u8 bytes[16];
u8 op1, op2;
+ u8 *dest;
- ret = insn_decode_kernel(&insn, wr_addr);
+ ret = insn_decode_kernel(&insn, addr);
if (WARN_ON_ONCE(ret < 0))
continue;
@@ -752,8 +933,19 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
op2 = insn.opcode.bytes[1];
switch (op1) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ /* See cfi_paranoid. */
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+
case CALL_INSN_OPCODE:
case JMP32_INSN_OPCODE:
+ /* Check for cfi_paranoid + ITS */
+ dest = addr + insn.length + insn.immediate.value;
+ if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) {
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+ }
break;
case 0x0f: /* escape */
@@ -772,15 +964,30 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
optimize_nops(addr, bytes, len);
- DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
- text_poke_early(wr_addr, bytes, len);
+ text_poke_early(addr, bytes, len);
}
}
}
#ifdef CONFIG_MITIGATION_RETHUNK
+bool cpu_wants_rethunk(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_RETHUNK);
+}
+
+bool cpu_wants_rethunk_at(void *addr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ return false;
+ if (x86_return_thunk != its_return_thunk)
+ return true;
+
+ return !((unsigned long)addr & 0x20);
+}
+
/*
* Rewrite the compiler generated return thunk tail-calls.
*
@@ -797,7 +1004,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
int i = 0;
/* Patch the custom return thunks... */
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+ if (cpu_wants_rethunk_at(addr)) {
i = JMP32_INSN_SIZE;
__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
} else {
@@ -810,23 +1017,21 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
return i;
}
-void __init_or_module noinline apply_returns(s32 *start, s32 *end,
- struct module *mod)
+void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk())
static_call_force_reinit();
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
struct insn insn;
int len, ret;
u8 bytes[16];
u8 op;
- ret = insn_decode_kernel(&insn, wr_addr);
+ ret = insn_decode_kernel(&insn, addr);
if (WARN_ON_ONCE(ret < 0))
continue;
@@ -846,41 +1051,59 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end,
len = patch_return(addr, &insn, bytes);
if (len == insn.length) {
- DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
- text_poke_early(wr_addr, bytes, len);
+ text_poke_early(addr, bytes, len);
}
}
}
-#else
-void __init_or_module noinline apply_returns(s32 *start, s32 *end,
- struct module *mod) { }
-#endif /* CONFIG_MITIGATION_RETHUNK */
+#else /* !CONFIG_MITIGATION_RETHUNK: */
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+#endif /* !CONFIG_MITIGATION_RETHUNK */
#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
-void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
- struct module *mod) { }
-void __init_or_module noinline apply_returns(s32 *start, s32 *end,
- struct module *mod) { }
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
-#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
+#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr, void *wr_addr);
+__noendbr bool is_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return __is_endbr(endbr);
-static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
+Efault:
+ return false;
+}
+
+#ifdef CONFIG_FINEIBT
+
+static __noendbr bool exact_endbr(u32 *val)
{
- u32 endbr, poison = gen_endbr_poison();
+ u32 endbr;
- if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr)))
- return;
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return endbr == gen_endbr();
- if (!is_endbr(endbr)) {
- WARN_ON_ONCE(warn);
+Efault:
+ return false;
+}
+
+#endif
+
+static void poison_cfi(void *addr);
+
+static void __init_or_module poison_endbr(void *addr)
+{
+ u32 poison = gen_endbr_poison();
+
+ if (WARN_ON_ONCE(!is_endbr(addr)))
return;
- }
DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
@@ -889,7 +1112,7 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
*/
DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
- text_poke_early(wr_addr, &poison, 4);
+ text_poke_early(addr, &poison, 4);
}
/*
@@ -898,36 +1121,39 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
* Seal the functions for indirect calls by clobbering the ENDBR instructions
* and the kCFI hash value.
*/
-void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
- poison_endbr(addr, wr_addr, true);
+ poison_endbr(addr);
if (IS_ENABLED(CONFIG_FINEIBT))
- poison_cfi(addr - 16, wr_addr - 16);
+ poison_cfi(addr - 16);
}
}
-#else
+#else /* !CONFIG_X86_KERNEL_IBT: */
-void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { }
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
-#endif /* CONFIG_X86_KERNEL_IBT */
+#endif /* !CONFIG_X86_KERNEL_IBT */
#ifdef CONFIG_CFI_AUTO_DEFAULT
-#define __CFI_DEFAULT CFI_AUTO
+# define __CFI_DEFAULT CFI_AUTO
#elif defined(CONFIG_CFI_CLANG)
-#define __CFI_DEFAULT CFI_KCFI
+# define __CFI_DEFAULT CFI_KCFI
#else
-#define __CFI_DEFAULT CFI_OFF
+# define __CFI_DEFAULT CFI_OFF
#endif
enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+#ifdef CONFIG_FINEIBT_BHI
+bool cfi_bhi __ro_after_init = false;
+#endif
+
#ifdef CONFIG_CFI_CLANG
struct bpf_insn;
@@ -935,11 +1161,7 @@ struct bpf_insn;
extern unsigned int __bpf_prog_runX(const void *ctx,
const struct bpf_insn *insn);
-/*
- * Force a reference to the external symbol so the compiler generates
- * __kcfi_typid.
- */
-__ADDRESSABLE(__bpf_prog_runX);
+KCFI_REFERENCE(__bpf_prog_runX);
/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
asm (
@@ -956,7 +1178,7 @@ asm (
/* Must match bpf_callback_t */
extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
-__ADDRESSABLE(__bpf_callback_fn);
+KCFI_REFERENCE(__bpf_callback_fn);
/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
asm (
@@ -991,6 +1213,21 @@ u32 cfi_get_func_hash(void *func)
return hash;
}
+
+int cfi_get_func_arity(void *func)
+{
+ bhi_thunk *target;
+ s32 disp;
+
+ if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
+ return 0;
+
+ if (get_kernel_nofault(disp, func - 4))
+ return 0;
+
+ target = func + disp;
+ return target - __bhi_args;
+}
#endif
#ifdef CONFIG_FINEIBT
@@ -1005,7 +1242,7 @@ static u32 cfi_seed __ro_after_init;
static u32 cfi_rehash(u32 hash)
{
hash ^= cfi_seed;
- while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
+ while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
bool lsb = hash & 1;
hash >>= 1;
if (lsb)
@@ -1037,6 +1274,25 @@ static __init int cfi_parse_cmdline(char *str)
cfi_mode = CFI_FINEIBT;
} else if (!strcmp(str, "norand")) {
cfi_rand = false;
+ } else if (!strcmp(str, "warn")) {
+ pr_alert("CFI mismatch non-fatal!\n");
+ cfi_warn = true;
+ } else if (!strcmp(str, "paranoid")) {
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_paranoid = true;
+ } else {
+ pr_err("Ignoring paranoid; depends on fineibt.\n");
+ }
+ } else if (!strcmp(str, "bhi")) {
+#ifdef CONFIG_FINEIBT_BHI
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_bhi = true;
+ } else {
+ pr_err("Ignoring bhi; depends on fineibt.\n");
+ }
+#else
+ pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n");
+#endif
} else {
pr_err("Ignoring unknown cfi option (%s).", str);
}
@@ -1054,9 +1310,9 @@ early_param("cfi", cfi_parse_cmdline);
* __cfi_\func: __cfi_\func:
* movl $0x12345678,%eax // 5 endbr64 // 4
* nop subl $0x12345678,%r10d // 7
- * nop jz 1f // 2
- * nop ud2 // 2
- * nop 1: nop // 1
+ * nop jne __cfi_\func+6 // 2
+ * nop nop3 // 3
+ * nop
* nop
* nop
* nop
@@ -1068,34 +1324,53 @@ early_param("cfi", cfi_parse_cmdline);
*
* caller: caller:
* movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
- * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
+ * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4
* je 1f // 2 nop4 // 4
* ud2 // 2
- * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
+ * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6
*
*/
-asm( ".pushsection .rodata \n"
- "fineibt_preamble_start: \n"
- " endbr64 \n"
- " subl $0x12345678, %r10d \n"
- " je fineibt_preamble_end \n"
- " ud2 \n"
- " nop \n"
- "fineibt_preamble_end: \n"
+/*
+ * <fineibt_preamble_start>:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d
+ * b: 75 f9 jne 6 <fineibt_preamble_start+0x6>
+ * d: 0f 1f 00 nopl (%rax)
+ *
+ * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as
+ * (bad) on x86_64 and raises #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %r10d \n"
+ "fineibt_preamble_bhi: \n"
+ " jne fineibt_preamble_start+6 \n"
+ ASM_NOP3
+ "fineibt_preamble_end: \n"
".popsection\n"
);
extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_bhi[];
extern u8 fineibt_preamble_end[];
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
+#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
+#define fineibt_preamble_ud 6
#define fineibt_preamble_hash 7
+/*
+ * <fineibt_caller_start>:
+ * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
+ * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * a: 0f 1f 40 00 nopl 0x0(%rax)
+ */
asm( ".pushsection .rodata \n"
"fineibt_caller_start: \n"
" movl $0x12345678, %r10d \n"
- " sub $16, %r11 \n"
+ " lea -0x10(%r11), %r11 \n"
ASM_NOP4
"fineibt_caller_end: \n"
".popsection \n"
@@ -1109,13 +1384,62 @@ extern u8 fineibt_caller_end[];
#define fineibt_caller_jmp (fineibt_caller_size - 2)
-static u32 decode_preamble_hash(void *addr)
+/*
+ * Since FineIBT does hash validation on the callee side it is prone to
+ * circumvention attacks where a 'naked' ENDBR instruction exists that
+ * is not part of the fineibt_preamble sequence.
+ *
+ * Notably the x86 entry points must be ENDBR and equally cannot be
+ * fineibt_preamble.
+ *
+ * The fineibt_paranoid caller sequence adds additional caller side
+ * hash validation. This stops such circumvention attacks dead, but at the cost
+ * of adding a load.
+ *
+ * <fineibt_paranoid_start>:
+ * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
+ * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
+ * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11
+ * e: 75 fd jne d <fineibt_paranoid_start+0xd>
+ * 10: 41 ff d3 call *%r11
+ * 13: 90 nop
+ *
+ * Notably LEA does not modify flags and can be reordered with the CMP,
+ * avoiding a dependency. Again, using a non-taken (backwards) branch
+ * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the
+ * Jcc.d8, causing #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_paranoid_start: \n"
+ " movl $0x12345678, %r10d \n"
+ " cmpl -9(%r11), %r10d \n"
+ " lea -0x10(%r11), %r11 \n"
+ " jne fineibt_paranoid_start+0xd \n"
+ "fineibt_paranoid_ind: \n"
+ " call *%r11 \n"
+ " nop \n"
+ "fineibt_paranoid_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_paranoid_start[];
+extern u8 fineibt_paranoid_ind[];
+extern u8 fineibt_paranoid_end[];
+
+#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
+#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start)
+#define fineibt_paranoid_ud 0xd
+
+static u32 decode_preamble_hash(void *addr, int *reg)
{
u8 *p = addr;
- /* b8 78 56 34 12 mov $0x12345678,%eax */
- if (p[0] == 0xb8)
+ /* b8+reg 78 56 34 12 movl $0x12345678,\reg */
+ if (p[0] >= 0xb8 && p[0] < 0xc0) {
+ if (reg)
+ *reg = p[0] - 0xb8;
return *(u32 *)(addr + 1);
+ }
return 0; /* invalid hash value */
}
@@ -1124,11 +1448,11 @@ static u32 decode_caller_hash(void *addr)
{
u8 *p = addr;
- /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
+ /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */
if (p[0] == 0x41 && p[1] == 0xba)
return -*(u32 *)(addr + 2);
- /* e8 0c 78 56 34 12 jmp.d8 +12 */
+ /* e8 0c 88 a9 cb ed jmp.d8 +12 */
if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
return -*(u32 *)(addr + 2);
@@ -1136,7 +1460,7 @@ static u32 decode_caller_hash(void *addr)
}
/* .retpoline_sites */
-static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod)
+static int cfi_disable_callers(s32 *start, s32 *end)
{
/*
* Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
@@ -1148,23 +1472,20 @@ static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- wr_addr = module_writable_address(mod, addr);
- hash = decode_caller_hash(wr_addr);
-
+ hash = decode_caller_hash(addr);
if (!hash) /* nocfi callers */
continue;
- text_poke_early(wr_addr, jmp, 2);
+ text_poke_early(addr, jmp, 2);
}
return 0;
}
-static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod)
+static int cfi_enable_callers(s32 *start, s32 *end)
{
/*
* Re-enable kCFI, undo what cfi_disable_callers() did.
@@ -1174,126 +1495,230 @@ static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- wr_addr = module_writable_address(mod, addr);
- hash = decode_caller_hash(wr_addr);
+ hash = decode_caller_hash(addr);
if (!hash) /* nocfi callers */
continue;
- text_poke_early(wr_addr, mov, 2);
+ text_poke_early(addr, mov, 2);
}
return 0;
}
/* .cfi_sites */
-static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod)
+static int cfi_rand_preamble(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
u32 hash;
- hash = decode_preamble_hash(wr_addr);
+ hash = decode_preamble_hash(addr, NULL);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
hash = cfi_rehash(hash);
- text_poke_early(wr_addr + 1, &hash, 4);
+ text_poke_early(addr + 1, &hash, 4);
}
return 0;
}
-static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod)
+static void cfi_fineibt_bhi_preamble(void *addr, int arity)
+{
+ if (!arity)
+ return;
+
+ if (!cfi_warn && arity == 1) {
+ /*
+ * Crazy scheme to allow arity-1 inline:
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d
+ * b: 49 0f 45 fa cmovne %r10, %rdi
+ * f: 75 f5 jne __cfi_foo+6
+ * 11: 0f 1f 00 nopl (%rax)
+ *
+ * Code that direct calls to foo()+0, decodes the tail end as:
+ *
+ * foo:
+ * 0: f5 cmc
+ * 1: 0f 1f 00 nopl (%rax)
+ *
+ * which clobbers CF, but does not affect anything ABI
+ * wise.
+ *
+ * Notably, this scheme is incompatible with permissive CFI
+ * because the CMOVcc is unconditional and RDI will have been
+ * clobbered.
+ */
+ const u8 magic[9] = {
+ 0x49, 0x0f, 0x45, 0xfa,
+ 0x75, 0xf5,
+ BYTES_NOP3,
+ };
+
+ text_poke_early(addr + fineibt_preamble_bhi, magic, 9);
+
+ return;
+ }
+
+ text_poke_early(addr + fineibt_preamble_bhi,
+ text_gen_insn(CALL_INSN_OPCODE,
+ addr + fineibt_preamble_bhi,
+ __bhi_args[arity]),
+ CALL_INSN_SIZE);
+}
+
+static int cfi_rewrite_preamble(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
+ int arity;
u32 hash;
- hash = decode_preamble_hash(wr_addr);
+ /*
+ * When the function doesn't start with ENDBR the compiler will
+ * have determined there are no indirect calls to it and we
+ * don't need no CFI either.
+ */
+ if (!is_endbr(addr + 16))
+ continue;
+
+ hash = decode_preamble_hash(addr, &arity);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
- text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size);
- WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678);
- text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4);
+ text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+
+ WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
+ "kCFI preamble has wrong register at: %pS %*ph\n",
+ addr, 5, addr);
+
+ if (cfi_bhi)
+ cfi_fineibt_bhi_preamble(addr, arity);
}
return 0;
}
-static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod)
+static void cfi_rewrite_endbr(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr = module_writable_address(mod, addr);
- poison_endbr(addr + 16, wr_addr + 16, false);
+ if (!exact_endbr(addr + 16))
+ continue;
+
+ poison_endbr(addr + 16);
}
}
/* .retpoline_sites */
-static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod)
+static int cfi_rand_callers(s32 *start, s32 *end)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- wr_addr = module_writable_address(mod, addr);
- hash = decode_caller_hash(wr_addr);
+ hash = decode_caller_hash(addr);
if (hash) {
hash = -cfi_rehash(hash);
- text_poke_early(wr_addr + 2, &hash, 4);
+ text_poke_early(addr + 2, &hash, 4);
}
}
return 0;
}
-static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod)
+static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
+
+#ifdef CONFIG_MITIGATION_ITS
+ u8 *tmp = its_allocate_thunk(reg);
+ if (tmp)
+ thunk = tmp;
+#endif
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+static int cfi_rewrite_callers(s32 *start, s32 *end)
{
s32 *s;
+ BUG_ON(fineibt_paranoid_size != 20);
+
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
- void *wr_addr;
+ struct insn insn;
+ u8 bytes[20];
u32 hash;
+ int ret;
+ u8 op;
addr -= fineibt_caller_size;
- wr_addr = module_writable_address(mod, addr);
- hash = decode_caller_hash(wr_addr);
- if (hash) {
- text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size);
- WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678);
- text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4);
+ hash = decode_caller_hash(addr);
+ if (!hash)
+ continue;
+
+ if (!cfi_paranoid) {
+ text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ /* rely on apply_retpolines() */
+ continue;
+ }
+
+ /* cfi_paranoid */
+ ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op = insn.opcode.bytes[0];
+ if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
+ memcpy(bytes + fineibt_caller_hash, &hash, 4);
+
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
+ emit_paranoid_trampoline(addr + fineibt_caller_size,
+ &insn, 11, bytes + fineibt_caller_size);
+ } else {
+ ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind);
+ if (WARN_ON_ONCE(ret != 3))
+ continue;
}
- /* rely on apply_retpolines() */
+
+ text_poke_early(addr, bytes, fineibt_paranoid_size);
}
return 0;
}
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, struct module *mod)
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
{
- bool builtin = mod ? false : true;
int ret;
if (WARN_ONCE(fineibt_preamble_size != 16,
@@ -1302,8 +1727,15 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
if (cfi_mode == CFI_AUTO) {
cfi_mode = CFI_KCFI;
- if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
+ /*
+ * FRED has much saner context on exception entry and
+ * is less easy to take advantage of.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ cfi_paranoid = true;
cfi_mode = CFI_FINEIBT;
+ }
}
/*
@@ -1311,7 +1743,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
* rewrite them. This disables all CFI. If this succeeds but any of the
* later stages fails, we're without CFI.
*/
- ret = cfi_disable_callers(start_retpoline, end_retpoline, mod);
+ ret = cfi_disable_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
@@ -1322,11 +1754,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
}
- ret = cfi_rand_preamble(start_cfi, end_cfi, mod);
+ ret = cfi_rand_preamble(start_cfi, end_cfi);
if (ret)
goto err;
- ret = cfi_rand_callers(start_retpoline, end_retpoline, mod);
+ ret = cfi_rand_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
}
@@ -1338,7 +1770,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
return;
case CFI_KCFI:
- ret = cfi_enable_callers(start_retpoline, end_retpoline, mod);
+ ret = cfi_enable_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
@@ -1348,20 +1780,23 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
case CFI_FINEIBT:
/* place the FineIBT preamble at func()-16 */
- ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod);
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi);
if (ret)
goto err;
/* rewrite the callers to target func()-16 */
- ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod);
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
if (ret)
goto err;
/* now that nobody targets func()+0, remove ENDBR there */
- cfi_rewrite_endbr(start_cfi, end_cfi, mod);
+ cfi_rewrite_endbr(start_cfi, end_cfi);
- if (builtin)
- pr_info("Using FineIBT CFI\n");
+ if (builtin) {
+ pr_info("Using %sFineIBT%s CFI\n",
+ cfi_paranoid ? "paranoid " : "",
+ cfi_bhi ? "+BHI" : "");
+ }
return;
default:
@@ -1377,11 +1812,25 @@ static inline void poison_hash(void *addr)
*(u32 *)addr = 0;
}
-static void poison_cfi(void *addr, void *wr_addr)
+static void poison_cfi(void *addr)
{
+ /*
+ * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes,
+ * some (static) functions for which they can determine the address
+ * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
+ *
+ * As such, these functions will get sealed, but we need to be careful
+ * to not unconditionally scribble the previous function.
+ */
switch (cfi_mode) {
case CFI_FINEIBT:
/*
+ * FineIBT prefix should start with an ENDBR.
+ */
+ if (!is_endbr(addr))
+ break;
+
+ /*
* __cfi_\func:
* osp nopl (%rax)
* subl $0, %r10d
@@ -1389,17 +1838,23 @@ static void poison_cfi(void *addr, void *wr_addr)
* ud2
* 1: nop
*/
- poison_endbr(addr, wr_addr, false);
- poison_hash(wr_addr + fineibt_preamble_hash);
+ poison_endbr(addr);
+ poison_hash(addr + fineibt_preamble_hash);
break;
case CFI_KCFI:
/*
+ * kCFI prefix should start with a valid hash.
+ */
+ if (!decode_preamble_hash(addr, NULL))
+ break;
+
+ /*
* __cfi_\func:
* movl $0, %eax
* .skip 11, 0x90
*/
- poison_hash(wr_addr + 1);
+ poison_hash(addr + 1);
break;
default:
@@ -1407,24 +1862,172 @@ static void poison_cfi(void *addr, void *wr_addr)
}
}
-#else
+/*
+ * When regs->ip points to a 0xEA byte in the FineIBT preamble,
+ * return true and fill out target and type.
+ *
+ * We check the preamble by checking for the ENDBR instruction relative to the
+ * 0xEA instruction.
+ */
+static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_preamble_ud;
+ u32 hash;
+
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ *target = addr + fineibt_preamble_size;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->r10 + hash;
+
+ /*
+ * Since regs->ip points to the middle of an instruction; it cannot
+ * continue with the normal fixup.
+ */
+ regs->ip = *target;
+
+ return true;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to one of the UD2 in __bhi_args[].
+ */
+static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr;
+ u32 hash;
+
+ if (!cfi_bhi)
+ return false;
+
+ if (regs->ip < (unsigned long)__bhi_args ||
+ regs->ip >= (unsigned long)__bhi_args_end)
+ return false;
+
+ /*
+ * Fetch the return address from the stack, this points to the
+ * FineIBT preamble. Since the CALL instruction is in the 5 last
+ * bytes of the preamble, the return address is in fact the target
+ * address.
+ */
+ __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
+ *target = addr;
+
+ addr -= fineibt_preamble_size;
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->r10 + hash;
+
+ /*
+ * The UD2 sites are constructed with a RET immediately following,
+ * as such the non-fatal case can use the regular fixup.
+ */
+ return true;
+
+Efault:
+ return false;
+}
+
+static bool is_paranoid_thunk(unsigned long addr)
+{
+ u32 thunk;
+
+ __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
+ return (thunk & 0x00FFFFFF) == 0xfd75ea;
+
+Efault:
+ return false;
+}
+
+/*
+ * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
+ * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS
+ * thunk.
+ */
+static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_paranoid_ud;
+
+ if (!cfi_paranoid)
+ return false;
+
+ if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
+ *target = regs->r11 + fineibt_preamble_size;
+ *type = regs->r10;
+
+ /*
+ * Since the trapping instruction is the exact, but LOCK prefixed,
+ * Jcc.d8 that got us here, the normal fixup will work.
+ */
+ return true;
+ }
+
+ /*
+ * The cfi_paranoid + ITS thunk combination results in:
+ *
+ * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d
+ * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d
+ * a: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11
+ *
+ * Where the paranoid_thunk looks like:
+ *
+ * 1d: <ea> (bad)
+ * __x86_indirect_paranoid_thunk_r11:
+ * 1e: 75 fd jne 1d
+ * __x86_indirect_its_thunk_r11:
+ * 20: 41 ff eb jmp *%r11
+ * 23: cc int3
+ *
+ */
+ if (is_paranoid_thunk(regs->ip)) {
+ *target = regs->r11 + fineibt_preamble_size;
+ *type = regs->r10;
+
+ regs->ip = *target;
+ return true;
+ }
+
+ return false;
+}
+
+bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ if (decode_fineibt_paranoid(regs, target, type))
+ return true;
+
+ if (decode_fineibt_bhi(regs, target, type))
+ return true;
+
+ return decode_fineibt_preamble(regs, target, type);
+}
+
+#else /* !CONFIG_FINEIBT: */
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, struct module *mod)
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
{
}
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr, void *wr_addr) { }
+static void poison_cfi(void *addr) { }
#endif
-#endif
+#endif /* !CONFIG_FINEIBT */
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, struct module *mod)
+ s32 *start_cfi, s32 *end_cfi)
{
return __apply_fineibt(start_retpoline, end_retpoline,
- start_cfi, end_cfi, mod);
+ start_cfi, end_cfi,
+ /* .builtin = */ false);
}
#ifdef CONFIG_SMP
@@ -1673,7 +2276,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg)
static noinline void __init alt_reloc_selftest(void)
{
/*
- * Tests apply_relocation().
+ * Tests text_poke_apply_relocation().
*
* This has a relative immediate (CALL) in a place other than the first
* instruction and additionally on x86_64 we get a RIP-relative LEA:
@@ -1694,6 +2297,8 @@ static noinline void __init alt_reloc_selftest(void)
void __init alternative_instructions(void)
{
+ u64 ibt;
+
int3_selftest();
/*
@@ -1720,28 +2325,33 @@ void __init alternative_instructions(void)
*/
paravirt_set_cap();
+ /* Keep CET-IBT disabled until caller/callee are patched */
+ ibt = ibt_save(/*disable*/ true);
+
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
- __cfi_sites, __cfi_sites_end, NULL);
+ __cfi_sites, __cfi_sites_end, true);
/*
* Rewrite the retpolines, must be done before alternatives since
* those can rewrite the retpoline thunks.
*/
- apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL);
- apply_returns(__return_sites, __return_sites_end, NULL);
-
- apply_alternatives(__alt_instructions, __alt_instructions_end, NULL);
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+ apply_returns(__return_sites, __return_sites_end);
/*
- * Now all calls are established. Apply the call thunks if
- * required.
+ * Adjust all CALL instructions to point to func()-10, including
+ * those in .altinstr_replacement.
*/
callthunks_patch_builtin_calls();
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+
/*
* Seal all functions that do not have their address taken.
*/
- apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL);
+ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+
+ ibt_restore(ibt);
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
@@ -1803,76 +2413,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
}
}
-typedef struct {
- struct mm_struct *mm;
-} temp_mm_state_t;
-
-/*
- * Using a temporary mm allows to set temporary mappings that are not accessible
- * by other CPUs. Such mappings are needed to perform sensitive memory writes
- * that override the kernel memory protections (e.g., W^X), without exposing the
- * temporary page-table mappings that are required for these write operations to
- * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
- * mapping is torn down.
- *
- * Context: The temporary mm needs to be used exclusively by a single core. To
- * harden security IRQs must be disabled while the temporary mm is
- * loaded, thereby preventing interrupt handler bugs from overriding
- * the kernel memory protection.
- */
-static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
-{
- temp_mm_state_t temp_state;
-
- lockdep_assert_irqs_disabled();
-
- /*
- * Make sure not to be in TLB lazy mode, as otherwise we'll end up
- * with a stale address space WITHOUT being in lazy mode after
- * restoring the previous mm.
- */
- if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
- leave_mm();
-
- temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
- switch_mm_irqs_off(NULL, mm, current);
-
- /*
- * If breakpoints are enabled, disable them while the temporary mm is
- * used. Userspace might set up watchpoints on addresses that are used
- * in the temporary mm, which would lead to wrong signals being sent or
- * crashes.
- *
- * Note that breakpoints are not disabled selectively, which also causes
- * kernel breakpoints (e.g., perf's) to be disabled. This might be
- * undesirable, but still seems reasonable as the code that runs in the
- * temporary mm should be short.
- */
- if (hw_breakpoint_active())
- hw_breakpoint_disable();
-
- return temp_state;
-}
-
-__ro_after_init struct mm_struct *poking_mm;
-__ro_after_init unsigned long poking_addr;
-
-static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
-{
- lockdep_assert_irqs_disabled();
-
- switch_mm_irqs_off(NULL, prev_state.mm, current);
-
- /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
- cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
-
- /*
- * Restore the breakpoints if they were disabled before the temporary mm
- * was loaded.
- */
- if (hw_breakpoint_active())
- hw_breakpoint_restore();
-}
+__ro_after_init struct mm_struct *text_poke_mm;
+__ro_after_init unsigned long text_poke_mm_addr;
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
@@ -1892,7 +2434,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
- temp_mm_state_t prev;
+ struct mm_struct *prev_mm;
unsigned long flags;
pte_t pte, *ptep;
spinlock_t *ptl;
@@ -1929,7 +2471,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
/*
* The lock is not really needed, but this allows to avoid open-coding.
*/
- ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
/*
* This must not fail; preallocated in poking_init().
@@ -1939,21 +2481,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
local_irq_save(flags);
pte = mk_pte(pages[0], pgprot);
- set_pte_at(poking_mm, poking_addr, ptep, pte);
+ set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
if (cross_page_boundary) {
pte = mk_pte(pages[1], pgprot);
- set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+ set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
}
/*
* Loading the temporary mm behaves as a compiler barrier, which
* guarantees that the PTE will be set at the time memcpy() is done.
*/
- prev = use_temporary_mm(poking_mm);
+ prev_mm = use_temporary_mm(text_poke_mm);
kasan_disable_current();
- func((u8 *)poking_addr + offset_in_page(addr), src, len);
+ func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
kasan_enable_current();
/*
@@ -1962,22 +2504,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
*/
barrier();
- pte_clear(poking_mm, poking_addr, ptep);
+ pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
if (cross_page_boundary)
- pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
+ pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
/*
* Loading the previous page-table hierarchy requires a serializing
* instruction that already allows the core to see the updated version.
* Xen-PV is assumed to serialize execution in a similar manner.
*/
- unuse_temporary_mm(prev);
+ unuse_temporary_mm(prev_mm);
/*
* Flushing the TLB might involve IPIs, which would require enabled
* IRQs, but not if the mm is not used, as it is in this point.
*/
- flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
+ flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
PAGE_SHIFT, false);
@@ -2113,7 +2655,7 @@ static void do_sync_core(void *info)
sync_core();
}
-void text_poke_sync(void)
+void smp_text_poke_sync_each_cpu(void)
{
on_each_cpu(do_sync_core, NULL, 1);
}
@@ -2123,64 +2665,66 @@ void text_poke_sync(void)
* this thing. When len == 6 everything is prefixed with 0x0f and we map
* opcode to Jcc.d8, using len to distinguish.
*/
-struct text_poke_loc {
+struct smp_text_poke_loc {
/* addr := _stext + rel_addr */
s32 rel_addr;
s32 disp;
u8 len;
u8 opcode;
- const u8 text[POKE_MAX_OPCODE_SIZE];
- /* see text_poke_bp_batch() */
+ const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
+ /* see smp_text_poke_batch_finish() */
u8 old;
};
-struct bp_patching_desc {
- struct text_poke_loc *vec;
+#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
+
+static struct smp_text_poke_array {
+ struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
int nr_entries;
- atomic_t refs;
-};
+} text_poke_array;
-static struct bp_patching_desc bp_desc;
+static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
-static __always_inline
-struct bp_patching_desc *try_get_desc(void)
+/*
+ * These four __always_inline annotations imply noinstr, necessary
+ * due to smp_text_poke_int3_handler() being noinstr:
+ */
+
+static __always_inline bool try_get_text_poke_array(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
- if (!raw_atomic_inc_not_zero(&desc->refs))
- return NULL;
+ if (!raw_atomic_inc_not_zero(refs))
+ return false;
- return desc;
+ return true;
}
-static __always_inline void put_desc(void)
+static __always_inline void put_text_poke_array(void)
{
- struct bp_patching_desc *desc = &bp_desc;
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
smp_mb__before_atomic();
- raw_atomic_dec(&desc->refs);
+ raw_atomic_dec(refs);
}
-static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
+static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
{
- return _stext + tp->rel_addr;
+ return _stext + tpl->rel_addr;
}
-static __always_inline int patch_cmp(const void *key, const void *elt)
+static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
{
- struct text_poke_loc *tp = (struct text_poke_loc *) elt;
-
- if (key < text_poke_addr(tp))
+ if (tpl_a < text_poke_addr(tpl_b))
return -1;
- if (key > text_poke_addr(tp))
+ if (tpl_a > text_poke_addr(tpl_b))
return 1;
return 0;
}
-noinstr int poke_int3_handler(struct pt_regs *regs)
+noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
{
- struct bp_patching_desc *desc;
- struct text_poke_loc *tp;
+ struct smp_text_poke_loc *tpl;
int ret = 0;
void *ip;
@@ -2189,41 +2733,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
/*
* Having observed our INT3 instruction, we now must observe
- * bp_desc with non-zero refcount:
+ * text_poke_array with non-zero refcount:
*
- * bp_desc.refs = 1 INT3
- * WMB RMB
- * write INT3 if (bp_desc.refs != 0)
+ * text_poke_array_refs = 1 INT3
+ * WMB RMB
+ * write INT3 if (text_poke_array_refs != 0)
*/
smp_rmb();
- desc = try_get_desc();
- if (!desc)
+ if (!try_get_text_poke_array())
return 0;
/*
- * Discount the INT3. See text_poke_bp_batch().
+ * Discount the INT3. See smp_text_poke_batch_finish().
*/
ip = (void *) regs->ip - INT3_INSN_SIZE;
/*
* Skip the binary search if there is a single member in the vector.
*/
- if (unlikely(desc->nr_entries > 1)) {
- tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
- sizeof(struct text_poke_loc),
+ if (unlikely(text_poke_array.nr_entries > 1)) {
+ tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
+ sizeof(struct smp_text_poke_loc),
patch_cmp);
- if (!tp)
+ if (!tpl)
goto out_put;
} else {
- tp = desc->vec;
- if (text_poke_addr(tp) != ip)
+ tpl = text_poke_array.vec;
+ if (text_poke_addr(tpl) != ip)
goto out_put;
}
- ip += tp->len;
+ ip += tpl->len;
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case INT3_INSN_OPCODE:
/*
* Someone poked an explicit INT3, they'll want to handle it,
@@ -2236,16 +2779,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
break;
case CALL_INSN_OPCODE:
- int3_emulate_call(regs, (long)ip + tp->disp);
+ int3_emulate_call(regs, (long)ip + tpl->disp);
break;
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- int3_emulate_jmp(regs, (long)ip + tp->disp);
+ int3_emulate_jmp(regs, (long)ip + tpl->disp);
break;
case 0x70 ... 0x7f: /* Jcc */
- int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
+ int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
break;
default:
@@ -2255,51 +2798,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
ret = 1;
out_put:
- put_desc();
+ put_text_poke_array();
return ret;
}
-#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
-static struct text_poke_loc tp_vec[TP_VEC_MAX];
-static int tp_vec_nr;
-
/**
- * text_poke_bp_batch() -- update instructions on live kernel on SMP
- * @tp: vector of instructions to patch
- * @nr_entries: number of entries in the vector
+ * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
+ *
+ * Input state:
+ * text_poke_array.vec: vector of instructions to patch
+ * text_poke_array.nr_entries: number of entries in the vector
*
- * Modify multi-byte instruction by using int3 breakpoint on SMP.
- * We completely avoid stop_machine() here, and achieve the
- * synchronization using int3 breakpoint.
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
*
* The way it is done:
* - For each entry in the vector:
- * - add a int3 trap to the address that will be patched
- * - sync cores
+ * - add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
* - For each entry in the vector:
* - update all but the first byte of the patched range
- * - sync cores
+ * - SMP sync all CPUs
* - For each entry in the vector:
- * - replace the first byte (int3) by the first byte of
+ * - replace the first byte (INT3) by the first byte of the
* replacing opcode
- * - sync cores
+ * - SMP sync all CPUs
*/
-static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
+void smp_text_poke_batch_finish(void)
{
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
- lockdep_assert_held(&text_mutex);
+ if (!text_poke_array.nr_entries)
+ return;
- bp_desc.vec = tp;
- bp_desc.nr_entries = nr_entries;
+ lockdep_assert_held(&text_mutex);
/*
- * Corresponds to the implicit memory barrier in try_get_desc() to
- * ensure reading a non-zero refcount provides up to date bp_desc data.
+ * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
+ * ensure reading a non-zero refcount provides up to date text_poke_array data.
*/
- atomic_set_release(&bp_desc.refs, 1);
+ for_each_possible_cpu(i)
+ atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
/*
* Function tracing can enable thousands of places that need to be
@@ -2312,33 +2854,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
cond_resched();
/*
- * Corresponding read barrier in int3 notifier for making sure the
- * nr_entries and handler are correctly ordered wrt. patching.
+ * Corresponding read barrier in INT3 notifier for making sure the
+ * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
*/
smp_wmb();
/*
- * First step: add a int3 trap to the address that will be patched.
+ * First step: add a INT3 trap to the address that will be patched.
*/
- for (i = 0; i < nr_entries; i++) {
- tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
- text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+ for (i = 0; i < text_poke_array.nr_entries; i++) {
+ text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
}
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
/*
* Second step: update all but the first byte of the patched range.
*/
- for (do_sync = 0, i = 0; i < nr_entries; i++) {
- u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
- u8 _new[POKE_MAX_OPCODE_SIZE+1];
- const u8 *new = tp[i].text;
- int len = tp[i].len;
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
+ u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
+ const u8 *new = text_poke_array.vec[i].text;
+ int len = text_poke_array.vec[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
- text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
if (len == 6) {
@@ -2347,7 +2889,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
new = _new;
}
- text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
@@ -2378,7 +2920,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* The old instruction is recorded so that the event can be
* processed forwards or backwards.
*/
- perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
+ perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
}
if (do_sync) {
@@ -2387,63 +2929,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
}
/*
- * Third step: replace the first byte (int3) by the first byte of
+ * Third step: replace the first byte (INT3) by the first byte of the
* replacing opcode.
*/
- for (do_sync = 0, i = 0; i < nr_entries; i++) {
- u8 byte = tp[i].text[0];
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 byte = text_poke_array.vec[i].text[0];
- if (tp[i].len == 6)
+ if (text_poke_array.vec[i].len == 6)
byte = 0x0f;
if (byte == INT3_INSN_OPCODE)
continue;
- text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
do_sync++;
}
if (do_sync)
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
/*
* Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
*/
- if (!atomic_dec_and_test(&bp_desc.refs))
- atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+ for_each_possible_cpu(i) {
+ atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
+
+ if (unlikely(!atomic_dec_and_test(refs)))
+ atomic_cond_read_acquire(refs, !VAL);
+ }
+
+ /* They are all completed: */
+ text_poke_array.nr_entries = 0;
}
-static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
- const void *opcode, size_t len, const void *emulate)
+static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
+ struct smp_text_poke_loc *tpl;
struct insn insn;
int ret, i = 0;
+ tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
+
if (len == 6)
i = 1;
- memcpy((void *)tp->text, opcode+i, len-i);
+ memcpy((void *)tpl->text, opcode+i, len-i);
if (!emulate)
emulate = opcode;
ret = insn_decode_kernel(&insn, emulate);
BUG_ON(ret < 0);
- tp->rel_addr = addr - (void *)_stext;
- tp->len = len;
- tp->opcode = insn.opcode.bytes[0];
+ tpl->rel_addr = addr - (void *)_stext;
+ tpl->len = len;
+ tpl->opcode = insn.opcode.bytes[0];
if (is_jcc32(&insn)) {
/*
* Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
*/
- tp->opcode = insn.opcode.bytes[1] - 0x10;
+ tpl->opcode = insn.opcode.bytes[1] - 0x10;
}
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case RET_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
@@ -2452,14 +3010,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
* next instruction can be padded with INT3.
*/
for (i = insn.length; i < len; i++)
- BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
+ BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
break;
default:
BUG_ON(len != insn.length);
}
- switch (tp->opcode) {
+ switch (tpl->opcode) {
case INT3_INSN_OPCODE:
case RET_INSN_OPCODE:
break;
@@ -2468,21 +3026,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
case 0x70 ... 0x7f: /* Jcc */
- tp->disp = insn.immediate.value;
+ tpl->disp = insn.immediate.value;
break;
default: /* assume NOP */
switch (len) {
case 2: /* NOP2 -- emulate as JMP8+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
- tp->opcode = JMP8_INSN_OPCODE;
- tp->disp = 0;
+ tpl->opcode = JMP8_INSN_OPCODE;
+ tpl->disp = 0;
break;
case 5: /* NOP5 -- emulate as JMP32+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
- tp->opcode = JMP32_INSN_OPCODE;
- tp->disp = 0;
+ tpl->opcode = JMP32_INSN_OPCODE;
+ tpl->disp = 0;
break;
default: /* unknown instruction */
@@ -2493,51 +3051,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
}
/*
- * We hard rely on the tp_vec being ordered; ensure this is so by flushing
+ * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing
* early if needed.
*/
-static bool tp_order_fail(void *addr)
+static bool text_poke_addr_ordered(void *addr)
{
- struct text_poke_loc *tp;
+ WARN_ON_ONCE(!addr);
- if (!tp_vec_nr)
- return false;
-
- if (!addr) /* force */
- return true;
-
- tp = &tp_vec[tp_vec_nr - 1];
- if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
+ if (!text_poke_array.nr_entries)
return true;
- return false;
-}
-
-static void text_poke_flush(void *addr)
-{
- if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
- text_poke_bp_batch(tp_vec, tp_vec_nr);
- tp_vec_nr = 0;
- }
-}
+ /*
+ * If the last current entry's address is higher than the
+ * new entry's address we'd like to add, then ordering
+ * is violated and we must first flush all pending patching
+ * requests:
+ */
+ if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
+ return false;
-void text_poke_finish(void)
-{
- text_poke_flush(NULL);
+ return true;
}
-void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+/**
+ * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Add a new instruction to the current queue of to-be-patched instructions
+ * the kernel maintains. The patching request will not be executed immediately,
+ * but becomes part of an array of patching requests, optimized for batched
+ * execution. All pending patching requests will be executed on the next
+ * smp_text_poke_batch_finish() call.
+ */
+void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc *tp;
-
- text_poke_flush(addr);
-
- tp = &tp_vec[tp_vec_nr++];
- text_poke_loc_init(tp, addr, opcode, len, emulate);
+ if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
+ smp_text_poke_batch_finish();
+ __smp_text_poke_batch_add(addr, opcode, len, emulate);
}
/**
- * text_poke_bp() -- update instructions on live kernel on SMP
+ * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
@@ -2545,12 +3102,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
*
* Update a single instruction with the vector in the stack, avoiding
* dynamically allocated memory. This function should be used when it is
- * not possible to allocate memory.
+ * not possible to allocate memory for a vector. The single instruction
+ * is patched in immediately.
*/
-void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc tp;
-
- text_poke_loc_init(&tp, addr, opcode, len, emulate);
- text_poke_bp_batch(&tp, 1);
+ __smp_text_poke_batch_add(addr, opcode, len, emulate);
+ smp_text_poke_batch_finish();
}
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index c884deca839b..3485d419c2f5 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -39,7 +39,7 @@
#include <asm/gart.h>
#include <asm/set_memory.h>
#include <asm/dma.h>
-#include <asm/amd_nb.h>
+#include <asm/amd/nb.h>
#include <asm/x86_init.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 11fac09e3a8c..c1acead6227a 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -13,7 +13,9 @@
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/pci_ids.h>
-#include <asm/amd_nb.h>
+
+#include <asm/amd/nb.h>
+#include <asm/cpuid/api.h>
static u32 *flush_words;
@@ -73,7 +75,6 @@ static int amd_cache_northbridges(void)
amd_northbridges.nb = nb;
for (i = 0; i < amd_northbridges.num; i++) {
- node_to_amd_nb(i)->root = amd_node_get_root(i);
node_to_amd_nb(i)->misc = amd_node_get_func(i, 3);
/*
@@ -92,10 +93,7 @@ static int amd_cache_northbridges(void)
if (amd_gart_present())
amd_northbridges.flags |= AMD_NB_GART;
- /*
- * Check for L3 cache presence.
- */
- if (!cpuid_edx(0x80000006))
+ if (!cpuid_amd_hygon_has_l3_cache())
return 0;
/*
@@ -143,7 +141,6 @@ bool __init early_is_amd_nb(u32 device)
struct resource *amd_get_mmconfig_range(struct resource *res)
{
- u32 address;
u64 base, msr;
unsigned int segn_busn_bits;
@@ -151,13 +148,11 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return NULL;
- /* assume all cpus from fam10h have mmconfig */
- if (boot_cpu_data.x86 < 0x10)
+ /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */
+ if (boot_cpu_data.x86 < 0x10 ||
+ rdmsrq_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr))
return NULL;
- address = MSR_FAM10H_MMIO_CONF_BASE;
- rdmsrl(address, msr);
-
/* mmconfig is not enabled */
if (!(msr & FAM10H_MMIO_CONF_ENABLE))
return NULL;
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
index d2ec7fd555c5..a40176b62eb5 100644
--- a/arch/x86/kernel/amd_node.c
+++ b/arch/x86/kernel/amd_node.c
@@ -8,7 +8,8 @@
* Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
*/
-#include <asm/amd_node.h>
+#include <linux/debugfs.h>
+#include <asm/amd/node.h>
/*
* AMD Nodes are a physical collection of I/O devices within an SoC. There can be one
@@ -93,10 +94,14 @@ static struct pci_dev **amd_roots;
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
+static bool smn_exclusive;
#define SMN_INDEX_OFFSET 0x60
#define SMN_DATA_OFFSET 0x64
+#define HSMP_INDEX_OFFSET 0xc4
+#define HSMP_DATA_OFFSET 0xc8
+
/*
* SMN accesses may fail in ways that are difficult to detect here in the called
* functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
@@ -146,6 +151,9 @@ static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, b
if (!root)
return err;
+ if (!smn_exclusive)
+ return err;
+
guard(mutex)(&smn_mutex);
err = pci_write_config_dword(root, i_off, address);
@@ -179,6 +187,93 @@ int __must_check amd_smn_write(u16 node, u32 address, u32 value)
}
EXPORT_SYMBOL_GPL(amd_smn_write);
+int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write)
+{
+ return __amd_smn_rw(HSMP_INDEX_OFFSET, HSMP_DATA_OFFSET, node, address, value, write);
+}
+EXPORT_SYMBOL_GPL(amd_smn_hsmp_rdwr);
+
+static struct dentry *debugfs_dir;
+static u16 debug_node;
+static u32 debug_address;
+
+static ssize_t smn_node_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u16 node;
+ int ret;
+
+ ret = kstrtou16_from_user(userbuf, count, 0, &node);
+ if (ret)
+ return ret;
+
+ if (node >= amd_num_nodes())
+ return -ENODEV;
+
+ debug_node = node;
+ return count;
+}
+
+static int smn_node_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%08x\n", debug_node);
+ return 0;
+}
+
+static ssize_t smn_address_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+
+ ret = kstrtouint_from_user(userbuf, count, 0, &debug_address);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static int smn_address_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%08x\n", debug_address);
+ return 0;
+}
+
+static int smn_value_show(struct seq_file *m, void *v)
+{
+ u32 val;
+ int ret;
+
+ ret = amd_smn_read(debug_node, debug_address, &val);
+ if (ret)
+ return ret;
+
+ seq_printf(m, "0x%08x\n", val);
+ return 0;
+}
+
+static ssize_t smn_value_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u32 val;
+ int ret;
+
+ ret = kstrtouint_from_user(userbuf, count, 0, &val);
+ if (ret)
+ return ret;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ ret = amd_smn_write(debug_node, debug_address, val);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
+
static int amd_cache_roots(void)
{
u16 node, num_nodes = amd_num_nodes();
@@ -193,6 +288,48 @@ static int amd_cache_roots(void)
return 0;
}
+static int reserve_root_config_spaces(void)
+{
+ struct pci_dev *root = NULL;
+ struct pci_bus *bus = NULL;
+
+ while ((bus = pci_find_next_bus(bus))) {
+ /* Root device is Device 0 Function 0 on each Primary Bus. */
+ root = pci_get_slot(bus, 0);
+ if (!root)
+ continue;
+
+ if (root->vendor != PCI_VENDOR_ID_AMD &&
+ root->vendor != PCI_VENDOR_ID_HYGON)
+ continue;
+
+ pci_dbg(root, "Reserving PCI config space\n");
+
+ /*
+ * There are a few SMN index/data pairs and other registers
+ * that shouldn't be accessed by user space.
+ * So reserve the entire PCI config space for simplicity rather
+ * than covering specific registers piecemeal.
+ */
+ if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
+ pci_err(root, "Failed to reserve config space\n");
+ return -EEXIST;
+ }
+ }
+
+ smn_exclusive = true;
+ return 0;
+}
+
+static bool enable_dfs;
+
+static int __init amd_smn_enable_dfs(char *str)
+{
+ enable_dfs = true;
+ return 1;
+}
+__setup("amd_smn_debugfs_enable", amd_smn_enable_dfs);
+
static int __init amd_smn_init(void)
{
int err;
@@ -209,6 +346,18 @@ static int __init amd_smn_init(void)
if (err)
return err;
+ err = reserve_root_config_spaces();
+ if (err)
+ return err;
+
+ if (enable_dfs) {
+ debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir);
+
+ debugfs_create_file("node", 0600, debugfs_dir, NULL, &smn_node_fops);
+ debugfs_create_file("address", 0600, debugfs_dir, NULL, &smn_address_fops);
+ debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops);
+ }
+
return 0;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 89c0c8a3fc7e..769321185a08 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -29,7 +29,7 @@
#include <asm/gart.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
-#include <asm/amd_nb.h>
+#include <asm/amd/nb.h>
#include <asm/x86_init.h>
#include <linux/crash_dump.h>
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 3bf0487cf3b7..52d1808ee360 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -23,8 +23,5 @@ obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
obj-y += apic_flat_64.o
endif
-# APIC probe will depend on the listing order here
-obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
-
# For 32bit, probe_32 need to be listed last
obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e893dc6f11c1..d73ba5a7b623 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -59,6 +59,7 @@
#include <asm/time.h>
#include <asm/smp.h>
#include <asm/mce.h>
+#include <asm/msr.h>
#include <asm/tsc.h>
#include <asm/hypervisor.h>
#include <asm/cpu_device_id.h>
@@ -425,7 +426,7 @@ static int lapic_next_deadline(unsigned long delta,
weak_wrmsr_fence();
tsc = rdtsc();
- wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+ wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
return 0;
}
@@ -449,7 +450,7 @@ static int lapic_timer_shutdown(struct clock_event_device *evt)
* the timer _and_ zero the counter registers:
*/
if (v & APIC_LVT_TIMER_TSCDEADLINE)
- wrmsrl(MSR_IA32_TSC_DEADLINE, 0);
+ wrmsrq(MSR_IA32_TSC_DEADLINE, 0);
else
apic_write(APIC_TMICT, 0);
@@ -1371,8 +1372,6 @@ void __init apic_intr_mode_init(void)
x86_64_probe_apic();
- x86_32_install_bigsmp();
-
if (x86_platform.apic_post_init)
x86_platform.apic_post_init();
@@ -1674,7 +1673,6 @@ static __init void apic_read_boot_cpu_id(bool x2apic)
boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
topology_register_boot_apic(boot_cpu_physical_apicid);
- x86_32_probe_bigsmp_early();
}
#ifdef CONFIG_X86_X2APIC
@@ -1697,7 +1695,7 @@ static bool x2apic_hw_locked(void)
x86_arch_cap_msr = x86_read_arch_cap_msr();
if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) {
- rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
+ rdmsrq(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
return (msr & LEGACY_XAPIC_DISABLED);
}
return false;
@@ -1710,12 +1708,12 @@ static void __x2apic_disable(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return;
- rdmsrl(MSR_IA32_APICBASE, msr);
+ rdmsrq(MSR_IA32_APICBASE, msr);
if (!(msr & X2APIC_ENABLE))
return;
/* Disable xapic and x2apic first and then reenable xapic mode */
- wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
- wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+ wrmsrq(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrq(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
printk_once(KERN_INFO "x2apic disabled\n");
}
@@ -1723,10 +1721,10 @@ static void __x2apic_enable(void)
{
u64 msr;
- rdmsrl(MSR_IA32_APICBASE, msr);
+ rdmsrq(MSR_IA32_APICBASE, msr);
if (msr & X2APIC_ENABLE)
return;
- wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ wrmsrq(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
printk_once(KERN_INFO "x2apic enabled\n");
}
@@ -2014,8 +2012,8 @@ static bool __init detect_init_APIC(void)
case X86_VENDOR_HYGON:
break;
case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
- (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)))
+ if ((boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)) ||
+ boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO)
break;
goto no_apic;
default:
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 16410f087b7a..e272bc7fdc8e 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/pgtable.h>
+#include <asm/msr.h>
#include <asm/numachip/numachip.h>
#include <asm/numachip/numachip_csr.h>
@@ -31,7 +32,7 @@ static u32 numachip1_get_apic_id(u32 x)
unsigned int id = (x >> 24) & 0xff;
if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
- rdmsrl(MSR_FAM10H_NODE_ID, value);
+ rdmsrq(MSR_FAM10H_NODE_ID, value);
id |= (value << 2) & 0xff00;
}
@@ -42,7 +43,7 @@ static u32 numachip2_get_apic_id(u32 x)
{
u64 mcfg;
- rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
+ rdmsrq(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
}
@@ -150,7 +151,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
/* Account for nodes per socket in multi-core-module processors */
if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
- rdmsrl(MSR_FAM10H_NODE_ID, val);
+ rdmsrq(MSR_FAM10H_NODE_ID, val);
nodes = ((val >> 3) & 7) + 1;
}
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
deleted file mode 100644
index 9285d500d5b4..000000000000
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
- *
- * Drives the local APIC in "clustered mode".
- */
-#include <linux/cpumask.h>
-#include <linux/dmi.h>
-#include <linux/smp.h>
-
-#include <asm/apic.h>
-#include <asm/io_apic.h>
-
-#include "local.h"
-
-static u32 bigsmp_get_apic_id(u32 x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static void bigsmp_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void bigsmp_send_IPI_all(int vector)
-{
- default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
-}
-
-static int dmi_bigsmp; /* can be set by dmi scanners */
-
-static int hp_ht_bigsmp(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
- dmi_bigsmp = 1;
-
- return 0;
-}
-
-
-static const struct dmi_system_id bigsmp_dmi_table[] = {
- { hp_ht_bigsmp, "HP ProLiant DL760 G2",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
- }
- },
-
- { hp_ht_bigsmp, "HP ProLiant DL740",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
- }
- },
- { } /* NULL entry stops DMI scanning */
-};
-
-static int probe_bigsmp(void)
-{
- return dmi_check_system(bigsmp_dmi_table);
-}
-
-static struct apic apic_bigsmp __ro_after_init = {
-
- .name = "bigsmp",
- .probe = probe_bigsmp,
-
- .dest_mode_logical = false,
-
- .disable_esr = 1,
-
- .cpu_present_to_apicid = default_cpu_present_to_apicid,
-
- .max_apic_id = 0xFE,
- .get_apic_id = bigsmp_get_apic_id,
-
- .calc_dest_apicid = apic_default_calc_apicid,
-
- .send_IPI = default_send_IPI_single_phys,
- .send_IPI_mask = default_send_IPI_mask_sequence_phys,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
- .send_IPI_all = bigsmp_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi = native_apic_mem_eoi,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = apic_mem_wait_icr_idle,
- .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
-};
-
-bool __init apic_bigsmp_possible(bool cmdline_override)
-{
- return apic == &apic_bigsmp || !cmdline_override;
-}
-
-void __init apic_bigsmp_force(void)
-{
- if (apic != &apic_bigsmp)
- apic_install_driver(&apic_bigsmp);
-}
-
-apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index eebc360ed1bb..5ba2feb2c04c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1486,7 +1486,7 @@ static void __init delay_with_tsc(void)
* 1 GHz == 40 jiffies
*/
do {
- rep_nop();
+ native_pause();
now = rdtsc();
} while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end));
}
@@ -2225,7 +2225,7 @@ static int mp_irqdomain_create(int ioapic)
/* Handle device tree enumerated APICs proper */
if (cfg->dev) {
- fn = of_node_to_fwnode(cfg->dev);
+ fn = of_fwnode_handle(cfg->dev);
} else {
fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic));
if (!fn)
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 5da693d633b7..98a57cb4aa86 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -3,6 +3,7 @@
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/smp.h>
+#include <linux/string_choices.h>
#include <asm/io_apic.h>
@@ -23,7 +24,7 @@ __setup("no_ipi_broadcast=", apic_ipi_shorthand);
static int __init print_ipi_mode(void)
{
pr_info("IPI shorthand broadcast: %s\n",
- apic_ipi_shorthand_off ? "disabled" : "enabled");
+ str_disabled_enabled(apic_ipi_shorthand_off));
return 0;
}
late_initcall(print_ipi_mode);
@@ -287,34 +288,4 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
__default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
-
-#ifdef CONFIG_SMP
-static int convert_apicid_to_cpu(u32 apic_id)
-{
- int i;
-
- for_each_possible_cpu(i) {
- if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
- return i;
- }
- return -1;
-}
-
-int safe_smp_processor_id(void)
-{
- u32 apicid;
- int cpuid;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
-
- apicid = read_apic_id();
- if (apicid == BAD_APICID)
- return 0;
-
- cpuid = convert_apicid_to_cpu(apicid);
-
- return cpuid >= 0 ? cpuid : 0;
-}
-#endif
#endif
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index 842fe28496be..bdcf609eb283 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -65,17 +65,4 @@ void default_send_IPI_self(int vector);
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector);
void default_send_IPI_mask_logical(const struct cpumask *mask, int vector);
-void x86_32_probe_bigsmp_early(void);
-void x86_32_install_bigsmp(void);
-#else
-static inline void x86_32_probe_bigsmp_early(void) { }
-static inline void x86_32_install_bigsmp(void) { }
-#endif
-
-#ifdef CONFIG_X86_BIGSMP
-bool apic_bigsmp_possible(bool cmdline_selected);
-void apic_bigsmp_force(void);
-#else
-static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; };
-static inline void apic_bigsmp_force(void) { }
#endif
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index f75ee345c02d..87bc9e7ca5d6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -93,35 +93,6 @@ static int __init parse_apic(char *arg)
}
early_param("apic", parse_apic);
-void __init x86_32_probe_bigsmp_early(void)
-{
- if (nr_cpu_ids <= 8 || xen_pv_domain())
- return;
-
- if (IS_ENABLED(CONFIG_X86_BIGSMP)) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(boot_cpu_apic_version))
- break;
- /* P4 and above */
- fallthrough;
- case X86_VENDOR_HYGON:
- case X86_VENDOR_AMD:
- if (apic_bigsmp_possible(cmdline_apic))
- return;
- break;
- }
- }
- pr_info("Limiting to 8 possible CPUs\n");
- set_nr_cpu_ids(8);
-}
-
-void __init x86_32_install_bigsmp(void)
-{
- if (nr_cpu_ids > 8 && !xen_pv_domain())
- apic_bigsmp_force();
-}
-
void __init x86_32_probe_apic(void)
{
if (!cmdline_apic) {
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 736f62812f5c..93069b13d3af 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -799,7 +799,7 @@ int __init arch_early_irq_init(void)
x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
NULL);
BUG_ON(x86_vector_domain == NULL);
- irq_set_default_host(x86_vector_domain);
+ irq_set_default_domain(x86_vector_domain);
BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
@@ -864,7 +864,7 @@ void lapic_offline(void)
__vector_cleanup(cl, false);
irq_matrix_offline(vector_matrix);
- WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0);
+ WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0);
WARN_ON_ONCE(!hlist_empty(&cl->head));
unlock_vector_lock();
@@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd,
return err ? err : IRQ_SET_MASK_OK;
}
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+ unsigned int vector = apicd->prev_vector;
+ unsigned int cpu = apicd->prev_cpu;
+ bool managed = apicd->is_managed;
+
+ /*
+ * Managed interrupts are usually not migrated away
+ * from an online CPU, but CPU isolation 'managed_irq'
+ * can make that happen.
+ * 1) Activation does not take the isolation into account
+ * to keep the code simple
+ * 2) Migration away from an isolated CPU can happen when
+ * a non-isolated CPU which is in the calculated
+ * affinity mask comes online.
+ */
+ trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ hlist_del_init(&apicd->clist);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+static void apic_force_complete_move(struct irq_data *irqd)
+{
+ unsigned int cpu = smp_processor_id();
+ struct apic_chip_data *apicd;
+ unsigned int vector;
+
+ guard(raw_spinlock)(&vector_lock);
+ apicd = apic_chip_data(irqd);
+ if (!apicd)
+ return;
+
+ /*
+ * If prev_vector is empty or the descriptor is neither currently
+ * nor previously on the outgoing CPU no action required.
+ */
+ vector = apicd->prev_vector;
+ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+ return;
+
+ /*
+ * This is tricky. If the cleanup of the old vector has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
+ *
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (apicd->move_in_progress) {
+ /*
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendezvous in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theoretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
+ */
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqd->irq, vector);
+ }
+ free_moved_vector(apicd);
+}
+
#else
-# define apic_set_affinity NULL
+# define apic_set_affinity NULL
+# define apic_force_complete_move NULL
#endif
static int apic_retrigger_irq(struct irq_data *irqd)
@@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data,
}
static struct irq_chip lapic_controller = {
- .name = "APIC",
- .irq_ack = apic_ack_edge,
- .irq_set_affinity = apic_set_affinity,
- .irq_compose_msi_msg = x86_vector_msi_compose_msg,
- .irq_retrigger = apic_retrigger_irq,
+ .name = "APIC",
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_compose_msi_msg = x86_vector_msi_compose_msg,
+ .irq_force_complete_move = apic_force_complete_move,
+ .irq_retrigger = apic_retrigger_irq,
};
#ifdef CONFIG_SMP
-static void free_moved_vector(struct apic_chip_data *apicd)
-{
- unsigned int vector = apicd->prev_vector;
- unsigned int cpu = apicd->prev_cpu;
- bool managed = apicd->is_managed;
-
- /*
- * Managed interrupts are usually not migrated away
- * from an online CPU, but CPU isolation 'managed_irq'
- * can make that happen.
- * 1) Activation does not take the isolation into account
- * to keep the code simple
- * 2) Migration away from an isolated CPU can happen when
- * a non-isolated CPU which is in the calculated
- * affinity mask comes online.
- */
- trace_vector_free_moved(apicd->irq, cpu, vector, managed);
- irq_matrix_free(vector_matrix, cpu, vector, managed);
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
- hlist_del_init(&apicd->clist);
- apicd->prev_vector = 0;
- apicd->move_in_progress = 0;
-}
-
static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
{
struct apic_chip_data *apicd;
@@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg)
__vector_schedule_cleanup(apicd);
}
-/*
- * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
- */
-void irq_force_complete_move(struct irq_desc *desc)
-{
- unsigned int cpu = smp_processor_id();
- struct apic_chip_data *apicd;
- struct irq_data *irqd;
- unsigned int vector;
-
- /*
- * The function is called for all descriptors regardless of which
- * irqdomain they belong to. For example if an IRQ is provided by
- * an irq_chip as part of a GPIO driver, the chip data for that
- * descriptor is specific to the irq_chip in question.
- *
- * Check first that the chip_data is what we expect
- * (apic_chip_data) before touching it any further.
- */
- irqd = irq_domain_get_irq_data(x86_vector_domain,
- irq_desc_get_irq(desc));
- if (!irqd)
- return;
-
- raw_spin_lock(&vector_lock);
- apicd = apic_chip_data(irqd);
- if (!apicd)
- goto unlock;
-
- /*
- * If prev_vector is empty or the descriptor is neither currently
- * nor previously on the outgoing CPU no action required.
- */
- vector = apicd->prev_vector;
- if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
- goto unlock;
-
- /*
- * This is tricky. If the cleanup of the old vector has not been
- * done yet, then the following setaffinity call will fail with
- * -EBUSY. This can leave the interrupt in a stale state.
- *
- * All CPUs are stuck in stop machine with interrupts disabled so
- * calling __irq_complete_move() would be completely pointless.
- *
- * 1) The interrupt is in move_in_progress state. That means that we
- * have not seen an interrupt since the io_apic was reprogrammed to
- * the new vector.
- *
- * 2) The interrupt has fired on the new vector, but the cleanup IPIs
- * have not been processed yet.
- */
- if (apicd->move_in_progress) {
- /*
- * In theory there is a race:
- *
- * set_ioapic(new_vector) <-- Interrupt is raised before update
- * is effective, i.e. it's raised on
- * the old vector.
- *
- * So if the target cpu cannot handle that interrupt before
- * the old vector is cleaned up, we get a spurious interrupt
- * and in the worst case the ioapic irq line becomes stale.
- *
- * But in case of cpu hotplug this should be a non issue
- * because if the affinity update happens right before all
- * cpus rendezvous in stop machine, there is no way that the
- * interrupt can be blocked on the target cpu because all cpus
- * loops first with interrupts enabled in stop machine, so the
- * old vector is not yet cleaned up when the interrupt fires.
- *
- * So the only way to run into this issue is if the delivery
- * of the interrupt on the apic/system bus would be delayed
- * beyond the point where the target cpu disables interrupts
- * in stop machine. I doubt that it can happen, but at least
- * there is a theoretical chance. Virtualization might be
- * able to expose this, but AFAICT the IOAPIC emulation is not
- * as stupid as the real hardware.
- *
- * Anyway, there is nothing we can do about that at this point
- * w/o refactoring the whole fixup_irq() business completely.
- * We print at least the irq number and the old vector number,
- * so we have the necessary information when a problem in that
- * area arises.
- */
- pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
- irqd->irq, vector);
- }
- free_moved_vector(apicd);
-unlock:
- raw_spin_unlock(&vector_lock);
-}
-
#ifdef CONFIG_HOTPLUG_CPU
/*
* Note, this is not accurate accounting, but at least good enough to
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index a98020bf31bb..6259b474073b 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -33,6 +33,14 @@
static void __used common(void)
{
+ OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
+ OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
+ OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
+ OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping);
+ OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
+ OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
+ OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
+
BLANK();
OFFSET(TASK_threadsp, task_struct, thread.sp);
#ifdef CONFIG_STACKPROTECTOR
@@ -107,11 +115,6 @@ static void __used common(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
- OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
- OFFSET(X86_current_task, pcpu_hot, current_task);
-#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
- OFFSET(X86_call_depth, pcpu_hot, call_depth);
-#endif
#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64)
/* Offset for fields in aria_ctx */
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 2b411cd00a4e..e0a292db97b2 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -12,15 +12,6 @@ void foo(void);
void foo(void)
{
- OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
- OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
- OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
- OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping);
- OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
- OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
- OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
- BLANK();
-
OFFSET(PT_EBX, pt_regs, bx);
OFFSET(PT_ECX, pt_regs, cx);
OFFSET(PT_EDX, pt_regs, dx);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bb65371ea9df..590b6cd0eac0 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -54,11 +54,5 @@ int main(void)
BLANK();
#undef ENTRY
- BLANK();
-
-#ifdef CONFIG_STACKPROTECTOR
- OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
- BLANK();
-#endif
return 0;
}
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 3fed7ae58b60..73274d76ce16 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -8,6 +8,7 @@
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/acpi.h>
+#include <linux/bitops.h>
#include <asm/io.h>
#include <linux/mc146818rtc.h>
@@ -20,27 +21,13 @@
int sbf_port __initdata = -1; /* set via acpi_boot_init() */
-static int __init parity(u8 v)
-{
- int x = 0;
- int i;
-
- for (i = 0; i < 8; i++) {
- x ^= (v & 1);
- v >>= 1;
- }
-
- return x;
-}
-
static void __init sbf_write(u8 v)
{
unsigned long flags;
if (sbf_port != -1) {
- v &= ~SBF_PARITY;
- if (!parity(v))
- v |= SBF_PARITY;
+ if (!parity8(v))
+ v ^= SBF_PARITY;
printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
sbf_port, v);
@@ -66,14 +53,14 @@ static u8 __init sbf_read(void)
return v;
}
-static int __init sbf_value_valid(u8 v)
+static bool __init sbf_value_valid(u8 v)
{
if (v & SBF_RESERVED) /* Reserved bits */
- return 0;
- if (!parity(v))
- return 0;
+ return false;
+ if (!parity8(v))
+ return false;
- return 1;
+ return true;
}
static int __init sbf_init(void)
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index 8418a892d195..a951333c5995 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -98,11 +98,10 @@ static inline bool within_module_coretext(void *addr)
#ifdef CONFIG_MODULES
struct module *mod;
- preempt_disable();
+ guard(rcu)();
mod = __module_address((unsigned long)addr);
if (mod && within_module_core((unsigned long)addr, mod))
ret = true;
- preempt_enable();
#endif
return ret;
}
@@ -186,7 +185,7 @@ static void *patch_dest(void *dest, bool direct)
u8 *pad = dest - tsize;
memcpy(insn_buff, skl_call_thunk_template, tsize);
- apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
+ text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
/* Already patched? */
if (!bcmp(pad, insn_buff, tsize))
@@ -240,21 +239,10 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
}
static __init_or_module void
-patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end,
- const struct core_text *ct)
-{
- struct alt_instr *a;
-
- for (a = start; a < end; a++)
- patch_call((void *)&a->instr_offset + a->instr_offset, ct);
-}
-
-static __init_or_module void
callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
{
prdbg("Patching call sites %s\n", ct->name);
patch_call_sites(cs->call_start, cs->call_end, ct);
- patch_alt_call_sites(cs->alt_start, cs->alt_end, ct);
prdbg("Patching call sites done%s\n", ct->name);
}
@@ -263,8 +251,6 @@ void __init callthunks_patch_builtin_calls(void)
struct callthunk_sites cs = {
.call_start = __call_sites,
.call_end = __call_sites_end,
- .alt_start = __alt_instructions,
- .alt_end = __alt_instructions_end
};
if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
@@ -308,7 +294,7 @@ static bool is_callthunk(void *addr)
pad = (void *)(dest - tmpl_size);
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
+ text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
return !bcmp(pad, insn_buff, tmpl_size);
}
@@ -326,7 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
return 0;
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
+ text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
memcpy(*pprog, insn_buff, tmpl_size);
*pprog += tmpl_size;
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 303bf74d175b..99444409c026 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -2,6 +2,7 @@
#include <linux/ptrace.h>
#include <asm/bugs.h>
+#include <asm/msr.h>
#include <asm/traps.h>
enum cp_error_code {
@@ -55,7 +56,7 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code)
* will be whatever is live in userspace. So read the SSP before enabling
* interrupts so locking the fpregs to do it later is not required.
*/
- rdmsrl(MSR_IA32_PL3_SSP, ssp);
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
cond_local_irq_enable(regs);
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
index e6bf78fac146..77086cf565ec 100644
--- a/arch/x86/kernel/cfi.c
+++ b/arch/x86/kernel/cfi.c
@@ -67,16 +67,30 @@ static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
*/
enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
{
- unsigned long target;
+ unsigned long target, addr = regs->ip;
u32 type;
- if (!is_cfi_trap(regs->ip))
- return BUG_TRAP_TYPE_NONE;
+ switch (cfi_mode) {
+ case CFI_KCFI:
+ if (!is_cfi_trap(addr))
+ return BUG_TRAP_TYPE_NONE;
+
+ if (!decode_cfi_insn(regs, &target, &type))
+ return report_cfi_failure_noaddr(regs, addr);
+
+ break;
- if (!decode_cfi_insn(regs, &target, &type))
- return report_cfi_failure_noaddr(regs, regs->ip);
+ case CFI_FINEIBT:
+ if (!decode_fineibt_insn(regs, &target, &type))
+ return BUG_TRAP_TYPE_NONE;
+
+ break;
+
+ default:
+ return BUG_TRAP_TYPE_NONE;
+ }
- return report_cfi_failure(regs, regs->ip, &target, type);
+ return report_cfi_failure(regs, addr, &target, type);
}
/*
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4efdf5c2efc8..1e26179ff18c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -24,7 +24,7 @@ obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
obj-y += aperfmperf.o
-obj-y += cpuid-deps.o
+obj-y += cpuid-deps.o cpuid_0x2_table.o
obj-y += umwait.o
obj-y += capflags.o powerflags.o
@@ -38,6 +38,9 @@ obj-y += intel.o tsx.o
obj-$(CONFIG_PM) += intel_epb.o
endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy)
+obj-y += amd_cache_disable.o
+endif
obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 54194f5995de..93da466dfe2c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -9,6 +9,7 @@
#include <linux/sched/clock.h>
#include <linux/random.h>
#include <linux/topology.h>
+#include <asm/amd/fch.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
@@ -21,6 +22,7 @@
#include <asm/delay.h>
#include <asm/debugreg.h>
#include <asm/resctrl.h>
+#include <asm/msr.h>
#include <asm/sev.h>
#ifdef CONFIG_X86_64
@@ -29,7 +31,9 @@
#include "cpu.h"
-static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+u16 invlpgb_count_max __ro_after_init;
+
+static inline int rdmsrq_amd_safe(unsigned msr, u64 *p)
{
u32 gprs[8] = { 0 };
int err;
@@ -47,7 +51,7 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
return err;
}
-static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
+static inline int wrmsrq_amd_safe(unsigned msr, u64 val)
{
u32 gprs[8] = { 0 };
@@ -381,7 +385,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
(c->x86 == 0x10 && c->x86_model >= 0x2)) {
u64 val;
- rdmsrl(MSR_K7_HWCR, val);
+ rdmsrq(MSR_K7_HWCR, val);
if (!(val & BIT(24)))
pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
}
@@ -420,7 +424,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
* Try to cache the base value so further operations can
* avoid RMW. If that faults, do not enable SSBD.
*/
- if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
setup_force_cpu_cap(X86_FEATURE_SSBD);
x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
@@ -470,6 +474,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
case 0x60 ... 0x7f:
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
+ case 0x50 ... 0x5f:
+ case 0x90 ... 0xaf:
+ case 0xc0 ... 0xcf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN6);
+ break;
default:
goto warn;
}
@@ -506,7 +515,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
*/
if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
/* Check if memory encryption is enabled */
- rdmsrl(MSR_AMD64_SYSCFG, msr);
+ rdmsrq(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
goto clear_all;
@@ -523,7 +532,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
if (!sme_me_mask)
setup_clear_cpu_cap(X86_FEATURE_SME);
- rdmsrl(MSR_K7_HWCR, msr);
+ rdmsrq(MSR_K7_HWCR, msr);
if (!(msr & MSR_K7_HWCR_SMMLOCK))
goto clear_sev;
@@ -610,7 +619,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
- else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
setup_force_cpu_cap(X86_FEATURE_SBPB);
}
@@ -632,16 +641,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
* (model = 0x14) and later actually support it.
* (AMD Erratum #110, docId: 25759).
*/
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) {
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ if (!rdmsrq_amd_safe(0xc001100d, &value)) {
value &= ~BIT_64(32);
- wrmsrl_amd_safe(0xc001100d, value);
+ wrmsrq_amd_safe(0xc001100d, value);
}
}
if (!c->x86_model_id[0])
- strcpy(c->x86_model_id, "Hammer");
+ strscpy(c->x86_model_id, "Hammer");
#ifdef CONFIG_SMP
/*
@@ -786,9 +795,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
* Disable it on the affected CPUs.
*/
if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
- if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
+ if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
value |= 0x1E;
- wrmsrl_safe(MSR_F15H_IC_CFG, value);
+ wrmsrq_safe(MSR_F15H_IC_CFG, value);
}
}
@@ -803,6 +812,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
static const struct x86_cpu_id erratum_1386_microcode[] = {
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
+ {}
};
static void fix_erratum_1386(struct cpuinfo_x86 *c)
@@ -836,9 +846,9 @@ void init_spectral_chicken(struct cpuinfo_x86 *c)
* suppresses non-branch predictions.
*/
if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
- if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
+ if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
- wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
+ wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
}
}
#endif
@@ -866,6 +876,16 @@ static void init_amd_zen1(struct cpuinfo_x86 *c)
pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
setup_force_cpu_bug(X86_BUG_DIV0);
+
+ /*
+ * Turn off the Instructions Retired free counter on machines that are
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (c->x86_model < 0x30) {
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+ clear_cpu_cap(c, X86_FEATURE_IRPERF);
+ }
}
static bool cpu_has_zenbleed_microcode(void)
@@ -1012,7 +1032,7 @@ static void init_amd(struct cpuinfo_x86 *c)
init_amd_cacheinfo(c);
if (cpu_has(c, X86_FEATURE_SVM)) {
- rdmsrl(MSR_VM_CR, vm_cr);
+ rdmsrq(MSR_VM_CR, vm_cr);
if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
clear_cpu_cap(c, X86_FEATURE_SVM);
@@ -1049,13 +1069,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
- /*
- * Turn on the Instructions Retired free counter on machines not
- * susceptible to erratum #1054 "Instructions Retired Performance
- * Counter May Be Inaccurate".
- */
- if (cpu_has(c, X86_FEATURE_IRPERF) &&
- (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f))
+ /* Enable the Instructions Retired free counter */
+ if (cpu_has(c, X86_FEATURE_IRPERF))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
@@ -1073,6 +1088,10 @@ static void init_amd(struct cpuinfo_x86 *c)
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
@@ -1105,8 +1124,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/*
* K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
@@ -1119,26 +1138,30 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
- tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
/* Erratum 658 */
if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
- tlb_lli_2m[ENTRIES] = 1024;
+ tlb_lli_2m = 1024;
} else {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
}
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ tlb_lli_4m = tlb_lli_2m >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
}
static const struct cpu_dev amd_cpu_dev = {
@@ -1190,7 +1213,7 @@ void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr)
if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask)
return;
- wrmsr(amd_msr_dr_addr_masks[dr], mask, 0);
+ wrmsrq(amd_msr_dr_addr_masks[dr], mask);
per_cpu(amd_dr_addr_mask, cpu)[dr] = mask;
}
@@ -1221,3 +1244,56 @@ void amd_check_microcode(void)
if (cpu_feature_enabled(X86_FEATURE_ZEN2))
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
+
+static const char * const s5_reset_reason_txt[] = {
+ [0] = "thermal pin BP_THERMTRIP_L was tripped",
+ [1] = "power button was pressed for 4 seconds",
+ [2] = "shutdown pin was tripped",
+ [4] = "remote ASF power off command was received",
+ [9] = "internal CPU thermal limit was tripped",
+ [16] = "system reset pin BP_SYS_RST_L was tripped",
+ [17] = "software issued PCI reset",
+ [18] = "software wrote 0x4 to reset control register 0xCF9",
+ [19] = "software wrote 0x6 to reset control register 0xCF9",
+ [20] = "software wrote 0xE to reset control register 0xCF9",
+ [21] = "ACPI power state transition occurred",
+ [22] = "keyboard reset pin KB_RST_L was tripped",
+ [23] = "internal CPU shutdown event occurred",
+ [24] = "system failed to boot before failed boot timer expired",
+ [25] = "hardware watchdog timer expired",
+ [26] = "remote ASF reset command was received",
+ [27] = "an uncorrected error caused a data fabric sync flood event",
+ [29] = "FCH and MP1 failed warm reset handshake",
+ [30] = "a parity error occurred",
+ [31] = "a software sync flood event occurred",
+};
+
+static __init int print_s5_reset_status_mmio(void)
+{
+ unsigned long value;
+ void __iomem *addr;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
+
+ addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value));
+ if (!addr)
+ return 0;
+
+ value = ioread32(addr);
+ iounmap(addr);
+
+ for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
+ if (!(value & BIT(i)))
+ continue;
+
+ if (s5_reset_reason_txt[i]) {
+ pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n",
+ value, s5_reset_reason_txt[i]);
+ }
+ }
+
+ return 0;
+}
+late_initcall(print_s5_reset_status_mmio);
diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c
new file mode 100644
index 000000000000..8843b9557aea
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_cache_disable.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD L3 cache_disable_{0,1} sysfs handling
+ * Documentation/ABI/testing/sysfs-devices-system-cpu
+ */
+
+#include <linux/cacheinfo.h>
+#include <linux/capability.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+
+#include <asm/amd/nb.h>
+
+#include "cpu.h"
+
+/*
+ * L3 cache descriptors
+ */
+static void amd_calc_l3_indices(struct amd_northbridge *nb)
+{
+ struct amd_l3_cache *l3 = &nb->l3_cache;
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(nb->misc, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+
+ if (boot_cpu_data.x86 == 0x15) {
+ l3->subcaches[0] = sc0 += !(val & BIT(1));
+ l3->subcaches[1] = sc1 += !(val & BIT(5));
+ }
+
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
+}
+
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot)
+{
+ unsigned int reg = 0;
+
+ pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
+
+ return -1;
+}
+
+static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot)
+{
+ int index;
+ struct amd_northbridge *nb = ci->priv;
+
+ index = amd_get_l3_disable_slot(nb, slot);
+ if (index >= 0)
+ return sysfs_emit(buf, "%d\n", index);
+
+ return sysfs_emit(buf, "FREE\n");
+}
+
+#define SHOW_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return show_cache_disable(ci, buf, slot); \
+}
+
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long idx)
+{
+ int i;
+
+ idx |= BIT(30);
+
+ /*
+ * disable index in all 4 subcaches
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20);
+
+ if (!nb->l3_cache.subcaches[i])
+ continue;
+
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+ * cache which indices we disable therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31);
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+ }
+}
+
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3: L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long index)
+{
+ int ret = 0;
+
+ /* check if @slot is already used or the index is already disabled */
+ ret = amd_get_l3_disable_slot(nb, slot);
+ if (ret >= 0)
+ return -EEXIST;
+
+ if (index > nb->l3_cache.indices)
+ return -EINVAL;
+
+ /* check whether the other slot has disabled the same index already */
+ if (index == amd_get_l3_disable_slot(nb, !slot))
+ return -EEXIST;
+
+ amd_l3_disable_index(nb, cpu, slot, index);
+
+ return 0;
+}
+
+static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf,
+ size_t count, unsigned int slot)
+{
+ struct amd_northbridge *nb = ci->priv;
+ unsigned long val = 0;
+ int cpu, err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cpu = cpumask_first(&ci->shared_cpu_map);
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ pr_warn("L3 slot %d in use/index already disabled!\n",
+ slot);
+ return err;
+ }
+ return count;
+}
+
+#define STORE_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return store_cache_disable(ci, buf, count, slot); \
+}
+
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+
+ return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu));
+}
+
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+ unsigned long val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (kstrtoul(buf, 16, &val) < 0)
+ return -EINVAL;
+
+ if (amd_set_subcaches(cpu, val))
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!ci->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ static struct attribute **amd_l3_attrs;
+ int n = 1;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci)
+{
+ struct amd_northbridge *nb = ci->priv;
+
+ if (ci->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ struct amd_northbridge *nb;
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return NULL;
+
+ node = topology_amd_node_id(smp_processor_id());
+ nb = node_to_amd_nb(node);
+ if (nb && !nb->l3_cache.indices)
+ amd_calc_l3_indices(nb);
+
+ return nb;
+}
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index f642de2ebdac..a315b0627dfb 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -20,6 +20,7 @@
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
+#include <asm/msr.h>
#include "cpu.h"
@@ -40,8 +41,8 @@ static void init_counter_refs(void)
{
u64 aperf, mperf;
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
this_cpu_write(cpu_samples.aperf, aperf);
this_cpu_write(cpu_samples.mperf, mperf);
@@ -99,7 +100,7 @@ static bool __init turbo_disabled(void)
u64 misc_en;
int err;
- err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
if (err)
return false;
@@ -110,11 +111,11 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
int err;
- err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
if (err)
return false;
- err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
if (err)
return false;
@@ -152,13 +153,13 @@ static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
int err, i;
u64 msr;
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
@@ -190,17 +191,17 @@ static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int s
u32 group_size;
int err, i;
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
if (err)
return false;
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
if (err)
return false;
@@ -220,11 +221,11 @@ static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
u64 msr;
int err;
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
@@ -474,8 +475,8 @@ void arch_scale_freq_tick(void)
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return;
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
acnt = aperf - s->aperf;
mcnt = mperf - s->mperf;
@@ -498,7 +499,7 @@ void arch_scale_freq_tick(void)
*/
#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
-unsigned int arch_freq_get_on_cpu(int cpu)
+int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
unsigned int seq, freq;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index a5d0998d7604..7f94e6a5497d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -34,21 +34,66 @@
#include "cpu.h"
+/*
+ * Speculation Vulnerability Handling
+ *
+ * Each vulnerability is handled with the following functions:
+ * <vuln>_select_mitigation() -- Selects a mitigation to use. This should
+ * take into account all relevant command line
+ * options.
+ * <vuln>_update_mitigation() -- This is called after all vulnerabilities have
+ * selected a mitigation, in case the selection
+ * may want to change based on other choices
+ * made. This function is optional.
+ * <vuln>_apply_mitigation() -- Enable the selected mitigation.
+ *
+ * The compile-time mitigation in all cases should be AUTO. An explicit
+ * command-line option can override AUTO. If no such option is
+ * provided, <vuln>_select_mitigation() will override AUTO to the best
+ * mitigation option.
+ */
+
static void __init spectre_v1_select_mitigation(void);
+static void __init spectre_v1_apply_mitigation(void);
static void __init spectre_v2_select_mitigation(void);
+static void __init spectre_v2_update_mitigation(void);
+static void __init spectre_v2_apply_mitigation(void);
static void __init retbleed_select_mitigation(void);
+static void __init retbleed_update_mitigation(void);
+static void __init retbleed_apply_mitigation(void);
static void __init spectre_v2_user_select_mitigation(void);
+static void __init spectre_v2_user_update_mitigation(void);
+static void __init spectre_v2_user_apply_mitigation(void);
static void __init ssb_select_mitigation(void);
+static void __init ssb_apply_mitigation(void);
static void __init l1tf_select_mitigation(void);
+static void __init l1tf_apply_mitigation(void);
static void __init mds_select_mitigation(void);
-static void __init md_clear_update_mitigation(void);
-static void __init md_clear_select_mitigation(void);
+static void __init mds_update_mitigation(void);
+static void __init mds_apply_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init taa_update_mitigation(void);
+static void __init taa_apply_mitigation(void);
static void __init mmio_select_mitigation(void);
+static void __init mmio_update_mitigation(void);
+static void __init mmio_apply_mitigation(void);
+static void __init rfds_select_mitigation(void);
+static void __init rfds_update_mitigation(void);
+static void __init rfds_apply_mitigation(void);
static void __init srbds_select_mitigation(void);
+static void __init srbds_apply_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
static void __init srso_select_mitigation(void);
+static void __init srso_update_mitigation(void);
+static void __init srso_apply_mitigation(void);
static void __init gds_select_mitigation(void);
+static void __init gds_apply_mitigation(void);
+static void __init bhi_select_mitigation(void);
+static void __init bhi_update_mitigation(void);
+static void __init bhi_apply_mitigation(void);
+static void __init its_select_mitigation(void);
+static void __init its_update_mitigation(void);
+static void __init its_apply_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
@@ -59,7 +104,6 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
-EXPORT_SYMBOL_GPL(x86_pred_cmd);
static u64 __ro_after_init x86_arch_cap_msr;
@@ -67,11 +111,19 @@ static DEFINE_MUTEX(spec_ctrl_mutex);
void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
+static void __init set_return_thunk(void *thunk)
+{
+ if (x86_return_thunk != __x86_return_thunk)
+ pr_warn("x86/bugs: return thunk changed\n");
+
+ x86_return_thunk = thunk;
+}
+
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
static void update_spec_ctrl(u64 val)
{
this_cpu_write(x86_spec_ctrl_current, val);
- wrmsrl(MSR_IA32_SPEC_CTRL, val);
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
}
/*
@@ -90,7 +142,7 @@ void update_spec_ctrl_cond(u64 val)
* forced the update can be delayed until that time.
*/
if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
- wrmsrl(MSR_IA32_SPEC_CTRL, val);
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
}
noinstr u64 spec_ctrl_current(void)
@@ -113,6 +165,10 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+/* Control IBPB on vCPU load */
+DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
@@ -124,9 +180,13 @@ EXPORT_SYMBOL_GPL(mds_idle_clear);
*/
DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
-/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */
-DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
-EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
+/*
+ * Controls CPU Fill buffer clear before VMenter. This is a subset of
+ * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only
+ * mitigation is required.
+ */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
void __init cpu_select_mitigations(void)
{
@@ -136,7 +196,7 @@ void __init cpu_select_mitigations(void)
* init code as it is not enumerated and depends on the family.
*/
if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
- rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
/*
* Previously running kernel (kexec), may have some controls
@@ -151,30 +211,67 @@ void __init cpu_select_mitigations(void)
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
- /*
- * retbleed_select_mitigation() relies on the state set by
- * spectre_v2_select_mitigation(); specifically it wants to know about
- * spectre_v2=ibrs.
- */
retbleed_select_mitigation();
- /*
- * spectre_v2_user_select_mitigation() relies on the state set by
- * retbleed_select_mitigation(); specifically the STIBP selection is
- * forced for UNRET or IBPB.
- */
spectre_v2_user_select_mitigation();
ssb_select_mitigation();
l1tf_select_mitigation();
- md_clear_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+ rfds_select_mitigation();
srbds_select_mitigation();
l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
+ its_select_mitigation();
+ bhi_select_mitigation();
/*
- * srso_select_mitigation() depends and must run after
- * retbleed_select_mitigation().
+ * After mitigations are selected, some may need to update their
+ * choices.
*/
- srso_select_mitigation();
- gds_select_mitigation();
+ spectre_v2_update_mitigation();
+ /*
+ * retbleed_update_mitigation() relies on the state set by
+ * spectre_v2_update_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_update_mitigation();
+ /*
+ * its_update_mitigation() depends on spectre_v2_update_mitigation()
+ * and retbleed_update_mitigation().
+ */
+ its_update_mitigation();
+
+ /*
+ * spectre_v2_user_update_mitigation() depends on
+ * retbleed_update_mitigation(), specifically the STIBP
+ * selection is forced for UNRET or IBPB.
+ */
+ spectre_v2_user_update_mitigation();
+ mds_update_mitigation();
+ taa_update_mitigation();
+ mmio_update_mitigation();
+ rfds_update_mitigation();
+ bhi_update_mitigation();
+ /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+ srso_update_mitigation();
+
+ spectre_v1_apply_mitigation();
+ spectre_v2_apply_mitigation();
+ retbleed_apply_mitigation();
+ spectre_v2_user_apply_mitigation();
+ ssb_apply_mitigation();
+ l1tf_apply_mitigation();
+ mds_apply_mitigation();
+ taa_apply_mitigation();
+ mmio_apply_mitigation();
+ rfds_apply_mitigation();
+ srbds_apply_mitigation();
+ srso_apply_mitigation();
+ gds_apply_mitigation();
+ its_apply_mitigation();
+ bhi_apply_mitigation();
}
/*
@@ -224,9 +321,9 @@ static void x86_amd_ssb_disable(void)
u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
- wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+ wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
- wrmsrl(MSR_AMD64_LS_CFG, msrval);
+ wrmsrq(MSR_AMD64_LS_CFG, msrval);
}
#undef pr_fmt
@@ -234,7 +331,7 @@ static void x86_amd_ssb_disable(void)
/* Default mitigation for MDS-affected CPUs */
static enum mds_mitigations mds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
@@ -243,6 +340,46 @@ static const char * const mds_strings[] = {
[MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
};
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_AUTO,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF;
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_AUTO,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF;
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_AUTO,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+
+/*
+ * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing
+ * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry.
+ */
+static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
+
static void __init mds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
@@ -250,12 +387,37 @@ static void __init mds_select_mitigation(void)
return;
}
+ if (mds_mitigation == MDS_MITIGATION_AUTO)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
+ if (mds_mitigation == MDS_MITIGATION_OFF)
+ return;
+
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ return;
+
+ /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
if (mds_mitigation == MDS_MITIGATION_FULL) {
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
+ }
- setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ pr_info("%s\n", mds_strings[mds_mitigation]);
+}
+static void __init mds_apply_mitigation(void)
+{
+ if (mds_mitigation == MDS_MITIGATION_FULL ||
+ mds_mitigation == MDS_MITIGATION_VMWERV) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
(mds_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
@@ -286,16 +448,6 @@ early_param("mds", mds_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "TAA: " fmt
-enum taa_mitigations {
- TAA_MITIGATION_OFF,
- TAA_MITIGATION_UCODE_NEEDED,
- TAA_MITIGATION_VERW,
- TAA_MITIGATION_TSX_DISABLED,
-};
-
-/* Default mitigation for TAA-affected CPUs */
-static enum taa_mitigations taa_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF;
static bool taa_nosmt __ro_after_init;
static const char * const taa_strings[] = {
@@ -305,6 +457,11 @@ static const char * const taa_strings[] = {
[TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
};
+static bool __init taa_vulnerable(void)
+{
+ return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM);
+}
+
static void __init taa_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_TAA)) {
@@ -318,48 +475,63 @@ static void __init taa_select_mitigation(void)
return;
}
- if (cpu_mitigations_off()) {
+ if (cpu_mitigations_off())
taa_mitigation = TAA_MITIGATION_OFF;
- return;
- }
- /*
- * TAA mitigation via VERW is turned off if both
- * tsx_async_abort=off and mds=off are specified.
- */
- if (taa_mitigation == TAA_MITIGATION_OFF &&
- mds_mitigation == MDS_MITIGATION_OFF)
+ /* Microcode will be checked in taa_update_mitigation(). */
+ if (taa_mitigation == TAA_MITIGATION_AUTO)
+ taa_mitigation = TAA_MITIGATION_VERW;
+
+ if (taa_mitigation != TAA_MITIGATION_OFF)
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init taa_update_mitigation(void)
+{
+ if (!taa_vulnerable() || cpu_mitigations_off())
return;
- if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ if (verw_clear_cpu_buf_mitigation_selected)
taa_mitigation = TAA_MITIGATION_VERW;
- else
- taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
- /*
- * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
- * A microcode update fixes this behavior to clear CPU buffers. It also
- * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
- * ARCH_CAP_TSX_CTRL_MSR bit.
- *
- * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
- * update is required.
- */
- if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
- !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
- taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ if (taa_mitigation == TAA_MITIGATION_VERW) {
+ /* Check if the requisite ucode is available. */
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
- /*
- * TSX is enabled, select alternate mitigation for TAA which is
- * the same as MDS. Enable MDS static branch to clear CPU buffers.
- *
- * For guests that can't determine whether the correct microcode is
- * present on host, enable the mitigation for UCODE_NEEDED as well.
- */
- setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ }
- if (taa_nosmt || cpu_mitigations_auto_nosmt())
- cpu_smt_disable(false);
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static void __init taa_apply_mitigation(void)
+{
+ if (taa_mitigation == TAA_MITIGATION_VERW ||
+ taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) {
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+ }
}
static int __init tsx_async_abort_parse_cmdline(char *str)
@@ -386,15 +558,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "MMIO Stale Data: " fmt
-enum mmio_mitigations {
- MMIO_MITIGATION_OFF,
- MMIO_MITIGATION_UCODE_NEEDED,
- MMIO_MITIGATION_VERW,
-};
-
-/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
-static enum mmio_mitigations mmio_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF;
static bool mmio_nosmt __ro_after_init = false;
static const char * const mmio_strings[] = {
@@ -406,31 +569,67 @@ static const char * const mmio_strings[] = {
static void __init mmio_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
- boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) ||
cpu_mitigations_off()) {
mmio_mitigation = MMIO_MITIGATION_OFF;
return;
}
+ /* Microcode will be checked in mmio_update_mitigation(). */
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
/*
* Enable CPU buffer clear mitigation for host and VMM, if also affected
- * by MDS or TAA. Otherwise, enable mitigation for VMM only.
+ * by MDS or TAA.
*/
- if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
- boot_cpu_has(X86_FEATURE_RTM)))
- setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mmio_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+
+ if (mmio_mitigation == MMIO_MITIGATION_VERW) {
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))))
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init mmio_apply_mitigation(void)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
/*
- * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based
- * mitigations, disable KVM-only mitigation in that case.
+ * Only enable the VMM mitigation if the CPU buffer clear mitigation is
+ * not being used.
*/
- if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
- static_branch_disable(&mmio_stale_data_clear);
- else
- static_branch_enable(&mmio_stale_data_clear);
+ if (verw_clear_cpu_buf_mitigation_selected) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ static_branch_disable(&cpu_buf_vm_clear);
+ } else {
+ static_branch_enable(&cpu_buf_vm_clear);
+ }
/*
* If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
@@ -440,21 +639,6 @@ static void __init mmio_select_mitigation(void)
if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
static_branch_enable(&mds_idle_clear);
- /*
- * Check if the system has the right microcode.
- *
- * CPU Fill buffer clear mitigation is enumerated by either an explicit
- * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
- * affected systems.
- */
- if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
- (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
- boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
- !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))
- mmio_mitigation = MMIO_MITIGATION_VERW;
- else
- mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
-
if (mmio_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
}
@@ -483,35 +667,54 @@ early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Register File Data Sampling: " fmt
-enum rfds_mitigations {
- RFDS_MITIGATION_OFF,
- RFDS_MITIGATION_VERW,
- RFDS_MITIGATION_UCODE_NEEDED,
-};
-
-/* Default mitigation for Register File Data Sampling */
-static enum rfds_mitigations rfds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF;
-
static const char * const rfds_strings[] = {
[RFDS_MITIGATION_OFF] = "Vulnerable",
[RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
[RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
};
+static inline bool __init verw_clears_cpu_reg_file(void)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR);
+}
+
static void __init rfds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
rfds_mitigation = RFDS_MITIGATION_OFF;
return;
}
+
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
- if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ if (verw_clears_cpu_reg_file())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init rfds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ if (!verw_clears_cpu_reg_file())
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static void __init rfds_apply_mitigation(void)
+{
+ if (rfds_mitigation == RFDS_MITIGATION_VERW)
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
- else
- rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
}
static __init int rfds_parse_cmdline(char *str)
@@ -532,76 +735,11 @@ static __init int rfds_parse_cmdline(char *str)
early_param("reg_file_data_sampling", rfds_parse_cmdline);
#undef pr_fmt
-#define pr_fmt(fmt) "" fmt
-
-static void __init md_clear_update_mitigation(void)
-{
- if (cpu_mitigations_off())
- return;
-
- if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
- goto out;
-
- /*
- * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO
- * Stale Data mitigation, if necessary.
- */
- if (mds_mitigation == MDS_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MDS)) {
- mds_mitigation = MDS_MITIGATION_FULL;
- mds_select_mitigation();
- }
- if (taa_mitigation == TAA_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_TAA)) {
- taa_mitigation = TAA_MITIGATION_VERW;
- taa_select_mitigation();
- }
- /*
- * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear
- * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state.
- */
- if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
- mmio_mitigation = MMIO_MITIGATION_VERW;
- mmio_select_mitigation();
- }
- if (rfds_mitigation == RFDS_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_RFDS)) {
- rfds_mitigation = RFDS_MITIGATION_VERW;
- rfds_select_mitigation();
- }
-out:
- if (boot_cpu_has_bug(X86_BUG_MDS))
- pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
- if (boot_cpu_has_bug(X86_BUG_TAA))
- pr_info("TAA: %s\n", taa_strings[taa_mitigation]);
- if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
- pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
- else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
- pr_info("MMIO Stale Data: Unknown: No mitigations\n");
- if (boot_cpu_has_bug(X86_BUG_RFDS))
- pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]);
-}
-
-static void __init md_clear_select_mitigation(void)
-{
- mds_select_mitigation();
- taa_select_mitigation();
- mmio_select_mitigation();
- rfds_select_mitigation();
-
- /*
- * As these mitigations are inter-related and rely on VERW instruction
- * to clear the microarchitural buffers, update and print their status
- * after mitigation selection is done for each of these vulnerabilities.
- */
- md_clear_update_mitigation();
-}
-
-#undef pr_fmt
#define pr_fmt(fmt) "SRBDS: " fmt
enum srbds_mitigations {
SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_AUTO,
SRBDS_MITIGATION_UCODE_NEEDED,
SRBDS_MITIGATION_FULL,
SRBDS_MITIGATION_TSX_OFF,
@@ -609,7 +747,7 @@ enum srbds_mitigations {
};
static enum srbds_mitigations srbds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF;
static const char * const srbds_strings[] = {
[SRBDS_MITIGATION_OFF] = "Vulnerable",
@@ -641,7 +779,7 @@ void update_srbds_msr(void)
if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
return;
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
switch (srbds_mitigation) {
case SRBDS_MITIGATION_OFF:
@@ -655,13 +793,18 @@ void update_srbds_msr(void)
break;
}
- wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
}
static void __init srbds_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
return;
+ }
+
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO)
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
/*
* Check to see if this is one of the MDS_NO systems supporting TSX that
@@ -675,13 +818,17 @@ static void __init srbds_select_mitigation(void)
srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
- else if (cpu_mitigations_off() || srbds_off)
+ else if (srbds_off)
srbds_mitigation = SRBDS_MITIGATION_OFF;
- update_srbds_msr();
pr_info("%s\n", srbds_strings[srbds_mitigation]);
}
+static void __init srbds_apply_mitigation(void)
+{
+ update_srbds_msr();
+}
+
static int __init srbds_parse_cmdline(char *str)
{
if (!str)
@@ -728,6 +875,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline);
enum gds_mitigations {
GDS_MITIGATION_OFF,
+ GDS_MITIGATION_AUTO,
GDS_MITIGATION_UCODE_NEEDED,
GDS_MITIGATION_FORCE,
GDS_MITIGATION_FULL,
@@ -736,7 +884,7 @@ enum gds_mitigations {
};
static enum gds_mitigations gds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF;
static const char * const gds_strings[] = {
[GDS_MITIGATION_OFF] = "Vulnerable",
@@ -761,7 +909,7 @@ void update_gds_msr(void)
switch (gds_mitigation) {
case GDS_MITIGATION_OFF:
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
mcu_ctrl |= GDS_MITG_DIS;
break;
case GDS_MITIGATION_FULL_LOCKED:
@@ -771,23 +919,24 @@ void update_gds_msr(void)
* CPUs.
*/
case GDS_MITIGATION_FULL:
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
mcu_ctrl &= ~GDS_MITG_DIS;
break;
case GDS_MITIGATION_FORCE:
case GDS_MITIGATION_UCODE_NEEDED:
case GDS_MITIGATION_HYPERVISOR:
+ case GDS_MITIGATION_AUTO:
return;
}
- wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
/*
* Check to make sure that the WRMSR value was not ignored. Writes to
* GDS_MITG_DIS will be ignored if this processor is locked but the boot
* processor was not.
*/
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
}
@@ -800,33 +949,28 @@ static void __init gds_select_mitigation(void)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
gds_mitigation = GDS_MITIGATION_HYPERVISOR;
- goto out;
+ return;
}
if (cpu_mitigations_off())
gds_mitigation = GDS_MITIGATION_OFF;
/* Will verify below that mitigation _can_ be disabled */
+ if (gds_mitigation == GDS_MITIGATION_AUTO)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
/* No microcode */
if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
- if (gds_mitigation == GDS_MITIGATION_FORCE) {
- /*
- * This only needs to be done on the boot CPU so do it
- * here rather than in update_gds_msr()
- */
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
- } else {
+ if (gds_mitigation != GDS_MITIGATION_FORCE)
gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
- }
- goto out;
+ return;
}
/* Microcode has mitigation, use it */
if (gds_mitigation == GDS_MITIGATION_FORCE)
gds_mitigation = GDS_MITIGATION_FULL;
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
if (mcu_ctrl & GDS_MITG_LOCKED) {
if (gds_mitigation == GDS_MITIGATION_OFF)
pr_warn("Mitigation locked. Disable failed.\n");
@@ -840,9 +984,25 @@ static void __init gds_select_mitigation(void)
*/
gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
}
+}
+
+static void __init gds_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ /* Microcode is present */
+ if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)
+ update_gds_msr();
+ else if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+ * This only needs to be done on the boot CPU so do it
+ * here rather than in update_gds_msr()
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ }
- update_gds_msr();
-out:
pr_info("%s\n", gds_strings[gds_mitigation]);
}
@@ -903,10 +1063,14 @@ static bool smap_works_speculatively(void)
static void __init spectre_v1_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+}
+
+static void __init spectre_v1_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
return;
- }
if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
/*
@@ -959,8 +1123,20 @@ enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
#undef pr_fmt
#define pr_fmt(fmt) "RETBleed: " fmt
+enum its_mitigation {
+ ITS_MITIGATION_OFF,
+ ITS_MITIGATION_AUTO,
+ ITS_MITIGATION_VMEXIT_ONLY,
+ ITS_MITIGATION_ALIGNED_THUNKS,
+ ITS_MITIGATION_RETPOLINE_STUFF,
+};
+
+static enum its_mitigation its_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF;
+
enum retbleed_mitigation {
RETBLEED_MITIGATION_NONE,
+ RETBLEED_MITIGATION_AUTO,
RETBLEED_MITIGATION_UNRET,
RETBLEED_MITIGATION_IBPB,
RETBLEED_MITIGATION_IBRS,
@@ -968,14 +1144,6 @@ enum retbleed_mitigation {
RETBLEED_MITIGATION_STUFF,
};
-enum retbleed_mitigation_cmd {
- RETBLEED_CMD_OFF,
- RETBLEED_CMD_AUTO,
- RETBLEED_CMD_UNRET,
- RETBLEED_CMD_IBPB,
- RETBLEED_CMD_STUFF,
-};
-
static const char * const retbleed_strings[] = {
[RETBLEED_MITIGATION_NONE] = "Vulnerable",
[RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk",
@@ -986,9 +1154,7 @@ static const char * const retbleed_strings[] = {
};
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
- RETBLEED_MITIGATION_NONE;
-static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE;
static int __ro_after_init retbleed_nosmt = false;
@@ -1005,15 +1171,15 @@ static int __init retbleed_parse_cmdline(char *str)
}
if (!strcmp(str, "off")) {
- retbleed_cmd = RETBLEED_CMD_OFF;
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
} else if (!strcmp(str, "auto")) {
- retbleed_cmd = RETBLEED_CMD_AUTO;
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
} else if (!strcmp(str, "unret")) {
- retbleed_cmd = RETBLEED_CMD_UNRET;
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
} else if (!strcmp(str, "ibpb")) {
- retbleed_cmd = RETBLEED_CMD_IBPB;
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
} else if (!strcmp(str, "stuff")) {
- retbleed_cmd = RETBLEED_CMD_STUFF;
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
} else if (!strcmp(str, "nosmt")) {
retbleed_nosmt = true;
} else if (!strcmp(str, "force")) {
@@ -1034,77 +1200,122 @@ early_param("retbleed", retbleed_parse_cmdline);
static void __init retbleed_select_mitigation(void)
{
- bool mitigate_smt = false;
-
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
- return;
-
- switch (retbleed_cmd) {
- case RETBLEED_CMD_OFF:
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
return;
+ }
- case RETBLEED_CMD_UNRET:
- if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
- retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
- } else {
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_UNRET:
+ if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n");
- goto do_cmd_auto;
}
break;
-
- case RETBLEED_CMD_IBPB:
+ case RETBLEED_MITIGATION_IBPB:
if (!boot_cpu_has(X86_FEATURE_IBPB)) {
pr_err("WARNING: CPU does not support IBPB.\n");
- goto do_cmd_auto;
- } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
- retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
- } else {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
- goto do_cmd_auto;
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
}
break;
+ case RETBLEED_MITIGATION_STUFF:
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ }
+ break;
+ default:
+ break;
+ }
- case RETBLEED_CMD_STUFF:
- if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) &&
- spectre_v2_enabled == SPECTRE_V2_RETPOLINE) {
- retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+ if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
+ return;
+
+ /* Intel mitigation selected in retbleed_update_mitigation() */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
+ boot_cpu_has(X86_FEATURE_IBPB))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+}
+
+static void __init retbleed_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
+ return;
+ if (retbleed_mitigation == RETBLEED_MITIGATION_NONE)
+ goto out;
+
+ /*
+ * retbleed=stuff is only allowed on Intel. If stuffing can't be used
+ * then a different mitigation will be selected below.
+ *
+ * its=stuff will also attempt to enable stuffing.
+ */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF ||
+ its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) {
+ if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) {
+ pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
} else {
- if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))
- pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
- else
- pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ pr_info("Retbleed mitigation updated to stuffing\n");
- goto do_cmd_auto;
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
}
- break;
-
-do_cmd_auto:
- case RETBLEED_CMD_AUTO:
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
- retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
- else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
- boot_cpu_has(X86_FEATURE_IBPB))
- retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ }
+ /*
+ * Let IBRS trump all on Intel without affecting the effects of the
+ * retbleed= cmdline option except for call depth based stuffing
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_IBRS:
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ break;
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ break;
+ default:
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ pr_err(RETBLEED_INTEL_MSG);
}
+ /* If nothing has set the mitigation yet, default to NONE. */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO)
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+out:
+ pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+}
- /*
- * The Intel mitigation (IBRS or eIBRS) was already selected in
- * spectre_v2_select_mitigation(). 'retbleed_mitigation' will
- * be set accordingly below.
- */
- break;
- }
+static void __init retbleed_apply_mitigation(void)
+{
+ bool mitigate_smt = false;
switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_NONE:
+ return;
+
case RETBLEED_MITIGATION_UNRET:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
- x86_return_thunk = retbleed_return_thunk;
+ set_return_thunk(retbleed_return_thunk);
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -1127,7 +1338,7 @@ do_cmd_auto:
setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
/*
- * There is no need for RSB filling: entry_ibpb() ensures
+ * There is no need for RSB filling: write_ibpb() ensures
* all predictions, including the RSB, are invalidated,
* regardless of IBPB implementation.
*/
@@ -1139,7 +1350,7 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
- x86_return_thunk = call_depth_return_thunk;
+ set_return_thunk(call_depth_return_thunk);
break;
default:
@@ -1149,28 +1360,131 @@ do_cmd_auto:
if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
(retbleed_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "ITS: " fmt
+
+static const char * const its_strings[] = {
+ [ITS_MITIGATION_OFF] = "Vulnerable",
+ [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected",
+ [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks",
+ [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB",
+};
+
+static int __init its_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) {
+ pr_err("Mitigation disabled at compile time, ignoring option (%s)", str);
+ return 0;
+ }
+
+ if (!strcmp(str, "off")) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ } else if (!strcmp(str, "on")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ } else if (!strcmp(str, "force")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ setup_force_cpu_bug(X86_BUG_ITS);
+ } else if (!strcmp(str, "vmexit")) {
+ its_mitigation = ITS_MITIGATION_VMEXIT_ONLY;
+ } else if (!strcmp(str, "stuff")) {
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ } else {
+ pr_err("Ignoring unknown indirect_target_selection option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("indirect_target_selection", its_parse_cmdline);
+
+static void __init its_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_AUTO)
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+
+ if (its_mitigation == ITS_MITIGATION_OFF)
+ return;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) {
+ pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) {
+ pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("RSB stuff mitigation not supported, using default\n");
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY &&
+ !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+}
+
+static void __init its_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off())
+ return;
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_NONE:
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ case SPECTRE_V2_RETPOLINE:
+ /* Retpoline+CDT mitigates ITS */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF)
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ break;
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ default:
+ break;
+ }
/*
- * Let IBRS trump all on Intel without affecting the effects of the
- * retbleed= cmdline option except for call depth based stuffing
+ * retbleed_update_mitigation() will try to do stuffing if its=stuff.
+ * If it can't, such as if spectre_v2!=retpoline, then fall back to
+ * aligned thunks.
*/
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
- switch (spectre_v2_enabled) {
- case SPECTRE_V2_IBRS:
- retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
- break;
- case SPECTRE_V2_EIBRS:
- case SPECTRE_V2_EIBRS_RETPOLINE:
- case SPECTRE_V2_EIBRS_LFENCE:
- retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
- break;
- default:
- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
- pr_err(RETBLEED_INTEL_MSG);
- }
- }
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
- pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+ pr_info("%s\n", its_strings[its_mitigation]);
+}
+
+static void __init its_apply_mitigation(void)
+{
+ /* its=stuff forces retbleed stuffing and is enabled there. */
+ if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS)
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
}
#undef pr_fmt
@@ -1250,6 +1564,8 @@ enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_IBRS,
};
+static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO;
+
enum spectre_v2_user_cmd {
SPECTRE_V2_USER_CMD_NONE,
SPECTRE_V2_USER_CMD_AUTO,
@@ -1288,22 +1604,13 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure)
pr_info("spectre_v2_user=%s forced on command line.\n", reason);
}
-static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
-
-static enum spectre_v2_user_cmd __init
-spectre_v2_parse_user_cmdline(void)
+static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void)
{
char arg[20];
int ret, i;
- switch (spectre_v2_cmd) {
- case SPECTRE_V2_CMD_NONE:
+ if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
return SPECTRE_V2_USER_CMD_NONE;
- case SPECTRE_V2_CMD_FORCE:
- return SPECTRE_V2_USER_CMD_FORCE;
- default:
- break;
- }
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
arg, sizeof(arg));
@@ -1318,7 +1625,7 @@ spectre_v2_parse_user_cmdline(void)
}
}
- pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
+ pr_err("Unknown user space protection option (%s). Switching to default\n", arg);
return SPECTRE_V2_USER_CMD_AUTO;
}
@@ -1327,65 +1634,72 @@ static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
}
-static void __init
-spectre_v2_user_select_mitigation(void)
+static void __init spectre_v2_user_select_mitigation(void)
{
- enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
- bool smt_possible = IS_ENABLED(CONFIG_SMP);
- enum spectre_v2_user_cmd cmd;
-
if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
return;
- if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
- cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
- smt_possible = false;
-
- cmd = spectre_v2_parse_user_cmdline();
- switch (cmd) {
+ switch (spectre_v2_parse_user_cmdline()) {
case SPECTRE_V2_USER_CMD_NONE:
- goto set_mode;
+ return;
case SPECTRE_V2_USER_CMD_FORCE:
- mode = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_PRCTL:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
- mode = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
break;
case SPECTRE_V2_USER_CMD_SECCOMP:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = spectre_v2_user_ibpb;
+ break;
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
if (IS_ENABLED(CONFIG_SECCOMP))
- mode = SPECTRE_V2_USER_SECCOMP;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP;
else
- mode = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
break;
}
- /* Initialize Indirect Branch Prediction Barrier */
- if (boot_cpu_has(X86_FEATURE_IBPB)) {
- setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ /*
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
- spectre_v2_user_ibpb = mode;
- switch (cmd) {
- case SPECTRE_V2_USER_CMD_NONE:
- break;
- case SPECTRE_V2_USER_CMD_FORCE:
- case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
- case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
- static_branch_enable(&switch_mm_always_ibpb);
- spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
- break;
- case SPECTRE_V2_USER_CMD_PRCTL:
- case SPECTRE_V2_USER_CMD_AUTO:
- case SPECTRE_V2_USER_CMD_SECCOMP:
- static_branch_enable(&switch_mm_cond_ibpb);
- break;
- }
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
- pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
- static_key_enabled(&switch_mm_always_ibpb) ?
- "always-on" : "conditional");
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+}
+
+static void __init spectre_v2_user_update_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ /* The spectre_v2 cmd line can override spectre_v2_user options */
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
}
/*
@@ -1401,32 +1715,46 @@ spectre_v2_user_select_mitigation(void)
* so allow for STIBP to be selected in those cases.
*/
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
- !smt_possible ||
+ !cpu_smt_possible() ||
(spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
- !boot_cpu_has(X86_FEATURE_AUTOIBRS)))
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))) {
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
return;
+ }
- /*
- * At this point, an STIBP mode other than "off" has been set.
- * If STIBP support is not being forced, check if STIBP always-on
- * is preferred.
- */
- if (mode != SPECTRE_V2_USER_STRICT &&
- boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
- mode = SPECTRE_V2_USER_STRICT_PREFERRED;
-
- if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
- retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
- if (mode != SPECTRE_V2_USER_STRICT &&
- mode != SPECTRE_V2_USER_STRICT_PREFERRED)
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE &&
+ (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) {
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT &&
+ spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED)
pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
- mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
}
+ pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]);
+}
- spectre_v2_user_stibp = mode;
+static void __init spectre_v2_user_apply_mitigation(void)
+{
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) {
+ static_branch_enable(&switch_vcpu_ibpb);
-set_mode:
- pr_info("%s\n", spectre_v2_user_strings[mode]);
+ switch (spectre_v2_user_ibpb) {
+ case SPECTRE_V2_USER_STRICT:
+ static_branch_enable(&switch_mm_always_ibpb);
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
+
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
}
static const char * const spectre_v2_strings[] = {
@@ -1578,51 +1906,54 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
rrsba_disabled = true;
}
-static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
{
/*
- * Similar to context switches, there are two types of RSB attacks
- * after VM exit:
+ * WARNING! There are many subtleties to consider when changing *any*
+ * code related to RSB-related mitigations. Before doing so, carefully
+ * read the following document, and update if necessary:
*
- * 1) RSB underflow
+ * Documentation/admin-guide/hw-vuln/rsb.rst
*
- * 2) Poisoned RSB entry
+ * In an overly simplified nutshell:
*
- * When retpoline is enabled, both are mitigated by filling/clearing
- * the RSB.
+ * - User->user RSB attacks are conditionally mitigated during
+ * context switches by cond_mitigation -> write_ibpb().
*
- * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
- * prediction isolation protections, RSB still needs to be cleared
- * because of #2. Note that SMEP provides no protection here, unlike
- * user-space-poisoned RSB entries.
+ * - User->kernel and guest->host attacks are mitigated by eIBRS or
+ * RSB filling.
*
- * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB
- * bug is present then a LITE version of RSB protection is required,
- * just a single call needs to retire before a RET is executed.
+ * Though, depending on config, note that other alternative
+ * mitigations may end up getting used instead, e.g., IBPB on
+ * entry/vmexit, call depth tracking, or return thunks.
*/
+
switch (mode) {
case SPECTRE_V2_NONE:
- return;
+ break;
- case SPECTRE_V2_EIBRS_LFENCE:
case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
- setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
}
- return;
+ break;
- case SPECTRE_V2_EIBRS_RETPOLINE:
case SPECTRE_V2_RETPOLINE:
case SPECTRE_V2_LFENCE:
case SPECTRE_V2_IBRS:
+ pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
- pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
- return;
- }
+ break;
- pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
- dump_stack();
+ default:
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+ dump_stack();
+ break;
+ }
}
/*
@@ -1643,12 +1974,13 @@ static bool __init spec_ctrl_bhi_dis(void)
enum bhi_mitigations {
BHI_MITIGATION_OFF,
+ BHI_MITIGATION_AUTO,
BHI_MITIGATION_ON,
BHI_MITIGATION_VMEXIT_ONLY,
};
static enum bhi_mitigations bhi_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
static int __init spectre_bhi_parse_cmdline(char *str)
{
@@ -1670,6 +2002,25 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline);
static void __init bhi_select_mitigation(void)
{
+ if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off())
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (bhi_mitigation == BHI_MITIGATION_AUTO)
+ bhi_mitigation = BHI_MITIGATION_ON;
+}
+
+static void __init bhi_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
+ spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+}
+
+static void __init bhi_apply_mitigation(void)
+{
if (bhi_mitigation == BHI_MITIGATION_OFF)
return;
@@ -1681,95 +2032,101 @@ static void __init bhi_select_mitigation(void)
return;
}
- /* Mitigate in hardware if supported */
- if (spec_ctrl_bhi_dis())
+ if (!IS_ENABLED(CONFIG_X86_64))
return;
- if (!IS_ENABLED(CONFIG_X86_64))
+ /* Mitigate in hardware if supported */
+ if (spec_ctrl_bhi_dis())
return;
if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
- setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
return;
}
pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
- setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
}
static void __init spectre_v2_select_mitigation(void)
{
- enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
- enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
+ spectre_v2_cmd = spectre_v2_parse_cmdline();
- /*
- * If the CPU is not affected and the command line mode is NONE or AUTO
- * then nothing to do.
- */
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
- (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
+ (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO))
return;
- switch (cmd) {
+ switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return;
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- mode = SPECTRE_V2_EIBRS;
- break;
- }
-
- if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
- boot_cpu_has_bug(X86_BUG_RETBLEED) &&
- retbleed_cmd != RETBLEED_CMD_OFF &&
- retbleed_cmd != RETBLEED_CMD_STUFF &&
- boot_cpu_has(X86_FEATURE_IBRS) &&
- boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
- mode = SPECTRE_V2_IBRS;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
}
- mode = spectre_v2_select_retpoline();
+ spectre_v2_enabled = spectre_v2_select_retpoline();
break;
case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
pr_err(SPECTRE_V2_LFENCE_MSG);
- mode = SPECTRE_V2_LFENCE;
+ spectre_v2_enabled = SPECTRE_V2_LFENCE;
break;
case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
- mode = SPECTRE_V2_RETPOLINE;
+ spectre_v2_enabled = SPECTRE_V2_RETPOLINE;
break;
case SPECTRE_V2_CMD_RETPOLINE:
- mode = spectre_v2_select_retpoline();
+ spectre_v2_enabled = spectre_v2_select_retpoline();
break;
case SPECTRE_V2_CMD_IBRS:
- mode = SPECTRE_V2_IBRS;
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
break;
case SPECTRE_V2_CMD_EIBRS:
- mode = SPECTRE_V2_EIBRS;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
case SPECTRE_V2_CMD_EIBRS_LFENCE:
- mode = SPECTRE_V2_EIBRS_LFENCE;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE;
break;
case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
- mode = SPECTRE_V2_EIBRS_RETPOLINE;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE;
break;
}
+}
+
+static void __init spectre_v2_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO &&
+ !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
+ boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ retbleed_mitigation != RETBLEED_MITIGATION_NONE &&
+ retbleed_mitigation != RETBLEED_MITIGATION_STUFF &&
+ boot_cpu_has(X86_FEATURE_IBRS) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ }
+ }
+
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off())
+ pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
+}
- if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+static void __init spectre_v2_apply_mitigation(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
- if (spectre_v2_in_ibrs_mode(mode)) {
+ if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) {
msr_set_bit(MSR_EFER, _EFER_AUTOIBRS);
} else {
@@ -1778,8 +2135,10 @@ static void __init spectre_v2_select_mitigation(void)
}
}
- switch (mode) {
+ switch (spectre_v2_enabled) {
case SPECTRE_V2_NONE:
+ return;
+
case SPECTRE_V2_EIBRS:
break;
@@ -1805,59 +2164,12 @@ static void __init spectre_v2_select_mitigation(void)
* JMPs gets protection against BHI and Intramode-BTI, but RET
* prediction from a non-RSB predictor is still a risk.
*/
- if (mode == SPECTRE_V2_EIBRS_LFENCE ||
- mode == SPECTRE_V2_EIBRS_RETPOLINE ||
- mode == SPECTRE_V2_RETPOLINE)
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE ||
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE ||
+ spectre_v2_enabled == SPECTRE_V2_RETPOLINE)
spec_ctrl_disable_kernel_rrsba();
- if (boot_cpu_has(X86_BUG_BHI))
- bhi_select_mitigation();
-
- spectre_v2_enabled = mode;
- pr_info("%s\n", spectre_v2_strings[mode]);
-
- /*
- * If Spectre v2 protection has been enabled, fill the RSB during a
- * context switch. In general there are two types of RSB attacks
- * across context switches, for which the CALLs/RETs may be unbalanced.
- *
- * 1) RSB underflow
- *
- * Some Intel parts have "bottomless RSB". When the RSB is empty,
- * speculated return targets may come from the branch predictor,
- * which could have a user-poisoned BTB or BHB entry.
- *
- * AMD has it even worse: *all* returns are speculated from the BTB,
- * regardless of the state of the RSB.
- *
- * When IBRS or eIBRS is enabled, the "user -> kernel" attack
- * scenario is mitigated by the IBRS branch prediction isolation
- * properties, so the RSB buffer filling wouldn't be necessary to
- * protect against this type of attack.
- *
- * The "user -> user" attack scenario is mitigated by RSB filling.
- *
- * 2) Poisoned RSB entry
- *
- * If the 'next' in-kernel return stack is shorter than 'prev',
- * 'next' could be tricked into speculating with a user-poisoned RSB
- * entry.
- *
- * The "user -> kernel" attack scenario is mitigated by SMEP and
- * eIBRS.
- *
- * The "user -> user" scenario, also known as SpectreBHB, requires
- * RSB clearing.
- *
- * So to mitigate all cases, unconditionally fill RSB on context
- * switches.
- *
- * FIXME: Is this pointless for retbleed-affected AMD?
- */
- setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
- pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
-
- spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
+ spectre_v2_select_rsb_mitigation(spectre_v2_enabled);
/*
* Retpoline protects the kernel, but doesn't protect firmware. IBRS
@@ -1865,28 +2177,26 @@ static void __init spectre_v2_select_mitigation(void)
* firmware calls only when IBRS / Enhanced / Automatic IBRS aren't
* otherwise enabled.
*
- * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
- * the user might select retpoline on the kernel command line and if
- * the CPU supports Enhanced IBRS, kernel might un-intentionally not
- * enable IBRS around firmware calls.
+ * Use "spectre_v2_enabled" to check Enhanced IBRS instead of
+ * boot_cpu_has(), because the user might select retpoline on the kernel
+ * command line and if the CPU supports Enhanced IBRS, kernel might
+ * un-intentionally not enable IBRS around firmware calls.
*/
if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
boot_cpu_has(X86_FEATURE_IBPB) &&
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) {
- if (retbleed_cmd != RETBLEED_CMD_IBPB) {
+ if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW);
pr_info("Enabling Speculation Barrier for firmware calls\n");
}
- } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) {
+ } else if (boot_cpu_has(X86_FEATURE_IBRS) &&
+ !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
-
- /* Set up IBPB and STIBP depending on the general spectre V2 command */
- spectre_v2_cmd = cmd;
}
static void update_stibp_msr(void * __unused)
@@ -1973,6 +2283,7 @@ void cpu_bugs_smt_update(void)
switch (mds_mitigation) {
case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
case MDS_MITIGATION_VMWERV:
if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
pr_warn_once(MDS_MSG_SMT);
@@ -1984,6 +2295,7 @@ void cpu_bugs_smt_update(void)
switch (taa_mitigation) {
case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
case TAA_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(TAA_MSG_SMT);
@@ -1995,6 +2307,7 @@ void cpu_bugs_smt_update(void)
switch (mmio_mitigation) {
case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
case MMIO_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(MMIO_MSG_SMT);
@@ -2072,19 +2385,18 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
return cmd;
}
-static enum ssb_mitigation __init __ssb_select_mitigation(void)
+static void __init ssb_select_mitigation(void)
{
- enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
enum ssb_mitigation_cmd cmd;
if (!boot_cpu_has(X86_FEATURE_SSBD))
- return mode;
+ goto out;
cmd = ssb_parse_cmdline();
if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
(cmd == SPEC_STORE_BYPASS_CMD_NONE ||
cmd == SPEC_STORE_BYPASS_CMD_AUTO))
- return mode;
+ return;
switch (cmd) {
case SPEC_STORE_BYPASS_CMD_SECCOMP:
@@ -2093,28 +2405,35 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
* enabled.
*/
if (IS_ENABLED(CONFIG_SECCOMP))
- mode = SPEC_STORE_BYPASS_SECCOMP;
+ ssb_mode = SPEC_STORE_BYPASS_SECCOMP;
else
- mode = SPEC_STORE_BYPASS_PRCTL;
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
break;
case SPEC_STORE_BYPASS_CMD_ON:
- mode = SPEC_STORE_BYPASS_DISABLE;
+ ssb_mode = SPEC_STORE_BYPASS_DISABLE;
break;
case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_PRCTL:
- mode = SPEC_STORE_BYPASS_PRCTL;
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
break;
case SPEC_STORE_BYPASS_CMD_NONE:
break;
}
+out:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+static void __init ssb_apply_mitigation(void)
+{
/*
* We have three CPU feature flags that are in play here:
* - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
* - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
* - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
*/
- if (mode == SPEC_STORE_BYPASS_DISABLE) {
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) {
setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
/*
* Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
@@ -2128,16 +2447,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
update_spec_ctrl(x86_spec_ctrl_base);
}
}
-
- return mode;
-}
-
-static void ssb_select_mitigation(void)
-{
- ssb_mode = __ssb_select_mitigation();
-
- if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
- pr_info("%s\n", ssb_strings[ssb_mode]);
}
#undef pr_fmt
@@ -2393,7 +2702,7 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
/* Default mitigation for L1TF-affected CPUs */
enum l1tf_mitigations l1tf_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF;
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
#if IS_ENABLED(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(l1tf_mitigation);
#endif
@@ -2441,22 +2750,33 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
static void __init l1tf_select_mitigation(void)
{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (l1tf_mitigation == L1TF_MITIGATION_AUTO) {
+ if (cpu_mitigations_auto_nosmt())
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ }
+}
+
+static void __init l1tf_apply_mitigation(void)
+{
u64 half_pa;
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return;
- if (cpu_mitigations_off())
- l1tf_mitigation = L1TF_MITIGATION_OFF;
- else if (cpu_mitigations_auto_nosmt())
- l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
-
override_cache_bits(&boot_cpu_data);
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_AUTO:
break;
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
@@ -2516,20 +2836,14 @@ early_param("l1tf", l1tf_cmdline);
enum srso_mitigation {
SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
SRSO_MITIGATION_UCODE_NEEDED,
SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
SRSO_MITIGATION_MICROCODE,
SRSO_MITIGATION_SAFE_RET,
SRSO_MITIGATION_IBPB,
SRSO_MITIGATION_IBPB_ON_VMEXIT,
-};
-
-enum srso_mitigation_cmd {
- SRSO_CMD_OFF,
- SRSO_CMD_MICROCODE,
- SRSO_CMD_SAFE_RET,
- SRSO_CMD_IBPB,
- SRSO_CMD_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
};
static const char * const srso_strings[] = {
@@ -2539,11 +2853,11 @@ static const char * const srso_strings[] = {
[SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
[SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
[SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
- [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
-static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
-static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET;
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
static int __init srso_parse_cmdline(char *str)
{
@@ -2551,15 +2865,15 @@ static int __init srso_parse_cmdline(char *str)
return -EINVAL;
if (!strcmp(str, "off"))
- srso_cmd = SRSO_CMD_OFF;
+ srso_mitigation = SRSO_MITIGATION_NONE;
else if (!strcmp(str, "microcode"))
- srso_cmd = SRSO_CMD_MICROCODE;
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
else if (!strcmp(str, "safe-ret"))
- srso_cmd = SRSO_CMD_SAFE_RET;
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
else if (!strcmp(str, "ibpb"))
- srso_cmd = SRSO_CMD_IBPB;
+ srso_mitigation = SRSO_MITIGATION_IBPB;
else if (!strcmp(str, "ibpb-vmexit"))
- srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT;
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
else
pr_err("Ignoring unknown SRSO option (%s).", str);
@@ -2571,127 +2885,138 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
+ bool has_microcode;
- if (!boot_cpu_has_bug(X86_BUG_SRSO) ||
- cpu_mitigations_off() ||
- srso_cmd == SRSO_CMD_OFF) {
- if (boot_cpu_has(X86_FEATURE_SBPB))
- x86_pred_cmd = PRED_CMD_SBPB;
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+ srso_mitigation = SRSO_MITIGATION_NONE;
+
+ if (srso_mitigation == SRSO_MITIGATION_NONE)
return;
- }
+ if (srso_mitigation == SRSO_MITIGATION_AUTO)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+
+ has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
if (has_microcode) {
/*
* Zen1/2 with SMT off aren't vulnerable after the right
* IBPB microcode has been applied.
- *
- * Zen1/2 don't have SBPB, no need to try to enable it here.
*/
if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ srso_mitigation = SRSO_MITIGATION_NONE;
return;
}
-
- if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
- srso_mitigation = SRSO_MITIGATION_IBPB;
- goto out;
- }
} else {
pr_warn("IBPB-extending microcode not applied!\n");
pr_warn(SRSO_NOTICE);
-
- /* may be overwritten by SRSO_CMD_SAFE_RET below */
- srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
}
- switch (srso_cmd) {
- case SRSO_CMD_MICROCODE:
- if (has_microcode) {
- srso_mitigation = SRSO_MITIGATION_MICROCODE;
- pr_warn(SRSO_NOTICE);
- }
- break;
-
- case SRSO_CMD_SAFE_RET:
- if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
goto ibpb_on_vmexit;
+ }
- if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
- /*
- * Enable the return thunk for generated code
- * like ftrace, static_call, etc.
- */
- setup_force_cpu_cap(X86_FEATURE_RETHUNK);
- setup_force_cpu_cap(X86_FEATURE_UNRET);
-
- if (boot_cpu_data.x86 == 0x19) {
- setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
- x86_return_thunk = srso_alias_return_thunk;
- } else {
- setup_force_cpu_cap(X86_FEATURE_SRSO);
- x86_return_thunk = srso_return_thunk;
- }
- if (has_microcode)
- srso_mitigation = SRSO_MITIGATION_SAFE_RET;
- else
- srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
- } else {
+ if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
}
- break;
- case SRSO_CMD_IBPB:
- if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
- if (has_microcode) {
- setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
- setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
- srso_mitigation = SRSO_MITIGATION_IBPB;
-
- /*
- * IBPB on entry already obviates the need for
- * software-based untraining so clear those in case some
- * other mitigation like Retbleed has selected them.
- */
- setup_clear_cpu_cap(X86_FEATURE_UNRET);
- setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
-
- /*
- * There is no need for RSB filling: entry_ibpb() ensures
- * all predictions, including the RSB, are invalidated,
- * regardless of IBPB implementation.
- */
- setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
- }
- } else {
+ if (!has_microcode)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
+ break;
+ibpb_on_vmexit:
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+ fallthrough;
+ case SRSO_MITIGATION_IBPB:
+ if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
}
+
+ if (!has_microcode)
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
+ break;
+ default:
break;
+ }
+}
-ibpb_on_vmexit:
- case SRSO_CMD_IBPB_ON_VMEXIT:
- if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
- if (has_microcode) {
- setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
- srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
-
- /*
- * There is no need for RSB filling: entry_ibpb() ensures
- * all predictions, including the RSB, are invalidated,
- * regardless of IBPB implementation.
- */
- setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
- }
+static void __init srso_update_mitigation(void)
+{
+ /* If retbleed is using IBPB, that works for SRSO as well */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB &&
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ if (boot_cpu_has_bug(X86_BUG_SRSO) &&
+ !cpu_mitigations_off() &&
+ !boot_cpu_has(X86_FEATURE_SRSO_NO))
+ pr_info("%s\n", srso_strings[srso_mitigation]);
+}
+
+static void __init srso_apply_mitigation(void)
+{
+ /*
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
+ */
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation == SRSO_MITIGATION_NONE) {
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ /*
+ * Enable the return thunk for generated code
+ * like ftrace, static_call, etc.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ if (boot_cpu_data.x86 == 0x19) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+ set_return_thunk(srso_alias_return_thunk);
} else {
- pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ setup_force_cpu_cap(X86_FEATURE_SRSO);
+ set_return_thunk(srso_return_thunk);
}
break;
+ case SRSO_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like Retbleed has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+ fallthrough;
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
default:
break;
}
-
-out:
- pr_info("%s\n", srso_strings[srso_mitigation]);
}
#undef pr_fmt
@@ -2786,9 +3111,6 @@ static ssize_t tsx_async_abort_show_state(char *buf)
static ssize_t mmio_stale_data_show_state(char *buf)
{
- if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
- return sysfs_emit(buf, "Unknown: No mitigations\n");
-
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
@@ -2806,6 +3128,19 @@ static ssize_t rfds_show_state(char *buf)
return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
}
+static ssize_t old_microcode_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return sysfs_emit(buf, "Unknown: running under hypervisor");
+
+ return sysfs_emit(buf, "Vulnerable\n");
+}
+
+static ssize_t its_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]);
+}
+
static char *stibp_state(void)
{
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
@@ -2864,7 +3199,7 @@ static const char *spectre_bhi_state(void)
!boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) &&
rrsba_disabled)
return "; BHI: Retpoline";
- else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT))
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT))
return "; BHI: Vulnerable, KVM: SW loop";
return "; BHI: Vulnerable";
@@ -2973,7 +3308,6 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
return srbds_show_state(buf);
case X86_BUG_MMIO_STALE_DATA:
- case X86_BUG_MMIO_UNKNOWN:
return mmio_stale_data_show_state(buf);
case X86_BUG_RETBLEED:
@@ -2988,6 +3322,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_RFDS:
return rfds_show_state(buf);
+ case X86_BUG_OLD_MICROCODE:
+ return old_microcode_show_state(buf);
+
+ case X86_BUG_ITS:
+ return its_show_state(buf);
+
default:
break;
}
@@ -3042,10 +3382,7 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *
ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
{
- if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
- return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN);
- else
- return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
}
ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
@@ -3067,6 +3404,16 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib
{
return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
}
+
+ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE);
+}
+
+ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
+}
#endif
void __warn_thunk(void)
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
index 6cba85c79d42..981f8b1f0792 100644
--- a/arch/x86/kernel/cpu/bus_lock.c
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -10,6 +10,7 @@
#include <asm/cmdline.h>
#include <asm/traps.h>
#include <asm/cpu.h>
+#include <asm/msr.h>
enum split_lock_detect_state {
sld_off = 0,
@@ -95,15 +96,15 @@ static bool split_lock_verify_msr(bool on)
{
u64 ctrl, tmp;
- if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl))
+ if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl))
return false;
if (on)
ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
else
ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- if (wrmsrl_safe(MSR_TEST_CTRL, ctrl))
+ if (wrmsrq_safe(MSR_TEST_CTRL, ctrl))
return false;
- rdmsrl(MSR_TEST_CTRL, tmp);
+ rdmsrq(MSR_TEST_CTRL, tmp);
return ctrl == tmp;
}
@@ -137,7 +138,7 @@ static void __init __split_lock_setup(void)
return;
}
- rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+ rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
if (!split_lock_verify_msr(true)) {
pr_info("MSR access failed: Disabled\n");
@@ -145,7 +146,7 @@ static void __init __split_lock_setup(void)
}
/* Restore the MSR to its cached value. */
- wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+ wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
}
@@ -162,7 +163,7 @@ static void sld_update_msr(bool on)
if (on)
test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
+ wrmsrq(MSR_TEST_CTRL, test_ctrl_val);
}
void split_lock_init(void)
@@ -192,7 +193,33 @@ static void __split_lock_reenable(struct work_struct *work)
{
sld_update_msr(true);
}
-static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
+
+/*
+ * Per-CPU delayed_work can't be statically initialized properly because
+ * the struct address is unknown. Thus per-CPU delayed_work structures
+ * have to be initialized during kernel initialization and after calling
+ * setup_per_cpu_areas().
+ */
+static int __init setup_split_lock_delayed_work(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu);
+
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
+ return 0;
+}
+pure_initcall(setup_split_lock_delayed_work);
/*
* If a CPU goes offline with pending delayed work to re-enable split lock
@@ -215,13 +242,14 @@ static void split_lock_warn(unsigned long ip)
{
struct delayed_work *work;
int cpu;
+ unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate);
if (!current->reported_split_lock)
pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
current->comm, current->pid, ip);
current->reported_split_lock = 1;
- if (sysctl_sld_mitigate) {
+ if (saved_sld_mitigate) {
/*
* misery factor #1:
* sleep 10ms before trying to execute split lock.
@@ -234,12 +262,10 @@ static void split_lock_warn(unsigned long ip)
*/
if (down_interruptible(&buslock_sem) == -EINTR)
return;
- work = &sl_reenable_unlock;
- } else {
- work = &sl_reenable;
}
cpu = get_cpu();
+ work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu);
schedule_delayed_work_on(cpu, work, 2);
/* Disable split lock detection on this CPU to make progress */
@@ -272,7 +298,7 @@ void bus_lock_init(void)
if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
return;
- rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, val);
if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
(sld_state == sld_warn || sld_state == sld_fatal)) ||
@@ -286,7 +312,7 @@ void bus_lock_init(void)
val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
}
- wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, val);
}
bool handle_user_split_lock(struct pt_regs *regs, long error_code)
@@ -350,7 +376,7 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c)
* MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
* it have split lock detection.
*/
- rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps);
if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
goto supported;
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index e6fa03ed9172..adfa7e8bb865 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -1,38 +1,28 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Routines to identify caches on Intel CPU.
+ * x86 CPU caches detection and configuration
*
- * Changes:
- * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
- * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
- * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
+ * Previous changes
+ * - Venkatesh Pallipadi: Cache identification through CPUID(0x4)
+ * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure
+ * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD
*/
-#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
-#include <linux/sched.h>
-#include <linux/capability.h>
-#include <linux/sysfs.h>
-#include <linux/pci.h>
#include <linux/stop_machine.h>
-#include <asm/cpufeature.h>
+#include <asm/amd/nb.h>
#include <asm/cacheinfo.h>
-#include <asm/amd_nb.h>
-#include <asm/smp.h>
+#include <asm/cpufeature.h>
+#include <asm/cpuid/api.h>
#include <asm/mtrr.h>
+#include <asm/smp.h>
#include <asm/tlbflush.h>
#include "cpu.h"
-#define LVL_1_INST 1
-#define LVL_1_DATA 2
-#define LVL_2 3
-#define LVL_3 4
-#define LVL_TRACE 5
-
/* Shared last level cache maps */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
@@ -44,212 +34,127 @@ static cpumask_var_t cpu_cacheinfo_mask;
/* Kernel controls MTRR and/or PAT MSRs. */
unsigned int memory_caching_control __ro_after_init;
-struct _cache_table {
- unsigned char descriptor;
- char cache_type;
- short size;
-};
-
-#define MB(x) ((x) * 1024)
-
-/* All the cache descriptor types we care about (no TLB or
- trace cache entries) */
-
-static const struct _cache_table cache_table[] =
-{
- { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
- { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
- { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
- { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
- { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
- { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
- { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
- { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */
- { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
- { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
- { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
- { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
- { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
- { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
- { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
- { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
- { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
- { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
- { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
- { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
- { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
- { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
- { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
- { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
- { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
- { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
- { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
- { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
- { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
- { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
- { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
- { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
- { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
- { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
- { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
- { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
- { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
- { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
- { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
- { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
- { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
- { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
- { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
- { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
- { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
- { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
- { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
- { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
- { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
- { 0x00, 0, 0}
-};
-
-
enum _cache_type {
- CTYPE_NULL = 0,
- CTYPE_DATA = 1,
- CTYPE_INST = 2,
- CTYPE_UNIFIED = 3
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
};
union _cpuid4_leaf_eax {
struct {
- enum _cache_type type:5;
- unsigned int level:3;
- unsigned int is_self_initializing:1;
- unsigned int is_fully_associative:1;
- unsigned int reserved:4;
- unsigned int num_threads_sharing:12;
- unsigned int num_cores_on_die:6;
+ enum _cache_type type :5;
+ unsigned int level :3;
+ unsigned int is_self_initializing :1;
+ unsigned int is_fully_associative :1;
+ unsigned int reserved :4;
+ unsigned int num_threads_sharing :12;
+ unsigned int num_cores_on_die :6;
} split;
u32 full;
};
union _cpuid4_leaf_ebx {
struct {
- unsigned int coherency_line_size:12;
- unsigned int physical_line_partition:10;
- unsigned int ways_of_associativity:10;
+ unsigned int coherency_line_size :12;
+ unsigned int physical_line_partition :10;
+ unsigned int ways_of_associativity :10;
} split;
u32 full;
};
union _cpuid4_leaf_ecx {
struct {
- unsigned int number_of_sets:32;
+ unsigned int number_of_sets :32;
} split;
u32 full;
};
-struct _cpuid4_info_regs {
+struct _cpuid4_info {
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned int id;
unsigned long size;
- struct amd_northbridge *nb;
};
-/* AMD doesn't have CPUID4. Emulate it here to report the same
- information to the user. This makes some assumptions about the machine:
- L2 not shared, no SMT etc. that is currently true on AMD CPUs.
+/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
+/*
+ * Fallback AMD CPUID(0x4) emulation
+ * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d)
+ *
+ * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should
+ * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006).
+ */
+
+#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff
+#define AMD_L2_L3_INVALID_ASSOC 0x9
- In theory the TLBs could be reported as fake type (they are in "dummy").
- Maybe later */
union l1_cache {
struct {
- unsigned line_size:8;
- unsigned lines_per_tag:8;
- unsigned assoc:8;
- unsigned size_in_kb:8;
+ unsigned line_size :8;
+ unsigned lines_per_tag :8;
+ unsigned assoc :8;
+ unsigned size_in_kb :8;
};
- unsigned val;
+ unsigned int val;
};
union l2_cache {
struct {
- unsigned line_size:8;
- unsigned lines_per_tag:4;
- unsigned assoc:4;
- unsigned size_in_kb:16;
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned size_in_kb :16;
};
- unsigned val;
+ unsigned int val;
};
union l3_cache {
struct {
- unsigned line_size:8;
- unsigned lines_per_tag:4;
- unsigned assoc:4;
- unsigned res:2;
- unsigned size_encoded:14;
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned res :2;
+ unsigned size_encoded :14;
};
- unsigned val;
+ unsigned int val;
};
+/* L2/L3 associativity mapping */
static const unsigned short assocs[] = {
- [1] = 1,
- [2] = 2,
- [4] = 4,
- [6] = 8,
- [8] = 16,
- [0xa] = 32,
- [0xb] = 48,
- [0xc] = 64,
- [0xd] = 96,
- [0xe] = 128,
- [0xf] = 0xffff /* fully associative - no way to show this currently */
+ [1] = 1,
+ [2] = 2,
+ [3] = 3,
+ [4] = 4,
+ [5] = 6,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
+ [0xc] = 64,
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE
};
static const unsigned char levels[] = { 1, 1, 2, 3 };
-static const unsigned char types[] = { 1, 2, 3, 3 };
-
-static const enum cache_type cache_type_map[] = {
- [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
- [CTYPE_DATA] = CACHE_TYPE_DATA,
- [CTYPE_INST] = CACHE_TYPE_INST,
- [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
-};
+static const unsigned char types[] = { 1, 2, 3, 3 };
-static void
-amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
- union _cpuid4_leaf_ebx *ebx,
- union _cpuid4_leaf_ecx *ecx)
+static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax,
+ union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx)
{
- unsigned dummy;
- unsigned line_size, lines_per_tag, assoc, size_in_kb;
- union l1_cache l1i, l1d;
+ unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb;
+ union l1_cache l1i, l1d, *l1;
union l2_cache l2;
union l3_cache l3;
- union l1_cache *l1 = &l1d;
eax->full = 0;
ebx->full = 0;
@@ -258,430 +163,155 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
- switch (leaf) {
+ l1 = &l1d;
+ switch (index) {
case 1:
l1 = &l1i;
fallthrough;
case 0:
if (!l1->val)
return;
- assoc = assocs[l1->assoc];
- line_size = l1->line_size;
- lines_per_tag = l1->lines_per_tag;
- size_in_kb = l1->size_in_kb;
+
+ assoc = (l1->assoc == 0xff) ? AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc;
+ line_size = l1->line_size;
+ lines_per_tag = l1->lines_per_tag;
+ size_in_kb = l1->size_in_kb;
break;
case 2:
- if (!l2.val)
+ if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC)
return;
- assoc = assocs[l2.assoc];
- line_size = l2.line_size;
- lines_per_tag = l2.lines_per_tag;
- /* cpu_data has errata corrections for K7 applied */
- size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
+
+ /* Use x86_cache_size as it might have K7 errata fixes */
+ assoc = assocs[l2.assoc];
+ line_size = l2.line_size;
+ lines_per_tag = l2.lines_per_tag;
+ size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
break;
case 3:
- if (!l3.val)
+ if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC)
return;
- assoc = assocs[l3.assoc];
- line_size = l3.line_size;
- lines_per_tag = l3.lines_per_tag;
- size_in_kb = l3.size_encoded * 512;
+
+ assoc = assocs[l3.assoc];
+ line_size = l3.line_size;
+ lines_per_tag = l3.lines_per_tag;
+ size_in_kb = l3.size_encoded * 512;
if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
- size_in_kb = size_in_kb >> 1;
- assoc = assoc >> 1;
+ size_in_kb = size_in_kb >> 1;
+ assoc = assoc >> 1;
}
break;
default:
return;
}
- eax->split.is_self_initializing = 1;
- eax->split.type = types[leaf];
- eax->split.level = levels[leaf];
- eax->split.num_threads_sharing = 0;
- eax->split.num_cores_on_die = topology_num_cores_per_package();
-
+ eax->split.is_self_initializing = 1;
+ eax->split.type = types[index];
+ eax->split.level = levels[index];
+ eax->split.num_threads_sharing = 0;
+ eax->split.num_cores_on_die = topology_num_cores_per_package();
- if (assoc == 0xffff)
+ if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE)
eax->split.is_fully_associative = 1;
- ebx->split.coherency_line_size = line_size - 1;
- ebx->split.ways_of_associativity = assoc - 1;
- ebx->split.physical_line_partition = lines_per_tag - 1;
- ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
- (ebx->split.ways_of_associativity + 1) - 1;
-}
-
-#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
-
-/*
- * L3 cache descriptors
- */
-static void amd_calc_l3_indices(struct amd_northbridge *nb)
-{
- struct amd_l3_cache *l3 = &nb->l3_cache;
- unsigned int sc0, sc1, sc2, sc3;
- u32 val = 0;
-
- pci_read_config_dword(nb->misc, 0x1C4, &val);
-
- /* calculate subcache sizes */
- l3->subcaches[0] = sc0 = !(val & BIT(0));
- l3->subcaches[1] = sc1 = !(val & BIT(4));
-
- if (boot_cpu_data.x86 == 0x15) {
- l3->subcaches[0] = sc0 += !(val & BIT(1));
- l3->subcaches[1] = sc1 += !(val & BIT(5));
- }
-
- l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
- l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
-
- l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
-}
-
-/*
- * check whether a slot used for disabling an L3 index is occupied.
- * @l3: L3 cache descriptor
- * @slot: slot number (0..1)
- *
- * @returns: the disabled index if used or negative value if slot free.
- */
-static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
-{
- unsigned int reg = 0;
-
- pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
-
- /* check whether this slot is activated already */
- if (reg & (3UL << 30))
- return reg & 0xfff;
-
- return -1;
-}
-
-static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
- unsigned int slot)
-{
- int index;
- struct amd_northbridge *nb = this_leaf->priv;
-
- index = amd_get_l3_disable_slot(nb, slot);
- if (index >= 0)
- return sprintf(buf, "%d\n", index);
-
- return sprintf(buf, "FREE\n");
-}
-
-#define SHOW_CACHE_DISABLE(slot) \
-static ssize_t \
-cache_disable_##slot##_show(struct device *dev, \
- struct device_attribute *attr, char *buf) \
-{ \
- struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
- return show_cache_disable(this_leaf, buf, slot); \
-}
-SHOW_CACHE_DISABLE(0)
-SHOW_CACHE_DISABLE(1)
-
-static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
- unsigned slot, unsigned long idx)
-{
- int i;
-
- idx |= BIT(30);
-
- /*
- * disable index in all 4 subcaches
- */
- for (i = 0; i < 4; i++) {
- u32 reg = idx | (i << 20);
-
- if (!nb->l3_cache.subcaches[i])
- continue;
-
- pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
-
- /*
- * We need to WBINVD on a core on the node containing the L3
- * cache which indices we disable therefore a simple wbinvd()
- * is not sufficient.
- */
- wbinvd_on_cpu(cpu);
-
- reg |= BIT(31);
- pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
- }
-}
-
-/*
- * disable a L3 cache index by using a disable-slot
- *
- * @l3: L3 cache descriptor
- * @cpu: A CPU on the node containing the L3 cache
- * @slot: slot number (0..1)
- * @index: index to disable
- *
- * @return: 0 on success, error status on failure
- */
-static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
- unsigned slot, unsigned long index)
-{
- int ret = 0;
-
- /* check if @slot is already used or the index is already disabled */
- ret = amd_get_l3_disable_slot(nb, slot);
- if (ret >= 0)
- return -EEXIST;
-
- if (index > nb->l3_cache.indices)
- return -EINVAL;
-
- /* check whether the other slot has disabled the same index already */
- if (index == amd_get_l3_disable_slot(nb, !slot))
- return -EEXIST;
-
- amd_l3_disable_index(nb, cpu, slot, index);
-
- return 0;
-}
-static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
-{
- unsigned long val = 0;
- int cpu, err = 0;
- struct amd_northbridge *nb = this_leaf->priv;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- cpu = cpumask_first(&this_leaf->shared_cpu_map);
-
- if (kstrtoul(buf, 10, &val) < 0)
- return -EINVAL;
-
- err = amd_set_l3_disable_slot(nb, cpu, slot, val);
- if (err) {
- if (err == -EEXIST)
- pr_warn("L3 slot %d in use/index already disabled!\n",
- slot);
- return err;
- }
- return count;
-}
-
-#define STORE_CACHE_DISABLE(slot) \
-static ssize_t \
-cache_disable_##slot##_store(struct device *dev, \
- struct device_attribute *attr, \
- const char *buf, size_t count) \
-{ \
- struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
- return store_cache_disable(this_leaf, buf, count, slot); \
-}
-STORE_CACHE_DISABLE(0)
-STORE_CACHE_DISABLE(1)
-
-static ssize_t subcaches_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct cacheinfo *this_leaf = dev_get_drvdata(dev);
- int cpu = cpumask_first(&this_leaf->shared_cpu_map);
-
- return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
-}
-
-static ssize_t subcaches_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct cacheinfo *this_leaf = dev_get_drvdata(dev);
- int cpu = cpumask_first(&this_leaf->shared_cpu_map);
- unsigned long val;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (kstrtoul(buf, 16, &val) < 0)
- return -EINVAL;
-
- if (amd_set_subcaches(cpu, val))
- return -EINVAL;
-
- return count;
+ ebx->split.coherency_line_size = line_size - 1;
+ ebx->split.ways_of_associativity = assoc - 1;
+ ebx->split.physical_line_partition = lines_per_tag - 1;
+ ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+ (ebx->split.ways_of_associativity + 1) - 1;
}
-static DEVICE_ATTR_RW(cache_disable_0);
-static DEVICE_ATTR_RW(cache_disable_1);
-static DEVICE_ATTR_RW(subcaches);
-
-static umode_t
-cache_private_attrs_is_visible(struct kobject *kobj,
- struct attribute *attr, int unused)
+static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax,
+ union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx)
{
- struct device *dev = kobj_to_dev(kobj);
- struct cacheinfo *this_leaf = dev_get_drvdata(dev);
- umode_t mode = attr->mode;
-
- if (!this_leaf->priv)
- return 0;
-
- if ((attr == &dev_attr_subcaches.attr) &&
- amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return mode;
+ if (eax.split.type == CTYPE_NULL)
+ return -EIO;
- if ((attr == &dev_attr_cache_disable_0.attr ||
- attr == &dev_attr_cache_disable_1.attr) &&
- amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return mode;
+ id4->eax = eax;
+ id4->ebx = ebx;
+ id4->ecx = ecx;
+ id4->size = (ecx.split.number_of_sets + 1) *
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
return 0;
}
-static struct attribute_group cache_private_group = {
- .is_visible = cache_private_attrs_is_visible,
-};
-
-static void init_amd_l3_attrs(void)
+static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
{
- int n = 1;
- static struct attribute **amd_l3_attrs;
-
- if (amd_l3_attrs) /* already initialized */
- return;
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- n += 2;
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- n += 1;
-
- amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
- if (!amd_l3_attrs)
- return;
-
- n = 0;
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
- amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
- amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
- }
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
-
- cache_private_group.attrs = amd_l3_attrs;
-}
-
-const struct attribute_group *
-cache_get_priv_group(struct cacheinfo *this_leaf)
-{
- struct amd_northbridge *nb = this_leaf->priv;
-
- if (this_leaf->level < 3 || !nb)
- return NULL;
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
- if (nb && nb->l3_cache.indices)
- init_amd_l3_attrs();
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored);
+ else
+ legacy_amd_cpuid4(index, &eax, &ebx, &ecx);
- return &cache_private_group;
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
}
-static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
{
- int node;
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
- /* only for L3, and not in virtualized environments */
- if (index < 3)
- return;
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored);
- node = topology_amd_node_id(smp_processor_id());
- this_leaf->nb = node_to_amd_nb(node);
- if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
- amd_calc_l3_indices(this_leaf->nb);
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
}
-#else
-#define amd_init_l3_cache(x, y)
-#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
-static int
-cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
+static int fill_cpuid4_info(int index, struct _cpuid4_info *id4)
{
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned edx;
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (boot_cpu_has(X86_FEATURE_TOPOEXT))
- cpuid_count(0x8000001d, index, &eax.full,
- &ebx.full, &ecx.full, &edx);
- else
- amd_cpuid4(index, &eax, &ebx, &ecx);
- amd_init_l3_cache(this_leaf, index);
- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- cpuid_count(0x8000001d, index, &eax.full,
- &ebx.full, &ecx.full, &edx);
- amd_init_l3_cache(this_leaf, index);
- } else {
- cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
- }
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
- if (eax.split.type == CTYPE_NULL)
- return -EIO; /* better error ? */
-
- this_leaf->eax = eax;
- this_leaf->ebx = ebx;
- this_leaf->ecx = ecx;
- this_leaf->size = (ecx.split.number_of_sets + 1) *
- (ebx.split.coherency_line_size + 1) *
- (ebx.split.physical_line_partition + 1) *
- (ebx.split.ways_of_associativity + 1);
- return 0;
+ return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ?
+ amd_fill_cpuid4_info(index, id4) :
+ intel_fill_cpuid4_info(index, id4);
}
static int find_num_cache_leaves(struct cpuinfo_x86 *c)
{
- unsigned int eax, ebx, ecx, edx, op;
- union _cpuid4_leaf_eax cache_eax;
- int i = -1;
-
- if (c->x86_vendor == X86_VENDOR_AMD ||
- c->x86_vendor == X86_VENDOR_HYGON)
- op = 0x8000001d;
- else
- op = 4;
+ unsigned int eax, ebx, ecx, edx, op;
+ union _cpuid4_leaf_eax cache_eax;
+ int i = -1;
+ /* Do a CPUID(op) loop to calculate num_cache_leaves */
+ op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4;
do {
++i;
- /* Do cpuid(op) loop to find out num_cache_leaves */
cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
cache_eax.full = eax;
} while (cache_eax.split.type != CTYPE_NULL);
return i;
}
+/*
+ * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist.
+ */
+
void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
{
- /*
- * We may have multiple LLCs if L3 caches exist, so check if we
- * have an L3 cache by looking at the L3 cache CPUID leaf.
- */
- if (!cpuid_edx(0x80000006))
+ if (!cpuid_amd_hygon_has_l3_cache())
return;
if (c->x86 < 0x17) {
- /* LLC is at the node level. */
+ /* Pre-Zen: LLC is at the node level */
c->topo.llc_id = die_id;
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
- * LLC is at the core complex level.
- * Core complex ID is ApicId[3] for these processors.
+ * Family 17h up to 1F models: LLC is at the core
+ * complex level. Core complex ID is ApicId[3].
*/
c->topo.llc_id = c->topo.apicid >> 3;
} else {
/*
- * LLC ID is calculated from the number of threads sharing the
- * cache.
- * */
+ * Newer families: LLC ID is calculated from the number
+ * of threads sharing the L3 cache.
+ */
u32 eax, ebx, ecx, edx, num_sharing_cache = 0;
u32 llc_index = find_num_cache_leaves(c) - 1;
@@ -690,25 +320,21 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
if (num_sharing_cache) {
- int bits = get_count_order(num_sharing_cache);
+ int index_msb = get_count_order(num_sharing_cache);
- c->topo.llc_id = c->topo.apicid >> bits;
+ c->topo.llc_id = c->topo.apicid >> index_msb;
}
}
}
void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
{
- /*
- * We may have multiple LLCs if L3 caches exist, so check if we
- * have an L3 cache by looking at the L3 cache CPUID leaf.
- */
- if (!cpuid_edx(0x80000006))
+ if (!cpuid_amd_hygon_has_l3_cache())
return;
/*
- * LLC is at the core complex level.
- * Core complex ID is ApicId[3] for these processors.
+ * Hygons are similar to AMD Family 17h up to 1F models: LLC is
+ * at the core complex level. Core complex ID is ApicId[3].
*/
c->topo.llc_id = c->topo.apicid >> 3;
}
@@ -717,14 +343,10 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT))
ci->num_leaves = find_num_cache_leaves(c);
- } else if (c->extended_cpuid_level >= 0x80000006) {
- if (cpuid_edx(0x80000006) & 0xf000)
- ci->num_leaves = 4;
- else
- ci->num_leaves = 3;
- }
+ else if (c->extended_cpuid_level >= 0x80000006)
+ ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3;
}
void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
@@ -734,156 +356,131 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
ci->num_leaves = find_num_cache_leaves(c);
}
-void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3,
+ unsigned int l2, unsigned int l1i, unsigned int l1d)
{
- /* Cache sizes */
- unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0;
- unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
- unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
- unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
- struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+ /*
+ * If llc_id is still unset, then cpuid_level < 4, which implies
+ * that the only possibility left is SMT. Since CPUID(0x2) doesn't
+ * specify any shared caches and SMT shares all caches, we can
+ * unconditionally set LLC ID to the package ID so that all
+ * threads share it.
+ */
+ if (c->topo.llc_id == BAD_APICID)
+ c->topo.llc_id = c->topo.pkg_id;
- if (c->cpuid_level > 3) {
- /*
- * There should be at least one leaf. A non-zero value means
- * that the number of leaves has been initialized.
- */
- if (!ci->num_leaves)
- ci->num_leaves = find_num_cache_leaves(c);
+ c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d);
- /*
- * Whenever possible use cpuid(4), deterministic cache
- * parameters cpuid leaf to find the cache details
- */
- for (i = 0; i < ci->num_leaves; i++) {
- struct _cpuid4_info_regs this_leaf = {};
- int retval;
+ if (!l2)
+ cpu_detect_cache_sizes(c);
+}
- retval = cpuid4_cache_lookup_regs(i, &this_leaf);
- if (retval < 0)
- continue;
+/*
+ * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available.
+ */
+static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c)
+{
+ unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
- switch (this_leaf.eax.split.level) {
- case 1:
- if (this_leaf.eax.split.type == CTYPE_DATA)
- new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type == CTYPE_INST)
- new_l1i = this_leaf.size/1024;
- break;
- case 2:
- new_l2 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l2_id = c->topo.apicid & ~((1 << index_msb) - 1);
- break;
- case 3:
- new_l3 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l3_id = c->topo.apicid & ~((1 << index_msb) - 1);
- break;
- default:
- break;
- }
+ if (c->cpuid_level < 2)
+ return;
+
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc) {
+ switch (desc->c_type) {
+ case CACHE_L1_INST: l1i += desc->c_size; break;
+ case CACHE_L1_DATA: l1d += desc->c_size; break;
+ case CACHE_L2: l2 += desc->c_size; break;
+ case CACHE_L3: l3 += desc->c_size; break;
}
}
+
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+}
+
+static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4)
+{
+ unsigned int num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ return c->topo.apicid & ~((1 << index_msb) - 1);
+}
+
+static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+ unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID;
+ unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0;
+
+ if (c->cpuid_level < 4)
+ return false;
+
/*
- * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
- * trace cache
+ * There should be at least one leaf. A non-zero value means
+ * that the number of leaves has been previously initialized.
*/
- if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) {
- /* supports eax=2 call */
- int j, n;
- unsigned int regs[4];
- unsigned char *dp = (unsigned char *)regs;
- int only_trace = 0;
-
- if (ci->num_leaves && c->x86 == 15)
- only_trace = 1;
-
- /* Number of times to iterate */
- n = cpuid_eax(2) & 0xFF;
-
- for (i = 0 ; i < n ; i++) {
- cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
-
- /* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
- if (regs[j] & (1 << 31))
- regs[j] = 0;
-
- /* Byte 0 is level count, not a descriptor */
- for (j = 1 ; j < 16 ; j++) {
- unsigned char des = dp[j];
- unsigned char k = 0;
-
- /* look up this descriptor in the table */
- while (cache_table[k].descriptor != 0) {
- if (cache_table[k].descriptor == des) {
- if (only_trace && cache_table[k].cache_type != LVL_TRACE)
- break;
- switch (cache_table[k].cache_type) {
- case LVL_1_INST:
- l1i += cache_table[k].size;
- break;
- case LVL_1_DATA:
- l1d += cache_table[k].size;
- break;
- case LVL_2:
- l2 += cache_table[k].size;
- break;
- case LVL_3:
- l3 += cache_table[k].size;
- break;
- }
-
- break;
- }
-
- k++;
- }
- }
- }
- }
+ if (!ci->num_leaves)
+ ci->num_leaves = find_num_cache_leaves(c);
- if (new_l1d)
- l1d = new_l1d;
+ if (!ci->num_leaves)
+ return false;
- if (new_l1i)
- l1i = new_l1i;
+ for (int i = 0; i < ci->num_leaves; i++) {
+ struct _cpuid4_info id4 = {};
+ int ret;
- if (new_l2) {
- l2 = new_l2;
- c->topo.llc_id = l2_id;
- c->topo.l2c_id = l2_id;
- }
+ ret = intel_fill_cpuid4_info(i, &id4);
+ if (ret < 0)
+ continue;
- if (new_l3) {
- l3 = new_l3;
- c->topo.llc_id = l3_id;
+ switch (id4.eax.split.level) {
+ case 1:
+ if (id4.eax.split.type == CTYPE_DATA)
+ l1d = id4.size / 1024;
+ else if (id4.eax.split.type == CTYPE_INST)
+ l1i = id4.size / 1024;
+ break;
+ case 2:
+ l2 = id4.size / 1024;
+ l2_id = calc_cache_topo_id(c, &id4);
+ break;
+ case 3:
+ l3 = id4.size / 1024;
+ l3_id = calc_cache_topo_id(c, &id4);
+ break;
+ default:
+ break;
+ }
}
- /*
- * If llc_id is not yet set, this means cpuid_level < 4 which in
- * turns means that the only possibility is SMT (as indicated in
- * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
- * that SMT shares all caches, we can unconditionally set cpu_llc_id to
- * c->topo.pkg_id.
- */
- if (c->topo.llc_id == BAD_APICID)
- c->topo.llc_id = c->topo.pkg_id;
+ c->topo.l2c_id = l2_id;
+ c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id;
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+ return true;
+}
- c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+{
+ /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */
+ if (intel_cacheinfo_0x4(c))
+ return;
- if (!l2)
- cpu_detect_cache_sizes(c);
+ intel_cacheinfo_0x2(c);
}
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon
+ */
static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
- struct _cpuid4_info_regs *base)
+ const struct _cpuid4_info *id4)
{
struct cpu_cacheinfo *this_cpu_ci;
- struct cacheinfo *this_leaf;
+ struct cacheinfo *ci;
int i, sibling;
/*
@@ -895,18 +492,18 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
this_cpu_ci = get_cpu_cacheinfo(i);
if (!this_cpu_ci->info_list)
continue;
- this_leaf = this_cpu_ci->info_list + index;
+
+ ci = this_cpu_ci->info_list + index;
for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
if (!cpu_online(sibling))
continue;
- cpumask_set_cpu(sibling,
- &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
}
}
} else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
unsigned int apicid, nshared, first, last;
- nshared = base->eax.split.num_threads_sharing + 1;
+ nshared = id4->eax.split.num_threads_sharing + 1;
apicid = cpu_data(cpu).topo.apicid;
first = apicid - (apicid % nshared);
last = first + nshared - 1;
@@ -920,14 +517,13 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
if ((apicid < first) || (apicid > last))
continue;
- this_leaf = this_cpu_ci->info_list + index;
+ ci = this_cpu_ci->info_list + index;
for_each_online_cpu(sibling) {
apicid = cpu_data(sibling).topo.apicid;
if ((apicid < first) || (apicid > last))
continue;
- cpumask_set_cpu(sibling,
- &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
}
}
} else
@@ -936,25 +532,27 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
return 1;
}
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon
+ */
static void __cache_cpumap_setup(unsigned int cpu, int index,
- struct _cpuid4_info_regs *base)
+ const struct _cpuid4_info *id4)
{
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
- struct cacheinfo *this_leaf, *sibling_leaf;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cacheinfo *ci, *sibling_ci;
unsigned long num_threads_sharing;
int index_msb, i;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- if (c->x86_vendor == X86_VENDOR_AMD ||
- c->x86_vendor == X86_VENDOR_HYGON) {
- if (__cache_amd_cpumap_setup(cpu, index, base))
+ if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
+ if (__cache_amd_cpumap_setup(cpu, index, id4))
return;
}
- this_leaf = this_cpu_ci->info_list + index;
- num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
+ ci = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
- cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, &ci->shared_cpu_map);
if (num_threads_sharing == 1)
return;
@@ -964,30 +562,29 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) {
struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
+ /* Skip if itself or no cacheinfo */
if (i == cpu || !sib_cpu_ci->info_list)
- continue;/* skip if itself or no cacheinfo */
- sibling_leaf = sib_cpu_ci->info_list + index;
- cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
- cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
+ continue;
+
+ sibling_ci = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &ci->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map);
}
}
-static void ci_leaf_init(struct cacheinfo *this_leaf,
- struct _cpuid4_info_regs *base)
+static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4,
+ struct amd_northbridge *nb)
{
- this_leaf->id = base->id;
- this_leaf->attributes = CACHE_ID;
- this_leaf->level = base->eax.split.level;
- this_leaf->type = cache_type_map[base->eax.split.type];
- this_leaf->coherency_line_size =
- base->ebx.split.coherency_line_size + 1;
- this_leaf->ways_of_associativity =
- base->ebx.split.ways_of_associativity + 1;
- this_leaf->size = base->size;
- this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
- this_leaf->physical_line_partition =
- base->ebx.split.physical_line_partition + 1;
- this_leaf->priv = base->nb;
+ ci->id = id4->id;
+ ci->attributes = CACHE_ID;
+ ci->level = id4->eax.split.level;
+ ci->type = cache_type_map[id4->eax.split.type];
+ ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1;
+ ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1;
+ ci->size = id4->size;
+ ci->number_of_sets = id4->ecx.split.number_of_sets + 1;
+ ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1;
+ ci->priv = nb;
}
int init_cache_level(unsigned int cpu)
@@ -1002,38 +599,45 @@ int init_cache_level(unsigned int cpu)
}
/*
- * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input
* ECX as cache index. Then right shift apicid by the number's order to get
* cache id for this cache node.
*/
-static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+static void get_cache_id(int cpu, struct _cpuid4_info *id4)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
unsigned long num_threads_sharing;
int index_msb;
- num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- id4_regs->id = c->topo.apicid >> index_msb;
+ id4->id = c->topo.apicid >> index_msb;
}
int populate_cache_leaves(unsigned int cpu)
{
- unsigned int idx, ret;
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
- struct cacheinfo *this_leaf = this_cpu_ci->info_list;
- struct _cpuid4_info_regs id4_regs = {};
+ struct cacheinfo *ci = this_cpu_ci->info_list;
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
+ struct amd_northbridge *nb = NULL;
+ struct _cpuid4_info id4 = {};
+ int idx, ret;
for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
- ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+ ret = fill_cpuid4_info(idx, &id4);
if (ret)
return ret;
- get_cache_id(cpu, &id4_regs);
- ci_leaf_init(this_leaf++, &id4_regs);
- __cache_cpumap_setup(cpu, idx, &id4_regs);
+
+ get_cache_id(cpu, &id4);
+
+ if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON)
+ nb = amd_init_l3_cache(idx);
+
+ ci_info_init(ci++, &id4, nb);
+ __cache_cpumap_setup(cpu, idx, &id4);
}
- this_cpu_ci->cpu_map_populated = true;
+ this_cpu_ci->cpu_map_populated = true;
return 0;
}
@@ -1049,31 +653,33 @@ int populate_cache_leaves(unsigned int cpu)
static unsigned long saved_cr4;
static DEFINE_RAW_SPINLOCK(cache_disable_lock);
+/*
+ * Cache flushing is the most time-consuming step when programming the
+ * MTRRs. On many Intel CPUs without known erratas, it can be skipped
+ * if the CPU declares cache self-snooping support.
+ */
+static void maybe_flush_caches(void)
+{
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+}
+
void cache_disable(void) __acquires(cache_disable_lock)
{
unsigned long cr0;
/*
- * Note that this is not ideal
- * since the cache is only flushed/disabled for this CPU while the
- * MTRRs are changed, but changing this requires more invasive
- * changes to the way the kernel boots
+ * This is not ideal since the cache is only flushed/disabled
+ * for this CPU while the MTRRs are changed, but changing this
+ * requires more invasive changes to the way the kernel boots.
*/
-
raw_spin_lock(&cache_disable_lock);
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
write_cr0(cr0);
- /*
- * Cache flushing is the most time-consuming step when programming
- * the MTRRs. Fortunately, as per the Intel Software Development
- * Manual, we can skip it if the processor supports cache self-
- * snooping.
- */
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
- wbinvd();
+ maybe_flush_caches();
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_feature_enabled(X86_FEATURE_PGE)) {
@@ -1088,9 +694,7 @@ void cache_disable(void) __acquires(cache_disable_lock)
if (cpu_feature_enabled(X86_FEATURE_MTRR))
mtrr_disable();
- /* Again, only flush caches if we have to. */
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
- wbinvd();
+ maybe_flush_caches();
}
void cache_enable(void) __releases(cache_disable_lock)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7cce91b19fb2..8feb8fd2957a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -29,7 +29,7 @@
#include <asm/alternative.h>
#include <asm/cmdline.h>
-#include <asm/cpuid.h>
+#include <asm/cpuid/api.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>
@@ -148,7 +148,7 @@ static void ppin_init(struct cpuinfo_x86 *c)
*/
info = (struct ppin_info *)id->driver_data;
- if (rdmsrl_safe(info->msr_ppin_ctl, &val))
+ if (rdmsrq_safe(info->msr_ppin_ctl, &val))
goto clear_ppin;
if ((val & 3UL) == 1UL) {
@@ -158,13 +158,13 @@ static void ppin_init(struct cpuinfo_x86 *c)
/* If PPIN is disabled, try to enable */
if (!(val & 2UL)) {
- wrmsrl_safe(info->msr_ppin_ctl, val | 2UL);
- rdmsrl_safe(info->msr_ppin_ctl, &val);
+ wrmsrq_safe(info->msr_ppin_ctl, val | 2UL);
+ rdmsrq_safe(info->msr_ppin_ctl, &val);
}
/* Is the enable bit set? */
if (val & 2UL) {
- c->ppin = __rdmsr(info->msr_ppin);
+ c->ppin = native_rdmsrq(info->msr_ppin);
set_cpu_cap(c, info->feature);
return;
}
@@ -242,6 +242,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+SYM_PIC_ALIAS(gdt_page);
#ifdef CONFIG_X86_64
static int __init x86_nopcid_setup(char *s)
@@ -321,7 +322,7 @@ static int __init cachesize_setup(char *str)
__setup("cachesize=", cachesize_setup);
/* Probe for the CPUID instruction */
-bool have_cpuid_p(void)
+bool cpuid_feature(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
@@ -562,9 +563,9 @@ __noendbr u64 ibt_save(bool disable)
u64 msr = 0;
if (cpu_feature_enabled(X86_FEATURE_IBT)) {
- rdmsrl(MSR_IA32_S_CET, msr);
+ rdmsrq(MSR_IA32_S_CET, msr);
if (disable)
- wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
+ wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
}
return msr;
@@ -575,10 +576,10 @@ __noendbr void ibt_restore(u64 save)
u64 msr;
if (cpu_feature_enabled(X86_FEATURE_IBT)) {
- rdmsrl(MSR_IA32_S_CET, msr);
+ rdmsrq(MSR_IA32_S_CET, msr);
msr &= ~CET_ENDBR_EN;
msr |= (save & CET_ENDBR_EN);
- wrmsrl(MSR_IA32_S_CET, msr);
+ wrmsrq(MSR_IA32_S_CET, msr);
}
}
@@ -602,15 +603,15 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_USER_SHSTK);
if (kernel_ibt)
- wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN);
+ wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN);
else
- wrmsrl(MSR_IA32_S_CET, 0);
+ wrmsrq(MSR_IA32_S_CET, 0);
cr4_set_bits(X86_CR4_CET);
if (kernel_ibt && ibt_selftest()) {
pr_err("IBT selftest: Failed!\n");
- wrmsrl(MSR_IA32_S_CET, 0);
+ wrmsrq(MSR_IA32_S_CET, 0);
setup_clear_cpu_cap(X86_FEATURE_IBT);
}
}
@@ -621,8 +622,8 @@ __noendbr void cet_disable(void)
cpu_feature_enabled(X86_FEATURE_SHSTK)))
return;
- wrmsrl(MSR_IA32_S_CET, 0);
- wrmsrl(MSR_IA32_U_CET, 0);
+ wrmsrq(MSR_IA32_S_CET, 0);
+ wrmsrq(MSR_IA32_U_CET, 0);
}
/*
@@ -667,8 +668,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
if (!warn)
continue;
- pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
- x86_cap_flag(df->feature), df->level);
+ pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
@@ -751,9 +752,9 @@ void __init switch_gdt_and_percpu_base(int cpu)
* No need to load %gs. It is already correct.
*
* Writing %gs on 64bit would zero GSBASE which would make any per
- * CPU operation up to the point of the wrmsrl() fault.
+ * CPU operation up to the point of the wrmsrq() fault.
*
- * Set GSBASE to the new offset. Until the wrmsrl() happens the
+ * Set GSBASE to the new offset. Until the wrmsrq() happens the
* early mapping is still valid. That means the GSBASE update will
* lose any prior per CPU data which was not copied over in
* setup_per_cpu_areas().
@@ -761,7 +762,7 @@ void __init switch_gdt_and_percpu_base(int cpu)
* This works even with stackprotector enabled because the
* per CPU stack canary is 0 in both per CPU areas.
*/
- wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+ wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
#else
/*
* %fs is already set to __KERNEL_PERCPU, but after switching GDT
@@ -846,13 +847,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_cache_size = l2size;
}
-u16 __read_mostly tlb_lli_4k[NR_INFO];
-u16 __read_mostly tlb_lli_2m[NR_INFO];
-u16 __read_mostly tlb_lli_4m[NR_INFO];
-u16 __read_mostly tlb_lld_4k[NR_INFO];
-u16 __read_mostly tlb_lld_2m[NR_INFO];
-u16 __read_mostly tlb_lld_4m[NR_INFO];
-u16 __read_mostly tlb_lld_1g[NR_INFO];
+u16 __read_mostly tlb_lli_4k;
+u16 __read_mostly tlb_lli_2m;
+u16 __read_mostly tlb_lli_4m;
+u16 __read_mostly tlb_lld_4k;
+u16 __read_mostly tlb_lld_2m;
+u16 __read_mostly tlb_lld_4m;
+u16 __read_mostly tlb_lld_1g;
static void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
@@ -860,12 +861,10 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
this_cpu->c_detect_tlb(c);
pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
- tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
- tlb_lli_4m[ENTRIES]);
+ tlb_lli_4k, tlb_lli_2m, tlb_lli_4m);
pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
- tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
- tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
+ tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g);
}
void get_cpu_vendor(struct cpuinfo_x86 *c)
@@ -1007,17 +1006,18 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_D_1_EAX] = eax;
}
- /* AMD-defined flags: level 0x80000001 */
+ /*
+ * Check if extended CPUID leaves are implemented: Max extended
+ * CPUID leaf must be in the 0x80000001-0x8000ffff range.
+ */
eax = cpuid_eax(0x80000000);
- c->extended_cpuid_level = eax;
+ c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0;
- if ((eax & 0xffff0000) == 0x80000000) {
- if (eax >= 0x80000001) {
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+ if (c->extended_cpuid_level >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- c->x86_capability[CPUID_8000_0001_ECX] = ecx;
- c->x86_capability[CPUID_8000_0001_EDX] = edx;
- }
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
if (c->extended_cpuid_level >= 0x80000007) {
@@ -1164,7 +1164,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
@@ -1205,6 +1205,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
+#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \
+ X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues)
+
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@@ -1226,6 +1229,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define GDS BIT(6)
/* CPU is affected by Register File Data Sampling */
#define RFDS BIT(7)
+/* CPU is affected by Indirect Target Selection */
+#define ITS BIT(8)
+/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
+#define ITS_NATIVE_ONLY BIT(9)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
@@ -1237,25 +1244,28 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
- VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS),
VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
- VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS),
- VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS),
- VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED),
- VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS),
- VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
- VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
@@ -1287,7 +1297,7 @@ u64 x86_read_arch_cap_msr(void)
u64 x86_arch_cap_msr = 0;
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
+ rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
return x86_arch_cap_msr;
}
@@ -1317,10 +1327,78 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
return cpu_matches(cpu_vuln_blacklist, RFDS);
}
+static bool __init vulnerable_to_its(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_ITS_NO)
+ return false;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ /* None of the affected CPUs have BHI_CTRL */
+ if (boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ /*
+ * If a VMM did not expose ITS_NO, assume that a guest could
+ * be running on a vulnerable hardware or may migrate to such
+ * hardware.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ if (cpu_matches(cpu_vuln_blacklist, ITS))
+ return true;
+
+ return false;
+}
+
+static struct x86_cpu_id cpu_latest_microcode[] = {
+#include "microcode/intel-ucode-defs.h"
+ {}
+};
+
+static bool __init cpu_has_old_microcode(void)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode);
+
+ /* Give unknown CPUs a pass: */
+ if (!m) {
+ /* Intel CPUs should be in the list. Warn if not: */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ pr_info("x86/CPU: Model not found in latest microcode list\n");
+ return false;
+ }
+
+ /*
+ * Hosts usually lie to guests with a super high microcode
+ * version. Just ignore what hosts tell guests:
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ /* Consider all debug microcode to be old: */
+ if (boot_cpu_data.microcode & BIT(31))
+ return true;
+
+ /* Give new microcode a pass: */
+ if (boot_cpu_data.microcode >= m->driver_data)
+ return false;
+
+ /* Uh oh, too old: */
+ return true;
+}
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
+ if (cpu_has_old_microcode()) {
+ pr_warn("x86/CPU: Running old microcode\n");
+ setup_force_cpu_bug(X86_BUG_OLD_MICROCODE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
!(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO))
@@ -1331,8 +1409,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) {
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ }
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
!(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
@@ -1399,15 +1479,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* Affected CPU list is generally enough to enumerate the vulnerability,
* but for virtualization case check for ARCH_CAP MSR bits also, VMM may
* not want the guest to enumerate the bug.
- *
- * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist,
- * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits.
*/
if (!arch_cap_mmio_immune(x86_arch_cap_msr)) {
if (cpu_matches(cpu_vuln_blacklist, MMIO))
setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
- else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO))
- setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN);
}
if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
@@ -1436,9 +1511,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (vulnerable_to_rfds(x86_arch_cap_msr))
setup_force_cpu_bug(X86_BUG_RFDS);
- /* When virtualized, eIBRS could be hidden, assume vulnerable */
- if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) &&
- !cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ /*
+ * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with
+ * BHI_NO still need to use the BHI mitigation to prevent Intra-mode
+ * attacks. When virtualized, eIBRS could be hidden, assume vulnerable.
+ */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
(boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
boot_cpu_has(X86_FEATURE_HYPERVISOR)))
setup_force_cpu_bug(X86_BUG_BHI);
@@ -1446,6 +1524,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+ if (vulnerable_to_its(x86_arch_cap_msr)) {
+ setup_force_cpu_bug(X86_BUG_ITS);
+ if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY))
+ setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
+ }
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1479,15 +1563,96 @@ static void detect_nopl(void)
#endif
}
+static inline bool parse_set_clear_cpuid(char *arg, bool set)
+{
+ char *opt;
+ int taint = 0;
+
+ while (arg) {
+ bool found __maybe_unused = false;
+ unsigned int bit;
+
+ opt = strsep(&arg, ",");
+
+ /*
+ * Handle naked numbers first for feature flags which don't
+ * have names. It doesn't make sense for a bug not to have a
+ * name so don't handle bug flags here.
+ */
+ if (!kstrtouint(opt, 10, &bit)) {
+ if (bit < NCAPINTS * 32) {
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU feature flag:");
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU feature flag:");
+ setup_clear_cpu_cap(bit);
+ }
+ /* empty-string, i.e., ""-defined feature flags */
+ if (!x86_cap_flags[bit])
+ pr_cont(" %d:%d\n", bit >> 5, bit & 31);
+ else
+ pr_cont(" %s\n", x86_cap_flags[bit]);
+
+ taint++;
+ }
+ /*
+ * The assumption is that there are no feature names with only
+ * numbers in the name thus go to the next argument.
+ */
+ continue;
+ }
+
+ for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) {
+ const char *flag;
+ const char *kind;
+
+ if (bit < 32 * NCAPINTS) {
+ flag = x86_cap_flags[bit];
+ kind = "feature";
+ } else {
+ kind = "bug";
+ flag = x86_bug_flags[bit - (32 * NCAPINTS)];
+ }
+
+ if (!flag)
+ continue;
+
+ if (strcmp(flag, opt))
+ continue;
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_clear_cpu_cap(bit);
+ }
+ taint++;
+ found = true;
+ break;
+ }
+
+ if (!found)
+ pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt);
+ }
+
+ return taint;
+}
+
+
/*
* We parse cpu parameters early because fpu__init_system() is executed
* before parse_early_param().
*/
static void __init cpu_parse_early_param(void)
{
+ bool cpuid_taint = false;
char arg[128];
- char *argptr = arg, *opt;
- int arglen, taint = 0;
+ int arglen;
#ifdef CONFIG_X86_32
if (cmdline_find_option_bool(boot_command_line, "no387"))
@@ -1519,61 +1684,17 @@ static void __init cpu_parse_early_param(void)
setup_clear_cpu_cap(X86_FEATURE_FRED);
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
- if (arglen <= 0)
- return;
-
- pr_info("Clearing CPUID bits:");
-
- while (argptr) {
- bool found __maybe_unused = false;
- unsigned int bit;
-
- opt = strsep(&argptr, ",");
-
- /*
- * Handle naked numbers first for feature flags which don't
- * have names.
- */
- if (!kstrtouint(opt, 10, &bit)) {
- if (bit < NCAPINTS * 32) {
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, false);
- /* empty-string, i.e., ""-defined feature flags */
- if (!x86_cap_flags[bit])
- pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit));
- else
- pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, true);
- setup_clear_cpu_cap(bit);
- taint++;
- }
- /*
- * The assumption is that there are no feature names with only
- * numbers in the name thus go to the next argument.
- */
- continue;
- }
-
- for (bit = 0; bit < 32 * NCAPINTS; bit++) {
- if (!x86_cap_flag(bit))
- continue;
-
- if (strcmp(x86_cap_flag(bit), opt))
- continue;
-
- pr_cont(" %s", opt);
- setup_clear_cpu_cap(bit);
- taint++;
- found = true;
- break;
- }
-
- if (!found)
- pr_cont(" (unknown: %s)", opt);
- }
- pr_cont("\n");
-
- if (taint)
+ if (cpuid_taint) {
+ pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
}
/*
@@ -1590,11 +1711,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (have_cpuid_p()) {
+ if (cpuid_feature()) {
cpu_detect(c);
get_cpu_vendor(c);
intel_unlock_cpuid_leafs(c);
@@ -1610,6 +1731,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
c->cpu_index = 0;
filter_cpuid_features(c, false);
+ check_cpufeature_deps(c);
if (this_cpu->c_bsp_init)
this_cpu->c_bsp_init(c);
@@ -1708,11 +1830,11 @@ static bool detect_null_seg_behavior(void)
*/
unsigned long old_base, tmp;
- rdmsrl(MSR_FS_BASE, old_base);
- wrmsrl(MSR_FS_BASE, 1);
+ rdmsrq(MSR_FS_BASE, old_base);
+ wrmsrq(MSR_FS_BASE, 1);
loadsegment(fs, 0);
- rdmsrl(MSR_FS_BASE, tmp);
- wrmsrl(MSR_FS_BASE, old_base);
+ rdmsrq(MSR_FS_BASE, tmp);
+ wrmsrq(MSR_FS_BASE, old_base);
return tmp == 0;
}
@@ -1753,11 +1875,11 @@ static void generic_identify(struct cpuinfo_x86 *c)
{
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
+ if (!cpuid_feature())
return;
cpu_detect(c);
@@ -1870,6 +1992,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Filter out anything that depends on CPUID levels we don't have */
filter_cpuid_features(c, true);
+ /* Check for unmet dependencies based on the CPUID dependency table */
+ check_cpufeature_deps(c);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
const char *p;
@@ -1938,9 +2063,9 @@ void enable_sep_cpu(void)
*/
tss->x86_tss.ss1 = __KERNEL_CS;
- wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
+ wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1);
+ wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32);
put_cpu();
}
@@ -1962,9 +2087,15 @@ static __init void identify_boot_cpu(void)
lkgs_init();
}
-void identify_secondary_cpu(struct cpuinfo_x86 *c)
+void identify_secondary_cpu(unsigned int cpu)
{
- BUG_ON(c == &boot_cpu_data);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
+ c->cpu_index = cpu;
+
identify_cpu(c);
#ifdef CONFIG_X86_32
enable_sep_cpu();
@@ -1975,6 +2106,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
update_gds_msr();
tsx_ap_init();
+ c->initialized = true;
}
void print_cpu_info(struct cpuinfo_x86 *c)
@@ -2005,29 +2137,42 @@ void print_cpu_info(struct cpuinfo_x86 *c)
}
/*
- * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy
- * function prevents it from becoming an environment variable for init.
+ * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param().
+ * These dummy functions prevent them from becoming an environment variable for
+ * init.
*/
+
static __init int setup_clearcpuid(char *arg)
{
return 1;
}
__setup("clearcpuid=", setup_clearcpuid);
-DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
- .current_task = &init_task,
- .preempt_count = INIT_PREEMPT_COUNT,
- .top_of_stack = TOP_OF_INIT_STACK,
-};
-EXPORT_PER_CPU_SYMBOL(pcpu_hot);
-EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);
+static __init int setup_setcpuid(char *arg)
+{
+ return 1;
+}
+__setup("setcpuid=", setup_setcpuid);
+
+DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+EXPORT_PER_CPU_SYMBOL(const_current_task);
+
+DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
#ifdef CONFIG_X86_64
-DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
- fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
-EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
+/*
+ * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+ * so that this space is reserved in the hot cache section even when the
+ * mitigation is disabled.
+ */
+DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+EXPORT_PER_CPU_SYMBOL(__x86_call_depth);
-static void wrmsrl_cstar(unsigned long val)
+static void wrmsrq_cstar(unsigned long val)
{
/*
* Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR
@@ -2035,37 +2180,37 @@ static void wrmsrl_cstar(unsigned long val)
* guest. Avoid the pointless write on all Intel CPUs.
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- wrmsrl(MSR_CSTAR, val);
+ wrmsrq(MSR_CSTAR, val);
}
static inline void idt_syscall_init(void)
{
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
if (ia32_enabled()) {
- wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
+ wrmsrq_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
} else {
- wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore);
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
}
/*
* Flags to clear on syscall; clear as much as possible
* to minimize user space-kernel interference.
*/
- wrmsrl(MSR_SYSCALL_MASK,
+ wrmsrq(MSR_SYSCALL_MASK,
X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
@@ -2089,18 +2234,15 @@ void syscall_init(void)
if (!cpu_feature_enabled(X86_FEATURE_FRED))
idt_syscall_init();
}
-
-#else /* CONFIG_X86_64 */
+#endif /* CONFIG_X86_64 */
#ifdef CONFIG_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
#ifndef CONFIG_SMP
EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
#endif
-#endif /* CONFIG_X86_64 */
-
/*
* Clear all 6 debug registers:
*/
@@ -2137,7 +2279,7 @@ static inline void setup_getcpu(int cpu)
struct desc_struct d = { };
if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
- wrmsr(MSR_TSC_AUX, cpudata, 0);
+ wrmsrq(MSR_TSC_AUX, cpudata);
/* Store CPU and node number in limit. */
d.limit0 = cpudata;
@@ -2252,8 +2394,8 @@ void cpu_init(void)
memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
syscall_init();
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ wrmsrq(MSR_FS_BASE, 0);
+ wrmsrq(MSR_KERNEL_GS_BASE, 0);
barrier();
x2apic_setup();
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1beccefbaff9..bc38b2d56f26 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,14 +33,6 @@ struct cpu_dev {
#endif
};
-struct _tlb_table {
- unsigned char descriptor;
- char tlb_type;
- unsigned int entries;
- /* unsigned int ways; */
- char info[128];
-};
-
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__section(".x86_cpu_dev.init") = \
@@ -83,6 +75,15 @@ extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id);
void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
+#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+struct amd_northbridge *amd_init_l3_cache(int index);
+#else
+static inline struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ return NULL;
+}
+#endif
+
unsigned int aperfmperf_get_khz(int cpu);
void cpu_select_mitigations(void);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index df838e3bdbe0..46efcbd6afa4 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -28,6 +28,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_PKU, X86_FEATURE_XSAVE },
{ X86_FEATURE_MPX, X86_FEATURE_XSAVE },
{ X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_APX, X86_FEATURE_XSAVE },
{ X86_FEATURE_CMOV, X86_FEATURE_FXSR },
{ X86_FEATURE_MMX, X86_FEATURE_FXSR },
{ X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
@@ -82,8 +83,12 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
+ { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE },
{ X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
{ X86_FEATURE_FRED, X86_FEATURE_LKGS },
+ { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL },
{}
};
@@ -147,3 +152,38 @@ void setup_clear_cpu_cap(unsigned int feature)
{
do_clear_cpu_cap(NULL, feature);
}
+
+/*
+ * Return the feature "name" if available, otherwise return
+ * the X86_FEATURE_* numerals to make it easier to identify
+ * the feature.
+ */
+static const char *x86_feature_name(unsigned int feature, char *buf)
+{
+ if (x86_cap_flags[feature])
+ return x86_cap_flags[feature];
+
+ snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32);
+
+ return buf;
+}
+
+void check_cpufeature_deps(struct cpuinfo_x86 *c)
+{
+ char feature_buf[16], depends_buf[16];
+ const struct cpuid_dep *d;
+
+ for (d = cpuid_deps; d->feature; d++) {
+ if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) {
+ /*
+ * Only warn about the first unmet dependency on the
+ * first CPU where it is encountered to avoid spamming
+ * the kernel log.
+ */
+ pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n",
+ smp_processor_id(),
+ x86_feature_name(d->feature, feature_buf),
+ x86_feature_name(d->depends, depends_buf));
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c
new file mode 100644
index 000000000000..89bc8db5e9c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sizes.h>
+
+#include <asm/cpuid/types.h>
+
+#include "cpu.h"
+
+#define CACHE_ENTRY(_desc, _type, _size) \
+ [_desc] = { \
+ .c_type = (_type), \
+ .c_size = (_size) / SZ_1K, \
+ }
+
+#define TLB_ENTRY(_desc, _type, _entries) \
+ [_desc] = { \
+ .t_type = (_type), \
+ .entries = (_entries), \
+ }
+
+const struct leaf_0x2_table cpuid_0x2_table[256] = {
+ CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */
+
+ TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */
+ TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */
+ TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */
+ TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */
+ TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative
+ * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */
+ TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */
+ TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */
+ TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */
+ TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */
+ TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */
+ TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */
+ TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */
+ TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */
+};
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 9651275aecd1..dfec2c61e354 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -153,8 +153,8 @@ static void geode_configure(void)
u8 ccr3;
local_irq_save(flags);
- /* Suspend on halt power saving and enable #SUSP pin */
- setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ /* Suspend on halt power saving */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
index cacfd3f6abef..1976fef2dfe5 100644
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -16,8 +16,8 @@ static int cpu_debug_show(struct seq_file *m, void *p)
if (!c->initialized)
return 0;
- seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid);
- seq_printf(m, "apicid: %x\n", c->topo.apicid);
+ seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: 0x%x\n", c->topo.apicid);
seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
seq_printf(m, "die_id: %u\n", c->topo.die_id);
seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 4a4118784c13..d69757246bde 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -4,6 +4,7 @@
#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/msr-index.h>
+#include <asm/msr.h>
#include <asm/processor.h>
#include <asm/vmx.h>
@@ -118,7 +119,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
bool enable_vmx;
u64 msr;
- if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
clear_cpu_cap(c, X86_FEATURE_SGX);
return;
@@ -165,7 +166,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
msr |= FEAT_CTL_SGX_LC_ENABLED;
}
- wrmsrl(MSR_IA32_FEAT_CTL, msr);
+ wrmsrq(MSR_IA32_FEAT_CTL, msr);
update_caps:
set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index c5191b06f9f2..2154f12766fb 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -15,6 +15,7 @@
#include <asm/cacheinfo.h>
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
+#include <asm/msr.h>
#include "cpu.h"
@@ -96,7 +97,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
u64 val;
- rdmsrl(MSR_K7_HWCR, val);
+ rdmsrq(MSR_K7_HWCR, val);
if (!(val & BIT(24)))
pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
}
@@ -110,7 +111,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c)
* Try to cache the base value so further operations can
* avoid RMW. If that faults, do not enable SSBD.
*/
- if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
setup_force_cpu_cap(X86_FEATURE_SSBD);
x86_amd_ls_cfg_ssbd_mask = 1ULL << 10;
@@ -194,7 +195,7 @@ static void init_hygon(struct cpuinfo_x86 *c)
init_hygon_cacheinfo(c);
if (cpu_has(c, X86_FEATURE_SVM)) {
- rdmsrl(MSR_VM_CR, vm_cr);
+ rdmsrq(MSR_VM_CR, vm_cr);
if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
clear_cpu_cap(c, X86_FEATURE_SVM);
@@ -240,26 +241,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
- tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ tlb_lli_4m = tlb_lli_2m >> 1;
}
static const struct cpu_dev hygon_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3dce22f00dc3..076eaa41b8c8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,40 +1,33 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/pgtable.h>
-#include <linux/string.h>
#include <linux/bitops.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/thread_info.h>
#include <linux/init.h>
-#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/minmax.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_X86_64
+#include <linux/topology.h>
+#endif
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
#include <asm/bugs.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
#include <asm/cpu.h>
+#include <asm/cpuid/api.h>
+#include <asm/hwcap2.h>
#include <asm/intel-family.h>
#include <asm/microcode.h>
-#include <asm/hwcap2.h>
-#include <asm/elf.h>
-#include <asm/cpu_device_id.h>
-#include <asm/resctrl.h>
+#include <asm/msr.h>
#include <asm/numa.h>
+#include <asm/resctrl.h>
#include <asm/thermal.h>
-
-#ifdef CONFIG_X86_64
-#include <linux/topology.h>
-#endif
+#include <asm/uaccess.h>
#include "cpu.h"
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#endif
-
/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
@@ -166,7 +159,7 @@ static void detect_tme_early(struct cpuinfo_x86 *c)
u64 tme_activate;
int keyid_bits;
- rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
+ rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate);
if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
pr_info_once("x86/tme: not enabled by BIOS\n");
@@ -195,7 +188,7 @@ void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
- if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd))
+ if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN)
return;
/*
@@ -210,10 +203,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
@@ -256,8 +245,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
#endif
/* CPUID workaround for 0F33/0F34 CPU */
- if (c->x86 == 0xF && c->x86_model == 0x3
- && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
+ if (c->x86_vfm == INTEL_P4_PRESCOTT &&
+ (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
c->x86_phys_bits = 36;
/*
@@ -266,10 +255,16 @@ static void early_init_intel(struct cpuinfo_x86 *c)
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
+ *
+ * Use a model-specific check for some older CPUs that have invariant
+ * TSC but may not report it architecturally via 8000_0007.
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) ||
+ (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -298,12 +293,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_PAT);
/*
- * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
- * clear the fast string and enhanced fast string CPU capabilities.
+ * Modern CPUs are generally expected to have a sane fast string
+ * implementation. However, BIOSes typically have a knob to tweak
+ * the architectural MISC_ENABLE.FAST_STRING enable bit.
+ *
+ * Adhere to the preference and program the Linux-defined fast
+ * string flag and enhanced fast string capabilities accordingly.
*/
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) {
+ rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
+ /* X86_FEATURE_ERMS is set based on CPUID */
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ } else {
pr_info("Disabled fast string operations\n");
setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
setup_clear_cpu_cap(X86_FEATURE_ERMS);
@@ -350,9 +352,7 @@ static void bsp_init_intel(struct cpuinfo_x86 *c)
int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
boot_cpu_data.x86_stepping < 8) {
pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
@@ -369,9 +369,8 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
/*
* Mask B, Pentium, but not Pentium MMX
*/
- if (c->x86 == 5 &&
- c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
- c->x86_model <= 3) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4) {
/*
* Remember we have B step Pentia with bugs
*/
@@ -398,7 +397,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
- if (c->x86 == 5 && c->x86_model < 9) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);
@@ -413,7 +412,8 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
+ if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) ||
+ c->x86_vfm < INTEL_PENTIUM_II_KLAMATH)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
@@ -431,7 +431,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
- if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
+ if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) {
if (msr_set_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
pr_info("CPU: C0 stepping P4 Xeon detected.\n");
@@ -445,27 +445,20 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
- if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+ if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 &&
(c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
set_cpu_bug(c, X86_BUG_11AP);
-
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
- * Set up the preferred alignment for movsl bulk memory moves
+ * MOVSL bulk memory moves can be slow when source and dest are not
+ * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment.
+ *
+ * Set the preferred alignment for Pentium Pro and newer processors, as
+ * it has only been tested on these.
*/
- switch (c->x86) {
- case 4: /* 486: untested */
- break;
- case 5: /* Old Pentia: untested */
- break;
- case 6: /* PII/PIII only like movsl with 8-byte alignment */
- movsl_mask.mask = 7;
- break;
- case 15: /* P4 is OK down to 8-byte alignment */
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO)
movsl_mask.mask = 7;
- break;
- }
#endif
intel_smp_check(c);
@@ -497,7 +490,7 @@ static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
u64 msr;
- if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) {
+ if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) {
if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
}
@@ -507,7 +500,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
{
u64 msr;
- if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+ if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr))
return;
/* Clear all MISC features */
@@ -518,9 +511,28 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
probe_xeon_phi_r3mwait(c);
msr = this_cpu_read(msr_misc_features_shadow);
- wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msr);
}
+/*
+ * This is a list of Intel CPUs that are known to suffer from downclocking when
+ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel
+ * executes SIMD-optimized code such as cryptography functions or CRCs, it
+ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ {},
+};
+
static void init_intel(struct cpuinfo_x86 *c)
{
early_init_intel(c);
@@ -563,8 +575,6 @@ static void init_intel(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
@@ -601,6 +611,9 @@ static void init_intel(struct cpuinfo_x86 *c)
}
#endif
+ if (x86_match_cpu(zmm_exclusion_list))
+ set_cpu_cap(c, X86_FEATURE_PREFER_YMM);
+
/* Work around errata */
srat_detect_node(c);
@@ -622,191 +635,90 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
- if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
+ if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0)
size = 256;
/*
* Intel Quark SoC X1000 contains a 4-way set associative
* 16K cache with a 16 byte cache line and 256 lines per tag
*/
- if ((c->x86 == 5) && (c->x86_model == 9))
+ if (c->x86_vfm == INTEL_QUARK_X1000)
size = 16;
return size;
}
#endif
-#define TLB_INST_4K 0x01
-#define TLB_INST_4M 0x02
-#define TLB_INST_2M_4M 0x03
-
-#define TLB_INST_ALL 0x05
-#define TLB_INST_1G 0x06
-
-#define TLB_DATA_4K 0x11
-#define TLB_DATA_4M 0x12
-#define TLB_DATA_2M_4M 0x13
-#define TLB_DATA_4K_4M 0x14
-
-#define TLB_DATA_1G 0x16
-
-#define TLB_DATA0_4K 0x21
-#define TLB_DATA0_4M 0x22
-#define TLB_DATA0_2M_4M 0x23
-
-#define STLB_4K 0x41
-#define STLB_4K_2M 0x42
-
-static const struct _tlb_table intel_tlb_table[] = {
- { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
- { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
- { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
- { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
- { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
- { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
- { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
- { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
- { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
- { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
- { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
- { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
- { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
- { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
- { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
- { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
- { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
- { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
- { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
- { 0x00, 0, 0 }
-};
-
-static void intel_tlb_lookup(const unsigned char desc)
+static void intel_tlb_lookup(const struct leaf_0x2_table *desc)
{
- unsigned char k;
- if (desc == 0)
- return;
+ short entries = desc->entries;
- /* look up this descriptor in the table */
- for (k = 0; intel_tlb_table[k].descriptor != desc &&
- intel_tlb_table[k].descriptor != 0; k++)
- ;
-
- if (intel_tlb_table[k].tlb_type == 0)
- return;
-
- switch (intel_tlb_table[k].tlb_type) {
+ switch (desc->t_type) {
case STLB_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case STLB_4K_2M:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_INST_ALL:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
break;
case TLB_INST_4M:
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_2M_4M:
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_DATA_4K:
case TLB_DATA0_4K:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case TLB_DATA_4M:
case TLB_DATA0_4M:
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_2M_4M:
case TLB_DATA0_2M_4M:
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_4K_4M:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
+ case TLB_DATA_1G_2M_4M:
+ tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES);
+ tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES);
+ fallthrough;
case TLB_DATA_1G:
- if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_1g = max(tlb_lld_1g, entries);
break;
}
}
static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
- int i, j, n;
- unsigned int regs[4];
- unsigned char *desc = (unsigned char *)regs;
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
if (c->cpuid_level < 2)
return;
- /* Number of times to iterate */
- n = cpuid_eax(2) & 0xFF;
-
- for (i = 0 ; i < n ; i++) {
- cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
-
- /* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
- if (regs[j] & (1 << 31))
- regs[j] = 0;
-
- /* Byte 0 is level count, not a descriptor */
- for (j = 1 ; j < 16 ; j++)
- intel_tlb_lookup(desc[j]);
- }
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc)
+ intel_tlb_lookup(desc);
}
static const struct cpu_dev intel_cpu_dev = {
@@ -873,34 +785,3 @@ static const struct cpu_dev intel_cpu_dev = {
};
cpu_dev_register(intel_cpu_dev);
-
-#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24
-
-/**
- * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU
- *
- * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in
- * a hybrid processor. If the processor is not hybrid, returns 0.
- */
-u8 get_this_hybrid_cpu_type(void)
-{
- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- return 0;
-
- return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
-}
-
-/**
- * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU
- *
- * Returns the uarch native ID [23:0] of a CPU in a hybrid processor.
- * If the processor is not hybrid, returns 0.
- */
-u32 get_this_hybrid_cpu_native_id(void)
-{
- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- return 0;
-
- return cpuid_eax(0x0000001a) &
- (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1);
-}
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index 30b1d63b97f3..bc7671f920a7 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -79,7 +79,7 @@ static int intel_epb_save(void)
{
u64 epb;
- rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
/*
* Ensure that saved_epb will always be nonzero after this write even if
* the EPB value read from the MSR is 0.
@@ -94,7 +94,7 @@ static void intel_epb_restore(void)
u64 val = this_cpu_read(saved_epb);
u64 epb;
- rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
if (val) {
val &= EPB_MASK;
} else {
@@ -111,7 +111,7 @@ static void intel_epb_restore(void)
pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
}
}
- wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
+ wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
}
static struct syscore_ops intel_epb_syscore_ops = {
@@ -135,7 +135,7 @@ static ssize_t energy_perf_bias_show(struct device *dev,
u64 epb;
int ret;
- ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
if (ret < 0)
return ret;
@@ -157,11 +157,11 @@ static ssize_t energy_perf_bias_store(struct device *dev,
else if (kstrtou64(buf, 0, &val) || val > MAX_EPB)
return -EINVAL;
- ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
if (ret < 0)
return ret;
- ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS,
+ ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS,
(epb & ~EPB_MASK) | val);
if (ret < 0)
return ret;
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 4f3c65429f82..6af1e8baeb0f 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -6,6 +6,34 @@
#include <linux/slab.h>
/**
+ * x86_match_vendor_cpu_type - helper function to match the hardware defined
+ * cpu-type for a single entry in the x86_cpu_id
+ * table. Note, this function does not match the
+ * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and
+ * TOPO_CPU_TYPE_PERFORMANCE.
+ * @c: Pointer to the cpuinfo_x86 structure of the CPU to match.
+ * @m: Pointer to the x86_cpu_id entry to match against.
+ *
+ * Return: true if the cpu-type matches, false otherwise.
+ */
+static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m)
+{
+ if (m->type == X86_CPU_TYPE_ANY)
+ return true;
+
+ /* Hybrid CPUs are special, they are assumed to match all cpu-types */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return true;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ return m->type == c->topo.intel_type;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return m->type == c->topo.amd_type;
+
+ return false;
+}
+
+/**
* x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
@@ -50,6 +78,8 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
+ if (!x86_match_vendor_cpu_type(c, m))
+ continue;
return m;
}
return NULL;
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1075a90141da..9d852c3b2cb5 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -662,12 +662,12 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
return;
}
- rdmsrl(MSR_K7_HWCR, hwcr);
+ rdmsrq(MSR_K7_HWCR, hwcr);
/* McStatusWrEn has to be set */
need_toggle = !(hwcr & BIT(18));
if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+ wrmsrq(MSR_K7_HWCR, hwcr | BIT(18));
/* Clear CntP bit safely */
for (i = 0; i < num_msrs; i++)
@@ -675,7 +675,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
/* restore old settings */
if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
+ wrmsrq(MSR_K7_HWCR, hwcr);
}
/* cpu init entry point, called from mce.c with preempt off */
@@ -805,12 +805,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
}
if (mce_flags.smca) {
- rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
+ rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
if (m->status & MCI_STATUS_SYNDV) {
- rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
- rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
- rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
+ rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
+ rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
+ rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
}
}
@@ -834,16 +834,16 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
{
u64 status, addr = 0;
- rdmsrl(msr_stat, status);
+ rdmsrq(msr_stat, status);
if (!(status & MCI_STATUS_VAL))
return false;
if (status & MCI_STATUS_ADDRV)
- rdmsrl(msr_addr, addr);
+ rdmsrq(msr_addr, addr);
__log_error(bank, status, addr, misc);
- wrmsrl(msr_stat, 0);
+ wrmsrq(msr_stat, 0);
return status & MCI_STATUS_DEFERRED;
}
@@ -862,7 +862,7 @@ static bool _log_error_deferred(unsigned int bank, u32 misc)
return true;
/* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
return true;
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 0dc00c9894c7..e9b3c5d4a52e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -121,7 +121,7 @@ void mce_prep_record_common(struct mce *m)
{
m->cpuid = cpuid_eax(1);
m->cpuvendor = boot_cpu_data.x86_vendor;
- m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
+ m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP);
/* need the internal __ version to avoid deadlocks */
m->time = __ktime_get_real_seconds();
}
@@ -388,9 +388,9 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
}
/* MSR access wrappers used for error injection */
-noinstr u64 mce_rdmsrl(u32 msr)
+noinstr u64 mce_rdmsrq(u32 msr)
{
- DECLARE_ARGS(val, low, high);
+ EAX_EDX_DECLARE_ARGS(val, low, high);
if (__this_cpu_read(injectm.finished)) {
int offset;
@@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrl(u32 msr)
return EAX_EDX_VAL(val, low, high);
}
-static noinstr void mce_wrmsrl(u32 msr, u64 v)
+static noinstr void mce_wrmsrq(u32 msr, u64 v)
{
u32 low, high;
@@ -444,7 +444,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
low = (u32)v;
high = (u32)(v >> 32);
- /* See comment in mce_rdmsrl() */
+ /* See comment in mce_rdmsrq() */
asm volatile("1: wrmsr\n"
"2:\n"
_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
@@ -468,7 +468,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs
instrumentation_end();
m = &err->m;
- m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
if (regs) {
/*
* Get the address of the instruction at the time of
@@ -488,7 +488,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs
}
/* Use accurate RIP reporting if available. */
if (mca_cfg.rip_msr)
- m->ip = mce_rdmsrl(mca_cfg.rip_msr);
+ m->ip = mce_rdmsrq(mca_cfg.rip_msr);
}
}
@@ -584,6 +584,28 @@ bool mce_is_correctable(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+static bool mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return true;
+ }
+
+ return false;
+}
+
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -662,10 +684,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
struct mce *m = &err->m;
if (m->status & MCI_STATUS_MISCV)
- m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
+ m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
+ m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
/*
* Mask the reported address by the reported granularity.
@@ -680,12 +702,12 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
}
if (mce_flags.smca) {
- m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+ m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i));
if (m->status & MCI_STATUS_SYNDV) {
- m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
- err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
- err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
+ m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i));
}
}
}
@@ -731,7 +753,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m->bank = i;
barrier();
- m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
/*
* Update storm tracking here, before checking for the
@@ -807,7 +829,7 @@ clear_it:
/*
* Clear state for this bank.
*/
- mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+ mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
}
/*
@@ -865,8 +887,8 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
*/
static noinstr bool quirk_skylake_repmov(void)
{
- u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
+ u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE);
u64 mc1_status;
/*
@@ -877,7 +899,7 @@ static noinstr bool quirk_skylake_repmov(void)
!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
return false;
- mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+ mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1));
/* Check for a software-recoverable data fetch error. */
if ((mc1_status &
@@ -888,8 +910,8 @@ static noinstr bool quirk_skylake_repmov(void)
MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
MCI_STATUS_AR | MCI_STATUS_S)) {
misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+ mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
+ mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0);
instrumentation_begin();
pr_err_once("Erratum detected, disable fast string copy instructions.\n");
@@ -933,7 +955,7 @@ static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, un
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
@@ -1252,7 +1274,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear)
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (arch_test_bit(i, toclear))
- mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+ mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1276,7 +1298,7 @@ static noinstr bool mce_check_crashing_cpu(void)
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
- mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
+ mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS);
if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
if (mcgstatus & MCG_STATUS_LMCES)
@@ -1284,7 +1306,7 @@ static noinstr bool mce_check_crashing_cpu(void)
}
if (mcgstatus & MCG_STATUS_RIPV) {
- __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
+ native_wrmsrq(MSR_IA32_MCG_STATUS, 0);
return true;
}
}
@@ -1313,7 +1335,7 @@ __mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
m->addr = 0;
m->bank = i;
- m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
@@ -1671,7 +1693,7 @@ out:
instrumentation_end();
clear:
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ mce_wrmsrq(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1764,36 +1786,14 @@ void mce_timer_kick(bool storm)
__this_cpu_write(mce_next_interval, check_interval * HZ);
}
-/* Must not be called in IRQ context where del_timer_sync() can deadlock */
+/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
static void mce_timer_delete_all(void)
{
int cpu;
for_each_online_cpu(cpu)
- del_timer_sync(&per_cpu(mce_timer, cpu));
-}
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-bool mce_notify_irq(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- if (test_and_clear_bit(0, &mce_need_notify)) {
- mce_work_trigger();
-
- if (__ratelimit(&ratelimit))
- pr_info(HW_ERR "Machine check events logged\n");
-
- return true;
- }
- return false;
+ timer_delete_sync(&per_cpu(mce_timer, cpu));
}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
static void __mcheck_cpu_mce_banks_init(void)
{
@@ -1822,7 +1822,7 @@ static void __mcheck_cpu_cap_init(void)
u64 cap;
u8 b;
- rdmsrl(MSR_IA32_MCG_CAP, cap);
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
@@ -1863,7 +1863,7 @@ static void __mcheck_cpu_init_generic(void)
cr4_set_bits(X86_CR4_MCE);
- rdmsrl(MSR_IA32_MCG_CAP, cap);
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
}
@@ -1878,8 +1878,8 @@ static void __mcheck_cpu_init_clear_banks(void)
if (!b->init)
continue;
- wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
- wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1905,7 +1905,7 @@ static void __mcheck_cpu_check_banks(void)
if (!b->init)
continue;
- rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
+ rdmsrq(mca_msr_reg(i, MCA_CTL), msrval);
b->init = !!msrval;
}
}
@@ -2436,7 +2436,7 @@ static void mce_disable_error_reporting(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
+ wrmsrq(mca_msr_reg(i, MCA_CTL), 0);
}
return;
}
@@ -2786,7 +2786,7 @@ static void mce_reenable_cpu(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
}
}
@@ -2820,7 +2820,7 @@ static int mce_cpu_pre_down(unsigned int cpu)
struct timer_list *t = this_cpu_ptr(&mce_timer);
mce_disable_cpu();
- del_timer_sync(t);
+ timer_delete_sync(t);
mce_threshold_remove_device(cpu);
mce_device_remove(cpu);
return 0;
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 313fe682db33..d02c4f556cd0 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -24,10 +24,11 @@
#include <linux/pci.h>
#include <linux/uaccess.h>
-#include <asm/amd_nb.h>
+#include <asm/amd/nb.h>
#include <asm/apic.h>
#include <asm/irq_vectors.h>
#include <asm/mce.h>
+#include <asm/msr.h>
#include <asm/nmi.h>
#include <asm/smp.h>
@@ -229,7 +230,6 @@ static int raise_local(void)
} else if (m->status) {
pr_info("Starting machine check poll CPU %d\n", cpu);
raise_poll(m);
- mce_notify_irq();
pr_info("Machine check poll done on CPU %d\n", cpu);
} else
m->finished = 0;
@@ -476,27 +476,27 @@ static void prepare_msrs(void *info)
struct mce m = *(struct mce *)info;
u8 b = m.bank;
- wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (boot_cpu_has(X86_FEATURE_SMCA)) {
if (m.inject_flags == DFR_INT_INJ) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
} else {
- wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
}
- wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+ wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
if (m.misc)
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
} else {
- wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
- wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+ wrmsrq(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr);
if (m.misc)
- wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+ wrmsrq(MSR_IA32_MCx_MISC(b), m.misc);
}
}
@@ -590,7 +590,7 @@ static int inj_bank_set(void *data, u64 val)
u64 cap;
/* Get bank count on target CPU so we can handle non-uniform values. */
- rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap);
+ rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap);
n_banks = cap & MCG_BANKCNT_MASK;
if (val >= n_banks) {
@@ -614,7 +614,7 @@ static int inj_bank_set(void *data, u64 val)
if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
u64 ipid;
- if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
+ if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
pr_err("Error reading IPID on CPU%d\n", m->extcpu);
return -EINVAL;
}
@@ -742,15 +742,15 @@ static void check_hw_inj_possible(void)
u64 status = MCI_STATUS_VAL, ipid;
/* Check whether bank is populated */
- rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
+ rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
if (!ipid)
continue;
toggle_hw_mce_inject(cpu, true);
- wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
- rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
- wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0);
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status);
+ rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0);
if (!status) {
hw_injection_possible = false;
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index f863df0ff42c..efcf21e9552e 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -94,7 +94,7 @@ static bool cmci_supported(int *banks)
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
return false;
- rdmsrl(MSR_IA32_MCG_CAP, cap);
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
*banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
return !!(cap & MCG_CMCI_P);
}
@@ -106,7 +106,7 @@ static bool lmce_supported(void)
if (mca_cfg.lmce_disabled)
return false;
- rdmsrl(MSR_IA32_MCG_CAP, tmp);
+ rdmsrq(MSR_IA32_MCG_CAP, tmp);
/*
* LMCE depends on recovery support in the processor. Hence both
@@ -123,7 +123,7 @@ static bool lmce_supported(void)
* WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
* locks the MSR in the event that it wasn't already locked by BIOS.
*/
- rdmsrl(MSR_IA32_FEAT_CTL, tmp);
+ rdmsrq(MSR_IA32_FEAT_CTL, tmp);
if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
return false;
@@ -141,9 +141,9 @@ static void cmci_set_threshold(int bank, int thresh)
u64 val;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
@@ -184,7 +184,7 @@ static bool cmci_skip_bank(int bank, u64 *val)
if (test_bit(bank, mce_banks_ce_disabled))
return true;
- rdmsrl(MSR_IA32_MCx_CTL2(bank), *val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), *val);
/* Already owned by someone else? */
if (*val & MCI_CTL2_CMCI_EN) {
@@ -232,8 +232,8 @@ static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_w
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
val |= MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
/* If the enable bit did not stick, this bank should be polled. */
if (!(val & MCI_CTL2_CMCI_EN)) {
@@ -324,9 +324,9 @@ static void __cmci_disable_bank(int bank)
if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
return;
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
@@ -430,10 +430,10 @@ void intel_init_lmce(void)
if (!lmce_supported())
return;
- rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
if (!(val & MCG_EXT_CTL_LMCE_EN))
- wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
void intel_clear_lmce(void)
@@ -443,9 +443,9 @@ void intel_clear_lmce(void)
if (!lmce_supported())
return;
- rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
val &= ~MCG_EXT_CTL_LMCE_EN;
- wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val);
}
/*
@@ -460,10 +460,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
case INTEL_SANDYBRIDGE_X:
case INTEL_IVYBRIDGE_X:
case INTEL_HASWELL_X:
- if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
+ if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control))
return;
error_control |= 2;
- wrmsrl_safe(MSR_ERROR_CONTROL, error_control);
+ wrmsrq_safe(MSR_ERROR_CONTROL, error_control);
break;
}
}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 95a504ece43e..b5ba598e54cb 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -312,7 +312,7 @@ static __always_inline void pentium_machine_check(struct pt_regs *regs) {}
static __always_inline void winchip_machine_check(struct pt_regs *regs) {}
#endif
-noinstr u64 mce_rdmsrl(u32 msr);
+noinstr u64 mce_rdmsrq(u32 msr);
static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
{
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index dac4d64dfb2a..2235a7477436 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -300,13 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
copy_user = is_copy_from_user(regs);
instrumentation_end();
- switch (fixup_type) {
- case EX_TYPE_UACCESS:
- if (!copy_user)
- return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+ switch (fixup_type) {
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index a5dac7f3c0a0..097e39327942 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -23,14 +23,18 @@
#include <linux/earlycpio.h>
#include <linux/firmware.h>
+#include <linux/bsearch.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
#include <linux/pci.h>
+#include <crypto/sha2.h>
+
#include <asm/microcode.h>
#include <asm/processor.h>
+#include <asm/cmdline.h>
#include <asm/setup.h>
#include <asm/cpu.h>
#include <asm/msr.h>
@@ -145,6 +149,115 @@ ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
*/
static u32 bsp_cpuid_1_eax __ro_after_init;
+static bool sha_check = true;
+
+struct patch_digest {
+ u32 patch_id;
+ u8 sha256[SHA256_DIGEST_SIZE];
+};
+
+#include "amd_shas.c"
+
+static int cmp_id(const void *key, const void *elem)
+{
+ struct patch_digest *pd = (struct patch_digest *)elem;
+ u32 patch_id = *(u32 *)key;
+
+ if (patch_id == pd->patch_id)
+ return 0;
+ else if (patch_id < pd->patch_id)
+ return -1;
+ else
+ return 1;
+}
+
+static bool need_sha_check(u32 cur_rev)
+{
+ switch (cur_rev >> 8) {
+ case 0x80012: return cur_rev <= 0x800126f; break;
+ case 0x80082: return cur_rev <= 0x800820f; break;
+ case 0x83010: return cur_rev <= 0x830107c; break;
+ case 0x86001: return cur_rev <= 0x860010e; break;
+ case 0x86081: return cur_rev <= 0x8608108; break;
+ case 0x87010: return cur_rev <= 0x8701034; break;
+ case 0x8a000: return cur_rev <= 0x8a0000a; break;
+ case 0xa0010: return cur_rev <= 0xa00107a; break;
+ case 0xa0011: return cur_rev <= 0xa0011da; break;
+ case 0xa0012: return cur_rev <= 0xa001243; break;
+ case 0xa0082: return cur_rev <= 0xa00820e; break;
+ case 0xa1011: return cur_rev <= 0xa101153; break;
+ case 0xa1012: return cur_rev <= 0xa10124e; break;
+ case 0xa1081: return cur_rev <= 0xa108109; break;
+ case 0xa2010: return cur_rev <= 0xa20102f; break;
+ case 0xa2012: return cur_rev <= 0xa201212; break;
+ case 0xa4041: return cur_rev <= 0xa404109; break;
+ case 0xa5000: return cur_rev <= 0xa500013; break;
+ case 0xa6012: return cur_rev <= 0xa60120a; break;
+ case 0xa7041: return cur_rev <= 0xa704109; break;
+ case 0xa7052: return cur_rev <= 0xa705208; break;
+ case 0xa7080: return cur_rev <= 0xa708009; break;
+ case 0xa70c0: return cur_rev <= 0xa70C009; break;
+ case 0xaa001: return cur_rev <= 0xaa00116; break;
+ case 0xaa002: return cur_rev <= 0xaa00218; break;
+ case 0xb0021: return cur_rev <= 0xb002146; break;
+ case 0xb1010: return cur_rev <= 0xb101046; break;
+ case 0xb2040: return cur_rev <= 0xb204031; break;
+ case 0xb4040: return cur_rev <= 0xb404031; break;
+ case 0xb6000: return cur_rev <= 0xb600031; break;
+ case 0xb7000: return cur_rev <= 0xb700031; break;
+ default: break;
+ }
+
+ pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
+ pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
+ return true;
+}
+
+static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
+{
+ struct patch_digest *pd = NULL;
+ u8 digest[SHA256_DIGEST_SIZE];
+ int i;
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17)
+ return true;
+
+ if (!need_sha_check(cur_rev))
+ return true;
+
+ if (!sha_check)
+ return true;
+
+ pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id);
+ if (!pd) {
+ pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id);
+ return false;
+ }
+
+ sha256(data, len, digest);
+
+ if (memcmp(digest, pd->sha256, sizeof(digest))) {
+ pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id);
+
+ for (i = 0; i < SHA256_DIGEST_SIZE; i++)
+ pr_cont("0x%x ", digest[i]);
+ pr_info("\n");
+
+ return false;
+ }
+
+ return true;
+}
+
+static u32 get_patch_level(void)
+{
+ u32 rev, dummy __always_unused;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ return rev;
+}
+
static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
{
union zen_patch_rev p;
@@ -246,8 +359,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
* On success, @sh_psize returns the patch size according to the section header,
* to the caller.
*/
-static bool
-__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
+static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
{
u32 p_type, p_size;
const u32 *hdr;
@@ -484,12 +596,15 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
}
}
-static bool __apply_microcode_amd(struct microcode_amd *mc, unsigned int psize)
+static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
+ unsigned int psize)
{
unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
- u32 rev, dummy;
- native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr);
+ if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
+ return false;
+
+ native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr);
if (x86_family(bsp_cpuid_1_eax) == 0x17) {
unsigned long p_addr_end = p_addr + psize - 1;
@@ -505,47 +620,13 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, unsigned int psize)
}
/* verify patch application was successful */
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- if (rev != mc->hdr.patch_id)
+ *cur_rev = get_patch_level();
+ if (*cur_rev != mc->hdr.patch_id)
return false;
return true;
}
-/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
- *
- * Returns true if container found (sets @desc), false otherwise.
- */
-static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size)
-{
- struct cont_desc desc = { 0 };
- struct microcode_amd *mc;
-
- scan_containers(ucode, size, &desc);
-
- mc = desc.mc;
- if (!mc)
- return false;
-
- /*
- * Allow application of the same revision to pick up SMT-specific
- * changes even if the revision of the other SMT thread is already
- * up-to-date.
- */
- if (old_rev > mc->hdr.patch_id)
- return false;
-
- return __apply_microcode_amd(mc, desc.psize);
-}
-
static bool get_builtin_microcode(struct cpio_data *cp)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
@@ -583,14 +664,35 @@ static bool __init find_blobs_in_containers(struct cpio_data *ret)
return found;
}
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
{
+ struct cont_desc desc = { };
+ struct microcode_amd *mc;
struct cpio_data cp = { };
- u32 dummy;
+ char buf[4];
+ u32 rev;
+
+ if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) {
+ if (!strncmp(buf, "off", 3)) {
+ sha_check = false;
+ pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+ }
bsp_cpuid_1_eax = cpuid_1_eax;
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy);
+ rev = get_patch_level();
+ ed->old_rev = rev;
/* Needed in load_microcode_amd() */
ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
@@ -598,37 +700,23 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_
if (!find_blobs_in_containers(&cp))
return;
- if (early_apply_microcode(ed->old_rev, cp.data, cp.size))
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy);
-}
-
-static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size);
-
-static int __init save_microcode_in_initrd(void)
-{
- unsigned int cpuid_1_eax = native_cpuid_eax(1);
- struct cpuinfo_x86 *c = &boot_cpu_data;
- struct cont_desc desc = { 0 };
- enum ucode_state ret;
- struct cpio_data cp;
-
- if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
- return 0;
-
- if (!find_blobs_in_containers(&cp))
- return -EINVAL;
-
scan_containers(cp.data, cp.size, &desc);
- if (!desc.mc)
- return -EINVAL;
- ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
- if (ret > UCODE_UPDATED)
- return -EINVAL;
+ mc = desc.mc;
+ if (!mc)
+ return;
- return 0;
+ /*
+ * Allow application of the same revision to pick up SMT-specific
+ * changes even if the revision of the other SMT thread is already
+ * up-to-date.
+ */
+ if (ed->old_rev > mc->hdr.patch_id)
+ return;
+
+ if (__apply_microcode_amd(mc, &rev, desc.psize))
+ ed->new_rev = rev;
}
-early_initcall(save_microcode_in_initrd);
static inline bool patch_cpus_equivalent(struct ucode_patch *p,
struct ucode_patch *n,
@@ -729,14 +817,9 @@ static void free_cache(void)
static struct ucode_patch *find_patch(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u32 rev, dummy __always_unused;
u16 equiv_id = 0;
- /* fetch rev if not populated yet: */
- if (!uci->cpu_sig.rev) {
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- uci->cpu_sig.rev = rev;
- }
+ uci->cpu_sig.rev = get_patch_level();
if (x86_family(bsp_cpuid_1_eax) < 0x17) {
equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
@@ -759,22 +842,20 @@ void reload_ucode_amd(unsigned int cpu)
mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
+ rev = get_patch_level();
if (rev < mc->hdr.patch_id) {
- if (__apply_microcode_amd(mc, p->size))
- pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id);
+ if (__apply_microcode_amd(mc, &rev, p->size))
+ pr_info_once("reload revision: 0x%08x\n", rev);
}
}
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct ucode_patch *p;
csig->sig = cpuid_eax(0x00000001);
- csig->rev = c->microcode;
+ csig->rev = get_patch_level();
/*
* a patch could have been loaded early, set uci->mc so that
@@ -815,7 +896,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
goto out;
}
- if (!__apply_microcode_amd(mc_amd, p->size)) {
+ if (!__apply_microcode_amd(mc_amd, &rev, p->size)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
return UCODE_ERROR;
@@ -937,8 +1018,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
}
/* Scan the blob in @data and add microcode patches to the cache. */
-static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
- size_t size)
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size)
{
u8 *fw = (u8 *)data;
size_t offset;
@@ -996,7 +1076,7 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
if (ret != UCODE_OK)
return ret;
- for_each_node(nid) {
+ for_each_node_with_cpus(nid) {
cpu = cpumask_first(cpumask_of_node(nid));
c = &cpu_data(cpu);
@@ -1013,6 +1093,34 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
return ret;
}
+static int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ struct cont_desc desc = { 0 };
+ unsigned int cpuid_1_eax;
+ enum ucode_state ret;
+ struct cpio_data cp;
+
+ if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ if (!find_blobs_in_containers(&cp))
+ return -EINVAL;
+
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
+
+ ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
+ if (ret > UCODE_UPDATED)
+ return -EINVAL;
+
+ return 0;
+}
+early_initcall(save_microcode_in_initrd);
+
/*
* AMD microcode firmware naming convention, up to family 15h they are in
* the legacy file:
@@ -1067,11 +1175,18 @@ static void microcode_fini_cpu_amd(int cpu)
uci->mc = NULL;
}
+static void finalize_late_load_amd(int result)
+{
+ if (result)
+ cleanup();
+}
+
static struct microcode_ops microcode_amd_ops = {
.request_microcode_fw = request_microcode_amd,
.collect_cpu_info = collect_cpu_info_amd,
.apply_microcode = apply_microcode_amd,
.microcode_fini_cpu = microcode_fini_cpu_amd,
+ .finalize_late_load = finalize_late_load_amd,
.nmi_safe = true,
};
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
new file mode 100644
index 000000000000..2a1655b1fdd8
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -0,0 +1,444 @@
+/* Keep 'em sorted. */
+static const struct patch_digest phashes[] = {
+ { 0x8001227, {
+ 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b,
+ 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46,
+ 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8,
+ 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18,
+ }
+ },
+ { 0x8001250, {
+ 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60,
+ 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa,
+ 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3,
+ 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19,
+ }
+ },
+ { 0x800126e, {
+ 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c,
+ 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3,
+ 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6,
+ 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43,
+ }
+ },
+ { 0x800126f, {
+ 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec,
+ 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b,
+ 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18,
+ 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33,
+ }
+ },
+ { 0x800820d, {
+ 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59,
+ 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67,
+ 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c,
+ 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e,
+ }
+ },
+ { 0x8301025, {
+ 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36,
+ 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1,
+ 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30,
+ 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77,
+ }
+ },
+ { 0x8301055, {
+ 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a,
+ 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea,
+ 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b,
+ 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b,
+ }
+ },
+ { 0x8301072, {
+ 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e,
+ 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28,
+ 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1,
+ 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30,
+ }
+ },
+ { 0x830107a, {
+ 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72,
+ 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34,
+ 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20,
+ 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a,
+ }
+ },
+ { 0x830107b, {
+ 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad,
+ 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a,
+ 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04,
+ 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19,
+ }
+ },
+ { 0x830107c, {
+ 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47,
+ 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac,
+ 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3,
+ 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38,
+ }
+ },
+ { 0x860010d, {
+ 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0,
+ 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7,
+ 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c,
+ 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8,
+ }
+ },
+ { 0x8608108, {
+ 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2,
+ 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38,
+ 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc,
+ 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd,
+ }
+ },
+ { 0x8701034, {
+ 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83,
+ 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d,
+ 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8,
+ 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b,
+ }
+ },
+ { 0x8a00008, {
+ 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e,
+ 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e,
+ 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72,
+ 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c,
+ }
+ },
+ { 0x8a0000a, {
+ 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c,
+ 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44,
+ 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb,
+ 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20,
+ }
+ },
+ { 0xa00104c, {
+ 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe,
+ 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76,
+ 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b,
+ 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b,
+ }
+ },
+ { 0xa00104e, {
+ 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2,
+ 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0,
+ 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e,
+ 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b,
+ }
+ },
+ { 0xa001053, {
+ 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d,
+ 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25,
+ 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb,
+ 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3,
+ }
+ },
+ { 0xa001058, {
+ 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36,
+ 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19,
+ 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec,
+ 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2,
+ }
+ },
+ { 0xa001075, {
+ 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9,
+ 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a,
+ 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f,
+ 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0,
+ }
+ },
+ { 0xa001078, {
+ 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25,
+ 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce,
+ 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04,
+ 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e,
+ }
+ },
+ { 0xa001079, {
+ 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb,
+ 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93,
+ 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb,
+ 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a,
+ }
+ },
+ { 0xa00107a, {
+ 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f,
+ 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd,
+ 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f,
+ 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18,
+ }
+ },
+ { 0xa001143, {
+ 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80,
+ 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42,
+ 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d,
+ 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8,
+ }
+ },
+ { 0xa001144, {
+ 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41,
+ 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b,
+ 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25,
+ 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc,
+ }
+ },
+ { 0xa00115d, {
+ 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd,
+ 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75,
+ 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e,
+ 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05,
+ }
+ },
+ { 0xa001173, {
+ 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a,
+ 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35,
+ 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3,
+ 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e,
+ }
+ },
+ { 0xa0011a8, {
+ 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b,
+ 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6,
+ 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4,
+ 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e,
+ }
+ },
+ { 0xa0011ce, {
+ 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71,
+ 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86,
+ 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e,
+ 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a,
+ }
+ },
+ { 0xa0011d1, {
+ 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e,
+ 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09,
+ 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5,
+ 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8,
+ }
+ },
+ { 0xa0011d3, {
+ 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b,
+ 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6,
+ 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00,
+ 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26,
+ }
+ },
+ { 0xa0011d5, {
+ 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13,
+ 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7,
+ 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62,
+ 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
+ }
+ },
+ { 0xa001223, {
+ 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
+ 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
+ 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15,
+ 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45,
+ }
+ },
+ { 0xa001224, {
+ 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25,
+ 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad,
+ 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9,
+ 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a,
+ }
+ },
+ { 0xa001227, {
+ 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad,
+ 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d,
+ 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9,
+ 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0,
+ }
+ },
+ { 0xa001229, {
+ 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6,
+ 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75,
+ 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4,
+ 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d,
+ }
+ },
+ { 0xa00122e, {
+ 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf,
+ 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15,
+ 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa,
+ 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10,
+ }
+ },
+ { 0xa001231, {
+ 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e,
+ 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64,
+ 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b,
+ 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd,
+ }
+ },
+ { 0xa001234, {
+ 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7,
+ 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc,
+ 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b,
+ 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a,
+ }
+ },
+ { 0xa001236, {
+ 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78,
+ 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d,
+ 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b,
+ 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25,
+ }
+ },
+ { 0xa001238, {
+ 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc,
+ 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a,
+ 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e,
+ 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
+ }
+ },
+ { 0xa00820c, {
+ 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
+ 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
+ 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1,
+ 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
+ }
+ },
+ { 0xa10113e, {
+ 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
+ 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
+ 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5,
+ 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53,
+ }
+ },
+ { 0xa101144, {
+ 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26,
+ 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09,
+ 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc,
+ 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47,
+ }
+ },
+ { 0xa101148, {
+ 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90,
+ 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f,
+ 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08,
+ 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
+ }
+ },
+ { 0xa10123e, {
+ 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
+ 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
+ 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc,
+ 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b,
+ }
+ },
+ { 0xa101244, {
+ 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c,
+ 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1,
+ 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72,
+ 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0,
+ }
+ },
+ { 0xa101248, {
+ 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e,
+ 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22,
+ 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e,
+ 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
+ }
+ },
+ { 0xa108108, {
+ 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
+ 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
+ 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c,
+ 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
+ }
+ },
+ { 0xa20102d, {
+ 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
+ 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
+ 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23,
+ 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
+ }
+ },
+ { 0xa201210, {
+ 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
+ 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
+ 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68,
+ 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
+ }
+ },
+ { 0xa404107, {
+ 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
+ 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
+ 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81,
+ 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
+ }
+ },
+ { 0xa500011, {
+ 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
+ 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
+ 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2,
+ 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
+ }
+ },
+ { 0xa601209, {
+ 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
+ 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
+ 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46,
+ 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
+ }
+ },
+ { 0xa704107, {
+ 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
+ 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
+ 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2,
+ 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
+ }
+ },
+ { 0xa705206, {
+ 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
+ 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
+ 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b,
+ 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
+ }
+ },
+ { 0xa708007, {
+ 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
+ 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
+ 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80,
+ 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
+ }
+ },
+ { 0xa70c005, {
+ 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
+ 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
+ 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55,
+ 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
+ }
+ },
+ { 0xaa00116, {
+ 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
+ 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
+ 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d,
+ 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9,
+ }
+ },
+ { 0xaa00212, {
+ 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75,
+ 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71,
+ 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2,
+ 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1,
+ }
+ },
+ { 0xaa00213, {
+ 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a,
+ 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f,
+ 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed,
+ 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b,
+ }
+ },
+ { 0xaa00215, {
+ 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9,
+ 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4,
+ 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b,
+ 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
+ }
+ },
+};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b3658d11e7b6..fe50eb5b7c4a 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -37,12 +37,13 @@
#include <asm/perf_event.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
+#include <asm/msr.h>
#include <asm/setup.h>
#include "internal.h"
-static struct microcode_ops *microcode_ops;
-bool dis_ucode_ldr = true;
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr = false;
bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
@@ -84,6 +85,9 @@ static bool amd_check_current_patch_level(void)
u32 lvl, dummy, i;
u32 *levels;
+ if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+ return false;
+
native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
levels = final_levels;
@@ -95,27 +99,29 @@ static bool amd_check_current_patch_level(void)
return false;
}
-static bool __init check_loader_disabled_bsp(void)
+bool __init microcode_loader_disabled(void)
{
- static const char *__dis_opt_str = "dis_ucode_ldr";
- const char *cmdline = boot_command_line;
- const char *option = __dis_opt_str;
+ if (dis_ucode_ldr)
+ return true;
/*
- * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
- * completely accurate as xen pv guests don't see that CPUID bit set but
- * that's good enough as they don't land on the BSP path anyway.
+ * Disable when:
+ *
+ * 1) The CPU does not support CPUID.
+ *
+ * 2) Bit 31 in CPUID[1]:ECX is clear
+ * The bit is reserved for hypervisor use. This is still not
+ * completely accurate as XEN PV guests don't see that CPUID bit
+ * set, but that's good enough as they don't land on the BSP
+ * path anyway.
+ *
+ * 3) Certain AMD patch levels are not allowed to be
+ * overwritten.
*/
- if (native_cpuid_ecx(1) & BIT(31))
- return true;
-
- if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
- if (amd_check_current_patch_level())
- return true;
- }
-
- if (cmdline_find_option_bool(cmdline, option) <= 0)
- dis_ucode_ldr = false;
+ if (!cpuid_feature() ||
+ native_cpuid_ecx(1) & BIT(31) ||
+ amd_check_current_patch_level())
+ dis_ucode_ldr = true;
return dis_ucode_ldr;
}
@@ -125,7 +131,10 @@ void __init load_ucode_bsp(void)
unsigned int cpuid_1_eax;
bool intel = true;
- if (!have_cpuid_p())
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+
+ if (microcode_loader_disabled())
return;
cpuid_1_eax = native_cpuid_eax(1);
@@ -146,9 +155,6 @@ void __init load_ucode_bsp(void)
return;
}
- if (check_loader_disabled_bsp())
- return;
-
if (intel)
load_ucode_intel_bsp(&early_data);
else
@@ -159,6 +165,11 @@ void load_ucode_ap(void)
{
unsigned int cpuid_1_eax;
+ /*
+ * Can't use microcode_loader_disabled() here - .init section
+ * hell. It doesn't have to either - the BSP variant must've
+ * parsed cmdline already anyway.
+ */
if (dis_ucode_ldr)
return;
@@ -686,6 +697,8 @@ static int load_late_locked(void)
return load_late_stop_cpus(true);
case UCODE_NFOUND:
return -ENOENT;
+ case UCODE_OK:
+ return 0;
default:
return -EBADFD;
}
@@ -810,7 +823,7 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
- if (dis_ucode_ldr)
+ if (microcode_loader_disabled())
return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
new file mode 100644
index 000000000000..cb6e601701ab
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
@@ -0,0 +1,150 @@
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 },
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index f3d534807d91..371ca6eac00e 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -74,7 +74,7 @@ void intel_collect_cpu_info(struct cpu_signature *sig)
sig->pf = 0;
sig->rev = intel_get_microcode_revision();
- if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) {
+ if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) {
unsigned int val[2];
/* get processor flags from MSR 0x17 */
@@ -320,7 +320,7 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
}
/* write microcode via MSR 0x79 */
- native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+ native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
rev = intel_get_microcode_revision();
if (rev != mc->hdr.rev)
@@ -389,7 +389,7 @@ static int __init save_builtin_microcode(void)
if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
return 0;
- if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
uci.mc = get_microcode_blob(&uci, true);
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index 21776c529fa9..50a9702ae4e2 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -94,20 +94,17 @@ static inline unsigned int x86_cpuid_family(void)
return x86_family(eax);
}
-extern bool dis_ucode_ldr;
extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
void load_ucode_amd_ap(unsigned int family);
-int save_microcode_in_initrd_amd(unsigned int family);
void reload_ucode_amd(unsigned int cpu);
struct microcode_ops *init_amd_microcode(void);
void exit_amd_microcode(void);
#else /* CONFIG_CPU_SUP_AMD */
static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
static inline void load_ucode_amd_ap(unsigned int family) { }
-static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
static inline void reload_ucode_amd(unsigned int cpu) { }
static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
static inline void exit_amd_microcode(void) { }
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f285757618fc..c78f860419d6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -30,11 +30,10 @@
#include <asm/reboot.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
+#include <asm/msr.h>
#include <asm/numa.h>
#include <asm/svm.h>
-/* Is Linux running as the root partition? */
-bool hv_root_partition;
/* Is Linux running on nested Microsoft Hypervisor */
bool hv_nested;
struct ms_hyperv_info ms_hyperv;
@@ -72,7 +71,7 @@ u64 hv_get_non_nested_msr(unsigned int reg)
if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present)
hv_ivm_msr_read(reg, &value);
else
- rdmsrl(reg, value);
+ rdmsrq(reg, value);
return value;
}
EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
@@ -84,9 +83,9 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value)
/* Write proxy bit via wrmsl instruction */
if (hv_is_sint_msr(reg))
- wrmsrl(reg, value | 1 << 20);
+ wrmsrq(reg, value | 1 << 20);
} else {
- wrmsrl(reg, value);
+ wrmsrq(reg, value);
}
}
EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
@@ -109,6 +108,7 @@ void hv_set_msr(unsigned int reg, u64 value)
}
EXPORT_SYMBOL_GPL(hv_set_msr);
+static void (*mshv_handler)(void);
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
@@ -119,6 +119,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
struct pt_regs *old_regs = set_irq_regs(regs);
inc_irq_stat(irq_hv_callback_count);
+ if (mshv_handler)
+ mshv_handler();
+
if (vmbus_handler)
vmbus_handler();
@@ -128,6 +131,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
set_irq_regs(old_regs);
}
+void hv_setup_mshv_handler(void (*handler)(void))
+{
+ mshv_handler = handler;
+}
+
void hv_setup_vmbus_handler(void (*handler)(void))
{
vmbus_handler = handler;
@@ -338,7 +346,7 @@ static unsigned long hv_get_tsc_khz(void)
{
unsigned long freq;
- rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+ rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq);
return freq / 1000;
}
@@ -422,6 +430,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
return 0;
}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
@@ -436,13 +445,15 @@ static void __init ms_hyperv_init_platform(void)
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
- pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+ pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
+ ms_hyperv.features, ms_hyperv.priv_high,
+ ms_hyperv.ext_features, ms_hyperv.hints,
ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -451,25 +462,7 @@ static void __init ms_hyperv_init_platform(void)
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
- /*
- * Check CPU management privilege.
- *
- * To mirror what Windows does we should extract CPU management
- * features and use the ReservedIdentityBit to detect if Linux is the
- * root partition. But that requires negotiating CPU management
- * interface (a process to be finalized). For now, use the privilege
- * flag as the indicator for running as root.
- *
- * Hyper-V should never specify running as root and as a Confidential
- * VM. But to protect against a compromised/malicious Hyper-V trying
- * to exploit root behavior to expose Confidential VM memory, ignore
- * the root partition setting if also a Confidential VM.
- */
- if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
- !(ms_hyperv.priv_high & HV_ISOLATION)) {
- hv_root_partition = true;
- pr_info("Hyper-V: running as root partition\n");
- }
+ hv_identify_partition_type();
if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
hv_nested = true;
@@ -549,7 +542,7 @@ static void __init ms_hyperv_init_platform(void)
*/
u64 hv_lapic_frequency;
- rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
lapic_timer_period = hv_lapic_frequency;
pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
@@ -582,7 +575,7 @@ static void __init ms_hyperv_init_platform(void)
* setting of this MSR bit should happen before init_intel()
* is called.
*/
- wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
+ wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
@@ -618,7 +611,7 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition ||
+ if (hv_root_partition() ||
(!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 2fdfda2b60e4..8c18327eb10b 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -9,9 +9,11 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/cc_platform.h>
+#include <linux/string_choices.h>
#include <asm/processor-flags.h>
#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
@@ -591,7 +593,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (boot_cpu_has(X86_FEATURE_MTRR))
+ if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
@@ -646,10 +648,10 @@ static void __init print_mtrr_state(void)
pr_info("MTRR default type: %s\n",
mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
- pr_info("MTRR fixed ranges %sabled:\n",
- ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
- (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
- "en" : "dis");
+ pr_info("MTRR fixed ranges %s:\n",
+ str_enabled_disabled(
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)));
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -661,8 +663,8 @@ static void __init print_mtrr_state(void)
/* tail */
print_fixed_last();
}
- pr_info("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
+ pr_info("MTRR variable ranges %s:\n",
+ str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED));
high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
@@ -1025,8 +1027,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
* For Intel PPro stepping <= 7
* must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
*/
- if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
+ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
boot_cpu_data.x86_stepping <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a5c506f6da7f..4049235b1bfe 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
char *ptr;
char line[LINE_SIZE];
int length;
- size_t linelen;
memset(line, 0, LINE_SIZE);
@@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
if (length < 0)
return length;
- linelen = strlen(line);
- ptr = line + linelen - 1;
- if (linelen && *ptr == '\n')
+ ptr = line + length - 1;
+ if (length && *ptr == '\n')
*ptr = '\0';
if (!strncmp(line, "disable=", 8)) {
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 41ed01f46bd9..6571d432cbe3 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = arch_freq_get_on_cpu(cpu);
+ int freq = arch_freq_get_on_cpu(cpu);
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
+ if (freq < 0)
+ seq_puts(m, "cpu MHz\t\t: Unknown\n");
+ else
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 4a06c37b9cf1..d8a04b195da2 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,4 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
-obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
+
+# To allow define_trace.h's recursive include:
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 3d1735ed8d1f..7109cbfcad4f 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -22,6 +22,7 @@
#include <linux/cpuhotplug.h>
#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
#include <asm/resctrl.h>
#include "internal.h"
@@ -44,12 +45,6 @@ static DEFINE_MUTEX(domain_list_lock);
DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
/*
- * Used to store the max resource name width and max resource data width
- * to display the schemata in a tabular format
- */
-int max_name_width, max_data_width;
-
-/*
* Global boolean for rdt_alloc which is true if any
* resource allocation is enabled.
*/
@@ -62,19 +57,16 @@ static void mba_wrmsr_amd(struct msr_param *m);
#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
-struct rdt_hw_resource rdt_resources_all[] = {
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_L3] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_L3,
.name = "L3",
.ctrl_scope = RESCTRL_L3_CACHE,
.mon_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
.mon_domains = mon_domain_init(RDT_RESOURCE_L3),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L3_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -82,13 +74,10 @@ struct rdt_hw_resource rdt_resources_all[] = {
[RDT_RESOURCE_L2] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_L2,
.name = "L2",
.ctrl_scope = RESCTRL_L2_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L2_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -96,25 +85,19 @@ struct rdt_hw_resource rdt_resources_all[] = {
[RDT_RESOURCE_MBA] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_MBA,
.name = "MB",
.ctrl_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
[RDT_RESOURCE_SMBA] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_SMBA,
.name = "SMBA",
.ctrl_scope = RESCTRL_L3_CACHE,
.ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
};
@@ -127,6 +110,14 @@ u32 resctrl_arch_system_num_rmid_idx(void)
return r->num_rmid;
}
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
@@ -151,17 +142,16 @@ static inline void cache_alloc_hsw_probe(void)
struct rdt_resource *r = &hw_res->r_resctrl;
u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
- if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
+ if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
return;
- rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
+ rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
/* If all the bits were set in MSR, return success */
if (l3_cbm_0 != max_cbm)
return;
hw_res->num_closid = 4;
- r->default_ctrl = max_cbm;
r->cache.cbm_len = 20;
r->cache.shareable_bits = 0xc0000;
r->cache.min_cbm_bits = 2;
@@ -171,21 +161,6 @@ static inline void cache_alloc_hsw_probe(void)
rdt_alloc_capable = true;
}
-bool is_mba_sc(struct rdt_resource *r)
-{
- if (!r)
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
-
- /*
- * The software controller support is only applicable to MBA resource.
- * Make sure to check for resource type.
- */
- if (r->rid != RDT_RESOURCE_MBA)
- return false;
-
- return r->membw.mba_sc;
-}
-
/*
* rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
* exposed to user interface and the h/w understandable delay values.
@@ -217,7 +192,7 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r)
cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
max_delay = eax.split.max_delay + 1;
- r->default_ctrl = MAX_MBA_BW;
+ r->membw.max_bw = MAX_MBA_BW;
r->membw.arch_needs_linear = true;
if (ecx & MBA_IS_LINEAR) {
r->membw.delay_linear = true;
@@ -228,16 +203,12 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r)
return false;
r->membw.arch_needs_linear = false;
}
- r->data_width = 3;
if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
else
r->membw.throttle_mode = THREAD_THROTTLE_MAX;
- resctrl_file_fflags_init("thread_throttle_mode",
- RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
-
r->alloc_capable = true;
return true;
@@ -256,7 +227,7 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
hw_res->num_closid = edx + 1;
- r->default_ctrl = 1 << eax;
+ r->membw.max_bw = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
@@ -269,8 +240,6 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
r->membw.min_bw = 0;
r->membw.bw_gran = 1;
- /* Max value is 2048, Data width should be 4 in decimal */
- r->data_width = 4;
r->alloc_capable = true;
@@ -283,14 +252,13 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_ecx ecx;
union cpuid_0x10_x_edx edx;
- u32 ebx;
+ u32 ebx, default_ctrl;
cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
- r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
- r->cache.shareable_bits = ebx & r->default_ctrl;
- r->data_width = (r->cache.cbm_len + 3) / 4;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
r->alloc_capable = true;
@@ -323,7 +291,7 @@ static void mba_wrmsr_amd(struct msr_param *m)
unsigned int i;
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
/*
@@ -337,7 +305,7 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
return MAX_MBA_BW - bw;
pr_warn_once("Non Linear delay-bw map not supported but queried\n");
- return r->default_ctrl;
+ return MAX_MBA_BW;
}
static void mba_wrmsr_intel(struct msr_param *m)
@@ -348,7 +316,7 @@ static void mba_wrmsr_intel(struct msr_param *m)
/* Write the delay values for mba. */
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
+ wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
}
static void cat_wrmsr(struct msr_param *m)
@@ -358,37 +326,7 @@ static void cat_wrmsr(struct msr_param *m)
unsigned int i;
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
-}
-
-struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r)
-{
- struct rdt_ctrl_domain *d;
-
- lockdep_assert_cpus_held();
-
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
- return d;
- }
-
- return NULL;
-}
-
-struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r)
-{
- struct rdt_mon_domain *d;
-
- lockdep_assert_cpus_held();
-
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
- return d;
- }
-
- return NULL;
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
@@ -405,36 +343,6 @@ void rdt_ctrl_update(void *arg)
hw_res->msr_update(m);
}
-/*
- * rdt_find_domain - Search for a domain id in a resource domain list.
- *
- * Search the domain list to find the domain id. If the domain id is
- * found, return the domain. NULL otherwise. If the domain id is not
- * found (and NULL returned) then the first domain with id bigger than
- * the input id can be returned to the caller via @pos.
- */
-struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
- struct list_head **pos)
-{
- struct rdt_domain_hdr *d;
- struct list_head *l;
-
- list_for_each(l, h) {
- d = list_entry(l, struct rdt_domain_hdr, list);
- /* When id is found, return its domain. */
- if (id == d->id)
- return d;
- /* Stop searching when finding id's position in sorted list. */
- if (id < d->id)
- break;
- }
-
- if (pos)
- *pos = l;
-
- return NULL;
-}
-
static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
@@ -446,7 +354,7 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
* For Memory Allocation: Set b/w requested to 100%
*/
for (i = 0; i < hw_res->num_closid; i++, dc++)
- *dc = r->default_ctrl;
+ *dc = resctrl_get_default_ctrl(r);
}
static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
@@ -494,13 +402,13 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
{
size_t tsize;
- if (is_mbm_total_enabled()) {
+ if (resctrl_arch_is_mbm_total_enabled()) {
tsize = sizeof(*hw_dom->arch_mbm_total);
hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
if (!hw_dom->arch_mbm_total)
return -ENOMEM;
}
- if (is_mbm_local_enabled()) {
+ if (resctrl_arch_is_mbm_local_enabled()) {
tsize = sizeof(*hw_dom->arch_mbm_local);
hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
if (!hw_dom->arch_mbm_local) {
@@ -545,7 +453,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos);
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
if (hdr) {
if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
return;
@@ -600,7 +508,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->mon_domains, id, &add_pos);
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
if (hdr) {
if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
return;
@@ -665,7 +573,7 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->ctrl_domains, id, NULL);
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
if (!hdr) {
pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
id, cpu, r->name);
@@ -711,7 +619,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
return;
}
- hdr = rdt_find_domain(&r->mon_domains, id, NULL);
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
if (!hdr) {
pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
id, cpu, r->name);
@@ -786,20 +694,6 @@ static int resctrl_arch_offline_cpu(unsigned int cpu)
return 0;
}
-/*
- * Choose a width for the resource name and resource data based on the
- * resource that has widest name and cbm.
- */
-static __init void rdt_init_padding(void)
-{
- struct rdt_resource *r;
-
- for_each_alloc_capable_rdt_resource(r) {
- if (r->data_width > max_data_width)
- max_data_width = r->data_width;
- }
-}
-
enum {
RDT_FLAG_CMT,
RDT_FLAG_MBM_TOTAL,
@@ -825,7 +719,7 @@ struct rdt_options {
bool force_off, force_on;
};
-static struct rdt_options rdt_options[] __initdata = {
+static struct rdt_options rdt_options[] __ro_after_init = {
RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
@@ -865,7 +759,7 @@ static int __init set_rdt_options(char *str)
}
__setup("rdt", set_rdt_options);
-bool __init rdt_cpu_has(int flag)
+bool rdt_cpu_has(int flag)
{
bool ret = boot_cpu_has(flag);
struct rdt_options *o;
@@ -885,6 +779,21 @@ bool __init rdt_cpu_has(int flag)
return ret;
}
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
static __init bool get_mem_config(void)
{
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
@@ -963,11 +872,6 @@ static __init bool get_rdt_mon_resources(void)
if (!rdt_mon_features)
return false;
- if (is_mbm_local_enabled())
- mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
- else if (is_mbm_total_enabled())
- mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
-
return !rdt_get_mon_l3_config(r);
}
@@ -1086,10 +990,14 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
}
}
-static int __init resctrl_late_init(void)
+static int __init resctrl_arch_late_init(void)
{
struct rdt_resource *r;
- int state, ret;
+ int state, ret, i;
+
+ /* for_each_rdt_resource() requires all rid to be initialised. */
+ for (i = 0; i < RDT_NUM_RESOURCES; i++)
+ rdt_resources_all[i].r_resctrl.rid = i;
/*
* Initialize functions(or definitions) that are different
@@ -1102,8 +1010,6 @@ static int __init resctrl_late_init(void)
if (!get_rdt_resources())
return -ENODEV;
- rdt_init_padding();
-
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/resctrl/cat:online:",
resctrl_arch_online_cpu,
@@ -1111,7 +1017,7 @@ static int __init resctrl_late_init(void)
if (state < 0)
return state;
- ret = rdtgroup_init();
+ ret = resctrl_init();
if (ret) {
cpuhp_remove_state(state);
return ret;
@@ -1127,18 +1033,13 @@ static int __init resctrl_late_init(void)
return 0;
}
-late_initcall(resctrl_late_init);
+late_initcall(resctrl_arch_late_init);
-static void __exit resctrl_exit(void)
+static void __exit resctrl_arch_exit(void)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
cpuhp_remove_state(rdt_online);
- rdtgroup_exit();
-
- if (r->mon_capable)
- rdt_put_mon_l3_config();
+ resctrl_exit();
}
-__exitcall(resctrl_exit);
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 536351159cc2..1189c0df4ad7 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -16,273 +16,15 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpu.h>
-#include <linux/kernfs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/tick.h>
#include "internal.h"
-/*
- * Check whether MBA bandwidth percentage value is correct. The value is
- * checked against the minimum and max bandwidth values specified by the
- * hardware. The allocated bandwidth percentage is rounded to the next
- * control step available on the hardware.
- */
-static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
-{
- int ret;
- u32 bw;
-
- /*
- * Only linear delay values is supported for current Intel SKUs.
- */
- if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
- rdt_last_cmd_puts("No support for non-linear MB domains\n");
- return false;
- }
-
- ret = kstrtou32(buf, 10, &bw);
- if (ret) {
- rdt_last_cmd_printf("Invalid MB value %s\n", buf);
- return false;
- }
-
- /* Nothing else to do if software controller is enabled. */
- if (is_mba_sc(r)) {
- *data = bw;
- return true;
- }
-
- if (bw < r->membw.min_bw || bw > r->default_ctrl) {
- rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
- bw, r->membw.min_bw, r->default_ctrl);
- return false;
- }
-
- *data = roundup(bw, (unsigned long)r->membw.bw_gran);
- return true;
-}
-
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d)
-{
- struct resctrl_staged_config *cfg;
- u32 closid = data->rdtgrp->closid;
- struct rdt_resource *r = s->res;
- u32 bw_val;
-
- cfg = &d->staged_config[s->conf_type];
- if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
- return -EINVAL;
- }
-
- if (!bw_validate(data->buf, &bw_val, r))
- return -EINVAL;
-
- if (is_mba_sc(r)) {
- d->mbps_val[closid] = bw_val;
- return 0;
- }
-
- cfg->new_ctrl = bw_val;
- cfg->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * Check whether a cache bit mask is valid.
- * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
- * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
- * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
- *
- * Haswell does not support a non-contiguous 1s value and additionally
- * requires at least two bits set.
- * AMD allows non-contiguous bitmasks.
- */
-static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
-{
- unsigned long first_bit, zero_bit, val;
- unsigned int cbm_len = r->cache.cbm_len;
- int ret;
-
- ret = kstrtoul(buf, 16, &val);
- if (ret) {
- rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
- return false;
- }
-
- if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
- rdt_last_cmd_puts("Mask out of range\n");
- return false;
- }
-
- first_bit = find_first_bit(&val, cbm_len);
- zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
- /* Are non-contiguous bitmasks allowed? */
- if (!r->cache.arch_has_sparse_bitmasks &&
- (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
- rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
- return false;
- }
-
- if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("Need at least %d bits in the mask\n",
- r->cache.min_cbm_bits);
- return false;
- }
-
- *data = val;
- return true;
-}
-
-/*
- * Read one cache bit mask (hex). Check that it is valid for the current
- * resource type.
- */
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d)
-{
- struct rdtgroup *rdtgrp = data->rdtgrp;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- u32 cbm_val;
-
- cfg = &d->staged_config[s->conf_type];
- if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
- return -EINVAL;
- }
-
- /*
- * Cannot set up more than one pseudo-locked region in a cache
- * hierarchy.
- */
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
- rdtgroup_pseudo_locked_in_hierarchy(d)) {
- rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
- return -EINVAL;
- }
-
- if (!cbm_validate(data->buf, &cbm_val, r))
- return -EINVAL;
-
- if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
- rdtgrp->mode == RDT_MODE_SHAREABLE) &&
- rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
- rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
- return -EINVAL;
- }
-
- /*
- * The CBM may not overlap with the CBM of another closid if
- * either is exclusive.
- */
- if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
- rdt_last_cmd_puts("Overlaps with exclusive group\n");
- return -EINVAL;
- }
-
- if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
- if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- rdt_last_cmd_puts("Overlaps with other group\n");
- return -EINVAL;
- }
- }
-
- cfg->new_ctrl = cbm_val;
- cfg->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * For each domain in this resource we expect to find a series of:
- * id=mask
- * separated by ";". The "id" is in decimal, and must match one of
- * the "id"s for this resource.
- */
-static int parse_line(char *line, struct resctrl_schema *s,
- struct rdtgroup *rdtgrp)
-{
- enum resctrl_conf_type t = s->conf_type;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- struct rdt_parse_data data;
- struct rdt_ctrl_domain *d;
- char *dom = NULL, *id;
- unsigned long dom_id;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
- (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
- rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
- return -EINVAL;
- }
-
-next:
- if (!line || line[0] == '\0')
- return 0;
- dom = strsep(&line, ";");
- id = strsep(&dom, "=");
- if (!dom || kstrtoul(id, 10, &dom_id)) {
- rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
- return -EINVAL;
- }
- dom = strim(dom);
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- if (d->hdr.id == dom_id) {
- data.buf = dom;
- data.rdtgrp = rdtgrp;
- if (r->parse_ctrlval(&data, s, d))
- return -EINVAL;
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- cfg = &d->staged_config[t];
- /*
- * In pseudo-locking setup mode and just
- * parsed a valid CBM that should be
- * pseudo-locked. Only one locked region per
- * resource group and domain so just do
- * the required initialization for single
- * region and return.
- */
- rdtgrp->plr->s = s;
- rdtgrp->plr->d = d;
- rdtgrp->plr->cbm = cfg->new_ctrl;
- d->plr = rdtgrp->plr;
- return 0;
- }
- goto next;
- }
- }
- return -EINVAL;
-}
-
-static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
-{
- switch (type) {
- default:
- case CDP_NONE:
- return closid;
- case CDP_CODE:
- return closid * 2 + 1;
- case CDP_DATA:
- return closid * 2;
- }
-}
-
int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- u32 idx = get_config_index(closid, t);
+ u32 idx = resctrl_get_config_index(closid, t);
struct msr_param msr_param;
if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
@@ -319,7 +61,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
if (!cfg->have_new_ctrl)
continue;
- idx = get_config_index(closid, t);
+ idx = resctrl_get_config_index(closid, t);
if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
hw_dom->ctrl_val[idx] = cfg->new_ctrl;
@@ -341,357 +83,11 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
return 0;
}
-static int rdtgroup_parse_resource(char *resname, char *tok,
- struct rdtgroup *rdtgrp)
-{
- struct resctrl_schema *s;
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
- return parse_line(tok, s, rdtgrp);
- }
- rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
- return -EINVAL;
-}
-
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct resctrl_schema *s;
- struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
- char *tok, *resname;
- int ret = 0;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
- buf[nbytes - 1] = '\0';
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
- rdt_last_cmd_clear();
-
- /*
- * No changes to pseudo-locked region allowed. It has to be removed
- * and re-created instead.
- */
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Resource group is pseudo-locked\n");
- goto out;
- }
-
- rdt_staged_configs_clear();
-
- while ((tok = strsep(&buf, "\n")) != NULL) {
- resname = strim(strsep(&tok, ":"));
- if (!tok) {
- rdt_last_cmd_puts("Missing ':'\n");
- ret = -EINVAL;
- goto out;
- }
- if (tok[0] == '\0') {
- rdt_last_cmd_printf("Missing '%s' value\n", resname);
- ret = -EINVAL;
- goto out;
- }
- ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
- if (ret)
- goto out;
- }
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
-
- /*
- * Writes to mba_sc resources update the software controller,
- * not the control MSR.
- */
- if (is_mba_sc(r))
- continue;
-
- ret = resctrl_arch_update_domains(r, rdtgrp->closid);
- if (ret)
- goto out;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- /*
- * If pseudo-locking fails we keep the resource group in
- * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
- * active and updated for just the domain the pseudo-locked
- * region was requested for.
- */
- ret = rdtgroup_pseudo_lock_create(rdtgrp);
- }
-
-out:
- rdt_staged_configs_clear();
- rdtgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
- u32 idx = get_config_index(closid, type);
+ u32 idx = resctrl_get_config_index(closid, type);
return hw_dom->ctrl_val[idx];
}
-
-static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
-{
- struct rdt_resource *r = schema->res;
- struct rdt_ctrl_domain *dom;
- bool sep = false;
- u32 ctrl_val;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
- if (sep)
- seq_puts(s, ";");
-
- if (is_mba_sc(r))
- ctrl_val = dom->mbps_val[closid];
- else
- ctrl_val = resctrl_arch_get_config(r, dom, closid,
- schema->conf_type);
-
- seq_printf(s, r->format_str, dom->hdr.id, max_data_width,
- ctrl_val);
- sep = true;
- }
- seq_puts(s, "\n");
-}
-
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct resctrl_schema *schema;
- struct rdtgroup *rdtgrp;
- int ret = 0;
- u32 closid;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- seq_printf(s, "%s:uninitialized\n", schema->name);
- }
- } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- seq_printf(s, "%s:%d=%x\n",
- rdtgrp->plr->s->res->name,
- rdtgrp->plr->d->hdr.id,
- rdtgrp->plr->cbm);
- }
- } else {
- closid = rdtgrp->closid;
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- if (closid < schema->num_closid)
- show_doms(s, schema, closid);
- }
- }
- } else {
- ret = -ENOENT;
- }
- rdtgroup_kn_unlock(of->kn);
- return ret;
-}
-
-static int smp_mon_event_count(void *arg)
-{
- mon_event_count(arg);
-
- return 0;
-}
-
-ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
- buf[nbytes - 1] = '\0';
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
- rdt_last_cmd_clear();
-
- if (!strcmp(buf, "mbm_local_bytes")) {
- if (is_mbm_local_enabled())
- rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
- else
- ret = -EINVAL;
- } else if (!strcmp(buf, "mbm_total_bytes")) {
- if (is_mbm_total_enabled())
- rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
- else
- ret = -EINVAL;
- } else {
- ret = -EINVAL;
- }
-
- if (ret)
- rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);
-
- rdtgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
-}
-
-int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
-
- if (rdtgrp) {
- switch (rdtgrp->mba_mbps_event) {
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- seq_puts(s, "mbm_local_bytes\n");
- break;
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- seq_puts(s, "mbm_total_bytes\n");
- break;
- default:
- pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
- ret = -EINVAL;
- break;
- }
- } else {
- ret = -ENOENT;
- }
-
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
- cpumask_t *cpumask, int evtid, int first)
-{
- int cpu;
-
- /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- /*
- * Setup the parameters to pass to mon_event_count() to read the data.
- */
- rr->rgrp = rdtgrp;
- rr->evtid = evtid;
- rr->r = r;
- rr->d = d;
- rr->first = first;
- rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
- if (IS_ERR(rr->arch_mon_ctx)) {
- rr->err = -EINVAL;
- return;
- }
-
- cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
-
- /*
- * cpumask_any_housekeeping() prefers housekeeping CPUs, but
- * are all the CPUs nohz_full? If yes, pick a CPU to IPI.
- * MPAM's resctrl_arch_rmid_read() is unable to read the
- * counters on some platforms if its called in IRQ context.
- */
- if (tick_nohz_full_cpu(cpu))
- smp_call_function_any(cpumask, mon_event_count, rr, 1);
- else
- smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
-
- resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
-}
-
-int rdtgroup_mondata_show(struct seq_file *m, void *arg)
-{
- struct kernfs_open_file *of = m->private;
- struct rdt_domain_hdr *hdr;
- struct rmid_read rr = {0};
- struct rdt_mon_domain *d;
- u32 resid, evtid, domid;
- struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
- union mon_data_bits md;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- ret = -ENOENT;
- goto out;
- }
-
- md.priv = of->kn->priv;
- resid = md.u.rid;
- domid = md.u.domid;
- evtid = md.u.evtid;
- r = &rdt_resources_all[resid].r_resctrl;
-
- if (md.u.sum) {
- /*
- * This file requires summing across all domains that share
- * the L3 cache id that was provided in the "domid" field of the
- * mon_data_bits union. Search all domains in the resource for
- * one that matches this cache id.
- */
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- if (d->ci->id == domid) {
- rr.ci = d->ci;
- mon_event_read(&rr, r, NULL, rdtgrp,
- &d->ci->shared_cpu_map, evtid, false);
- goto checkresult;
- }
- }
- ret = -ENOENT;
- goto out;
- } else {
- /*
- * This file provides data from a single domain. Search
- * the resource to find the domain with "domid".
- */
- hdr = rdt_find_domain(&r->mon_domains, domid, NULL);
- if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
- ret = -ENOENT;
- goto out;
- }
- d = container_of(hdr, struct rdt_mon_domain, hdr);
- mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
- }
-
-checkresult:
-
- if (rr.err == -EIO)
- seq_puts(m, "Error\n");
- else if (rr.err == -EINVAL)
- seq_puts(m, "Unavailable\n");
- else
- seq_printf(m, "%llu\n", rr.val);
-
-out:
- rdtgroup_kn_unlock(of->kn);
- return ret;
-}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 20c898f09b7e..5e3c41b36437 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -3,28 +3,21 @@
#define _ASM_X86_RESCTRL_INTERNAL_H
#include <linux/resctrl.h>
-#include <linux/sched.h>
-#include <linux/kernfs.h>
-#include <linux/fs_context.h>
-#include <linux/jump_label.h>
-#include <linux/tick.h>
-
-#include <asm/resctrl.h>
#define L3_QOS_CDP_ENABLE 0x01ULL
#define L2_QOS_CDP_ENABLE 0x01ULL
-#define CQM_LIMBOCHECK_INTERVAL 1000
-
#define MBM_CNTR_WIDTH_BASE 24
-#define MBM_OVERFLOW_INTERVAL 1000
-#define MAX_MBA_BW 100u
+
#define MBA_IS_LINEAR 0x4
+
#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
+
#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
/*
* With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
* data to be returned. The counter width is discovered from the hardware
@@ -32,343 +25,6 @@
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
-/* Reads to Local DRAM Memory */
-#define READS_TO_LOCAL_MEM BIT(0)
-
-/* Reads to Remote DRAM Memory */
-#define READS_TO_REMOTE_MEM BIT(1)
-
-/* Non-Temporal Writes to Local Memory */
-#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2)
-
-/* Non-Temporal Writes to Remote Memory */
-#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3)
-
-/* Reads to Local Memory the system identifies as "Slow Memory" */
-#define READS_TO_LOCAL_S_MEM BIT(4)
-
-/* Reads to Remote Memory the system identifies as "Slow Memory" */
-#define READS_TO_REMOTE_S_MEM BIT(5)
-
-/* Dirty Victims to All Types of Memory */
-#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6)
-
-/* Max event bits supported */
-#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
-
-/**
- * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
- * aren't marked nohz_full
- * @mask: The mask to pick a CPU from.
- * @exclude_cpu:The CPU to avoid picking.
- *
- * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
- * CPUs that don't use nohz_full, these are preferred. Pass
- * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
- *
- * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
- */
-static inline unsigned int
-cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
-{
- unsigned int cpu, hk_cpu;
-
- if (exclude_cpu == RESCTRL_PICK_ANY_CPU)
- cpu = cpumask_any(mask);
- else
- cpu = cpumask_any_but(mask, exclude_cpu);
-
- /* Only continue if tick_nohz_full_mask has been initialized. */
- if (!tick_nohz_full_enabled())
- return cpu;
-
- /* If the CPU picked isn't marked nohz_full nothing more needs doing. */
- if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu))
- return cpu;
-
- /* Try to find a CPU that isn't nohz_full to use in preference */
- hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
- if (hk_cpu == exclude_cpu)
- hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);
-
- if (hk_cpu < nr_cpu_ids)
- cpu = hk_cpu;
-
- return cpu;
-}
-
-struct rdt_fs_context {
- struct kernfs_fs_context kfc;
- bool enable_cdpl2;
- bool enable_cdpl3;
- bool enable_mba_mbps;
- bool enable_debug;
-};
-
-static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
-{
- struct kernfs_fs_context *kfc = fc->fs_private;
-
- return container_of(kfc, struct rdt_fs_context, kfc);
-}
-
-/**
- * struct mon_evt - Entry in the event list of a resource
- * @evtid: event id
- * @name: name of the event
- * @configurable: true if the event is configurable
- * @list: entry in &rdt_resource->evt_list
- */
-struct mon_evt {
- enum resctrl_event_id evtid;
- char *name;
- bool configurable;
- struct list_head list;
-};
-
-/**
- * union mon_data_bits - Monitoring details for each event file.
- * @priv: Used to store monitoring event data in @u
- * as kernfs private data.
- * @u.rid: Resource id associated with the event file.
- * @u.evtid: Event id associated with the event file.
- * @u.sum: Set when event must be summed across multiple
- * domains.
- * @u.domid: When @u.sum is zero this is the domain to which
- * the event file belongs. When @sum is one this
- * is the id of the L3 cache that all domains to be
- * summed share.
- * @u: Name of the bit fields struct.
- */
-union mon_data_bits {
- void *priv;
- struct {
- unsigned int rid : 10;
- enum resctrl_event_id evtid : 7;
- unsigned int sum : 1;
- unsigned int domid : 14;
- } u;
-};
-
-/**
- * struct rmid_read - Data passed across smp_call*() to read event count.
- * @rgrp: Resource group for which the counter is being read. If it is a parent
- * resource group then its event count is summed with the count from all
- * its child resource groups.
- * @r: Resource describing the properties of the event being read.
- * @d: Domain that the counter should be read from. If NULL then sum all
- * domains in @r sharing L3 @ci.id
- * @evtid: Which monitor event to read.
- * @first: Initialize MBM counter when true.
- * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
- * @err: Error encountered when reading counter.
- * @val: Returned value of event counter. If @rgrp is a parent resource group,
- * @val includes the sum of event counts from its child resource groups.
- * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id,
- * (summed across child resource groups if @rgrp is a parent resource group).
- * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only).
- */
-struct rmid_read {
- struct rdtgroup *rgrp;
- struct rdt_resource *r;
- struct rdt_mon_domain *d;
- enum resctrl_event_id evtid;
- bool first;
- struct cacheinfo *ci;
- int err;
- u64 val;
- void *arch_mon_ctx;
-};
-
-extern unsigned int rdt_mon_features;
-extern struct list_head resctrl_schema_all;
-extern bool resctrl_mounted;
-
-enum rdt_group_type {
- RDTCTRL_GROUP = 0,
- RDTMON_GROUP,
- RDT_NUM_GROUP,
-};
-
-/**
- * enum rdtgrp_mode - Mode of a RDT resource group
- * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
- * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
- * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
- * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
- * allowed AND the allocations are Cache Pseudo-Locked
- * @RDT_NUM_MODES: Total number of modes
- *
- * The mode of a resource group enables control over the allowed overlap
- * between allocations associated with different resource groups (classes
- * of service). User is able to modify the mode of a resource group by
- * writing to the "mode" resctrl file associated with the resource group.
- *
- * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
- * writing the appropriate text to the "mode" file. A resource group enters
- * "pseudo-locked" mode after the schemata is written while the resource
- * group is in "pseudo-locksetup" mode.
- */
-enum rdtgrp_mode {
- RDT_MODE_SHAREABLE = 0,
- RDT_MODE_EXCLUSIVE,
- RDT_MODE_PSEUDO_LOCKSETUP,
- RDT_MODE_PSEUDO_LOCKED,
-
- /* Must be last */
- RDT_NUM_MODES,
-};
-
-/**
- * struct mongroup - store mon group's data in resctrl fs.
- * @mon_data_kn: kernfs node for the mon_data directory
- * @parent: parent rdtgrp
- * @crdtgrp_list: child rdtgroup node list
- * @rmid: rmid for this rdtgroup
- */
-struct mongroup {
- struct kernfs_node *mon_data_kn;
- struct rdtgroup *parent;
- struct list_head crdtgrp_list;
- u32 rmid;
-};
-
-/**
- * struct pseudo_lock_region - pseudo-lock region information
- * @s: Resctrl schema for the resource to which this
- * pseudo-locked region belongs
- * @d: RDT domain to which this pseudo-locked region
- * belongs
- * @cbm: bitmask of the pseudo-locked region
- * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
- * completion
- * @thread_done: variable used by waitqueue to test if pseudo-locking
- * thread completed
- * @cpu: core associated with the cache on which the setup code
- * will be run
- * @line_size: size of the cache lines
- * @size: size of pseudo-locked region in bytes
- * @kmem: the kernel memory associated with pseudo-locked region
- * @minor: minor number of character device associated with this
- * region
- * @debugfs_dir: pointer to this region's directory in the debugfs
- * filesystem
- * @pm_reqs: Power management QoS requests related to this region
- */
-struct pseudo_lock_region {
- struct resctrl_schema *s;
- struct rdt_ctrl_domain *d;
- u32 cbm;
- wait_queue_head_t lock_thread_wq;
- int thread_done;
- int cpu;
- unsigned int line_size;
- unsigned int size;
- void *kmem;
- unsigned int minor;
- struct dentry *debugfs_dir;
- struct list_head pm_reqs;
-};
-
-/**
- * struct rdtgroup - store rdtgroup's data in resctrl file system.
- * @kn: kernfs node
- * @rdtgroup_list: linked list for all rdtgroups
- * @closid: closid for this rdtgroup
- * @cpu_mask: CPUs assigned to this rdtgroup
- * @flags: status bits
- * @waitcount: how many cpus expect to find this
- * group when they acquire rdtgroup_mutex
- * @type: indicates type of this rdtgroup - either
- * monitor only or ctrl_mon group
- * @mon: mongroup related data
- * @mode: mode of resource group
- * @mba_mbps_event: input monitoring event id when mba_sc is enabled
- * @plr: pseudo-locked region
- */
-struct rdtgroup {
- struct kernfs_node *kn;
- struct list_head rdtgroup_list;
- u32 closid;
- struct cpumask cpu_mask;
- int flags;
- atomic_t waitcount;
- enum rdt_group_type type;
- struct mongroup mon;
- enum rdtgrp_mode mode;
- enum resctrl_event_id mba_mbps_event;
- struct pseudo_lock_region *plr;
-};
-
-/* rdtgroup.flags */
-#define RDT_DELETED 1
-
-/* rftype.flags */
-#define RFTYPE_FLAGS_CPUS_LIST 1
-
-/*
- * Define the file type flags for base and info directories.
- */
-#define RFTYPE_INFO BIT(0)
-#define RFTYPE_BASE BIT(1)
-#define RFTYPE_CTRL BIT(4)
-#define RFTYPE_MON BIT(5)
-#define RFTYPE_TOP BIT(6)
-#define RFTYPE_RES_CACHE BIT(8)
-#define RFTYPE_RES_MB BIT(9)
-#define RFTYPE_DEBUG BIT(10)
-#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
-#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
-#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
-#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
-#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON)
-
-/* List of all resource groups */
-extern struct list_head rdt_all_groups;
-
-extern int max_name_width, max_data_width;
-
-int __init rdtgroup_init(void);
-void __exit rdtgroup_exit(void);
-
-/**
- * struct rftype - describe each file in the resctrl file system
- * @name: File name
- * @mode: Access mode
- * @kf_ops: File operations
- * @flags: File specific RFTYPE_FLAGS_* flags
- * @fflags: File specific RFTYPE_* flags
- * @seq_show: Show content of the file
- * @write: Write to the file
- */
-struct rftype {
- char *name;
- umode_t mode;
- const struct kernfs_ops *kf_ops;
- unsigned long flags;
- unsigned long fflags;
-
- int (*seq_show)(struct kernfs_open_file *of,
- struct seq_file *sf, void *v);
- /*
- * write() is the generic write callback which maps directly to
- * kernfs write operation and overrides all other operations.
- * Maximum write size is determined by ->max_write_len.
- */
- ssize_t (*write)(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-};
-
-/**
- * struct mbm_state - status for each MBM counter in each domain
- * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
- * @prev_bw: The most recent bandwidth in MBps
- */
-struct mbm_state {
- u64 prev_bw_bytes;
- u32 prev_bw;
-};
-
/**
* struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
* return value.
@@ -433,37 +89,6 @@ struct msr_param {
u32 high;
};
-static inline bool is_llc_occupancy_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
-}
-
-static inline bool is_mbm_total_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
-}
-
-static inline bool is_mbm_local_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
-}
-
-static inline bool is_mbm_enabled(void)
-{
- return (is_mbm_total_enabled() || is_mbm_local_enabled());
-}
-
-static inline bool is_mbm_event(int e)
-{
- return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
- e <= QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-struct rdt_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
/**
* struct rdt_hw_resource - arch private attributes of a resctrl resource
* @r_resctrl: Attributes of the resource used directly by resctrl.
@@ -476,8 +101,6 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
- * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
- * Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource
*
* Members of this structure are either private to the architecture
@@ -491,7 +114,6 @@ struct rdt_hw_resource {
void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
- unsigned int mbm_cfg_mask;
bool cdp_enabled;
};
@@ -500,66 +122,10 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
return container_of(r, struct rdt_hw_resource, r_resctrl);
}
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d);
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_ctrl_domain *d);
-
-extern struct mutex rdtgroup_mutex;
-
extern struct rdt_hw_resource rdt_resources_all[];
-extern struct rdtgroup rdtgroup_default;
-extern struct dentry *debugfs_resctrl;
-extern enum resctrl_event_id mba_mbps_default_event;
-
-enum resctrl_res_level {
- RDT_RESOURCE_L3,
- RDT_RESOURCE_L2,
- RDT_RESOURCE_MBA,
- RDT_RESOURCE_SMBA,
-
- /* Must be the last */
- RDT_NUM_RESOURCES,
-};
-
-static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res)
-{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res);
-
- hw_res++;
- return &hw_res->r_resctrl;
-}
-
-static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
-{
- return rdt_resources_all[l].cdp_enabled;
-}
-
-int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
-/*
- * To return the common struct rdt_resource, which is contained in struct
- * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource.
- */
-#define for_each_rdt_resource(r) \
- for (r = &rdt_resources_all[0].r_resctrl; \
- r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \
- r = resctrl_inc(r))
-
-#define for_each_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable || r->mon_capable)
-
-#define for_each_alloc_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable)
-
-#define for_each_mon_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->mon_capable)
-
/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
union cpuid_0x10_1_eax {
struct {
@@ -593,69 +159,14 @@ union cpuid_0x10_x_edx {
unsigned int full;
};
-void rdt_last_cmd_clear(void);
-void rdt_last_cmd_puts(const char *s);
-__printf(1, 2)
-void rdt_last_cmd_printf(const char *fmt, ...);
-
void rdt_ctrl_update(void *arg);
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
-void rdtgroup_kn_unlock(struct kernfs_node *kn);
-int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
-int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
- umode_t mask);
-struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
- struct list_head **pos);
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v);
-ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v);
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
- unsigned long cbm, int closid, bool exclusive);
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
- unsigned long cbm);
-enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
-int rdtgroup_tasks_assigned(struct rdtgroup *r);
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
-int rdt_pseudo_lock_init(void);
-void rdt_pseudo_lock_release(void);
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r);
-struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r);
-int closids_supported(void);
-void closid_free(int closid);
-int alloc_rmid(u32 closid);
-void free_rmid(u32 closid, u32 rmid);
+
int rdt_get_mon_l3_config(struct rdt_resource *r);
-void __exit rdt_put_mon_l3_config(void);
-bool __init rdt_cpu_has(int flag);
-void mon_event_count(void *info);
-int rdtgroup_mondata_show(struct seq_file *m, void *arg);
-void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
- cpumask_t *cpumask, int evtid, int first);
-void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
- unsigned long delay_ms,
- int exclude_cpu);
-void mbm_handle_overflow(struct work_struct *work);
+
+bool rdt_cpu_has(int flag);
+
void __init intel_rdt_mbm_apply_quirk(void);
-bool is_mba_sc(struct rdt_resource *r);
-void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
- int exclude_cpu);
-void cqm_handle_limbo(struct work_struct *work);
-bool has_busy_rmid(struct rdt_mon_domain *d);
-void __check_limbo(struct rdt_mon_domain *d, bool force_free);
+
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
-void resctrl_file_fflags_init(const char *config, unsigned long fflags);
-void rdt_staged_configs_clear(void);
-bool closid_allocated(unsigned int closid);
-int resctrl_find_cleanest_closid(void);
+
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 94a1d9780461..c261558276cd 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -18,62 +18,12 @@
#define pr_fmt(fmt) "resctrl: " fmt
#include <linux/cpu.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
+#include <linux/resctrl.h>
#include <asm/cpu_device_id.h>
-#include <asm/resctrl.h>
+#include <asm/msr.h>
#include "internal.h"
-#include "trace.h"
-
-/**
- * struct rmid_entry - dirty tracking for all RMID.
- * @closid: The CLOSID for this entry.
- * @rmid: The RMID for this entry.
- * @busy: The number of domains with cached data using this RMID.
- * @list: Member of the rmid_free_lru list when busy == 0.
- *
- * Depending on the architecture the correct monitor is accessed using
- * both @closid and @rmid, or @rmid only.
- *
- * Take the rdtgroup_mutex when accessing.
- */
-struct rmid_entry {
- u32 closid;
- u32 rmid;
- int busy;
- struct list_head list;
-};
-
-/*
- * @rmid_free_lru - A least recently used list of free RMIDs
- * These RMIDs are guaranteed to have an occupancy less than the
- * threshold occupancy
- */
-static LIST_HEAD(rmid_free_lru);
-
-/*
- * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has.
- * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
- * Indexed by CLOSID. Protected by rdtgroup_mutex.
- */
-static u32 *closid_num_dirty_rmid;
-
-/*
- * @rmid_limbo_count - count of currently unused but (potentially)
- * dirty RMIDs.
- * This counts RMIDs that no one is currently using but that
- * may have a occupancy value > resctrl_rmid_realloc_threshold. User can
- * change the threshold occupancy value.
- */
-static unsigned int rmid_limbo_count;
-
-/*
- * @rmid_entry - The entry in the limbo and free lists.
- */
-static struct rmid_entry *rmid_ptrs;
/*
* Global boolean for rdt_monitor which is true if any
@@ -86,23 +36,12 @@ bool rdt_mon_capable;
*/
unsigned int rdt_mon_features;
-/*
- * This is the threshold cache occupancy in bytes at which we will consider an
- * RMID available for re-allocation.
- */
-unsigned int resctrl_rmid_realloc_threshold;
-
-/*
- * This is the maximum value for the reallocation threshold, in bytes.
- */
-unsigned int resctrl_rmid_realloc_limit;
-
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
static int snc_nodes_per_l3_cache = 1;
/*
- * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
+ * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
* If rmid > rmid threshold, MBM total and local values should be multiplied
* by the correction factor.
*
@@ -151,6 +90,7 @@ static const struct mbm_correction_factor_table {
};
static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+
static u64 mbm_cf __read_mostly;
static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
@@ -163,33 +103,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
}
/*
- * x86 and arm64 differ in their handling of monitoring.
- * x86's RMID are independent numbers, there is only one source of traffic
- * with an RMID value of '1'.
- * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
- * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
- * value is no longer unique.
- * To account for this, resctrl uses an index. On x86 this is just the RMID,
- * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
- *
- * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
- * must accept an attempt to read every index.
- */
-static inline struct rmid_entry *__rmid_entry(u32 idx)
-{
- struct rmid_entry *entry;
- u32 closid, rmid;
-
- entry = &rmid_ptrs[idx];
- resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
-
- WARN_ON_ONCE(entry->closid != closid);
- WARN_ON_ONCE(entry->rmid != rmid);
-
- return entry;
-}
-
-/*
* When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
* "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
* needed. The physical RMID is the same as the logical RMID.
@@ -238,7 +151,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
* are error bits.
*/
wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
- rdmsrl(MSR_IA32_QM_CTR, msr_val);
+ rdmsrq(MSR_IA32_QM_CTR, msr_val);
if (msr_val & RMID_VAL_ERROR)
return -EIO;
@@ -260,12 +173,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do
return &hw_dom->arch_mbm_total[rmid];
case QOS_L3_MBM_LOCAL_EVENT_ID:
return &hw_dom->arch_mbm_local[rmid];
+ default:
+ /* Never expect to get here */
+ WARN_ON_ONCE(1);
+ return NULL;
}
-
- /* Never expect to get here */
- WARN_ON_ONCE(1);
-
- return NULL;
}
void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
@@ -295,11 +207,11 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *
{
struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
memset(hw_dom->arch_mbm_total, 0,
sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
memset(hw_dom->arch_mbm_local, 0,
sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}
@@ -346,749 +258,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
return 0;
}
-static void limbo_release_entry(struct rmid_entry *entry)
-{
- lockdep_assert_held(&rdtgroup_mutex);
-
- rmid_limbo_count--;
- list_add_tail(&entry->list, &rmid_free_lru);
-
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
- closid_num_dirty_rmid[entry->closid]--;
-}
-
-/*
- * Check the RMIDs that are marked as busy for this domain. If the
- * reported LLC occupancy is below the threshold clear the busy bit and
- * decrement the count. If the busy count gets to zero on an RMID, we
- * free the RMID
- */
-void __check_limbo(struct rdt_mon_domain *d, bool force_free)
-{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- u32 idx_limit = resctrl_arch_system_num_rmid_idx();
- struct rmid_entry *entry;
- u32 idx, cur_idx = 1;
- void *arch_mon_ctx;
- bool rmid_dirty;
- u64 val = 0;
-
- arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
- if (IS_ERR(arch_mon_ctx)) {
- pr_warn_ratelimited("Failed to allocate monitor context: %ld",
- PTR_ERR(arch_mon_ctx));
- return;
- }
-
- /*
- * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
- * are marked as busy for occupancy < threshold. If the occupancy
- * is less than the threshold decrement the busy counter of the
- * RMID and move it to the free list when the counter reaches 0.
- */
- for (;;) {
- idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
- if (idx >= idx_limit)
- break;
-
- entry = __rmid_entry(idx);
- if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
- QOS_L3_OCCUP_EVENT_ID, &val,
- arch_mon_ctx)) {
- rmid_dirty = true;
- } else {
- rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
-
- /*
- * x86's CLOSID and RMID are independent numbers, so the entry's
- * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
- * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
- * used to select the configuration. It is thus necessary to track both
- * CLOSID and RMID because there may be dependencies between them
- * on some architectures.
- */
- trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
- }
-
- if (force_free || !rmid_dirty) {
- clear_bit(idx, d->rmid_busy_llc);
- if (!--entry->busy)
- limbo_release_entry(entry);
- }
- cur_idx = idx + 1;
- }
-
- resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
-}
-
-bool has_busy_rmid(struct rdt_mon_domain *d)
-{
- u32 idx_limit = resctrl_arch_system_num_rmid_idx();
-
- return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
-}
-
-static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
-{
- struct rmid_entry *itr;
- u32 itr_idx, cmp_idx;
-
- if (list_empty(&rmid_free_lru))
- return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
-
- list_for_each_entry(itr, &rmid_free_lru, list) {
- /*
- * Get the index of this free RMID, and the index it would need
- * to be if it were used with this CLOSID.
- * If the CLOSID is irrelevant on this architecture, the two
- * index values are always the same on every entry and thus the
- * very first entry will be returned.
- */
- itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
- cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
-
- if (itr_idx == cmp_idx)
- return itr;
- }
-
- return ERR_PTR(-ENOSPC);
-}
-
-/**
- * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
- * RMID are clean, or the CLOSID that has
- * the most clean RMID.
- *
- * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
- * may not be able to allocate clean RMID. To avoid this the allocator will
- * choose the CLOSID with the most clean RMID.
- *
- * When the CLOSID and RMID are independent numbers, the first free CLOSID will
- * be returned.
- */
-int resctrl_find_cleanest_closid(void)
-{
- u32 cleanest_closid = ~0;
- int i = 0;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
- return -EIO;
-
- for (i = 0; i < closids_supported(); i++) {
- int num_dirty;
-
- if (closid_allocated(i))
- continue;
-
- num_dirty = closid_num_dirty_rmid[i];
- if (num_dirty == 0)
- return i;
-
- if (cleanest_closid == ~0)
- cleanest_closid = i;
-
- if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
- cleanest_closid = i;
- }
-
- if (cleanest_closid == ~0)
- return -ENOSPC;
-
- return cleanest_closid;
-}
-
-/*
- * For MPAM the RMID value is not unique, and has to be considered with
- * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
- * allows all domains to be managed by a single free list.
- * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
- */
-int alloc_rmid(u32 closid)
-{
- struct rmid_entry *entry;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- entry = resctrl_find_free_rmid(closid);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
-
- list_del(&entry->list);
- return entry->rmid;
-}
-
-static void add_rmid_to_limbo(struct rmid_entry *entry)
-{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- struct rdt_mon_domain *d;
- u32 idx;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
-
- entry->busy = 0;
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- /*
- * For the first limbo RMID in the domain,
- * setup up the limbo worker.
- */
- if (!has_busy_rmid(d))
- cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
- RESCTRL_PICK_ANY_CPU);
- set_bit(idx, d->rmid_busy_llc);
- entry->busy++;
- }
-
- rmid_limbo_count++;
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
- closid_num_dirty_rmid[entry->closid]++;
-}
-
-void free_rmid(u32 closid, u32 rmid)
-{
- u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
- struct rmid_entry *entry;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- /*
- * Do not allow the default rmid to be free'd. Comparing by index
- * allows architectures that ignore the closid parameter to avoid an
- * unnecessary check.
- */
- if (!resctrl_arch_mon_capable() ||
- idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
- RESCTRL_RESERVED_RMID))
- return;
-
- entry = __rmid_entry(idx);
-
- if (is_llc_occupancy_enabled())
- add_rmid_to_limbo(entry);
- else
- list_add_tail(&entry->list, &rmid_free_lru);
-}
-
-static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
- u32 rmid, enum resctrl_event_id evtid)
-{
- u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
-
- switch (evtid) {
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- return &d->mbm_total[idx];
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- return &d->mbm_local[idx];
- default:
- return NULL;
- }
-}
-
-static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
-{
- int cpu = smp_processor_id();
- struct rdt_mon_domain *d;
- struct mbm_state *m;
- int err, ret;
- u64 tval = 0;
-
- if (rr->first) {
- resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
- m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
- if (m)
- memset(m, 0, sizeof(struct mbm_state));
- return 0;
- }
-
- if (rr->d) {
- /* Reading a single domain, must be on a CPU in that domain. */
- if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
- return -EINVAL;
- rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
- rr->evtid, &tval, rr->arch_mon_ctx);
- if (rr->err)
- return rr->err;
-
- rr->val += tval;
-
- return 0;
- }
-
- /* Summing domains that share a cache, must be on a CPU for that cache. */
- if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
- return -EINVAL;
-
- /*
- * Legacy files must report the sum of an event across all
- * domains that share the same L3 cache instance.
- * Report success if a read from any domain succeeds, -EINVAL
- * (translated to "Unavailable" for user space) if reading from
- * all domains fail for any reason.
- */
- ret = -EINVAL;
- list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci->id != rr->ci->id)
- continue;
- err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
- rr->evtid, &tval, rr->arch_mon_ctx);
- if (!err) {
- rr->val += tval;
- ret = 0;
- }
- }
-
- if (ret)
- rr->err = ret;
-
- return ret;
-}
-
-/*
- * mbm_bw_count() - Update bw count from values previously read by
- * __mon_event_count().
- * @closid: The closid used to identify the cached mbm_state.
- * @rmid: The rmid used to identify the cached mbm_state.
- * @rr: The struct rmid_read populated by __mon_event_count().
- *
- * Supporting function to calculate the memory bandwidth
- * and delta bandwidth in MBps. The chunks value previously read by
- * __mon_event_count() is compared with the chunks value from the previous
- * invocation. This must be called once per second to maintain values in MBps.
- */
-static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
-{
- u64 cur_bw, bytes, cur_bytes;
- struct mbm_state *m;
-
- m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
- if (WARN_ON_ONCE(!m))
- return;
-
- cur_bytes = rr->val;
- bytes = cur_bytes - m->prev_bw_bytes;
- m->prev_bw_bytes = cur_bytes;
-
- cur_bw = bytes / SZ_1M;
-
- m->prev_bw = cur_bw;
-}
-
-/*
- * This is scheduled by mon_event_read() to read the CQM/MBM counters
- * on a domain.
- */
-void mon_event_count(void *info)
-{
- struct rdtgroup *rdtgrp, *entry;
- struct rmid_read *rr = info;
- struct list_head *head;
- int ret;
-
- rdtgrp = rr->rgrp;
-
- ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
-
- /*
- * For Ctrl groups read data from child monitor groups and
- * add them together. Count events which are read successfully.
- * Discard the rmid_read's reporting errors.
- */
- head = &rdtgrp->mon.crdtgrp_list;
-
- if (rdtgrp->type == RDTCTRL_GROUP) {
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->closid, entry->mon.rmid,
- rr) == 0)
- ret = 0;
- }
- }
-
- /*
- * __mon_event_count() calls for newly created monitor groups may
- * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
- * Discard error if any of the monitor event reads succeeded.
- */
- if (ret == 0)
- rr->err = 0;
-}
-
-/*
- * Feedback loop for MBA software controller (mba_sc)
- *
- * mba_sc is a feedback loop where we periodically read MBM counters and
- * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
- * that:
- *
- * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
- *
- * This uses the MBM counters to measure the bandwidth and MBA throttle
- * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
- * fact that resctrl rdtgroups have both monitoring and control.
- *
- * The frequency of the checks is 1s and we just tag along the MBM overflow
- * timer. Having 1s interval makes the calculation of bandwidth simpler.
- *
- * Although MBA's goal is to restrict the bandwidth to a maximum, there may
- * be a need to increase the bandwidth to avoid unnecessarily restricting
- * the L2 <-> L3 traffic.
- *
- * Since MBA controls the L2 external bandwidth where as MBM measures the
- * L3 external bandwidth the following sequence could lead to such a
- * situation.
- *
- * Consider an rdtgroup which had high L3 <-> memory traffic in initial
- * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
- * after some time rdtgroup has mostly L2 <-> L3 traffic.
- *
- * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
- * throttle MSRs already have low percentage values. To avoid
- * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
- */
-static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
-{
- u32 closid, rmid, cur_msr_val, new_msr_val;
- struct mbm_state *pmbm_data, *cmbm_data;
- struct rdt_ctrl_domain *dom_mba;
- enum resctrl_event_id evt_id;
- struct rdt_resource *r_mba;
- struct list_head *head;
- struct rdtgroup *entry;
- u32 cur_bw, user_bw;
-
- r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
- evt_id = rgrp->mba_mbps_event;
-
- closid = rgrp->closid;
- rmid = rgrp->mon.rmid;
- pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
- if (WARN_ON_ONCE(!pmbm_data))
- return;
-
- dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
- if (!dom_mba) {
- pr_warn_once("Failure to get domain for MBA update\n");
- return;
- }
-
- cur_bw = pmbm_data->prev_bw;
- user_bw = dom_mba->mbps_val[closid];
-
- /* MBA resource doesn't support CDP */
- cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
-
- /*
- * For Ctrl groups read data from child monitor groups.
- */
- head = &rgrp->mon.crdtgrp_list;
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
- if (WARN_ON_ONCE(!cmbm_data))
- return;
- cur_bw += cmbm_data->prev_bw;
- }
-
- /*
- * Scale up/down the bandwidth linearly for the ctrl group. The
- * bandwidth step is the bandwidth granularity specified by the
- * hardware.
- * Always increase throttling if current bandwidth is above the
- * target set by user.
- * But avoid thrashing up and down on every poll by checking
- * whether a decrease in throttling is likely to push the group
- * back over target. E.g. if currently throttling to 30% of bandwidth
- * on a system with 10% granularity steps, check whether moving to
- * 40% would go past the limit by multiplying current bandwidth by
- * "(30 + 10) / 30".
- */
- if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
- new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
- } else if (cur_msr_val < MAX_MBA_BW &&
- (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
- new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
- } else {
- return;
- }
-
- resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
-}
-
-static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
- u32 closid, u32 rmid, enum resctrl_event_id evtid)
-{
- struct rmid_read rr = {0};
-
- rr.r = r;
- rr.d = d;
- rr.evtid = evtid;
- rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
- if (IS_ERR(rr.arch_mon_ctx)) {
- pr_warn_ratelimited("Failed to allocate monitor context: %ld",
- PTR_ERR(rr.arch_mon_ctx));
- return;
- }
-
- __mon_event_count(closid, rmid, &rr);
-
- /*
- * If the software controller is enabled, compute the
- * bandwidth for this event id.
- */
- if (is_mba_sc(NULL))
- mbm_bw_count(closid, rmid, &rr);
-
- resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
-}
-
-static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
- u32 closid, u32 rmid)
-{
- /*
- * This is protected from concurrent reads from user as both
- * the user and overflow handler hold the global mutex.
- */
- if (is_mbm_total_enabled())
- mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
-
- if (is_mbm_local_enabled())
- mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-/*
- * Handler to scan the limbo list and move the RMIDs
- * to free list whose occupancy < threshold_occupancy.
- */
-void cqm_handle_limbo(struct work_struct *work)
-{
- unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
- struct rdt_mon_domain *d;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
-
- __check_limbo(d, false);
-
- if (has_busy_rmid(d)) {
- d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
- RESCTRL_PICK_ANY_CPU);
- schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
- delay);
- }
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-}
-
-/**
- * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
- * domain.
- * @dom: The domain the limbo handler should run for.
- * @delay_ms: How far in the future the handler should run.
- * @exclude_cpu: Which CPU the handler should not run on,
- * RESCTRL_PICK_ANY_CPU to pick any CPU.
- */
-void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
- int exclude_cpu)
-{
- unsigned long delay = msecs_to_jiffies(delay_ms);
- int cpu;
-
- cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
- dom->cqm_work_cpu = cpu;
-
- if (cpu < nr_cpu_ids)
- schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
-}
-
-void mbm_handle_overflow(struct work_struct *work)
-{
- unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
- struct rdtgroup *prgrp, *crgrp;
- struct rdt_mon_domain *d;
- struct list_head *head;
- struct rdt_resource *r;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- /*
- * If the filesystem has been unmounted this work no longer needs to
- * run.
- */
- if (!resctrl_mounted || !resctrl_arch_mon_capable())
- goto out_unlock;
-
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- d = container_of(work, struct rdt_mon_domain, mbm_over.work);
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
-
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
-
- if (is_mba_sc(NULL))
- update_mba_bw(prgrp, d);
- }
-
- /*
- * Re-check for housekeeping CPUs. This allows the overflow handler to
- * move off a nohz_full CPU quickly.
- */
- d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
- RESCTRL_PICK_ANY_CPU);
- schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-}
-
-/**
- * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
- * domain.
- * @dom: The domain the overflow handler should run for.
- * @delay_ms: How far in the future the handler should run.
- * @exclude_cpu: Which CPU the handler should not run on,
- * RESCTRL_PICK_ANY_CPU to pick any CPU.
- */
-void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
- int exclude_cpu)
-{
- unsigned long delay = msecs_to_jiffies(delay_ms);
- int cpu;
-
- /*
- * When a domain comes online there is no guarantee the filesystem is
- * mounted. If not, there is no need to catch counter overflow.
- */
- if (!resctrl_mounted || !resctrl_arch_mon_capable())
- return;
- cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
- dom->mbm_work_cpu = cpu;
-
- if (cpu < nr_cpu_ids)
- schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
-}
-
-static int dom_data_init(struct rdt_resource *r)
-{
- u32 idx_limit = resctrl_arch_system_num_rmid_idx();
- u32 num_closid = resctrl_arch_get_num_closid(r);
- struct rmid_entry *entry = NULL;
- int err = 0, i;
- u32 idx;
-
- mutex_lock(&rdtgroup_mutex);
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
- u32 *tmp;
-
- /*
- * If the architecture hasn't provided a sanitised value here,
- * this may result in larger arrays than necessary. Resctrl will
- * use a smaller system wide value based on the resources in
- * use.
- */
- tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
- if (!tmp) {
- err = -ENOMEM;
- goto out_unlock;
- }
-
- closid_num_dirty_rmid = tmp;
- }
-
- rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
- if (!rmid_ptrs) {
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
- kfree(closid_num_dirty_rmid);
- closid_num_dirty_rmid = NULL;
- }
- err = -ENOMEM;
- goto out_unlock;
- }
-
- for (i = 0; i < idx_limit; i++) {
- entry = &rmid_ptrs[i];
- INIT_LIST_HEAD(&entry->list);
-
- resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
- list_add_tail(&entry->list, &rmid_free_lru);
- }
-
- /*
- * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
- * are always allocated. These are used for the rdtgroup_default
- * control group, which will be setup later in rdtgroup_init().
- */
- idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
- RESCTRL_RESERVED_RMID);
- entry = __rmid_entry(idx);
- list_del(&entry->list);
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
-
- return err;
-}
-
-static void __exit dom_data_exit(void)
-{
- mutex_lock(&rdtgroup_mutex);
-
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
- kfree(closid_num_dirty_rmid);
- closid_num_dirty_rmid = NULL;
- }
-
- kfree(rmid_ptrs);
- rmid_ptrs = NULL;
-
- mutex_unlock(&rdtgroup_mutex);
-}
-
-static struct mon_evt llc_occupancy_event = {
- .name = "llc_occupancy",
- .evtid = QOS_L3_OCCUP_EVENT_ID,
-};
-
-static struct mon_evt mbm_total_event = {
- .name = "mbm_total_bytes",
- .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
-};
-
-static struct mon_evt mbm_local_event = {
- .name = "mbm_local_bytes",
- .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
-};
-
-/*
- * Initialize the event list for the resource.
- *
- * Note that MBM events are also part of RDT_RESOURCE_L3 resource
- * because as per the SDM the total and local memory bandwidth
- * are enumerated as part of L3 monitoring.
- */
-static void l3_mon_evt_init(struct rdt_resource *r)
-{
- INIT_LIST_HEAD(&r->evt_list);
-
- if (is_llc_occupancy_enabled())
- list_add_tail(&llc_occupancy_event.list, &r->evt_list);
- if (is_mbm_total_enabled())
- list_add_tail(&mbm_total_event.list, &r->evt_list);
- if (is_mbm_local_enabled())
- list_add_tail(&mbm_local_event.list, &r->evt_list);
-}
-
/*
* The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
* which indicates that RMIDs are configured in legacy mode.
@@ -1177,7 +346,6 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
unsigned int threshold;
- int ret;
snc_nodes_per_l3_cache = snc_get_config();
@@ -1207,41 +375,19 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
*/
resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
- ret = dom_data_init(r);
- if (ret)
- return ret;
-
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
u32 eax, ebx, ecx, edx;
/* Detect list of bandwidth sources that can be tracked */
cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
- hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
-
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
- mbm_total_event.configurable = true;
- resctrl_file_fflags_init("mbm_total_bytes_config",
- RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
- }
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
- mbm_local_event.configurable = true;
- resctrl_file_fflags_init("mbm_local_bytes_config",
- RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
- }
+ r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
}
- l3_mon_evt_init(r);
-
r->mon_capable = true;
return 0;
}
-void __exit rdt_put_mon_l3_config(void)
-{
- dom_data_exit();
-}
-
void __init intel_rdt_mbm_apply_quirk(void)
{
int cf_index;
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 42cc162f7fc9..de580eca3363 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -11,26 +11,22 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cacheflush.h>
#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/debugfs.h>
-#include <linux/kthread.h>
-#include <linux/mman.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/resctrl.h>
-#include <asm/cacheflush.h>
#include <asm/cpu_device_id.h>
-#include <asm/resctrl.h>
#include <asm/perf_event.h>
+#include <asm/msr.h>
#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"
#define CREATE_TRACE_POINTS
-#include "trace.h"
+
+#include "pseudo_lock_trace.h"
/*
* The bits needed to disable hardware prefetching varies based on the
@@ -38,30 +34,9 @@
*/
static u64 prefetch_disable_bits;
-/*
- * Major number assigned to and shared by all devices exposing
- * pseudo-locked regions.
- */
-static unsigned int pseudo_lock_major;
-static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
-
-static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
-{
- const struct rdtgroup *rdtgrp;
-
- rdtgrp = dev_get_drvdata(dev);
- if (mode)
- *mode = 0600;
- return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
-}
-
-static const struct class pseudo_lock_class = {
- .name = "pseudo_lock",
- .devnode = pseudo_lock_devnode,
-};
-
/**
- * get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
* @void: It takes no parameters.
*
* Capture the list of platforms that have been validated to support
@@ -75,14 +50,16 @@ static const struct class pseudo_lock_class = {
* in the SDM.
*
* When adding a platform here also add support for its cache events to
- * measure_cycles_perf_fn()
+ * resctrl_arch_measure_l*_residency()
*
* Return:
* If platform is supported, the bits to disable hardware prefetchers, 0
* if platform is not supported.
*/
-static u64 get_prefetch_disable_bits(void)
+u64 resctrl_arch_get_prefetch_disable_bits(void)
{
+ prefetch_disable_bits = 0;
+
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
boot_cpu_data.x86 != 6)
return 0;
@@ -98,7 +75,8 @@ static u64 get_prefetch_disable_bits(void)
* 3 DCU IP Prefetcher Disable (R/W)
* 63:4 Reserved
*/
- return 0xF;
+ prefetch_disable_bits = 0xF;
+ break;
case INTEL_ATOM_GOLDMONT:
case INTEL_ATOM_GOLDMONT_PLUS:
/*
@@ -109,307 +87,16 @@ static u64 get_prefetch_disable_bits(void)
* 2 DCU Hardware Prefetcher Disable (R/W)
* 63:3 Reserved
*/
- return 0x5;
- }
-
- return 0;
-}
-
-/**
- * pseudo_lock_minor_get - Obtain available minor number
- * @minor: Pointer to where new minor number will be stored
- *
- * A bitmask is used to track available minor numbers. Here the next free
- * minor number is marked as unavailable and returned.
- *
- * Return: 0 on success, <0 on failure.
- */
-static int pseudo_lock_minor_get(unsigned int *minor)
-{
- unsigned long first_bit;
-
- first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
-
- if (first_bit == MINORBITS)
- return -ENOSPC;
-
- __clear_bit(first_bit, &pseudo_lock_minor_avail);
- *minor = first_bit;
-
- return 0;
-}
-
-/**
- * pseudo_lock_minor_release - Return minor number to available
- * @minor: The minor number made available
- */
-static void pseudo_lock_minor_release(unsigned int minor)
-{
- __set_bit(minor, &pseudo_lock_minor_avail);
-}
-
-/**
- * region_find_by_minor - Locate a pseudo-lock region by inode minor number
- * @minor: The minor number of the device representing pseudo-locked region
- *
- * When the character device is accessed we need to determine which
- * pseudo-locked region it belongs to. This is done by matching the minor
- * number of the device to the pseudo-locked region it belongs.
- *
- * Minor numbers are assigned at the time a pseudo-locked region is associated
- * with a cache instance.
- *
- * Return: On success return pointer to resource group owning the pseudo-locked
- * region, NULL on failure.
- */
-static struct rdtgroup *region_find_by_minor(unsigned int minor)
-{
- struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
-
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
- rdtgrp_match = rdtgrp;
- break;
- }
- }
- return rdtgrp_match;
-}
-
-/**
- * struct pseudo_lock_pm_req - A power management QoS request list entry
- * @list: Entry within the @pm_reqs list for a pseudo-locked region
- * @req: PM QoS request
- */
-struct pseudo_lock_pm_req {
- struct list_head list;
- struct dev_pm_qos_request req;
-};
-
-static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
-{
- struct pseudo_lock_pm_req *pm_req, *next;
-
- list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
- dev_pm_qos_remove_request(&pm_req->req);
- list_del(&pm_req->list);
- kfree(pm_req);
- }
-}
-
-/**
- * pseudo_lock_cstates_constrain - Restrict cores from entering C6
- * @plr: Pseudo-locked region
- *
- * To prevent the cache from being affected by power management entering
- * C6 has to be avoided. This is accomplished by requesting a latency
- * requirement lower than lowest C6 exit latency of all supported
- * platforms as found in the cpuidle state tables in the intel_idle driver.
- * At this time it is possible to do so with a single latency requirement
- * for all supported platforms.
- *
- * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
- * the ACPI latencies need to be considered while keeping in mind that C2
- * may be set to map to deeper sleep states. In this case the latency
- * requirement needs to prevent entering C2 also.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
-{
- struct pseudo_lock_pm_req *pm_req;
- int cpu;
- int ret;
-
- for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
- pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
- if (!pm_req) {
- rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
- ret = -ENOMEM;
- goto out_err;
- }
- ret = dev_pm_qos_add_request(get_cpu_device(cpu),
- &pm_req->req,
- DEV_PM_QOS_RESUME_LATENCY,
- 30);
- if (ret < 0) {
- rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
- cpu);
- kfree(pm_req);
- ret = -1;
- goto out_err;
- }
- list_add(&pm_req->list, &plr->pm_reqs);
- }
-
- return 0;
-
-out_err:
- pseudo_lock_cstates_relax(plr);
- return ret;
-}
-
-/**
- * pseudo_lock_region_clear - Reset pseudo-lock region data
- * @plr: pseudo-lock region
- *
- * All content of the pseudo-locked region is reset - any memory allocated
- * freed.
- *
- * Return: void
- */
-static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
-{
- plr->size = 0;
- plr->line_size = 0;
- kfree(plr->kmem);
- plr->kmem = NULL;
- plr->s = NULL;
- if (plr->d)
- plr->d->plr = NULL;
- plr->d = NULL;
- plr->cbm = 0;
- plr->debugfs_dir = NULL;
-}
-
-/**
- * pseudo_lock_region_init - Initialize pseudo-lock region information
- * @plr: pseudo-lock region
- *
- * Called after user provided a schemata to be pseudo-locked. From the
- * schemata the &struct pseudo_lock_region is on entry already initialized
- * with the resource, domain, and capacity bitmask. Here the information
- * required for pseudo-locking is deduced from this data and &struct
- * pseudo_lock_region initialized further. This information includes:
- * - size in bytes of the region to be pseudo-locked
- * - cache line size to know the stride with which data needs to be accessed
- * to be pseudo-locked
- * - a cpu associated with the cache instance on which the pseudo-locking
- * flow can be executed
- *
- * Return: 0 on success, <0 on failure. Descriptive error will be written
- * to last_cmd_status buffer.
- */
-static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
-{
- enum resctrl_scope scope = plr->s->res->ctrl_scope;
- struct cacheinfo *ci;
- int ret;
-
- if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
- return -ENODEV;
-
- /* Pick the first cpu we find that is associated with the cache. */
- plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
-
- if (!cpu_online(plr->cpu)) {
- rdt_last_cmd_printf("CPU %u associated with cache not online\n",
- plr->cpu);
- ret = -ENODEV;
- goto out_region;
- }
-
- ci = get_cpu_cacheinfo_level(plr->cpu, scope);
- if (ci) {
- plr->line_size = ci->coherency_line_size;
- plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
- return 0;
- }
-
- ret = -1;
- rdt_last_cmd_puts("Unable to determine cache line size\n");
-out_region:
- pseudo_lock_region_clear(plr);
- return ret;
-}
-
-/**
- * pseudo_lock_init - Initialize a pseudo-lock region
- * @rdtgrp: resource group to which new pseudo-locked region will belong
- *
- * A pseudo-locked region is associated with a resource group. When this
- * association is created the pseudo-locked region is initialized. The
- * details of the pseudo-locked region are not known at this time so only
- * allocation is done and association established.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_init(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr;
-
- plr = kzalloc(sizeof(*plr), GFP_KERNEL);
- if (!plr)
- return -ENOMEM;
-
- init_waitqueue_head(&plr->lock_thread_wq);
- INIT_LIST_HEAD(&plr->pm_reqs);
- rdtgrp->plr = plr;
- return 0;
-}
-
-/**
- * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
- * @plr: pseudo-lock region
- *
- * Initialize the details required to set up the pseudo-locked region and
- * allocate the contiguous memory that will be pseudo-locked to the cache.
- *
- * Return: 0 on success, <0 on failure. Descriptive error will be written
- * to last_cmd_status buffer.
- */
-static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
-{
- int ret;
-
- ret = pseudo_lock_region_init(plr);
- if (ret < 0)
- return ret;
-
- /*
- * We do not yet support contiguous regions larger than
- * KMALLOC_MAX_SIZE.
- */
- if (plr->size > KMALLOC_MAX_SIZE) {
- rdt_last_cmd_puts("Requested region exceeds maximum size\n");
- ret = -E2BIG;
- goto out_region;
- }
-
- plr->kmem = kzalloc(plr->size, GFP_KERNEL);
- if (!plr->kmem) {
- rdt_last_cmd_puts("Unable to allocate memory\n");
- ret = -ENOMEM;
- goto out_region;
+ prefetch_disable_bits = 0x5;
+ break;
}
- ret = 0;
- goto out;
-out_region:
- pseudo_lock_region_clear(plr);
-out:
- return ret;
-}
-
-/**
- * pseudo_lock_free - Free a pseudo-locked region
- * @rdtgrp: resource group to which pseudo-locked region belonged
- *
- * The pseudo-locked region's resources have already been released, or not
- * yet created at this point. Now it can be freed and disassociated from the
- * resource group.
- *
- * Return: void
- */
-static void pseudo_lock_free(struct rdtgroup *rdtgrp)
-{
- pseudo_lock_region_clear(rdtgrp->plr);
- kfree(rdtgrp->plr);
- rdtgrp->plr = NULL;
+ return prefetch_disable_bits;
}
/**
- * pseudo_lock_fn - Load kernel memory into cache
- * @_rdtgrp: resource group to which pseudo-lock region belongs
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
*
* This is the core pseudo-locking flow.
*
@@ -426,10 +113,9 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int pseudo_lock_fn(void *_rdtgrp)
+int resctrl_arch_pseudo_lock_fn(void *_plr)
{
- struct rdtgroup *rdtgrp = _rdtgrp;
- struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct pseudo_lock_region *plr = _plr;
u32 rmid_p, closid_p;
unsigned long i;
u64 saved_msr;
@@ -476,8 +162,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
* the buffer and evict pseudo-locked memory read earlier from the
* cache.
*/
- saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL);
- __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
+ native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
closid_p = this_cpu_read(pqr_state.cur_closid);
rmid_p = this_cpu_read(pqr_state.cur_rmid);
mem_r = plr->kmem;
@@ -489,7 +175,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
* pseudo-locked followed by reading of kernel memory to load it
* into the cache.
*/
- __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
/*
* Cache was flushed earlier. Now access kernel memory to read it
* into cache region associated with just activated plr->closid.
@@ -525,10 +212,10 @@ static int pseudo_lock_fn(void *_rdtgrp)
* Critical section end: restore closid with capacity bitmask that
* does not overlap with pseudo-locked region.
*/
- __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
/* Re-enable the hardware prefetcher(s) */
- wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
local_irq_enable();
plr->thread_done = 1;
@@ -537,342 +224,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
}
/**
- * rdtgroup_monitor_in_progress - Test if monitoring in progress
- * @rdtgrp: resource group being queried
- *
- * Return: 1 if monitor groups have been created for this resource
- * group, 0 otherwise.
- */
-static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
-{
- return !list_empty(&rdtgrp->mon.crdtgrp_list);
-}
-
-/**
- * rdtgroup_locksetup_user_restrict - Restrict user access to group
- * @rdtgrp: resource group needing access restricted
- *
- * A resource group used for cache pseudo-locking cannot have cpus or tasks
- * assigned to it. This is communicated to the user by restricting access
- * to all the files that can be used to make such changes.
- *
- * Permissions restored with rdtgroup_locksetup_user_restore()
- *
- * Return: 0 on success, <0 on failure. If a failure occurs during the
- * restriction of access an attempt will be made to restore permissions but
- * the state of the mode of these files will be uncertain when a failure
- * occurs.
- */
-static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
- if (ret)
- return ret;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
- if (ret)
- goto err_tasks;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
- if (ret)
- goto err_cpus;
-
- if (resctrl_arch_mon_capable()) {
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
- if (ret)
- goto err_cpus_list;
- }
-
- ret = 0;
- goto out;
-
-err_cpus_list:
- rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
-err_cpus:
- rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
-err_tasks:
- rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_user_restore - Restore user access to group
- * @rdtgrp: resource group needing access restored
- *
- * Restore all file access previously removed using
- * rdtgroup_locksetup_user_restrict()
- *
- * Return: 0 on success, <0 on failure. If a failure occurs during the
- * restoration of access an attempt will be made to restrict permissions
- * again but the state of the mode of these files will be uncertain when
- * a failure occurs.
- */
-static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
- if (ret)
- return ret;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
- if (ret)
- goto err_tasks;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
- if (ret)
- goto err_cpus;
-
- if (resctrl_arch_mon_capable()) {
- ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
- if (ret)
- goto err_cpus_list;
- }
-
- ret = 0;
- goto out;
-
-err_cpus_list:
- rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
-err_cpus:
- rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
-err_tasks:
- rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_enter - Resource group enters locksetup mode
- * @rdtgrp: resource group requested to enter locksetup mode
- *
- * A resource group enters locksetup mode to reflect that it would be used
- * to represent a pseudo-locked region and is in the process of being set
- * up to do so. A resource group used for a pseudo-locked region would
- * lose the closid associated with it so we cannot allow it to have any
- * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
- * future. Monitoring of a pseudo-locked region is not allowed either.
- *
- * The above and more restrictions on a pseudo-locked region are checked
- * for and enforced before the resource group enters the locksetup mode.
- *
- * Returns: 0 if the resource group successfully entered locksetup mode, <0
- * on failure. On failure the last_cmd_status buffer is updated with text to
- * communicate details of failure to the user.
- */
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- /*
- * The default resource group can neither be removed nor lose the
- * default closid associated with it.
- */
- if (rdtgrp == &rdtgroup_default) {
- rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
- return -EINVAL;
- }
-
- /*
- * Cache Pseudo-locking not supported when CDP is enabled.
- *
- * Some things to consider if you would like to enable this
- * support (using L3 CDP as example):
- * - When CDP is enabled two separate resources are exposed,
- * L3DATA and L3CODE, but they are actually on the same cache.
- * The implication for pseudo-locking is that if a
- * pseudo-locked region is created on a domain of one
- * resource (eg. L3CODE), then a pseudo-locked region cannot
- * be created on that same domain of the other resource
- * (eg. L3DATA). This is because the creation of a
- * pseudo-locked region involves a call to wbinvd that will
- * affect all cache allocations on particular domain.
- * - Considering the previous, it may be possible to only
- * expose one of the CDP resources to pseudo-locking and
- * hide the other. For example, we could consider to only
- * expose L3DATA and since the L3 cache is unified it is
- * still possible to place instructions there are execute it.
- * - If only one region is exposed to pseudo-locking we should
- * still keep in mind that availability of a portion of cache
- * for pseudo-locking should take into account both resources.
- * Similarly, if a pseudo-locked region is created in one
- * resource, the portion of cache used by it should be made
- * unavailable to all future allocations from both resources.
- */
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
- resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
- rdt_last_cmd_puts("CDP enabled\n");
- return -EINVAL;
- }
-
- /*
- * Not knowing the bits to disable prefetching implies that this
- * platform does not support Cache Pseudo-Locking.
- */
- prefetch_disable_bits = get_prefetch_disable_bits();
- if (prefetch_disable_bits == 0) {
- rdt_last_cmd_puts("Pseudo-locking not supported\n");
- return -EINVAL;
- }
-
- if (rdtgroup_monitor_in_progress(rdtgrp)) {
- rdt_last_cmd_puts("Monitoring in progress\n");
- return -EINVAL;
- }
-
- if (rdtgroup_tasks_assigned(rdtgrp)) {
- rdt_last_cmd_puts("Tasks assigned to resource group\n");
- return -EINVAL;
- }
-
- if (!cpumask_empty(&rdtgrp->cpu_mask)) {
- rdt_last_cmd_puts("CPUs assigned to resource group\n");
- return -EINVAL;
- }
-
- if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
- rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
- return -EIO;
- }
-
- ret = pseudo_lock_init(rdtgrp);
- if (ret) {
- rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
- goto out_release;
- }
-
- /*
- * If this system is capable of monitoring a rmid would have been
- * allocated when the control group was created. This is not needed
- * anymore when this group would be used for pseudo-locking. This
- * is safe to call on platforms not capable of monitoring.
- */
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
- ret = 0;
- goto out;
-
-out_release:
- rdtgroup_locksetup_user_restore(rdtgrp);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_exit - resource group exist locksetup mode
- * @rdtgrp: resource group
- *
- * When a resource group exits locksetup mode the earlier restrictions are
- * lifted.
- *
- * Return: 0 on success, <0 on failure
- */
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- if (resctrl_arch_mon_capable()) {
- ret = alloc_rmid(rdtgrp->closid);
- if (ret < 0) {
- rdt_last_cmd_puts("Out of RMIDs\n");
- return ret;
- }
- rdtgrp->mon.rmid = ret;
- }
-
- ret = rdtgroup_locksetup_user_restore(rdtgrp);
- if (ret) {
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
- return ret;
- }
-
- pseudo_lock_free(rdtgrp);
- return 0;
-}
-
-/**
- * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
- * @d: RDT domain
- * @cbm: CBM to test
- *
- * @d represents a cache instance and @cbm a capacity bitmask that is
- * considered for it. Determine if @cbm overlaps with any existing
- * pseudo-locked region on @d.
- *
- * @cbm is unsigned long, even if only 32 bits are used, to make the
- * bitmap functions work correctly.
- *
- * Return: true if @cbm overlaps with pseudo-locked region on @d, false
- * otherwise.
- */
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
-{
- unsigned int cbm_len;
- unsigned long cbm_b;
-
- if (d->plr) {
- cbm_len = d->plr->s->res->cache.cbm_len;
- cbm_b = d->plr->cbm;
- if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
- return true;
- }
- return false;
-}
-
-/**
- * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
- * @d: RDT domain under test
- *
- * The setup of a pseudo-locked region affects all cache instances within
- * the hierarchy of the region. It is thus essential to know if any
- * pseudo-locked regions exist within a cache hierarchy to prevent any
- * attempts to create new pseudo-locked regions in the same hierarchy.
- *
- * Return: true if a pseudo-locked region exists in the hierarchy of @d or
- * if it is not possible to test due to memory allocation issue,
- * false otherwise.
- */
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
-{
- struct rdt_ctrl_domain *d_i;
- cpumask_var_t cpu_with_psl;
- struct rdt_resource *r;
- bool ret = false;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
- return true;
-
- /*
- * First determine which cpus have pseudo-locked regions
- * associated with them.
- */
- for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
- if (d_i->plr)
- cpumask_or(cpu_with_psl, cpu_with_psl,
- &d_i->hdr.cpu_mask);
- }
- }
-
- /*
- * Next test if new pseudo-locked region would intersect with
- * existing region.
- */
- if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
- ret = true;
-
- free_cpumask_var(cpu_with_psl);
- return ret;
-}
-
-/**
- * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
* @_plr: pseudo-lock region to measure
*
* There is no deterministic way to test if a memory region is cached. One
@@ -885,7 +238,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int measure_cycles_lat_fn(void *_plr)
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
u32 saved_low, saved_high;
@@ -898,7 +251,7 @@ static int measure_cycles_lat_fn(void *_plr)
* Disable hardware prefetchers.
*/
rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
- wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
mem_r = READ_ONCE(plr->kmem);
/*
* Dummy execute of the time measurement to load the needed
@@ -994,7 +347,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
* Disable hardware prefetchers.
*/
rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
- wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
/* Initialize rest of local variables */
/*
@@ -1012,8 +365,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
* used in L1 cache, second to capture accurate value that does not
* include cache misses incurred because of instruction loads.
*/
- rdpmcl(hit_pmcnum, hits_before);
- rdpmcl(miss_pmcnum, miss_before);
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
/*
* From SDM: Performing back-to-back fast reads are not guaranteed
* to be monotonic.
@@ -1021,8 +374,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
* before proceeding.
*/
rmb();
- rdpmcl(hit_pmcnum, hits_before);
- rdpmcl(miss_pmcnum, miss_before);
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
/*
* Use LFENCE to ensure all previous instructions are retired
* before proceeding.
@@ -1044,8 +397,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
* before proceeding.
*/
rmb();
- rdpmcl(hit_pmcnum, hits_after);
- rdpmcl(miss_pmcnum, miss_after);
+ hits_after = rdpmc(hit_pmcnum);
+ miss_after = rdpmc(miss_pmcnum);
/*
* Use LFENCE to ensure all previous instructions are retired
* before proceeding.
@@ -1069,7 +422,7 @@ out:
return 0;
}
-static int measure_l2_residency(void *_plr)
+int resctrl_arch_measure_l2_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1107,7 +460,7 @@ out:
return 0;
}
-static int measure_l3_residency(void *_plr)
+int resctrl_arch_measure_l3_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1162,428 +515,3 @@ out:
wake_up_interruptible(&plr->lock_thread_wq);
return 0;
}
-
-/**
- * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
- * @rdtgrp: Resource group to which the pseudo-locked region belongs.
- * @sel: Selector of which measurement to perform on a pseudo-locked region.
- *
- * The measurement of latency to access a pseudo-locked region should be
- * done from a cpu that is associated with that pseudo-locked region.
- * Determine which cpu is associated with this region and start a thread on
- * that cpu to perform the measurement, wait for that thread to complete.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
- struct task_struct *thread;
- unsigned int cpu;
- int ret = -1;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- if (rdtgrp->flags & RDT_DELETED) {
- ret = -ENODEV;
- goto out;
- }
-
- if (!plr->d) {
- ret = -ENODEV;
- goto out;
- }
-
- plr->thread_done = 0;
- cpu = cpumask_first(&plr->d->hdr.cpu_mask);
- if (!cpu_online(cpu)) {
- ret = -ENODEV;
- goto out;
- }
-
- plr->cpu = cpu;
-
- if (sel == 1)
- thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr,
- cpu, "pseudo_lock_measure/%u");
- else if (sel == 2)
- thread = kthread_run_on_cpu(measure_l2_residency, plr,
- cpu, "pseudo_lock_measure/%u");
- else if (sel == 3)
- thread = kthread_run_on_cpu(measure_l3_residency, plr,
- cpu, "pseudo_lock_measure/%u");
- else
- goto out;
-
- if (IS_ERR(thread)) {
- ret = PTR_ERR(thread);
- goto out;
- }
-
- ret = wait_event_interruptible(plr->lock_thread_wq,
- plr->thread_done == 1);
- if (ret < 0)
- goto out;
-
- ret = 0;
-
-out:
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
- return ret;
-}
-
-static ssize_t pseudo_lock_measure_trigger(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct rdtgroup *rdtgrp = file->private_data;
- size_t buf_size;
- char buf[32];
- int ret;
- int sel;
-
- buf_size = min(count, (sizeof(buf) - 1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- ret = kstrtoint(buf, 10, &sel);
- if (ret == 0) {
- if (sel != 1 && sel != 2 && sel != 3)
- return -EINVAL;
- ret = debugfs_file_get(file->f_path.dentry);
- if (ret)
- return ret;
- ret = pseudo_lock_measure_cycles(rdtgrp, sel);
- if (ret == 0)
- ret = count;
- debugfs_file_put(file->f_path.dentry);
- }
-
- return ret;
-}
-
-static const struct file_operations pseudo_measure_fops = {
- .write = pseudo_lock_measure_trigger,
- .open = simple_open,
- .llseek = default_llseek,
-};
-
-/**
- * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
- * @rdtgrp: resource group to which pseudo-lock region belongs
- *
- * Called when a resource group in the pseudo-locksetup mode receives a
- * valid schemata that should be pseudo-locked. Since the resource group is
- * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
- * allocated and initialized with the essential information. If a failure
- * occurs the resource group remains in the pseudo-locksetup mode with the
- * &struct pseudo_lock_region associated with it, but cleared from all
- * information and ready for the user to re-attempt pseudo-locking by
- * writing the schemata again.
- *
- * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
- * on failure. Descriptive error will be written to last_cmd_status buffer.
- */
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
- struct task_struct *thread;
- unsigned int new_minor;
- struct device *dev;
- int ret;
-
- ret = pseudo_lock_region_alloc(plr);
- if (ret < 0)
- return ret;
-
- ret = pseudo_lock_cstates_constrain(plr);
- if (ret < 0) {
- ret = -EINVAL;
- goto out_region;
- }
-
- plr->thread_done = 0;
-
- thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp,
- plr->cpu, "pseudo_lock/%u");
- if (IS_ERR(thread)) {
- ret = PTR_ERR(thread);
- rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
- goto out_cstates;
- }
-
- ret = wait_event_interruptible(plr->lock_thread_wq,
- plr->thread_done == 1);
- if (ret < 0) {
- /*
- * If the thread does not get on the CPU for whatever
- * reason and the process which sets up the region is
- * interrupted then this will leave the thread in runnable
- * state and once it gets on the CPU it will dereference
- * the cleared, but not freed, plr struct resulting in an
- * empty pseudo-locking loop.
- */
- rdt_last_cmd_puts("Locking thread interrupted\n");
- goto out_cstates;
- }
-
- ret = pseudo_lock_minor_get(&new_minor);
- if (ret < 0) {
- rdt_last_cmd_puts("Unable to obtain a new minor number\n");
- goto out_cstates;
- }
-
- /*
- * Unlock access but do not release the reference. The
- * pseudo-locked region will still be here on return.
- *
- * The mutex has to be released temporarily to avoid a potential
- * deadlock with the mm->mmap_lock which is obtained in the
- * device_create() and debugfs_create_dir() callpath below as well as
- * before the mmap() callback is called.
- */
- mutex_unlock(&rdtgroup_mutex);
-
- if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
- plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
- debugfs_resctrl);
- if (!IS_ERR_OR_NULL(plr->debugfs_dir))
- debugfs_create_file("pseudo_lock_measure", 0200,
- plr->debugfs_dir, rdtgrp,
- &pseudo_measure_fops);
- }
-
- dev = device_create(&pseudo_lock_class, NULL,
- MKDEV(pseudo_lock_major, new_minor),
- rdtgrp, "%s", rdtgrp->kn->name);
-
- mutex_lock(&rdtgroup_mutex);
-
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- rdt_last_cmd_printf("Failed to create character device: %d\n",
- ret);
- goto out_debugfs;
- }
-
- /* We released the mutex - check if group was removed while we did so */
- if (rdtgrp->flags & RDT_DELETED) {
- ret = -ENODEV;
- goto out_device;
- }
-
- plr->minor = new_minor;
-
- rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
- closid_free(rdtgrp->closid);
- rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
- rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
-
- ret = 0;
- goto out;
-
-out_device:
- device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
-out_debugfs:
- debugfs_remove_recursive(plr->debugfs_dir);
- pseudo_lock_minor_release(new_minor);
-out_cstates:
- pseudo_lock_cstates_relax(plr);
-out_region:
- pseudo_lock_region_clear(plr);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
- * @rdtgrp: resource group to which the pseudo-locked region belongs
- *
- * The removal of a pseudo-locked region can be initiated when the resource
- * group is removed from user space via a "rmdir" from userspace or the
- * unmount of the resctrl filesystem. On removal the resource group does
- * not go back to pseudo-locksetup mode before it is removed, instead it is
- * removed directly. There is thus asymmetry with the creation where the
- * &struct pseudo_lock_region is removed here while it was not created in
- * rdtgroup_pseudo_lock_create().
- *
- * Return: void
- */
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- /*
- * Default group cannot be a pseudo-locked region so we can
- * free closid here.
- */
- closid_free(rdtgrp->closid);
- goto free;
- }
-
- pseudo_lock_cstates_relax(plr);
- debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
- device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
- pseudo_lock_minor_release(plr->minor);
-
-free:
- pseudo_lock_free(rdtgrp);
-}
-
-static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
-{
- struct rdtgroup *rdtgrp;
-
- mutex_lock(&rdtgroup_mutex);
-
- rdtgrp = region_find_by_minor(iminor(inode));
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- filp->private_data = rdtgrp;
- atomic_inc(&rdtgrp->waitcount);
- /* Perform a non-seekable open - llseek is not supported */
- filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
-
- mutex_unlock(&rdtgroup_mutex);
-
- return 0;
-}
-
-static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
-{
- struct rdtgroup *rdtgrp;
-
- mutex_lock(&rdtgroup_mutex);
- rdtgrp = filp->private_data;
- WARN_ON(!rdtgrp);
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
- filp->private_data = NULL;
- atomic_dec(&rdtgrp->waitcount);
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
-{
- /* Not supported */
- return -EINVAL;
-}
-
-static const struct vm_operations_struct pseudo_mmap_ops = {
- .mremap = pseudo_lock_dev_mremap,
-};
-
-static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
-{
- unsigned long vsize = vma->vm_end - vma->vm_start;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
- struct pseudo_lock_region *plr;
- struct rdtgroup *rdtgrp;
- unsigned long physical;
- unsigned long psize;
-
- mutex_lock(&rdtgroup_mutex);
-
- rdtgrp = filp->private_data;
- WARN_ON(!rdtgrp);
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- plr = rdtgrp->plr;
-
- if (!plr->d) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- /*
- * Task is required to run with affinity to the cpus associated
- * with the pseudo-locked region. If this is not the case the task
- * may be scheduled elsewhere and invalidate entries in the
- * pseudo-locked region.
- */
- if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EINVAL;
- }
-
- physical = __pa(plr->kmem) >> PAGE_SHIFT;
- psize = plr->size - off;
-
- if (off > plr->size) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENOSPC;
- }
-
- /*
- * Ensure changes are carried directly to the memory being mapped,
- * do not allow copy-on-write mapping.
- */
- if (!(vma->vm_flags & VM_SHARED)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EINVAL;
- }
-
- if (vsize > psize) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENOSPC;
- }
-
- memset(plr->kmem + off, 0, vsize);
-
- if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
- vsize, vma->vm_page_prot)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EAGAIN;
- }
- vma->vm_ops = &pseudo_mmap_ops;
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static const struct file_operations pseudo_lock_dev_fops = {
- .owner = THIS_MODULE,
- .read = NULL,
- .write = NULL,
- .open = pseudo_lock_dev_open,
- .release = pseudo_lock_dev_release,
- .mmap = pseudo_lock_dev_mmap,
-};
-
-int rdt_pseudo_lock_init(void)
-{
- int ret;
-
- ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
- if (ret < 0)
- return ret;
-
- pseudo_lock_major = ret;
-
- ret = class_register(&pseudo_lock_class);
- if (ret) {
- unregister_chrdev(pseudo_lock_major, "pseudo_lock");
- return ret;
- }
-
- return 0;
-}
-
-void rdt_pseudo_lock_release(void)
-{
- class_unregister(&pseudo_lock_class);
- unregister_chrdev(pseudo_lock_major, "pseudo_lock");
- pseudo_lock_major = 0;
-}
diff --git a/arch/x86/kernel/cpu/resctrl/trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
index 2a506316b303..7c8aef08010f 100644
--- a/arch/x86/kernel/cpu/resctrl/trace.h
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
@@ -2,8 +2,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl
-#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_RESCTRL_H
+#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H
#include <linux/tracepoint.h>
@@ -35,25 +35,11 @@ TRACE_EVENT(pseudo_lock_l3,
TP_printk("hits=%llu miss=%llu",
__entry->l3_hits, __entry->l3_miss));
-TRACE_EVENT(mon_llc_occupancy_limbo,
- TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
- TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
- TP_STRUCT__entry(__field(u32, ctrl_hw_id)
- __field(u32, mon_hw_id)
- __field(int, domain_id)
- __field(u64, llc_occupancy_bytes)),
- TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
- __entry->mon_hw_id = mon_hw_id;
- __entry->domain_id = domain_id;
- __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
- TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
- __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
- __entry->llc_occupancy_bytes)
- );
-
-#endif /* _TRACE_RESCTRL_H */
+#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE trace
+
+#define TRACE_INCLUDE_FILE pseudo_lock_trace
+
#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 6419e04d8a7b..885026468440 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -18,6 +18,7 @@
#include <linux/fs_parser.h>
#include <linux/sysfs.h>
#include <linux/kernfs.h>
+#include <linux/resctrl.h>
#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/sched/signal.h>
@@ -28,333 +29,28 @@
#include <uapi/linux/magic.h>
-#include <asm/resctrl.h>
+#include <asm/msr.h>
#include "internal.h"
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
-DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
-DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-
-/* Mutex to protect rdtgroup access. */
-DEFINE_MUTEX(rdtgroup_mutex);
-
-static struct kernfs_root *rdt_root;
-struct rdtgroup rdtgroup_default;
-LIST_HEAD(rdt_all_groups);
-
-/* list of entries for the schemata file */
-LIST_HEAD(resctrl_schema_all);
-
-/* The filesystem can only be mounted once. */
-bool resctrl_mounted;
-
-/* Kernel fs node for "info" directory under root */
-static struct kernfs_node *kn_info;
-
-/* Kernel fs node for "mon_groups" directory under root */
-static struct kernfs_node *kn_mongrp;
-
-/* Kernel fs node for "mon_data" directory under root */
-static struct kernfs_node *kn_mondata;
-
-static struct seq_buf last_cmd_status;
-static char last_cmd_status_buf[512];
-
-static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
-static void rdtgroup_destroy_root(void);
-
-struct dentry *debugfs_resctrl;
-
-/*
- * Memory bandwidth monitoring event to use for the default CTRL_MON group
- * and each new CTRL_MON group created by the user. Only relevant when
- * the filesystem is mounted with the "mba_MBps" option so it does not
- * matter that it remains uninitialized on systems that do not support
- * the "mba_MBps" option.
- */
-enum resctrl_event_id mba_mbps_default_event;
-
-static bool resctrl_debug;
-
-void rdt_last_cmd_clear(void)
-{
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_clear(&last_cmd_status);
-}
-
-void rdt_last_cmd_puts(const char *s)
-{
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_puts(&last_cmd_status, s);
-}
-
-void rdt_last_cmd_printf(const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_vprintf(&last_cmd_status, fmt, ap);
- va_end(ap);
-}
-
-void rdt_staged_configs_clear(void)
-{
- struct rdt_ctrl_domain *dom;
- struct rdt_resource *r;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
- memset(dom->staged_config, 0, sizeof(dom->staged_config));
- }
-}
-
-/*
- * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
- * we can keep a bitmap of free CLOSIDs in a single integer.
- *
- * Using a global CLOSID across all resources has some advantages and
- * some drawbacks:
- * + We can simply set current's closid to assign a task to a resource
- * group.
- * + Context switch code can avoid extra memory references deciding which
- * CLOSID to load into the PQR_ASSOC MSR
- * - We give up some options in configuring resource groups across multi-socket
- * systems.
- * - Our choices on how to configure each resource become progressively more
- * limited as the number of resources grows.
- */
-static unsigned long closid_free_map;
-static int closid_free_map_len;
-
-int closids_supported(void)
-{
- return closid_free_map_len;
-}
-
-static void closid_init(void)
-{
- struct resctrl_schema *s;
- u32 rdt_min_closid = 32;
-
- /* Compute rdt_min_closid across all resources */
- list_for_each_entry(s, &resctrl_schema_all, list)
- rdt_min_closid = min(rdt_min_closid, s->num_closid);
-
- closid_free_map = BIT_MASK(rdt_min_closid) - 1;
-
- /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
- __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map);
- closid_free_map_len = rdt_min_closid;
-}
-
-static int closid_alloc(void)
-{
- int cleanest_closid;
- u32 closid;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
- cleanest_closid = resctrl_find_cleanest_closid();
- if (cleanest_closid < 0)
- return cleanest_closid;
- closid = cleanest_closid;
- } else {
- closid = ffs(closid_free_map);
- if (closid == 0)
- return -ENOSPC;
- closid--;
- }
- __clear_bit(closid, &closid_free_map);
-
- return closid;
-}
-
-void closid_free(int closid)
-{
- lockdep_assert_held(&rdtgroup_mutex);
-
- __set_bit(closid, &closid_free_map);
-}
-
-/**
- * closid_allocated - test if provided closid is in use
- * @closid: closid to be tested
- *
- * Return: true if @closid is currently associated with a resource group,
- * false if @closid is free
- */
-bool closid_allocated(unsigned int closid)
-{
- lockdep_assert_held(&rdtgroup_mutex);
-
- return !test_bit(closid, &closid_free_map);
-}
-
-/**
- * rdtgroup_mode_by_closid - Return mode of resource group with closid
- * @closid: closid if the resource group
- *
- * Each resource group is associated with a @closid. Here the mode
- * of a resource group can be queried by searching for it using its closid.
- *
- * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
- */
-enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
-{
- struct rdtgroup *rdtgrp;
-
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (rdtgrp->closid == closid)
- return rdtgrp->mode;
- }
-
- return RDT_NUM_MODES;
-}
-
-static const char * const rdt_mode_str[] = {
- [RDT_MODE_SHAREABLE] = "shareable",
- [RDT_MODE_EXCLUSIVE] = "exclusive",
- [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup",
- [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked",
-};
-
-/**
- * rdtgroup_mode_str - Return the string representation of mode
- * @mode: the resource group mode as &enum rdtgroup_mode
- *
- * Return: string representation of valid mode, "unknown" otherwise
- */
-static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
-{
- if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
- return "unknown";
-
- return rdt_mode_str[mode];
-}
-
-/* set uid and gid of rdtgroup dirs and files to that of the creator */
-static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
-static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
-{
- struct kernfs_node *kn;
- int ret;
-
- kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
- 0, rft->kf_ops, rft, NULL, NULL);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
- return 0;
-}
-
-static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
-{
- struct kernfs_open_file *of = m->private;
- struct rftype *rft = of->kn->priv;
-
- if (rft->seq_show)
- return rft->seq_show(of, m, arg);
- return 0;
-}
-
-static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off)
-{
- struct rftype *rft = of->kn->priv;
-
- if (rft->write)
- return rft->write(of, buf, nbytes, off);
-
- return -EINVAL;
-}
-
-static const struct kernfs_ops rdtgroup_kf_single_ops = {
- .atomic_write_len = PAGE_SIZE,
- .write = rdtgroup_file_write,
- .seq_show = rdtgroup_seqfile_show,
-};
-
-static const struct kernfs_ops kf_mondata_ops = {
- .atomic_write_len = PAGE_SIZE,
- .seq_show = rdtgroup_mondata_show,
-};
-static bool is_cpu_list(struct kernfs_open_file *of)
-{
- struct rftype *rft = of->kn->priv;
-
- return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
-}
-
-static int rdtgroup_cpus_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- struct cpumask *mask;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
-
- if (rdtgrp) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- mask = &rdtgrp->plr->d->hdr.cpu_mask;
- seq_printf(s, is_cpu_list(of) ?
- "%*pbl\n" : "%*pb\n",
- cpumask_pr_args(mask));
- }
- } else {
- seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
- cpumask_pr_args(&rdtgrp->cpu_mask));
- }
- } else {
- ret = -ENOENT;
- }
- rdtgroup_kn_unlock(of->kn);
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
- return ret;
-}
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
/*
- * This is safe against resctrl_sched_in() called from __switch_to()
+ * This is safe against resctrl_arch_sched_in() called from __switch_to()
* because __switch_to() is executed with interrupts disabled. A local call
* from update_closid_rmid() is protected against __switch_to() because
* preemption is disabled.
*/
-static void update_cpu_closid_rmid(void *info)
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
{
- struct rdtgroup *r = info;
+ struct resctrl_cpu_defaults *r = info;
if (r) {
this_cpu_write(pqr_state.default_closid, r->closid);
- this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
}
/*
@@ -362,1201 +58,9 @@ static void update_cpu_closid_rmid(void *info)
* executing task might have its own closid selected. Just reuse
* the context switch code.
*/
- resctrl_sched_in(current);
-}
-
-/*
- * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
- *
- * Per task closids/rmids must have been set up before calling this function.
- */
-static void
-update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
-{
- on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1);
-}
-
-static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
- cpumask_var_t tmpmask)
-{
- struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
- struct list_head *head;
-
- /* Check whether cpus belong to parent ctrl group */
- cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
- return -EINVAL;
- }
-
- /* Check whether cpus are dropped from this group */
- cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (!cpumask_empty(tmpmask)) {
- /* Give any dropped cpus to parent rdtgroup */
- cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
- update_closid_rmid(tmpmask, prgrp);
- }
-
- /*
- * If we added cpus, remove them from previous group that owned them
- * and update per-cpu rmid
- */
- cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- if (crgrp == rdtgrp)
- continue;
- cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
- tmpmask);
- }
- update_closid_rmid(tmpmask, rdtgrp);
- }
-
- /* Done pushing/pulling - update this group with new mask */
- cpumask_copy(&rdtgrp->cpu_mask, newmask);
-
- return 0;
-}
-
-static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
-{
- struct rdtgroup *crgrp;
-
- cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
- /* update the child mon group masks as well*/
- list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
- cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
+ resctrl_arch_sched_in(current);
}
-static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
- cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
-{
- struct rdtgroup *r, *crgrp;
- struct list_head *head;
-
- /* Check whether cpus are dropped from this group */
- cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (!cpumask_empty(tmpmask)) {
- /* Can't drop from default group */
- if (rdtgrp == &rdtgroup_default) {
- rdt_last_cmd_puts("Can't drop CPUs from default group\n");
- return -EINVAL;
- }
-
- /* Give any dropped cpus to rdtgroup_default */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, tmpmask);
- update_closid_rmid(tmpmask, &rdtgroup_default);
- }
-
- /*
- * If we added cpus, remove them from previous group and
- * the prev group's child groups that owned them
- * and update per-cpu closid/rmid.
- */
- cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
- if (r == rdtgrp)
- continue;
- cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
- if (!cpumask_empty(tmpmask1))
- cpumask_rdtgrp_clear(r, tmpmask1);
- }
- update_closid_rmid(tmpmask, rdtgrp);
- }
-
- /* Done pushing/pulling - update this group with new mask */
- cpumask_copy(&rdtgrp->cpu_mask, newmask);
-
- /*
- * Clear child mon group masks since there is a new parent mask
- * now and update the rmid for the cpus the child lost.
- */
- head = &rdtgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
- update_closid_rmid(tmpmask, rdtgrp);
- cpumask_clear(&crgrp->cpu_mask);
- }
-
- return 0;
-}
-
-static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- cpumask_var_t tmpmask, newmask, tmpmask1;
- struct rdtgroup *rdtgrp;
- int ret;
-
- if (!buf)
- return -EINVAL;
-
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
- if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
- free_cpumask_var(tmpmask);
- return -ENOMEM;
- }
- if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
- free_cpumask_var(tmpmask);
- free_cpumask_var(newmask);
- return -ENOMEM;
- }
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- ret = -ENOENT;
- goto unlock;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto unlock;
- }
-
- if (is_cpu_list(of))
- ret = cpulist_parse(buf, newmask);
- else
- ret = cpumask_parse(buf, newmask);
-
- if (ret) {
- rdt_last_cmd_puts("Bad CPU list/mask\n");
- goto unlock;
- }
-
- /* check that user didn't specify any offline cpus */
- cpumask_andnot(tmpmask, newmask, cpu_online_mask);
- if (!cpumask_empty(tmpmask)) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Can only assign online CPUs\n");
- goto unlock;
- }
-
- if (rdtgrp->type == RDTCTRL_GROUP)
- ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
- else if (rdtgrp->type == RDTMON_GROUP)
- ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
- else
- ret = -EINVAL;
-
-unlock:
- rdtgroup_kn_unlock(of->kn);
- free_cpumask_var(tmpmask);
- free_cpumask_var(newmask);
- free_cpumask_var(tmpmask1);
-
- return ret ?: nbytes;
-}
-
-/**
- * rdtgroup_remove - the helper to remove resource group safely
- * @rdtgrp: resource group to remove
- *
- * On resource group creation via a mkdir, an extra kernfs_node reference is
- * taken to ensure that the rdtgroup structure remains accessible for the
- * rdtgroup_kn_unlock() calls where it is removed.
- *
- * Drop the extra reference here, then free the rdtgroup structure.
- *
- * Return: void
- */
-static void rdtgroup_remove(struct rdtgroup *rdtgrp)
-{
- kernfs_put(rdtgrp->kn);
- kfree(rdtgrp);
-}
-
-static void _update_task_closid_rmid(void *task)
-{
- /*
- * If the task is still current on this CPU, update PQR_ASSOC MSR.
- * Otherwise, the MSR is updated when the task is scheduled in.
- */
- if (task == current)
- resctrl_sched_in(task);
-}
-
-static void update_task_closid_rmid(struct task_struct *t)
-{
- if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
- smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
- else
- _update_task_closid_rmid(t);
-}
-
-static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
-{
- u32 closid, rmid = rdtgrp->mon.rmid;
-
- if (rdtgrp->type == RDTCTRL_GROUP)
- closid = rdtgrp->closid;
- else if (rdtgrp->type == RDTMON_GROUP)
- closid = rdtgrp->mon.parent->closid;
- else
- return false;
-
- return resctrl_arch_match_closid(tsk, closid) &&
- resctrl_arch_match_rmid(tsk, closid, rmid);
-}
-
-static int __rdtgroup_move_task(struct task_struct *tsk,
- struct rdtgroup *rdtgrp)
-{
- /* If the task is already in rdtgrp, no need to move the task. */
- if (task_in_rdtgroup(tsk, rdtgrp))
- return 0;
-
- /*
- * Set the task's closid/rmid before the PQR_ASSOC MSR can be
- * updated by them.
- *
- * For ctrl_mon groups, move both closid and rmid.
- * For monitor groups, can move the tasks only from
- * their parent CTRL group.
- */
- if (rdtgrp->type == RDTMON_GROUP &&
- !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
- rdt_last_cmd_puts("Can't move task to different control group\n");
- return -EINVAL;
- }
-
- if (rdtgrp->type == RDTMON_GROUP)
- resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
- rdtgrp->mon.rmid);
- else
- resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
- rdtgrp->mon.rmid);
-
- /*
- * Ensure the task's closid and rmid are written before determining if
- * the task is current that will decide if it will be interrupted.
- * This pairs with the full barrier between the rq->curr update and
- * resctrl_sched_in() during context switch.
- */
- smp_mb();
-
- /*
- * By now, the task's closid and rmid are set. If the task is current
- * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
- * group go into effect. If the task is not current, the MSR will be
- * updated when the task is scheduled in.
- */
- update_task_closid_rmid(tsk);
-
- return 0;
-}
-
-static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
-{
- return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
- resctrl_arch_match_closid(t, r->closid));
-}
-
-static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
-{
- return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
- resctrl_arch_match_rmid(t, r->mon.parent->closid,
- r->mon.rmid));
-}
-
-/**
- * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
- * @r: Resource group
- *
- * Return: 1 if tasks have been assigned to @r, 0 otherwise
- */
-int rdtgroup_tasks_assigned(struct rdtgroup *r)
-{
- struct task_struct *p, *t;
- int ret = 0;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- rcu_read_lock();
- for_each_process_thread(p, t) {
- if (is_closid_match(t, r) || is_rmid_match(t, r)) {
- ret = 1;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static int rdtgroup_task_write_permission(struct task_struct *task,
- struct kernfs_open_file *of)
-{
- const struct cred *tcred = get_task_cred(task);
- const struct cred *cred = current_cred();
- int ret = 0;
-
- /*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
- */
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid)) {
- rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
- ret = -EPERM;
- }
-
- put_cred(tcred);
- return ret;
-}
-
-static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
- struct kernfs_open_file *of)
-{
- struct task_struct *tsk;
- int ret;
-
- rcu_read_lock();
- if (pid) {
- tsk = find_task_by_vpid(pid);
- if (!tsk) {
- rcu_read_unlock();
- rdt_last_cmd_printf("No task %d\n", pid);
- return -ESRCH;
- }
- } else {
- tsk = current;
- }
-
- get_task_struct(tsk);
- rcu_read_unlock();
-
- ret = rdtgroup_task_write_permission(tsk, of);
- if (!ret)
- ret = __rdtgroup_move_task(tsk, rdtgrp);
-
- put_task_struct(tsk);
- return ret;
-}
-
-static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdtgroup *rdtgrp;
- char *pid_str;
- int ret = 0;
- pid_t pid;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
- rdt_last_cmd_clear();
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto unlock;
- }
-
- while (buf && buf[0] != '\0' && buf[0] != '\n') {
- pid_str = strim(strsep(&buf, ","));
-
- if (kstrtoint(pid_str, 0, &pid)) {
- rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
- ret = -EINVAL;
- break;
- }
-
- if (pid < 0) {
- rdt_last_cmd_printf("Invalid pid %d\n", pid);
- ret = -EINVAL;
- break;
- }
-
- ret = rdtgroup_move_task(pid, rdtgrp, of);
- if (ret) {
- rdt_last_cmd_printf("Error while processing task %d\n", pid);
- break;
- }
- }
-
-unlock:
- rdtgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
-}
-
-static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
-{
- struct task_struct *p, *t;
- pid_t pid;
-
- rcu_read_lock();
- for_each_process_thread(p, t) {
- if (is_closid_match(t, r) || is_rmid_match(t, r)) {
- pid = task_pid_vnr(t);
- if (pid)
- seq_printf(s, "%d\n", pid);
- }
- }
- rcu_read_unlock();
-}
-
-static int rdtgroup_tasks_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- show_rdt_tasks(rdtgrp, s);
- else
- ret = -ENOENT;
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-static int rdtgroup_closid_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- seq_printf(s, "%u\n", rdtgrp->closid);
- else
- ret = -ENOENT;
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-static int rdtgroup_rmid_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- seq_printf(s, "%u\n", rdtgrp->mon.rmid);
- else
- ret = -ENOENT;
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-#ifdef CONFIG_PROC_CPU_RESCTRL
-
-/*
- * A task can only be part of one resctrl control group and of one monitor
- * group which is associated to that control group.
- *
- * 1) res:
- * mon:
- *
- * resctrl is not available.
- *
- * 2) res:/
- * mon:
- *
- * Task is part of the root resctrl control group, and it is not associated
- * to any monitor group.
- *
- * 3) res:/
- * mon:mon0
- *
- * Task is part of the root resctrl control group and monitor group mon0.
- *
- * 4) res:group0
- * mon:
- *
- * Task is part of resctrl control group group0, and it is not associated
- * to any monitor group.
- *
- * 5) res:group0
- * mon:mon1
- *
- * Task is part of resctrl control group group0 and monitor group mon1.
- */
-int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- struct rdtgroup *rdtg;
- int ret = 0;
-
- mutex_lock(&rdtgroup_mutex);
-
- /* Return empty if resctrl has not been mounted. */
- if (!resctrl_mounted) {
- seq_puts(s, "res:\nmon:\n");
- goto unlock;
- }
-
- list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
- struct rdtgroup *crg;
-
- /*
- * Task information is only relevant for shareable
- * and exclusive groups.
- */
- if (rdtg->mode != RDT_MODE_SHAREABLE &&
- rdtg->mode != RDT_MODE_EXCLUSIVE)
- continue;
-
- if (!resctrl_arch_match_closid(tsk, rdtg->closid))
- continue;
-
- seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
- rdtg->kn->name);
- seq_puts(s, "mon:");
- list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
- mon.crdtgrp_list) {
- if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
- crg->mon.rmid))
- continue;
- seq_printf(s, "%s", crg->kn->name);
- break;
- }
- seq_putc(s, '\n');
- goto unlock;
- }
- /*
- * The above search should succeed. Otherwise return
- * with an error.
- */
- ret = -ENOENT;
-unlock:
- mutex_unlock(&rdtgroup_mutex);
-
- return ret;
-}
-#endif
-
-static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- int len;
-
- mutex_lock(&rdtgroup_mutex);
- len = seq_buf_used(&last_cmd_status);
- if (len)
- seq_printf(seq, "%.*s", len, last_cmd_status_buf);
- else
- seq_puts(seq, "ok\n");
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static int rdt_num_closids_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
-
- seq_printf(seq, "%u\n", s->num_closid);
- return 0;
-}
-
-static int rdt_default_ctrl_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%x\n", r->default_ctrl);
- return 0;
-}
-
-static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
- return 0;
-}
-
-static int rdt_shareable_bits_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%x\n", r->cache.shareable_bits);
- return 0;
-}
-
-/*
- * rdt_bit_usage_show - Display current usage of resources
- *
- * A domain is a shared resource that can now be allocated differently. Here
- * we display the current regions of the domain as an annotated bitmask.
- * For each domain of this resource its allocation bitmask
- * is annotated as below to indicate the current usage of the corresponding bit:
- * 0 - currently unused
- * X - currently available for sharing and used by software and hardware
- * H - currently used by hardware only but available for software use
- * S - currently used and shareable by software only
- * E - currently used exclusively by one resource group
- * P - currently pseudo-locked by one resource group
- */
-static int rdt_bit_usage_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- /*
- * Use unsigned long even though only 32 bits are used to ensure
- * test_bit() is used safely.
- */
- unsigned long sw_shareable = 0, hw_shareable = 0;
- unsigned long exclusive = 0, pseudo_locked = 0;
- struct rdt_resource *r = s->res;
- struct rdt_ctrl_domain *dom;
- int i, hwb, swb, excl, psl;
- enum rdtgrp_mode mode;
- bool sep = false;
- u32 ctrl_val;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
- hw_shareable = r->cache.shareable_bits;
- list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
- if (sep)
- seq_putc(seq, ';');
- sw_shareable = 0;
- exclusive = 0;
- seq_printf(seq, "%d=", dom->hdr.id);
- for (i = 0; i < closids_supported(); i++) {
- if (!closid_allocated(i))
- continue;
- ctrl_val = resctrl_arch_get_config(r, dom, i,
- s->conf_type);
- mode = rdtgroup_mode_by_closid(i);
- switch (mode) {
- case RDT_MODE_SHAREABLE:
- sw_shareable |= ctrl_val;
- break;
- case RDT_MODE_EXCLUSIVE:
- exclusive |= ctrl_val;
- break;
- case RDT_MODE_PSEUDO_LOCKSETUP:
- /*
- * RDT_MODE_PSEUDO_LOCKSETUP is possible
- * here but not included since the CBM
- * associated with this CLOSID in this mode
- * is not initialized and no task or cpu can be
- * assigned this CLOSID.
- */
- break;
- case RDT_MODE_PSEUDO_LOCKED:
- case RDT_NUM_MODES:
- WARN(1,
- "invalid mode for closid %d\n", i);
- break;
- }
- }
- for (i = r->cache.cbm_len - 1; i >= 0; i--) {
- pseudo_locked = dom->plr ? dom->plr->cbm : 0;
- hwb = test_bit(i, &hw_shareable);
- swb = test_bit(i, &sw_shareable);
- excl = test_bit(i, &exclusive);
- psl = test_bit(i, &pseudo_locked);
- if (hwb && swb)
- seq_putc(seq, 'X');
- else if (hwb && !swb)
- seq_putc(seq, 'H');
- else if (!hwb && swb)
- seq_putc(seq, 'S');
- else if (excl)
- seq_putc(seq, 'E');
- else if (psl)
- seq_putc(seq, 'P');
- else /* Unused bits remain */
- seq_putc(seq, '0');
- }
- sep = true;
- }
- seq_putc(seq, '\n');
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
- return 0;
-}
-
-static int rdt_min_bw_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.min_bw);
- return 0;
-}
-
-static int rdt_num_rmids_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = of->kn->parent->priv;
-
- seq_printf(seq, "%d\n", r->num_rmid);
-
- return 0;
-}
-
-static int rdt_mon_features_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = of->kn->parent->priv;
- struct mon_evt *mevt;
-
- list_for_each_entry(mevt, &r->evt_list, list) {
- seq_printf(seq, "%s\n", mevt->name);
- if (mevt->configurable)
- seq_printf(seq, "%s_config\n", mevt->name);
- }
-
- return 0;
-}
-
-static int rdt_bw_gran_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.bw_gran);
- return 0;
-}
-
-static int rdt_delay_linear_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.delay_linear);
- return 0;
-}
-
-static int max_threshold_occ_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
-
- return 0;
-}
-
-static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
- seq_puts(seq, "per-thread\n");
- else
- seq_puts(seq, "max\n");
-
- return 0;
-}
-
-static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- unsigned int bytes;
- int ret;
-
- ret = kstrtouint(buf, 0, &bytes);
- if (ret)
- return ret;
-
- if (bytes > resctrl_rmid_realloc_limit)
- return -EINVAL;
-
- resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
-
- return nbytes;
-}
-
-/*
- * rdtgroup_mode_show - Display mode of this resource group
- */
-static int rdtgroup_mode_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
-
- rdtgroup_kn_unlock(of->kn);
- return 0;
-}
-
-static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
-{
- switch (my_type) {
- case CDP_CODE:
- return CDP_DATA;
- case CDP_DATA:
- return CDP_CODE;
- default:
- case CDP_NONE:
- return CDP_NONE;
- }
-}
-
-static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
-
- return 0;
-}
-
-/**
- * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
- * @r: Resource to which domain instance @d belongs.
- * @d: The domain instance for which @closid is being tested.
- * @cbm: Capacity bitmask being tested.
- * @closid: Intended closid for @cbm.
- * @type: CDP type of @r.
- * @exclusive: Only check if overlaps with exclusive resource groups
- *
- * Checks if provided @cbm intended to be used for @closid on domain
- * @d overlaps with any other closids or other hardware usage associated
- * with this domain. If @exclusive is true then only overlaps with
- * resource groups in exclusive mode will be considered. If @exclusive
- * is false then overlaps with any resource group or hardware entities
- * will be considered.
- *
- * @cbm is unsigned long, even if only 32 bits are used, to make the
- * bitmap functions work correctly.
- *
- * Return: false if CBM does not overlap, true if it does.
- */
-static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
- unsigned long cbm, int closid,
- enum resctrl_conf_type type, bool exclusive)
-{
- enum rdtgrp_mode mode;
- unsigned long ctrl_b;
- int i;
-
- /* Check for any overlap with regions used by hardware directly */
- if (!exclusive) {
- ctrl_b = r->cache.shareable_bits;
- if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
- return true;
- }
-
- /* Check for overlap with other resource groups */
- for (i = 0; i < closids_supported(); i++) {
- ctrl_b = resctrl_arch_get_config(r, d, i, type);
- mode = rdtgroup_mode_by_closid(i);
- if (closid_allocated(i) && i != closid &&
- mode != RDT_MODE_PSEUDO_LOCKSETUP) {
- if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
- if (exclusive) {
- if (mode == RDT_MODE_EXCLUSIVE)
- return true;
- continue;
- }
- return true;
- }
- }
- }
-
- return false;
-}
-
-/**
- * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
- * @s: Schema for the resource to which domain instance @d belongs.
- * @d: The domain instance for which @closid is being tested.
- * @cbm: Capacity bitmask being tested.
- * @closid: Intended closid for @cbm.
- * @exclusive: Only check if overlaps with exclusive resource groups
- *
- * Resources that can be allocated using a CBM can use the CBM to control
- * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
- * for overlap. Overlap test is not limited to the specific resource for
- * which the CBM is intended though - when dealing with CDP resources that
- * share the underlying hardware the overlap check should be performed on
- * the CDP resource sharing the hardware also.
- *
- * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
- * overlap test.
- *
- * Return: true if CBM overlap detected, false if there is no overlap
- */
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
- unsigned long cbm, int closid, bool exclusive)
-{
- enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
- struct rdt_resource *r = s->res;
-
- if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
- exclusive))
- return true;
-
- if (!resctrl_arch_get_cdp_enabled(r->rid))
- return false;
- return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
-}
-
-/**
- * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
- * @rdtgrp: Resource group identified through its closid.
- *
- * An exclusive resource group implies that there should be no sharing of
- * its allocated resources. At the time this group is considered to be
- * exclusive this test can determine if its current schemata supports this
- * setting by testing for overlap with all other resource groups.
- *
- * Return: true if resource group can be exclusive, false if there is overlap
- * with allocations of other resource groups and thus this resource group
- * cannot be exclusive.
- */
-static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
-{
- int closid = rdtgrp->closid;
- struct rdt_ctrl_domain *d;
- struct resctrl_schema *s;
- struct rdt_resource *r;
- bool has_cache = false;
- u32 ctrl;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
- continue;
- has_cache = true;
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- ctrl = resctrl_arch_get_config(r, d, closid,
- s->conf_type);
- if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
- rdt_last_cmd_puts("Schemata overlaps\n");
- return false;
- }
- }
- }
-
- if (!has_cache) {
- rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
- return false;
- }
-
- return true;
-}
-
-/*
- * rdtgroup_mode_write - Modify the resource group's mode
- */
-static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdtgroup *rdtgrp;
- enum rdtgrp_mode mode;
- int ret = 0;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
- buf[nbytes - 1] = '\0';
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- rdt_last_cmd_clear();
-
- mode = rdtgrp->mode;
-
- if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
- (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
- (!strcmp(buf, "pseudo-locksetup") &&
- mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
- (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
- goto out;
-
- if (mode == RDT_MODE_PSEUDO_LOCKED) {
- rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
- ret = -EINVAL;
- goto out;
- }
-
- if (!strcmp(buf, "shareable")) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = rdtgroup_locksetup_exit(rdtgrp);
- if (ret)
- goto out;
- }
- rdtgrp->mode = RDT_MODE_SHAREABLE;
- } else if (!strcmp(buf, "exclusive")) {
- if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
- ret = -EINVAL;
- goto out;
- }
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = rdtgroup_locksetup_exit(rdtgrp);
- if (ret)
- goto out;
- }
- rdtgrp->mode = RDT_MODE_EXCLUSIVE;
- } else if (!strcmp(buf, "pseudo-locksetup")) {
- ret = rdtgroup_locksetup_enter(rdtgrp);
- if (ret)
- goto out;
- rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
- } else {
- rdt_last_cmd_puts("Unknown or unsupported mode\n");
- ret = -EINVAL;
- }
-
-out:
- rdtgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-/**
- * rdtgroup_cbm_to_size - Translate CBM to size in bytes
- * @r: RDT resource to which @d belongs.
- * @d: RDT domain instance.
- * @cbm: bitmask for which the size should be computed.
- *
- * The bitmask provided associated with the RDT domain instance @d will be
- * translated into how many bytes it represents. The size in bytes is
- * computed by first dividing the total cache size by the CBM length to
- * determine how many bytes each bit in the bitmask represents. The result
- * is multiplied with the number of bits set in the bitmask.
- *
- * @cbm is unsigned long, even if only 32 bits are used to make the
- * bitmap functions work correctly.
- */
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
- struct rdt_ctrl_domain *d, unsigned long cbm)
-{
- unsigned int size = 0;
- struct cacheinfo *ci;
- int num_b;
-
- if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
- return size;
-
- num_b = bitmap_weight(&cbm, r->cache.cbm_len);
- ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
- if (ci)
- size = ci->size / r->cache.cbm_len * num_b;
-
- return size;
-}
-
-/*
- * rdtgroup_size_show - Display size in bytes of allocated regions
- *
- * The "size" file mirrors the layout of the "schemata" file, printing the
- * size in bytes of each region instead of the capacity bitmask.
- */
-static int rdtgroup_size_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct resctrl_schema *schema;
- enum resctrl_conf_type type;
- struct rdt_ctrl_domain *d;
- struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
- unsigned int size;
- int ret = 0;
- u32 closid;
- bool sep;
- u32 ctrl;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- seq_printf(s, "%*s:", max_name_width,
- rdtgrp->plr->s->name);
- size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
- rdtgrp->plr->d,
- rdtgrp->plr->cbm);
- seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
- }
- goto out;
- }
-
- closid = rdtgrp->closid;
-
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- r = schema->res;
- type = schema->conf_type;
- sep = false;
- seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- if (sep)
- seq_putc(s, ';');
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- size = 0;
- } else {
- if (is_mba_sc(r))
- ctrl = d->mbps_val[closid];
- else
- ctrl = resctrl_arch_get_config(r, d,
- closid,
- type);
- if (r->rid == RDT_RESOURCE_MBA ||
- r->rid == RDT_RESOURCE_SMBA)
- size = ctrl;
- else
- size = rdtgroup_cbm_to_size(r, d, ctrl);
- }
- seq_printf(s, "%d=%u", d->hdr.id, size);
- sep = true;
- }
- seq_putc(s, '\n');
- }
-
-out:
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-struct mon_config_info {
- u32 evtid;
- u32 mon_config;
-};
-
#define INVALID_CONFIG_INDEX UINT_MAX
/**
@@ -1581,681 +85,48 @@ static inline unsigned int mon_event_config_index_get(u32 evtid)
}
}
-static void mon_event_config_read(void *info)
+void resctrl_arch_mon_event_config_read(void *_config_info)
{
- struct mon_config_info *mon_info = info;
+ struct resctrl_mon_config_info *config_info = _config_info;
unsigned int index;
u64 msrval;
- index = mon_event_config_index_get(mon_info->evtid);
+ index = mon_event_config_index_get(config_info->evtid);
if (index == INVALID_CONFIG_INDEX) {
- pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
}
- rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
+ rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval);
/* Report only the valid event configuration bits */
- mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
-}
-
-static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info)
-{
- smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1);
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
}
-static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
+void resctrl_arch_mon_event_config_write(void *_config_info)
{
- struct mon_config_info mon_info;
- struct rdt_mon_domain *dom;
- bool sep = false;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- list_for_each_entry(dom, &r->mon_domains, hdr.list) {
- if (sep)
- seq_puts(s, ";");
-
- memset(&mon_info, 0, sizeof(struct mon_config_info));
- mon_info.evtid = evtid;
- mondata_config_read(dom, &mon_info);
-
- seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
- sep = true;
- }
- seq_puts(s, "\n");
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-
- return 0;
-}
-
-static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = of->kn->parent->priv;
-
- mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
-
- return 0;
-}
-
-static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = of->kn->parent->priv;
-
- mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
-
- return 0;
-}
-
-static void mon_event_config_write(void *info)
-{
- struct mon_config_info *mon_info = info;
+ struct resctrl_mon_config_info *config_info = _config_info;
unsigned int index;
- index = mon_event_config_index_get(mon_info->evtid);
+ index = mon_event_config_index_get(config_info->evtid);
if (index == INVALID_CONFIG_INDEX) {
- pr_warn_once("Invalid event id %d\n", mon_info->evtid);
- return;
- }
- wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
-}
-
-static void mbm_config_write_domain(struct rdt_resource *r,
- struct rdt_mon_domain *d, u32 evtid, u32 val)
-{
- struct mon_config_info mon_info = {0};
-
- /*
- * Read the current config value first. If both are the same then
- * no need to write it again.
- */
- mon_info.evtid = evtid;
- mondata_config_read(d, &mon_info);
- if (mon_info.mon_config == val)
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
-
- mon_info.mon_config = val;
-
- /*
- * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
- * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
- * are scoped at the domain level. Writing any of these MSRs
- * on one CPU is observed by all the CPUs in the domain.
- */
- smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write,
- &mon_info, 1);
-
- /*
- * When an Event Configuration is changed, the bandwidth counters
- * for all RMIDs and Events will be cleared by the hardware. The
- * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
- * every RMID on the next read to any event for every RMID.
- * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
- * cleared while it is tracked by the hardware. Clear the
- * mbm_local and mbm_total counts for all the RMIDs.
- */
- resctrl_arch_reset_rmid_all(r, d);
-}
-
-static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
-{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- char *dom_str = NULL, *id_str;
- unsigned long dom_id, val;
- struct rdt_mon_domain *d;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
-next:
- if (!tok || tok[0] == '\0')
- return 0;
-
- /* Start processing the strings for each domain */
- dom_str = strim(strsep(&tok, ";"));
- id_str = strsep(&dom_str, "=");
-
- if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
- rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
- return -EINVAL;
- }
-
- if (!dom_str || kstrtoul(dom_str, 16, &val)) {
- rdt_last_cmd_puts("Non-numeric event configuration value\n");
- return -EINVAL;
}
-
- /* Value from user cannot be more than the supported set of events */
- if ((val & hw_res->mbm_cfg_mask) != val) {
- rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
- hw_res->mbm_cfg_mask);
- return -EINVAL;
- }
-
- list_for_each_entry(d, &r->mon_domains, hdr.list) {
- if (d->hdr.id == dom_id) {
- mbm_config_write_domain(r, d, evtid, val);
- goto next;
- }
- }
-
- return -EINVAL;
-}
-
-static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
-{
- struct rdt_resource *r = of->kn->parent->priv;
- int ret;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- rdt_last_cmd_clear();
-
- buf[nbytes - 1] = '\0';
-
- ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-
- return ret ?: nbytes;
-}
-
-static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
-{
- struct rdt_resource *r = of->kn->parent->priv;
- int ret;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- rdt_last_cmd_clear();
-
- buf[nbytes - 1] = '\0';
-
- ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-
- return ret ?: nbytes;
-}
-
-/* rdtgroup information files for one cache resource. */
-static struct rftype res_common_files[] = {
- {
- .name = "last_cmd_status",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_last_cmd_status_show,
- .fflags = RFTYPE_TOP_INFO,
- },
- {
- .name = "num_closids",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_num_closids_show,
- .fflags = RFTYPE_CTRL_INFO,
- },
- {
- .name = "mon_features",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_mon_features_show,
- .fflags = RFTYPE_MON_INFO,
- },
- {
- .name = "num_rmids",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_num_rmids_show,
- .fflags = RFTYPE_MON_INFO,
- },
- {
- .name = "cbm_mask",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_default_ctrl_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "min_cbm_bits",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_min_cbm_bits_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "shareable_bits",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_shareable_bits_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "bit_usage",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_bit_usage_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "min_bandwidth",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_min_bw_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
- },
- {
- .name = "bandwidth_gran",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_bw_gran_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
- },
- {
- .name = "delay_linear",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_delay_linear_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
- },
- /*
- * Platform specific which (if any) capabilities are provided by
- * thread_throttle_mode. Defer "fflags" initialization to platform
- * discovery.
- */
- {
- .name = "thread_throttle_mode",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_thread_throttle_mode_show,
- },
- {
- .name = "max_threshold_occupancy",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = max_threshold_occ_write,
- .seq_show = max_threshold_occ_show,
- .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "mbm_total_bytes_config",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = mbm_total_bytes_config_show,
- .write = mbm_total_bytes_config_write,
- },
- {
- .name = "mbm_local_bytes_config",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = mbm_local_bytes_config_show,
- .write = mbm_local_bytes_config_write,
- },
- {
- .name = "cpus",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "cpus_list",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- .flags = RFTYPE_FLAGS_CPUS_LIST,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "tasks",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_tasks_write,
- .seq_show = rdtgroup_tasks_show,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "mon_hw_id",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdtgroup_rmid_show,
- .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG,
- },
- {
- .name = "schemata",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_schemata_write,
- .seq_show = rdtgroup_schemata_show,
- .fflags = RFTYPE_CTRL_BASE,
- },
- {
- .name = "mba_MBps_event",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_mba_mbps_event_write,
- .seq_show = rdtgroup_mba_mbps_event_show,
- },
- {
- .name = "mode",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_mode_write,
- .seq_show = rdtgroup_mode_show,
- .fflags = RFTYPE_CTRL_BASE,
- },
- {
- .name = "size",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdtgroup_size_show,
- .fflags = RFTYPE_CTRL_BASE,
- },
- {
- .name = "sparse_masks",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_has_sparse_bitmasks_show,
- .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "ctrl_hw_id",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdtgroup_closid_show,
- .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
- },
-
-};
-
-static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
-{
- struct rftype *rfts, *rft;
- int ret, len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- if (resctrl_debug)
- fflags |= RFTYPE_DEBUG;
-
- for (rft = rfts; rft < rfts + len; rft++) {
- if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
- ret = rdtgroup_add_file(kn, rft);
- if (ret)
- goto error;
- }
- }
-
- return 0;
-error:
- pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
- while (--rft >= rfts) {
- if ((fflags & rft->fflags) == rft->fflags)
- kernfs_remove_by_name(kn, rft->name);
- }
- return ret;
-}
-
-static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
-{
- struct rftype *rfts, *rft;
- int len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
-
- for (rft = rfts; rft < rfts + len; rft++) {
- if (!strcmp(rft->name, name))
- return rft;
- }
-
- return NULL;
-}
-
-void resctrl_file_fflags_init(const char *config, unsigned long fflags)
-{
- struct rftype *rft;
-
- rft = rdtgroup_get_rftype_by_name(config);
- if (rft)
- rft->fflags = fflags;
-}
-
-/**
- * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
- * @r: The resource group with which the file is associated.
- * @name: Name of the file
- *
- * The permissions of named resctrl file, directory, or link are modified
- * to not allow read, write, or execute by any user.
- *
- * WARNING: This function is intended to communicate to the user that the
- * resctrl file has been locked down - that it is not relevant to the
- * particular state the system finds itself in. It should not be relied
- * on to protect from user access because after the file's permissions
- * are restricted the user can still change the permissions using chmod
- * from the command line.
- *
- * Return: 0 on success, <0 on failure.
- */
-int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
-{
- struct iattr iattr = {.ia_valid = ATTR_MODE,};
- struct kernfs_node *kn;
- int ret = 0;
-
- kn = kernfs_find_and_get_ns(r->kn, name, NULL);
- if (!kn)
- return -ENOENT;
-
- switch (kernfs_type(kn)) {
- case KERNFS_DIR:
- iattr.ia_mode = S_IFDIR;
- break;
- case KERNFS_FILE:
- iattr.ia_mode = S_IFREG;
- break;
- case KERNFS_LINK:
- iattr.ia_mode = S_IFLNK;
- break;
- }
-
- ret = kernfs_setattr(kn, &iattr);
- kernfs_put(kn);
- return ret;
-}
-
-/**
- * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
- * @r: The resource group with which the file is associated.
- * @name: Name of the file
- * @mask: Mask of permissions that should be restored
- *
- * Restore the permissions of the named file. If @name is a directory the
- * permissions of its parent will be used.
- *
- * Return: 0 on success, <0 on failure.
- */
-int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
- umode_t mask)
-{
- struct iattr iattr = {.ia_valid = ATTR_MODE,};
- struct kernfs_node *kn, *parent;
- struct rftype *rfts, *rft;
- int ret, len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
-
- for (rft = rfts; rft < rfts + len; rft++) {
- if (!strcmp(rft->name, name))
- iattr.ia_mode = rft->mode & mask;
- }
-
- kn = kernfs_find_and_get_ns(r->kn, name, NULL);
- if (!kn)
- return -ENOENT;
-
- switch (kernfs_type(kn)) {
- case KERNFS_DIR:
- parent = kernfs_get_parent(kn);
- if (parent) {
- iattr.ia_mode |= parent->mode;
- kernfs_put(parent);
- }
- iattr.ia_mode |= S_IFDIR;
- break;
- case KERNFS_FILE:
- iattr.ia_mode |= S_IFREG;
- break;
- case KERNFS_LINK:
- iattr.ia_mode |= S_IFLNK;
- break;
- }
-
- ret = kernfs_setattr(kn, &iattr);
- kernfs_put(kn);
- return ret;
-}
-
-static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
- unsigned long fflags)
-{
- struct kernfs_node *kn_subdir;
- int ret;
-
- kn_subdir = kernfs_create_dir(kn_info, name,
- kn_info->mode, priv);
- if (IS_ERR(kn_subdir))
- return PTR_ERR(kn_subdir);
-
- ret = rdtgroup_kn_set_ugid(kn_subdir);
- if (ret)
- return ret;
-
- ret = rdtgroup_add_files(kn_subdir, fflags);
- if (!ret)
- kernfs_activate(kn_subdir);
-
- return ret;
-}
-
-static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
-{
- struct resctrl_schema *s;
- struct rdt_resource *r;
- unsigned long fflags;
- char name[32];
- int ret;
-
- /* create the directory */
- kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
- if (IS_ERR(kn_info))
- return PTR_ERR(kn_info);
-
- ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
- if (ret)
- goto out_destroy;
-
- /* loop over enabled controls, these are all alloc_capable */
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- fflags = r->fflags | RFTYPE_CTRL_INFO;
- ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
- if (ret)
- goto out_destroy;
- }
-
- for_each_mon_capable_rdt_resource(r) {
- fflags = r->fflags | RFTYPE_MON_INFO;
- sprintf(name, "%s_MON", r->name);
- ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
- if (ret)
- goto out_destroy;
- }
-
- ret = rdtgroup_kn_set_ugid(kn_info);
- if (ret)
- goto out_destroy;
-
- kernfs_activate(kn_info);
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn_info);
- return ret;
-}
-
-static int
-mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
- char *name, struct kernfs_node **dest_kn)
-{
- struct kernfs_node *kn;
- int ret;
-
- /* create the directory */
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- if (dest_kn)
- *dest_kn = kn;
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
-
- kernfs_activate(kn);
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
+ wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config);
}
static void l3_qos_cfg_update(void *arg)
{
bool *enable = arg;
- wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+ wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
}
static void l2_qos_cfg_update(void *arg)
{
bool *enable = arg;
- wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
-}
-
-static inline bool is_mba_linear(void)
-{
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear;
+ wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
}
static int set_cache_qos_cfg(int level, bool enable)
@@ -2313,76 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
l3_qos_cfg_update(&hw_res->cdp_enabled);
}
-static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
- u32 num_closid = resctrl_arch_get_num_closid(r);
- int cpu = cpumask_any(&d->hdr.cpu_mask);
- int i;
-
- d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!d->mbps_val)
- return -ENOMEM;
-
- for (i = 0; i < num_closid; i++)
- d->mbps_val[i] = MBA_MAX_MBPS;
-
- return 0;
-}
-
-static void mba_sc_domain_destroy(struct rdt_resource *r,
- struct rdt_ctrl_domain *d)
-{
- kfree(d->mbps_val);
- d->mbps_val = NULL;
-}
-
-/*
- * MBA software controller is supported only if
- * MBM is supported and MBA is in linear scale,
- * and the MBM monitor scope is the same as MBA
- * control scope.
- */
-static bool supports_mba_mbps(void)
-{
- struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
-
- return (is_mbm_enabled() &&
- r->alloc_capable && is_mba_linear() &&
- r->ctrl_scope == rmbm->mon_scope);
-}
-
-/*
- * Enable or disable the MBA software controller
- * which helps user specify bandwidth in MBps.
- */
-static int set_mba_sc(bool mba_sc)
-{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
- u32 num_closid = resctrl_arch_get_num_closid(r);
- struct rdt_ctrl_domain *d;
- unsigned long fflags;
- int i;
-
- if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
- return -EINVAL;
-
- r->membw.mba_sc = mba_sc;
-
- rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
-
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- for (i = 0; i < num_closid; i++)
- d->mbps_val[i] = MBA_MAX_MBPS;
- }
-
- fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
- resctrl_file_fflags_init("mba_MBps_event", fflags);
-
- return 0;
-}
-
static int cdp_enable(int level)
{
struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
@@ -2423,407 +224,12 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
return 0;
}
-/*
- * We don't allow rdtgroup directories to be created anywhere
- * except the root directory. Thus when looking for the rdtgroup
- * structure for a kernfs node we are either looking at a directory,
- * in which case the rdtgroup structure is pointed at by the "priv"
- * field, otherwise we have a file, and need only look to the parent
- * to find the rdtgroup.
- */
-static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
-{
- if (kernfs_type(kn) == KERNFS_DIR) {
- /*
- * All the resource directories use "kn->priv"
- * to point to the "struct rdtgroup" for the
- * resource. "info" and its subdirectories don't
- * have rdtgroup structures, so return NULL here.
- */
- if (kn == kn_info || kn->parent == kn_info)
- return NULL;
- else
- return kn->priv;
- } else {
- return kn->parent->priv;
- }
-}
-
-static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
-{
- atomic_inc(&rdtgrp->waitcount);
- kernfs_break_active_protection(kn);
-}
-
-static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
-{
- if (atomic_dec_and_test(&rdtgrp->waitcount) &&
- (rdtgrp->flags & RDT_DELETED)) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
- rdtgroup_pseudo_lock_remove(rdtgrp);
- kernfs_unbreak_active_protection(kn);
- rdtgroup_remove(rdtgrp);
- } else {
- kernfs_unbreak_active_protection(kn);
- }
-}
-
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
-{
- struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
-
- if (!rdtgrp)
- return NULL;
-
- rdtgroup_kn_get(rdtgrp, kn);
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- /* Was this group deleted while we waited? */
- if (rdtgrp->flags & RDT_DELETED)
- return NULL;
-
- return rdtgrp;
-}
-
-void rdtgroup_kn_unlock(struct kernfs_node *kn)
-{
- struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
-
- if (!rdtgrp)
- return;
-
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-
- rdtgroup_kn_put(rdtgrp, kn);
-}
-
-static int mkdir_mondata_all(struct kernfs_node *parent_kn,
- struct rdtgroup *prgrp,
- struct kernfs_node **mon_data_kn);
-
-static void rdt_disable_ctx(void)
-{
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
- set_mba_sc(false);
-
- resctrl_debug = false;
-}
-
-static int rdt_enable_ctx(struct rdt_fs_context *ctx)
-{
- int ret = 0;
-
- if (ctx->enable_cdpl2) {
- ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
- if (ret)
- goto out_done;
- }
-
- if (ctx->enable_cdpl3) {
- ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
- if (ret)
- goto out_cdpl2;
- }
-
- if (ctx->enable_mba_mbps) {
- ret = set_mba_sc(true);
- if (ret)
- goto out_cdpl3;
- }
-
- if (ctx->enable_debug)
- resctrl_debug = true;
-
- return 0;
-
-out_cdpl3:
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
-out_cdpl2:
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-out_done:
- return ret;
-}
-
-static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
-{
- struct resctrl_schema *s;
- const char *suffix = "";
- int ret, cl;
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- s->res = r;
- s->num_closid = resctrl_arch_get_num_closid(r);
- if (resctrl_arch_get_cdp_enabled(r->rid))
- s->num_closid /= 2;
-
- s->conf_type = type;
- switch (type) {
- case CDP_CODE:
- suffix = "CODE";
- break;
- case CDP_DATA:
- suffix = "DATA";
- break;
- case CDP_NONE:
- suffix = "";
- break;
- }
-
- ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
- if (ret >= sizeof(s->name)) {
- kfree(s);
- return -EINVAL;
- }
-
- cl = strlen(s->name);
-
- /*
- * If CDP is supported by this resource, but not enabled,
- * include the suffix. This ensures the tabular format of the
- * schemata file does not change between mounts of the filesystem.
- */
- if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
- cl += 4;
-
- if (cl > max_name_width)
- max_name_width = cl;
-
- INIT_LIST_HEAD(&s->list);
- list_add(&s->list, &resctrl_schema_all);
-
- return 0;
-}
-
-static int schemata_list_create(void)
-{
- struct rdt_resource *r;
- int ret = 0;
-
- for_each_alloc_capable_rdt_resource(r) {
- if (resctrl_arch_get_cdp_enabled(r->rid)) {
- ret = schemata_list_add(r, CDP_CODE);
- if (ret)
- break;
-
- ret = schemata_list_add(r, CDP_DATA);
- } else {
- ret = schemata_list_add(r, CDP_NONE);
- }
-
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static void schemata_list_destroy(void)
-{
- struct resctrl_schema *s, *tmp;
-
- list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
- list_del(&s->list);
- kfree(s);
- }
-}
-
-static int rdt_get_tree(struct fs_context *fc)
-{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
- unsigned long flags = RFTYPE_CTRL_BASE;
- struct rdt_mon_domain *dom;
- struct rdt_resource *r;
- int ret;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
- /*
- * resctrl file system can only be mounted once.
- */
- if (resctrl_mounted) {
- ret = -EBUSY;
- goto out;
- }
-
- ret = rdtgroup_setup_root(ctx);
- if (ret)
- goto out;
-
- ret = rdt_enable_ctx(ctx);
- if (ret)
- goto out_root;
-
- ret = schemata_list_create();
- if (ret) {
- schemata_list_destroy();
- goto out_ctx;
- }
-
- closid_init();
-
- if (resctrl_arch_mon_capable())
- flags |= RFTYPE_MON;
-
- ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
- if (ret)
- goto out_schemata_free;
-
- kernfs_activate(rdtgroup_default.kn);
-
- ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
- if (ret < 0)
- goto out_schemata_free;
-
- if (resctrl_arch_mon_capable()) {
- ret = mongroup_create_dir(rdtgroup_default.kn,
- &rdtgroup_default, "mon_groups",
- &kn_mongrp);
- if (ret < 0)
- goto out_info;
-
- ret = mkdir_mondata_all(rdtgroup_default.kn,
- &rdtgroup_default, &kn_mondata);
- if (ret < 0)
- goto out_mongrp;
- rdtgroup_default.mon.mon_data_kn = kn_mondata;
- }
-
- ret = rdt_pseudo_lock_init();
- if (ret)
- goto out_mondata;
-
- ret = kernfs_get_tree(fc);
- if (ret < 0)
- goto out_psl;
-
- if (resctrl_arch_alloc_capable())
- resctrl_arch_enable_alloc();
- if (resctrl_arch_mon_capable())
- resctrl_arch_enable_mon();
-
- if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
- resctrl_mounted = true;
-
- if (is_mbm_enabled()) {
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- list_for_each_entry(dom, &r->mon_domains, hdr.list)
- mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
- RESCTRL_PICK_ANY_CPU);
- }
-
- goto out;
-
-out_psl:
- rdt_pseudo_lock_release();
-out_mondata:
- if (resctrl_arch_mon_capable())
- kernfs_remove(kn_mondata);
-out_mongrp:
- if (resctrl_arch_mon_capable())
- kernfs_remove(kn_mongrp);
-out_info:
- kernfs_remove(kn_info);
-out_schemata_free:
- schemata_list_destroy();
-out_ctx:
- rdt_disable_ctx();
-out_root:
- rdtgroup_destroy_root();
-out:
- rdt_last_cmd_clear();
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
- return ret;
-}
-
-enum rdt_param {
- Opt_cdp,
- Opt_cdpl2,
- Opt_mba_mbps,
- Opt_debug,
- nr__rdt_params
-};
-
-static const struct fs_parameter_spec rdt_fs_parameters[] = {
- fsparam_flag("cdp", Opt_cdp),
- fsparam_flag("cdpl2", Opt_cdpl2),
- fsparam_flag("mba_MBps", Opt_mba_mbps),
- fsparam_flag("debug", Opt_debug),
- {}
-};
-
-static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
- struct fs_parse_result result;
- const char *msg;
- int opt;
-
- opt = fs_parse(fc, rdt_fs_parameters, param, &result);
- if (opt < 0)
- return opt;
-
- switch (opt) {
- case Opt_cdp:
- ctx->enable_cdpl3 = true;
- return 0;
- case Opt_cdpl2:
- ctx->enable_cdpl2 = true;
- return 0;
- case Opt_mba_mbps:
- msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
- if (!supports_mba_mbps())
- return invalfc(fc, msg);
- ctx->enable_mba_mbps = true;
- return 0;
- case Opt_debug:
- ctx->enable_debug = true;
- return 0;
- }
-
- return -EINVAL;
-}
-
-static void rdt_fs_context_free(struct fs_context *fc)
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
-
- kernfs_free_fs_context(fc);
- kfree(ctx);
+ return rdt_resources_all[l].cdp_enabled;
}
-static const struct fs_context_operations rdt_fs_context_ops = {
- .free = rdt_fs_context_free,
- .parse_param = rdt_parse_param,
- .get_tree = rdt_get_tree,
-};
-
-static int rdt_init_fs_context(struct fs_context *fc)
-{
- struct rdt_fs_context *ctx;
-
- ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
-
- ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
- fc->fs_private = &ctx->kfc;
- fc->ops = &rdt_fs_context_ops;
- put_user_ns(fc->user_ns);
- fc->user_ns = get_user_ns(&init_user_ns);
- fc->global = true;
- return 0;
-}
-
-static int reset_all_ctrls(struct rdt_resource *r)
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_hw_ctrl_domain *hw_dom;
@@ -2847,1400 +253,10 @@ static int reset_all_ctrls(struct rdt_resource *r)
hw_dom = resctrl_to_arch_ctrl_dom(d);
for (i = 0; i < hw_res->num_closid; i++)
- hw_dom->ctrl_val[i] = r->default_ctrl;
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
msr_param.dom = d;
smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- return 0;
-}
-
-/*
- * Move tasks from one to the other group. If @from is NULL, then all tasks
- * in the systems are moved unconditionally (used for teardown).
- *
- * If @mask is not NULL the cpus on which moved tasks are running are set
- * in that mask so the update smp function call is restricted to affected
- * cpus.
- */
-static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
- struct cpumask *mask)
-{
- struct task_struct *p, *t;
-
- read_lock(&tasklist_lock);
- for_each_process_thread(p, t) {
- if (!from || is_closid_match(t, from) ||
- is_rmid_match(t, from)) {
- resctrl_arch_set_closid_rmid(t, to->closid,
- to->mon.rmid);
-
- /*
- * Order the closid/rmid stores above before the loads
- * in task_curr(). This pairs with the full barrier
- * between the rq->curr update and resctrl_sched_in()
- * during context switch.
- */
- smp_mb();
-
- /*
- * If the task is on a CPU, set the CPU in the mask.
- * The detection is inaccurate as tasks might move or
- * schedule before the smp function call takes place.
- * In such a case the function call is pointless, but
- * there is no other side effect.
- */
- if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
- cpumask_set_cpu(task_cpu(t), mask);
- }
- }
- read_unlock(&tasklist_lock);
-}
-
-static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
-{
- struct rdtgroup *sentry, *stmp;
- struct list_head *head;
-
- head = &rdtgrp->mon.crdtgrp_list;
- list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
- free_rmid(sentry->closid, sentry->mon.rmid);
- list_del(&sentry->mon.crdtgrp_list);
-
- if (atomic_read(&sentry->waitcount) != 0)
- sentry->flags = RDT_DELETED;
- else
- rdtgroup_remove(sentry);
- }
-}
-
-/*
- * Forcibly remove all of subdirectories under root.
- */
-static void rmdir_all_sub(void)
-{
- struct rdtgroup *rdtgrp, *tmp;
-
- /* Move all tasks to the default resource group */
- rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
-
- list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
- /* Free any child rmids */
- free_all_child_rdtgrp(rdtgrp);
-
- /* Remove each rdtgroup other than root */
- if (rdtgrp == &rdtgroup_default)
- continue;
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
- rdtgroup_pseudo_lock_remove(rdtgrp);
-
- /*
- * Give any CPUs back to the default group. We cannot copy
- * cpu_online_mask because a CPU might have executed the
- * offline callback already, but is still marked online.
- */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
-
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
- kernfs_remove(rdtgrp->kn);
- list_del(&rdtgrp->rdtgroup_list);
-
- if (atomic_read(&rdtgrp->waitcount) != 0)
- rdtgrp->flags = RDT_DELETED;
- else
- rdtgroup_remove(rdtgrp);
- }
- /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
- update_closid_rmid(cpu_online_mask, &rdtgroup_default);
-
- kernfs_remove(kn_info);
- kernfs_remove(kn_mongrp);
- kernfs_remove(kn_mondata);
-}
-
-static void rdt_kill_sb(struct super_block *sb)
-{
- struct rdt_resource *r;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- rdt_disable_ctx();
-
- /*Put everything back to default values. */
- for_each_alloc_capable_rdt_resource(r)
- reset_all_ctrls(r);
- rmdir_all_sub();
- rdt_pseudo_lock_release();
- rdtgroup_default.mode = RDT_MODE_SHAREABLE;
- schemata_list_destroy();
- rdtgroup_destroy_root();
- if (resctrl_arch_alloc_capable())
- resctrl_arch_disable_alloc();
- if (resctrl_arch_mon_capable())
- resctrl_arch_disable_mon();
- resctrl_mounted = false;
- kernfs_kill_sb(sb);
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-}
-
-static struct file_system_type rdt_fs_type = {
- .name = "resctrl",
- .init_fs_context = rdt_init_fs_context,
- .parameters = rdt_fs_parameters,
- .kill_sb = rdt_kill_sb,
-};
-
-static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
- void *priv)
-{
- struct kernfs_node *kn;
- int ret = 0;
-
- kn = __kernfs_create_file(parent_kn, name, 0444,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
- &kf_mondata_ops, priv, NULL, NULL);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
- return ret;
-}
-
-static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
-{
- struct kernfs_node *kn;
-
- kn = kernfs_find_and_get(pkn, name);
- if (!kn)
- return;
- kernfs_put(kn);
-
- if (kn->dir.subdirs <= 1)
- kernfs_remove(kn);
- else
- kernfs_remove_by_name(kn, subname);
-}
-
-/*
- * Remove all subdirectories of mon_data of ctrl_mon groups
- * and monitor groups for the given domain.
- * Remove files and directories containing "sum" of domain data
- * when last domain being summed is removed.
- */
-static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_mon_domain *d)
-{
- struct rdtgroup *prgrp, *crgrp;
- char subname[32];
- bool snc_mode;
- char name[32];
-
- snc_mode = r->mon_scope == RESCTRL_L3_NODE;
- sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
- if (snc_mode)
- sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
-
- list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
- mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
- }
-}
-
-static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
- struct rdt_resource *r, struct rdtgroup *prgrp,
- bool do_sum)
-{
- struct rmid_read rr = {0};
- union mon_data_bits priv;
- struct mon_evt *mevt;
- int ret;
-
- if (WARN_ON(list_empty(&r->evt_list)))
- return -EPERM;
-
- priv.u.rid = r->rid;
- priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
- priv.u.sum = do_sum;
- list_for_each_entry(mevt, &r->evt_list, list) {
- priv.u.evtid = mevt->evtid;
- ret = mon_addfile(kn, mevt->name, priv.priv);
- if (ret)
- return ret;
-
- if (!do_sum && is_mbm_event(mevt->evtid))
- mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
- }
-
- return 0;
-}
-
-static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
- struct rdt_mon_domain *d,
- struct rdt_resource *r, struct rdtgroup *prgrp)
-{
- struct kernfs_node *kn, *ckn;
- char name[32];
- bool snc_mode;
- int ret = 0;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- snc_mode = r->mon_scope == RESCTRL_L3_NODE;
- sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
- kn = kernfs_find_and_get(parent_kn, name);
- if (kn) {
- /*
- * rdtgroup_mutex will prevent this directory from being
- * removed. No need to keep this hold.
- */
- kernfs_put(kn);
- } else {
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
- ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
- if (ret)
- goto out_destroy;
- }
-
- if (snc_mode) {
- sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
- ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(ckn)) {
- ret = -EINVAL;
- goto out_destroy;
- }
-
- ret = rdtgroup_kn_set_ugid(ckn);
- if (ret)
- goto out_destroy;
-
- ret = mon_add_all_files(ckn, d, r, prgrp, false);
- if (ret)
- goto out_destroy;
- }
-
- kernfs_activate(kn);
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
-}
-
-/*
- * Add all subdirectories of mon_data for "ctrl_mon" groups
- * and "monitor" groups with given domain id.
- */
-static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_mon_domain *d)
-{
- struct kernfs_node *parent_kn;
- struct rdtgroup *prgrp, *crgrp;
- struct list_head *head;
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- parent_kn = prgrp->mon.mon_data_kn;
- mkdir_mondata_subdir(parent_kn, d, r, prgrp);
-
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- parent_kn = crgrp->mon.mon_data_kn;
- mkdir_mondata_subdir(parent_kn, d, r, crgrp);
- }
- }
-}
-
-static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
- struct rdt_resource *r,
- struct rdtgroup *prgrp)
-{
- struct rdt_mon_domain *dom;
- int ret;
-
- /* Walking r->domains, ensure it can't race with cpuhp */
- lockdep_assert_cpus_held();
-
- list_for_each_entry(dom, &r->mon_domains, hdr.list) {
- ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * This creates a directory mon_data which contains the monitored data.
- *
- * mon_data has one directory for each domain which are named
- * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
- * with L3 domain looks as below:
- * ./mon_data:
- * mon_L3_00
- * mon_L3_01
- * mon_L3_02
- * ...
- *
- * Each domain directory has one file per event:
- * ./mon_L3_00/:
- * llc_occupancy
- *
- */
-static int mkdir_mondata_all(struct kernfs_node *parent_kn,
- struct rdtgroup *prgrp,
- struct kernfs_node **dest_kn)
-{
- struct rdt_resource *r;
- struct kernfs_node *kn;
- int ret;
-
- /*
- * Create the mon_data directory first.
- */
- ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
- if (ret)
- return ret;
-
- if (dest_kn)
- *dest_kn = kn;
-
- /*
- * Create the subdirectories for each domain. Note that all events
- * in a domain like L3 are grouped into a resource whose domain is L3
- */
- for_each_mon_capable_rdt_resource(r) {
- ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
- if (ret)
- goto out_destroy;
- }
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
-}
-
-/**
- * cbm_ensure_valid - Enforce validity on provided CBM
- * @_val: Candidate CBM
- * @r: RDT resource to which the CBM belongs
- *
- * The provided CBM represents all cache portions available for use. This
- * may be represented by a bitmap that does not consist of contiguous ones
- * and thus be an invalid CBM.
- * Here the provided CBM is forced to be a valid CBM by only considering
- * the first set of contiguous bits as valid and clearing all bits.
- * The intention here is to provide a valid default CBM with which a new
- * resource group is initialized. The user can follow this with a
- * modification to the CBM if the default does not satisfy the
- * requirements.
- */
-static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
-{
- unsigned int cbm_len = r->cache.cbm_len;
- unsigned long first_bit, zero_bit;
- unsigned long val = _val;
-
- if (!val)
- return 0;
-
- first_bit = find_first_bit(&val, cbm_len);
- zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
- /* Clear any remaining bits to ensure contiguous region */
- bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
- return (u32)val;
-}
-
-/*
- * Initialize cache resources per RDT domain
- *
- * Set the RDT domain up to start off with all usable allocations. That is,
- * all shareable and unused bits. All-zero CBM is invalid.
- */
-static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
- u32 closid)
-{
- enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
- enum resctrl_conf_type t = s->conf_type;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- u32 used_b = 0, unused_b = 0;
- unsigned long tmp_cbm;
- enum rdtgrp_mode mode;
- u32 peer_ctl, ctrl_val;
- int i;
-
- cfg = &d->staged_config[t];
- cfg->have_new_ctrl = false;
- cfg->new_ctrl = r->cache.shareable_bits;
- used_b = r->cache.shareable_bits;
- for (i = 0; i < closids_supported(); i++) {
- if (closid_allocated(i) && i != closid) {
- mode = rdtgroup_mode_by_closid(i);
- if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
- /*
- * ctrl values for locksetup aren't relevant
- * until the schemata is written, and the mode
- * becomes RDT_MODE_PSEUDO_LOCKED.
- */
- continue;
- /*
- * If CDP is active include peer domain's
- * usage to ensure there is no overlap
- * with an exclusive group.
- */
- if (resctrl_arch_get_cdp_enabled(r->rid))
- peer_ctl = resctrl_arch_get_config(r, d, i,
- peer_type);
- else
- peer_ctl = 0;
- ctrl_val = resctrl_arch_get_config(r, d, i,
- s->conf_type);
- used_b |= ctrl_val | peer_ctl;
- if (mode == RDT_MODE_SHAREABLE)
- cfg->new_ctrl |= ctrl_val | peer_ctl;
- }
- }
- if (d->plr && d->plr->cbm > 0)
- used_b |= d->plr->cbm;
- unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
- unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
- cfg->new_ctrl |= unused_b;
- /*
- * Force the initial CBM to be valid, user can
- * modify the CBM based on system availability.
- */
- cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
- /*
- * Assign the u32 CBM to an unsigned long to ensure that
- * bitmap_weight() does not access out-of-bound memory.
- */
- tmp_cbm = cfg->new_ctrl;
- if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
- return -ENOSPC;
- }
- cfg->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * Initialize cache resources with default values.
- *
- * A new RDT group is being created on an allocation capable (CAT)
- * supporting system. Set this group up to start off with all usable
- * allocations.
- *
- * If there are no more shareable bits available on any domain then
- * the entire allocation will fail.
- */
-static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
-{
- struct rdt_ctrl_domain *d;
- int ret;
-
- list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
- ret = __init_one_rdt_domain(d, s, closid);
- if (ret < 0)
- return ret;
- }
-
- return 0;
-}
-
-/* Initialize MBA resource with default values. */
-static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
-{
- struct resctrl_staged_config *cfg;
- struct rdt_ctrl_domain *d;
-
- list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
- if (is_mba_sc(r)) {
- d->mbps_val[closid] = MBA_MAX_MBPS;
- continue;
- }
-
- cfg = &d->staged_config[CDP_NONE];
- cfg->new_ctrl = r->default_ctrl;
- cfg->have_new_ctrl = true;
- }
-}
-
-/* Initialize the RDT group's allocations. */
-static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
-{
- struct resctrl_schema *s;
- struct rdt_resource *r;
- int ret = 0;
-
- rdt_staged_configs_clear();
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- if (r->rid == RDT_RESOURCE_MBA ||
- r->rid == RDT_RESOURCE_SMBA) {
- rdtgroup_init_mba(r, rdtgrp->closid);
- if (is_mba_sc(r))
- continue;
- } else {
- ret = rdtgroup_init_cat(s, rdtgrp->closid);
- if (ret < 0)
- goto out;
- }
-
- ret = resctrl_arch_update_domains(r, rdtgrp->closid);
- if (ret < 0) {
- rdt_last_cmd_puts("Failed to initialize allocations\n");
- goto out;
- }
-
- }
-
- rdtgrp->mode = RDT_MODE_SHAREABLE;
-
-out:
- rdt_staged_configs_clear();
- return ret;
-}
-
-static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- if (!resctrl_arch_mon_capable())
- return 0;
-
- ret = alloc_rmid(rdtgrp->closid);
- if (ret < 0) {
- rdt_last_cmd_puts("Out of RMIDs\n");
- return ret;
- }
- rdtgrp->mon.rmid = ret;
-
- ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
- if (ret) {
- rdt_last_cmd_puts("kernfs subdir error\n");
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
- return ret;
- }
-
- return 0;
-}
-
-static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
-{
- if (resctrl_arch_mon_capable())
- free_rmid(rgrp->closid, rgrp->mon.rmid);
-}
-
-static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
- const char *name, umode_t mode,
- enum rdt_group_type rtype, struct rdtgroup **r)
-{
- struct rdtgroup *prdtgrp, *rdtgrp;
- unsigned long files = 0;
- struct kernfs_node *kn;
- int ret;
-
- prdtgrp = rdtgroup_kn_lock_live(parent_kn);
- if (!prdtgrp) {
- ret = -ENODEV;
- goto out_unlock;
- }
-
- if (rtype == RDTMON_GROUP &&
- (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto out_unlock;
- }
-
- /* allocate the rdtgroup. */
- rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
- if (!rdtgrp) {
- ret = -ENOSPC;
- rdt_last_cmd_puts("Kernel out of memory\n");
- goto out_unlock;
- }
- *r = rdtgrp;
- rdtgrp->mon.parent = prdtgrp;
- rdtgrp->type = rtype;
- INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
-
- /* kernfs creates the directory for rdtgrp */
- kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
- if (IS_ERR(kn)) {
- ret = PTR_ERR(kn);
- rdt_last_cmd_puts("kernfs create error\n");
- goto out_free_rgrp;
- }
- rdtgrp->kn = kn;
-
- /*
- * kernfs_remove() will drop the reference count on "kn" which
- * will free it. But we still need it to stick around for the
- * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
- * which will be dropped by kernfs_put() in rdtgroup_remove().
- */
- kernfs_get(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- rdt_last_cmd_puts("kernfs perm error\n");
- goto out_destroy;
- }
-
- if (rtype == RDTCTRL_GROUP) {
- files = RFTYPE_BASE | RFTYPE_CTRL;
- if (resctrl_arch_mon_capable())
- files |= RFTYPE_MON;
- } else {
- files = RFTYPE_BASE | RFTYPE_MON;
- }
-
- ret = rdtgroup_add_files(kn, files);
- if (ret) {
- rdt_last_cmd_puts("kernfs fill error\n");
- goto out_destroy;
- }
-
- /*
- * The caller unlocks the parent_kn upon success.
- */
- return 0;
-
-out_destroy:
- kernfs_put(rdtgrp->kn);
- kernfs_remove(rdtgrp->kn);
-out_free_rgrp:
- kfree(rdtgrp);
-out_unlock:
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
-{
- kernfs_remove(rgrp->kn);
- rdtgroup_remove(rgrp);
-}
-
-/*
- * Create a monitor group under "mon_groups" directory of a control
- * and monitor group(ctrl_mon). This is a resource group
- * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
- */
-static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
- const char *name, umode_t mode)
-{
- struct rdtgroup *rdtgrp, *prgrp;
- int ret;
-
- ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
- if (ret)
- return ret;
-
- prgrp = rdtgrp->mon.parent;
- rdtgrp->closid = prgrp->closid;
-
- ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
- if (ret) {
- mkdir_rdt_prepare_clean(rdtgrp);
- goto out_unlock;
- }
-
- kernfs_activate(rdtgrp->kn);
-
- /*
- * Add the rdtgrp to the list of rdtgrps the parent
- * ctrl_mon group has to track.
- */
- list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
-
-out_unlock:
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-/*
- * These are rdtgroups created under the root directory. Can be used
- * to allocate and monitor resources.
- */
-static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
- const char *name, umode_t mode)
-{
- struct rdtgroup *rdtgrp;
- struct kernfs_node *kn;
- u32 closid;
- int ret;
-
- ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
- if (ret)
- return ret;
-
- kn = rdtgrp->kn;
- ret = closid_alloc();
- if (ret < 0) {
- rdt_last_cmd_puts("Out of CLOSIDs\n");
- goto out_common_fail;
- }
- closid = ret;
- ret = 0;
-
- rdtgrp->closid = closid;
-
- ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
- if (ret)
- goto out_closid_free;
-
- kernfs_activate(rdtgrp->kn);
-
- ret = rdtgroup_init_alloc(rdtgrp);
- if (ret < 0)
- goto out_rmid_free;
-
- list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
-
- if (resctrl_arch_mon_capable()) {
- /*
- * Create an empty mon_groups directory to hold the subset
- * of tasks and cpus to monitor.
- */
- ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
- if (ret) {
- rdt_last_cmd_puts("kernfs subdir error\n");
- goto out_del_list;
- }
- if (is_mba_sc(NULL))
- rdtgrp->mba_mbps_event = mba_mbps_default_event;
- }
-
- goto out_unlock;
-
-out_del_list:
- list_del(&rdtgrp->rdtgroup_list);
-out_rmid_free:
- mkdir_rdt_prepare_rmid_free(rdtgrp);
-out_closid_free:
- closid_free(closid);
-out_common_fail:
- mkdir_rdt_prepare_clean(rdtgrp);
-out_unlock:
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-/*
- * We allow creating mon groups only with in a directory called "mon_groups"
- * which is present in every ctrl_mon group. Check if this is a valid
- * "mon_groups" directory.
- *
- * 1. The directory should be named "mon_groups".
- * 2. The mon group itself should "not" be named "mon_groups".
- * This makes sure "mon_groups" directory always has a ctrl_mon group
- * as parent.
- */
-static bool is_mon_groups(struct kernfs_node *kn, const char *name)
-{
- return (!strcmp(kn->name, "mon_groups") &&
- strcmp(name, "mon_groups"));
-}
-
-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
-{
- /* Do not accept '\n' to avoid unparsable situation. */
- if (strchr(name, '\n'))
- return -EINVAL;
-
- /*
- * If the parent directory is the root directory and RDT
- * allocation is supported, add a control and monitoring
- * subdirectory
- */
- if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
- return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
-
- /*
- * If RDT monitoring is supported and the parent directory is a valid
- * "mon_groups" directory, add a monitoring subdirectory.
- */
- if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name))
- return rdtgroup_mkdir_mon(parent_kn, name, mode);
-
- return -EPERM;
-}
-
-static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
-{
- struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
- int cpu;
-
- /* Give any tasks back to the parent group */
- rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
-
- /* Update per cpu rmid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask)
- per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
- /*
- * Update the MSR on moved CPUs and CPUs which have moved
- * task running on them.
- */
- cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
- update_closid_rmid(tmpmask, NULL);
-
- rdtgrp->flags = RDT_DELETED;
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
-
- /*
- * Remove the rdtgrp from the parent ctrl_mon group's list
- */
- WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
- list_del(&rdtgrp->mon.crdtgrp_list);
-
- kernfs_remove(rdtgrp->kn);
-
- return 0;
-}
-
-static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
-{
- rdtgrp->flags = RDT_DELETED;
- list_del(&rdtgrp->rdtgroup_list);
-
- kernfs_remove(rdtgrp->kn);
- return 0;
-}
-
-static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
-{
- int cpu;
-
- /* Give any tasks back to the default group */
- rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
-
- /* Give any CPUs back to the default group */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
-
- /* Update per cpu closid and rmid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask) {
- per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
- per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
- }
-
- /*
- * Update the MSR on moved CPUs and CPUs which have moved
- * task running on them.
- */
- cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
- update_closid_rmid(tmpmask, NULL);
-
- free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
- closid_free(rdtgrp->closid);
-
- rdtgroup_ctrl_remove(rdtgrp);
-
- /*
- * Free all the child monitor group rmids.
- */
- free_all_child_rdtgrp(rdtgrp);
-
- return 0;
-}
-
-static int rdtgroup_rmdir(struct kernfs_node *kn)
-{
- struct kernfs_node *parent_kn = kn->parent;
- struct rdtgroup *rdtgrp;
- cpumask_var_t tmpmask;
- int ret = 0;
-
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
-
- rdtgrp = rdtgroup_kn_lock_live(kn);
- if (!rdtgrp) {
- ret = -EPERM;
- goto out;
- }
-
- /*
- * If the rdtgroup is a ctrl_mon group and parent directory
- * is the root directory, remove the ctrl_mon group.
- *
- * If the rdtgroup is a mon group and parent directory
- * is a valid "mon_groups" directory, remove the mon group.
- */
- if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
- rdtgrp != &rdtgroup_default) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- ret = rdtgroup_ctrl_remove(rdtgrp);
- } else {
- ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
- }
- } else if (rdtgrp->type == RDTMON_GROUP &&
- is_mon_groups(parent_kn, kn->name)) {
- ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
- } else {
- ret = -EPERM;
- }
-
-out:
- rdtgroup_kn_unlock(kn);
- free_cpumask_var(tmpmask);
- return ret;
-}
-
-/**
- * mongrp_reparent() - replace parent CTRL_MON group of a MON group
- * @rdtgrp: the MON group whose parent should be replaced
- * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp
- * @cpus: cpumask provided by the caller for use during this call
- *
- * Replaces the parent CTRL_MON group for a MON group, resulting in all member
- * tasks' CLOSID immediately changing to that of the new parent group.
- * Monitoring data for the group is unaffected by this operation.
- */
-static void mongrp_reparent(struct rdtgroup *rdtgrp,
- struct rdtgroup *new_prdtgrp,
- cpumask_var_t cpus)
-{
- struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
-
- WARN_ON(rdtgrp->type != RDTMON_GROUP);
- WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
-
- /* Nothing to do when simply renaming a MON group. */
- if (prdtgrp == new_prdtgrp)
- return;
-
- WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
- list_move_tail(&rdtgrp->mon.crdtgrp_list,
- &new_prdtgrp->mon.crdtgrp_list);
-
- rdtgrp->mon.parent = new_prdtgrp;
- rdtgrp->closid = new_prdtgrp->closid;
-
- /* Propagate updated closid to all tasks in this group. */
- rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
-
- update_closid_rmid(cpus, NULL);
-}
-
-static int rdtgroup_rename(struct kernfs_node *kn,
- struct kernfs_node *new_parent, const char *new_name)
-{
- struct rdtgroup *new_prdtgrp;
- struct rdtgroup *rdtgrp;
- cpumask_var_t tmpmask;
- int ret;
-
- rdtgrp = kernfs_to_rdtgroup(kn);
- new_prdtgrp = kernfs_to_rdtgroup(new_parent);
- if (!rdtgrp || !new_prdtgrp)
- return -ENOENT;
-
- /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
- rdtgroup_kn_get(rdtgrp, kn);
- rdtgroup_kn_get(new_prdtgrp, new_parent);
-
- mutex_lock(&rdtgroup_mutex);
-
- rdt_last_cmd_clear();
-
- /*
- * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
- * either kernfs_node is a file.
- */
- if (kernfs_type(kn) != KERNFS_DIR ||
- kernfs_type(new_parent) != KERNFS_DIR) {
- rdt_last_cmd_puts("Source and destination must be directories");
- ret = -EPERM;
- goto out;
- }
-
- if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
- ret = -ENOENT;
- goto out;
- }
-
- if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
- !is_mon_groups(kn->parent, kn->name)) {
- rdt_last_cmd_puts("Source must be a MON group\n");
- ret = -EPERM;
- goto out;
- }
-
- if (!is_mon_groups(new_parent, new_name)) {
- rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
- ret = -EPERM;
- goto out;
- }
-
- /*
- * If the MON group is monitoring CPUs, the CPUs must be assigned to the
- * current parent CTRL_MON group and therefore cannot be assigned to
- * the new parent, making the move illegal.
- */
- if (!cpumask_empty(&rdtgrp->cpu_mask) &&
- rdtgrp->mon.parent != new_prdtgrp) {
- rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
- ret = -EPERM;
- goto out;
- }
-
- /*
- * Allocate the cpumask for use in mongrp_reparent() to avoid the
- * possibility of failing to allocate it after kernfs_rename() has
- * succeeded.
- */
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out;
- }
-
- /*
- * Perform all input validation and allocations needed to ensure
- * mongrp_reparent() will succeed before calling kernfs_rename(),
- * otherwise it would be necessary to revert this call if
- * mongrp_reparent() failed.
- */
- ret = kernfs_rename(kn, new_parent, new_name);
- if (!ret)
- mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
-
- free_cpumask_var(tmpmask);
-
-out:
- mutex_unlock(&rdtgroup_mutex);
- rdtgroup_kn_put(rdtgrp, kn);
- rdtgroup_kn_put(new_prdtgrp, new_parent);
- return ret;
-}
-
-static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
-{
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
- seq_puts(seq, ",cdp");
-
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
- seq_puts(seq, ",cdpl2");
-
- if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
- seq_puts(seq, ",mba_MBps");
-
- if (resctrl_debug)
- seq_puts(seq, ",debug");
-
- return 0;
-}
-
-static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
- .mkdir = rdtgroup_mkdir,
- .rmdir = rdtgroup_rmdir,
- .rename = rdtgroup_rename,
- .show_options = rdtgroup_show_options,
-};
-
-static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
-{
- rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
- KERNFS_ROOT_CREATE_DEACTIVATED |
- KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
- &rdtgroup_default);
- if (IS_ERR(rdt_root))
- return PTR_ERR(rdt_root);
-
- ctx->kfc.root = rdt_root;
- rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
-
- return 0;
-}
-
-static void rdtgroup_destroy_root(void)
-{
- kernfs_destroy_root(rdt_root);
- rdtgroup_default.kn = NULL;
-}
-
-static void __init rdtgroup_setup_default(void)
-{
- mutex_lock(&rdtgroup_mutex);
-
- rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
- rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
- rdtgroup_default.type = RDTCTRL_GROUP;
- INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
-
- list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
-
- mutex_unlock(&rdtgroup_mutex);
-}
-
-static void domain_destroy_mon_state(struct rdt_mon_domain *d)
-{
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- kfree(d->mbm_local);
-}
-
-void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
- mutex_lock(&rdtgroup_mutex);
-
- if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
- mba_sc_domain_destroy(r, d);
-
- mutex_unlock(&rdtgroup_mutex);
-}
-
-void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
- mutex_lock(&rdtgroup_mutex);
-
- /*
- * If resctrl is mounted, remove all the
- * per domain monitor data directories.
- */
- if (resctrl_mounted && resctrl_arch_mon_capable())
- rmdir_mondata_subdir_allrdtgrp(r, d);
-
- if (is_mbm_enabled())
- cancel_delayed_work(&d->mbm_over);
- if (is_llc_occupancy_enabled() && has_busy_rmid(d)) {
- /*
- * When a package is going down, forcefully
- * decrement rmid->ebusy. There is no way to know
- * that the L3 was flushed and hence may lead to
- * incorrect counts in rare scenarios, but leaving
- * the RMID as busy creates RMID leaks if the
- * package never comes back.
- */
- __check_limbo(d, true);
- cancel_delayed_work(&d->cqm_limbo);
- }
-
- domain_destroy_mon_state(d);
-
- mutex_unlock(&rdtgroup_mutex);
-}
-
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
- u32 idx_limit = resctrl_arch_system_num_rmid_idx();
- size_t tsize;
-
- if (is_llc_occupancy_enabled()) {
- d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
- if (!d->rmid_busy_llc)
- return -ENOMEM;
- }
- if (is_mbm_total_enabled()) {
- tsize = sizeof(*d->mbm_total);
- d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
- if (!d->mbm_total) {
- bitmap_free(d->rmid_busy_llc);
- return -ENOMEM;
- }
- }
- if (is_mbm_local_enabled()) {
- tsize = sizeof(*d->mbm_local);
- d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
- if (!d->mbm_local) {
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- return -ENOMEM;
- }
- }
-
- return 0;
-}
-
-int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
-{
- int err = 0;
-
- mutex_lock(&rdtgroup_mutex);
-
- if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
- /* RDT_RESOURCE_MBA is never mon_capable */
- err = mba_sc_domain_allocate(r, d);
- }
-
- mutex_unlock(&rdtgroup_mutex);
-
- return err;
-}
-
-int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
-{
- int err;
-
- mutex_lock(&rdtgroup_mutex);
-
- err = domain_setup_mon_state(r, d);
- if (err)
- goto out_unlock;
-
- if (is_mbm_enabled()) {
- INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
- mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
- RESCTRL_PICK_ANY_CPU);
- }
-
- if (is_llc_occupancy_enabled())
- INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
-
- /*
- * If the filesystem is not mounted then only the default resource group
- * exists. Creation of its directories is deferred until mount time
- * by rdt_get_tree() calling mkdir_mondata_all().
- * If resctrl is mounted, add per domain monitor data directories.
- */
- if (resctrl_mounted && resctrl_arch_mon_capable())
- mkdir_mondata_subdir_allrdtgrp(r, d);
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
-
- return err;
-}
-
-void resctrl_online_cpu(unsigned int cpu)
-{
- mutex_lock(&rdtgroup_mutex);
- /* The CPU is set in default rdtgroup after online. */
- cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
- mutex_unlock(&rdtgroup_mutex);
-}
-
-static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
-{
- struct rdtgroup *cr;
-
- list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
- if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
- break;
- }
-}
-
-void resctrl_offline_cpu(unsigned int cpu)
-{
- struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- struct rdt_mon_domain *d;
- struct rdtgroup *rdtgrp;
-
- mutex_lock(&rdtgroup_mutex);
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
- clear_childcpus(rdtgrp, cpu);
- break;
- }
- }
-
- if (!l3->mon_capable)
- goto out_unlock;
-
- d = get_mon_domain_from_cpu(cpu, l3);
- if (d) {
- if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
- cancel_delayed_work(&d->mbm_over);
- mbm_setup_overflow_handler(d, 0, cpu);
- }
- if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
- has_busy_rmid(d)) {
- cancel_delayed_work(&d->cqm_limbo);
- cqm_setup_limbo_handler(d, 0, cpu);
- }
- }
-
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
-}
-
-/*
- * rdtgroup_init - rdtgroup initialization
- *
- * Setup resctrl file system including set up root, create mount point,
- * register rdtgroup filesystem, and initialize files under root directory.
- *
- * Return: 0 on success or -errno
- */
-int __init rdtgroup_init(void)
-{
- int ret = 0;
-
- seq_buf_init(&last_cmd_status, last_cmd_status_buf,
- sizeof(last_cmd_status_buf));
-
- rdtgroup_setup_default();
-
- ret = sysfs_create_mount_point(fs_kobj, "resctrl");
- if (ret)
- return ret;
-
- ret = register_filesystem(&rdt_fs_type);
- if (ret)
- goto cleanup_mountpoint;
-
- /*
- * Adding the resctrl debugfs directory here may not be ideal since
- * it would let the resctrl debugfs directory appear on the debugfs
- * filesystem before the resctrl filesystem is mounted.
- * It may also be ok since that would enable debugging of RDT before
- * resctrl is mounted.
- * The reason why the debugfs directory is created here and not in
- * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
- * during the debugfs directory creation also &sb->s_type->i_mutex_key
- * (the lockdep class of inode->i_rwsem). Other filesystem
- * interactions (eg. SyS_getdents) have the lock ordering:
- * &sb->s_type->i_mutex_key --> &mm->mmap_lock
- * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
- * is taken, thus creating dependency:
- * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
- * issues considering the other two lock dependencies.
- * By creating the debugfs directory here we avoid a dependency
- * that may cause deadlock (even though file operations cannot
- * occur until the filesystem is mounted, but I do not know how to
- * tell lockdep that).
- */
- debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
-
- return 0;
-
-cleanup_mountpoint:
- sysfs_remove_mount_point(fs_kobj, "resctrl");
-
- return ret;
-}
-
-void __exit rdtgroup_exit(void)
-{
- debugfs_remove_recursive(debugfs_resctrl);
- unregister_filesystem(&rdt_fs_type);
- sysfs_remove_mount_point(fs_kobj, "resctrl");
+ return;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 16f3ca30626a..dbf6d71bdf18 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
+ { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 },
{ X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
{ X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
{ X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
@@ -53,7 +54,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
- { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 },
+ { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 22b65a5f5ec6..7f8d1e11dbee 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -150,13 +150,15 @@ int __init sgx_drv_init(void)
u64 xfrm_mask;
int ret;
- if (!cpu_feature_enabled(X86_FEATURE_SGX_LC))
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
+ }
cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
if (!(eax & 1)) {
- pr_err("SGX disabled: SGX1 instruction support not available.\n");
+ pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
}
@@ -173,8 +175,10 @@ int __init sgx_drv_init(void)
}
ret = misc_register(&sgx_dev_enclave);
- if (ret)
+ if (ret) {
+ pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret);
return ret;
+ }
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
index 4eddb4d571ef..30f39f92c98f 100644
--- a/arch/x86/kernel/cpu/sgx/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -2,7 +2,6 @@
#ifndef __ARCH_SGX_DRIVER_H__
#define __ARCH_SGX_DRIVER_H__
-#include <crypto/hash.h>
#include <linux/kref.h>
#include <linux/mmu_notifier.h>
#include <linux/radix-tree.h>
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index b65ab214bdf5..66f1efa16fbb 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -3,6 +3,7 @@
#include <asm/mman.h>
#include <asm/sgx.h>
+#include <crypto/sha2.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -64,6 +65,13 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
struct file *backing;
long ret;
+ /*
+ * ECREATE would detect this too, but checking here also ensures
+ * that the 'encl_size' calculations below can never overflow.
+ */
+ if (!is_power_of_2(secs->size))
+ return -EINVAL;
+
va_page = sgx_encl_grow(encl, true);
if (IS_ERR(va_page))
return PTR_ERR(va_page);
@@ -456,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
return ret;
}
-static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
- void *hash)
-{
- SHASH_DESC_ON_STACK(shash, tfm);
-
- shash->tfm = tfm;
-
- return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
-}
-
-static int sgx_get_key_hash(const void *modulus, void *hash)
-{
- struct crypto_shash *tfm;
- int ret;
-
- tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- ret = __sgx_get_key_hash(tfm, modulus, hash);
-
- crypto_free_shash(tfm);
- return ret;
-}
-
static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
@@ -516,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
sgx_xfrm_reserved_mask)
return -EINVAL;
- ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
- if (ret)
- return ret;
+ sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner);
mutex_lock(&encl->lock);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8ce352fc72ac..2de01b379aa3 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/vmalloc.h>
+#include <asm/msr.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -719,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags)
goto out;
}
+ sgx_unmark_page_reclaimable(page);
+
/*
* TBD: Add additional plumbing to enable pre-emptive
* action for asynchronous poison notification. Until
@@ -871,7 +874,7 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
WARN_ON_ONCE(preemptible());
for (i = 0; i < 4; i++)
- wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+ wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}
const struct file_operations sgx_provision_fops = {
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 01456236a6dd..e35ccdc84910 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -30,6 +30,7 @@
#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
+#include <asm/msr.h>
#include <asm/smp.h>
#include "cpu.h"
@@ -154,7 +155,7 @@ static __init bool check_for_real_bsp(u32 apic_id)
* kernel must rely on the firmware enumeration order.
*/
if (has_apic_base) {
- rdmsrl(MSR_IA32_APICBASE, msr);
+ rdmsrq(MSR_IA32_APICBASE, msr);
is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
}
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
index 03b3c9c3a45e..843b1655ab45 100644
--- a/arch/x86/kernel/cpu/topology_amd.c
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -3,6 +3,7 @@
#include <asm/apic.h>
#include <asm/memtype.h>
+#include <asm/msr.h>
#include <asm/processor.h>
#include "cpu.h"
@@ -133,7 +134,7 @@ static void parse_fam10h_node_id(struct topo_scan *tscan)
if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
return;
- rdmsrl(MSR_FAM10H_NODE_ID, nid.msr);
+ rdmsrq(MSR_FAM10H_NODE_ID, nid.msr);
store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
tscan->c->topo.llc_id = nid.node_id;
}
@@ -160,7 +161,7 @@ static void topoext_fixup(struct topo_scan *tscan)
if (msr_set_bit(0xc0011005, 54) <= 0)
return;
- rdmsrl(0xc0011005, msrval);
+ rdmsrq(0xc0011005, msrval);
if (msrval & BIT_64(54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
@@ -182,7 +183,7 @@ static void parse_topology_amd(struct topo_scan *tscan)
if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
has_topoext = cpu_parse_topology_ext(tscan);
- if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES))
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES))
tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
if (!has_topoext && !parse_8000_0008(tscan))
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index b31ee4f1657a..49782724a943 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -12,6 +12,7 @@
#include <asm/cmdline.h>
#include <asm/cpu.h>
+#include <asm/msr.h>
#include "cpu.h"
@@ -24,7 +25,7 @@ static void tsx_disable(void)
{
u64 tsx;
- rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
/* Force all transactions to immediately abort */
tsx |= TSX_CTRL_RTM_DISABLE;
@@ -37,14 +38,14 @@ static void tsx_disable(void)
*/
tsx |= TSX_CTRL_CPUID_CLEAR;
- wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
}
static void tsx_enable(void)
{
u64 tsx;
- rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
/* Enable the RTM feature in the cpu */
tsx &= ~TSX_CTRL_RTM_DISABLE;
@@ -56,7 +57,7 @@ static void tsx_enable(void)
*/
tsx &= ~TSX_CTRL_CPUID_CLEAR;
- wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
}
static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
@@ -115,13 +116,13 @@ static void tsx_clear_cpuid(void)
*/
if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
- rdmsrl(MSR_TSX_FORCE_ABORT, msr);
+ rdmsrq(MSR_TSX_FORCE_ABORT, msr);
msr |= MSR_TFA_TSX_CPUID_CLEAR;
- wrmsrl(MSR_TSX_FORCE_ABORT, msr);
+ wrmsrq(MSR_TSX_FORCE_ABORT, msr);
} else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) {
- rdmsrl(MSR_IA32_TSX_CTRL, msr);
+ rdmsrq(MSR_IA32_TSX_CTRL, msr);
msr |= TSX_CTRL_CPUID_CLEAR;
- wrmsrl(MSR_IA32_TSX_CTRL, msr);
+ wrmsrq(MSR_IA32_TSX_CTRL, msr);
}
}
@@ -146,11 +147,11 @@ static void tsx_dev_mode_disable(void)
!cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
return;
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
if (mcu_opt_ctrl & RTM_ALLOW) {
mcu_opt_ctrl &= ~RTM_ALLOW;
- wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
}
}
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 2293efd6ffa6..933fcd7ff250 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -33,7 +33,7 @@ static DEFINE_MUTEX(umwait_lock);
static void umwait_update_control_msr(void * unused)
{
lockdep_assert_irqs_disabled();
- wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached));
}
/*
@@ -71,7 +71,7 @@ static int umwait_cpu_offline(unsigned int cpu)
* the original control MSR value in umwait_init(). So there
* is no race condition here.
*/
- wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0);
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
return 0;
}
@@ -214,7 +214,7 @@ static int __init umwait_init(void)
* changed. This is the only place where orig_umwait_control_cached
* is modified.
*/
- rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
+ rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
umwait_cpu_online, umwait_cpu_offline);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 00189cdeb775..cb3f900c46fc 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/clocksource.h>
#include <linux/cpu.h>
+#include <linux/efi.h>
#include <linux/reboot.h>
#include <linux/static_call.h>
#include <asm/div64.h>
@@ -429,6 +430,9 @@ static void __init vmware_platform_setup(void)
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
+ x86_init.mpparse.find_mptable = mpparse_find_mptable;
+
vmware_paravirt_ops_setup();
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 90eba7eb5335..89b1c8a70fe8 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -4,6 +4,7 @@
#include <asm/cpu.h>
#include <asm/cpufeature.h>
+#include <asm/msr.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 340af8155658..0be61c45400c 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -140,7 +140,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
x86_platform.guest.enc_kexec_begin();
x86_platform.guest.enc_kexec_finish();
- crash_save_cpu(regs, safe_smp_processor_id());
+ crash_save_cpu(regs, smp_processor_id());
}
#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 59d23cdf4ed0..dd8748c45529 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -2,6 +2,7 @@
/*
* Architecture specific OF callbacks.
*/
+#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/interrupt.h>
@@ -313,6 +314,6 @@ void __init x86_flattree_get_config(void)
if (initial_dtb)
early_memunmap(dt, map_len);
#endif
- if (of_have_populated_dt())
+ if (acpi_disabled && of_have_populated_dt())
x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index a7d562697e50..71ee20102a8a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -23,8 +23,6 @@
#include <asm/stacktrace.h>
#include <asm/unwind.h>
-int panic_on_unrecovered_nmi;
-int panic_on_io_nmi;
static int die_counter;
static struct pt_regs exec_summary_regs;
@@ -195,6 +193,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
+ stack = stack ?: get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);
/*
@@ -213,9 +212,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - hardirq stack
* - entry stack
*/
- for (stack = stack ?: get_stack_pointer(task, regs);
- stack;
- stack = stack_info.next_sp) {
+ for (; stack; stack = stack_info.next_sp) {
const char *stack_name;
stack = PTR_ALIGN(stack, sizeof(long));
@@ -395,18 +392,13 @@ NOKPROBE_SYMBOL(oops_end);
static void __die_header(const char *str, struct pt_regs *regs, long err)
{
- const char *pr = "";
-
/* Save the regs of the first oops for the executive summary later. */
if (!die_counter)
exec_summary_regs = *regs;
- if (IS_ENABLED(CONFIG_PREEMPTION))
- pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
-
printk(KERN_DEFAULT
- "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff,
- ++die_counter, pr,
+ "Oops: %s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff,
+ ++die_counter,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b4905d5173fd..722fd712e1cf 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type)
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
@@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr);
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index f05339fee778..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
+ unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *begin;
/*
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 82b96ed9890a..9920122018a0 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -28,18 +28,13 @@
* the first 128 E820 memory entries in boot_params.e820_table and the remaining
* (if any) entries of the SETUP_E820_EXT nodes. We use this to:
*
- * - inform the user about the firmware's notion of memory layout
- * via /sys/firmware/memmap
- *
* - the hibernation code uses it to generate a kernel-independent CRC32
* checksum of the physical memory layout of a system.
*
* - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
* passed to us by the bootloader - the major difference between
- * e820_table_firmware[] and this one is that, the latter marks the setup_data
- * list created by the EFI boot stub as reserved, so that kexec can reuse the
- * setup_data information in the second kernel. Besides, e820_table_kexec[]
- * might also be modified by the kexec itself to fake a mptable.
+ * e820_table_firmware[] and this one is that e820_table_kexec[]
+ * might be modified by the kexec itself to fake an mptable.
* We use this to:
*
* - kexec, which is a bootloader in disguise, uses the original E820
@@ -47,6 +42,11 @@
* can have a restricted E820 map while the kexec()-ed kexec-kernel
* can have access to full memory - etc.
*
+ * Export the memory layout via /sys/firmware/memmap. kexec-tools uses
+ * the entries to create an E820 table for the kexec kernel.
+ *
+ * kexec_file_load in-kernel code uses the table for the kexec kernel.
+ *
* - 'e820_table': this is the main E820 table that is massaged by the
* low level x86 platform code, or modified by boot parameters, before
* passed on to higher level MM layers.
@@ -187,8 +187,7 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type)
static void __init e820_print_type(enum e820_type type)
{
switch (type) {
- case E820_TYPE_RAM: /* Fall through: */
- case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break;
+ case E820_TYPE_RAM: pr_cont("usable"); break;
case E820_TYPE_RESERVED: pr_cont("reserved"); break;
case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break;
case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
@@ -754,22 +753,21 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
void __init e820__register_nosave_regions(unsigned long limit_pfn)
{
int i;
- unsigned long pfn = 0;
+ u64 last_addr = 0;
for (i = 0; i < e820_table->nr_entries; i++) {
struct e820_entry *entry = &e820_table->entries[i];
- if (pfn < PFN_UP(entry->addr))
- register_nosave_region(pfn, PFN_UP(entry->addr));
-
- pfn = PFN_DOWN(entry->addr + entry->size);
+ if (entry->type != E820_TYPE_RAM)
+ continue;
- if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
- register_nosave_region(PFN_UP(entry->addr), pfn);
+ if (last_addr < entry->addr)
+ register_nosave_region(PFN_DOWN(last_addr), PFN_UP(entry->addr));
- if (pfn >= limit_pfn)
- break;
+ last_addr = entry->addr + entry->size;
}
+
+ register_nosave_region(PFN_DOWN(last_addr), limit_pfn);
}
#ifdef CONFIG_ACPI
@@ -991,60 +989,6 @@ static int __init parse_memmap_opt(char *str)
early_param("memmap", parse_memmap_opt);
/*
- * Reserve all entries from the bootloader's extensible data nodes list,
- * because if present we are going to use it later on to fetch e820
- * entries from it:
- */
-void __init e820__reserve_setup_data(void)
-{
- struct setup_indirect *indirect;
- struct setup_data *data;
- u64 pa_data, pa_next;
- u32 len;
-
- pa_data = boot_params.hdr.setup_data;
- if (!pa_data)
- return;
-
- while (pa_data) {
- data = early_memremap(pa_data, sizeof(*data));
- if (!data) {
- pr_warn("e820: failed to memremap setup_data entry\n");
- return;
- }
-
- len = sizeof(*data);
- pa_next = data->next;
-
- e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
-
- if (data->type == SETUP_INDIRECT) {
- len += data->len;
- early_memunmap(data, sizeof(*data));
- data = early_memremap(pa_data, len);
- if (!data) {
- pr_warn("e820: failed to memremap indirect setup_data\n");
- return;
- }
-
- indirect = (struct setup_indirect *)data->data;
-
- if (indirect->type != SETUP_INDIRECT)
- e820__range_update(indirect->addr, indirect->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- }
-
- pa_data = pa_next;
- early_memunmap(data, len);
- }
-
- e820__update_table(e820_table);
-
- pr_info("extended physical RAM map:\n");
- e820__print_table("reserve setup_data");
-}
-
-/*
* Called after parse_early_param(), after early parameters (such as mem=)
* have been processed, in which case we already have an E820 table filled in
* via the parameter callback function(s), but it's not sorted and printed yet:
@@ -1063,7 +1007,6 @@ void __init e820__finish_early_params(void)
static const char *__init e820_type_to_string(struct e820_entry *entry)
{
switch (entry->type) {
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: return "System RAM";
case E820_TYPE_ACPI: return "ACPI Tables";
case E820_TYPE_NVS: return "ACPI Non-volatile Storage";
@@ -1079,7 +1022,6 @@ static const char *__init e820_type_to_string(struct e820_entry *entry)
static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
{
switch (entry->type) {
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM;
case E820_TYPE_ACPI: /* Fall-through: */
case E820_TYPE_NVS: /* Fall-through: */
@@ -1101,7 +1043,6 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED;
- case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE: /* Fall-through: */
default: return IORES_DESC_NONE;
@@ -1124,7 +1065,6 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res)
case E820_TYPE_PRAM:
case E820_TYPE_PMEM:
return false;
- case E820_TYPE_RESERVED_KERN:
case E820_TYPE_RAM:
case E820_TYPE_ACPI:
case E820_TYPE_NVS:
@@ -1176,9 +1116,9 @@ void __init e820__reserve_resources(void)
res++;
}
- /* Expose the bootloader-provided memory layout to the sysfs. */
- for (i = 0; i < e820_table_firmware->nr_entries; i++) {
- struct e820_entry *entry = e820_table_firmware->entries + i;
+ /* Expose the kexec e820 table to the sysfs. */
+ for (i = 0; i < e820_table_kexec->nr_entries; i++) {
+ struct e820_entry *entry = e820_table_kexec->entries + i;
firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
}
@@ -1302,6 +1242,36 @@ void __init e820__memblock_setup(void)
int i;
u64 end;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at very early time. We
+ * cannot prevent this anyway. So on NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try the best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
+ /*
+ * At this point only the first megabyte is mapped for sure, the
+ * rest of the memory cannot be used for memblock resizing
+ */
+ memblock_set_current_limit(ISA_END_ADDRESS);
+
/*
* The bootstrap memblock region count maximum is 128 entries
* (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
@@ -1323,12 +1293,20 @@ void __init e820__memblock_setup(void)
if (entry->type == E820_TYPE_SOFT_RESERVED)
memblock_reserve(entry->addr, entry->size);
- if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+ if (entry->type != E820_TYPE_RAM)
continue;
memblock_add(entry->addr, entry->size);
}
+ /*
+ * 32-bit systems are limited to 4BG of memory even with HIGHMEM and
+ * to even less without it.
+ * Discard memory after max_pfn - the actual limit detected at runtime.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ memblock_remove(PFN_PHYS(max_pfn), -1);
+
/* Throw away partial pages: */
memblock_trim_memory(PAGE_SIZE);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 44f937015e1e..cba75306e5b6 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/console.h>
#include <linux/kernel.h>
+#include <linux/kexec.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/screen_info.h>
@@ -19,6 +20,7 @@
#include <linux/usb/ehci_def.h>
#include <linux/usb/xhci-dbgp.h>
#include <asm/pci_x86.h>
+#include <linux/static_call.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -94,26 +96,28 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
-static unsigned int io_serial_in(unsigned long addr, int offset)
+static __noendbr unsigned int io_serial_in(unsigned long addr, int offset)
{
return inb(addr + offset);
}
+ANNOTATE_NOENDBR_SYM(io_serial_in);
-static void io_serial_out(unsigned long addr, int offset, int value)
+static __noendbr void io_serial_out(unsigned long addr, int offset, int value)
{
outb(value, addr + offset);
}
+ANNOTATE_NOENDBR_SYM(io_serial_out);
-static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in;
-static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out;
+DEFINE_STATIC_CALL(serial_in, io_serial_in);
+DEFINE_STATIC_CALL(serial_out, io_serial_out);
static int early_serial_putc(unsigned char ch)
{
unsigned timeout = 0xffff;
- while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
+ while ((static_call(serial_in)(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
cpu_relax();
- serial_out(early_serial_base, TXR, ch);
+ static_call(serial_out)(early_serial_base, TXR, ch);
return timeout ? 0 : -1;
}
@@ -131,16 +135,21 @@ static __init void early_serial_hw_init(unsigned divisor)
{
unsigned char c;
- serial_out(early_serial_base, LCR, 0x3); /* 8n1 */
- serial_out(early_serial_base, IER, 0); /* no interrupt */
- serial_out(early_serial_base, FCR, 0); /* no fifo */
- serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */
+ static_call(serial_out)(early_serial_base, LCR, 0x3); /* 8n1 */
+ static_call(serial_out)(early_serial_base, IER, 0); /* no interrupt */
+ static_call(serial_out)(early_serial_base, FCR, 0); /* no fifo */
+ static_call(serial_out)(early_serial_base, MCR, 0x3); /* DTR + RTS */
- c = serial_in(early_serial_base, LCR);
- serial_out(early_serial_base, LCR, c | DLAB);
- serial_out(early_serial_base, DLL, divisor & 0xff);
- serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff);
- serial_out(early_serial_base, LCR, c & ~DLAB);
+ c = static_call(serial_in)(early_serial_base, LCR);
+ static_call(serial_out)(early_serial_base, LCR, c | DLAB);
+ static_call(serial_out)(early_serial_base, DLL, divisor & 0xff);
+ static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff);
+ static_call(serial_out)(early_serial_base, LCR, c & ~DLAB);
+
+#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64)
+ if (static_call_query(serial_in) == io_serial_in)
+ kexec_debug_8250_port = early_serial_base;
+#endif
}
#define DEFAULT_BAUD 9600
@@ -183,30 +192,66 @@ static __init void early_serial_init(char *s)
/* Convert from baud to divisor value */
divisor = 115200 / baud;
- /* These will always be IO based ports */
- serial_in = io_serial_in;
- serial_out = io_serial_out;
-
/* Set up the HW */
early_serial_hw_init(divisor);
}
-#ifdef CONFIG_PCI
-static void mem32_serial_out(unsigned long addr, int offset, int value)
+static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value)
{
u32 __iomem *vaddr = (u32 __iomem *)addr;
/* shift implied by pointer type */
writel(value, vaddr + offset);
}
+ANNOTATE_NOENDBR_SYM(mem32_serial_out);
-static unsigned int mem32_serial_in(unsigned long addr, int offset)
+static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset)
{
u32 __iomem *vaddr = (u32 __iomem *)addr;
/* shift implied by pointer type */
return readl(vaddr + offset);
}
+ANNOTATE_NOENDBR_SYM(mem32_serial_in);
/*
+ * early_mmio_serial_init() - Initialize MMIO-based early serial console.
+ * @s: MMIO-based serial specification.
+ */
+static __init void early_mmio_serial_init(char *s)
+{
+ unsigned long baudrate;
+ unsigned long membase;
+ char *e;
+
+ if (*s == ',')
+ s++;
+
+ if (!strncmp(s, "0x", 2)) {
+ /* NB: only 32-bit addresses are supported. */
+ membase = simple_strtoul(s, &e, 16);
+ early_serial_base = (unsigned long)early_ioremap(membase, PAGE_SIZE);
+
+ static_call_update(serial_in, mem32_serial_in);
+ static_call_update(serial_out, mem32_serial_out);
+
+ s += strcspn(s, ",");
+ if (*s == ',')
+ s++;
+ }
+
+ if (!strncmp(s, "nocfg", 5)) {
+ baudrate = 0;
+ } else {
+ baudrate = simple_strtoul(s, &e, 0);
+ if (baudrate == 0 || s == e)
+ baudrate = DEFAULT_BAUD;
+ }
+
+ if (baudrate)
+ early_serial_hw_init(115200 / baudrate);
+}
+
+#ifdef CONFIG_PCI
+/*
* early_pci_serial_init()
*
* This function is invoked when the early_printk param starts with "pciserial"
@@ -278,18 +323,19 @@ static __init void early_pci_serial_init(char *s)
*/
if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
/* it is IO mapped */
- serial_in = io_serial_in;
- serial_out = io_serial_out;
early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK;
write_pci_config(bus, slot, func, PCI_COMMAND,
cmdreg|PCI_COMMAND_IO);
} else {
/* It is memory mapped - assume 32-bit alignment */
- serial_in = mem32_serial_in;
- serial_out = mem32_serial_out;
+ static_call_update(serial_in, mem32_serial_in);
+ static_call_update(serial_out, mem32_serial_out);
/* WARNING! assuming the address is always in the first 4G */
early_serial_base =
(unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10);
+#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64)
+ kexec_debug_8250_mmio32 = bar0 & PCI_BASE_ADDRESS_MEM_MASK;
+#endif
write_pci_config(bus, slot, func, PCI_COMMAND,
cmdreg|PCI_COMMAND_MEMORY);
}
@@ -352,6 +398,11 @@ static int __init setup_early_printk(char *buf)
keep = (strstr(buf, "keep") != NULL);
while (*buf != '\0') {
+ if (!strncmp(buf, "mmio32", 6)) {
+ buf += 6;
+ early_mmio_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ }
if (!strncmp(buf, "serial", 6)) {
buf += 6;
early_serial_init(buf);
@@ -365,9 +416,9 @@ static int __init setup_early_printk(char *buf)
}
#ifdef CONFIG_PCI
if (!strncmp(buf, "pciserial", 9)) {
- early_pci_serial_init(buf + 9);
+ buf += 9; /* Keep from match the above "pciserial" */
+ early_pci_serial_init(buf);
early_console_register(&early_serial_console, keep);
- buf += 9; /* Keep from match the above "serial" */
}
#endif
if (!strncmp(buf, "vga", 3) &&
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
index f6d856bd50bc..10d0a720659c 100644
--- a/arch/x86/kernel/fpu/context.h
+++ b/arch/x86/kernel/fpu/context.h
@@ -53,7 +53,7 @@ static inline void fpregs_activate(struct fpu *fpu)
/* Internal helper for switch_fpu_return() and signal frame setup */
static inline void fpregs_restore_userregs(void)
{
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
int cpu = smp_processor_id();
if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
@@ -67,7 +67,7 @@ static inline void fpregs_restore_userregs(void)
* If PKRU is enabled, then the PKRU value is already
* correct because it was either set in switch_to() or in
* flush_thread(). So it is excluded because it might be
- * not up to date in current->thread.fpu.xsave state.
+ * not up to date in current->thread.fpu->xsave state.
*
* XFD state is handled in restore_fpregs_from_fpstate().
*/
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1209c7aebb21..ea138583dd92 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -11,6 +11,7 @@
#include <asm/fpu/sched.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
+#include <asm/msr.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>
@@ -43,14 +44,27 @@ struct fpu_state_config fpu_user_cfg __ro_after_init;
*/
struct fpstate init_fpstate __ro_after_init;
-/* Track in-kernel FPU usage */
-static DEFINE_PER_CPU(bool, in_kernel_fpu);
+/*
+ * Track FPU initialization and kernel-mode usage. 'true' means the FPU is
+ * initialized and is not currently being used by the kernel:
+ */
+DEFINE_PER_CPU(bool, kernel_fpu_allowed);
/*
* Track which context is using the FPU on the CPU:
*/
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+#ifdef CONFIG_X86_DEBUG_FPU
+struct fpu *x86_task_fpu(struct task_struct *task)
+{
+ if (WARN_ON_ONCE(task->flags & PF_KTHREAD))
+ return NULL;
+
+ return (void *)task + sizeof(*task);
+}
+#endif
+
/*
* Can we use the FPU in kernel mode with the
* whole "kernel_fpu_begin/end()" sequence?
@@ -60,8 +74,18 @@ bool irq_fpu_usable(void)
if (WARN_ON_ONCE(in_nmi()))
return false;
- /* In kernel FPU usage already active? */
- if (this_cpu_read(in_kernel_fpu))
+ /*
+ * Return false in the following cases:
+ *
+ * - FPU is not yet initialized. This can happen only when the call is
+ * coming from CPU onlining, for example for microcode checksumming.
+ * - The kernel is already using the FPU, either because of explicit
+ * nesting (which should never be done), or because of implicit
+ * nesting when a hardirq interrupted a kernel-mode FPU section.
+ *
+ * The single boolean check below handles both cases:
+ */
+ if (!this_cpu_read(kernel_fpu_allowed))
return false;
/*
@@ -195,7 +219,7 @@ void fpu_reset_from_exception_fixup(void)
#if IS_ENABLED(CONFIG_KVM)
static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);
-static void fpu_init_guest_permissions(struct fpu_guest *gfpu)
+static void fpu_lock_guest_permissions(void)
{
struct fpu_state_perm *fpuperm;
u64 perm;
@@ -204,15 +228,13 @@ static void fpu_init_guest_permissions(struct fpu_guest *gfpu)
return;
spin_lock_irq(&current->sighand->siglock);
- fpuperm = &current->group_leader->thread.fpu.guest_perm;
+ fpuperm = &x86_task_fpu(current->group_leader)->guest_perm;
perm = fpuperm->__state_perm;
/* First fpstate allocation locks down permissions. */
WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED);
spin_unlock_irq(&current->sighand->siglock);
-
- gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED;
}
bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
@@ -220,7 +242,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
struct fpstate *fpstate;
unsigned int size;
- size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
fpstate = vzalloc(size);
if (!fpstate)
return false;
@@ -232,8 +254,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
fpstate->is_guest = true;
gfpu->fpstate = fpstate;
- gfpu->xfeatures = fpu_user_cfg.default_features;
- gfpu->perm = fpu_user_cfg.default_features;
+ gfpu->xfeatures = fpu_kernel_cfg.default_features;
/*
* KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
@@ -248,7 +269,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
gfpu->uabi_size = fpu_user_cfg.default_size;
- fpu_init_guest_permissions(gfpu);
+ fpu_lock_guest_permissions();
return true;
}
@@ -256,16 +277,16 @@ EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
{
- struct fpstate *fps = gfpu->fpstate;
+ struct fpstate *fpstate = gfpu->fpstate;
- if (!fps)
+ if (!fpstate)
return;
- if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use))
+ if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use))
return;
gfpu->fpstate = NULL;
- vfree(fps);
+ vfree(fpstate);
}
EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
@@ -316,12 +337,12 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
*/
void fpu_sync_guest_vmexit_xfd_state(void)
{
- struct fpstate *fps = current->thread.fpu.fpstate;
+ struct fpstate *fpstate = x86_task_fpu(current)->fpstate;
lockdep_assert_irqs_disabled();
if (fpu_state_size_dynamic()) {
- rdmsrl(MSR_IA32_XFD, fps->xfd);
- __this_cpu_write(xfd_state, fps->xfd);
+ rdmsrq(MSR_IA32_XFD, fpstate->xfd);
+ __this_cpu_write(xfd_state, fpstate->xfd);
}
}
EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
@@ -330,7 +351,7 @@ EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
{
struct fpstate *guest_fps = guest_fpu->fpstate;
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
struct fpstate *cur_fps = fpu->fpstate;
fpregs_lock();
@@ -420,17 +441,19 @@ EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
- preempt_disable();
+ if (!irqs_disabled())
+ fpregs_lock();
WARN_ON_FPU(!irq_fpu_usable());
- WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
- this_cpu_write(in_kernel_fpu, true);
+ /* Toggle kernel_fpu_allowed to false: */
+ WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed));
+ this_cpu_write(kernel_fpu_allowed, false);
if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
- save_fpregs_to_fpstate(&current->thread.fpu);
+ save_fpregs_to_fpstate(x86_task_fpu(current));
}
__cpu_invalidate_fpregs_state();
@@ -445,10 +468,12 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
void kernel_fpu_end(void)
{
- WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+ /* Toggle kernel_fpu_allowed back to true: */
+ WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed));
+ this_cpu_write(kernel_fpu_allowed, true);
- this_cpu_write(in_kernel_fpu, false);
- preempt_enable();
+ if (!irqs_disabled())
+ fpregs_unlock();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
@@ -458,7 +483,7 @@ EXPORT_SYMBOL_GPL(kernel_fpu_end);
*/
void fpu_sync_fpstate(struct fpu *fpu)
{
- WARN_ON_FPU(fpu != &current->thread.fpu);
+ WARN_ON_FPU(fpu != x86_task_fpu(current));
fpregs_lock();
trace_x86_fpu_before_save(fpu);
@@ -499,7 +524,7 @@ static inline void fpstate_init_fstate(struct fpstate *fpstate)
/*
* Used in two places:
* 1) Early boot to setup init_fpstate for non XSAVE systems
- * 2) fpu_init_fpstate_user() which is invoked from KVM
+ * 2) fpu_alloc_guest_fpstate() which is invoked from KVM
*/
void fpstate_init_user(struct fpstate *fpstate)
{
@@ -543,7 +568,7 @@ void fpstate_reset(struct fpu *fpu)
static inline void fpu_inherit_perms(struct fpu *dst_fpu)
{
if (fpu_state_size_dynamic()) {
- struct fpu *src_fpu = &current->group_leader->thread.fpu;
+ struct fpu *src_fpu = x86_task_fpu(current->group_leader);
spin_lock_irq(&current->sighand->siglock);
/* Fork also inherits the permissions of the parent */
@@ -563,7 +588,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
if (!ssp)
return 0;
- xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave,
+ xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave,
XFEATURE_CET_USER);
/*
@@ -584,8 +609,16 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
unsigned long ssp)
{
- struct fpu *src_fpu = &current->thread.fpu;
- struct fpu *dst_fpu = &dst->thread.fpu;
+ /*
+ * We allocate the new FPU structure right after the end of the task struct.
+ * task allocation size already took this into account.
+ *
+ * This is safe because task_struct size is a multiple of cacheline size,
+ * thus x86_task_fpu() will always be cacheline aligned as well.
+ */
+ struct fpu *dst_fpu = (void *)dst + sizeof(*dst);
+
+ BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);
/* The new task's FPU state cannot be valid in the hardware. */
dst_fpu->last_cpu = -1;
@@ -648,19 +681,22 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
if (update_fpu_shstk(dst, ssp))
return 1;
- trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
return 0;
}
/*
- * Whitelist the FPU register state embedded into task_struct for hardened
- * usercopy.
+ * While struct fpu is no longer part of struct thread_struct, it is still
+ * allocated after struct task_struct in the "task_struct" kmem cache. But
+ * since FPU is expected to be part of struct thread_struct, we have to
+ * adjust for it here.
*/
void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
{
- *offset = offsetof(struct thread_struct, fpu.__fpstate.regs);
+ /* The allocation follows struct task_struct. */
+ *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread);
+ *offset += offsetof(struct fpu, __fpstate.regs);
*size = fpu_kernel_cfg.default_size;
}
@@ -673,11 +709,18 @@ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
* a state-restore is coming: either an explicit one,
* or a reschedule.
*/
-void fpu__drop(struct fpu *fpu)
+void fpu__drop(struct task_struct *tsk)
{
+ struct fpu *fpu;
+
+ if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD))
+ return;
+
+ fpu = x86_task_fpu(tsk);
+
preempt_disable();
- if (fpu == &current->thread.fpu) {
+ if (fpu == x86_task_fpu(current)) {
/* Ignore delayed exceptions from user space */
asm volatile("1: fwait\n"
"2:\n"
@@ -709,9 +752,9 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
/*
* Reset current->fpu memory state to the init values.
*/
-static void fpu_reset_fpregs(void)
+static void fpu_reset_fpstate_regs(void)
{
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
fpregs_lock();
__fpu_invalidate_fpregs_state(fpu);
@@ -740,11 +783,11 @@ static void fpu_reset_fpregs(void)
*/
void fpu__clear_user_states(struct fpu *fpu)
{
- WARN_ON_FPU(fpu != &current->thread.fpu);
+ WARN_ON_FPU(fpu != x86_task_fpu(current));
fpregs_lock();
if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
- fpu_reset_fpregs();
+ fpu_reset_fpstate_regs();
fpregs_unlock();
return;
}
@@ -773,8 +816,8 @@ void fpu__clear_user_states(struct fpu *fpu)
void fpu_flush_thread(void)
{
- fpstate_reset(&current->thread.fpu);
- fpu_reset_fpregs();
+ fpstate_reset(x86_task_fpu(current));
+ fpu_reset_fpstate_regs();
}
/*
* Load FPU context before returning to userspace.
@@ -814,7 +857,7 @@ void fpregs_lock_and_load(void)
*/
void fpregs_assert_state_consistent(void)
{
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
if (test_thread_flag(TIF_NEED_FPU_LOAD))
return;
@@ -826,7 +869,7 @@ EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
void fpregs_mark_activate(void)
{
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
fpregs_activate(fpu);
fpu->last_cpu = smp_processor_id();
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 998a08f17e33..99db41bf9fa6 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -38,7 +38,7 @@ static void fpu__init_cpu_generic(void)
/* Flush out any pending x87 state: */
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU))
- fpstate_init_soft(&current->thread.fpu.fpstate->regs.soft);
+ ;
else
#endif
asm volatile ("fninit");
@@ -51,6 +51,9 @@ void fpu__init_cpu(void)
{
fpu__init_cpu_generic();
fpu__init_cpu_xstate();
+
+ /* Start allowing kernel-mode FPU: */
+ this_cpu_write(kernel_fpu_allowed, true);
}
static bool __init fpu__probe_without_cpuid(void)
@@ -73,6 +76,8 @@ static bool __init fpu__probe_without_cpuid(void)
static void __init fpu__init_system_early_generic(void)
{
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+
if (!boot_cpu_has(X86_FEATURE_CPUID) &&
!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
if (fpu__probe_without_cpuid())
@@ -94,7 +99,6 @@ static void __init fpu__init_system_early_generic(void)
* Boot time FPU feature detection code:
*/
unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu;
-EXPORT_SYMBOL_GPL(mxcsr_feature_mask);
static void __init fpu__init_system_mxcsr(void)
{
@@ -150,11 +154,13 @@ static void __init fpu__init_task_struct_size(void)
{
int task_size = sizeof(struct task_struct);
+ task_size += sizeof(struct fpu);
+
/*
* Subtract off the static size of the register state.
* It potentially has a bunch of padding.
*/
- task_size -= sizeof(current->thread.fpu.__fpstate.regs);
+ task_size -= sizeof(union fpregs_state);
/*
* Add back the dynamically-calculated register state
@@ -164,14 +170,9 @@ static void __init fpu__init_task_struct_size(void)
/*
* We dynamically size 'struct fpu', so we require that
- * it be at the end of 'thread_struct' and that
- * 'thread_struct' be at the end of 'task_struct'. If
- * you hit a compile error here, check the structure to
- * see if something got added to the end.
+ * 'state' be at the end of 'it:
*/
CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate);
- CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
- CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
arch_task_struct_size = task_size;
}
@@ -204,7 +205,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
fpu_kernel_cfg.default_size = size;
fpu_user_cfg.max_size = size;
fpu_user_cfg.default_size = size;
- fpstate_reset(&current->thread.fpu);
}
/*
@@ -213,7 +213,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
void __init fpu__init_system(void)
{
- fpstate_reset(&current->thread.fpu);
fpu__init_system_early_generic();
/*
diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h
index dbdb31f55fc7..975de070c9c9 100644
--- a/arch/x86/kernel/fpu/internal.h
+++ b/arch/x86/kernel/fpu/internal.h
@@ -18,7 +18,7 @@ static __always_inline __pure bool use_fxsr(void)
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
-# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; })
#endif
/* Used in init.c */
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 887b0b8e21e3..0986c2200adc 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -45,7 +45,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r
*/
static void sync_fpstate(struct fpu *fpu)
{
- if (fpu == &current->thread.fpu)
+ if (fpu == x86_task_fpu(current))
fpu_sync_fpstate(fpu);
}
@@ -63,7 +63,7 @@ static void fpu_force_restore(struct fpu *fpu)
* Only stopped child tasks can be used to modify the FPU
* state in the fpstate buffer:
*/
- WARN_ON_FPU(fpu == &current->thread.fpu);
+ WARN_ON_FPU(fpu == x86_task_fpu(current));
__fpu_invalidate_fpregs_state(fpu);
}
@@ -71,7 +71,7 @@ static void fpu_force_restore(struct fpu *fpu)
int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
struct membuf to)
{
- struct fpu *fpu = &target->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(target);
if (!cpu_feature_enabled(X86_FEATURE_FXSR))
return -ENODEV;
@@ -91,7 +91,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct fpu *fpu = &target->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(target);
struct fxregs_state newstate;
int ret;
@@ -133,7 +133,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
return -ENODEV;
- sync_fpstate(&target->thread.fpu);
+ sync_fpstate(x86_task_fpu(target));
copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE);
return 0;
@@ -143,7 +143,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct fpu *fpu = &target->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(target);
struct xregs_state *tmpbuf = NULL;
int ret;
@@ -187,7 +187,7 @@ int ssp_active(struct task_struct *target, const struct user_regset *regset)
int ssp_get(struct task_struct *target, const struct user_regset *regset,
struct membuf to)
{
- struct fpu *fpu = &target->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(target);
struct cet_user_state *cetregs;
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
@@ -214,7 +214,7 @@ int ssp_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct fpu *fpu = &target->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(target);
struct xregs_state *xsave = &fpu->fpstate->regs.xsave;
struct cet_user_state *cetregs;
unsigned long user_ssp;
@@ -368,7 +368,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env,
void
convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
- __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave);
+ __convert_from_fxsr(env, tsk, &x86_task_fpu(tsk)->fpstate->regs.fxsave);
}
void convert_to_fxsr(struct fxregs_state *fxsave,
@@ -401,7 +401,7 @@ void convert_to_fxsr(struct fxregs_state *fxsave,
int fpregs_get(struct task_struct *target, const struct user_regset *regset,
struct membuf to)
{
- struct fpu *fpu = &target->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(target);
struct user_i387_ia32_struct env;
struct fxregs_state fxsave, *fx;
@@ -433,7 +433,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct fpu *fpu = &target->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(target);
struct user_i387_ia32_struct env;
int ret;
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 8f62e0666dea..c3ec2512f2bb 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -27,19 +27,14 @@
static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
struct _fpx_sw_bytes *fx_sw)
{
- int min_xstate_size = sizeof(struct fxregs_state) +
- sizeof(struct xstate_header);
void __user *fpstate = fxbuf;
unsigned int magic2;
if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
return false;
- /* Check for the first magic field and other error scenarios. */
- if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
- fx_sw->xstate_size < min_xstate_size ||
- fx_sw->xstate_size > current->thread.fpu.fpstate->user_size ||
- fx_sw->xstate_size > fx_sw->extended_size)
+ /* Check for the first magic field */
+ if (fx_sw->magic1 != FP_XSTATE_MAGIC1)
goto setfx;
/*
@@ -48,13 +43,13 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)))
+ if (__get_user(magic2, (__u32 __user *)(fpstate + x86_task_fpu(current)->fpstate->user_size)))
return false;
if (likely(magic2 == FP_XSTATE_MAGIC2))
return true;
setfx:
- trace_x86_fpu_xstate_check_failed(&current->thread.fpu);
+ trace_x86_fpu_xstate_check_failed(x86_task_fpu(current));
/* Set the parameters for fx only state */
fx_sw->magic1 = 0;
@@ -69,13 +64,13 @@ setfx:
static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
{
if (use_fxsr()) {
- struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave;
+ struct xregs_state *xsave = &x86_task_fpu(tsk)->fpstate->regs.xsave;
struct user_i387_ia32_struct env;
struct _fpstate_32 __user *fp = buf;
fpregs_lock();
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- fxsave(&tsk->thread.fpu.fpstate->regs.fxsave);
+ fxsave(&x86_task_fpu(tsk)->fpstate->regs.fxsave);
fpregs_unlock();
convert_from_fxsr(&env, tsk);
@@ -119,7 +114,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
{
struct xregs_state __user *x = buf;
struct _fpx_sw_bytes sw_bytes = {};
- u32 xfeatures;
int err;
/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
@@ -133,12 +127,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
(__u32 __user *)(buf + fpstate->user_size));
/*
- * Read the xfeatures which we copied (directly from the cpu or
- * from the state in task struct) to the user buffers.
- */
- err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
-
- /*
* For legacy compatible, we always set FP/SSE bits in the bit
* vector while saving the state to the user context. This will
* enable us capturing any changes(during sigreturn) to
@@ -149,9 +137,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
* header as well as change any contents in the memory layout.
* xrestore as part of sigreturn will capture all the changes.
*/
- xfeatures |= XFEATURE_MASK_FPSSE;
-
- err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
+ err |= set_xfeature_in_sigframe(x, XFEATURE_MASK_FPSSE);
return !err;
}
@@ -189,7 +175,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pk
bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru)
{
struct task_struct *tsk = current;
- struct fpstate *fpstate = tsk->thread.fpu.fpstate;
+ struct fpstate *fpstate = x86_task_fpu(tsk)->fpstate;
bool ia32_fxstate = (buf != buf_fx);
int ret;
@@ -277,7 +263,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
*/
static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only)
{
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
int ret;
/* Restore enabled features only. */
@@ -337,7 +323,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
bool ia32_fxstate)
{
struct task_struct *tsk = current;
- struct fpu *fpu = &tsk->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(tsk);
struct user_i387_ia32_struct env;
bool success, fx_only = false;
union fpregs_state *fpregs;
@@ -457,7 +443,7 @@ static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate)
*/
bool fpu__restore_sig(void __user *buf, int ia32_frame)
{
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
void __user *buf_fx = buf;
bool ia32_fxstate = false;
bool success = false;
@@ -504,7 +490,7 @@ unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size)
{
- unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate);
+ unsigned long frame_size = xstate_sigframe_size(x86_task_fpu(current)->fpstate);
*buf_fx = sp = round_down(sp - frame_size, 64);
if (ia32_frame && use_fxsr()) {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 27417b685c1d..9aa9ac8399ae 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -14,13 +14,15 @@
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/coredump.h>
+#include <linux/sort.h>
#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>
-#include <asm/cpuid.h>
+#include <asm/cpuid/api.h>
+#include <asm/msr.h>
#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>
@@ -62,6 +64,7 @@ static const char *xfeature_names[] =
"unknown xstate feature",
"AMX Tile config",
"AMX Tile data",
+ "APX registers",
"unknown xstate feature",
};
@@ -80,6 +83,7 @@ static unsigned short xsave_cpuid_features[] __initdata = {
[XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
[XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
[XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
+ [XFEATURE_APX] = X86_FEATURE_APX,
};
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
@@ -88,6 +92,31 @@ static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
+/*
+ * Ordering of xstate components in uncompacted format: The xfeature
+ * number does not necessarily indicate its position in the XSAVE buffer.
+ * This array defines the traversal order of xstate features.
+ */
+static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+
+static inline unsigned int next_xfeature_order(unsigned int i, u64 mask)
+{
+ for (; xfeature_uncompact_order[i] != -1; i++) {
+ if (mask & BIT_ULL(xfeature_uncompact_order[i]))
+ break;
+ }
+
+ return i;
+}
+
+/* Iterate xstate features in uncompacted order: */
+#define for_each_extended_xfeature_in_order(i, mask) \
+ for (i = 0; \
+ i = next_xfeature_order(i, mask), \
+ xfeature_uncompact_order[i] != -1; \
+ i++)
+
#define XSTATE_FLAG_SUPERVISOR BIT(0)
#define XSTATE_FLAG_ALIGNED64 BIT(1)
@@ -199,7 +228,7 @@ void fpu__init_cpu_xstate(void)
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
*/
if (boot_cpu_has(X86_FEATURE_XSAVES)) {
- wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
xfeatures_mask_independent());
}
}
@@ -209,16 +238,20 @@ static bool xfeature_enabled(enum xfeature xfeature)
return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
}
+static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2)
+{
+ return xstate_offsets[*(unsigned int *)xfeature1] -
+ xstate_offsets[*(unsigned int *)xfeature2];
+}
+
/*
* Record the offsets and sizes of various xstates contained
- * in the XSAVE state memory layout.
+ * in the XSAVE state memory layout. Also, create an ordered
+ * list of xfeatures for handling out-of-order offsets.
*/
static void __init setup_xstate_cache(void)
{
- u32 eax, ebx, ecx, edx, i;
- /* start at the beginning of the "extended state" */
- unsigned int last_good_offset = offsetof(struct xregs_state,
- extended_state_area);
+ u32 eax, ebx, ecx, edx, xfeature, i = 0;
/*
* The FP xstates and SSE xstates are legacy states. They are always
* in the fixed offsets in the xsave area in either compacted form
@@ -232,39 +265,30 @@ static void __init setup_xstate_cache(void)
xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
xmm_space);
- for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
- cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
+ for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) {
+ cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx);
- xstate_sizes[i] = eax;
- xstate_flags[i] = ecx;
+ xstate_sizes[xfeature] = eax;
+ xstate_flags[xfeature] = ecx;
/*
* If an xfeature is supervisor state, the offset in EBX is
* invalid, leave it to -1.
*/
- if (xfeature_is_supervisor(i))
+ if (xfeature_is_supervisor(xfeature))
continue;
- xstate_offsets[i] = ebx;
+ xstate_offsets[xfeature] = ebx;
- /*
- * In our xstate size checks, we assume that the highest-numbered
- * xstate feature has the highest offset in the buffer. Ensure
- * it does.
- */
- WARN_ONCE(last_good_offset > xstate_offsets[i],
- "x86/fpu: misordered xstate at %d\n", last_good_offset);
-
- last_good_offset = xstate_offsets[i];
+ /* Populate the list of xfeatures before sorting */
+ xfeature_uncompact_order[i++] = xfeature;
}
-}
-static void __init print_xstate_feature(u64 xstate_mask)
-{
- const char *feature_name;
-
- if (cpu_has_xfeatures(xstate_mask, &feature_name))
- pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
+ /*
+ * Sort xfeatures by their offsets to support out-of-order
+ * offsets in the uncompacted format.
+ */
+ sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL);
}
/*
@@ -272,19 +296,15 @@ static void __init print_xstate_feature(u64 xstate_mask)
*/
static void __init print_xstate_features(void)
{
- print_xstate_feature(XFEATURE_MASK_FP);
- print_xstate_feature(XFEATURE_MASK_SSE);
- print_xstate_feature(XFEATURE_MASK_YMM);
- print_xstate_feature(XFEATURE_MASK_BNDREGS);
- print_xstate_feature(XFEATURE_MASK_BNDCSR);
- print_xstate_feature(XFEATURE_MASK_OPMASK);
- print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
- print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
- print_xstate_feature(XFEATURE_MASK_PKRU);
- print_xstate_feature(XFEATURE_MASK_PASID);
- print_xstate_feature(XFEATURE_MASK_CET_USER);
- print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
- print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
+ int i;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ u64 mask = BIT_ULL(i);
+ const char *name;
+
+ if (cpu_has_xfeatures(mask, &name))
+ pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
+ }
}
/*
@@ -352,7 +372,8 @@ static __init void os_xrstor_booting(struct xregs_state *xstate)
XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_PASID | \
XFEATURE_MASK_CET_USER | \
- XFEATURE_MASK_XTILE)
+ XFEATURE_MASK_XTILE | \
+ XFEATURE_MASK_APX)
/*
* setup the xstate image representing the init state
@@ -552,6 +573,7 @@ static bool __init check_xstate_against_struct(int nr)
case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
+ case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state);
case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
default:
XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
@@ -564,13 +586,20 @@ static bool __init check_xstate_against_struct(int nr)
static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
{
unsigned int topmost = fls64(xfeatures) - 1;
- unsigned int offset = xstate_offsets[topmost];
+ unsigned int offset, i;
if (topmost <= XFEATURE_SSE)
return sizeof(struct xregs_state);
- if (compacted)
+ if (compacted) {
offset = xfeature_get_offset(xfeatures, topmost);
+ } else {
+ /* Walk through the xfeature order to pick the last */
+ for_each_extended_xfeature_in_order(i, xfeatures)
+ topmost = xfeature_uncompact_order[i];
+ offset = xstate_offsets[topmost];
+ }
+
return offset + xstate_sizes[topmost];
}
@@ -651,7 +680,7 @@ static unsigned int __init get_xsave_compacted_size(void)
return get_compacted_size();
/* Disable independent features. */
- wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor());
/*
* Ask the hardware what size is required of the buffer.
@@ -660,7 +689,7 @@ static unsigned int __init get_xsave_compacted_size(void)
size = get_compacted_size();
/* Re-enable independent features so XSAVES will work on them again. */
- wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
return size;
}
@@ -723,6 +752,8 @@ static int __init init_xstate_size(void)
*/
static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
+ pr_info("x86/fpu: XSAVE disabled\n");
+
fpu_kernel_cfg.max_features = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
@@ -739,7 +770,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
*/
init_fpstate.xfd = 0;
- fpstate_reset(&current->thread.fpu);
+ fpstate_reset(x86_task_fpu(current));
}
/*
@@ -787,6 +818,17 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX &&
+ fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) {
+ /*
+ * This is a problematic CPU configuration where two
+ * conflicting state components are both enumerated.
+ */
+ pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n",
+ fpu_kernel_cfg.max_features);
+ goto out_disable;
+ }
+
fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
XFEATURE_MASK_INDEPENDENT;
@@ -846,9 +888,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
if (err)
goto out_disable;
- /* Reset the state for the current task */
- fpstate_reset(&current->thread.fpu);
-
/*
* Update info used for ptrace frames; use standard-format size and no
* supervisor xstates:
@@ -864,7 +903,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
init_fpstate.xfeatures = fpu_kernel_cfg.default_features;
if (init_fpstate.size > sizeof(init_fpstate.regs)) {
- pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
+ pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n",
sizeof(init_fpstate.regs), init_fpstate.size);
goto out_disable;
}
@@ -876,7 +915,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
* xfeatures mask.
*/
if (xfeatures != fpu_kernel_cfg.max_features) {
- pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
+ pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n",
xfeatures, fpu_kernel_cfg.max_features);
goto out_disable;
}
@@ -916,12 +955,12 @@ void fpu__resume_cpu(void)
* of XSAVES and MSR_IA32_XSS.
*/
if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
- wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
xfeatures_mask_independent());
}
if (fpu_state_size_dynamic())
- wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
+ wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd);
}
/*
@@ -1083,10 +1122,9 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
struct xregs_state *xinit = &init_fpstate.regs.xsave;
struct xregs_state *xsave = &fpstate->regs.xsave;
+ unsigned int zerofrom, i, xfeature;
struct xstate_header header;
- unsigned int zerofrom;
u64 mask;
- int i;
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
@@ -1155,15 +1193,16 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
*/
mask = header.xfeatures;
- for_each_extended_xfeature(i, mask) {
+ for_each_extended_xfeature_in_order(i, mask) {
+ xfeature = xfeature_uncompact_order[i];
/*
* If there was a feature or alignment gap, zero the space
* in the destination buffer.
*/
- if (zerofrom < xstate_offsets[i])
- membuf_zero(&to, xstate_offsets[i] - zerofrom);
+ if (zerofrom < xstate_offsets[xfeature])
+ membuf_zero(&to, xstate_offsets[xfeature] - zerofrom);
- if (i == XFEATURE_PKRU) {
+ if (xfeature == XFEATURE_PKRU) {
struct pkru_state pkru = {0};
/*
* PKRU is not necessarily up to date in the
@@ -1173,14 +1212,14 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
membuf_write(&to, &pkru, sizeof(pkru));
} else {
membuf_write(&to,
- __raw_xsave_addr(xsave, i),
- xstate_sizes[i]);
+ __raw_xsave_addr(xsave, xfeature),
+ xstate_sizes[xfeature]);
}
/*
* Keep track of the last copied state in the non-compacted
* target buffer for gap zeroing.
*/
- zerofrom = xstate_offsets[i] + xstate_sizes[i];
+ zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature];
}
out:
@@ -1203,8 +1242,8 @@ out:
void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode copy_mode)
{
- __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
- tsk->thread.fpu.fpstate->user_xfeatures,
+ __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate,
+ x86_task_fpu(tsk)->fpstate->user_xfeatures,
tsk->thread.pkru, copy_mode);
}
@@ -1344,7 +1383,7 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u
int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
const void __user *ubuf)
{
- return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
+ return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru);
}
static bool validate_independent_components(u64 mask)
@@ -1410,9 +1449,9 @@ void xrstors(struct xregs_state *xstate, u64 mask)
}
#if IS_ENABLED(CONFIG_KVM)
-void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
+void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature)
{
- void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
+ void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature);
if (addr)
memset(addr, 0, xstate_sizes[xfeature]);
@@ -1438,7 +1477,7 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
* The XFD MSR does not match fpstate->xfd. That's invalid when
* the passed in fpstate is current's fpstate.
*/
- if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
+ if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd)
return false;
/*
@@ -1515,7 +1554,7 @@ void fpstate_free(struct fpu *fpu)
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
unsigned int usize, struct fpu_guest *guest_fpu)
{
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
struct fpstate *curfps, *newfps = NULL;
unsigned int fpsize;
bool in_use;
@@ -1608,7 +1647,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
* AVX512.
*/
bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
- struct fpu *fpu = &current->group_leader->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current->group_leader);
struct fpu_state_perm *perm;
unsigned int ksize, usize;
u64 mask;
@@ -1618,16 +1657,20 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
if ((permitted & requested) == requested)
return 0;
- /* Calculate the resulting kernel state size */
+ /*
+ * Calculate the resulting kernel state size. Note, @permitted also
+ * contains supervisor xfeatures even though supervisor are always
+ * permitted for kernel and guest FPUs, and never permitted for user
+ * FPUs.
+ */
mask = permitted | requested;
- /* Take supervisor states into account on the host */
- if (!guest)
- mask |= xfeatures_mask_supervisor();
ksize = xstate_calculate_size(mask, compacted);
- /* Calculate the resulting user state size */
- mask &= XFEATURE_MASK_USER_SUPPORTED;
- usize = xstate_calculate_size(mask, false);
+ /*
+ * Calculate the resulting user state size. Take care not to clobber
+ * the supervisor xfeatures in the new mask!
+ */
+ usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false);
if (!guest) {
ret = validate_sigaltstack(usize);
@@ -1711,7 +1754,7 @@ int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
return -EPERM;
}
- fpu = &current->group_leader->thread.fpu;
+ fpu = x86_task_fpu(current->group_leader);
perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
ksize = perm->__state_size;
usize = perm->__user_state_size;
@@ -1816,7 +1859,7 @@ long fpu_xstate_prctl(int option, unsigned long arg2)
*/
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
- unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
+ unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
long delta;
if (!timestamp) {
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index aa16f1a1bbcf..52ce19289989 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -5,6 +5,7 @@
#include <asm/cpufeature.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>
+#include <asm/msr.h>
#ifdef CONFIG_X86_64
DECLARE_PER_CPU(u64, xfd_state);
@@ -22,7 +23,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
static inline u64 xstate_get_group_perm(bool guest)
{
- struct fpu *fpu = &current->group_leader->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current->group_leader);
struct fpu_state_perm *perm;
/* Pairs with WRITE_ONCE() in xstate_request_perm() */
@@ -69,21 +70,31 @@ static inline u64 xfeatures_mask_independent(void)
return fpu_kernel_cfg.independent_features;
}
+static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask)
+{
+ u64 xfeatures;
+ int err;
+
+ /* Read the xfeatures value already saved in the user buffer */
+ err = __get_user(xfeatures, &xbuf->header.xfeatures);
+ xfeatures |= mask;
+ err |= __put_user(xfeatures, &xbuf->header.xfeatures);
+
+ return err;
+}
+
/*
* Update the value of PKRU register that was already pushed onto the signal frame.
*/
-static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru)
+static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
{
- u64 xstate_bv;
int err;
if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
return 0;
/* Mark PKRU as in-use so that it is restored correctly. */
- xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU;
-
- err = __put_user(xstate_bv, &buf->header.xfeatures);
+ err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU);
if (err)
return err;
@@ -94,30 +105,33 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma
/* XSAVE/XRSTOR wrapper functions */
#ifdef CONFIG_X86_64
-#define REX_PREFIX "0x48, "
+#define REX_SUFFIX "64"
#else
-#define REX_PREFIX
+#define REX_SUFFIX
#endif
-/* These macros all use (%edi)/(%rdi) as the single memory argument. */
-#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
-#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
-#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27"
-#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
-#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
-#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+#define XSAVE "xsave" REX_SUFFIX " %[xa]"
+#define XSAVEOPT "xsaveopt" REX_SUFFIX " %[xa]"
+#define XSAVEC "xsavec" REX_SUFFIX " %[xa]"
+#define XSAVES "xsaves" REX_SUFFIX " %[xa]"
+#define XRSTOR "xrstor" REX_SUFFIX " %[xa]"
+#define XRSTORS "xrstors" REX_SUFFIX " %[xa]"
/*
* After this @err contains 0 on success or the trap number when the
* operation raises an exception.
+ *
+ * The [xa] input parameter below represents the struct xregs_state pointer
+ * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns
+ * above.
*/
#define XSTATE_OP(op, st, lmask, hmask, err) \
asm volatile("1:" op "\n\t" \
"xor %[err], %[err]\n" \
- "2:\n\t" \
+ "2:\n" \
_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
: [err] "=a" (err) \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
: "memory")
/*
@@ -137,12 +151,12 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma
XSAVEOPT, X86_FEATURE_XSAVEOPT, \
XSAVEC, X86_FEATURE_XSAVEC, \
XSAVES, X86_FEATURE_XSAVES) \
- "\n" \
+ "\n\t" \
"xor %[err], %[err]\n" \
"3:\n" \
_ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
: [err] "=r" (err) \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
: "memory")
/*
@@ -156,7 +170,7 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma
"3:\n" \
_ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \
: \
- : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
: "memory")
#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
@@ -168,7 +182,7 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs
#ifdef CONFIG_X86_64
static inline void xfd_set_state(u64 xfd)
{
- wrmsrl(MSR_IA32_XFD, xfd);
+ wrmsrq(MSR_IA32_XFD, xfd);
__this_cpu_write(xfd_state, xfd);
}
@@ -285,7 +299,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr
* internally, e.g. PKRU. That's user space ABI and also required
* to allow the signal handler to modify PKRU.
*/
- struct fpstate *fpstate = current->thread.fpu.fpstate;
+ struct fpstate *fpstate = x86_task_fpu(current)->fpstate;
u64 mask = fpstate->user_xfeatures;
u32 lmask;
u32 hmask;
@@ -304,7 +318,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr
clac();
if (!err)
- err = update_pkru_in_sigframe(buf, mask, pkru);
+ err = update_pkru_in_sigframe(buf, pkru);
return err;
}
@@ -319,7 +333,7 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64
u32 hmask = mask >> 32;
int err;
- xfd_validate_state(current->thread.fpu.fpstate, mask, true);
+ xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true);
stac();
XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
index 5e2cd1004980..816187da3a47 100644
--- a/arch/x86/kernel/fred.c
+++ b/arch/x86/kernel/fred.c
@@ -3,6 +3,7 @@
#include <asm/desc.h>
#include <asm/fred.h>
+#include <asm/msr.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
@@ -43,23 +44,23 @@ void cpu_init_fred_exceptions(void)
*/
loadsegment(ss, __KERNEL_DS);
- wrmsrl(MSR_IA32_FRED_CONFIG,
+ wrmsrq(MSR_IA32_FRED_CONFIG,
/* Reserve for CALL emulation */
FRED_CONFIG_REDZONE |
FRED_CONFIG_INT_STKLVL(0) |
FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user));
- wrmsrl(MSR_IA32_FRED_STKLVLS, 0);
+ wrmsrq(MSR_IA32_FRED_STKLVLS, 0);
/*
* Ater a CPU offline/online cycle, the FRED RSP0 MSR should be
* resynchronized with its per-CPU cache.
*/
- wrmsrl(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0));
+ wrmsrq(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0));
- wrmsrl(MSR_IA32_FRED_RSP1, 0);
- wrmsrl(MSR_IA32_FRED_RSP2, 0);
- wrmsrl(MSR_IA32_FRED_RSP3, 0);
+ wrmsrq(MSR_IA32_FRED_RSP1, 0);
+ wrmsrq(MSR_IA32_FRED_RSP2, 0);
+ wrmsrq(MSR_IA32_FRED_RSP3, 0);
/* Enable FRED */
cr4_set_bits(X86_CR4_FRED);
@@ -79,14 +80,14 @@ void cpu_init_fred_rsps(void)
* (remember that user space faults are always taken on stack level 0)
* is to avoid overflowing the kernel stack.
*/
- wrmsrl(MSR_IA32_FRED_STKLVLS,
+ wrmsrq(MSR_IA32_FRED_STKLVLS,
FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) |
FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) |
FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) |
FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL));
/* The FRED equivalents to IST stacks... */
- wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
- wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
- wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
+ wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
+ wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
+ wrmsrq(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 166bc0ea3bdf..252e82bcfd2f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -55,10 +55,10 @@ void ftrace_arch_code_modify_post_process(void)
{
/*
* ftrace_make_{call,nop}() may be called during
- * module load, and we need to finish the text_poke_queue()
+ * module load, and we need to finish the smp_text_poke_batch_add()
* that they do, here.
*/
- text_poke_finish();
+ smp_text_poke_batch_finish();
ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
}
@@ -118,13 +118,10 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code,
return ret;
/* replace the text with the new text */
- if (ftrace_poke_late) {
- text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
- } else {
- mutex_lock(&text_mutex);
- text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE);
- mutex_unlock(&text_mutex);
- }
+ if (ftrace_poke_late)
+ smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}
@@ -189,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
ip = (unsigned long)(&ftrace_call);
new = ftrace_call_replace(ip, (unsigned long)func);
- text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
ip = (unsigned long)(&ftrace_regs_call);
new = ftrace_call_replace(ip, (unsigned long)func);
- text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
@@ -250,10 +247,10 @@ void ftrace_replace_code(int enable)
break;
}
- text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
ftrace_update_record(rec, enable);
}
- text_poke_finish();
+ smp_text_poke_batch_finish();
}
void arch_ftrace_update_code(int command)
@@ -321,7 +318,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
union ftrace_op_code_union op_ptr;
- void *ret;
+ int ret;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
start_offset = (unsigned long)ftrace_regs_caller;
@@ -352,15 +349,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
/* Copy ftrace_caller onto the trampoline memory */
- ret = text_poke_copy(trampoline, (void *)start_offset, size);
- if (WARN_ON(!ret))
+ ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size);
+ if (WARN_ON(ret < 0))
goto fail;
ip = trampoline + size;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk_at(ip))
__text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
else
- text_poke_copy(ip, retq, sizeof(retq));
+ memcpy(ip, retq, sizeof(retq));
/* No need to test direct calls on created trampolines */
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
@@ -368,7 +365,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
ip = trampoline + (jmp_offset - start_offset);
if (WARN_ON(*(char *)ip != 0x75))
goto fail;
- if (!text_poke_copy(ip, x86_nops[2], 2))
+ ret = copy_from_kernel_nofault(ip, x86_nops[2], 2);
+ if (ret < 0)
goto fail;
}
@@ -381,7 +379,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
*/
ptr = (unsigned long *)(trampoline + size + RET_SIZE);
- text_poke_copy(ptr, &ops, sizeof(unsigned long));
+ *ptr = (unsigned long)ops;
op_offset -= start_offset;
memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
@@ -397,7 +395,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
op_ptr.offset = offset;
/* put in the new offset to the ftrace_ops */
- text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
/* put in the call to the function */
mutex_lock(&text_mutex);
@@ -407,9 +405,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
* the depth accounting before the call already.
*/
dest = ftrace_ops_get_func(ops);
- text_poke_copy_locked(trampoline + call_offset,
- text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
- CALL_INSN_SIZE, false);
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+ CALL_INSN_SIZE);
mutex_unlock(&text_mutex);
/* ALLOC_TRAMP flags lets us know we created it */
@@ -494,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
mutex_lock(&text_mutex);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
- text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
mutex_unlock(&text_mutex);
}
@@ -588,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func)
const char *new;
new = ftrace_jmp_replace(ip, (unsigned long)func);
- text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index d51647228596..367da3638167 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -146,12 +146,14 @@ SYM_FUNC_END(ftrace_stub_graph)
#ifdef CONFIG_DYNAMIC_FTRACE
SYM_FUNC_START(__fentry__)
+ ANNOTATE_NOENDBR
CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
SYM_FUNC_START(ftrace_caller)
+ ANNOTATE_NOENDBR
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
@@ -197,6 +199,7 @@ SYM_FUNC_END(ftrace_caller);
STACK_FRAME_NON_STANDARD_FP(ftrace_caller)
SYM_FUNC_START(ftrace_regs_caller)
+ ANNOTATE_NOENDBR
/* Save the current flags before any operations that can change them */
pushfq
@@ -310,6 +313,7 @@ SYM_FUNC_END(ftrace_regs_caller)
STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
SYM_FUNC_START(ftrace_stub_direct_tramp)
+ ANNOTATE_NOENDBR
CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(ftrace_stub_direct_tramp)
@@ -317,6 +321,7 @@ SYM_FUNC_END(ftrace_stub_direct_tramp)
#else /* ! CONFIG_DYNAMIC_FTRACE */
SYM_FUNC_START(__fentry__)
+ ANNOTATE_NOENDBR
CALL_DEPTH_ACCOUNT
cmpq $ftrace_stub, ftrace_trace_function
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index de001b2146ab..375f2d7f1762 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void)
*ptr = (unsigned long)ptep + PAGE_OFFSET;
#ifdef CONFIG_MICROCODE_INITRD32
- /* Running on a hypervisor? */
- if (native_cpuid_ecx(1) & BIT(31))
- return;
-
params = (struct boot_params *)__pa_nodebug(&boot_params);
if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
return;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 22c9ba305ac1..533fcf5636fc 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -5,8 +5,6 @@
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
*/
-#define DISABLE_BRANCH_PROFILING
-
/* cpu_feature_enabled() cannot be used this early */
#define USE_EARLY_PGTABLE_L5
@@ -49,234 +47,22 @@
* Manage page tables very early on.
*/
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt;
+unsigned int __initdata next_early_pgt;
+SYM_PIC_ALIAS(next_early_pgt);
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
-#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled __ro_after_init;
unsigned int pgdir_shift __ro_after_init = 39;
EXPORT_SYMBOL(pgdir_shift);
unsigned int ptrs_per_p4d __ro_after_init = 1;
EXPORT_SYMBOL(ptrs_per_p4d);
-#endif
-#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
EXPORT_SYMBOL(page_offset_base);
unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
EXPORT_SYMBOL(vmalloc_base);
unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base);
-#endif
-
-static inline bool check_la57_support(void)
-{
- if (!IS_ENABLED(CONFIG_X86_5LEVEL))
- return false;
-
- /*
- * 5-level paging is detected and enabled at kernel decompression
- * stage. Only check if it has been enabled there.
- */
- if (!(native_read_cr4() & X86_CR4_LA57))
- return false;
-
- RIP_REL_REF(__pgtable_l5_enabled) = 1;
- RIP_REL_REF(pgdir_shift) = 48;
- RIP_REL_REF(ptrs_per_p4d) = 512;
- RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5;
- RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5;
- RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5;
-
- return true;
-}
-
-static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
- pmdval_t *pmd,
- unsigned long p2v_offset)
-{
- unsigned long paddr, paddr_end;
- int i;
-
- /* Encrypt the kernel and related (if SME is active) */
- sme_encrypt_kernel(bp);
-
- /*
- * Clear the memory encryption mask from the .bss..decrypted section.
- * The bss section will be memset to zero later in the initialization so
- * there is no need to zero it after changing the memory encryption
- * attribute.
- */
- if (sme_get_me_mask()) {
- paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted);
- paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted);
-
- for (; paddr < paddr_end; paddr += PMD_SIZE) {
- /*
- * On SNP, transition the page to shared in the RMP table so that
- * it is consistent with the page table attribute change.
- *
- * __start_bss_decrypted has a virtual address in the high range
- * mapping (kernel .text). PVALIDATE, by way of
- * early_snp_set_memory_shared(), requires a valid virtual
- * address but the kernel is currently running off of the identity
- * mapping so use the PA to get a *currently* valid virtual address.
- */
- early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
-
- i = pmd_index(paddr - p2v_offset);
- pmd[i] -= sme_get_me_mask();
- }
- }
-
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
-}
-
-/* Code in __startup_64() can be relocated during execution, but the compiler
- * doesn't have to generate PC-relative relocations when accessing globals from
- * that function. Clang actually does not generate them, which leads to
- * boot-time crashes. To work around this problem, every global pointer must
- * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
- * by subtracting p2v_offset from the RIP-relative address.
- */
-unsigned long __head __startup_64(unsigned long p2v_offset,
- struct boot_params *bp)
-{
- pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
- unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
- unsigned long va_text, va_end;
- unsigned long pgtable_flags;
- unsigned long load_delta;
- pgdval_t *pgd;
- p4dval_t *p4d;
- pudval_t *pud;
- pmdval_t *pmd, pmd_entry;
- bool la57;
- int i;
-
- la57 = check_la57_support();
-
- /* Is the address too large? */
- if (physaddr >> MAX_PHYSMEM_BITS)
- for (;;);
-
- /*
- * Compute the delta between the address I am compiled to run at
- * and the address I am actually running at.
- */
- load_delta = __START_KERNEL_map + p2v_offset;
- RIP_REL_REF(phys_base) = load_delta;
-
- /* Is the address not 2M aligned? */
- if (load_delta & ~PMD_MASK)
- for (;;);
-
- va_text = physaddr - p2v_offset;
- va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset;
-
- /* Include the SME encryption mask in the fixup value */
- load_delta += sme_get_me_mask();
-
- /* Fixup the physical addresses in the page table */
-
- pgd = &RIP_REL_REF(early_top_pgt)->pgd;
- pgd[pgd_index(__START_KERNEL_map)] += load_delta;
-
- if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
- p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
- p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
-
- pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
- }
-
- RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
- RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;
-
- for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
- RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;
-
- /*
- * Set up the identity mapping for the switchover. These
- * entries should *NOT* have the global bit set! This also
- * creates a bunch of nonsense entries but that is fine --
- * it avoids problems around wraparound.
- */
-
- pud = &early_pgts[0]->pmd;
- pmd = &early_pgts[1]->pmd;
- RIP_REL_REF(next_early_pgt) = 2;
-
- pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
-
- if (la57) {
- p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;
-
- i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
- pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
-
- i = physaddr >> P4D_SHIFT;
- p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
- p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
- } else {
- i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
- pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
- pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
- }
-
- i = physaddr >> PUD_SHIFT;
- pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
- pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
-
- pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
- /* Filter out unsupported __PAGE_KERNEL_* bits: */
- pmd_entry &= RIP_REL_REF(__supported_pte_mask);
- pmd_entry += sme_get_me_mask();
- pmd_entry += physaddr;
-
- for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
- int idx = i + (physaddr >> PMD_SHIFT);
-
- pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
- }
-
- /*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
- *
- * Only the region occupied by the kernel image has so far
- * been checked against the table of usable memory regions
- * provided by the firmware, so invalidate pages outside that
- * region. A page table entry that maps to a reserved area of
- * memory would allow processor speculation into that area,
- * and on some hardware (particularly the UV platform) even
- * speculative access to some reserved areas is caught as an
- * error, causing the BIOS to halt the system.
- */
-
- pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
-
- /* invalidate pages before the kernel image */
- for (i = 0; i < pmd_index(va_text); i++)
- pmd[i] &= ~_PAGE_PRESENT;
-
- /* fixup pages that are part of the kernel image */
- for (; i <= pmd_index(va_end); i++)
- if (pmd[i] & _PAGE_PRESENT)
- pmd[i] += load_delta;
-
- /* invalidate pages after the kernel image */
- for (; i < PTRS_PER_PMD; i++)
- pmd[i] &= ~_PAGE_PRESENT;
-
- return sme_postprocess_startup(bp, pmd, p2v_offset);
-}
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
@@ -451,6 +237,12 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode
/* Kill off the identity-map trampoline */
reset_early_page_tables();
+ if (pgtable_l5_enabled()) {
+ page_offset_base = __PAGE_OFFSET_BASE_L5;
+ vmalloc_base = __VMALLOC_BASE_L5;
+ vmemmap_base = __VMEMMAP_BASE_L5;
+ }
+
clear_bss();
/*
@@ -515,41 +307,6 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data)
start_kernel();
}
-/*
- * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is
- * used until the idt_table takes over. On the boot CPU this happens in
- * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases
- * this happens in the functions called from head_64.S.
- *
- * The idt_table can't be used that early because all the code modifying it is
- * in idt.c and can be instrumented by tracing or KASAN, which both don't work
- * during early CPU bringup. Also the idt_table has the runtime vectors
- * configured which require certain CPU state to be setup already (like TSS),
- * which also hasn't happened yet in early CPU bringup.
- */
-static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
-
-/* This may run while still in the direct mapping */
-static void __head startup_64_load_idt(void *vc_handler)
-{
- struct desc_ptr desc = {
- .address = (unsigned long)&RIP_REL_REF(bringup_idt_table),
- .size = sizeof(bringup_idt_table) - 1,
- };
- struct idt_data data;
- gate_desc idt_desc;
-
- /* @vc_handler is set only for a VMM Communication Exception */
- if (vc_handler) {
- init_idt_data(&data, X86_TRAP_VC, vc_handler);
- idt_init_desc(&idt_desc, &data);
- native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc);
- }
-
- native_load_idt(&desc);
-}
-
-/* This is used when running on kernel addresses */
void early_setup_idt(void)
{
void *handler = NULL;
@@ -561,30 +318,3 @@ void early_setup_idt(void)
startup_64_load_idt(handler);
}
-
-/*
- * Setup boot CPU state needed before kernel switches to virtual addresses.
- */
-void __head startup_64_setup_gdt_idt(void)
-{
- struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt);
- void *handler = NULL;
-
- struct desc_ptr startup_gdt_descr = {
- .address = (unsigned long)&RIP_REL_REF(*gdt),
- .size = GDT_SIZE - 1,
- };
-
- /* Load GDT */
- native_load_gdt(&startup_gdt_descr);
-
- /* New GDT is live - reload data segment registers */
- asm volatile("movl %%eax, %%ds\n"
- "movl %%eax, %%ss\n"
- "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
- handler = &RIP_REL_REF(vc_no_ghcb);
-
- startup_64_load_idt(handler);
-}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 2e42056d2306..76743dfad6ab 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -86,7 +86,7 @@ SYM_CODE_START(startup_32)
movl $pa(__bss_stop),%ecx
subl %edi,%ecx
shrl $2,%ecx
- rep ; stosl
+ rep stosl
/*
* Copy bootup parameters out of the way.
* Note: %esi still has the pointer to the real-mode data.
@@ -98,15 +98,13 @@ SYM_CODE_START(startup_32)
movl $pa(boot_params),%edi
movl $(PARAM_SIZE/4),%ecx
cld
- rep
- movsl
+ rep movsl
movl pa(boot_params) + NEW_CL_POINTER,%esi
andl %esi,%esi
jz 1f # No command line
movl $pa(boot_command_line),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
- rep
- movsl
+ rep movsl
1:
#ifdef CONFIG_OLPC
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 31345e0ba006..3e9b3a3bd039 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -61,11 +61,14 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Set up the stack for verify_cpu() */
leaq __top_init_kernel_stack(%rip), %rsp
- /* Setup GSBASE to allow stack canary access for C code */
+ /*
+ * Set up GSBASE.
+ * Note that on SMP the boot CPU uses the init data section until
+ * the per-CPU areas are set up.
+ */
movl $MSR_GS_BASE, %ecx
- leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
- movl %edx, %eax
- shrq $32, %rdx
+ xorl %eax, %eax
+ xorl %edx, %edx
wrmsr
call startup_64_setup_gdt_idt
@@ -319,7 +322,7 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
*
* RDX contains the per-cpu offset
*/
- movq pcpu_hot + X86_current_task(%rdx), %rax
+ movq current_task(%rdx), %rax
movq TASK_threadsp(%rax), %rsp
/*
@@ -359,17 +362,12 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
movl %eax,%fs
movl %eax,%gs
- /* Set up %gs.
- *
- * The base of %gs always points to fixed_percpu_data. If the
- * stack protector canary is enabled, it is located at %gs:40.
+ /*
+ * Set up GSBASE.
* Note that, on SMP, the boot cpu uses init data section until
* the per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
-#ifndef CONFIG_SMP
- leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
-#endif
movl %edx, %eax
shrq $32, %rdx
wrmsr
@@ -435,7 +433,7 @@ SYM_CODE_START(soft_restart_cpu)
UNWIND_HINT_END_OF_STACK
/* Find the idle task stack */
- movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
+ movq PER_CPU_VAR(current_task), %rcx
movq TASK_threadsp(%rcx), %rsp
jmp .Ljump_to_C_code
@@ -575,6 +573,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
/* Pure iret required here - don't use INTERRUPT_RETURN */
iretq
SYM_CODE_END(vc_no_ghcb)
+SYM_PIC_ALIAS(vc_no_ghcb);
#endif
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
@@ -606,10 +605,12 @@ SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
SYM_DATA_END(early_top_pgt)
+SYM_PIC_ALIAS(early_top_pgt)
SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
SYM_DATA_END(early_dynamic_pgts)
+SYM_PIC_ALIAS(early_dynamic_pgts);
SYM_DATA(early_recursion_flag, .long 0)
@@ -648,12 +649,11 @@ SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
SYM_DATA_END(init_top_pgt)
#endif
-#ifdef CONFIG_X86_5LEVEL
SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
SYM_DATA_END(level4_kernel_pgt)
-#endif
+SYM_PIC_ALIAS(level4_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
@@ -661,6 +661,7 @@ SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
SYM_DATA_END(level3_kernel_pgt)
+SYM_PIC_ALIAS(level3_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
/*
@@ -678,6 +679,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
*/
PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
SYM_DATA_END(level2_kernel_pgt)
+SYM_PIC_ALIAS(level2_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
.fill (512 - 4 - FIXMAP_PMD_NUM),8,0
@@ -690,6 +692,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
/* 6 MB reserved space + a 2MB hole */
.fill 4,8,0
SYM_DATA_END(level2_fixmap_pgt)
+SYM_PIC_ALIAS(level2_fixmap_pgt)
SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
.rept (FIXMAP_PMD_NUM)
@@ -705,6 +708,7 @@ SYM_DATA(smpboot_control, .long 0)
.align 16
/* This must match the first entry in level2_kernel_pgt */
SYM_DATA(phys_base, .quad 0x0)
+SYM_PIC_ALIAS(phys_base);
EXPORT_SYMBOL(phys_base)
#include "../xen/xen-head.S"
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7f4b2966e15c..d6387dde3ff9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -7,11 +7,12 @@
#include <linux/cpu.h>
#include <linux/irq.h>
-#include <asm/cpuid.h>
+#include <asm/cpuid/api.h>
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
#include <asm/mwait.h>
+#include <asm/msr.h>
#undef pr_fmt
#define pr_fmt(fmt) "hpet: " fmt
@@ -970,7 +971,7 @@ static bool __init hpet_is_pc10_damaged(void)
return false;
/* Check whether PC10 is enabled in PKG C-state limit */
- rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
+ rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
if ((pcfg & 0xF) < 8)
return false;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 80e262bb627f..cb9852ad6098 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -46,7 +46,8 @@ bool __init pit_timer_init(void)
* VMMs otherwise steal CPU time just to pointlessly waggle
* the (masked) IRQ.
*/
- clockevent_i8253_disable();
+ scoped_guard(irq)
+ clockevent_i8253_disable();
return false;
}
clockevent_i8253_init(true);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index c20d1832c481..2bade73f49e3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -23,6 +23,7 @@
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/i8259.h>
+#include <asm/io_apic.h>
/*
* This is the 'legacy' 8259A Programmable Interrupt Controller,
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index e2fab3ceb09f..6290dd120f5e 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -144,7 +144,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
* Update the sequence number to force a TSS update on return to
* user mode.
*/
- iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence);
+ iobm->sequence = atomic64_inc_return(&io_bitmap_sequence);
return 0;
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 385e3a5fc304..81f9b78e0f7b 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -25,12 +25,19 @@
#include <asm/posted_intr.h>
#include <asm/irq_remapping.h>
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR)
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
+#endif
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
+DEFINE_PER_CPU_CACHE_HOT(u16, __softirq_pending);
+EXPORT_PER_CPU_SYMBOL(__softirq_pending);
+
+DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr);
+
atomic_t irq_err_count;
/*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index dc1049c01f9b..c7a5d2960d57 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -29,12 +29,9 @@
int sysctl_panic_on_stackoverflow __read_mostly;
/* Debugging check for stack overflow: is there less than 1KB free? */
-static int check_stack_overflow(void)
+static bool check_stack_overflow(void)
{
- long sp;
-
- __asm__ __volatile__("andl %%esp,%0" :
- "=r" (sp) : "0" (THREAD_SIZE - 1));
+ unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1);
return sp < (sizeof(struct thread_info) + STACK_WARN);
}
@@ -48,18 +45,19 @@ static void print_stack_overflow(void)
}
#else
-static inline int check_stack_overflow(void) { return 0; }
+static inline bool check_stack_overflow(void) { return false; }
static inline void print_stack_overflow(void) { }
#endif
+DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr);
+
static void call_on_stack(void *func, void *stack)
{
- asm volatile("xchgl %%ebx,%%esp \n"
+ asm volatile("xchgl %[sp], %%esp\n"
CALL_NOSPEC
- "movl %%ebx,%%esp \n"
- : "=b" (stack)
- : "0" (stack),
- [thunk_target] "D"(func)
+ "movl %[sp], %%esp"
+ : [sp] "+b" (stack)
+ : [thunk_target] "D" (func)
: "memory", "cc", "edx", "ecx", "eax");
}
@@ -68,13 +66,13 @@ static inline void *current_stack(void)
return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
}
-static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
+static inline bool execute_on_irq_stack(bool overflow, struct irq_desc *desc)
{
struct irq_stack *curstk, *irqstk;
- u32 *isp, *prev_esp, arg1;
+ u32 *isp, *prev_esp;
curstk = (struct irq_stack *) current_stack();
- irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr);
+ irqstk = __this_cpu_read(hardirq_stack_ptr);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -83,7 +81,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
* current stack (which is the irq stack already after all)
*/
if (unlikely(curstk == irqstk))
- return 0;
+ return false;
isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
@@ -94,14 +92,13 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
- asm volatile("xchgl %%ebx,%%esp \n"
+ asm volatile("xchgl %[sp], %%esp\n"
CALL_NOSPEC
- "movl %%ebx,%%esp \n"
- : "=a" (arg1), "=b" (isp)
- : "0" (desc), "1" (isp),
- [thunk_target] "D" (desc->handle_irq)
- : "memory", "cc", "ecx");
- return 1;
+ "movl %[sp], %%esp"
+ : "+a" (desc), [sp] "+b" (isp)
+ : [thunk_target] "D" (desc->handle_irq)
+ : "memory", "cc", "edx", "ecx");
+ return true;
}
/*
@@ -112,7 +109,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
int node = cpu_to_node(cpu);
struct page *ph, *ps;
- if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu))
+ if (per_cpu(hardirq_stack_ptr, cpu))
return 0;
ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
@@ -124,8 +121,8 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return -ENOMEM;
}
- per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph);
- per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps);
+ per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
+ per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
return 0;
}
@@ -135,7 +132,7 @@ void do_softirq_own_stack(void)
struct irq_stack *irqstk;
u32 *isp, *prev_esp;
- irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr);
+ irqstk = __this_cpu_read(softirq_stack_ptr);
/* build the stack frame on the softirq stack */
isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
@@ -150,7 +147,7 @@ void do_softirq_own_stack(void)
void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
- int overflow = check_stack_overflow();
+ bool overflow = check_stack_overflow();
if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) {
if (unlikely(overflow))
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index ade0043ce56e..ca78dce39361 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,8 +26,8 @@
#include <asm/io_apic.h>
#include <asm/apic.h>
+DEFINE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse);
DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible;
-DECLARE_INIT_PER_CPU(irq_stack_backing_store);
#ifdef CONFIG_VMAP_STACK
/*
@@ -51,7 +51,7 @@ static int map_irq_stack(unsigned int cpu)
return -ENOMEM;
/* Store actual TOS to avoid adjustment in the hotpath */
- per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#else
@@ -64,14 +64,14 @@ static int map_irq_stack(unsigned int cpu)
void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
/* Store actual TOS to avoid adjustment in the hotpath */
- per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#endif
int irq_init_percpu_irqstack(unsigned int cpu)
{
- if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu))
+ if (per_cpu(hardirq_stack_ptr, cpu))
return 0;
return map_irq_stack(cpu);
}
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 7f542a7799cb..fdabd5dda154 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -9,6 +9,7 @@
*/
.pushsection .noinstr.text, "ax"
SYM_FUNC_START(native_save_fl)
+ ENDBR
pushf
pop %_ASM_AX
RET
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index cd8ed1edbf9e..9e9a591a5fec 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -49,7 +49,7 @@ static uint32_t jailhouse_cpuid_base(void)
!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return 0;
- return hypervisor_cpuid_base("Jailhouse\0\0\0", 0);
+ return cpuid_base_hypervisor("Jailhouse\0\0\0", 0);
}
static uint32_t __init jailhouse_detect(void)
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index f5b8ef02d172..a7949a54a0ff 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry,
return;
}
- text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
+ smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
}
static void __ref jump_label_transform(struct jump_entry *entry,
@@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
mutex_lock(&text_mutex);
jlp = __jump_label_patch(entry, type);
- text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
+ smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
mutex_unlock(&text_mutex);
return true;
}
@@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
void arch_jump_label_transform_apply(void)
{
mutex_lock(&text_mutex);
- text_poke_finish();
+ smp_text_poke_batch_finish();
mutex_unlock(&text_mutex);
}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 9c9faa1634fb..102641fd2172 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -655,7 +655,7 @@ void kgdb_arch_late(void)
if (breakinfo[i].pev)
continue;
breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
- if (IS_ERR((void * __force)breakinfo[i].pev)) {
+ if (IS_ERR_PCPU(breakinfo[i].pev)) {
printk(KERN_ERR "kgdb: Could not allocate hw"
"breakpoints\nDisabling the kernel debugger\n");
breakinfo[i].pev = NULL;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 72e6a45e7ec2..47cb8eb138ba 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -373,16 +373,7 @@ out:
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
bool *on_func_entry)
{
- u32 insn;
-
- /*
- * Since 'addr' is not guaranteed to be safe to access, use
- * copy_from_kernel_nofault() to read the instruction:
- */
- if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32)))
- return NULL;
-
- if (is_endbr(insn)) {
+ if (is_endbr((u32 *)addr)) {
*on_func_entry = !offset || offset == 4;
if (*on_func_entry)
offset = 4;
@@ -817,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p)
u8 int3 = INT3_INSN_OPCODE;
text_poke(p->addr, &int3, 1);
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
@@ -827,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p)
perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
}
void arch_remove_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 36d6809c6c9e..0aabd4c4e2c4 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist)
insn_buff[0] = JMP32_INSN_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
- text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
+ smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
list_del_init(&op->list);
}
@@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op)
JMP32_INSN_SIZE - INT3_INSN_SIZE);
text_poke(addr, new, INT3_INSN_SIZE);
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
text_poke(addr + INT3_INSN_SIZE,
new + INT3_INSN_SIZE,
JMP32_INSN_SIZE - INT3_INSN_SIZE);
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7a422a6c5983..921c1c783bc1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -40,6 +40,7 @@
#include <asm/mtrr.h>
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
+#include <asm/msr.h>
#include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h>
@@ -301,7 +302,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
token = __this_cpu_read(apf_reason.token);
kvm_async_pf_task_wake(token);
__this_cpu_write(apf_reason.token, 0);
- wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
+ wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1);
}
set_irq_regs(old_regs);
@@ -327,7 +328,7 @@ static void kvm_register_steal_time(void)
if (!has_steal_clock)
return;
- wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
+ wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
(unsigned long long) slow_virt_to_phys(st));
}
@@ -361,9 +362,9 @@ static void kvm_guest_cpu_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
- wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
+ wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
- wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
+ wrmsrq(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(async_pf_enabled, true);
pr_debug("setup async PF for cpu %d\n", smp_processor_id());
}
@@ -376,7 +377,7 @@ static void kvm_guest_cpu_init(void)
__this_cpu_write(kvm_apic_eoi, 0);
pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
| KVM_MSR_ENABLED;
- wrmsrl(MSR_KVM_PV_EOI_EN, pa);
+ wrmsrq(MSR_KVM_PV_EOI_EN, pa);
}
if (has_steal_clock)
@@ -388,7 +389,7 @@ static void kvm_pv_disable_apf(void)
if (!__this_cpu_read(async_pf_enabled))
return;
- wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+ wrmsrq(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(async_pf_enabled, false);
pr_debug("disable async PF for cpu %d\n", smp_processor_id());
@@ -399,7 +400,7 @@ static void kvm_disable_steal_time(void)
if (!has_steal_clock)
return;
- wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+ wrmsrq(MSR_KVM_STEAL_TIME, 0);
}
static u64 kvm_steal_clock(int cpu)
@@ -451,9 +452,9 @@ static void kvm_guest_cpu_offline(bool shutdown)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
- wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ wrmsrq(MSR_KVM_PV_EOI_EN, 0);
if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
- wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0);
kvm_pv_disable_apf();
if (!shutdown)
apf_task_wake_all();
@@ -615,7 +616,7 @@ static int __init setup_efi_kvm_sev_migration(void)
}
pr_info("%s : live migration enabled in EFI\n", __func__);
- wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
return 1;
}
@@ -728,7 +729,7 @@ static int kvm_suspend(void)
#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
- rdmsrl(MSR_KVM_POLL_CONTROL, val);
+ rdmsrq(MSR_KVM_POLL_CONTROL, val);
has_guest_poll = !(val & 1);
#endif
return 0;
@@ -740,7 +741,7 @@ static void kvm_resume(void)
#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
- wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+ wrmsrq(MSR_KVM_POLL_CONTROL, 0);
#endif
}
@@ -838,7 +839,6 @@ static void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
pr_info("KVM setup pv remote TLB flush\n");
}
@@ -875,7 +875,7 @@ static noinline uint32_t __kvm_cpuid_base(void)
return 0; /* So we don't blow up on old processors */
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return hypervisor_cpuid_base(KVM_SIGNATURE, 0);
+ return cpuid_base_hypervisor(KVM_SIGNATURE, 0);
return 0;
}
@@ -976,7 +976,7 @@ static void __init kvm_init_platform(void)
* If not booted using EFI, enable Live migration support.
*/
if (!efi_enabled(EFI_BOOT))
- wrmsrl(MSR_KVM_MIGRATION_CONTROL,
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL,
KVM_MIGRATION_READY);
}
kvmclock_init();
@@ -1125,12 +1125,12 @@ out:
static void kvm_disable_host_haltpoll(void *i)
{
- wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+ wrmsrq(MSR_KVM_POLL_CONTROL, 0);
}
static void kvm_enable_host_haltpoll(void *i)
{
- wrmsrl(MSR_KVM_POLL_CONTROL, 1);
+ wrmsrq(MSR_KVM_POLL_CONTROL, 1);
}
void arch_haltpoll_enable(unsigned int cpu)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5b2c15214a6b..ca0a49eeac4a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -60,7 +60,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
*/
static void kvm_get_wallclock(struct timespec64 *now)
{
- wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
+ wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
preempt_disable();
pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
preempt_enable();
@@ -173,7 +173,7 @@ static void kvm_register_clock(char *txt)
return;
pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
- wrmsrl(msr_kvm_system_time, pa);
+ wrmsrq(msr_kvm_system_time, pa);
pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}
@@ -196,7 +196,7 @@ static void kvm_setup_secondary_clock(void)
void kvmclock_disable(void)
{
if (msr_kvm_system_time)
- native_write_msr(msr_kvm_system_time, 0, 0);
+ native_write_msr(msr_kvm_system_time, 0);
}
static void __init kvmclock_init_mem(void)
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 80265162aeff..1f325304c4a8 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -42,7 +42,7 @@ static void load_segments(void)
static void machine_kexec_free_page_tables(struct kimage *image)
{
- free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
+ free_pages((unsigned long)image->arch.pgd, pgd_allocation_order());
image->arch.pgd = NULL;
#ifdef CONFIG_X86_PAE
free_page((unsigned long)image->arch.pmd0);
@@ -59,7 +59,7 @@ static void machine_kexec_free_page_tables(struct kimage *image)
static int machine_kexec_alloc_page_tables(struct kimage *image)
{
image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- PGD_ALLOCATION_ORDER);
+ pgd_allocation_order());
#ifdef CONFIG_X86_PAE
image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index a68f5a0a9f37..949c9e4bfad2 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -76,6 +76,19 @@ map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
#endif
+static int map_mmio_serial(struct x86_mapping_info *info, pgd_t *level4p)
+{
+ unsigned long mstart, mend;
+
+ if (!kexec_debug_8250_mmio32)
+ return 0;
+
+ mstart = kexec_debug_8250_mmio32 & PAGE_MASK;
+ mend = (kexec_debug_8250_mmio32 + PAGE_SIZE + 23) & PAGE_MASK;
+ pr_info("Map PCI serial at %lx - %lx\n", mstart, mend);
+ return kernel_ident_mapping_init(info, level4p, mstart, mend);
+}
+
#ifdef CONFIG_KEXEC_FILE
const struct kexec_file_ops * const kexec_file_loaders[] = {
&kexec_bzImage64_ops,
@@ -285,6 +298,10 @@ static int init_pgtable(struct kimage *image, unsigned long control_page)
if (result)
return result;
+ result = map_mmio_serial(&info, image->arch.pgd);
+ if (result)
+ return result;
+
/*
* This must be last because the intermediate page table pages it
* allocates will not be control pages and may overlap the image.
@@ -304,6 +321,24 @@ static void load_segments(void)
);
}
+static void prepare_debug_idt(unsigned long control_page, unsigned long vec_ofs)
+{
+ gate_desc idtentry = { 0 };
+ int i;
+
+ idtentry.bits.p = 1;
+ idtentry.bits.type = GATE_TRAP;
+ idtentry.segment = __KERNEL_CS;
+ idtentry.offset_low = (control_page & 0xFFFF) + vec_ofs;
+ idtentry.offset_middle = (control_page >> 16) & 0xFFFF;
+ idtentry.offset_high = control_page >> 32;
+
+ for (i = 0; i < 16; i++) {
+ kexec_debug_idt[i] = idtentry;
+ idtentry.offset_low += KEXEC_DEBUG_EXC_HANDLER_SIZE;
+ }
+}
+
int machine_kexec_prepare(struct kimage *image)
{
void *control_page = page_address(image->control_code_page);
@@ -321,6 +356,9 @@ int machine_kexec_prepare(struct kimage *image)
if (image->type == KEXEC_TYPE_DEFAULT)
kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT;
+ prepare_debug_idt((unsigned long)__pa(control_page),
+ (unsigned long)kexec_debug_exc_vectors - reloc_start);
+
__memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
set_memory_rox((unsigned long)control_page, 1);
@@ -396,16 +434,10 @@ void __nocfi machine_kexec(struct kimage *image)
* with from a table in memory. At no other time is the
* descriptor table in memory accessed.
*
- * I take advantage of this here by force loading the
- * segments, before I zap the gdt with an invalid value.
+ * Take advantage of this here by force loading the segments,
+ * before the GDT is zapped with an invalid value.
*/
load_segments();
- /*
- * The gdt & idt are now invalid.
- * If you want to load them you must set up your own idt & gdt.
- */
- native_idt_invalidate();
- native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 1f54eedc3015..ef6104e7cc72 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -97,7 +97,7 @@ static void get_fam10h_pci_mmconf_base(void)
/* SYS_CFG */
address = MSR_AMD64_SYSCFG;
- rdmsrl(address, val);
+ rdmsrq(address, val);
/* TOP_MEM2 is not enabled? */
if (!(val & (1<<21))) {
@@ -105,7 +105,7 @@ static void get_fam10h_pci_mmconf_base(void)
} else {
/* TOP_MEM2 */
address = MSR_K8_TOP_MEM2;
- rdmsrl(address, val);
+ rdmsrq(address, val);
tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
}
@@ -177,7 +177,7 @@ void fam10h_check_enable_mmcfg(void)
return;
address = MSR_FAM10H_MMIO_CONF_BASE;
- rdmsrl(address, val);
+ rdmsrq(address, val);
/* try to make sure that AP's setting is identical to BSP setting */
if (val & FAM10H_MMIO_CONF_ENABLE) {
@@ -212,7 +212,7 @@ void fam10h_check_enable_mmcfg(void)
(FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT));
val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
FAM10H_MMIO_CONF_ENABLE;
- wrmsrl(address, val);
+ wrmsrq(address, val);
}
static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8984abd91c00..0ffbae902e2f 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -19,6 +19,7 @@
#include <linux/jump_label.h>
#include <linux/random.h>
#include <linux/memory.h>
+#include <linux/stackprotector.h>
#include <asm/text-patching.h>
#include <asm/page.h>
@@ -130,6 +131,20 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
goto overflow;
size = 4;
break;
+#if defined(CONFIG_STACKPROTECTOR) && \
+ defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
+ case R_X86_64_REX_GOTPCRELX: {
+ static unsigned long __percpu *const addr = &__stack_chk_guard;
+
+ if (sym->st_value != (u64)addr) {
+ pr_err("%s: Unsupported GOTPCREL relocation\n", me->name);
+ return -ENOEXEC;
+ }
+
+ val = (u64)&addr + rel[i].r_addend;
+ fallthrough;
+ }
+#endif
case R_X86_64_PC32:
case R_X86_64_PLT32:
val -= (u64)loc;
@@ -146,21 +161,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
}
if (apply) {
- void *wr_loc = module_writable_address(me, loc);
-
- if (memcmp(wr_loc, &zero, size)) {
+ if (memcmp(loc, &zero, size)) {
pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
}
- write(wr_loc, &val, size);
+ write(loc, &val, size);
} else {
if (memcmp(loc, &val, size)) {
pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
}
- /* FIXME: needs care for ROX module allocations */
write(loc, &zero, size);
}
}
@@ -194,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs,
write, apply);
if (!early) {
- text_poke_sync();
+ smp_text_poke_sync_each_cpu();
mutex_unlock(&text_mutex);
}
@@ -227,7 +239,7 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *alt = NULL,
+ const Elf_Shdr *s, *alt = NULL, *locks = NULL,
*orc = NULL, *orc_ip = NULL,
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
*calls = NULL, *cfi = NULL;
@@ -236,6 +248,8 @@ int module_finalize(const Elf_Ehdr *hdr,
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
+ if (!strcmp(".smp_locks", secstrings + s->sh_name))
+ locks = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
@@ -252,6 +266,8 @@ int module_finalize(const Elf_Ehdr *hdr,
ibt_endbr = s;
}
+ its_init_mod(me);
+
if (retpolines || cfi) {
void *rseg = NULL, *cseg = NULL;
unsigned int rsize = 0, csize = 0;
@@ -266,60 +282,36 @@ int module_finalize(const Elf_Ehdr *hdr,
csize = cfi->sh_size;
}
- apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me);
+ apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
}
if (retpolines) {
void *rseg = (void *)retpolines->sh_addr;
- apply_retpolines(rseg, rseg + retpolines->sh_size, me);
+ apply_retpolines(rseg, rseg + retpolines->sh_size);
}
+
+ its_fini_mod(me);
+
if (returns) {
void *rseg = (void *)returns->sh_addr;
- apply_returns(rseg, rseg + returns->sh_size, me);
- }
- if (alt) {
- /* patch .altinstructions */
- void *aseg = (void *)alt->sh_addr;
- apply_alternatives(aseg, aseg + alt->sh_size, me);
+ apply_returns(rseg, rseg + returns->sh_size);
}
- if (calls || alt) {
+ if (calls) {
struct callthunk_sites cs = {};
- if (calls) {
- cs.call_start = (void *)calls->sh_addr;
- cs.call_end = (void *)calls->sh_addr + calls->sh_size;
- }
-
- if (alt) {
- cs.alt_start = (void *)alt->sh_addr;
- cs.alt_end = (void *)alt->sh_addr + alt->sh_size;
- }
+ cs.call_start = (void *)calls->sh_addr;
+ cs.call_end = (void *)calls->sh_addr + calls->sh_size;
callthunks_patch_module_calls(&cs, me);
}
+ if (alt) {
+ /* patch .altinstructions */
+ void *aseg = (void *)alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
if (ibt_endbr) {
void *iseg = (void *)ibt_endbr->sh_addr;
- apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me);
+ apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size);
}
-
- if (orc && orc_ip)
- unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
- (void *)orc->sh_addr, orc->sh_size);
-
- return 0;
-}
-
-int module_post_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- const Elf_Shdr *s, *locks = NULL;
- char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks = s;
- }
-
if (locks) {
void *lseg = (void *)locks->sh_addr;
void *text = me->mem[MOD_TEXT].base;
@@ -329,10 +321,15 @@ int module_post_finalize(const Elf_Ehdr *hdr,
text, text_end);
}
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
+
return 0;
}
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
+ its_free_mod(mod);
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ed163c8c8604..be93ec7255bf 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -40,32 +40,29 @@
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
+/*
+ * An emergency handler can be set in any context including NMI
+ */
struct nmi_desc {
raw_spinlock_t lock;
+ nmi_handler_t emerg_handler;
struct list_head head;
};
-static struct nmi_desc nmi_desc[NMI_MAX] =
-{
- {
- .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
- .head = LIST_HEAD_INIT(nmi_desc[0].head),
- },
- {
- .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
- .head = LIST_HEAD_INIT(nmi_desc[1].head),
- },
- {
- .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
- .head = LIST_HEAD_INIT(nmi_desc[2].head),
- },
- {
- .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
- .head = LIST_HEAD_INIT(nmi_desc[3].head),
- },
+#define NMI_DESC_INIT(type) { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[type].lock), \
+ .head = LIST_HEAD_INIT(nmi_desc[type].head), \
+}
+static struct nmi_desc nmi_desc[NMI_MAX] = {
+ NMI_DESC_INIT(NMI_LOCAL),
+ NMI_DESC_INIT(NMI_UNKNOWN),
+ NMI_DESC_INIT(NMI_SERR),
+ NMI_DESC_INIT(NMI_IO_CHECK),
};
+#define nmi_to_desc(type) (&nmi_desc[type])
+
struct nmi_stats {
unsigned int normal;
unsigned int unknown;
@@ -87,6 +84,9 @@ static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
static int ignore_nmis __read_mostly;
int unknown_nmi_panic;
+int panic_on_unrecovered_nmi;
+int panic_on_io_nmi;
+
/*
* Prevent NMI reason port (0x61) being accessed simultaneously, can
* only be used in NMI handler.
@@ -100,8 +100,6 @@ static int __init setup_unknown_nmi_panic(char *str)
}
__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-#define nmi_to_desc(type) (&nmi_desc[type])
-
static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
static int __init nmi_warning_debugfs(void)
@@ -121,20 +119,33 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration)
action->max_duration = duration;
- remainder_ns = do_div(duration, (1000 * 1000));
- decimal_msecs = remainder_ns / 1000;
+ /* Convert duration from nsec to msec */
+ remainder_ns = do_div(duration, NSEC_PER_MSEC);
+ decimal_msecs = remainder_ns / NSEC_PER_USEC;
- printk_ratelimited(KERN_INFO
- "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
- action->handler, duration, decimal_msecs);
+ pr_info_ratelimited("INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+ action->handler, duration, decimal_msecs);
}
static int nmi_handle(unsigned int type, struct pt_regs *regs)
{
struct nmi_desc *desc = nmi_to_desc(type);
+ nmi_handler_t ehandler;
struct nmiaction *a;
int handled=0;
+ /*
+ * Call the emergency handler, if set
+ *
+ * In the case of crash_nmi_callback() emergency handler, it will
+ * return in the case of the crashing CPU to enable it to complete
+ * other necessary crashing actions ASAP. Other handlers in the
+ * linked list won't need to be run.
+ */
+ ehandler = desc->emerg_handler;
+ if (ehandler)
+ return ehandler(type, regs);
+
rcu_read_lock();
/*
@@ -224,6 +235,31 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+/**
+ * set_emergency_nmi_handler - Set emergency handler
+ * @type: NMI type
+ * @handler: the emergency handler to be stored
+ *
+ * Set an emergency NMI handler which, if set, will preempt all the other
+ * handlers in the linked list. If a NULL handler is passed in, it will clear
+ * it. It is expected that concurrent calls to this function will not happen
+ * or the system is screwed beyond repair.
+ */
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+
+ if (WARN_ON_ONCE(desc->emerg_handler == handler))
+ return;
+ desc->emerg_handler = handler;
+
+ /*
+ * Ensure the emergency handler is visible to other CPUs before
+ * function return
+ */
+ smp_wmb();
+}
+
static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
@@ -291,10 +327,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
int handled;
/*
- * Use 'false' as back-to-back NMIs are dealt with one level up.
- * Of course this makes having multiple 'unknown' handlers useless
- * as only the first one is ever run (unless it can actually determine
- * if it caused the NMI)
+ * As a last resort, let the "unknown" handlers make a
+ * best-effort attempt to figure out if they can claim
+ * responsibility for this Unknown NMI.
*/
handled = nmi_handle(NMI_UNKNOWN, regs);
if (handled) {
@@ -324,17 +359,18 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
bool b2b = false;
/*
- * CPU-specific NMI must be processed before non-CPU-specific
- * NMI, otherwise we may lose it, because the CPU-specific
- * NMI can not be detected/processed on other CPUs.
- */
-
- /*
- * Back-to-back NMIs are interesting because they can either
- * be two NMI or more than two NMIs (any thing over two is dropped
- * due to NMI being edge-triggered). If this is the second half
- * of the back-to-back NMI, assume we dropped things and process
- * more handlers. Otherwise reset the 'swallow' NMI behaviour
+ * Back-to-back NMIs are detected by comparing the RIP of the
+ * current NMI with that of the previous NMI. If it is the same,
+ * it is assumed that the CPU did not have a chance to jump back
+ * into a non-NMI context and execute code in between the two
+ * NMIs.
+ *
+ * They are interesting because even if there are more than two,
+ * only a maximum of two can be detected (anything over two is
+ * dropped due to NMI being edge-triggered). If this is the
+ * second half of the back-to-back NMI, assume we dropped things
+ * and process more handlers. Otherwise, reset the 'swallow' NMI
+ * behavior.
*/
if (regs->ip == __this_cpu_read(last_nmi_rip))
b2b = true;
@@ -348,6 +384,11 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
if (microcode_nmi_handler_enabled() && microcode_nmi_handler())
goto out;
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI can not be detected/processed on other CPUs.
+ */
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
@@ -384,13 +425,14 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
pci_serr_error(reason, regs);
else if (reason & NMI_REASON_IOCHK)
io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
+
/*
* Reassert NMI in case it became active
* meanwhile as it's edge-triggered:
*/
- reassert_nmi();
-#endif
+ if (IS_ENABLED(CONFIG_X86_32))
+ reassert_nmi();
+
__this_cpu_add(nmi_stats.external, 1);
raw_spin_unlock(&nmi_reason_lock);
goto out;
@@ -709,4 +751,3 @@ void local_touch_nmi(void)
{
__this_cpu_write(last_nmi_rip, 0);
}
-EXPORT_SYMBOL_GPL(local_touch_nmi);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index e93a8545c74d..a010e9d062bf 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * arch/x86/kernel/nmi-selftest.c
- *
* Testsuite for NMI: IPIs
*
* Started by Don Zickus:
@@ -30,7 +28,6 @@ static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
static int __initdata testcase_total;
static int __initdata testcase_successes;
-static int __initdata expected_testcase_failures;
static int __initdata unexpected_testcase_failures;
static int __initdata unexpected_testcase_unknowns;
@@ -120,26 +117,22 @@ static void __init dotest(void (*testcase_fn)(void), int expected)
unexpected_testcase_failures++;
if (nmi_fail == FAILURE)
- printk(KERN_CONT "FAILED |");
+ pr_cont("FAILED |");
else if (nmi_fail == TIMEOUT)
- printk(KERN_CONT "TIMEOUT|");
+ pr_cont("TIMEOUT|");
else
- printk(KERN_CONT "ERROR |");
+ pr_cont("ERROR |");
dump_stack();
} else {
testcase_successes++;
- printk(KERN_CONT " ok |");
+ pr_cont(" ok |");
}
- testcase_total++;
+ pr_cont("\n");
+ testcase_total++;
reset_nmi();
}
-static inline void __init print_testname(const char *testname)
-{
- printk("%12s:", testname);
-}
-
void __init nmi_selftest(void)
{
init_nmi_testsuite();
@@ -147,38 +140,25 @@ void __init nmi_selftest(void)
/*
* Run the testsuite:
*/
- printk("----------------\n");
- printk("| NMI testsuite:\n");
- printk("--------------------\n");
+ pr_info("----------------\n");
+ pr_info("| NMI testsuite:\n");
+ pr_info("--------------------\n");
- print_testname("remote IPI");
+ pr_info("%12s:", "remote IPI");
dotest(remote_ipi, SUCCESS);
- printk(KERN_CONT "\n");
- print_testname("local IPI");
+
+ pr_info("%12s:", "local IPI");
dotest(local_ipi, SUCCESS);
- printk(KERN_CONT "\n");
cleanup_nmi_testsuite();
+ pr_info("--------------------\n");
if (unexpected_testcase_failures) {
- printk("--------------------\n");
- printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+ pr_info("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
unexpected_testcase_failures, testcase_total);
- printk("-----------------------------------------------------------------\n");
- } else if (expected_testcase_failures && testcase_successes) {
- printk("--------------------\n");
- printk("%3d out of %3d testcases failed, as expected. |\n",
- expected_testcase_failures, testcase_total);
- printk("----------------------------------------------------\n");
- } else if (expected_testcase_failures && !testcase_successes) {
- printk("--------------------\n");
- printk("All %3d testcases failed, as expected. |\n",
- expected_testcase_failures);
- printk("----------------------------------------\n");
} else {
- printk("--------------------\n");
- printk("Good, all %3d testcases passed! |\n",
+ pr_info("Good, all %3d testcases passed! |\n",
testcase_successes);
- printk("---------------------------------\n");
}
+ pr_info("-----------------------------------------------------------------\n");
}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1ccaa3397a67..ab3e172dcc69 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -33,6 +33,7 @@
#include <asm/tlb.h>
#include <asm/io_bitmap.h>
#include <asm/gsseg.h>
+#include <asm/msr.h>
/* stub always returning 0. */
DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);
@@ -59,21 +60,6 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
-#ifndef CONFIG_PT_RECLAIM
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -90,24 +76,9 @@ void paravirt_set_sched_clock(u64 (*func)(void))
static_call_update(pv_sched_clock, func);
}
-/* These are in entry.S */
-static struct resource reserve_ioports = {
- .start = 0,
- .end = IO_SPACE_LIMIT,
- .name = "paravirt-ioport",
- .flags = IORESOURCE_IO | IORESOURCE_BUSY,
-};
-
-/*
- * Reserve the whole legacy IO space to prevent any legacy drivers
- * from wasting time probing for their hardware. This is a fairly
- * brute-force approach to disabling all non-virtual drivers.
- *
- * Note that this must be called very early to have any effect.
- */
-int paravirt_disable_iospace(void)
+static noinstr void pv_native_safe_halt(void)
{
- return request_resource(&ioport_resource, &reserve_ioports);
+ native_safe_halt();
}
#ifdef CONFIG_PARAVIRT_XXL
@@ -116,6 +87,16 @@ static noinstr void pv_native_write_cr2(unsigned long val)
native_write_cr2(val);
}
+static noinstr unsigned long pv_native_read_cr3(void)
+{
+ return __native_read_cr3();
+}
+
+static noinstr void pv_native_write_cr3(unsigned long cr3)
+{
+ native_write_cr3(cr3);
+}
+
static noinstr unsigned long pv_native_get_debugreg(int regno)
{
return native_get_debugreg(regno);
@@ -125,11 +106,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
{
native_set_debugreg(regno, val);
}
-
-static noinstr void pv_native_safe_halt(void)
-{
- native_safe_halt();
-}
#endif
struct pv_info pv_info = {
@@ -186,16 +162,17 @@ struct paravirt_patch_template pv_ops = {
.irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
+#endif /* CONFIG_PARAVIRT_XXL */
+
+ /* Irq HLT ops. */
.irq.safe_halt = pv_native_safe_halt,
.irq.halt = native_halt,
-#endif /* CONFIG_PARAVIRT_XXL */
/* Mmu ops. */
.mmu.flush_tlb_user = native_flush_tlb_local,
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = native_tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
@@ -203,8 +180,8 @@ struct paravirt_patch_template pv_ops = {
#ifdef CONFIG_PARAVIRT_XXL
.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
.mmu.write_cr2 = pv_native_write_cr2,
- .mmu.read_cr3 = __native_read_cr3,
- .mmu.write_cr3 = native_write_cr3,
+ .mmu.read_cr3 = pv_native_read_cr3,
+ .mmu.write_cr3 = pv_native_write_cr3,
.mmu.pgd_alloc = __paravirt_pgd_alloc,
.mmu.pgd_free = paravirt_nop,
@@ -234,12 +211,10 @@ struct paravirt_patch_template pv_ops = {
.mmu.set_p4d = native_set_p4d,
-#if CONFIG_PGTABLE_LEVELS >= 5
.mmu.p4d_val = PTE_IDENT,
.mmu.make_p4d = PTE_IDENT,
.mmu.set_pgd = native_set_pgd,
-#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
.mmu.pte_val = PTE_IDENT,
.mmu.pgd_val = PTE_IDENT,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6da6769d7254..c1d2dac72b9c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,7 +30,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/entry-common.h>
#include <asm/cpu.h>
-#include <asm/cpuid.h>
+#include <asm/cpuid/api.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
@@ -52,6 +52,7 @@
#include <asm/unwind.h>
#include <asm/tdx.h>
#include <asm/mmu_context.h>
+#include <asm/msr.h>
#include <asm/shstk.h>
#include "process.h"
@@ -93,12 +94,12 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- memcpy(dst, src, arch_task_struct_size);
+ /* fpu_clone() will initialize the "dst_fpu" memory */
+ memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(*dst), 0);
+
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
- /* Drop the copied pointer to current's fpstate */
- dst->thread.fpu.fpstate = NULL;
return 0;
}
@@ -106,8 +107,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
#ifdef CONFIG_X86_64
void arch_release_task_struct(struct task_struct *tsk)
{
- if (fpu_state_size_dynamic())
- fpstate_free(&tsk->thread.fpu);
+ if (fpu_state_size_dynamic() && !(tsk->flags & (PF_KTHREAD | PF_USER_WORKER)))
+ fpstate_free(x86_task_fpu(tsk));
}
#endif
@@ -117,7 +118,6 @@ void arch_release_task_struct(struct task_struct *tsk)
void exit_thread(struct task_struct *tsk)
{
struct thread_struct *t = &tsk->thread;
- struct fpu *fpu = &t->fpu;
if (test_thread_flag(TIF_IO_BITMAP))
io_bitmap_exit(tsk);
@@ -125,7 +125,7 @@ void exit_thread(struct task_struct *tsk)
free_vm86(t);
shstk_free(tsk);
- fpu__drop(fpu);
+ fpu__drop(tsk);
}
static int set_new_tls(struct task_struct *p, unsigned long tls)
@@ -339,7 +339,7 @@ static void set_cpuid_faulting(bool on)
msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
this_cpu_write(msr_misc_features_shadow, msrval);
- wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
}
static void disable_cpuid(void)
@@ -556,7 +556,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
if (!static_cpu_has(X86_FEATURE_ZEN)) {
msr |= ssbd_tif_to_amd_ls_cfg(tifn);
- wrmsrl(MSR_AMD64_LS_CFG, msr);
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
return;
}
@@ -573,7 +573,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
raw_spin_lock(&st->shared_state->lock);
/* First sibling enables SSBD: */
if (!st->shared_state->disable_state)
- wrmsrl(MSR_AMD64_LS_CFG, msr);
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
st->shared_state->disable_state++;
raw_spin_unlock(&st->shared_state->lock);
} else {
@@ -583,7 +583,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
raw_spin_lock(&st->shared_state->lock);
st->shared_state->disable_state--;
if (!st->shared_state->disable_state)
- wrmsrl(MSR_AMD64_LS_CFG, msr);
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
raw_spin_unlock(&st->shared_state->lock);
}
}
@@ -592,7 +592,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
- wrmsrl(MSR_AMD64_LS_CFG, msr);
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
}
#endif
@@ -602,7 +602,7 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
* SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
* so ssbd_tif_to_spec_ctrl() just works.
*/
- wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
+ wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}
/*
@@ -705,11 +705,11 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
arch_has_block_step()) {
unsigned long debugctl, msk;
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
debugctl &= ~DEBUGCTLMSR_BTF;
msk = tifn & _TIF_BLOCKSTEP;
debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
}
if ((tifp ^ tifn) & _TIF_NOTSC)
@@ -902,13 +902,10 @@ static __init bool prefer_mwait_c1_over_halt(void)
static __cpuidle void mwait_idle(void)
{
if (!current_set_polling_and_test()) {
- if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
- mb(); /* quirk */
- clflush((void *)&current_thread_info()->flags);
- mb(); /* quirk */
- }
+ const void *addr = &current_thread_info()->flags;
- __monitor((void *)&current_thread_info()->flags, 0, 0);
+ alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr));
+ __monitor(addr, 0, 0);
if (!need_resched()) {
__sti_mwait(0, 0);
raw_local_irq_disable();
@@ -934,7 +931,7 @@ void __init select_idle_routine(void)
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- static_call_update(x86_idle, tdx_safe_halt);
+ static_call_update(x86_idle, tdx_halt);
} else {
static_call_update(x86_idle, default_idle);
}
@@ -1043,7 +1040,7 @@ unsigned long __get_wchan(struct task_struct *p)
return addr;
}
-long do_arch_prctl_common(int option, unsigned long arg2)
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
switch (option) {
case ARCH_GET_CPUID:
@@ -1058,5 +1055,13 @@ long do_arch_prctl_common(int option, unsigned long arg2)
return fpu_xstate_prctl(option, arg2);
}
+ if (!in_ia32_syscall())
+ return do_arch_prctl_64(current, option, arg2);
+
return -EINVAL;
}
+
+SYSCALL_DEFINE0(ni_syscall)
+{
+ return -ENOSYS;
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0917c7f25720..a10e180cbf23 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -160,8 +160,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_p, cpu);
+ switch_fpu(prev_p, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -190,13 +189,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
arch_end_context_switch(next_p);
/*
- * Reload esp0 and pcpu_hot.top_of_stack. This changes
+ * Reload esp0 and cpu_current_top_of_stack. This changes
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
update_task_stack(next_p);
refresh_sysenter_cs(next);
- this_cpu_write(pcpu_hot.top_of_stack,
+ this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
@@ -206,17 +205,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
loadsegment(gs, next->gs);
- raw_cpu_write(pcpu_hot.current_task, next_p);
-
- switch_fpu_finish(next_p);
+ raw_cpu_write(current_task, next_p);
/* Load the Intel cache allocation PQR MSR. */
- resctrl_sched_in(next_p);
+ resctrl_arch_sched_in(next_p);
return prev_p;
}
-
-SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
-{
- return do_arch_prctl_common(option, arg2);
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 226472332a70..8d6cf25127aa 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -57,6 +57,7 @@
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#include <asm/fred.h>
+#include <asm/msr.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
@@ -95,8 +96,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
return;
if (mode == SHOW_REGS_USER) {
- rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ rdmsrq(MSR_FS_BASE, fs);
+ rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
printk("%sFS: %016lx GS: %016lx\n",
log_lvl, fs, shadowgs);
return;
@@ -107,9 +108,9 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
- rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_GS_BASE, gs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ rdmsrq(MSR_FS_BASE, fs);
+ rdmsrq(MSR_GS_BASE, gs);
+ rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
cr0 = read_cr0();
cr2 = read_cr2();
@@ -195,7 +196,7 @@ static noinstr unsigned long __rdgsbase_inactive(void)
native_swapgs();
} else {
instrumentation_begin();
- rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
instrumentation_end();
}
@@ -221,7 +222,7 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase)
native_swapgs();
} else {
instrumentation_begin();
- wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
instrumentation_end();
}
}
@@ -353,7 +354,7 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
} else {
if (prev_index != next_index)
loadseg(which, next_index);
- wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
next_base);
}
} else {
@@ -463,7 +464,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void)
gsbase = __rdgsbase_inactive();
local_irq_restore(flags);
} else {
- rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
}
return gsbase;
@@ -478,7 +479,7 @@ void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
__wrgsbase_inactive(gsbase);
local_irq_restore(flags);
} else {
- wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
}
}
@@ -614,10 +615,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
- this_cpu_read(pcpu_hot.hardirq_stack_inuse));
+ this_cpu_read(hardirq_stack_inuse));
- if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_p, cpu);
+ switch_fpu(prev_p, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -668,10 +668,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- raw_cpu_write(pcpu_hot.current_task, next_p);
- raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));
-
- switch_fpu_finish(next_p);
+ raw_cpu_write(current_task, next_p);
+ raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */
update_task_stack(next_p);
@@ -707,7 +705,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
}
/* Load the Intel cache allocation PQR MSR. */
- resctrl_sched_in(next_p);
+ resctrl_arch_sched_in(next_p);
return prev_p;
}
@@ -942,7 +940,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
case ARCH_MAP_VDSO_X32:
return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
-# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+# ifdef CONFIG_IA32_EMULATION
case ARCH_MAP_VDSO_32:
return prctl_map_vdso(&vdso_image_32, arg2);
# endif
@@ -979,26 +977,3 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
return ret;
}
-
-SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
-{
- long ret;
-
- ret = do_arch_prctl_64(current, option, arg2);
- if (ret == -EINVAL)
- ret = do_arch_prctl_common(option, arg2);
-
- return ret;
-}
-
-#ifdef CONFIG_IA32_EMULATION
-COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
-{
- return do_arch_prctl_common(option, arg2);
-}
-#endif
-
-unsigned long KSTK_ESP(struct task_struct *task)
-{
- return task_pt_regs(task)->sp;
-}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6d0df6a58873..a92f18db9610 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -10,6 +10,8 @@
#include <asm/setup.h>
#include <asm/mce.h>
+#include <linux/platform_data/x86/apple.h>
+
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
static void quirk_intel_irqbalance(struct pci_dev *dev)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index dc1dd3f3e67f..964f6b0a3d68 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -921,20 +921,16 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
return;
/* Make a note of crashing cpu. Will be used in NMI callback. */
- crashing_cpu = safe_smp_processor_id();
+ crashing_cpu = smp_processor_id();
shootdown_callback = callback;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
- NMI_FLAG_FIRST, "crash"))
- return; /* Return what? */
+
/*
- * Ensure the new callback function is set before sending
- * out the NMI
+ * Set emergency handler to preempt other handlers.
*/
- wmb();
+ set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback);
apic_send_IPI_allbutself(NMI_VECTOR);
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index b7c0f142d026..4679ac0a03eb 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -27,7 +27,7 @@ static void cs5530a_warm_reset(struct pci_dev *dev)
static void cs5536_warm_reset(struct pci_dev *dev)
{
/* writing 1 to the LSB of this MSR causes a hard reset */
- wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL);
+ wrmsrq(MSR_DIVIL_SOFT_RESET, 1ULL);
udelay(50); /* shouldn't get here but be safe and spin a while */
}
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c7c4b1917336..57276f134d12 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -263,17 +263,17 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movl %edx, %edi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
movl %ebp, %edi
movl %eax, %esi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
movl %eax, %edi
movl %edx, %esi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
lea PAGE_SIZE(%ebp), %esi
jmp 0b
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index b44d8863e57f..ea604f4d0b52 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -39,6 +39,23 @@ SYM_DATA(kexec_va_control_page, .quad 0)
SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
+SYM_DATA(kexec_debug_8250_port, .word 0)
+
+ .balign 16
+SYM_DATA_START_LOCAL(kexec_debug_gdt)
+ .word kexec_debug_gdt_end - kexec_debug_gdt - 1
+ .long 0
+ .word 0
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
+
+ .balign 8
+SYM_DATA_START(kexec_debug_idt)
+ .skip 0x100, 0x00
+SYM_DATA_END(kexec_debug_idt)
.section .text..relocate_kernel,"ax";
.code64
@@ -62,8 +79,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
pushq %r15
pushf
- /* zero out flags, and disable interrupts */
- pushq $0
+ /* Invalidate GDT/IDT, zero out flags */
+ pushq $0
+ pushq $0
+
+ lidt (%rsp)
+ lgdt (%rsp)
+ addq $8, %rsp
popfq
/* Switch to the identity mapped page tables */
@@ -116,6 +138,28 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/* store the start address on the stack */
pushq %rdx
+ /* Create a GDTR (16 bits limit, 64 bits addr) on stack */
+ leaq kexec_debug_gdt(%rip), %rax
+ pushq %rax
+ pushw (%rax)
+
+ /* Load the GDT, put the stack back */
+ lgdt (%rsp)
+ addq $10, %rsp
+
+ /* Test that we can load segments */
+ movq %ds, %rax
+ movq %rax, %ds
+
+ /* Now an IDTR on the stack to load the IDT the kernel created */
+ leaq kexec_debug_idt(%rip), %rsi
+ pushq %rsi
+ pushw $0xff
+ lidt (%rsp)
+ addq $10, %rsp
+
+ //int3
+
/*
* Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
* below.
@@ -319,20 +363,20 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
/* copy source page to swap page */
movq kexec_pa_swap_page(%rip), %rdi
movl $512, %ecx
- rep ; movsq
+ rep movsq
/* copy destination page to source page */
movq %rax, %rdi
movq %rdx, %rsi
movl $512, %ecx
- rep ; movsq
+ rep movsq
/* copy swap page to destination page */
movq %rdx, %rdi
movq kexec_pa_swap_page(%rip), %rsi
.Lnoswap:
movl $512, %ecx
- rep ; movsq
+ rep movsq
lea PAGE_SIZE(%rax), %rsi
jmp .Lloop
@@ -341,3 +385,222 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
ret
int3
SYM_CODE_END(swap_pages)
+
+/*
+ * Generic 'print character' routine
+ * - %al: Character to be printed (may clobber %rax)
+ * - %rdx: MMIO address or port.
+ */
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ addw $LSR, %dx
+ xchg %al, %ah
+.Lxmtrdy_loop:
+ inb %dx, %al
+ testb $XMTRDY, %al
+ jnz .Lready
+ pause
+ jmp .Lxmtrdy_loop
+
+.Lready:
+ subw $LSR, %dx
+ xchg %al, %ah
+ outb %al, %dx
+pr_char_null:
+ ANNOTATE_NOENDBR
+
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+.Lxmtrdy_loop_mmio:
+ movb (LSR*4)(%rdx), %ah
+ testb $XMTRDY, %ah
+ jnz .Lready_mmio
+ pause
+ jmp .Lxmtrdy_loop_mmio
+
+.Lready_mmio:
+ movb %al, (%rdx)
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250_mmio32)
+
+/*
+ * Load pr_char function pointer into %rsi and load %rdx with whatever
+ * that function wants to see there (typically port/MMIO address).
+ */
+.macro pr_setup
+ leaq pr_char_8250(%rip), %rsi
+ movw kexec_debug_8250_port(%rip), %dx
+ testw %dx, %dx
+ jnz 1f
+
+ leaq pr_char_8250_mmio32(%rip), %rsi
+ movq kexec_debug_8250_mmio32(%rip), %rdx
+ testq %rdx, %rdx
+ jnz 1f
+
+ leaq pr_char_null(%rip), %rsi
+1:
+.endm
+
+/* Print the nybble in %bl, clobber %rax */
+SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
+ UNWIND_HINT_FUNC
+ movb %bl, %al
+ nop
+ andb $0x0f, %al
+ addb $0x30, %al
+ cmpb $0x3a, %al
+ jb 1f
+ addb $('a' - '0' - 10), %al
+ ANNOTATE_RETPOLINE_SAFE
+1: jmp *%rsi
+SYM_CODE_END(pr_nybble)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
+ UNWIND_HINT_FUNC
+ movq $16, %rcx
+1: rolq $4, %rbx
+ call pr_nybble
+ loop 1b
+ movb $'\n', %al
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
+SYM_CODE_END(pr_qword)
+
+.macro print_reg a, b, c, d, r
+ movb $\a, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\b, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\c, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\d, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movq \r, %rbx
+ call pr_qword
+.endm
+
+SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
+ /* Each of these is 6 bytes. */
+.macro vec_err exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ nop
+ nop
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+.macro vec_noerr exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ pushq $0
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+ ANNOTATE_NOENDBR
+ vec_noerr 0 // #DE
+ vec_noerr 1 // #DB
+ vec_noerr 2 // #NMI
+ vec_noerr 3 // #BP
+ vec_noerr 4 // #OF
+ vec_noerr 5 // #BR
+ vec_noerr 6 // #UD
+ vec_noerr 7 // #NM
+ vec_err 8 // #DF
+ vec_noerr 9
+ vec_err 10 // #TS
+ vec_err 11 // #NP
+ vec_err 12 // #SS
+ vec_err 13 // #GP
+ vec_err 14 // #PF
+ vec_noerr 15
+SYM_CODE_END(kexec_debug_exc_vectors)
+
+SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
+ /* No need for RET mitigations during kexec */
+ VALIDATE_UNRET_END
+
+ pushq %rax
+ pushq %rbx
+ pushq %rcx
+ pushq %rdx
+ pushq %rsi
+
+ /* Stack frame */
+#define EXC_SS 0x58 /* Architectural... */
+#define EXC_RSP 0x50
+#define EXC_EFLAGS 0x48
+#define EXC_CS 0x40
+#define EXC_RIP 0x38
+#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */
+#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */
+#define EXC_RAX 0x20 /* Pushed just above in exc_handler */
+#define EXC_RBX 0x18
+#define EXC_RCX 0x10
+#define EXC_RDX 0x08
+#define EXC_RSI 0x00
+
+ /* Set up %rdx/%rsi for debug output */
+ pr_setup
+
+ /* rip and exception info */
+ print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
+ print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
+ print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
+ print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)
+
+ /* We spilled these to the stack */
+ print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
+ print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
+ print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
+ print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
+ print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)
+
+ /* Other registers untouched */
+ print_reg 'r', 'd', 'i', ':', %rdi
+ print_reg 'r', '8', ' ', ':', %r8
+ print_reg 'r', '9', ' ', ':', %r9
+ print_reg 'r', '1', '0', ':', %r10
+ print_reg 'r', '1', '1', ':', %r11
+ print_reg 'r', '1', '2', ':', %r12
+ print_reg 'r', '1', '3', ':', %r13
+ print_reg 'r', '1', '4', ':', %r14
+ print_reg 'r', '1', '5', ':', %r15
+ print_reg 'c', 'r', '2', ':', %cr2
+
+ /* Only return from INT3 */
+ cmpq $3, EXC_EXCEPTION(%rsp)
+ jne .Ldie
+
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ popq %rax
+
+ addq $16, %rsp
+ iretq
+
+.Ldie:
+ hlt
+ jmp .Ldie
+
+SYM_CODE_END(exc_handler)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cebee310e200..7d9ed79a93c0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -11,6 +11,7 @@
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
+#include <linux/hugetlb.h>
#include <linux/ima.h>
#include <linux/init_ohci1394_dma.h>
#include <linux/initrd.h>
@@ -18,21 +19,19 @@
#include <linux/memblock.h>
#include <linux/panic_notifier.h>
#include <linux/pci.h>
+#include <linux/random.h>
#include <linux/root_dev.h>
-#include <linux/hugetlb.h>
-#include <linux/tboot.h>
-#include <linux/usb/xhci-dbgp.h>
#include <linux/static_call.h>
#include <linux/swiotlb.h>
-#include <linux/random.h>
+#include <linux/tboot.h>
+#include <linux/usb/xhci-dbgp.h>
+#include <linux/vmalloc.h>
#include <uapi/linux/mount.h>
#include <xen/xen.h>
#include <asm/apic.h>
-#include <asm/efi.h>
-#include <asm/numa.h>
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
#include <asm/cacheinfo.h>
@@ -47,15 +46,16 @@
#include <asm/mce.h>
#include <asm/memtype.h>
#include <asm/mtrr.h>
-#include <asm/realmode.h>
+#include <asm/nmi.h>
+#include <asm/numa.h>
#include <asm/olpc_ofw.h>
#include <asm/pci-direct.h>
#include <asm/prom.h>
#include <asm/proto.h>
+#include <asm/realmode.h>
#include <asm/thermal.h>
#include <asm/unwind.h>
#include <asm/vsyscall.h>
-#include <linux/vmalloc.h>
/*
* max_low_pfn_mapped: highest directly mapped pfn < 4 GB
@@ -131,6 +131,7 @@ struct ist_info ist_info;
struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
+SYM_PIC_ALIAS(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
__visible unsigned long mmu_cr4_features __ro_after_init;
@@ -146,6 +147,67 @@ static size_t ima_kexec_buffer_size;
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
int bootloader_type, bootloader_version;
+static const struct ctl_table x86_sysctl_table[] = {
+ {
+ .procname = "unknown_nmi_panic",
+ .data = &unknown_nmi_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_type",
+ .data = &bootloader_type,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_version",
+ .data = &bootloader_version,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "io_delay_type",
+ .data = &io_delay_type,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#if defined(CONFIG_ACPI_SLEEP)
+ {
+ .procname = "acpi_video_flags",
+ .data = &acpi_realmode_flags,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+#endif
+};
+
+static int __init init_x86_sysctl(void)
+{
+ register_sysctl_init("kernel", x86_sysctl_table);
+ return 0;
+}
+arch_initcall(init_x86_sysctl);
+
/*
* Setup options
*/
@@ -429,6 +491,46 @@ static void __init parse_setup_data(void)
}
}
+/*
+ * Translate the fields of 'struct boot_param' into global variables
+ * representing these parameters.
+ */
+static void __init parse_boot_params(void)
+{
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI32_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI64_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+ }
+#endif
+
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+}
+
static void __init memblock_x86_reserve_range_setup_data(void)
{
struct setup_indirect *indirect;
@@ -472,14 +574,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)
static void __init arch_reserve_crashkernel(void)
{
unsigned long long crash_base, crash_size, low_size = 0;
- char *cmdline = boot_command_line;
bool high = false;
int ret;
if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
return;
- ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
&low_size, &high);
if (ret)
@@ -490,8 +591,7 @@ static void __init arch_reserve_crashkernel(void)
return;
}
- reserve_crashkernel_generic(cmdline, crash_size, crash_base,
- low_size, high);
+ reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
}
static struct resource standard_io_resources[] = {
@@ -527,6 +627,23 @@ void __init reserve_standard_io_resources(void)
}
+static void __init setup_kernel_resources(void)
+{
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ rodata_resource.start = __pa_symbol(__start_rodata);
+ rodata_resource.end = __pa_symbol(__end_rodata)-1;
+ data_resource.start = __pa_symbol(_sdata);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
+
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &rodata_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+}
+
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
@@ -789,35 +906,7 @@ void __init setup_arch(char **cmdline_p)
setup_olpc_ofw_pgd();
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
-#ifdef CONFIG_X86_32
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
-#endif
- saved_video_mode = boot_params.hdr.vid_mode;
- bootloader_type = boot_params.hdr.type_of_loader;
- if ((bootloader_type >> 4) == 0xe) {
- bootloader_type &= 0xf;
- bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
- }
- bootloader_version = bootloader_type & 0xf;
- bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
-#endif
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- EFI32_LOADER_SIGNATURE, 4)) {
- set_bit(EFI_BOOT, &efi.flags);
- } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- EFI64_LOADER_SIGNATURE, 4)) {
- set_bit(EFI_BOOT, &efi.flags);
- set_bit(EFI_64BIT, &efi.flags);
- }
-#endif
+ parse_boot_params();
x86_init.oem.arch_setup();
@@ -841,19 +930,8 @@ void __init setup_arch(char **cmdline_p)
copy_edd();
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
- code_resource.start = __pa_symbol(_text);
- code_resource.end = __pa_symbol(_etext)-1;
- rodata_resource.start = __pa_symbol(__start_rodata);
- rodata_resource.end = __pa_symbol(__end_rodata)-1;
- data_resource.start = __pa_symbol(_sdata);
- data_resource.end = __pa_symbol(_edata)-1;
- bss_resource.start = __pa_symbol(__bss_start);
- bss_resource.end = __pa_symbol(__bss_stop)-1;
-
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
@@ -866,30 +944,6 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled(EFI_BOOT))
efi_memblock_x86_reserve_range();
-#ifdef CONFIG_MEMORY_HOTPLUG
- /*
- * Memory used by the kernel cannot be hot-removed because Linux
- * cannot migrate the kernel pages. When memory hotplug is
- * enabled, we should prevent memblock from allocating memory
- * for the kernel.
- *
- * ACPI SRAT records all hotpluggable memory ranges. But before
- * SRAT is parsed, we don't know about it.
- *
- * The kernel image is loaded into memory at very early time. We
- * cannot prevent this anyway. So on NUMA system, we set any
- * node the kernel resides in as un-hotpluggable.
- *
- * Since on modern servers, one node could have double-digit
- * gigabytes memory, we can assume the memory around the kernel
- * image is also un-hotpluggable. So before SRAT is parsed, just
- * allocate memory near the kernel image to try the best to keep
- * the kernel away from hotpluggable memory.
- */
- if (movable_node_is_enabled())
- memblock_set_bottom_up(true);
-#endif
-
x86_report_nx();
apic_setup_apic_calls();
@@ -901,7 +955,6 @@ void __init setup_arch(char **cmdline_p)
setup_clear_cpu_cap(X86_FEATURE_APIC);
}
- e820__reserve_setup_data();
e820__finish_early_params();
if (efi_enabled(EFI_BOOT))
@@ -921,11 +974,11 @@ void __init setup_arch(char **cmdline_p)
tsc_early_init();
x86_init.resources.probe_roms();
- /* after parse_early_param, so could debug it */
- insert_resource(&iomem_resource, &code_resource);
- insert_resource(&iomem_resource, &rodata_resource);
- insert_resource(&iomem_resource, &data_resource);
- insert_resource(&iomem_resource, &bss_resource);
+ /*
+ * Add resources for kernel text and data to the iomem_resource.
+ * Do it after parse_early_param, so it can be debugged.
+ */
+ setup_kernel_resources();
e820_add_kernel_range();
trim_bios_range();
@@ -972,8 +1025,6 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = e820__end_of_low_ram_pfn();
else
max_low_pfn = max_pfn;
-
- high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif
/* Find and reserve MPTABLE area */
@@ -990,7 +1041,6 @@ void __init setup_arch(char **cmdline_p)
cleanup_highmap();
- memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
/*
@@ -1108,8 +1158,10 @@ void __init setup_arch(char **cmdline_p)
initmem_init();
dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
- if (boot_cpu_has(X86_FEATURE_GBPAGES))
+ if (boot_cpu_has(X86_FEATURE_GBPAGES)) {
hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+ hugetlb_bootmem_alloc();
+ }
/*
* Reserve memory for crash kernel after SRAT is parsed so that it
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index b30d6e180df7..bfa48e7a32a2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -23,18 +23,13 @@
#include <asm/cpumask.h>
#include <asm/cpu.h>
-#ifdef CONFIG_X86_64
-#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
-#else
-#define BOOT_PERCPU_OFFSET 0
-#endif
+DEFINE_PER_CPU_CACHE_HOT(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
-DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off);
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
- [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
-};
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init;
EXPORT_SYMBOL(__per_cpu_offset);
/*
@@ -169,7 +164,7 @@ void __init setup_per_cpu_areas(void)
for_each_possible_cpu(cpu) {
per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
- per_cpu(pcpu_hot.cpu_number, cpu) = cpu;
+ per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
/*
* Copy data used in early init routines from the
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 059685612362..2ddf23387c7e 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -173,8 +173,8 @@ static int shstk_setup(void)
return PTR_ERR((void *)addr);
fpregs_lock_and_load();
- wrmsrl(MSR_IA32_PL3_SSP, addr + size);
- wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+ wrmsrq(MSR_IA32_PL3_SSP, addr + size);
+ wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN);
fpregs_unlock();
shstk->base = addr;
@@ -239,7 +239,7 @@ static unsigned long get_user_shstk_addr(void)
fpregs_lock_and_load();
- rdmsrl(MSR_IA32_PL3_SSP, ssp);
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
fpregs_unlock();
@@ -372,7 +372,7 @@ int setup_signal_shadow_stack(struct ksignal *ksig)
return -EFAULT;
fpregs_lock_and_load();
- wrmsrl(MSR_IA32_PL3_SSP, ssp);
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
fpregs_unlock();
return 0;
@@ -396,7 +396,7 @@ int restore_signal_shadow_stack(void)
return err;
fpregs_lock_and_load();
- wrmsrl(MSR_IA32_PL3_SSP, ssp);
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
fpregs_unlock();
return 0;
@@ -460,7 +460,7 @@ static int wrss_control(bool enable)
return 0;
fpregs_lock_and_load();
- rdmsrl(MSR_IA32_U_CET, msrval);
+ rdmsrq(MSR_IA32_U_CET, msrval);
if (enable) {
features_set(ARCH_SHSTK_WRSS);
@@ -473,7 +473,7 @@ static int wrss_control(bool enable)
msrval &= ~CET_WRSS_EN;
}
- wrmsrl(MSR_IA32_U_CET, msrval);
+ wrmsrq(MSR_IA32_U_CET, msrval);
unlock:
fpregs_unlock();
@@ -492,8 +492,8 @@ static int shstk_disable(void)
fpregs_lock_and_load();
/* Disable WRSS too when disabling shadow stack */
- wrmsrl(MSR_IA32_U_CET, 0);
- wrmsrl(MSR_IA32_PL3_SSP, 0);
+ wrmsrq(MSR_IA32_U_CET, 0);
+ wrmsrq(MSR_IA32_PL3_SSP, 0);
fpregs_unlock();
shstk_free(current);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 5f441039b572..2404233336ab 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -255,7 +255,7 @@ static void
handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
bool stepping, failed;
- struct fpu *fpu = &current->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(current);
if (v8086_mode(regs))
save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
@@ -423,14 +423,14 @@ bool sigaltstack_size_valid(size_t ss_size)
if (!fpu_state_size_dynamic() && !strict_sigaltstack_size)
return true;
- fsize += current->group_leader->thread.fpu.perm.__user_state_size;
+ fsize += x86_task_fpu(current->group_leader)->perm.__user_state_size;
if (likely(ss_size > fsize))
return true;
if (strict_sigaltstack_size)
return ss_size > fsize;
- mask = current->group_leader->thread.fpu.perm.__state_perm;
+ mask = x86_task_fpu(current->group_leader)->perm.__state_perm;
if (mask & XFEATURE_MASK_USER_DYNAMIC)
return ss_size > fsize;
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index ef654530bf5a..98123ff10506 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -33,25 +33,55 @@
#include <asm/smap.h>
#include <asm/gsseg.h>
+/*
+ * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0
+ * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index
+ * GDT, selector values 0~3 all point to the NULL descriptor, thus values
+ * 0, 1, 2 and 3 are all valid NULL selector values.
+ *
+ * However IRET zeros ES, FS, GS, and DS segment registers if any of them
+ * is found to have any nonzero NULL selector value, which can be used by
+ * userspace in pre-FRED systems to spot any interrupt/exception by loading
+ * a nonzero NULL selector and waiting for it to become zero. Before FRED
+ * there was nothing software could do to prevent such an information leak.
+ *
+ * ERETU, the only legit instruction to return to userspace from kernel
+ * under FRED, by design does NOT zero any segment register to avoid this
+ * problem behavior.
+ *
+ * As such, leave NULL selector values 0~3 unchanged.
+ */
+static inline u16 fixup_rpl(u16 sel)
+{
+ return sel <= 3 ? sel : sel | 3;
+}
+
#ifdef CONFIG_IA32_EMULATION
#include <asm/unistd_32_ia32.h>
static inline void reload_segments(struct sigcontext_32 *sc)
{
- unsigned int cur;
+ u16 cur;
+ /*
+ * Reload fs and gs if they have changed in the signal
+ * handler. This does not handle long fs/gs base changes in
+ * the handler, but does not clobber them at least in the
+ * normal case.
+ */
savesegment(gs, cur);
- if ((sc->gs | 0x03) != cur)
- load_gs_index(sc->gs | 0x03);
+ if (fixup_rpl(sc->gs) != cur)
+ load_gs_index(fixup_rpl(sc->gs));
savesegment(fs, cur);
- if ((sc->fs | 0x03) != cur)
- loadsegment(fs, sc->fs | 0x03);
+ if (fixup_rpl(sc->fs) != cur)
+ loadsegment(fs, fixup_rpl(sc->fs));
+
savesegment(ds, cur);
- if ((sc->ds | 0x03) != cur)
- loadsegment(ds, sc->ds | 0x03);
+ if (fixup_rpl(sc->ds) != cur)
+ loadsegment(ds, fixup_rpl(sc->ds));
savesegment(es, cur);
- if ((sc->es | 0x03) != cur)
- loadsegment(es, sc->es | 0x03);
+ if (fixup_rpl(sc->es) != cur)
+ loadsegment(es, fixup_rpl(sc->es));
}
#define sigset32_t compat_sigset_t
@@ -105,18 +135,12 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs,
regs->orig_ax = -1;
#ifdef CONFIG_IA32_EMULATION
- /*
- * Reload fs and gs if they have changed in the signal
- * handler. This does not handle long fs/gs base changes in
- * the handler, but does not clobber them at least in the
- * normal case.
- */
reload_segments(&sc);
#else
- loadsegment(gs, sc.gs);
- regs->fs = sc.fs;
- regs->es = sc.es;
- regs->ds = sc.ds;
+ loadsegment(gs, fixup_rpl(sc.gs));
+ regs->fs = fixup_rpl(sc.fs);
+ regs->es = fixup_rpl(sc.es);
+ regs->ds = fixup_rpl(sc.ds);
#endif
return fpu__restore_sig(compat_ptr(sc.fpstate), 1);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c10850ae6f09..1ba92ac9441d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -64,7 +64,7 @@
#include <asm/acpi.h>
#include <asm/cacheinfo.h>
-#include <asm/cpuid.h>
+#include <asm/cpuid/api.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
@@ -190,7 +190,7 @@ static void ap_starting(void)
apic_ap_setup();
/* Save the processor parameters. */
- smp_store_cpu_info(cpuid);
+ identify_secondary_cpu(cpuid);
/*
* The topology information must be up to date before
@@ -215,7 +215,7 @@ static void ap_calibrate_delay(void)
{
/*
* Calibrate the delay loop and update loops_per_jiffy in cpu_data.
- * smp_store_cpu_info() stored a value that is close but not as
+ * identify_secondary_cpu() stored a value that is close but not as
* accurate as the value just calculated.
*
* As this is invoked after the TSC synchronization check,
@@ -229,7 +229,7 @@ static void ap_calibrate_delay(void)
/*
* Activate a secondary processor.
*/
-static void notrace start_secondary(void *unused)
+static void notrace __noendbr start_secondary(void *unused)
{
/*
* Don't put *anything* except direct CPU state initialization
@@ -314,26 +314,7 @@ static void notrace start_secondary(void *unused)
wmb();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-
-/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
- */
-void smp_store_cpu_info(int id)
-{
- struct cpuinfo_x86 *c = &cpu_data(id);
-
- /* Copy boot_cpu_data only on the first bringup */
- if (!c->initialized)
- *c = boot_cpu_data;
- c->cpu_index = id;
- /*
- * During boot time, CPU0 has this setup already. Save the info when
- * bringing up an AP.
- */
- identify_secondary_cpu(c);
- c->initialized = true;
-}
+ANNOTATE_NOENDBR_SYM(start_secondary);
static bool
topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
@@ -654,10 +635,9 @@ static void impress_friends(void)
* But that slows boot and resume on modern processors, which include
* many cores and don't require that delay.
*
- * Cmdline "init_cpu_udelay=" is available to over-ride this delay.
- * Modern processor families are quirked to remove the delay entirely.
+ * Cmdline "cpu_init_udelay=" is available to override this delay.
*/
-#define UDELAY_10MS_DEFAULT 10000
+#define UDELAY_10MS_LEGACY 10000
static unsigned int init_udelay = UINT_MAX;
@@ -669,21 +649,21 @@ static int __init cpu_init_udelay(char *str)
}
early_param("cpu_init_udelay", cpu_init_udelay);
-static void __init smp_quirk_init_udelay(void)
+static void __init smp_set_init_udelay(void)
{
/* if cmdline changed it from default, leave it alone */
if (init_udelay != UINT_MAX)
return;
/* if modern processor, use no delay */
- if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
- ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
- ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) {
init_udelay = 0;
return;
}
/* else, use legacy delay */
- init_udelay = UDELAY_10MS_DEFAULT;
+ init_udelay = UDELAY_10MS_LEGACY;
}
/*
@@ -841,7 +821,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
/* Just in case we booted with a single CPU. */
alternatives_enable_smp();
- per_cpu(pcpu_hot.current_task, cpu) = idle;
+ per_cpu(current_task, cpu) = idle;
cpu_init_stack_canary(cpu, idle);
/* Initialize the interrupt stack(s) */
@@ -851,7 +831,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
- per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#endif
return 0;
}
@@ -1094,7 +1074,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
uv_system_init();
- smp_quirk_init_udelay();
+ smp_set_init_udelay();
speculative_store_bypass_ht_init();
@@ -1208,6 +1188,12 @@ void cpu_disable_common(void)
remove_siblinginfo(cpu);
+ /*
+ * Stop allowing kernel-mode FPU. This is needed so that if the CPU is
+ * brought online again, the initial state is not allowed:
+ */
+ this_cpu_write(kernel_fpu_allowed, false);
+
/* It's now safe to remove this processor from the online map */
lock_vector_lock();
remove_cpu_from_maps(cpu);
@@ -1258,47 +1244,9 @@ void play_dead_common(void)
local_irq_disable();
}
-/*
- * We need to flush the caches before going to sleep, lest we have
- * dirty data in our caches when we come back up.
- */
-static inline void mwait_play_dead(void)
+void __noreturn mwait_play_dead(unsigned int eax_hint)
{
struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
- unsigned int eax, ebx, ecx, edx;
- unsigned int highest_cstate = 0;
- unsigned int highest_subcstate = 0;
- int i;
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
- return;
- if (!this_cpu_has(X86_FEATURE_MWAIT))
- return;
- if (!this_cpu_has(X86_FEATURE_CLFLUSH))
- return;
-
- eax = CPUID_LEAF_MWAIT;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- /*
- * eax will be 0 if EDX enumeration is not valid.
- * Initialized below to cstate, sub_cstate value when EDX is valid.
- */
- if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
- eax = 0;
- } else {
- edx >>= MWAIT_SUBSTATE_SIZE;
- for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
- if (edx & MWAIT_SUBSTATE_MASK) {
- highest_cstate = i;
- highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
- }
- }
- eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
- (highest_subcstate - 1);
- }
/* Set up state for the kexec() hack below */
md->status = CPUDEAD_MWAIT_WAIT;
@@ -1319,7 +1267,7 @@ static inline void mwait_play_dead(void)
mb();
__monitor(md, 0, 0);
mb();
- __mwait(eax, 0);
+ __mwait(eax_hint, 0);
if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
/*
@@ -1342,6 +1290,50 @@ static inline void mwait_play_dead(void)
}
/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead_cpuid_hint(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int highest_cstate = 0;
+ unsigned int highest_subcstate = 0;
+ int i;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ return;
+ if (!this_cpu_has(X86_FEATURE_MWAIT))
+ return;
+ if (!this_cpu_has(X86_FEATURE_CLFLUSH))
+ return;
+
+ eax = CPUID_LEAF_MWAIT;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ /*
+ * eax will be 0 if EDX enumeration is not valid.
+ * Initialized below to cstate, sub_cstate value when EDX is valid.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+ eax = 0;
+ } else {
+ edx >>= MWAIT_SUBSTATE_SIZE;
+ for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+ if (edx & MWAIT_SUBSTATE_MASK) {
+ highest_cstate = i;
+ highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+ }
+ }
+ eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+ (highest_subcstate - 1);
+ }
+
+ mwait_play_dead(eax);
+}
+
+/*
* Kick all "offline" CPUs out of mwait on kexec(). See comment in
* mwait_play_dead().
*/
@@ -1391,7 +1383,7 @@ void native_play_dead(void)
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- mwait_play_dead();
+ mwait_play_dead_cpuid_hint();
if (cpuidle_play_dead())
hlt_play_dead();
}
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 9e51242ed125..378c388d1b31 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -81,7 +81,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
break;
case RET:
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk_at(insn))
code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
else
code = &retinsn;
@@ -90,7 +90,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
case JCC:
if (!func) {
func = __static_call_return;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk())
func = x86_return_thunk;
}
@@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
if (system_state == SYSTEM_BOOTING || modinit)
return text_poke_early(insn, code, size);
- text_poke_bp(insn, code, size, emulate);
+ smp_text_poke_single(insn, code, size, emulate);
}
static void __static_call_validate(u8 *insn, bool tail, bool tramp)
@@ -158,7 +158,7 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
{
mutex_lock(&text_mutex);
- if (tramp) {
+ if (tramp && !site) {
__static_call_validate(tramp, true, true);
__static_call_transform(tramp, __sc_insn(!func, true), func, false);
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4c1bcb6053fc..46b8f1f16676 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -200,8 +200,7 @@ static int tboot_setup_sleep(void)
tboot->num_mac_regions = 0;
for (i = 0; i < e820_table->nr_entries; i++) {
- if ((e820_table->entries[i].type != E820_TYPE_RAM)
- && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN))
+ if (e820_table->entries[i].type != E820_TYPE_RAM)
continue;
add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size);
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index b8e7abe00b06..708d61743d15 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -4,7 +4,7 @@
*/
#include <asm/trace_clock.h>
#include <asm/barrier.h>
-#include <asm/msr.h>
+#include <asm/tsc.h>
/*
* trace_clock_x86_tsc(): A clock that is just the cycle counter.
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
deleted file mode 100644
index 03ae1caaa878..000000000000
--- a/arch/x86/kernel/tracepoint.c
+++ /dev/null
@@ -1,21 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
- */
-#include <linux/jump_label.h>
-#include <linux/atomic.h>
-
-#include <asm/trace/exceptions.h>
-
-DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);
-
-int trace_pagefault_reg(void)
-{
- static_branch_inc(&trace_pagefault_key);
- return 0;
-}
-
-void trace_pagefault_unreg(void)
-{
- static_branch_dec(&trace_pagefault_key);
-}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2dbadf347b5f..c5c897a86418 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -68,6 +68,7 @@
#include <asm/vdso.h>
#include <asm/tdx.h>
#include <asm/cfi.h>
+#include <asm/msr.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -94,10 +95,20 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
/*
* Check for UD1 or UD2, accounting for Address Size Override Prefixes.
- * If it's a UD1, get the ModRM byte to pass along to UBSan.
+ * If it's a UD1, further decode to determine its use:
+ *
+ * FineIBT: ea (bad)
+ * FineIBT: f0 75 f9 lock jne . - 6
+ * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
+ * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
+ * static_call: 0f b9 cc ud1 %esp,%ecx
+ *
+ * Notably UBSAN uses EAX, static_call uses ECX.
*/
-__always_inline int decode_bug(unsigned long addr, u32 *imm)
+__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
{
+ unsigned long start = addr;
+ bool lock = false;
u8 v;
if (addr < TASK_SIZE_MAX)
@@ -106,28 +117,67 @@ __always_inline int decode_bug(unsigned long addr, u32 *imm)
v = *(u8 *)(addr++);
if (v == INSN_ASOP)
v = *(u8 *)(addr++);
- if (v != OPCODE_ESCAPE)
+
+ if (v == INSN_LOCK) {
+ lock = true;
+ v = *(u8 *)(addr++);
+ }
+
+ switch (v) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ addr += 1; /* d8 */
+ *len = addr - start;
+ WARN_ON_ONCE(!lock);
+ return BUG_LOCK;
+
+ case 0xea:
+ *len = addr - start;
+ return BUG_EA;
+
+ case OPCODE_ESCAPE:
+ break;
+
+ default:
return BUG_NONE;
+ }
v = *(u8 *)(addr++);
- if (v == SECOND_BYTE_OPCODE_UD2)
+ if (v == SECOND_BYTE_OPCODE_UD2) {
+ *len = addr - start;
return BUG_UD2;
+ }
- if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1)
+ if (v != SECOND_BYTE_OPCODE_UD1)
return BUG_NONE;
- /* Retrieve the immediate (type value) for the UBSAN UD1 */
- v = *(u8 *)(addr++);
- if (X86_MODRM_RM(v) == 4)
- addr++;
-
*imm = 0;
- if (X86_MODRM_MOD(v) == 1)
- *imm = *(u8 *)addr;
- else if (X86_MODRM_MOD(v) == 2)
- *imm = *(u32 *)addr;
- else
- WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v));
+ v = *(u8 *)(addr++); /* ModRM */
+
+ if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
+ addr++; /* SIB */
+
+ /* Decode immediate, if present */
+ switch (X86_MODRM_MOD(v)) {
+ case 0: if (X86_MODRM_RM(v) == 5)
+ addr += 4; /* RIP + disp32 */
+ break;
+
+ case 1: *imm = *(s8 *)addr;
+ addr += 1;
+ break;
+
+ case 2: *imm = *(s32 *)addr;
+ addr += 4;
+ break;
+
+ case 3: break;
+ }
+
+ /* record instruction length */
+ *len = addr - start;
+
+ if (X86_MODRM_REG(v) == 0) /* EAX */
+ return BUG_UD1_UBSAN;
return BUG_UD1;
}
@@ -257,11 +307,12 @@ static inline void handle_invalid_op(struct pt_regs *regs)
static noinstr bool handle_bug(struct pt_regs *regs)
{
+ unsigned long addr = regs->ip;
bool handled = false;
- int ud_type;
- u32 imm;
+ int ud_type, ud_len;
+ s32 ud_imm;
- ud_type = decode_bug(regs->ip, &imm);
+ ud_type = decode_bug(addr, &ud_imm, &ud_len);
if (ud_type == BUG_NONE)
return handled;
@@ -281,15 +332,47 @@ static noinstr bool handle_bug(struct pt_regs *regs)
*/
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_enable();
- if (ud_type == BUG_UD2) {
- if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
- handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
- regs->ip += LEN_UD2;
+
+ switch (ud_type) {
+ case BUG_UD2:
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ handled = true;
+ break;
+ }
+ fallthrough;
+
+ case BUG_EA:
+ case BUG_LOCK:
+ if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
handled = true;
+ break;
+ }
+ break;
+
+ case BUG_UD1_UBSAN:
+ if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
+ pr_crit("%s at %pS\n",
+ report_ubsan_failure(ud_imm),
+ (void *)regs->ip);
}
- } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
- pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * When continuing, and regs->ip hasn't changed, move it to the next
+ * instruction. When not continuing execution, restore the instruction
+ * pointer.
+ */
+ if (handled) {
+ if (regs->ip == addr)
+ regs->ip += ud_len;
+ } else {
+ regs->ip = addr;
}
+
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_disable();
instrumentation_end();
@@ -380,6 +463,21 @@ __visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
#endif
/*
+ * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64
+ * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch
+ * between configs triggers objtool warnings.
+ *
+ * This is a temporary hack until we have compiler or plugin support for
+ * annotating noreturns.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+#define always_true() true
+#else
+bool always_true(void);
+bool __weak always_true(void) { return true; }
+#endif
+
+/*
* Runs on an IST stack for x86_64 and on a special task stack for x86_32.
*
* On x86_64, this is more or less a normal kernel entry. Notwithstanding the
@@ -514,7 +612,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
die("double fault", regs, error_code);
- panic("Machine halted.");
+ if (always_true())
+ panic("Machine halted.");
instrumentation_end();
}
@@ -651,7 +750,7 @@ static bool try_fixup_enqcmd_gp(void)
if (current->pasid_activated)
return false;
- wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID);
+ wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID);
current->pasid_activated = 1;
return true;
@@ -784,16 +883,16 @@ static void do_int3_user(struct pt_regs *regs)
DEFINE_IDTENTRY_RAW(exc_int3)
{
/*
- * poke_int3_handler() is completely self contained code; it does (and
+ * smp_text_poke_int3_handler() is completely self contained code; it does (and
* must) *NOT* call out to anything, lest it hits upon yet another
* INT3.
*/
- if (poke_int3_handler(regs))
+ if (smp_text_poke_int3_handler(regs))
return;
/*
* irqentry_enter_from_user_mode() uses static_branch_{,un}likely()
- * and therefore can trigger INT3, hence poke_int3_handler() must
+ * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must
* be done before. If the entry came from kernel mode, then use
* nmi_enter() because the INT3 could have been hit in any context
* including NMI.
@@ -1022,9 +1121,9 @@ static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6)
*/
unsigned long debugctl;
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
debugctl |= DEBUGCTLMSR_BTF;
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
}
/*
@@ -1197,7 +1296,7 @@ DEFINE_IDTENTRY_RAW(exc_debug)
static void math_error(struct pt_regs *regs, int trapnr)
{
struct task_struct *task = current;
- struct fpu *fpu = &task->thread.fpu;
+ struct fpu *fpu = x86_task_fpu(task);
int si_code;
char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
"simd exception";
@@ -1288,11 +1387,11 @@ static bool handle_xfd_event(struct pt_regs *regs)
if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD))
return false;
- rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
+ rdmsrq(MSR_IA32_XFD_ERR, xfd_err);
if (!xfd_err)
return false;
- wrmsrl(MSR_IA32_XFD_ERR, 0);
+ wrmsrq(MSR_IA32_XFD_ERR, 0);
/* Die if that happens in kernel space */
if (WARN_ON(!user_mode(regs)))
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 34dec0b72ea8..87e749106dda 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -16,7 +16,7 @@
#include <linux/static_key.h>
#include <linux/static_call.h>
-#include <asm/cpuid.h>
+#include <asm/cpuid/api.h>
#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
@@ -29,6 +29,7 @@
#include <asm/apic.h>
#include <asm/cpu_device_id.h>
#include <asm/i8259.h>
+#include <asm/msr.h>
#include <asm/topology.h>
#include <asm/uv/uv.h>
#include <asm/sev.h>
@@ -959,7 +960,7 @@ static unsigned long long cyc2ns_suspend;
void tsc_save_sched_clock_state(void)
{
- if (!sched_clock_stable())
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
return;
cyc2ns_suspend = sched_clock();
@@ -979,7 +980,7 @@ void tsc_restore_sched_clock_state(void)
unsigned long flags;
int cpu;
- if (!sched_clock_stable())
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
return;
local_irq_save(flags);
@@ -1098,7 +1099,7 @@ static void __init detect_art(void)
if (art_base_clk.denominator < ART_MIN_DENOMINATOR)
return;
- rdmsrl(MSR_IA32_TSC_ADJUST, art_base_clk.offset);
+ rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset);
/* Make this sticky over multiple CPU init calls */
setup_force_cpu_cap(X86_FEATURE_ART);
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index deeb02825670..48e6cc1cb017 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -152,7 +152,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt),
X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng),
X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht),
- X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann),
X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm),
{}
};
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 4334033658ed..ec3aa340d351 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <asm/msr.h>
#include <asm/tsc.h>
struct tsc_adjust {
@@ -65,12 +66,12 @@ void tsc_verify_tsc_adjust(bool resume)
adj->nextcheck = jiffies + HZ;
- rdmsrl(MSR_IA32_TSC_ADJUST, curval);
+ rdmsrq(MSR_IA32_TSC_ADJUST, curval);
if (adj->adjusted == curval)
return;
/* Restore the original value */
- wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
+ wrmsrq(MSR_IA32_TSC_ADJUST, adj->adjusted);
if (!adj->warned || resume) {
pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
@@ -142,7 +143,7 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
if (likely(!tsc_async_resets)) {
pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
cpu, bootval);
- wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ wrmsrq(MSR_IA32_TSC_ADJUST, 0);
bootval = 0;
} else {
pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
@@ -165,7 +166,7 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
if (check_tsc_unstable())
return false;
- rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+ rdmsrq(MSR_IA32_TSC_ADJUST, bootval);
cur->bootval = bootval;
cur->nextcheck = jiffies + HZ;
tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
@@ -187,7 +188,7 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return false;
- rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+ rdmsrq(MSR_IA32_TSC_ADJUST, bootval);
cur->bootval = bootval;
cur->nextcheck = jiffies + HZ;
cur->warned = false;
@@ -229,7 +230,7 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
*/
if (bootval != ref->adjusted) {
cur->adjusted = ref->adjusted;
- wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
+ wrmsrq(MSR_IA32_TSC_ADJUST, ref->adjusted);
}
/*
* We have the TSCs forced to be in sync on this package. Skip sync
@@ -518,7 +519,7 @@ retry:
pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
cpu, cur_max_warp, cur->adjusted);
- wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ wrmsrq(MSR_IA32_TSC_ADJUST, cur->adjusted);
goto retry;
}
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index d4705a348a80..977ee75e047c 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -476,7 +476,7 @@ bool unwind_next_frame(struct unwind_state *state)
return false;
/* Don't let modules unload while we're reading their ORC data. */
- preempt_disable();
+ guard(rcu)();
/* End-of-stack check for user tasks: */
if (state->regs && user_mode(state->regs))
@@ -669,14 +669,12 @@ bool unwind_next_frame(struct unwind_state *state)
goto err;
}
- preempt_enable();
return true;
err:
state->error = true;
the_end:
- preempt_enable();
state->stack_info.type = STACK_TYPE_UNKNOWN;
return false;
}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 5a952c5ea66b..6d383839e839 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,19 +357,23 @@ void *arch_uprobe_trampoline(unsigned long *psize)
return &insn;
}
-static unsigned long trampoline_check_ip(void)
+static unsigned long trampoline_check_ip(unsigned long tramp)
{
- unsigned long tramp = uprobe_get_trampoline_vaddr();
-
return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
}
SYSCALL_DEFINE0(uretprobe)
{
struct pt_regs *regs = task_pt_regs(current);
- unsigned long err, ip, sp, r11_cx_ax[3];
+ unsigned long err, ip, sp, r11_cx_ax[3], tramp;
+
+ /* If there's no trampoline, we are called from wrong place. */
+ tramp = uprobe_get_trampoline_vaddr();
+ if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR))
+ goto sigill;
- if (regs->ip != trampoline_check_ip())
+ /* Make sure the ip matches the only allowed sys_uretprobe caller. */
+ if (unlikely(regs->ip != trampoline_check_ip(tramp)))
goto sigill;
err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
@@ -836,6 +840,11 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
insn_byte_t p;
int i;
+ /* x86_nops[insn->length]; same as jmp with .offs = 0 */
+ if (insn->length <= ASM_NOP_MAX &&
+ !memcmp(insn->kaddr, x86_nops[insn->length], insn->length))
+ goto setup;
+
switch (opc1) {
case 0xeb: /* jmp 8 */
case 0xe9: /* jmp 32 */
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 1258a5872d12..37ad43792452 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -29,8 +29,12 @@
*/
#include <asm/cpufeatures.h>
+#include <asm/cpufeaturemasks.h>
#include <asm/msr-index.h>
+#define SSE_MASK \
+ (REQUIRED_MASK0 & ((1<<(X86_FEATURE_XMM & 31)) | (1<<(X86_FEATURE_XMM2 & 31))))
+
SYM_FUNC_START_LOCAL(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0deb4887d6e9..4fa0be732af1 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -43,7 +43,8 @@ ENTRY(phys_startup_64)
#endif
jiffies = jiffies_64;
-const_pcpu_hot = pcpu_hot;
+const_current_task = current_task;
+const_cpu_current_top_of_stack = cpu_current_top_of_stack;
#if defined(CONFIG_X86_64)
/*
@@ -78,11 +79,13 @@ const_pcpu_hot = pcpu_hot;
#define BSS_DECRYPTED \
. = ALIGN(PMD_SIZE); \
__start_bss_decrypted = .; \
+ __pi___start_bss_decrypted = .; \
*(.bss..decrypted); \
. = ALIGN(PAGE_SIZE); \
__start_bss_decrypted_unused = .; \
. = ALIGN(PMD_SIZE); \
__end_bss_decrypted = .; \
+ __pi___end_bss_decrypted = .; \
#else
@@ -112,12 +115,6 @@ ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(6); /* RW_ */
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_SMP
- percpu PT_LOAD FLAGS(6); /* RW_ */
-#endif
- init PT_LOAD FLAGS(7); /* RWE */
-#endif
note PT_NOTE FLAGS(0); /* ___ */
}
@@ -133,6 +130,7 @@ SECTIONS
/* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
+ __pi__text = .;
_stext = .;
ALIGN_ENTRY_TEXT_BEGIN
*(.text..__x86.rethunk_untrain)
@@ -193,6 +191,8 @@ SECTIONS
PAGE_ALIGNED_DATA(PAGE_SIZE)
+ CACHE_HOT_DATA(L1_CACHE_BYTES)
+
CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
DATA_DATA
@@ -216,21 +216,7 @@ SECTIONS
__init_begin = .; /* paired with __init_end */
}
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - .init.text - should
- * start another segment - init.
- */
- PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
- ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START,
- "per-CPU data too large - increase CONFIG_PHYSICAL_START")
-#endif
-
INIT_TEXT_SECTION(PAGE_SIZE)
-#ifdef CONFIG_X86_64
- :init
-#endif
/*
* Section for code used exclusively before alternatives are run. All
@@ -347,9 +333,8 @@ SECTIONS
EXIT_DATA
}
-#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
- PERCPU_SECTION(INTERNODE_CACHE_BYTES)
-#endif
+ PERCPU_SECTION(L1_CACHE_BYTES)
+ ASSERT(__per_cpu_hot_end - __per_cpu_hot_start <= 64, "percpu cache hot data too large")
RUNTIME_CONST_VARIABLES
RUNTIME_CONST(ptr, USER_PTR_MAX)
@@ -409,6 +394,7 @@ SECTIONS
. = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
_end = .;
+ __pi__end = .;
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
@@ -484,28 +470,23 @@ SECTIONS
}
/*
- * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause
+ * this. Let's assume that nobody will be running a COMPILE_TEST kernel and
+ * let's assert that fuller build coverage is more valuable than being able to
+ * run a COMPILE_TEST kernel.
+ */
+#ifndef CONFIG_COMPILE_TEST
+/*
+ * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility:
*/
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
+#endif
/* needed for Clang - see arch/x86/entry/entry.S */
PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
#ifdef CONFIG_X86_64
-/*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
- */
-#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(fixed_percpu_data);
-INIT_PER_CPU(irq_stack_backing_store);
-
-#ifdef CONFIG_SMP
-. = ASSERT((fixed_percpu_data == 0),
- "fixed_percpu_data is not at start of per-cpu area");
-#endif
#ifdef CONFIG_MITIGATION_UNRET_ENTRY
. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
@@ -528,6 +509,16 @@ INIT_PER_CPU(irq_stack_backing_store);
"SRSO function pair won't alias");
#endif
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline");
+. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart");
+. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array");
+#endif
+
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline");
+#endif
+
#endif /* CONFIG_X86_64 */
/*